I want to split a string into a sequence of words using a dictionary.
The following function only gives me words with letter boundaries.
function spltToWord(prm){
var spltedAr = prm.split(/(?=[A-Z][a-z])/);
return spltedAr.join(" ").trim();
}
I want to separate the text into dictionary/non-dictionary words.
For example:
Iamno123prisoner - I am no 123 prisoner
USAisveryrich - USA is very rich
Albertgaveme5rupees - Albert gave me 5 rupees
A sample dictionary can be found Here for refrence.
The split() method splits a string into an array of substrings. The split() method returns the new array. The split() method does not change the original string. If (" ") is used as separator, the string is split between words.
the split() method in Python split a string into a list of strings after breaking the given string by the specified separator. Python String join() method is a string method and returns a string in which the elements of the sequence have been joined by the str separator.
(which means "any character" in regex), use either backslash \ to escape the individual special character like so split("\\.") , or use character class [] to represent literal character(s) like so split("[.]") , or use Pattern#quote() to escape the entire string like so split(Pattern.
If the dictionary contains many words, you'll want a faster solution than matching one word at a time against substrings of the input text. If there are ten thousand words in the dictionary, you have to do at least ten thousand character comparisons with this naive approach.
A more efficient solution involves the use of a trie in which you've stored the dictionary words. Checking whether a word is in the dictionary requires only a single pass through the string.
Another advantage is that you can search for the longest match starting from a given position in the text. This approach is demonstrated in the following snippet, which includes a small dictionary that suffices to process your examples. You can replace this with your own dictionary when you run the code locally.
var Global = {
dictionary: [
'aa', 'I', 'i', 's',
'aah', 'am',
'aahed', 'no',
'aahing', '123',
'aahs', 'prisoner',
'aal', 'USA',
'aalii', 'is',
'aaliis', 'very',
'aals', 'rich',
'aardvark', 'Albert',
'aardvarks', 'gave', 'me', '5', 'rupees'
]
};
function addToTrie(word) {
var node = Global.trie;
for (var pos = 0; pos < word.length; ++pos) {
var letter = word.charAt(pos);
if (node[letter] === undefined) {
node[letter] = {};
}
node = node[letter];
}
node.terminal = true;
}
function findLongestMatch(text, start) {
var node = Global.trie,
best = start - 1;
for (var pos = start; pos < text.length; ++pos) {
var letter = text.charAt(pos);
if (node[letter] === undefined) {
break;
}
node = node[letter];
if (node.terminal) {
best = pos;
}
}
return best - start + 1;
}
function loadTrie() {
var words = Global.dictionary,
trie = Global.trie = {};
words.forEach(function (word) {
addToTrie(word);
});
};
function println(label, s, labelStyle) {
if (s === undefined) {
s = label || '';
} else {
var className = 'label' + (labelStyle ? ' ' + labelStyle : '');
s = '<div class="' + className + '">' + label + '</div> ' + s;
}
document.getElementById('message').innerHTML += (s + '<br />');
}
function process(text) {
var results = [],
letters = [],
pos = 0;
while (pos < text.length) {
var length = findLongestMatch(text, pos);
if (length == 0) {
letters.push(text.charAt(pos));
pos += 1;
} else {
if (letters.length != 0) {
results.push({ word: letters.join(''), known: false });
letters = [];
}
results.push({ word: text.substring(pos, pos + length), known: true });
pos += length;
}
}
if (letters.length != 0) {
results.push({ word: letters.join(''), known: false });
letters = [];
}
var breakdown = results.map(function (result) {
return result.word;
});
println();
println(' breakdown:', '/' + breakdown.join('/') + '/');
results.forEach(function (result) {
println((result.known ? 'in dictionary' : ' unknown') + ':',
'"' + result.word + '"', (result.known ? undefined : 'unknown'));
});
};
window.onload = function () {
loadTrie();
process('WellIamno123prisoner');
process('TheUSAisveryrichman');
process('Albertgaveme5rupeesyo');
};
body {
font-family: monospace;
color: #444;
font-size: 14px;
line-height: 20px;
}
.label {
display: inline-block;
width: 200px;
font-size: 13px;
color: #aaa;
text-align: right;
}
.label.unknown {
color: #c61c39;
}
<div id="message"></div>
Edit #2
var words = ["apple", "banana", "candy", "cookie", "doughnut"];
var input = "banana123candynotawordcookieblahdoughnut";
var currentWord = "",
result = "";
while (true) {
for (var i = 0; i < input.length; i++) {
currentWord += input[i];
if (words.indexOf(currentWord) >= 0) {
result += " " + currentWord + " ";
currentWord = "";
}
}
if (currentWord.length > 0) {
result += currentWord[0];
input = currentWord.substr(1);
currentWord = "";
} else {
break;
}
}
console.log(result.trim()); // "banana 123 candy notaword cookie blah doughnut"
Note the result will contain two spaces if there are two consecutive dictionary words. You can fix this using regex.
It isn't the prettiest solution (you can make it recursive which may be more readable), and it doesn't provide multiple solutions (see DP link in comment for that).
Edit this doesn't take into account non-dictionary words.
Using a simple greedy algorithm:
var words = ["apple", "banana", "candy", "cookie", "doughnut"];
var input = "bananacandycookiedoughnut";
var currentWord = "", result = "";
for (var i = 0; i < input.length; i++) {
currentWord += input[i];
if (words.indexOf(currentWord) >= 0) {
result += currentWord + " ";
currentWord = "";
}
}
console.log(result.trim()); // "banana candy cookie donught"
Of course you would want to modify various part of this, e.g. use a set for the dictionary and take into account case [in]sensitivity.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With