Split String into dictionary words and non-dictionary words

Tags:

I want to split a string into a sequence of words using a dictionary.

The following function only splits the string at capital-letter boundaries.

/**
 * Splits a string in front of every capital letter that is immediately
 * followed by a lowercase letter, and joins the pieces with single spaces.
 *
 * @param {string} prm - Text to split.
 * @returns {string} The space-separated pieces, trimmed at both ends.
 */
function spltToWord(prm) {
    // The lookahead is zero-width, so no characters are consumed by the split.
    return prm.split(/(?=[A-Z][a-z])/).join(" ").trim();
}

I want to separate the text into dictionary/non-dictionary words.

For example:

Iamno123prisoner - I am no 123 prisoner

USAisveryrich - USA is very rich

Albertgaveme5rupees - Albert gave me 5 rupees

A sample dictionary can be found here for reference.

265

asked Aug 19 '15 03:08

Akhilesh Kumar

2 Answers

If the dictionary contains many words, you'll want a faster solution than matching one word at a time against substrings of the input text. If there are ten thousand words in the dictionary, you have to do at least ten thousand character comparisons with this naive approach.

A more efficient solution involves the use of a trie in which you've stored the dictionary words. Checking whether a word is in the dictionary requires only a single pass through the string.

Another advantage is that you can search for the longest match starting from a given position in the text. This approach is demonstrated in the following snippet, which includes a small dictionary that suffices to process your examples. You can replace this with your own dictionary when you run the code locally.

// Shared state for the demo. `dictionary` is the word list that loadTrie()
// feeds into the trie; loadTrie() also attaches the trie itself as Global.trie.
// The 'aa…' entries are filler; the remaining entries are the words needed by
// the sample inputs.
var Global = {
  dictionary: [
    'aa', 'I', 'i', 's', 'aah', 'am', 'aahed', 'no', 'aahing', '123',
    'aahs', 'prisoner', 'aal', 'USA', 'aalii', 'is', 'aaliis', 'very',
    'aals', 'rich', 'aardvark', 'Albert', 'aardvarks', 'gave', 'me', '5',
    'rupees'
  ]
};

/**
 * Inserts one word into the trie rooted at Global.trie.
 *
 * Every character of the word becomes one edge; missing child nodes are
 * created on the way down, and the node reached after the last character is
 * flagged with `terminal = true`.
 *
 * @param {string} word - Dictionary word to insert.
 */
function addToTrie(word) {
  let current = Global.trie;
  let index = 0;
  while (index < word.length) {
    const key = word.charAt(index);
    let child = current[key];
    if (child === undefined) {
      child = {};
      current[key] = child;
    }
    current = child;
    index += 1;
  }
  // Marks the end of a complete word, as opposed to a mere prefix.
  current.terminal = true;
}

/**
 * Finds the longest dictionary word that begins at a given position.
 *
 * Walks the trie in Global.trie along the characters of `text`, remembering
 * the last position at which a terminal node was reached.
 *
 * @param {string} text - Text being scanned.
 * @param {number} start - Index in `text` where the match must begin.
 * @returns {number} Length of the longest word starting at `start`, or 0 if
 *     no dictionary word starts there.
 */
function findLongestMatch(text, start) {
  let cursor = Global.trie;
  // Index of the last character of the best match so far; start - 1 means
  // "nothing matched yet" and makes the final length come out as 0.
  let lastTerminal = start - 1;
  let index = start;
  while (index < text.length) {
    const next = cursor[text.charAt(index)];
    if (next === undefined) {
      break;
    }
    if (next.terminal) {
      lastTerminal = index;
    }
    cursor = next;
    index++;
  }
  return lastTerminal - start + 1;
}

/**
 * Rebuilds the trie from scratch: replaces Global.trie with an empty root
 * node and inserts every entry of Global.dictionary via addToTrie().
 */
function loadTrie() {
  var words = Global.dictionary;
  Global.trie = {};
  words.forEach((entry) => {
    addToTrie(entry);
  });
}

/**
 * Appends one line of output to the element with id "message".
 *
 * Called with only `label` (or with nothing), the label text itself is the
 * line. Called with `s`, the label is wrapped in a <div class="label"> and
 * followed by `s`.
 *
 * @param {string} [label] - Label text, or the whole line when `s` is omitted.
 * @param {string} [s] - Value to print after the label.
 * @param {string} [labelStyle] - Extra CSS class added to the label div.
 */
function println(label, s, labelStyle) {
  let line;
  if (s !== undefined) {
    const classes = labelStyle ? 'label ' + labelStyle : 'label';
    line = '<div class="' + classes + '">' + label + '</div> ' + s;
  } else {
    line = label || '';
  }
  const target = document.getElementById('message');
  target.innerHTML += (line + '<br />');
}

/**
 * Breaks `text` into dictionary words and runs of unknown characters, then
 * prints the breakdown with println().
 *
 * At each position the longest dictionary word reported by
 * findLongestMatch() is taken; characters at which no word starts are
 * collected into an "unknown" run until the next dictionary word (or the end
 * of the text) is reached.
 *
 * @param {string} text - Unspaced input text.
 */
function process(text) {
  const results = [];
  let pending = '';

  // Emits the accumulated run of unmatched characters, if there is one.
  const flushPending = () => {
    if (pending !== '') {
      results.push({ word: pending, known: false });
      pending = '';
    }
  };

  let pos = 0;
  while (pos < text.length) {
    const matched = findLongestMatch(text, pos);
    if (matched === 0) {
      pending += text.charAt(pos);
      pos += 1;
      continue;
    }
    flushPending();
    results.push({ word: text.substring(pos, pos + matched), known: true });
    pos += matched;
  }
  flushPending();

  const breakdown = results.map((entry) => entry.word);
  println();
  println('    breakdown:',  '/' + breakdown.join('/') + '/');
  for (const entry of results) {
    const heading = (entry.known ? 'in dictionary' : '      unknown') + ':';
    const style = entry.known ? undefined : 'unknown';
    println(heading, '"' + entry.word + '"', style);
  }
}

// Once the page has loaded, build the trie and print the breakdown of each
// sample input.
window.onload = () => {
  loadTrie();
  [
    'WellIamno123prisoner',
    'TheUSAisveryrichman',
    'Albertgaveme5rupeesyo'
  ].forEach((sample) => {
    process(sample);
  });
};

/* Base typography for the demo output. */
body {
  font-family: monospace;
  color: #444;
  font-size: 14px;
  line-height: 20px;
}
/* Label column emitted by println(): fixed width and right-aligned so the
   values that follow line up. */
.label {
  display: inline-block;
  width: 200px;
  font-size: 13px;
  color: #aaa;
  text-align: right;
}
/* Labels of fragments that were not found in the dictionary. */
.label.unknown {
  color: #c61c39;
}

<!-- Output container: println() appends its lines to this element. -->
<div id="message"></div>

answered Sep 28 '22 10:09

Michael Laszlo

Edit #2

// Greedy split that also keeps non-dictionary text.
var words = ["apple", "banana", "candy", "cookie", "doughnut"];
var input = "banana123candynotawordcookieblahdoughnut";
var currentWord = "";
var result = "";

for (;;) {
    // One pass: grow currentWord character by character and emit it, padded
    // with spaces, as soon as it equals a dictionary word.
    for (var i = 0; i < input.length; i++) {
        currentWord += input.charAt(i);
        if (words.includes(currentWord)) {
            result += " " + currentWord + " ";
            currentWord = "";
        }
    }

    // Nothing left over: the whole remaining input was consumed.
    if (currentWord.length === 0) {
        break;
    }

    // The leftover does not start with a dictionary word: copy its first
    // character to the output as-is and rescan the rest.
    result += currentWord.charAt(0);
    input = currentWord.slice(1);
    currentWord = "";
}

console.log(result.trim()); // "banana 123 candy notaword cookie blah doughnut"

Note the result will contain two spaces if there are two consecutive dictionary words. You can fix this using regex.

It isn't the prettiest solution (you can make it recursive which may be more readable), and it doesn't provide multiple solutions (see DP link in comment for that).

Edit: this doesn't take into account non-dictionary words.

Using a simple greedy algorithm:

// Simple greedy split: only works when the input consists entirely of
// dictionary words.
var words = ["apple", "banana", "candy", "cookie", "doughnut"];
var input = "bananacandycookiedoughnut";
var currentWord = "";
var result = "";

for (var i = 0; i < input.length; i++) {
    currentWord += input.charAt(i);
    if (!words.includes(currentWord)) {
        continue;
    }
    // currentWord is a dictionary word: emit it and start collecting afresh.
    result += currentWord + " ";
    currentWord = "";
}

console.log(result.trim()); // "banana candy cookie doughnut"

Of course you would want to modify various parts of this, e.g. use a set for the dictionary and take into account case [in]sensitivity.

answered Sep 28 '22 10:09

Albert Xing

Related questions
                            
                                Decode Base64 encoded PDF content in browser
                            
                                initializing radio buttons added dynamically in materialize
                            
                                Looping with Promises
                            
                                how to get next element that is visible in jquery
                            
                                Get reactive var from another template in Meteor
                            
                                Get notified about dragged images within a contenteditable div
                            
                                DOMException when registering service worker inside an https iframe
                            
                                Attaching an SVG to the DOM with Webpack
                            
                                Javascript hasOwnProperty always false on Event objects?
                            
                                typescript class decorator: typing properties defined in decorator function
                            
                                how to pass MVC model to a UI-bootstrap modal
                            
                                What is considered a watcher in Angular?
                            
                                Detecting Hard Links in Node.js
                            
                                Jquery Open Accordion based on URL anchor hash
                            
                                Enter key down does not open drop down in Mozilla
                            
                                AngularJS: How to use $q in your config phase for a unit test?
                            
                                Why php function str_word_count() returns different count from js equivalent?
                            
                                Where do the webpack output files from the react-hot-loader go?
                            
                                Why do you put requestAnimationFrame before the function body?
                            
                                Irregular repetitions in table using JavaScript and Angular.JS

Donate For Us

If you love us, you can donate to us via PayPal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With

Split String into dictionary words and non-dictionary words

Tags:

javascript

string

regex

algorithm

Akhilesh Kumar

People also ask

2 Answers

Michael Laszlo

Albert Xing

Recent Activity

Donate For Us