Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Split String into dictionary words and non-dictionary words

I want to split a string into a sequence of words using a dictionary.

The following function only gives me words with letter boundaries.

function spltToWord(prm){
    var spltedAr = prm.split(/(?=[A-Z][a-z])/);
    return spltedAr.join(" ").trim();
}

I want to separate the text into dictionary/non-dictionary words.

For example:

Iamno123prisoner - I am no 123 prisoner

USAisveryrich - USA is very rich

Albertgaveme5rupees - Albert gave me 5 rupees

A sample dictionary can be found Here for refrence.

like image 265
Akhilesh Kumar Avatar asked Aug 19 '15 03:08

Akhilesh Kumar


People also ask

How do you split words in a string?

The split() method splits a string into an array of substrings. The split() method returns the new array. The split() method does not change the original string. If (" ") is used as separator, the string is split between words.

How do you split combined words in Python?

the split() method in Python split a string into a list of strings after breaking the given string by the specified separator. Python String join() method is a string method and returns a string in which the elements of the sequence have been joined by the str separator.

How do you split a string into characters?

(which means "any character" in regex), use either backslash \ to escape the individual special character like so split("\\.") , or use character class [] to represent literal character(s) like so split("[.]") , or use Pattern#quote() to escape the entire string like so split(Pattern.


2 Answers

If the dictionary contains many words, you'll want a faster solution than matching one word at a time against substrings of the input text. If there are ten thousand words in the dictionary, you have to do at least ten thousand character comparisons with this naive approach.

A more efficient solution involves the use of a trie in which you've stored the dictionary words. Checking whether a word is in the dictionary requires only a single pass through the string.

Another advantage is that you can search for the longest match starting from a given position in the text. This approach is demonstrated in the following snippet, which includes a small dictionary that suffices to process your examples. You can replace this with your own dictionary when you run the code locally.

var Global = {
  dictionary: [
    'aa', 'I', 'i', 's',
    'aah', 'am',
    'aahed', 'no',
    'aahing', '123',
    'aahs', 'prisoner',
    'aal', 'USA',
    'aalii', 'is',
    'aaliis', 'very',
    'aals', 'rich',
    'aardvark', 'Albert',
    'aardvarks', 'gave', 'me', '5', 'rupees'
  ]
};

function addToTrie(word) {
  var node = Global.trie;
  for (var pos = 0; pos < word.length; ++pos) {
    var letter = word.charAt(pos);
    if (node[letter] === undefined) {
      node[letter] = {};
    }
    node = node[letter];
  }
  node.terminal = true;
}

function findLongestMatch(text, start) {
  var node = Global.trie,
      best = start - 1;
  for (var pos = start; pos < text.length; ++pos) {
    var letter = text.charAt(pos);
    if (node[letter] === undefined) {
      break;
    }
    node = node[letter];
    if (node.terminal) {
      best = pos;
    }
  }
  return best - start + 1;
}

function loadTrie() {
  var words = Global.dictionary,
      trie = Global.trie = {};
  words.forEach(function (word) {
    addToTrie(word);
  });
};

function println(label, s, labelStyle) {
  if (s === undefined) {
    s = label || '';
  } else {
    var className = 'label' + (labelStyle ? ' ' + labelStyle : '');
    s = '<div class="' + className + '">' + label + '</div> ' + s;
  }
  document.getElementById('message').innerHTML += (s + '<br />');
}

function process(text) {
  var results = [],
      letters = [],
      pos = 0;
  while (pos < text.length) {
    var length = findLongestMatch(text, pos);
    if (length == 0) {
      letters.push(text.charAt(pos));
      pos += 1;
    } else {
      if (letters.length != 0) {
        results.push({ word: letters.join(''), known: false });
        letters = [];
      }
      results.push({ word: text.substring(pos, pos + length), known: true });
      pos += length;
    }
  }
  if (letters.length != 0) {
    results.push({ word: letters.join(''), known: false });
    letters = [];
  }
  var breakdown = results.map(function (result) {
    return result.word;
  });
  println();
  println('    breakdown:',  '/' + breakdown.join('/') + '/');
  results.forEach(function (result) {
    println((result.known ? 'in dictionary' : '      unknown') + ':',
        '"' + result.word + '"', (result.known ? undefined : 'unknown'));
  });
};

window.onload = function () {
  loadTrie();
  process('WellIamno123prisoner');
  process('TheUSAisveryrichman');
  process('Albertgaveme5rupeesyo');
};
body {
  font-family: monospace;
  color: #444;
  font-size: 14px;
  line-height: 20px;
}
.label {
  display: inline-block;
  width: 200px;
  font-size: 13px;
  color: #aaa;
  text-align: right;
}
.label.unknown {
  color: #c61c39;
}
<div id="message"></div>
like image 95
Michael Laszlo Avatar answered Sep 28 '22 10:09

Michael Laszlo


Edit #2

var words = ["apple", "banana", "candy", "cookie", "doughnut"];
var input = "banana123candynotawordcookieblahdoughnut";
var currentWord = "",
    result = "";

while (true) {
    for (var i = 0; i < input.length; i++) {
        currentWord += input[i];
        if (words.indexOf(currentWord) >= 0) {
            result += " " + currentWord + " ";
            currentWord = "";
        }
    }

    if (currentWord.length > 0) {
        result += currentWord[0];
        input = currentWord.substr(1);
        currentWord = "";
    } else {
        break;
    }
}

console.log(result.trim()); // "banana 123 candy notaword cookie blah doughnut"

Note the result will contain two spaces if there are two consecutive dictionary words. You can fix this using regex.

It isn't the prettiest solution (you can make it recursive which may be more readable), and it doesn't provide multiple solutions (see DP link in comment for that).

Edit this doesn't take into account non-dictionary words.

Using a simple greedy algorithm:

var words = ["apple", "banana", "candy", "cookie", "doughnut"];
var input = "bananacandycookiedoughnut";
var currentWord = "", result = "";

for (var i = 0; i < input.length; i++) {
    currentWord += input[i];
    if (words.indexOf(currentWord) >= 0) {
        result += currentWord + " ";
        currentWord = "";
    }
}

console.log(result.trim()); // "banana candy cookie donught"

Of course you would want to modify various part of this, e.g. use a set for the dictionary and take into account case [in]sensitivity.

like image 32
Albert Xing Avatar answered Sep 28 '22 10:09

Albert Xing