Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Javascript / jQuery Find Text duplicates

How would you approach finding duplicates in a text document. Duplicates can be a set of consecutive words or sentence. Sentence not necessary ending with a dot. let's say page contains a document of 200 lines of which 2 sentences are identical, we want to highlight those 2 sentences as duplicates when "check duplicate button" is clicked.

like image 831
Hussein Avatar asked Feb 18 '11 20:02

Hussein


2 Answers

Interesting question — here's an idea on how I'd do that probably: http://jsfiddle.net/SaQAs/1/ — Not anyhow optimized!

var text = $('p').text(),
    words = text.split(' '),
    sortedWords = words.slice(0).sort(),
    duplicateWords = [],
    sentences = text.split('.'),
    sortedSentences = sentences.slice(0).sort(),
    duplicateSentences = [];


for (var i=0; i<sortedWords.length-1; i++) {
    if (sortedWords[i+1] == sortedWords[i]) {
        duplicateWords.push(sortedWords[i]);
    }
}
duplicateWords = $.unique(duplicateWords);

for (var i=0; i<sortedSentences.length-1; i++) {
    if (sortedSentences[i+1] == sortedSentences[i]) {
        duplicateSentences.push(sortedSentences[i]);
    }
}
duplicateSentences = $.unique(duplicateSentences);

$('a.words').click(function(){
    var highlighted = $.map(words, function(word){
        if ($.inArray(word, duplicateWords) > -1)
            return '<span class="duplicate">' + word + '</span>';
        else return word;
    });
    $('p').html(highlighted.join(' '));
    return false;
});

$('a.sentences').click(function(){
    var highlighted = $.map(sentences, function(sentence){
        if ($.inArray(sentence, duplicateSentences) > -1)
            return '<span class="duplicate">' + sentence + '</span>';
        else return sentence;
    });
    $('p').html(highlighted.join('.'));
    return false;
});

Update 1

This one finds sequences of identical words: http://jsfiddle.net/YQdk5/1/ From here it shouldn't be hard to e.g. ignore any kind of punctuation at the end of fragments when comparing — you'd would just have to write your own version of an inArray method.

var text = $('p').text(),
    words = text.split(' '),
    sortedWords = words.slice(0).sort(),
    duplicateWords = []
    highlighted = [];

for (var i=0; i<sortedWords.length-1; i++) {
    if (sortedWords[i+1] == sortedWords[i]) {
        duplicateWords.push(sortedWords[i]);
    }
}
duplicateWords = $.unique(duplicateWords);

for (var j=0, m=[]; j<words.length; j++) {
    m.push($.inArray(words[j], duplicateWords) > -1);
    if (!m[j] && m[j-1]) 
        highlighted.push('</span>');
    else if (m[j] && !m[j-1])
        highlighted.push('<span class="duplicate">');
    highlighted.push(words[j]);
}

$('p').html(highlighted.join(' '));

Update 2

My regex-fu is weak, but this (pretty messy!) version seems to work okay: http://jsfiddle.net/YQdk5/2/ — I'm pretty sure that there might be a better way of doing this, but for now I've got to leave it alone! :D — Good luck!

Update 3

Thinking about it, I don't think that the code from the previous update is any good. That's why I've removed it. You can still find it here: http://jsfiddle.net/YQdk5/2/ The main point is to use a regex to match words, something along the lines of:

/^word(\.?)$/
like image 72
polarblau Avatar answered Sep 30 '22 12:09

polarblau


Here is the solution that uses suffix tree:

function SuffixTree(text) {
    var regex = /\b\w+/g;
    var words = text.match(regex);
    var wave = [];
    var words_l = words.length;
    if (words_l == 0) return false;
    this.tree = this.node("", false);
    for (var i = 0; i < words_l; ++i) {
        var x = words[i] + "_";
        wave.push(this.tree);
        var wave_l = wave.length;
        for (var j = 0; j < wave_l; ++j) {
            var y = wave[j];
            if (typeof y[x] != 'undefined') y[x].count++;
            else y[x] = this.node(words[i], y);
            wave[j] = y[x];
        }
    }
}

SuffixTree.prototype = {
    dummy: {count: 1},

    node: function(word, num, parent) {
        return {
            count: 1,
            word: word,
            parent: parent
        };
    },

    duplicates: function(h) {
        this.dups = [];
        this.bypass(this.tree, h, 0);
        var l = this.dups.length;
        this.dups.sort(function(d1, d2) { return d1.depth > d2.depth ? 1 : -1; });
        for (var i = 0; i < l; ++i) {
            var d = this.dups[i];
            this.dups[i] = { s: " " + this.sentence(d.a) + " ", depth: d.depth, count: d.a.count };
        }
        for (var i = 0; i < l; ++i) {
            var d = this.dups[i];
            console.log(i, d.s);
        }
        for (var i = 0; i < l; ++i) {
            var d = this.dups[i];
            var fl = true;
            for (var j = i + 1; j < l; ++j) {
                if (this.dups[j].s.indexOf(d.s) != -1) fl = false;
            }
            if (fl) h(d.s.substr(1, d.s.length - 2), d.count);
        }
    },

    bypass: function(a, h, depth) {
        if (a.constructor != Object) return;
        var fl = true;
        for (var i in a) {
            if (i == 'parent') continue;
            var b = a[i];
            if (b.count == a.count) fl = false;
            this.bypass(b, h, depth + 1);
        }
        if (fl && a.count > 1) {
            this.dups.push({ a: a, depth: depth });
        }
    },

    sentence: function(a) {
        var s = a.word;
        while (a = a.parent) {
            s = a.word + " " + s;
        }
        return s;
    }
};

var text = "This is a text with some duplicates: words, sentences of different length. For example here is a duplicate word. This sentence has some duplicates. But not all of us can find clones.";

var T = new SuffixTree(text);
var h = function(s, c) {
    document.write(s + "[" + c + "]<br/>");
};
T.duplicates(h);

1) Split the input text to the array of words. 2) Build the suffix tree. 3) Find the longest suffixes of the tree. 4) Remove sentences that are contained in others (i.e. remove "is" that is a part of "this is a").

You can change the regular expression to take into account html tags.

I hope this helps you.

P.S. h is the callback for found duplicates.

like image 28
artyom.stv Avatar answered Sep 30 '22 13:09

artyom.stv