insert html elements into string of text to match another string of html

Tags:

I have two files, a PDF and an HTML file. I read both in as strings (plain text extracted from the PDF, and the raw HTML), and I am now trying to give the plain text the same HTML tags as the HTML string, so that I can then compare the two and find the differences.

Final Edit of simple example not currently working

// Plain text to be decorated (in the question this is the text extracted from the PDF).
var text1="here is example text";

// Reference markup whose tags should end up wrapped around the words of text1.
var text2="<html><body><div>here is another <span>example</span> text</div></body></html>";

// Parse the markup in a detached element and read back only its text,
// falling back from textContent to innerText to an empty string.
var div = document.createElement("div");
div.innerHTML = text2;
var text = div.textContent || div.innerText || "";

// content:  the words of the HTML's text, split on single spaces.
// alltags:  every "<...>" run found in text2 (lazy match), in document order.
//           NOTE: String.prototype.match returns null when nothing matches,
//           in which case `alltags.length` below would throw.
// pdfwords: the words of the plain text, split on single spaces.
var content=  text.split(" ");
var alltags=text2.match(/<.+?>/g);
var pdfwords=text1.split(" ");
var output="";
// NOTE(review): the outer index `j` runs over alltags but is used to index
// `content`, while the inner index `i` runs over pdfwords but is used to index
// `alltags`. Tags and words are therefore paired by unrelated positions, which
// looks like the reason the result does not match the expected output.
for(var j=0; j<alltags.length; j++){
   // NOTE: `i` is not declared with var, so it becomes an implicit global.
   for(i=0; i<pdfwords.length; i++){
      if(pdfwords[i]===content[j]){

         // Only opening-side pairing is attempted here: one tag is prepended
         // to each matched word; nothing is appended after the word.
         output+=alltags[i]+pdfwords[i];
      }
    }
}

// Writes the assembled string into the current document.
document.write(output);

output should be

"<html><body><div>here is another<span>example</span> text</div></body></html>"

Diffing the two strings, output and text2, should then show that the only difference is the inserted word "another".

845

asked May 16 '16 23:05

0101

2 Answers

This is a simple solution for what you want. It is dynamic: it handles whatever tags it finds and compares only the text content. findDiff() finds the differences and calls the callback function with the output and an array of the differing words as parameters.

JSFiddle: https://jsfiddle.net/9svuc7om/18/

/**
 * Parse and construct an Array of PDF text tokens.
 *
 * The text is split on single space characters. The empty strings that
 * `split` produces for a leading, trailing or repeated space are dropped, so
 * the result holds only the non-empty words, in their original order.
 *
 * @param {string} text   The PDF text to be parsed
 * @return {string[]}     The non-empty, space-delimited tokens
 */
function parsePDFText(text) {
    // Filter instead of splicing inside an index loop: splicing shifts the
    // following elements left, so the element right after a removed one was
    // never inspected and runs of spaces left empty tokens behind.
    return text.split(' ').filter(function (word) {
        return word !== '';
    });
}

/**
 * Return the minimum indexOf result among all the arguments.
 *
 * Arguments equal to -1 (the "not found" value of indexOf) are ignored.
 *
 * @param {...number} index  Results of indexOf() calls
 * @return {number}          The smallest argument that is not -1, or -1 when
 *                           there are no arguments or all of them are -1
 */
function findMinIndex() {
    // -1 doubles as the "nothing found yet" marker; it can never be stored as
    // a real minimum because -1 arguments are skipped below.
    var min = -1;
    for (var i = 0, l = arguments.length; i < l; i++) {
        var candidate = arguments[i];
        // indexOf() returns -1 if not found
        if (candidate === -1) {
            continue;
        }
        if (min === -1 || candidate < min) {
            min = candidate;
        }
    }
    // Return the value itself rather than `min || -1`, which would turn a
    // legitimate minimum of 0 into -1.
    return min;
}

/**
 * Parse and construct an Array of HTML tokens.
 *
 * The input is cut into three kinds of tokens, kept in source order:
 *   - 'tag'   : everything from a '<' up to and including the next '>'
 *   - 'space' : a single space character
 *   - 'text'  : a run of characters delimited by '<', ' ' or the end of input
 * Every token is created with `valid: true`.
 *
 * @param {string} text   The HTML text to be parsed
 * @return {Array<{type: string, content: string, valid: boolean}>}
 *                        The parsed Array of tokens
 */
function parseHTMLText(text) {
    var currentIndex = 0,
        tl = text.length,
        tokens = [],
        token, firstChar, endPos;
    while (currentIndex < tl) {
        // determine the next token type
        firstChar = text.charAt(currentIndex);
        if (firstChar === '<') {
            // a tag: find the position just past its closing '>'
            endPos = text.indexOf('>', currentIndex + 1);
            // An unterminated tag has no '>'. Consume the rest of the input as
            // the tag so the cursor always moves forward; computing `-1 + 1`
            // would reset it to 0 and the loop would never finish.
            endPos = (endPos === -1) ? tl : endPos + 1;
            token = {
                type: 'tag',
                content: text.slice(currentIndex, endPos),
                valid: true
            };
            currentIndex = endPos;
        } else if (firstChar === ' ') {
            // a space
            token = {
                type: 'space',
                content: ' ',
                valid: true
            };
            currentIndex++;
        } else {
            // a character, possibly part of a word
            // find the end of the word
            // assume a word is delimited either by tags or space
            endPos = findMinIndex(text.indexOf('<', currentIndex), text.indexOf(' ', currentIndex));
            // endPos is `-1` if there is no delimiter anymore, end of string reached
            if (endPos === -1) {
                endPos = tl;
            }
            token = {
                type: 'text',
                content: text.slice(currentIndex, endPos),
                valid: true
            };
            currentIndex = endPos;
        }
        tokens.push(token);
    }
    return tokens;
}

/**
 * Find the difference between pdf text and html text and pass the output and
 * the differences to a callback function.
 *
 * The PDF words are the reference: they are matched, in order, against the
 * text tokens of the HTML. Every HTML text token that is passed over while
 * looking for the next PDF word, or that is left over after the last PDF word,
 * is recorded in `diff` and left out of `output`. Tag and space tokens are
 * always kept.
 *
 * @param {string} pdfText     The pdf text
 * @param {string} htmlText    The html text
 * @param {function(string, string[])} callback
 *        Called once, synchronously, with the rebuilt HTML string and the
 *        array of HTML words that had no counterpart in the pdf text
 */
function findDiff(pdfText, htmlText, callback) {
    var output = '', // the final output
        diff = [], // the array of different words
        pdfTokens = parsePDFText(pdfText),
        htmlTokens = parseHTMLText(htmlText),
        j = 0, hl = htmlTokens.length;
    // the pdf text is the reference point, i.e. all the words in pdf text
    // should always be present in html text as well
    for (var i = 0, pl = pdfTokens.length; i < pl; i++) {
        // find the first occurrence of the pdf word; `j` is shared across
        // iterations so the html tokens are only ever scanned forward
        for (; j < hl; j++) {
            if (htmlTokens[j].type !== 'text') {
                // exclude comparison to non-text
                continue;
            }
            if (htmlTokens[j].content === pdfTokens[i]) {
                // a match is found: step past it and move to the next pdf word
                j++;
                break;
            }
            // no match: report the html word and drop it from the output
            diff.push(htmlTokens[j].content);
            htmlTokens[j].valid = false;
        }
    }
    // the remaining html words have no pdf counterpart either: report them as
    // differences too (previously they were dropped from the output silently)
    for (; j < hl; j++) {
        if (htmlTokens[j].type === 'text') {
            diff.push(htmlTokens[j].content);
            htmlTokens[j].valid = false;
        }
    }
    // concat the surviving tokens into the final string
    for (j = 0; j < hl; j++) {
        if (htmlTokens[j].valid) {
            output += htmlTokens[j].content;
        }
    }
    callback(output, diff);
}

And you can call the function by using

// Example invocation: the callback receives the rebuilt HTML string first and
// the list of differing words second, and prints each to the console.
findDiff(text1, text2, function logResult(output, diff) {
    console.log(output);
    console.log(diff);
});

However, there are some limitations in this solution

It assumes all the content in pdf are present in the HTML text
It only handles <> and space; if there are other possible delimiters, e.g. tabs, extra code is needed
It assumes all tags are well-formed, and that no tag delimiter characters appear inside the text content (if you need them there, use &gt; and &lt; instead)
The function is a simplified solution and is not fully tested. You cannot expect any warranty from it, and some adaptations are needed. I would suggest providing only the content inside body, or an even narrower range, instead of the whole HTML file (if that is possible in your case), because there are too many variations in the content of an HTML file.

146

answered Oct 08 '22 12:10

Calvin Lau

The easiest way is

// Source sentence and its space-separated words.
var s = "Hello everyone on stackoverflow";
var s_split = s.split(' ');
// Assemble the markup piece by piece: the second word is wrapped in a <span>,
// and a single space is placed between the third and fourth words only.
var y = [
    '<html><head></head><body><div>',
    s_split[0],
    '<span>', s_split[1], '</span>',
    s_split[2], ' ', s_split[3],
    '</div></body></html>'
].join('');

Check the jsfiddle

answered Oct 08 '22 13:10

lhrec_106

Related questions
                            
                                Launch app from link, if no app then go to download app from web
                            
                                paper stack (css3) in angular material design
                            
                                Linkedin Oauth Javascript authorization "uh oh!"
                            
                                Error messages and console logs in Electron?
                            
                                How to increase size of pie segment on hover in d3
                            
                                Python Flask calling functions using buttons
                            
                                How to get path variable in express(node.js)
                            
                                Why does ECMA script offer no integer type out of the box?
                            
                                HTML1506: Unexpected token <script>
                            
                                Nodejs and express server closes connection after 2 minutes
                            
                                what is slow parameter in mocha?
                            
                                Calling External API with Javascript
                            
                                jquery .ready() equivalent in d3js?
                            
                                Chrome - Detect browser close or tab close
                            
                                Export leaflet map to geojson
                            
                                How to show different value of input element with ng-model?
                            
                                Why should I use &amp; instead of &?
                            
                                Is there a simple CRUD example app using react.js and firebase?
                            
                                Why array.indexOf(undefined) doesn't work if array is sparse
                            
                                Why are javascript promises asynchronous when calling only synchronous functions?

Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With

insert html elements into string of text to match another string of html

Tags:

javascript

html

jquery

css

compare

0101

People also ask

2 Answers

Calvin Lau

lhrec_106

Recent Activity

Donate For Us