I have two files, a PDF and an HTML file. I read both in as strings (plain text extracted from the PDF, and the raw HTML), and I am now trying to give the plain text the same HTML tags as the HTML string, so that I can then compare the two strings to find the differences.
Final edit: a simple example that is not currently working
// Question's minimal example (reported by the asker as not working): it tries to
// re-attach the tags found in the HTML string `text2` to the plain-text words of `text1`.
var text1="here is example text";
var text2="<html><body><div>here is another <span>example</span> text</div></body></html>";
// Load the HTML into a scratch <div> so its text content can be read without the markup.
var div = document.createElement("div");
div.innerHTML = text2;
// Text of the HTML; `innerText` and "" are fallbacks used when `textContent` is falsy.
var text = div.textContent || div.innerText || "";
// Space-separated words of the HTML's text content.
var content= text.split(" ");
// Every `<...>` tag in the HTML string, opening and closing alike (non-greedy match).
var alltags=text2.match(/<.+?>/g);
// Space-separated words of the PDF-extracted text.
var pdfwords=text1.split(" ");
var output="";
// NOTE(review): the indices are crossed here — `j` iterates over the tags but is used to
// index the word list `content`, while `i` iterates over the PDF words but is used to
// index `alltags`. `i` is also never declared, so it is created as an implicit global.
for(var j=0; j<alltags.length; j++){
for(i=0; i<pdfwords.length; i++){
if(pdfwords[i]===content[j]){
// On a word match, append the tag at position `i` followed by the matching word.
output+=alltags[i]+pdfwords[i];
}
}
}
// Write the assembled string into the page.
document.write(output);
output should be
"<html><body><div>here is another<span>example</span> text</div></body></html>"
Diffing these two strings (output and text2) should then show the difference, i.e. that "another" was inserted.
The Element property innerHTML gets or sets the HTML or XML markup contained within the element. To insert the HTML into the document rather than replace the contents of an element, use the method insertAdjacentHTML().
The innerHTML property takes a string that specifies a valid combination of text and elements. When the innerHTML property is set, the given string completely replaces the existing content of the object.
The preg_match() function is the best option to extract text between HTML tags with REGEX in PHP. If you want to get content between tags, use regular expressions with preg_match() function in PHP. You can also extract the content inside element based on class name or ID using PHP.
To render the html string in react, we can use the dangerouslySetInnerHTML attribute which is a react version of dom innerHTML property. The term dangerously is used here to notify you that it will be vulnerable to cross-site scripting attacks (XSS).
This is a simple solution for what you want. It is dynamic: it handles any tags it finds and compares only the text content. The findDiff() function
finds the difference and calls the callback function with the output and an array of the differing words as parameters.
JSFiddle: https://jsfiddle.net/9svuc7om/18/
/**
 * Parse and construct an Array of PDF text tokens (words).
 *
 * The text is split on single space characters. The empty strings produced by
 * leading, trailing or consecutive spaces are all discarded, so the result
 * contains only non-empty words. Other whitespace (tabs, newlines) is NOT treated
 * as a delimiter.
 *
 * @params {string} text The PDF text to be parsed
 * @return {object} The parsed Array of tokens
 */
function parsePDFText(text) {
  // Filtering builds a fresh array, so no element can be skipped. Removing items
  // with splice() while walking forward by index skipped the element that slid
  // into the removed slot, which left '' tokens behind for runs of 3+ spaces.
  return text.split(' ').filter(function (word) {
    return word !== '';
  });
}
/**
 * Return the minimum indexOf among all the arguments.
 *
 * Arguments equal to -1 (the "not found" result of indexOf()) are ignored.
 * A minimum of 0 is a valid result and is returned as 0.
 *
 * @params {...number} index The indexOf
 * @return {number} The minimum indexOf, -1 if all arguments are -1 (or none are given)
 */
function findMinIndex() {
  // -1 doubles as the "nothing found yet" marker; it can never be stored as a
  // real minimum because -1 arguments are skipped below.
  var min = -1;
  for (var i = 0, l = arguments.length; i < l; i++) {
    // indexOf() returns -1 if not found
    if (arguments[i] === -1) {
      continue;
    }
    if (min === -1 || arguments[i] < min) {
      min = arguments[i];
    }
  }
  // Return the value itself: `min || -1` would wrongly turn a minimum of 0 into -1.
  return min;
}
/**
 * Parse and construct an Array of HTML tokens.
 *
 * Each token is an object `{ type, content, valid }` where `type` is one of:
 *   - 'tag'   : everything from a '<' up to and including the next '>'
 *   - 'space' : a single space character
 *   - 'text'  : a run of characters ended by the next '<', the next space, or
 *               the end of the string
 * `valid` is always initialised to true.
 *
 * Only the space character is treated as whitespace; tabs and newlines end up
 * inside 'text' tokens. A '<' that has no '>' anywhere after it is treated as
 * ordinary word text instead of a tag.
 *
 * @params {string} text The HTML text to be parsed
 * @return {object} The parsed Array of tokens
 */
function parseHTMLText(text) {
  var currentIndex = 0,
    tl = text.length,
    tokens = [],
    token, firstChar, endPos, closePos;
  while (currentIndex < tl) {
    // determine the next token type
    firstChar = text.charAt(currentIndex);
    // Look for the closing '>' only when we are standing on a '<'.
    closePos = firstChar === '<' ? text.indexOf('>', currentIndex + 1) : -1;
    if (closePos !== -1) {
      // a tag: runs up to and including the next '>'
      endPos = closePos + 1;
      token = {
        type: 'tag',
        content: text.slice(currentIndex, endPos),
        valid: true
      };
    } else if (firstChar === ' ') {
      // a space
      endPos = currentIndex + 1;
      token = {
        type: 'space',
        content: ' ',
        valid: true
      };
    } else {
      // A character that is part of a word, or a '<' that is never closed.
      // Without the closePos check above, an unclosed '<' made indexOf() return -1,
      // which reset the cursor to 0 and looped forever.
      // The search starts AFTER the current character so that an unclosed '<' is
      // consumed as part of the word; the word ends at the next '<' or space.
      endPos = findMinIndex(text.indexOf('<', currentIndex + 1), text.indexOf(' ', currentIndex + 1));
      // endPos is `-1` if there is no delimiter left, i.e. end of string reached
      if (endPos === -1) {
        endPos = tl;
      }
      token = {
        type: 'text',
        content: text.slice(currentIndex, endPos),
        valid: true
      };
    }
    // Every branch sets endPos > currentIndex, so the loop always advances.
    currentIndex = endPos;
    tokens.push(token);
  }
  return tokens;
}
/**
 * Find the difference between pdf text and html text and pass the output and
 * the differences to a callback function.
 *
 * The PDF words are matched, in order, against the text tokens of the HTML.
 * Every HTML word that is skipped while searching for the next PDF word, and
 * every HTML word left over once all PDF words have been matched, is dropped
 * from the output and recorded in the `diff` array. Tag and space tokens are
 * never dropped.
 *
 * Limitation: if a PDF word does not occur in the remaining HTML text, the
 * search consumes (and drops) every remaining HTML word.
 *
 * @params {string} pdfText The pdf text
 * @params {string} htmlText The html text
 * @params {function} callback The callback function, called as
 *   callback(output, diff): `output` is the HTML string without the dropped
 *   words, `diff` is the Array of dropped words in document order
 */
function findDiff(pdfText, htmlText, callback) {
  var output = '', // the final output
    diff = [], // the array of different words
    pdfTokens = parsePDFText(pdfText),
    htmlTokens = parseHTMLText(htmlText),
    j = 0, hl = htmlTokens.length;
  // the pdf text is the reference point, i.e. all the words in pdf text should
  // always be present in html text as well
  for (var i = 0, pl = pdfTokens.length; i < pl; i++) {
    // find the first occurrence of the pdf word; `j` is shared across iterations
    // so the search always resumes after the previous match
    for (; j < hl; j++) {
      if (htmlTokens[j].type !== 'text') {
        // exclude comparison to non-text
        continue;
      }
      // check if the two words match
      if (htmlTokens[j].content === pdfTokens[i]) {
        // a match is found; step past it before handling the next pdf word
        j++;
        break;
      } else {
        // push the different html token into `diff` array
        diff.push(htmlTokens[j].content);
        // set the `valid` field of token to false
        htmlTokens[j].valid = false;
      }
    }
  }
  // invalidate the rest of the html text; these words are differences too, so
  // they are reported in `diff` just like the words skipped above
  for (; j < hl; j++) {
    if (htmlTokens[j].type === 'text') {
      diff.push(htmlTokens[j].content);
      htmlTokens[j].valid = false;
    }
  }
  // concat the surviving tokens into the final output string
  for (j = 0; j < hl; j++) {
    if (htmlTokens[j].valid) {
      output += htmlTokens[j].content;
    }
  }
  callback(output, diff);
}
You can then call the function like this:
// Example usage: compare the two sample strings and log both results.
findDiff(text1, text2, function reportDiff(result, differentWords) {
  // `result` is the HTML with the differing words removed
  console.log(result);
  // `differentWords` lists the words that were removed
  console.log(differentWords);
});
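However, there are some limitations in this solution:
- Words are assumed to be delimited only by tags (`<` and `>`) and spaces; if there are other possible delimiters, e.g. tabs, extra code is needed.
- All tags are assumed to be well formed, so a literal `<` should not appear in the text (use the entity `&lt;` instead).
- It is advisable to compare only the `body`, or even a narrower range, instead of the whole HTML file (if that is possible in your case), because there will be too many variations in the content of an HTML file.

The easiest way is: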
// Split a fixed sentence into words and wrap them in a hard-coded HTML skeleton:
// word 1 directly inside the <div>, word 2 inside a <span>, words 3 and 4 after it.
var s = "Hello everyone on stackoverflow";
var s_split = s.split(' ');
var y = [
  '<html><head></head><body><div>',
  s_split[0],
  '<span>', s_split[1], '</span>',
  s_split[2], ' ', s_split[3],
  '</div></body></html>'
].join('');
Check the jsfiddle
If you love us, you can donate to us via PayPal or buy me a coffee so we can maintain and grow. Thank you!
Donate Us With