I'm using a NicEdit rich text editor (content editable div based) in my application and users love to paste from word.
So I wanted to strip out any and all junk tags that might get pasted in.
Here is what I'm currently doing.
//build regex to match junk tags
var unwantedtags = [ "font", "span", "table", "tbody", "div", "td", "tr", "input", "a",
"body", "button", "form", "head", "img", "select", "textarea", "tfoot", "th", "iframe", "object" ];
var unwantedregexstring= "";
$.each(unwantedtags, function(index, value) {
if(unwantedregexstring!= "") {
unwantedregexstring += "|";
}
unwantedregexstring+= "<" + value + ">";
unwantedregexstring+= "|";
unwantedregexstring+= "<" + value + "\\s[^>]*>";
unwantedregexstring+= "|";
unwantedregexstring+= "</" + value + ">";
});
var unwantedRegex = new RegExp(unwantedregexstring, "igm");
//replace junk tags with nothing
function CleanMSWordPaste(mswordtext) {
return mswordtext.replace(unwantedRegex, "");
}
//Function that gets Executed on Paste event
function ExecutePaste(){
//preserve user's selected text
var oldRng = document.selection.createRange();
//create paste area off screen and paste there
$('body').append("<div id='paster' contenteditable='true' style='height:1px;width:1px;position:fixed;left:-100px;top:-100px;'></div>");
$('#paster').focus();
$('#paster')[0].document.execCommand('paste', null, null);
//if html contains junk tags
if(unwantedRegex.test($('#paster').html())) {
//replace html with cleaned html
$('#paster').html(CleanMSWordPaste($('#paster').html()));
//select all content of paste area
var rng = document.body.createTextRange();
rng.moveToElementText($('#paster')[0]);
rng.select();
//copy cleaned html
$('#paster')[0].document.execCommand('copy', null, null);
}
//remove paste area from dom
$('#paster').remove();
//restore user's selected text
oldRng.select();
//preserves scroll position, focuses NicEditor and performs doc.execCommand('paste')
//performance of this alone is fine.
ExecCommand('paste');
}
I'm finding that this is taking quite a long time (ex 1 page of text from word). Is there anything I can do to speed this up? I'm thinking some sort of regex optimization but I don't really have any knowledge of how regexes work in the first place.
It seems that your unwantedregexstring will end up looking something like this:
'<font>|<font\s[^>]*>|</font>|<span>|<span\s[^>]*>|</span>|...'
I'm no expert in regexp engine internals, but that looks a bit overly verbose to me. What if you change your algorithm so that unwantedregexstring looks like this instead?
'</?(font|span|...)\s?.*?>'
That will look for a < followed by an optional / followed by one of your specified tags followed by an optional whitespace character followed by zero or more but as few as possible of any character, until the closing > is encountered.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With