Ok, I can't be the only one with this problem that seems to go on forever and ever.
We use browser-based html editors (a few different ones) - each one has its own 'paste-from-word' button that works great.
However, many of our users just paste in directly from word into the design area. For us, this results in the end-of-the-world -- sometimes it breaks javascript or other things too numerous to mention.
So, I have decided to write a simple function that searches the content of the source code and, if it sees any bad characters that are associated with a Microsoft Word paste, reports back 'this looks like a ms-word paste'. Currently, it looks like this:
/**
 * Detect potentially bad content - usually the result of pasting directly
 * from MS Word into the editor.
 *
 * Relies on two helpers that are defined outside this snippet: `$j`
 * (jQuery) to read the element's value, and `$E`, which wraps the string
 * in an object exposing a `contains()` method.
 *
 * @param {*} in_element - Anything `$j()` accepts; its `.val()` is scanned.
 * @returns {boolean} true as soon as one known Word marker is found,
 *     false when none of them occurs.
 */
function hasInvalidChars ( in_element ) {
    var src = $j(in_element).val();
    var e = $E(src); // enhanced string
    // Substrings that are associated with a Word paste.
    var bad = [
        "mso-list:", "class=\"Mso", "</o:p>", "[if !supportLists]",
        "style=\"mso-", "mso-bidi", "“", "”", "<v:shapetype", "<v:path",
        "file:///"
    ];
    // The counter is declared locally: an undeclared `i` would leak into
    // the global scope and throws a ReferenceError in strict mode.
    var i;
    for ( i = 0; i < bad.length; i++ ) {
        if ( e.contains(bad[i]) ) {
            return true;
        }
    }
    return false;
}
Note that if you try running the code, it won't work because (1) I use jQuery and (2) I have a special object ( $E ) that adds a bunch of stuff to a string, one being a 'contains()' function, but you get the idea of what it's doing.
What I am looking for are the array elements that belong in the 'bad[]' array. I have come up with a preliminary list (which may or may not be a good starting point), but I am asking you experts out there - please - can you tell me which characters or phrases you would put here? At this point, if I could catch 80% of the issues, I would be ecstatic.
Thanks.
Using Right-Click Menus Web browsers each offer context-sensitive, right-click menus, and these menus include cut, copy, paste, and select all commands. Right-click within the HTML editor and look for cut, copy, and paste in the context menu that opens.
To use Notepad as a bridge, you would first copy the text from your Word document and then paste it into Notepad. Then, copy your text from Notepad and paste it into your web page.
Recently I was looking for a similar thing, and after a bit of googling I found a nice function. It has most of the bad tags listed. Here is the link which contains that function:
Javascript Function
<script type="text/javascript" runat="server" language="javascript">
/**
 * Strips Microsoft Word specific markup from an HTML fragment.
 *
 * The clean-up is a fixed sequence of regular-expression replacements
 * applied in order: Word's <o:p> markers and mso-* style declarations are
 * removed, then class/style/lang attributes, namespaced tags, heading tags
 * and empty elements, and finally <P> and <FONT> elements are rewritten as
 * <div> elements.
 *
 * @param {string} str - HTML source to clean. null/undefined is treated as
 *     an empty string; any other value is converted with String().
 * @returns {string} The cleaned HTML.
 */
function CleanWordHTML( str )
{
    var html = ( str === null || str === undefined ) ? "" : String( str ) ;
    var level ;
    var pass ;

    // Word paragraph markers.
    html = html.replace( /<o:p>\s*<\/o:p>/g, "" ) ;
    html = html.replace( /<o:p>.*?<\/o:p>/g, " " ) ;

    // Word-specific and layout-related inline style declarations.
    html = html.replace( /\s*mso-[^:]+:[^;"]+;?/gi, "" ) ;
    html = html.replace( /\s*MARGIN: 0cm 0cm 0pt\s*;/gi, "" ) ;
    html = html.replace( /\s*MARGIN: 0cm 0cm 0pt\s*"/gi, "\"" ) ;
    html = html.replace( /\s*TEXT-INDENT: 0cm\s*;/gi, "" ) ;
    html = html.replace( /\s*TEXT-INDENT: 0cm\s*"/gi, "\"" ) ;
    html = html.replace( /\s*TEXT-ALIGN: [^\s;]+;?"/gi, "\"" ) ;
    html = html.replace( /\s*PAGE-BREAK-BEFORE: [^\s;]+;?"/gi, "\"" ) ;
    html = html.replace( /\s*FONT-VARIANT: [^\s;]+;?"/gi, "\"" ) ;
    html = html.replace( /\s*tab-stops:[^;"]*;?/gi, "" ) ;
    html = html.replace( /\s*tab-stops:[^"]*/gi, "" ) ;

    // Font face attributes and font-family declarations.
    html = html.replace( /\s*face="[^"]*"/gi, "" ) ;
    html = html.replace( /\s*face=[^ >]*/gi, "" ) ;
    html = html.replace( /\s*FONT-FAMILY:[^;"]*;?/gi, "" ) ;

    // class and style attributes, then any style attribute left empty.
    html = html.replace( /<(\w[^>]*) class=([^ |>]*)([^>]*)/gi, "<$1$3" ) ;
    html = html.replace( /<(\w[^>]*) style="([^\"]*)"([^>]*)/gi, "<$1$3" ) ;
    html = html.replace( /\s*style="\s*"/gi, '' ) ;

    // Whitespace-only and empty SPANs, lang attributes, attribute-less wrappers.
    html = html.replace( /<SPAN\s*[^>]*>\s* \s*<\/SPAN>/gi, ' ' ) ;
    html = html.replace( /<SPAN\s*[^>]*><\/SPAN>/gi, '' ) ;
    html = html.replace( /<(\w[^>]*) lang=([^ |>]*)([^>]*)/gi, "<$1$3" ) ;
    html = html.replace( /<SPAN\s*>(.*?)<\/SPAN>/gi, '$1' ) ;
    html = html.replace( /<FONT\s*>(.*?)<\/FONT>/gi, '$1' ) ;

    // XML declarations and namespaced tags such as <o:...> or <v:...>.
    html = html.replace( /<\\?\?xml[^>]*>/gi, "" ) ;
    html = html.replace( /<\/?\w+:[^>]*>/gi, "" ) ;

    // Headings: drop empty ones, drop the opening tags one level at a time,
    // and turn each closing tag into a line break.
    html = html.replace( /<H\d>\s*<\/H\d>/gi, '' ) ;
    for ( level = 1 ; level <= 6 ; level++ )
    {
        html = html.replace( new RegExp( "<H" + level + "([^>]*)>", "gi" ), '' ) ;
    }
    html = html.replace( /<\/H\d>/gi, '<br>' ) ; //remove this to take out breaks where Heading tags were

    html = html.replace( /<(U|I|STRIKE)> <\/\1>/g, ' ' ) ;
    // Was /<(B|b)> <\/\b|B>/g: the top-level "|" made "B>" an alternative of
    // its own, so every "B>" in the document was deleted (<B>x</B> became
    // "<x</"). Only a bold element that holds a single space is removed.
    html = html.replace( /<b> <\/b>/gi, '' ) ;

    // Remove empty elements. Three passes, so that elements which become
    // empty after their empty child was removed are caught as well.
    for ( pass = 0 ; pass < 3 ; pass++ )
    {
        html = html.replace( /<([^\s>]+)[^>]*>\s*<\/\1>/g, '' ) ;
    }

    // Turn paragraphs into divs. The lookahead requires the tag name to end
    // right after "P"; without it "<pre>" or "<param>" was taken for a "<p".
    html = html.replace( /(<P)(?=[\s>])([^>]*>.*?)(<\/P>)/gi, "<div$2</div>" ) ;

    // Remaining FONT elements (those with attributes) become divs as well.
    html = html.replace( /(<font|<FONT)([^*>]*>.*?)(<\/FONT>|<\/font>)/gi, "<div$2</div>" ) ;

    // Was /size|SIZE = ([\d]{1})/g: the top-level "|" deleted the word
    // "size" from all text and left "=3" behind in tags. Only a numeric
    // size attribute inside a tag is removed.
    html = html.replace( /(<[^>]*?)\s+size\s*=\s*["']?[+-]?\d+["']?/gi, "$1" ) ;

    return html ;
}
</script>
http://www.1stclassmedia.co.uk/developers/clean-ms-word-formatting.php
all credits to original author.
Tinymce has a flag for this when pasting.
You should have a look at the paste_preprocess setting of the paste plugin.
Here you can access the pasted content using 'o' and find out if the paste came from Word. Example:
paste_preprocess : function(pl, o) { //if(console) console.log('content', o); if (o.wordContent ) { alert('paste from WORD detected!!!'); } ... },
I am using a special function to get rid of unwanted tags (I was not that happy about the default way TinyMCE handles this - so I wrote my own).
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With