I have some escaped HTML like this:
&lt;img border=&#039;0&#039; /&gt;
I'm trying to match and replace full escape sequences like &#039; but not partial ones like 39, since 39 is not actually in the unescaped string. Essentially, each escape sequence should be treated like a single token.
This is a JS regex. Is there a way to exclude matches between & and ; while still accepting sequences that include both of those characters?
Desired results:
&lt;img border=&#039;0&#039; /&gt; for lt: No match.
&lt;img border=&#039;0&#039; /&gt; for 39: No match.
&lt;img border=&#039;0&#039; /&gt; for &#039;: Match.
&lt;img border=&#039;0&#039; /&gt; for border=&#039;: Match.
Current code:
> var str = '&lt;img border=&#039;0&#039; /&gt;'
> str.replace(/(border)/gi, '|$1|')
'&lt;img |border|=&#039;0&#039; /&gt;' // ok
> str.replace(/(39)/gi, '|$1|')
'&lt;img border=&#0|39|;0&#0|39|; /&gt;' // not ok
Note: I can't unescape and then re-escape to match. It has to be escaped.
The OP wants a JavaScript regex to match and replace a string within escaped HTML while treating each escape sequence (e.g. &lt;, &#39;, or &#x27;) as a single character, without unescaping the HTML string during the replacement process.
This means that replacing
1. "lt" with "[lt]" in "&lt; lt" would result in "&lt; [lt]" (avoid match within entity)
2. "&lt;" with "[&lt;]" in "&lt; lt" would result in "[&lt;] lt" (match entity)
3. "&l" with "[&l]" in "&lt; &lt" would result in "&lt; [&l]t" (not match partial entity)
4. "t;" with "[t;]" in "&lt; lt;" would result in "&lt; l[t;]" (not match partial entity)
5. "&lt; l" with "[&lt; l]" in "&lt; lt" would result in "[&lt; l]t" (match including entity)
6. "lt; &l" with "[lt; &l]" in "&lt; &lt;" would result in "&lt; &lt;" (not match partial entity)
7. "t; &lt;" with "[t; &lt;]" in "lt; &lt;" would result in "l[t; &lt;]" (match including entity)
8. "t; &lt" with "[t; &lt]" in "lt; &lt;" would result in "lt; &lt;" (not match partial entity)
With the following regex for capturing escaped sequences (e.g. &lt;, &#39;, or &#x27;),
/&[a-z]+;|&#x[a-f\d]+;|&#\d+;/gi
we may use the following function as a starting point that handles most of the cases above (#1, #2, #4, #5, and #7):
/**
 * Replaces occurrences of `searchFor` in an HTML-escaped string while
 * passing complete character references through untouched.
 *
 * The search is global and case-insensitive. `replacement` is inserted
 * verbatim (it is returned from a replacer callback, so `$`-patterns are
 * not expanded).
 *
 * @param {string} searchFor   Literal text to look for.
 * @param {string} replacement Text substituted for every match.
 * @param {string} str         The escaped input string.
 * @returns {string} The string with all matches replaced.
 */
function searchAndReplace(searchFor, replacement, str) {
  // Second alternative: swallow a whole entity so the search text can never
  // start matching in the middle of one.
  var entityAlternative = "|(&[a-z]+;|&#x[a-f\\d]+;|&#\\d+;)"; // consume entities
  var matcher = new RegExp(prepare(searchFor) + entityAlternative, "gi");

  return str.replace(matcher, function (whole, entity) {
    if (entity) {
      // The entity branch matched: put the entity back unchanged.
      return entity;
    }
    return replacement;
  });
}
/**
 * Makes a literal string safe to embed in a regular-expression source.
 *
 * @param {string} str Text to be matched literally.
 * @returns {string} `str` with a backslash placed before every character
 *   that is neither a word character nor whitespace.
 */
function prepare(str) {
  // escape regex metachars [1]
  var escaped = str.replace(/[^\w\s]/g, "\\$&");
  return escaped;
}
// [1] from http://eloquentjavascript.net/09_regexp.html#h_Rhu25fogrG
The remaining cases (#3, #6, #8) involve a potential partial escaped sequence at the end of the search string.
A solution for this is to check the searchFor
string for potential partial escaped sequences at the end and append a corresponding negated lookahead (?!)
to prevent matching a valid escaped sequence. The full solution (passing a set of about 40 test cases) is shown below, and should be faster and less complex than an .exec()
approach:
/**
 * Global, case-insensitive search-and-replace over an HTML-escaped string
 * in which every complete character reference is kept intact.
 *
 * The regex tries the prepared search text first and, failing that, a
 * whole entity; entities matched by the second branch are written back
 * as-is, so only genuine hits are replaced.
 *
 * @param {string} searchFor   Literal text to look for.
 * @param {string} replacement Text inserted verbatim for each hit.
 * @param {string} str         The escaped input string.
 * @returns {string} The resulting string.
 */
function searchAndReplace(searchFor, replacement, str) {
  var source = prepare(searchFor) + "|(&[a-z]+;|&#x[a-f0-9]+;|&#\\d+;)";
  var pattern = new RegExp(source, "gi");

  // `entity` is only defined when the entity branch matched.
  function keepEntityOrReplace(m, entity) {
    return entity ? entity : replacement;
  }

  return str.replace(pattern, keepEntityOrReplace);
}
/**
 * Converts a literal search string into a regex source fragment for use in
 * `searchAndReplace`.
 *
 * Besides escaping regex metacharacters, it guards against the search text
 * ending in what could be the beginning of a character reference: in that
 * case a negative lookahead is appended so the fragment does not match when
 * the text that follows would complete a reference (e.g. searching for
 * "&l" must not match the start of "&lt;").
 *
 * The fragment is meant to be compiled with the `i` flag, so the lookahead
 * character classes are written in lower case only.
 *
 * @param {string} str Literal text to search for.
 * @returns {string} Escaped regex source, optionally followed by a lookahead.
 */
function prepare(str) {
  var add = "";
  if (/&$/.test(str)) {
    // Bare "&": any kind of reference could follow. Hex digits are limited
    // to [a-f\d] to agree with the entity pattern used by the caller.
    add = "(?!#x[a-f\\d]+;|#\\d+;|[a-z]+;)";
  } else if (/&[a-z]+$/i.test(str)) {
    // Partial named reference, e.g. "&l"
    add = "(?![a-z]*;)";
  } else if (/&#$/.test(str)) {
    // "&#": a decimal or hexadecimal numeric reference could follow
    add = "(?!x[a-f\\d]+;|\\d+;)";
  } else if (/&#x$/i.test(str)) {
    // "&#x" / "&#X": case-insensitive because the final regex is too
    add = "(?![a-f\\d]+;)";
  } else if (/&#x[a-f\d]+$/i.test(str)) {
    // Partial hexadecimal reference, e.g. "&#x7"
    add = "(?![a-f\\d]*;)";
  } else if (/&#\d+$/.test(str)) {
    // Partial decimal reference, e.g. "&#3" must not match inside "&#39;"
    add = "(?!\\d*;)";
  }
  // Backslash-escape everything that is not a word character or whitespace.
  return str.replace(/[^\w\s]/g, "\\$&") + add;
}
// test function
/**
 * Runs one search-and-replace case and logs the outcome.
 *
 * Logs "<searchFor>: Passed" when the result equals `expected`, otherwise
 * "<searchFor>: Failed: <expected>,<result>".
 *
 * @param {string} searchFor   Text to search for.
 * @param {string} replacement Replacement text.
 * @param {string} str         Input string.
 * @param {string} expected    Expected output of searchAndReplace.
 */
function test(searchFor, replacement, str, expected) {
  var actual = searchAndReplace(searchFor, replacement, str);
  var verdict;
  if (actual === expected) {
    verdict = "Passed";
  } else {
    // Array-to-string conversion yields "expected,actual"
    verdict = "Failed: " + [expected, actual];
  }
  console.log(searchFor + ": " + verdict);
}
// test cases
// Each call runs searchAndReplace(searchFor, replacement, str) through the
// test() helper and logs "Passed" or "Failed" with the expected and actual
// values.
// NOTE(review): the string literals below look as if their HTML entities were
// decoded at some point (raw "<" and "'" where the surrounding text talks
// about escaped sequences, a U+FFFD character, empty search strings), so
// several cases probably no longer exercise entity handling as intended —
// confirm against the original test data before relying on them.
test("lt", "[lt]", "<img border='0' />", "<img border='0' />");
test("39", "[39]", "<img border='0' />", "<img border='0' />");
test("'", "[']", "<img border='0' />", "<img border=[']0['] />");
test("border='", "[border=']", "<img border='0' />", "<img [border=']0' />");
test("39&", "[39&]", "39<img border=39'>>&' t; 0'&39; />", "39<img border=39'>>&' t; 0'&39; />")
test("0&#", "[0&#]", "39<img border=39'>>&' t; 0'&39; />", "39<img border=39'>>&' t; 0'&39; />")
test("lt", "[]", "<<t;t&l", "&[]<t;t&l");
test("<", "[]", "<<t;t&l", "<[]t;t&l");
test("&l", "[]", "<<t;t&l", "[]t<t;t[]");
test("t;", "[]", "<<t;t&l", "<<[]t&l");
test("t&", "[]", "<<t;t&l", "<<t;[]l");
test("<t", "[]", "<<t;t&l", "<[];t&l");
test("t<", "[]", "<<t;t&l", "&l[]t;t&l");
test("t;t", "[]", "<<t;t&l", "<<[]&l");
test("t&l", "[]", "<<t;t&l", "<<t;[]");
test("39", "[]", "''9;9&#", "�[]'9;9&#");
test("'", "[]", "''9;9&#", "'[]9;9&#");
test("&", "[]", "''9;9&#", "[]#039'9;9[]#");
test("&#", "[]", "''9;9&#", "[]039'9;9[]");
test("9;", "[]", "''9;9&#", "''[]9&#");
test("9&", "[]", "''9;9&#", "''9;[]#");
test("'9", "[]", "''9;9&#", "'[];9&#");
test("9'", "[]", "''9;9&#", "[]9;9&#");
test("9;9", "[]", "''9;9&#", "''[]&#");
test("9&#", "[]", "''9;9&#", "''9;[]");
test("x7", "[]", "f;f&#x", "&#[]ff;f&#x");
test("", "[]", "f;f&#x", "[]f;f&#x");
test("&", "[]", "f;f&#x", "[]#x7ff;f[]#x");
test("&#", "[]", "f;f&#x", "[]x7ff;f[]x");
test("&#x", "[]", "f;f&#x", "[]7ff;f[]");
test("", "[]", "f;f&#x", "[]ff;f&#x");
test("f;", "[]", "f;f&#x", "[]f&#x");
test("f&", "[]", "f;f&#x", "f;[]#x");
test("f", "[]", "f;f&#x", "[];f&#x");
test("f", "[]", "f;f&#x", "[]f;f&#x");
test("f;f", "[]", "f;f&#x", "[]&#x");
test("f&#", "[]", "f;f&#x", "f;[]x");
test("f&#x", "[]", "f;f&#x", "f;[]");
test("t; < lt &l", "[]", "< < lt <lt; < lt <", "< < lt <l[]t");
If you like what we do, you can donate to us via PayPal or buy us a coffee so we can keep maintaining and growing! Thank you!
Donate Us With