Regex to not match partial sequences, but match full ones

Question

I have some escaped HTML like this:

&lt;img border=&#039;0&#039; /&gt;

I'm trying to match and replace full escape sequences like &#039; but not partial ones, like 39, since 39 is not actually in the unescaped string. Essentially, each escape sequence should be treated like a single token.

This is a JS regex. Is there a way to exclude matches between & and ; while still accepting sequences that include both of those characters?

Desired results:

Search &lt;img border=&#039;0&#039; /&gt; for lt: No match.
Search &lt;img border=&#039;0&#039; /&gt; for 39: No match.
Search &lt;img border=&#039;0&#039; /&gt; for &#039;: Match.
Search &lt;img border=&#039;0&#039; /&gt; for border=&#039;: Match.

Current code:

> var str = '&lt;img border=&#039;0&#039; /&gt;'
> str.replace(/(border)/gi, '|$1|')
'&lt;img |border|=&#039;0&#039; /&gt;'  // ok
> str.replace(/(39)/gi, '|$1|')
'&lt;img border=&#0|39|;0&#0|39|; /&gt;'  // not ok

Note: I can't unescape and then re-escape to match. It has to be escaped.

Tomas Langkaas · Accepted Answer

The OP wants a JavaScript regex to match and replace a string within escaped HTML while treating escape sequences (e.g. &lt;, &#039;, or &nbsp;) as single characters, without unescaping the HTML string during the replacement process.

This means that replacing

1. "lt" with "[lt]" in "&lt; lt" would result in "&lt; [lt]" (avoid match within entity)
2. "&lt;" with "[&lt;]" in "&lt; lt" would result in "[&lt;] lt" (match entity)
3. "&l" with "[&l]" in "&lt; &lt" would result in "&lt; [&l]t" (not match partial entity)
4. "t;" with "[t;]" in "&lt; lt;" would result in "&lt; l[t;]" (not match partial entity)
5. "&lt; l" with "[&lt; l]" in "&lt; lt" would result in "[&lt; l]t" (match including entity)
6. "lt; &l" with "[lt; &l]" in "&lt; &lt" would result in "&lt; &lt" (not match partial entity)
7. "t; &lt;" with "[t; &lt;]" in "lt; &lt;" would result in "l[t; &lt;]" (match including entity)
8. "t; &lt" with "[t; &lt]" in "lt; &lt;" would result in "lt; &lt;" (not match partial entity)

With the following regex for capturing escaped sequences (e.g. &lt;, &#039;, or &nbsp;),

/&[a-z]+;|&#x[a-f\d]+;|&#\d+;/gi

we may use the following function as a starting point that handles most of the cases above (#1, #2, #4, #5, and #7):

/**
 * Replaces every occurrence of `searchFor` in an HTML-escaped string while
 * treating each complete escape sequence (`&name;`, `&#xHEX;`, `&#DEC;`) as an
 * indivisible token, so the search text never matches inside an entity.
 *
 * Matching is global and case-insensitive.
 *
 * @param {string} searchFor - Literal (still escaped) text to look for; it is
 *   passed through `prepare` before being embedded in the pattern.
 * @param {string} replacement - Text inserted verbatim for each match (no
 *   `$`-pattern expansion, because it is returned from a replacer function).
 * @param {string} str - The HTML-escaped string to search in.
 * @returns {string} `str` with the matches replaced and all entities intact.
 */
function searchAndReplace(searchFor, replacement, str) {
  // Taken from a regex literal on purpose: inside a plain string literal
  // "\d" collapses to "d", which silently stops numeric and hexadecimal
  // entities from being recognised.
  var entitySource = /&[a-z]+;|&#x[a-f\d]+;|&#\d+;/.source;

  // First alternative: the search text. Second alternative (captured):
  // any whole entity, consumed so that its interior can never be matched.
  var pattern = new RegExp(
    prepare(searchFor) + "|(" + entitySource + ")",
    "gi"
  );

  return str.replace(pattern, function (match, entity) {
    // `entity` is only defined when the consuming alternative matched;
    // in that case the entity is put back unchanged.
    return entity || replacement;
  });
}

/**
 * Escapes a literal search string so it can be embedded in a RegExp source.
 *
 * Every character that is neither a word character nor whitespace is
 * prefixed with a backslash.
 *
 * @param {string} str - Literal text to search for.
 * @returns {string} RegExp source that matches `str` literally.
 */
function prepare(str) {
  // "\\$&" is a backslash followed by the "$&" (matched text) pattern.
  // With a single backslash the string literal collapses to "$&" and
  // nothing gets escaped.
  return str.replace(/[^\w\s]/g, "\\$&"); // escape regex metachars [1]
}

// [1] from http://eloquentjavascript.net/09_regexp.html#h_Rhu25fogrG

The remaining cases (#3, #6, #8) involve a potential partial escaped sequence at the end of the search string.

A solution for this is to check the searchFor string for potential partial escaped sequences at the end and append a corresponding negated lookahead (?!) to prevent matching a valid escaped sequence. The full solution (passing a set of about 40 test cases) is shown below, and should be faster and less complex than an .exec() approach:

/**
 * Replaces every occurrence of `searchFor` in an HTML-escaped string without
 * ever matching inside an escape sequence (`&name;`, `&#xHEX;`, `&#DEC;`).
 *
 * The pattern is an alternation: the prepared search text first, then a
 * capturing group that consumes any whole entity. Matching is global and
 * case-insensitive.
 *
 * @param {string} searchFor - Literal (still escaped) text to look for; it is
 *   passed through `prepare` before being embedded in the pattern.
 * @param {string} replacement - Text inserted verbatim for each match.
 * @param {string} str - The HTML-escaped string to search in.
 * @returns {string} `str` with the matches replaced and all entities intact.
 */
function searchAndReplace(searchFor, replacement, str) {
  // Use a regex literal's source rather than a hand-written string: in a
  // string literal "\d" degrades to a plain "d", so "&#039;" would no
  // longer be recognised as an entity and "39" would match inside it.
  var entitySource = /&[a-z]+;|&#x[a-f0-9]+;|&#\d+;/.source;
  var pattern = new RegExp(
    prepare(searchFor) + "|(" + entitySource + ")",
    "gi"
  );

  return str.replace(pattern, function (m, entity) {
    // A defined `entity` means the consuming alternative matched: keep it.
    // Otherwise the search text matched and is swapped for the replacement.
    return entity || replacement;
  });
}

/**
 * Turns a literal search string into RegExp source that matches it literally
 * and, when the string ends in what could be the beginning of an HTML escape
 * sequence, refuses to match if that sequence is actually completed in the
 * text being searched.
 *
 * Example: "&l" gets the guard `(?![a-z]*;)`, so it matches in "&lt" but not
 * in "&lt;".
 *
 * @param {string} str - Literal (still escaped) text to search for.
 * @returns {string} RegExp source: the escaped text plus, if needed, a
 *   negative lookahead. Intended to be compiled with the `i` flag.
 */
function prepare(str) {
  // Each rule pairs a test for a trailing partial entity with the negative
  // lookahead that rejects the rest of a valid entity. The rules are
  // mutually exclusive. Backslashes are doubled because these are string
  // literals: a lone "\d" would collapse to "d".
  var rules = [
    // "...&"      : any entity may follow
    { tail: /&$/, guard: "(?!#x[a-f\\d]+;|#\\d+;|[a-z]+;)" },
    // "...&name"  : more letters and a ";" may follow
    { tail: /&[a-z]+$/i, guard: "(?![a-z]*;)" },
    // "...&#"     : a hex or decimal body may follow
    { tail: /&#$/, guard: "(?!x[a-f\\d]+;|\\d+;)" },
    // "...&#x"    : case-insensitive, since matching uses the i flag
    { tail: /&#x$/i, guard: "(?![a-f\\d]+;)" },
    // "...&#x7"   : more hex digits and a ";" may follow
    { tail: /&#x[a-f\d]+$/i, guard: "(?![a-f\\d]*;)" },
    // "...&#03"   : more decimal digits and a ";" may follow
    { tail: /&#\d+$/, guard: "(?!\\d*;)" }
  ];

  var add = "";
  for (var i = 0; i < rules.length; i++) {
    if (rules[i].tail.test(str)) {
      add = rules[i].guard;
      break;
    }
  }

  // "\\$&" = backslash + matched character; escapes every character that is
  // neither a word character nor whitespace.
  return str.replace(/[^\w\s]/g, "\\$&") + add;
}

// test function

/**
 * Runs one search-and-replace case and logs the outcome to the console.
 *
 * Logs "<searchFor>: Passed" when the result equals `expected`, otherwise
 * "<searchFor>: Failed: <expected>,<result>".
 *
 * @param {string} searchFor - Text to search for.
 * @param {string} replacement - Replacement text.
 * @param {string} str - Input string.
 * @param {string} expected - Expected output of `searchAndReplace`.
 */
function test(searchFor, replacement, str, expected) {
  var actual = searchAndReplace(searchFor, replacement, str);
  var verdict;
  if (actual !== expected) {
    // Comma-joined pair, expected value first.
    verdict = "Failed: " + [expected, actual].join(",");
  } else {
    verdict = "Passed";
  }
  console.log(searchFor + ": " + verdict);
}

// test cases

// Table-driven run of the test cases. Each row is
// [searchFor, replacement, str, expected] and is passed to test() in order.
(function () {
  // Inputs shared by several rows.
  var IMG = "&lt;img border=&#039;0&#039; /&gt;";
  var NOISY = "39&lt;img border=39&#039;&gt&gt&&#039 t; 0&#039;&39; /&gt;";
  var NAMED = "&lt&lt;t;t&l";
  var DEC = "&#039&#039;9;9&#";
  var HEX = "&#x7f&#x7f;f;f&#x";

  var cases = [
    // the question's example string
    ["lt", "[lt]", IMG, IMG],
    ["39", "[39]", IMG, IMG],
    ["&#039;", "[&#039;]", IMG, "&lt;img border=[&#039;]0[&#039;] /&gt;"],
    ["border=&#039;", "[border=&#039;]", IMG, "&lt;img [border=&#039;]0&#039; /&gt;"],
    // search text ending in a partial entity; output expected unchanged
    ["39&", "[39&]", NOISY, NOISY],
    ["0&#", "[0&#]", NOISY, NOISY],
    // named entity (&lt;) mixed with look-alike fragments
    ["lt", "[]", NAMED, "&[]&lt;t;t&l"],
    ["&lt;", "[]", NAMED, "&lt[]t;t&l"],
    ["&l", "[]", NAMED, "[]t&lt;t;t[]"],
    ["t;", "[]", NAMED, "&lt&lt;[]t&l"],
    ["t&", "[]", NAMED, "&lt&lt;t;[]l"],
    ["&lt;t", "[]", NAMED, "&lt[];t&l"],
    ["t&lt;", "[]", NAMED, "&l[]t;t&l"],
    ["t;t", "[]", NAMED, "&lt&lt;[]&l"],
    ["t&l", "[]", NAMED, "&lt&lt;t;[]"],
    // decimal entity (&#039;) mixed with look-alike fragments
    ["39", "[]", DEC, "&#0[]&#039;9;9&#"],
    ["&#039;", "[]", DEC, "&#039[]9;9&#"],
    ["&", "[]", DEC, "[]#039&#039;9;9[]#"],
    ["&#", "[]", DEC, "[]039&#039;9;9[]"],
    ["9;", "[]", DEC, "&#039&#039;[]9&#"],
    ["9&", "[]", DEC, "&#039&#039;9;[]#"],
    ["&#039;9", "[]", DEC, "&#039[];9&#"],
    ["9&#039;", "[]", DEC, "&#03[]9;9&#"],
    ["9;9", "[]", DEC, "&#039&#039;[]&#"],
    ["9&#", "[]", DEC, "&#039&#039;9;[]"],
    // hexadecimal entity (&#x7f;) mixed with look-alike fragments
    ["x7", "[]", HEX, "&#[]f&#x7f;f;f&#x"],
    ["&#x7f;", "[]", HEX, "&#x7f[]f;f&#x"],
    ["&", "[]", HEX, "[]#x7f&#x7f;f;f[]#x"],
    ["&#", "[]", HEX, "[]x7f&#x7f;f;f[]x"],
    ["&#x", "[]", HEX, "[]7f&#x7f;f;f[]"],
    ["&#x7", "[]", HEX, "[]f&#x7f;f;f&#x"],
    ["f;", "[]", HEX, "&#x7f&#x7f;[]f&#x"],
    ["f&", "[]", HEX, "&#x7f&#x7f;f;[]#x"],
    ["&#x7f;f", "[]", HEX, "&#x7f[];f&#x"],
    ["f&#x7f;", "[]", HEX, "&#x7[]f;f&#x"],
    ["f;f", "[]", HEX, "&#x7f&#x7f;[]&#x"],
    ["f&#", "[]", HEX, "&#x7f&#x7f;f;[]x"],
    ["f&#x", "[]", HEX, "&#x7f&#x7f;f;[]"],
    // longer search text spanning several entities
    ["t; &lt; lt &l", "[]", "&lt; &lt; lt &lt;lt; &lt; lt &lt", "&lt; &lt; lt &lt;l[]t"]
  ];

  cases.forEach(function (row) {
    test(row[0], row[1], row[2], row[3]);
  });
})();

Regex to not match partial sequences, but match full ones

Tags:

javascript

regex

nathancahill

1 Answers

Tomas Langkaas

Recent Activity

Donate For Us

Regex to not match partial sequences, but match full ones

Tags:

javascript

regex

nathancahill

1 Answers

Tomas Langkaas

Related questions

Recent Activity

Donate For Us