Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Regex to not match partial sequences, but match full ones

I have some escaped HTML like this:

&lt;img border=&#039;0&#039; /&gt;

I'm trying to match and replace full escape sequences like &#039; but not partial ones, like 39, since 39 is not actually in the unescaped string. Essentially, each escape sequence should be treated as a single token.

This is a JS regex. Is there a way to exclude matches between & and ; while still accepting sequences that include both of those characters?

Desired results:

  • Search &lt;img border=&#039;0&#039; /&gt; for lt: No match.
  • Search &lt;img border=&#039;0&#039; /&gt; for 39: No match.
  • Search &lt;img border=&#039;0&#039; /&gt; for &#039;: Match.
  • Search &lt;img border=&#039;0&#039; /&gt; for border=&#039;: Match.

Current code:

> var str = '&lt;img border=&#039;0&#039; /&gt;'
> str.replace(/(border)/gi, '|$1|')
'&lt;img |border|=&#039;0&#039; /&gt;'  // ok
> str.replace(/(39)/gi, '|$1|')
'&lt;img border=&#0|39|;0&#0|39|; /&gt;'  // not ok

Note: I can't unescape and then re-escape to match. It has to be escaped.

like image 667
nathancahill Avatar asked Apr 14 '17 18:04

nathancahill


1 Answers

The OP wants a JavaScript regex to match and replace a string within escaped HTML while treating escape sequences (e.g. &lt;, &#039;, or &#x27;) as single characters, without unescaping the HTML string during the replacement process.

This means that replacing

  1. "lt" with "[lt]" in "&lt; lt" would result in "&lt; [lt]" (avoid match within entity)
  2. "&lt;" with "[&lt;]" in "&lt; lt" would result in "[&lt;] lt" (match entity)
  3. "&l" with "[&l]" in "&lt; &lt" would result in "&lt; [&l]t" (not match partial entity)
  4. "t;" with "[t;]" in "&lt; lt;" would result in "&lt; l[t;]" (not match partial entity)
  5. "&lt; l" with "[&lt; l]" in "&lt; lt" would result in "[&lt; l]t" (match including entity)
  6. "lt; &l" with "[lt; &l]" in "&lt; &lt" would result in "&lt; &lt" (not match partial entity)
  7. "t; &lt;" with "[t; &lt;]" in "lt; &lt;" would result in "l[t; &lt;]" (match including entity)
  8. "t; &lt" with "[t; &lt]" in "lt; &lt;" would result in "lt; &lt;" (not match partial entity)

With the following regex for capturing escaped sequences (e.g. &lt;, &#039;, or &#x27;),

/&[a-z]+;|&#x[a-f\d]+;|&#\d+;/gi

we may use the following function as a starting point that handles most of the cases above (#1, #2, #4, #5, and #7):

/**
 * Replaces occurrences of `searchFor` in an escaped-HTML string without
 * touching the inside of complete character references.
 *
 * The search is global and case-insensitive. At each position the search
 * text is tried first; otherwise a complete entity (named, hex or decimal)
 * is consumed and written back unchanged, so the search text can never
 * start in the middle of one.
 *
 * @param {string} searchFor - Literal text to look for (escaped via `prepare`).
 * @param {string} replacement - Text inserted verbatim for each hit.
 * @param {string} str - Escaped HTML to search.
 * @returns {string} `str` with the hits replaced.
 */
function searchAndReplace(searchFor, replacement, str) {
  // Second alternative: swallow whole entities so they act as single tokens.
  var entityGroup = "(&[a-z]+;|&#x[a-f\\d]+;|&#\\d+;)";
  var matcher = new RegExp(prepare(searchFor) + "|" + entityGroup, "gi");

  return str.replace(matcher, function (matched, entity) {
    if (entity) {
      // An entity was consumed, not the search text: put it back as-is.
      return entity;
    }
    return replacement;
  });
}

/**
 * Turns a literal search string into regex source by backslash-escaping
 * every character that is neither a word character nor whitespace [1].
 *
 * @param {string} str - Literal text to search for.
 * @returns {string} Regex source matching `str` literally.
 */
function prepare(str) {
  var nonWordNonSpace = /[^\w\s]/g;
  return str.replace(nonWordNonSpace, function (ch) {
    return "\\" + ch;
  });
}

// [1] from http://eloquentjavascript.net/09_regexp.html#h_Rhu25fogrG

The remaining cases (#3, #6, #8) involve a potential partial escaped sequence at the end of the search string.

A solution for this is to check the searchFor string for potential partial escaped sequences at the end and append a corresponding negated lookahead (?!) to prevent matching a valid escaped sequence. The full solution (passing a set of about 40 test cases) is shown below, and should be faster and less complex than an .exec() approach:

/**
 * Replaces occurrences of `searchFor` in an escaped-HTML string while
 * treating each complete character reference as a single token.
 *
 * The pattern is "<prepared search text> | (<entity>)", applied globally
 * and case-insensitively. When the entity alternative is the one that
 * matched, the entity is returned unchanged; otherwise the match is the
 * search text and `replacement` is returned.
 *
 * @param {string} searchFor - Literal text to look for (processed by `prepare`).
 * @param {string} replacement - Text inserted verbatim for each hit.
 * @param {string} str - Escaped HTML to search.
 * @returns {string} `str` with the hits replaced.
 */
function searchAndReplace(searchFor, replacement, str) {
  var source = prepare(searchFor) + "|(&[a-z]+;|&#x[a-f0-9]+;|&#\\d+;)";
  var pattern = new RegExp(source, "gi");

  function keepEntityOrReplace(whole, entity) {
    // `entity` is only set when the capture group (a whole entity) matched.
    return entity ? entity : replacement;
  }

  return str.replace(pattern, keepEntityOrReplace);
}

/**
 * Turns a literal search string into regex source for use against escaped
 * HTML.
 *
 * Every character that is neither a word character nor whitespace is
 * backslash-escaped. If the search string ends in something that could be
 * the beginning of a character reference ("&", "&l", "&#", "&#03", "&#x",
 * "&#x7", ...), a negative lookahead is appended so the search text does not
 * match when the following characters would complete a real entity
 * (`&[a-z]+;`, `&#x[a-f\d]+;` or `&#\d+;`).
 *
 * The resulting source is meant to be compiled with the "i" flag, so the
 * trailing-fragment checks are case-insensitive as well.
 *
 * @param {string} str - Literal text to search for.
 * @returns {string} Regex source: escaped `str` plus an optional lookahead.
 */
function prepare(str) {
  var add = "";
  if (/&$/.test(str)) {
    // Bare "&": reject if any of the three entity forms follows.
    // Hex digits are [a-f\d], matching what the entity pattern accepts.
    add = "(?!#x[a-f\\d]+;|#\\d+;|[a-z]+;)";
  } else if (/&[a-z]+$/i.test(str)) {
    // Partial named entity, e.g. "&l" before "t;".
    add = "(?![a-z]*;)";
  } else if (/&#$/.test(str)) {
    // "&#": either numeric form may follow.
    add = "(?!x[a-f\\d]+;|\\d+;)";
  } else if (/&#x$/i.test(str)) {
    // "&#x" / "&#X": at least one hex digit must follow to form an entity.
    add = "(?![a-f\\d]+;)";
  } else if (/&#x[a-f\d]+$/i.test(str)) {
    // Partial hex reference, e.g. "&#x7" before "ff;".
    add = "(?![a-f\\d]*;)";
  } else if (/&#\d+$/.test(str)) {
    // Partial decimal reference, e.g. "&#03" before "9;".
    add = "(?!\\d*;)";
  }
  return str.replace(/[^\w\s]/g, "\\$&") + add;
}

// test function

/**
 * Runs one search-and-replace case and logs the outcome.
 *
 * Logs "<searchFor>: Passed" when the result equals `expected`; otherwise
 * logs "<searchFor>: Failed: <expected>,<result>".
 *
 * @param {string} searchFor - Text to search for.
 * @param {string} replacement - Replacement text.
 * @param {string} str - Input string.
 * @param {string} expected - Expected output of `searchAndReplace`.
 */
function test(searchFor, replacement, str, expected) {
  var actual = searchAndReplace(searchFor, replacement, str);
  var verdict;

  if (actual === expected) {
    verdict = "Passed";
  } else {
    // Array-to-string coercion joins the two values with a comma.
    verdict = "Failed: " + [expected, actual];
  }

  console.log(searchFor + ": " + verdict);
}

// test cases
//
// Each call runs searchAndReplace(searchFor, replacement, str) through the
// test() helper and logs "<searchFor>: Passed" or
// "<searchFor>: Failed: <expected>,<result>".
//
// NOTE(review): several string literals below look as if the HTML entities
// in them were decoded at some point (e.g. the "x7" case's input string does
// not contain "x7" although its expected output does). Confirm against the
// original test data before relying on these cases.

test("lt", "[lt]", "<img border='0' />", "<img border='0' />");
test("39", "[39]", "<img border='0' />", "<img border='0' />");
test("'", "[']", "<img border='0' />", "<img border=[']0['] />");
test("border='", "[border=']", "<img border='0' />", "<img [border=']0' />");
test("39&", "[39&]", "39<img border=39'&gt&gt&&#039 t; 0'&39; />", "39<img border=39'&gt&gt&&#039 t; 0'&39; />")
test("0&#", "[0&#]", "39<img border=39'&gt&gt&&#039 t; 0'&39; />", "39<img border=39'&gt&gt&&#039 t; 0'&39; />")
test("lt", "[]", "&lt<t;t&l", "&[]<t;t&l");
test("<", "[]", "&lt<t;t&l", "&lt[]t;t&l");
test("&l", "[]", "&lt<t;t&l", "[]t<t;t[]");
test("t;", "[]", "&lt<t;t&l", "&lt<[]t&l");
test("t&", "[]", "&lt<t;t&l", "&lt<t;[]l");
test("<t", "[]", "&lt<t;t&l", "&lt[];t&l");
test("t<", "[]", "&lt<t;t&l", "&l[]t;t&l");
test("t;t", "[]", "&lt<t;t&l", "&lt<[]&l");
test("t&l", "[]", "&lt<t;t&l", "&lt<t;[]");
test("39", "[]", "&#039'9;9&#", "&#0[]'9;9&#");
test("'", "[]", "&#039'9;9&#", "&#039[]9;9&#");
test("&", "[]", "&#039'9;9&#", "[]#039'9;9[]#");
test("&#", "[]", "&#039'9;9&#", "[]039'9;9[]");
test("9;", "[]", "&#039'9;9&#", "&#039'[]9&#");
test("9&", "[]", "&#039'9;9&#", "&#039'9;[]#");
test("'9", "[]", "&#039'9;9&#", "&#039[];9&#");
test("9'", "[]", "&#039'9;9&#", "&#03[]9;9&#");
test("9;9", "[]", "&#039'9;9&#", "&#039'[]&#");
test("9&#", "[]", "&#039'9;9&#", "&#039'9;[]");
test("x7", "[]", "߿f&#x", "&#[]ff;f&#x");
test("", "[]", "߿f&#x", "&#x7f[]f;f&#x");
test("&", "[]", "߿f&#x", "[]#x7ff;f[]#x");
test("&#", "[]", "߿f&#x", "[]x7ff;f[]x");
test("&#x", "[]", "߿f&#x", "[]7ff;f[]");
test("&#x7", "[]", "߿f&#x", "[]ff;f&#x");
test("f;", "[]", "߿f&#x", "&#x7f[]f&#x");
test("f&", "[]", "߿f&#x", "߿[]#x");
test("f", "[]", "߿f&#x", "&#x7f[];f&#x");
test("f", "[]", "߿f&#x", "&#x7[]f;f&#x");
test("f;f", "[]", "߿f&#x", "&#x7f[]&#x");
test("f&#", "[]", "߿f&#x", "߿[]x");
test("f&#x", "[]", "߿f&#x", "߿[]");
test("t; < lt &l", "[]", "< < lt <lt; < lt &lt", "< < lt <l[]t");
like image 86
Tomas Langkaas Avatar answered Oct 03 '22 22:10

Tomas Langkaas