Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Javascript and regex: split string and keep the separator

People also ask

Can you use regex in Split JavaScript?

You do not only have to use literal strings for splitting strings into an array with the split method. You can use regex as breakpoints that match more characters for splitting a string.

Can we use regex in split a string?

split(String regex) method splits this string around matches of the given regular expression. This method works in the same way as invoking the method i.e split(String regex, int limit) with the given expression and a limit argument of zero. Therefore, trailing empty strings are not included in the resulting array.

When splitting a string using a given separator it returns?

The Split method extracts the substrings in this string that are delimited by one or more of the strings in the separator parameter, and returns those substrings as elements of an array.


I was having similar but slight different problem. Anyway, here are examples of three different scenarios for where to keep the deliminator.

"1、2、3".split("、") == ["1", "2", "3"]
"1、2、3".split(/(、)/g) == ["1", "、", "2", "、", "3"]
"1、2、3".split(/(?=、)/g) == ["1", "、2", "、3"]
"1、2、3".split(/(?!、)/g) == ["1、", "2、", "3"]
"1、2、3".split(/(.*?、)/g) == ["", "1、", "", "2、", "3"]

Warning: The fourth will only work to split single characters. ConnorsFan presents an alternative:

// Split a path, but keep the slashes that follow directories
var str = 'Animation/rawr/javascript.js';
var tokens = str.match(/[^\/]+\/?|\//g);

Use (positive) lookahead so that the regular expression asserts that the special character exists, but does not actually match it:

string.split(/<br \/>(?=&#?[a-zA-Z0-9]+;)/g);

See it in action:

var string = "aaaaaa<br />&dagger; bbbb<br />&Dagger; cccc";
console.log(string.split(/<br \/>(?=&#?[a-zA-Z0-9]+;)/g));

If you wrap the delimiter in parantheses it will be part of the returned array.

string.split(/(<br \/>&#?[a-zA-Z0-9]+);/g);
// returns ["aaaaaa", "<br />&dagger;", "bbbb", "<br />&Dagger;", "cccc"]

Depending on which part you want to keep change which subgroup you match

string.split(/(<br \/>)&#?[a-zA-Z0-9]+;/g);
// returns ["aaaaaa", "<br />", "bbbb", "<br />", "cccc"]

You could improve the expression by ignoring the case of letters string.split(/()&#?[a-z0-9]+;/gi);

And you can match for predefined groups like this: \d equals [0-9] and \w equals [a-zA-Z0-9_]. This means your expression could look like this.

string.split(/<br \/>(&#?[a-z\d]+;)/gi);

There is a good Regular Expression Reference on JavaScriptKit.


answered it here also JavaScript Split Regular Expression keep the delimiter

use the (?=pattern) lookahead pattern in the regex example

var string = '500x500-11*90~1+1';
string = string.replace(/(?=[$-/:-?{-~!"^_`\[\]])/gi, ",");
string = string.split(",");

this will give you the following result.

[ '500x500', '-11', '*90', '~1', '+1' ]

Can also be directly split

string = string.split(/(?=[$-/:-?{-~!"^_`\[\]])/gi);

giving the same result

[ '500x500', '-11', '*90', '~1', '+1' ]

I made a modification to jichi's answer, and put it in a function which also supports multiple letters.

String.prototype.splitAndKeep = function(separator, method='seperate'){
    var str = this;
    if(method == 'seperate'){
        str = str.split(new RegExp(`(${separator})`, 'g'));
    }else if(method == 'infront'){
        str = str.split(new RegExp(`(?=${separator})`, 'g'));
    }else if(method == 'behind'){
        str = str.split(new RegExp(`(.*?${separator})`, 'g'));
        str = str.filter(function(el){return el !== "";});
    }
    return str;
};

jichi's answers 3rd method would not work in this function, so I took the 4th method, and removed the empty spaces to get the same result.

edit: second method which excepts an array to split char1 or char2

String.prototype.splitAndKeep = function(separator, method='seperate'){
    var str = this;
    function splitAndKeep(str, separator, method='seperate'){
        if(method == 'seperate'){
            str = str.split(new RegExp(`(${separator})`, 'g'));
        }else if(method == 'infront'){
            str = str.split(new RegExp(`(?=${separator})`, 'g'));
        }else if(method == 'behind'){
            str = str.split(new RegExp(`(.*?${separator})`, 'g'));
            str = str.filter(function(el){return el !== "";});
        }
        return str;
    }
    if(Array.isArray(separator)){
        var parts = splitAndKeep(str, separator[0], method);
        for(var i = 1; i < separator.length; i++){
            var partsTemp = parts;
            parts = [];
            for(var p = 0; p < partsTemp.length; p++){
                parts = parts.concat(splitAndKeep(partsTemp[p], separator[i], method));
            }
        }
        return parts;
    }else{
        return splitAndKeep(str, separator, method);
    }
};

usage:

str = "first1-second2-third3-last";

str.splitAndKeep(["1", "2", "3"]) == ["first", "1", "-second", "2", "-third", "3", "-last"];

str.splitAndKeep("-") == ["first1", "-", "second2", "-", "third3", "-", "last"];

An extension function splits string with substring or RegEx and the delimiter is putted according to second parameter ahead or behind.

    String.prototype.splitKeep = function (splitter, ahead) {
        var self = this;
        var result = [];
        if (splitter != '') {
            var matches = [];
            // Getting mached value and its index
            var replaceName = splitter instanceof RegExp ? "replace" : "replaceAll";
            var r = self[replaceName](splitter, function (m, i, e) {
                matches.push({ value: m, index: i });
                return getSubst(m);
            });
            // Finds split substrings
            var lastIndex = 0;
            for (var i = 0; i < matches.length; i++) {
                var m = matches[i];
                var nextIndex = ahead == true ? m.index : m.index + m.value.length;
                if (nextIndex != lastIndex) {
                    var part = self.substring(lastIndex, nextIndex);
                    result.push(part);
                    lastIndex = nextIndex;
                }
            };
            if (lastIndex < self.length) {
                var part = self.substring(lastIndex, self.length);
                result.push(part);
            };
            // Substitution of matched string
            function getSubst(value) {
                var substChar = value[0] == '0' ? '1' : '0';
                var subst = '';
                for (var i = 0; i < value.length; i++) {
                    subst += substChar;
                }
                return subst;
            };
        }
        else {
            result.add(self);
        };
        return result;
    };

The test:

    test('splitKeep', function () {
        // String
        deepEqual("1231451".splitKeep('1'), ["1", "231", "451"]);
        deepEqual("123145".splitKeep('1', true), ["123", "145"]);
        deepEqual("1231451".splitKeep('1', true), ["123", "145", "1"]);
        deepEqual("hello man how are you!".splitKeep(' '), ["hello ", "man ", "how ", "are ", "you!"]);
        deepEqual("hello man how are you!".splitKeep(' ', true), ["hello", " man", " how", " are", " you!"]);
        // Regex
        deepEqual("mhellommhellommmhello".splitKeep(/m+/g), ["m", "hellomm", "hellommm", "hello"]);
        deepEqual("mhellommhellommmhello".splitKeep(/m+/g, true), ["mhello", "mmhello", "mmmhello"]);
    });

I've been using this:

String.prototype.splitBy = function (delimiter) {
  var 
    delimiterPATTERN = '(' + delimiter + ')', 
    delimiterRE = new RegExp(delimiterPATTERN, 'g');

  return this.split(delimiterRE).reduce((chunks, item) => {
    if (item.match(delimiterRE)){
      chunks.push(item)
    } else {
      chunks[chunks.length - 1] += item
    };
    return chunks
  }, [])
}

Except that you shouldn't mess with String.prototype, so here's a function version:

var splitBy = function (text, delimiter) {
  var 
    delimiterPATTERN = '(' + delimiter + ')', 
    delimiterRE = new RegExp(delimiterPATTERN, 'g');

  return text.split(delimiterRE).reduce(function(chunks, item){
    if (item.match(delimiterRE)){
      chunks.push(item)
    } else {
      chunks[chunks.length - 1] += item
    };
    return chunks
  }, [])
}

So you could do:

var haystack = "aaaaaa<br />&dagger; bbbb<br />&Dagger; cccc"
var needle =  '<br \/>&#?[a-zA-Z0-9]+;';
var result = splitBy(haystack , needle)
console.log( JSON.stringify( result, null, 2) )

And you'll end up with:

[
  "<br />&dagger; bbbb",
  "<br />&Dagger; cccc"
]