Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Function To Create Regex Matching a Number Range

I am working with the Amazon Mechanical Turk API and it will only allow me to use regular expressions to filter a field of data.

I would like to input an integer range to a function, such as 256-311 or 45-1233, and return a regex that would match only that range.

A regex matching 256-321 would be:

\b((25[6-9])|(2[6-9][0-9])|(3[0-1][0-9])|(32[0-1]))\b

That part is fairly easy, but I am having trouble with the loop to create this regex.

I am trying to build a function defined like this:

// Desired helper (pseudocode, not valid PHP): takes the two integer bounds of a
// range and should return a regex string that matches only numbers in that range.
// The body is intentionally left empty -- building the regex is the open question.
function getRangeRegex( int fromInt, int toInt)
{

      return regexString;
}

I have looked all over the web, and I am surprised that no one seems to have solved this before. It is a difficult problem...

Thanks for your time.

like image 414
Bryan Avatar asked Jul 15 '11 16:07

Bryan


1 Answer

PHP Port of RegexNumericRangeGenerator

/**
 * Builds a regular-expression fragment that matches every non-negative
 * integer in an inclusive range [min, max] and nothing else.
 *
 * The range is first cut at digit-length boundaries (9|10, 99|100, ...), each
 * same-length slice is then cut into sub-ranges that can be written as
 * "common prefix + one digit class + trailing [0-9]", and finally every
 * sub-range is rendered as one alternative of the resulting pattern.
 *
 * Note: a slice that spans whole "decades" (10-99, 100-999, 2000-4999, ...)
 * is emitted as a single alternative such as `[1-9][0-9]`; earlier revisions
 * of this class produced the equivalent but longer `[1-8][0-9]|9[0-9]`.
 */
class RegexRangeNumberGenerator
{
    /**
     * Returns a regex (without delimiters) matching the integers min..max.
     *
     * @param int  $min              Inclusive lower bound, must be >= 0.
     * @param int  $max              Inclusive upper bound, must be >= $min.
     * @param bool $MatchWholeWord   Wrap the pattern in \b ... \b.
     * @param bool $MatchWholeLine   Wrap the pattern in ^ ... $ (takes precedence over whole-word).
     * @param bool $MatchLeadingZero Allow any number of leading zeros (prefix 0*).
     *
     * @return string|false The pattern, or false when the bounds are not
     *                      integers, are negative, or min > max.
     */
    public static function parse($min, $max, $MatchWholeWord = false, $MatchWholeLine = false, $MatchLeadingZero = false)
    {
        if (!is_int($min) || !is_int($max) || $min > $max || $min < 0 || $max < 0) {
            return false;
        }
        if ($min === $max) {
            return self::parseIntoPattern($min, $MatchWholeWord, $MatchWholeLine, $MatchLeadingZero);
        }

        // One flat [low, high, low, high, ...] list per same-length slice.
        $flatLists = [];
        foreach (self::parseStartRange($min, $max) as $slice) {
            $flatLists[] = self::parseEndRange($slice[0], $slice[1]);
        }

        $alternatives = self::parseIntoRegex(self::reformatArray($flatLists));

        return self::parseIntoPattern($alternatives, $MatchWholeWord, $MatchWholeLine, $MatchLeadingZero);
    }

    /**
     * Joins the alternatives into one group and applies the anchoring options.
     *
     * Whole-line anchoring wins over whole-word anchoring. The leading-zero
     * prefix is combined with whichever anchoring is active (previously the
     * whole-word flag was silently ignored when leading zeros were allowed).
     *
     * @param string[]|string|int $t Alternatives, or a single literal value.
     *
     * @return string
     */
    private static function parseIntoPattern($t, $MatchWholeWord = false, $MatchWholeLine = false, $MatchLeadingZero = false)
    {
        $body = is_array($t) ? implode("|", $t) : (string) $t;
        $group = ($MatchLeadingZero ? "0*" : "") . "(" . $body . ")";

        if ($MatchWholeLine) {
            return "^" . $group . "$";
        }
        if ($MatchWholeWord) {
            return "\\b" . $group . "\\b";
        }

        return $group;
    }

    /**
     * Renders each [low, high] pair (same number of digits) as a regex alternative.
     *
     * Digits are compared position by position: equal digits become a
     * literal, adjacent digits become a two-character class ("[01]"), anything
     * else becomes a range class ("[2-9]"). Runs of identical classes are
     * collapsed into a quantifier ("[0-9][0-9]" becomes "[0-9]{2}").
     *
     * @param array $t List of [low, high] pairs.
     *
     * @return string[]
     *
     * @throws Exception When $t is not an array.
     */
    private static function parseIntoRegex($t)
    {
        if (!is_array($t)) {
            throw new Exception("Argument needs to be an array!");
        }

        $patterns = [];
        foreach ($t as $pair) {
            // Cast first: bounds may arrive as ints, and str_split needs strings.
            $lowDigits = str_split((string) $pair[0]);
            $highDigits = str_split((string) $pair[1]);

            $tokens = [];
            foreach ($lowDigits as $position => $from) {
                $to = $highDigits[$position];
                if ($from === $to) {
                    $tokens[] = $from;
                } elseif (((int) $from + 1) === (int) $to) {
                    $tokens[] = "[" . $from . $to . "]";
                } else {
                    $tokens[] = "[" . $from . "-" . $to . "]";
                }
            }

            $pattern = "";
            $total = count($tokens);
            $index = 0;
            while ($index < $total) {
                $token = $tokens[$index];
                $run = 1;
                while ($index + $run < $total && $tokens[$index + $run] === $token) {
                    $run++;
                }
                // Only character classes are quantified; literal digits stay as-is ("100", not "10{2}").
                if ($run > 1 && $token[0] === "[") {
                    $pattern .= $token . "{" . $run . "}";
                } else {
                    $pattern .= str_repeat($token, $run);
                }
                $index += $run;
            }

            $patterns[] = $pattern;
        }

        return $patterns;
    }

    /**
     * Turns a list of flat [low, high, low, high, ...] lists into one list of
     * [low, high] pairs.
     *
     * @param array $t
     *
     * @return array
     */
    private static function reformatArray($t)
    {
        $pairs = [];
        foreach ($t as $flat) {
            foreach (array_chunk($flat, 2) as $pair) {
                $pairs[] = $pair;
            }
        }

        return $pairs;
    }

    /**
     * Splits [t, r] at digit-length boundaries so that both ends of every
     * returned slice have the same number of digits
     * (e.g. 5..100 becomes [5, 9], [10, 99], [100, 100]).
     *
     * @param int $t Lower bound.
     * @param int $r Upper bound.
     *
     * @return array List of [low, high] slices.
     */
    private static function parseStartRange($t, $r)
    {
        $slices = [];
        $low = $t;
        $targetLength = strlen((string) $r);

        // "<" rather than "!==" so a malformed call (low longer than high) cannot loop forever.
        while (strlen((string) $low) < $targetLength) {
            $break = pow(10, strlen((string) $low)) - 1;
            $slices[] = [$low, $break];
            $low = $break + 1;
        }
        $slices[] = [$low, $r];

        return $slices;
    }

    /**
     * Splits a same-length range into sub-ranges that each map onto a single
     * regex alternative, returned as a flat [low, high, low, high, ...] list
     * of digit strings (leading zeros are preserved in recursive calls).
     *
     * @param int|string $t Lower bound.
     * @param int|string $r Upper bound with the same number of digits as $t.
     *
     * @return string[]
     */
    private static function parseEndRange($t, $r)
    {
        $t = (string) $t;
        $r = (string) $r;
        $width = strlen($r);

        if (strlen($t) === 1) {
            return [$t, $r];
        }

        $lowLead = (int) $t[0];
        $highLead = (int) $r[0];
        $lowIsRound = substr($t, 1) === str_repeat("0", strlen($t) - 1);  // d00..0
        $highIsFull = substr($r, 1) === str_repeat("9", $width - 1);      // d99..9

        if ($lowIsRound) {
            if ($highIsFull) {
                // Whole decades: one class for the leading digit, [0-9] for the rest.
                return [$t, $r];
            }
            if ($lowLead < $highLead) {
                // Everything below the upper bound's own decade is one sub-range.
                $e = (int) ($highLead . str_repeat("0", $width - 1)) - 1;

                return array_merge(
                    [$t, self::strBreakPoint($e)],
                    self::parseEndRange(self::strBreakPoint($e + 1), $r)
                );
            }
        }

        if ($lowLead < $highLead) {
            // Finish the lower bound's decade first, then handle the remainder.
            $e = (int) (($lowLead + 1) . str_repeat("0", $width - 1)) - 1;

            return array_merge(
                self::parseEndRange($t, self::strBreakPoint($e)),
                self::parseEndRange(self::strBreakPoint($e + 1), $r)
            );
        }

        // Same leading digit: solve for the remaining digits and re-attach it.
        $lead = $t[0];
        $result = [];
        foreach (self::parseEndRange(substr($t, 1), substr($r, 1)) as $suffix) {
            $result[] = $lead . $suffix;
        }

        return $result;
    }

    /**
     * Formats a break point as a digit string, left-padded with zeros to the
     * width of ($t + 1) so it lines up with the bound that follows it
     * (e.g. 99 becomes "099" next to "100").
     *
     * @param int $t
     *
     * @return string
     */
    private static function strBreakPoint($t)
    {
        return str_pad((string) $t, strlen((string) ($t + 1)), "0", STR_PAD_LEFT);
    }
}

Test Results

2-8         ^([2-8])$
5-35        ^([5-9]|[12][0-9]|3[0-5])$
5-100       ^([5-9]|[1-8][0-9]|9[0-9]|100)$
12-1234     ^(1[2-9]|[2-9][0-9]|[1-8][0-9]{2}|9[0-8][0-9]|99[0-9]|1[01][0-9]{2}|12[0-2][0-9]|123[0-4])$
123-123     ^(123)$
256-321     ^(25[6-9]|2[6-9][0-9]|3[01][0-9]|32[01])$
256-257     ^(25[67])$
180-195     ^(18[0-9]|19[0-5])$
like image 194
EmilianoT Avatar answered Nov 02 '22 06:11

EmilianoT