I am working with the Amazon Mechanical Turk API and it will only allow me to use regular expressions to filter a field of data.
I would like to input an integer range to a function, such as 256-311 or 45-1233, and return a regex that would match only that range.
A regex matching 256-321 would be:
\b((25[6-9])|(2[6-9][0-9])|(3[0-1][0-9])|(32[0-1]))\b
That part is fairly easy, but I am having trouble with the loop to create this regex.
I am trying to build a function defined like this:
function getRangeRegex( int fromInt, int toInt)
{
return regexString;
}
I have searched all over the web and am surprised that no one appears to have solved this before. It is a difficult problem...
Thanks for your time.
PHP Port of RegexNumericRangeGenerator
/**
 * Generates a regular expression that matches every non-negative integer
 * inside an inclusive range (PHP port of RegexNumericRangeGenerator).
 *
 * The range is first cut into spans whose bounds have the same number of
 * digits, each span is cut into [low, high] pairs that differ only by
 * per-digit ranges, and each pair is rendered as one regex alternative.
 * A span whose lower bound ends in zeros and whose upper bound ends in nines
 * is kept as a single pair, e.g. 100-999 becomes "[1-9][0-9]{2}".
 */
class RegexRangeNumberGenerator {
    /**
     * Builds the pattern for the inclusive range [$min, $max].
     *
     * The returned string has no delimiters; wrap it (e.g. "/…/") before
     * handing it to preg_match().
     *
     * @param int  $min              Lower bound, >= 0.
     * @param int  $max              Upper bound, >= $min.
     * @param bool $MatchWholeWord   Wrap the pattern in \b…\b. Only applied when
     *                               neither of the two other flags is set.
     * @param bool $MatchWholeLine   Anchor the pattern with ^…$.
     * @param bool $MatchLeadingZero Allow any number of leading zeros (0* prefix).
     *
     * @return string|false The pattern, or false when a bound is not an int,
     *                      is negative, or $min > $max.
     */
    public static function parse($min, $max, $MatchWholeWord = false, $MatchWholeLine = false, $MatchLeadingZero = false) {
        if (!is_int($min) || !is_int($max) || $min > $max || $min < 0 || $max < 0) {
            return false;
        }

        // A single value needs no range logic: the number is its own pattern.
        if ($min === $max) {
            return self::parseIntoPattern($min, $MatchWholeWord, $MatchWholeLine, $MatchLeadingZero);
        }

        $flatBounds = [];
        foreach (self::parseStartRange($min, $max) as $span) {
            $flatBounds[] = self::parseEndRange($span[0], $span[1]);
        }

        $alternatives = self::parseIntoRegex(self::reformatArray($flatBounds));

        return self::parseIntoPattern($alternatives, $MatchWholeWord, $MatchWholeLine, $MatchLeadingZero);
    }

    /**
     * Joins the alternatives with "|" and applies the requested wrapper.
     *
     * Precedence of the flags: whole line + leading zero, leading zero,
     * whole line, whole word, then a plain group.
     *
     * @param array|int|string $t Alternatives, or a single literal value.
     * @param bool $MatchWholeWord
     * @param bool $MatchWholeLine
     * @param bool $MatchLeadingZero
     *
     * @return string
     */
    private static function parseIntoPattern($t, $MatchWholeWord = false, $MatchWholeLine = false, $MatchLeadingZero = false) {
        $body = is_array($t) ? implode('|', $t) : $t;

        if ($MatchWholeLine && $MatchLeadingZero) {
            return '^0*(' . $body . ')$';
        }
        if ($MatchLeadingZero) {
            return '0*(' . $body . ')';
        }
        if ($MatchWholeLine) {
            return '^(' . $body . ')$';
        }
        if ($MatchWholeWord) {
            return '\\b(' . $body . ')\\b';
        }

        return '(' . $body . ')';
    }

    /**
     * Renders each [low, high] pair as one regex alternative, digit by digit.
     *
     * Equal digits are emitted literally, digits that differ by one as "[ab]",
     * and any other digit range as "[a-b]". Consecutive identical "[a-b]"
     * ranges are collapsed into a "{n}" quantifier once the last digit is
     * reached.
     *
     * @param array $t List of [low, high] pairs with equal digit counts.
     *
     * @return array List of pattern fragments, one per pair.
     *
     * @throws Exception When $t is not an array.
     */
    private static function parseIntoRegex($t) {
        if (!is_array($t)) {
            throw new Exception("Argument needs to be an array!");
        }

        $fragments = [];
        foreach ($t as $pair) {
            $lowDigits = str_split((string) $pair[0]);
            $highDigits = str_split((string) $pair[1]);
            $lastIndex = count($lowDigits) - 1;

            $previousRange = '';
            $repeats = 0;
            $fragment = '';

            foreach ($lowDigits as $index => $lowDigit) {
                $highDigit = $highDigits[$index];

                if ($lowDigit === $highDigit) {
                    $fragment .= $lowDigit;
                    continue;
                }
                if ((intval($lowDigit) + 1) === intval($highDigit)) {
                    $fragment .= '[' . $lowDigit . $highDigit . ']';
                    continue;
                }

                // Count how often the same digit range repeats back to back.
                $range = $lowDigit . $highDigit;
                if ($previousRange === $range) {
                    $repeats++;
                }
                $previousRange = $range;

                if ($index === $lastIndex) {
                    // The first occurrence already wrote "[a-b]"; only the
                    // quantifier is still missing when the range repeated.
                    $fragment .= ($repeats > 0)
                        ? '{' . ($repeats + 1) . '}'
                        : '[' . $lowDigit . '-' . $highDigit . ']';
                } elseif ($repeats === 0) {
                    $fragment .= '[' . $lowDigit . '-' . $highDigit . ']';
                }
            }

            $fragments[] = $fragment;
        }

        return $fragments;
    }

    /**
     * Turns the flat bound lists produced by parseEndRange() into pairs.
     *
     * @param array $t List of flat lists [low1, high1, low2, high2, …].
     *
     * @return array List of [low, high] pairs in the original order.
     */
    private static function reformatArray($t) {
        $pairs = [];
        foreach ($t as $flat) {
            foreach (array_chunk($flat, 2) as $pair) {
                $pairs[] = $pair;
            }
        }

        return $pairs;
    }

    /**
     * Splits [$t, $r] into spans whose bounds have the same number of digits,
     * e.g. 5-100 becomes [5, 9], [10, 99], [100, 100].
     *
     * @param int $t Lower bound.
     * @param int $r Upper bound.
     *
     * @return array List of [low, high] spans.
     */
    private static function parseStartRange($t, $r) {
        $spans = [];
        $targetLength = strlen((string) $r);

        while (strlen((string) $t) !== $targetLength) {
            // Largest number with as many digits as the current lower bound.
            $break = pow(10, strlen((string) $t)) - 1;
            $spans[] = [$t, $break];
            $t = $break + 1;
        }
        $spans[] = [$t, $r];

        return $spans;
    }

    /**
     * Splits a span with equal-length bounds into a flat list of bounds
     * [low1, high1, low2, high2, …] where every pair can be expressed with
     * independent per-digit ranges.
     *
     * Bounds are handled as digit strings so that leading zeros created by the
     * recursion on the trailing digits are preserved.
     *
     * @param int|string $t Lower bound.
     * @param int|string $r Upper bound with the same number of digits.
     *
     * @return array Flat list of bounds.
     */
    private static function parseEndRange($t, $r) {
        $t = (string) $t;
        $r = (string) $r;

        if (strlen($t) === 1) {
            return [$t, $r];
        }

        $lowLead = (int) substr($t, 0, 1);
        $highLead = (int) substr($r, 0, 1);
        $tailLength = strlen($r) - 1;
        $lowTailIsZeros = substr($t, 1) === str_repeat('0', strlen($t) - 1);
        $highTailIsNines = substr($r, 1) === str_repeat('9', $tailLength);

        if ($lowTailIsZeros) {
            // d0…0 up to e9…9 is already one pair. The upper bound has to be
            // checked against a run of nines: a check against zeros can never
            // succeed and would leave such spans split into redundant pieces.
            if ($highTailIsNines) {
                return [$t, $r];
            }
            if ($lowLead < $highLead) {
                // Everything below the upper bound's leading digit is one pair.
                $e = intval(substr($r, 0, 1) . str_repeat('0', $tailLength)) - 1;

                return array_merge(
                    [$t, self::strBreakPoint($e)],
                    self::parseEndRange(self::strBreakPoint($e + 1), $r)
                );
            }
        }

        if ($lowLead < $highLead) {
            // Finish the lower bound's leading digit first, then handle the rest.
            $e = intval(($lowLead + 1) . str_repeat('0', $tailLength)) - 1;
            $lowerPart = self::parseEndRange($t, self::strBreakPoint($e));

            if ($highTailIsNines) {
                return array_merge($lowerPart, [self::strBreakPoint($e + 1), $r]);
            }

            return array_merge($lowerPart, self::parseEndRange(self::strBreakPoint($e + 1), $r));
        }

        // Same leading digit: split the trailing digits and put the digit back.
        $bounds = [];
        foreach (self::parseEndRange(substr($t, 1), substr($r, 1)) as $bound) {
            $bounds[] = $lowLead . $bound;
        }

        return $bounds;
    }

    /**
     * Left-pads $t with zeros to the digit count of $t + 1, so that a break
     * point such as 99 keeps the width of its successor 100 ("099").
     *
     * @param int $t Break point.
     *
     * @return string
     */
    private static function strBreakPoint($t) {
        return str_pad((string) $t, strlen((string) ($t + 1)), '0', STR_PAD_LEFT);
    }
}
Test Results
2-8 ^([2-8])$
5-35 ^([5-9]|[12][0-9]|3[0-5])$
5-100 ^([5-9]|[1-8][0-9]|9[0-9]|100)$
12-1234 ^(1[2-9]|[2-9][0-9]|[1-8][0-9]{2}|9[0-8][0-9]|99[0-9]|1[01][0-9]{2}|12[0-2][0-9]|123[0-4])$
123-123 ^(123)$
256-321 ^(25[6-9]|2[6-9][0-9]|3[01][0-9]|32[01])$
256-257 ^(25[67])$
180-195 ^(18[0-9]|19[0-5])$
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow! Thank you!
Donate Us With