Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Generate all valid values for a regular expression

I know by using Xeger, we can get a random value for a specified pattern.

String regex = "[0-9]{2}"; 
Xeger generator = new Xeger(regex);
String result = generator.generate();

I want to know is there a way to return all of the valid strings for the specified regex. For example, for pattern: [0-9]{2}, we can get all of the values from 00 to 99.

Thanks

Edit:

Here we don't consider the infinite outputs like + and *; how can we get all values for a finite regex?

Last edit:

Thanks everyone! Finally I don't consider all the possible values as there may be thousands. I limit a specific number as the number of values to reduce the amount.

like image 201
Eve Avatar asked Apr 11 '13 13:04

Eve


People also ask

How to generate valid email addresses from regular expressions?

For example, if you have written a regular expression that matches emails and you need to generate test cases for it, then you can generate valid email addresses with this utility. You give it your email regular expression, and it will randomly find strings that match this regular expression.

How to use regex generator?

Regex Generator 1 Paste a text sample. Give us an example of the text you want to match using your regex. ... 2 Which parts of the text are interesting for you? Character (.) Character (.) Character (.) Character (.) 3 Hover the generated regular expression to see more information. ... More items...

How to use regex to generate random string in Excel?

Just enter your regex in the field below, press Generate Text button, and you get random data that matches your regular expression. Press button, get regex matching strings. No ads, nonsense or garbage. Announcement: We just launched Online Math Tools – a collection of utilities for solving math problems.

How to find all the posibilities of a regular expression?

One method for locating all the posibilities, is to detect where in the regular expression there are choices. For each possible choice generate a new regular expression based on the original regular expression and the current choice. This new regular expression is now a bit simpler than the original one.


3 Answers

Since a regexp is defined by a finite state machine, I wondered if there was something out there able to automatically reason on such machines and that was a good fit to be repurposed for this work... and clojure.core.logic delivered

So, I looked at this definition of the regexp grammar (unfortunately, it lacks the {} quantifiers, but they should be pretty easy to add to my code) adapted it to the java escapes, and worked out this 110 lines long clojure program:

(ns regexp-unfolder.core
  (:require [instaparse.core :as insta])
  (:require [clojure.core.logic :as l])
  (:require [clojure.set :refer [union difference]])
  (:gen-class :methods [#^{:static true} [unfold [String] clojure.lang.LazySeq]])
)

(def parse-regexp (insta/parser 
             "re = union | simple-re?
             union = re '|' simple-re
             simple-re = concat | base-re
             concat = simple-re base-re
             base-re = elementary-re | star | plus
             star = elementary-re '*'
             plus = elementary-re '+'
             elementary-re = group | char | '$' | any | set
             any = '.'
             group = '(' re ')'
             set = positive-set | negative-set
             positive-set = '['  set-items ']'
             negative-set = '[^' set-items ']'
             set-items = set-item*
             set-item = range | char
             range = char '-' char
             char = #'[^\\\\\\-\\[\\]]|\\.'" ))

(def printables (set (map char (range 32 127))))

(declare fns handle-first)

(defn handle-tree [q qto [ type & nodes]]
  (if (nil? nodes)
    [[q [""] qto]]
    ((fns type handle-first) q qto nodes)))

(defn star [q qto node &]
  (cons [q [""] qto]
         (handle-tree q q (first node))))

(defn plus [q qto node &] 
  (concat (handle-tree q qto (first node))
          (handle-tree qto qto (first node))))

(defn any-char [q qto & _] [[q (vec printables) qto]] )

(defn char-range [[c1 _ c2]]
  (let [extract-char (comp int first seq second)]
    (set (map char (range (extract-char c1) (inc (extract-char c2)))))))

(defn items [nodes]
  (union (mapcat
    (fn [[_ [type & ns]]]
      (if (= type :char)
        #{(first ns)}        
        (char-range ns)))
    (rest (second nodes)))))

(defn handle-set [q qto node &] [[q (vec (items node)) qto]])

(defn handle-negset [q qto node &] [[q (vec (difference printables (items node))) qto]])

(defn handle-range [q qto & nodes] [[q (vec (char-range nodes)) qto]])

(defn handle-char [q qto node &] [[q (vec node) qto]] )

(defn handle-concat [q qto nodes] 
  (let [syms (for [x  (rest nodes)] (gensym q))]
    (mapcat handle-tree  (cons q syms) (concat syms [qto] ) nodes)
  ))

(defn handle-first [q qto [node & _]] (handle-tree q qto node))

(def fns {:concat handle-concat, :star star, :plus plus, :any any-char, :positive-set handle-set, :negative-set handle-negset, :char handle-char})

(l/defne transition-membero
  [state trans newstate otransition]
  ([_ _ _ [state trans-set newstate]]
     (l/membero trans trans-set)))

(defn transitiono [state trans newstate transitions]
  (l/conde
   [(l/fresh [f] 
             (l/firsto transitions f)
             (transition-membero state trans newstate f))]
   [(l/fresh [r]
             (l/resto transitions r)
             (transitiono state trans newstate r))])
  )

(declare transitions)

;; Recognize a regexp finite state machine encoded in triplets [state, transition, next-state], adapted from a snippet made by Peteris Erins

(defn recognizeo
  ([input]
     (recognizeo 'q0 input))
  ([q input]
     (l/matche [input] ; start pattern matching on the input
        (['("")]
           (l/== q 'ok)) ; accept the empty string if we are in an accepting state
        ([[i . nput]]
           (l/fresh [qto]
                  (transitiono q i qto transitions) ; assert it must be what we transition to qto from q with input symbol i
                  (recognizeo qto nput)))))) ; recognize the remainder


(defn -unfold [regex] 
  (def transitions 
    (handle-tree 'q0 'ok (parse-regexp regex)))
  (map (partial apply str) (l/run* [q] (recognizeo q))))

Being written with core.logic, it should be fairly easy to adapt it to work also as a regexp matcher

I limited the printables characters from 32 to 126 ascii, otherwise it'd be too cumbersome to deal with regexps such as [^c], but you can extend it quite easily... also, I haven't implemented yet unions, optional patterns, and the \w, \s, etc. escapes for character classes

This is the biggest thing I wrote in clojure up to now, but the basics seems to be covered just fine... some examples:

regexp-unfolder.core=> (-unfold "ba[rz]")
("bar" "baz")
regexp-unfolder.core=> (-unfold "[a-z3-7]")
("a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t" "u" "v" "w" "x" "y" "z" "3" "4" "5" "6" "7")
regexp-unfolder.core=> (-unfold "[a-z3-7][01]")
("a0" "a1" "b0" "b1" "c0" "c1" "d0" "d1" "e0" "e1" "f0" "f1" "g0" "g1" "h0" "h1" "i0" "i1" "j0" "j1" "k0" "k1" "l0" "l1" "m0" "m1" "n0" "n1" "o0" "o1" "p0" "p1" "q0" "q1" "r0" "r1" "s0" "s1" "t0" "t1" "u0" "u1" "v0" "v1" "w0" "w1" "x0" "x1" "y0" "y1" "z0" "z1" "30" "31" "40" "41" "50" "51" "60" "70" "61" "71")
regexp-unfolder.core=> (-unfold "[^A-z]")
(" " "@" "!" "\"" "#" "$" "%" "&" "'" "(" ")" "*" "+" "," "-" "." "/" "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" ":" ";" "{" "<" "|" "=" "}" ">" "~" "?")
regexp-unfolder.core=> (take 20 (-unfold "[abc]*"))
("" "a" "b" "c" "aa" "ab" "ac" "ba" "ca" "aaa" "bb" "cb" "aab" "bc" "cc" "aac" "aba" "aca" "baa" "caa")
regexp-unfolder.core=> (take 20 (-unfold "a+b+"))
("ab" "aab" "abb" "abbb" "aaab" "abbbb" "aabb" "abbbbb" "abbbbbb" "aabbb" "abbbbbbb" "abbbbbbbb" "aaaab" "aabbbb" "aaabb" "abbbbbbbbb" "abbbbbbbbbb" "aabbbbb" "abbbbbbbbbbb" "abbbbbbbbbbbb")

Since I started this way, I implemented also infinite outputs :)

If someone is interested, I uploaded it here

and obviously, here's an example of how to invoke unfold from plain old Java:

import static regexp_unfolder.core.unfold;

public class UnfolderExample{
    public static void main(String[] args){
        @SuppressWarnings("unchecked")
        Iterable<String> strings = unfold("a+b+");
        for (String s : strings){
            System.out.println(s);
        }
    }
}
like image 173
berdario Avatar answered Oct 21 '22 12:10

berdario


Here is in C language written open-source generator RegLdg - regular expression grammar language dictionary generator.

I believe, it will be not very difficult to make Java port of this program.

like image 4
Andremoniy Avatar answered Oct 21 '22 12:10

Andremoniy


Finding all matches is very similar to finding a random match. Below is a simple modification of the logic that generates random matches on www.debuggex.com, assuming you already have a parse tree.

The idea is that for every subtree, you return a list of all possible generated strings, given a string that was generated by all previous nodes in your parse tree.

AltTree.all = (prefix) ->
    rets = []
    for child in children
        rets.extend(child.all(prefix))

ConcatTree.all = (prefix) ->
    prefixes = [prefix]
    for child in children
        newPrefixes = []
        for p in prefixes
            newPrefixes.extend(child.all(p))
        prefixes = newPrefixes
    return prefixes

RepeatTree.all = (prefix) ->
    prefixes = [prefix]
    rets = []
    for i up to max
        newPrefixes = []
        for p in prefixes
            newPrefixes.extend(onlyChild.all(p))
        prefixes = newPrefixes
        if i >= min
            rets.extend(prefixes)
    return rets

CharsetTree.all = (prefix) ->
    rets = []
    for char in allValidChars():
        rets.push(prefix + char)
    return rets

The rest of the trees are left as exercises (most notably the literal tree).

Note that there are intentionally no optimizations for the sake of clarity. Calling myTree.all('') will generate a list such that every valid matching string appears once for every path that generates this string. You will probably want to add deduplication and get rid of the excessive copying.

I should also add that this will only work for regular expressions that have a small number of total matching strings. This is because all of the strings are being stored. If you want to get around this limitation, you can yieldify this algorithm. You will need to maintain a stack (think of it as being a bread crumb trail) of where you are in the tree. When a new string is asked for, you will create it from the path you travelled, and then update the path.

like image 2
Sergiu Toarca Avatar answered Oct 21 '22 13:10

Sergiu Toarca