Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Parsing python grammar with boost::spirit - problem

I am trying to write a python parser with boost::spirit library. Here is the code:

template <typename Iterator>
class Parser : public qi::grammar<Iterator, space_type>
{
public:
    Parser() : Parser::base_type(small_stmt)
    {
        NEWLINE = lit("<NEWLINE>");
        INDENT = lit("<INDENT>");
        DEDENT = lit("<DEDENT>");
        ENDMARKER = lit("<EOT>");
        NAME = identifier.alias();
        NUMBER = integer|longinteger|floatnumber|imagnumber;
        STRING = stringliteral.alias();

        identifier = (alpha | '_') >> *(alpha | digit | '_');

        stringliteral = -stringprefix  >> (shortstring | longstring);
        stringprefix = lit("r") | lit("u") | lit("ur") | lit("R") | lit("U") | lit("UR") | lit("Ur") | lit("uR") | lit("b") | lit("B") | lit("br") | lit("Br") | lit("bR") | lit("BR");
        shortstring =  "'" >> *(shortstringitem - "'") >> "'" | "\"" >> *(shortstringitem - "\"") >> "\"";
        longstring = "'''" >> *longstringitem >> "'''" | "\"\"\"" >> *longstringitem >> "\"\"\"";
        shortstringitem = shortstringchar | escapeseq;
        longstringitem = longstringchar | escapeseq;
        shortstringchar = char_ - "\\" - "\n";
        longstringchar = char_ - "\\";
        escapeseq = '\\' >> char_;

        longinteger = integer >> (lit("l") | lit("L"));
        integer = decimalinteger | octinteger | hexinteger | bininteger;
        decimalinteger = nonzerodigit >> *digit | lit("0");
        octinteger = lit("0") >> (lit("o") | lit("O")) >> +octdigit | lit("0") >> +octdigit;
        hexinteger = lit("0") >> (lit("x") | lit("X")) >> +hexdigit;
        bininteger = lit("0") >> (lit("b") | lit("B")) >> +bindigit;
        nonzerodigit = char_('1', '9');
        octdigit = char_('0', '7');
        bindigit = lit("0") | lit("1");
        hexdigit = digit | char_('a', 'f') | char_('A', 'F');

        floatnumber = pointfloat | exponentfloat;
        pointfloat = -intpart >> fraction | intpart >> ".";
        exponentfloat = (intpart | pointfloat) >> exponent;
        intpart = +digit;
        fraction = "." >> +digit;
        exponent = (lit("e") | lit("E")) >> -(lit("+") | lit("-")) >> +digit;

        imagnumber = (floatnumber | intpart) >> (lit("j") | lit("J"));

        single_input = NEWLINE|simple_stmt|compound_stmt >> NEWLINE;
        file_input = *(NEWLINE|stmt) >> ENDMARKER;
        eval_input = testlist >> *NEWLINE >> ENDMARKER;
        decorator = lit("@") >> dotted_name >> -( lit("(") >> -(arglist) >> lit(")") ) >> NEWLINE;
        decorators = +decorator;
        decorated = decorators >> (classdef|funcdef);
        funcdef = lit("def") >> NAME >> parameters >> lit(":") >> suite;
        parameters = lit("(") >> -(varargslist) >> lit(")");
        varargslist = (*(fpdef >> -(lit("=") >> test) >> lit(",")) >> (lit("*") >> NAME >> -(lit(",") >> lit("**") >> NAME)|lit("**") >> NAME)|fpdef >> -(lit("=") >> test) >> *(lit(",") >> fpdef >> -(lit("=") >> test)) >> -(lit(",")));
        fpdef = NAME|lit("(") >> fplist >> lit(")");
        fplist = fpdef >> *(lit(",") >> fpdef) >> -(lit(","));
        stmt = simple_stmt|compound_stmt;
        simple_stmt = small_stmt >> *(lit(";") >> small_stmt) >> -(lit(";")) >> NEWLINE;
        small_stmt = (expr_stmt|print_stmt|del_stmt|pass_stmt|flow_stmt|import_stmt|global_stmt|exec_stmt|assert_stmt);
        expr_stmt = testlist >> (augassign >> (yield_expr|testlist)|*(lit("=") >> (yield_expr|testlist)));
        augassign = (lit("+=")|lit("-=")|lit("*=")|lit("/=")|lit("%=")|lit("&=")|lit("|=")|lit("^=")|lit("<<=")|lit(">>=")|lit("**=")|lit("//="));
        print_stmt = lit("print") >> ( -( test >> *(lit(",") >> test) >> -(lit(",")) )|lit(">>") >> test >> -( +(lit(",") >> test) >> -(lit(",")) ) );
        del_stmt = lit("del") >> exprlist;
        pass_stmt = lit("pass");
        flow_stmt = break_stmt|continue_stmt|return_stmt|raise_stmt|yield_stmt;
        break_stmt = lit("break");
        continue_stmt = lit("continue");
        return_stmt = lit("return") >> -(testlist);
        yield_stmt = yield_expr.alias();
        raise_stmt = lit("raise") >> -(test >> -(lit(",") >> test >> -(lit(",") >> test)));
        import_stmt = import_name|import_from;
        import_name = lit("import") >> dotted_as_names;
        import_from = (lit("from") >> (*lit(".") >> dotted_name|+lit(".")) >> lit("import") >> (lit("*")|lit("(") >> import_as_names >> lit(")")|import_as_names));
        import_as_name = NAME >> -(lit("as") >> NAME);
        dotted_as_name = dotted_name >> -(lit("as") >> NAME);
        import_as_names = import_as_name >> *(lit(",") >> import_as_name) >> -(lit(","));
        dotted_as_names = dotted_as_name >> *(lit(",") >> dotted_as_name);
        dotted_name = NAME >> *(lit(".") >> NAME);
        global_stmt = lit("global") >> NAME >> *(lit(",") >> NAME);
        exec_stmt = lit("exec") >> expr >> -(lit("in") >> test >> -(lit(",") >> test));
        assert_stmt = lit("assert") >> test >> -(lit(",") >> test);
        compound_stmt = if_stmt|while_stmt|for_stmt|try_stmt|with_stmt|funcdef|classdef|decorated;
        if_stmt = lit("if") >> test >> lit(":") >> suite >> *(lit("elif") >> test >> lit(":") >> suite) >> -(lit("else") >> lit(":") >> suite);
        while_stmt = lit("while") >> test >> lit(":") >> suite >> -(lit("else") >> lit(":") >> suite);
        for_stmt = lit("for") >> exprlist >> lit("in") >> testlist >> lit(":") >> suite >> -(lit("else") >> lit(":") >> suite);
        try_stmt = (lit("try") >> lit(":") >> suite >> (+(except_clause >> lit(":") >> suite) >> -(lit("else") >> lit(":") >> suite) >> -(lit("finally") >> lit(":") >> suite)|lit("finally") >> lit(":") >> suite));
        with_stmt = lit("with") >> with_item >> *(lit(",") >> with_item) >> lit(":") >> suite;
        with_item = test >> -(lit("as") >> expr);
        except_clause = lit("except") >> -(test >> -((lit("as")|lit(",")) >> test));
        suite = simple_stmt|NEWLINE >> INDENT >> +stmt >> DEDENT;
        testlist_safe = old_test >> -(+(lit(",") >> old_test) >> -(lit(",")));
        old_test = or_test|old_lambdef;
        old_lambdef = lit("lambda") >> -(varargslist) >> lit(":") >> old_test;
        test = or_test >> -(lit("if") >> or_test >> lit("else") >> test)|lambdef;
        or_test = and_test >> *(lit("or") >> and_test);
        and_test = not_test >> *(lit("and") >> not_test);
        not_test = lit("not") >> not_test|comparison;
        comparison = expr >> *(comp_op >> expr);
        comp_op = lit("<")|lit(">")|lit("==")|lit(">=")|lit("<=")|lit("<>")|lit("!=")|lit("in")|lit("not in")|lit("is")|lit("is not");
        expr = xor_expr >> *(lit("|") >> xor_expr);
        xor_expr = and_expr >> *(lit("^") >> and_expr);
        and_expr = shift_expr >> *(lit("&") >> shift_expr);
        shift_expr = arith_expr >> *((lit("<<")|lit(">>")) >> arith_expr);
        arith_expr = term >> *((lit("+")|lit("-")) >> term);
        term = factor >> *((lit("*")|lit("/")|lit("%")|lit("//")) >> factor);
        factor = (lit("+")|lit("-")|lit("~")) >> factor|power;
        power = atom >> *trailer >> -(lit("**") >> factor);
        atom = (lit("(") >> -(yield_expr|testlist_comp) >> lit(")")|lit("-(") >> -(listmaker) >> lit(")")|lit("{") >> -(dictorsetmaker) >> lit("}")|lit("`") >> testlist1 >> lit("`")|NAME|NUMBER|+STRING);
        listmaker = test >> ( list_for|*(lit(",") >> test) >> -(lit(",")) );
        testlist_comp = test >> ( comp_for|*(lit(",") >> test) >> -(lit(",")) );
        lambdef = lit("lambda") >> -(varargslist) >> lit(":") >> test;
        trailer = lit("(") >> -(arglist) >> lit(")")|lit("[") >> subscriptlist >> lit("]")|lit(".") >> NAME;
        subscriptlist = subscript >> *(lit(",") >> subscript) >> -(lit(","));
        subscript = lit(".") >> lit(".") >> lit(".")|test|-(test) >> lit(":") >> -(test) >> -(sliceop);
        sliceop = lit(":") >> -(test);
        exprlist = expr >> *(lit(",") >> expr) >> -(lit(","));
        testlist = test >> *(lit(",") >> test) >> -(lit(","));
        dictorsetmaker = ( (test >> lit(":") >> test >> (comp_for|*(lit(",") >> test >> lit(":") >> test) >> -(lit(","))))|(test >> (comp_for|*(lit(",") >> test) >> -(lit(",")))) );
        classdef = lit("class") >> NAME >> -(lit("(") >> -(testlist) >> lit(")")) >> lit(":") >> suite;
        arglist = *(argument >> lit(",")) >> (argument >> -(lit(","))|lit("*") >> test >> *(lit(",") >> argument) >> -(lit(",") >> lit("**") >> test)|lit("**") >> test);
        argument = test >> -(comp_for)|test >> lit("=") >> test;
        list_iter = list_for|list_if;
        list_for = lit("for") >> exprlist >> lit("in") >> testlist_safe >> -(list_iter);
        list_if = lit("if") >> old_test >> -(list_iter);
        comp_iter = comp_for|comp_if;
        comp_for = lit("for") >> exprlist >> lit("in") >> or_test >> -(comp_iter);
        comp_if = lit("if") >> old_test >> -(comp_iter);
        testlist1 = test >> *(lit(",") >> test);
        encoding_decl = NAME.alias();
        yield_expr = lit("yield") >> -(testlist);


    }

    // LEXEMS
    qi::rule<Iterator, space_type> NEWLINE;
    qi::rule<Iterator, space_type> INDENT;
    qi::rule<Iterator, space_type> DEDENT;
    qi::rule<Iterator, space_type> ENDMARKER;
    qi::rule<Iterator, space_type> NAME;
    qi::rule<Iterator, space_type> NUMBER;
    qi::rule<Iterator, space_type> STRING;

    // IDENTIFIER
    qi::rule<Iterator, space_type> identifier;

    // STRING LITERAL
    qi::rule<Iterator, space_type> stringliteral;
    qi::rule<Iterator, space_type> stringprefix;
    qi::rule<Iterator, space_type> shortstring;
    qi::rule<Iterator, space_type> longstring;
    qi::rule<Iterator, space_type> shortstringitem;
    qi::rule<Iterator, space_type> longstringitem;
    qi::rule<Iterator, space_type> shortstringchar;
    qi::rule<Iterator, space_type> longstringchar;
    qi::rule<Iterator, space_type> escapeseq;

    // INTEGER LITERAL
    qi::rule<Iterator, space_type> longinteger;
    qi::rule<Iterator, space_type> integer;
    qi::rule<Iterator, space_type> decimalinteger;
    qi::rule<Iterator, space_type> octinteger;
    qi::rule<Iterator, space_type> hexinteger;
    qi::rule<Iterator, space_type> bininteger;
    qi::rule<Iterator, space_type> nonzerodigit;
    qi::rule<Iterator, space_type> octdigit;
    qi::rule<Iterator, space_type> bindigit;
    qi::rule<Iterator, space_type> hexdigit;

    // FLOAT LITERAL
    qi::rule<Iterator, space_type> floatnumber;
    qi::rule<Iterator, space_type> pointfloat;
    qi::rule<Iterator, space_type> exponentfloat;
    qi::rule<Iterator, space_type> intpart;
    qi::rule<Iterator, space_type> fraction;
    qi::rule<Iterator, space_type> exponent;

    //IMAGINARY LITERAL
    qi::rule<Iterator, space_type> imagnumber;

    // PYTHON GRAMMAR
    qi::rule<Iterator, space_type> single_input;
    qi::rule<Iterator, space_type> file_input;
    qi::rule<Iterator, space_type> eval_input;
    qi::rule<Iterator, space_type> decorator;
    qi::rule<Iterator, space_type> decorators;
    qi::rule<Iterator, space_type> decorated;
    qi::rule<Iterator, space_type> funcdef;
    qi::rule<Iterator, space_type> parameters;
    qi::rule<Iterator, space_type> varargslist;
    qi::rule<Iterator, space_type> fpdef;
    qi::rule<Iterator, space_type> fplist;
    qi::rule<Iterator, space_type> stmt;
    qi::rule<Iterator, space_type> simple_stmt;
    qi::rule<Iterator, space_type> small_stmt;
    qi::rule<Iterator, space_type> expr_stmt;
    qi::rule<Iterator, space_type> augassign;
    qi::rule<Iterator, space_type> print_stmt;
    qi::rule<Iterator, space_type> del_stmt;
    qi::rule<Iterator, space_type> pass_stmt;
    qi::rule<Iterator, space_type> flow_stmt;
    qi::rule<Iterator, space_type> break_stmt;
    qi::rule<Iterator, space_type> continue_stmt;
    qi::rule<Iterator, space_type> return_stmt;
    qi::rule<Iterator, space_type> yield_stmt;
    qi::rule<Iterator, space_type> raise_stmt;
    qi::rule<Iterator, space_type> import_stmt;
    qi::rule<Iterator, space_type> import_name;
    qi::rule<Iterator, space_type> import_from;
    qi::rule<Iterator, space_type> import_as_name;
    qi::rule<Iterator, space_type> dotted_as_name;
    qi::rule<Iterator, space_type> import_as_names;
    qi::rule<Iterator, space_type> dotted_as_names;
    qi::rule<Iterator, space_type> dotted_name;
    qi::rule<Iterator, space_type> global_stmt;
    qi::rule<Iterator, space_type> exec_stmt;
    qi::rule<Iterator, space_type> assert_stmt;
    qi::rule<Iterator, space_type> compound_stmt;
    qi::rule<Iterator, space_type> if_stmt;
    qi::rule<Iterator, space_type> while_stmt;
    qi::rule<Iterator, space_type> for_stmt;
    qi::rule<Iterator, space_type> try_stmt;
    qi::rule<Iterator, space_type> with_stmt;
    qi::rule<Iterator, space_type> with_item;
    qi::rule<Iterator, space_type> except_clause;
    qi::rule<Iterator, space_type> suite;
    qi::rule<Iterator, space_type> testlist_safe;
    qi::rule<Iterator, space_type> old_test;
    qi::rule<Iterator, space_type> old_lambdef;
    qi::rule<Iterator, space_type> test;
    qi::rule<Iterator, space_type> or_test;
    qi::rule<Iterator, space_type> and_test;
    qi::rule<Iterator, space_type> not_test;
    qi::rule<Iterator, space_type> comparison;
    qi::rule<Iterator, space_type> comp_op;
    qi::rule<Iterator, space_type> expr;
    qi::rule<Iterator, space_type> xor_expr;
    qi::rule<Iterator, space_type> and_expr;
    qi::rule<Iterator, space_type> shift_expr;
    qi::rule<Iterator, space_type> arith_expr;
    qi::rule<Iterator, space_type> term;
    qi::rule<Iterator, space_type> factor;
    qi::rule<Iterator, space_type> power;
    qi::rule<Iterator, space_type> atom;
    qi::rule<Iterator, space_type> listmaker;
    qi::rule<Iterator, space_type> testlist_comp;
    qi::rule<Iterator, space_type> lambdef;
    qi::rule<Iterator, space_type> trailer;
    qi::rule<Iterator, space_type> subscriptlist;
    qi::rule<Iterator, space_type> subscript;
    qi::rule<Iterator, space_type> sliceop;
    qi::rule<Iterator, space_type> exprlist;
    qi::rule<Iterator, space_type> testlist;
    qi::rule<Iterator, space_type> dictorsetmaker;
    qi::rule<Iterator, space_type> classdef;
    qi::rule<Iterator, space_type> arglist;
    qi::rule<Iterator, space_type> argument;
    qi::rule<Iterator, space_type> list_iter;
    qi::rule<Iterator, space_type> list_for;
    qi::rule<Iterator, space_type> list_if;
    qi::rule<Iterator, space_type> comp_iter;
    qi::rule<Iterator, space_type> comp_for;
    qi::rule<Iterator, space_type> comp_if;
    qi::rule<Iterator, space_type> testlist1;
    qi::rule<Iterator, space_type> encoding_decl;
    qi::rule<Iterator, space_type> yield_expr;
};

The problem is that when I try to parse simple file:

pass

witch after going through some lexer module is:

pass <NEWLINE> <EOT>

the parsing failed and stops at first character. When I tried to parse this file with pass_stmt rule everything is ok (except that we still have and remaining, but pass word was "consumed"). When I tried to parse it with rule one level up - small_stmt - parser stops at

> <EOT>

consuming

pass <NEWLINE

One level up - simple_stmt gives same result as file_input - parser stops at first character.

Before adding grammar defined in PYTHON GRAMMAR section (get from http://docs.python.org/reference/grammar.html) everything was working properly. Parser recognized identifiers, literals, numbers etc.

Does anybody have an idea what can be wrong here?

like image 742
John Avatar asked May 30 '11 15:05

John


People also ask

How do you generate parsers in Python?

tools that can generate parsers usable from Python (and possibly from other languages) Python libraries to build parsers. Tools that can be used to generate the code for a parser are called parser generators or compiler compiler. Libraries that create parsers are known as parser combinators.

What is a modern parsing library for Python?

A modern parsing library for Python, implementing Earley & LALR(1) and an easy interface . Lark is a parser generator that works as a library. You write the grammar in a string or a file and then use it as an argument to dynamically generate the parser.

How to parse a language from a document in Python?

If you need to parse a language, or document, from Python there are fundamentally three ways to solve the problem: use an existing library supporting that specific language: for example a library to parse XML a tool or library to generate a parser: for example ANTLR, that you can use to build parsers for any language

What is a parse tree in Python?

A parse tree is a representation of the code closer to the concrete syntax. It shows many details of the implementation of the parser. For instance, usually a rule corresponds to the type of a node. A parse tree is usually transformed in an AST by the user, possibly with some help from the parser generator.


1 Answers

I'd suggest you enable debugging as explained here. This will give you insights what's actually happening. Generally, I suggest building grammars step by step and not trying to implement everything in one big leap.

The code you provided above is very difficult to understand for un-unitiated readers as it is very large and not commented. Writing grammars is very much like writing 'normal' code. Encapsulation is the key to success. Try building smaller grammars covering self-contained pieces and combine those sub-grammars as required. See here for some best practices.

like image 58
hkaiser Avatar answered Oct 21 '22 09:10

hkaiser