I am trying to write a python parser with boost::spirit library. Here is the code:
template <typename Iterator>
class Parser : public qi::grammar<Iterator, space_type>
{
public:
Parser() : Parser::base_type(small_stmt)
{
NEWLINE = lit("<NEWLINE>");
INDENT = lit("<INDENT>");
DEDENT = lit("<DEDENT>");
ENDMARKER = lit("<EOT>");
NAME = identifier.alias();
NUMBER = integer|longinteger|floatnumber|imagnumber;
STRING = stringliteral.alias();
identifier = (alpha | '_') >> *(alpha | digit | '_');
stringliteral = -stringprefix >> (shortstring | longstring);
stringprefix = lit("r") | lit("u") | lit("ur") | lit("R") | lit("U") | lit("UR") | lit("Ur") | lit("uR") | lit("b") | lit("B") | lit("br") | lit("Br") | lit("bR") | lit("BR");
shortstring = "'" >> *(shortstringitem - "'") >> "'" | "\"" >> *(shortstringitem - "\"") >> "\"";
longstring = "'''" >> *longstringitem >> "'''" | "\"\"\"" >> *longstringitem >> "\"\"\"";
shortstringitem = shortstringchar | escapeseq;
longstringitem = longstringchar | escapeseq;
shortstringchar = char_ - "\\" - "\n";
longstringchar = char_ - "\\";
escapeseq = '\\' >> char_;
longinteger = integer >> (lit("l") | lit("L"));
integer = decimalinteger | octinteger | hexinteger | bininteger;
decimalinteger = nonzerodigit >> *digit | lit("0");
octinteger = lit("0") >> (lit("o") | lit("O")) >> +octdigit | lit("0") >> +octdigit;
hexinteger = lit("0") >> (lit("x") | lit("X")) >> +hexdigit;
bininteger = lit("0") >> (lit("b") | lit("B")) >> +bindigit;
nonzerodigit = char_('1', '9');
octdigit = char_('0', '7');
bindigit = lit("0") | lit("1");
hexdigit = digit | char_('a', 'f') | char_('A', 'F');
floatnumber = pointfloat | exponentfloat;
pointfloat = -intpart >> fraction | intpart >> ".";
exponentfloat = (intpart | pointfloat) >> exponent;
intpart = +digit;
fraction = "." >> +digit;
exponent = (lit("e") | lit("E")) >> -(lit("+") | lit("-")) >> +digit;
imagnumber = (floatnumber | intpart) >> (lit("j") | lit("J"));
single_input = NEWLINE|simple_stmt|compound_stmt >> NEWLINE;
file_input = *(NEWLINE|stmt) >> ENDMARKER;
eval_input = testlist >> *NEWLINE >> ENDMARKER;
decorator = lit("@") >> dotted_name >> -( lit("(") >> -(arglist) >> lit(")") ) >> NEWLINE;
decorators = +decorator;
decorated = decorators >> (classdef|funcdef);
funcdef = lit("def") >> NAME >> parameters >> lit(":") >> suite;
parameters = lit("(") >> -(varargslist) >> lit(")");
varargslist = (*(fpdef >> -(lit("=") >> test) >> lit(",")) >> (lit("*") >> NAME >> -(lit(",") >> lit("**") >> NAME)|lit("**") >> NAME)|fpdef >> -(lit("=") >> test) >> *(lit(",") >> fpdef >> -(lit("=") >> test)) >> -(lit(",")));
fpdef = NAME|lit("(") >> fplist >> lit(")");
fplist = fpdef >> *(lit(",") >> fpdef) >> -(lit(","));
stmt = simple_stmt|compound_stmt;
simple_stmt = small_stmt >> *(lit(";") >> small_stmt) >> -(lit(";")) >> NEWLINE;
small_stmt = (expr_stmt|print_stmt|del_stmt|pass_stmt|flow_stmt|import_stmt|global_stmt|exec_stmt|assert_stmt);
expr_stmt = testlist >> (augassign >> (yield_expr|testlist)|*(lit("=") >> (yield_expr|testlist)));
augassign = (lit("+=")|lit("-=")|lit("*=")|lit("/=")|lit("%=")|lit("&=")|lit("|=")|lit("^=")|lit("<<=")|lit(">>=")|lit("**=")|lit("//="));
print_stmt = lit("print") >> ( -( test >> *(lit(",") >> test) >> -(lit(",")) )|lit(">>") >> test >> -( +(lit(",") >> test) >> -(lit(",")) ) );
del_stmt = lit("del") >> exprlist;
pass_stmt = lit("pass");
flow_stmt = break_stmt|continue_stmt|return_stmt|raise_stmt|yield_stmt;
break_stmt = lit("break");
continue_stmt = lit("continue");
return_stmt = lit("return") >> -(testlist);
yield_stmt = yield_expr.alias();
raise_stmt = lit("raise") >> -(test >> -(lit(",") >> test >> -(lit(",") >> test)));
import_stmt = import_name|import_from;
import_name = lit("import") >> dotted_as_names;
import_from = (lit("from") >> (*lit(".") >> dotted_name|+lit(".")) >> lit("import") >> (lit("*")|lit("(") >> import_as_names >> lit(")")|import_as_names));
import_as_name = NAME >> -(lit("as") >> NAME);
dotted_as_name = dotted_name >> -(lit("as") >> NAME);
import_as_names = import_as_name >> *(lit(",") >> import_as_name) >> -(lit(","));
dotted_as_names = dotted_as_name >> *(lit(",") >> dotted_as_name);
dotted_name = NAME >> *(lit(".") >> NAME);
global_stmt = lit("global") >> NAME >> *(lit(",") >> NAME);
exec_stmt = lit("exec") >> expr >> -(lit("in") >> test >> -(lit(",") >> test));
assert_stmt = lit("assert") >> test >> -(lit(",") >> test);
compound_stmt = if_stmt|while_stmt|for_stmt|try_stmt|with_stmt|funcdef|classdef|decorated;
if_stmt = lit("if") >> test >> lit(":") >> suite >> *(lit("elif") >> test >> lit(":") >> suite) >> -(lit("else") >> lit(":") >> suite);
while_stmt = lit("while") >> test >> lit(":") >> suite >> -(lit("else") >> lit(":") >> suite);
for_stmt = lit("for") >> exprlist >> lit("in") >> testlist >> lit(":") >> suite >> -(lit("else") >> lit(":") >> suite);
try_stmt = (lit("try") >> lit(":") >> suite >> (+(except_clause >> lit(":") >> suite) >> -(lit("else") >> lit(":") >> suite) >> -(lit("finally") >> lit(":") >> suite)|lit("finally") >> lit(":") >> suite));
with_stmt = lit("with") >> with_item >> *(lit(",") >> with_item) >> lit(":") >> suite;
with_item = test >> -(lit("as") >> expr);
except_clause = lit("except") >> -(test >> -((lit("as")|lit(",")) >> test));
suite = simple_stmt|NEWLINE >> INDENT >> +stmt >> DEDENT;
testlist_safe = old_test >> -(+(lit(",") >> old_test) >> -(lit(",")));
old_test = or_test|old_lambdef;
old_lambdef = lit("lambda") >> -(varargslist) >> lit(":") >> old_test;
test = or_test >> -(lit("if") >> or_test >> lit("else") >> test)|lambdef;
or_test = and_test >> *(lit("or") >> and_test);
and_test = not_test >> *(lit("and") >> not_test);
not_test = lit("not") >> not_test|comparison;
comparison = expr >> *(comp_op >> expr);
comp_op = lit("<")|lit(">")|lit("==")|lit(">=")|lit("<=")|lit("<>")|lit("!=")|lit("in")|lit("not in")|lit("is")|lit("is not");
expr = xor_expr >> *(lit("|") >> xor_expr);
xor_expr = and_expr >> *(lit("^") >> and_expr);
and_expr = shift_expr >> *(lit("&") >> shift_expr);
shift_expr = arith_expr >> *((lit("<<")|lit(">>")) >> arith_expr);
arith_expr = term >> *((lit("+")|lit("-")) >> term);
term = factor >> *((lit("*")|lit("/")|lit("%")|lit("//")) >> factor);
factor = (lit("+")|lit("-")|lit("~")) >> factor|power;
power = atom >> *trailer >> -(lit("**") >> factor);
atom = (lit("(") >> -(yield_expr|testlist_comp) >> lit(")")|lit("-(") >> -(listmaker) >> lit(")")|lit("{") >> -(dictorsetmaker) >> lit("}")|lit("`") >> testlist1 >> lit("`")|NAME|NUMBER|+STRING);
listmaker = test >> ( list_for|*(lit(",") >> test) >> -(lit(",")) );
testlist_comp = test >> ( comp_for|*(lit(",") >> test) >> -(lit(",")) );
lambdef = lit("lambda") >> -(varargslist) >> lit(":") >> test;
trailer = lit("(") >> -(arglist) >> lit(")")|lit("[") >> subscriptlist >> lit("]")|lit(".") >> NAME;
subscriptlist = subscript >> *(lit(",") >> subscript) >> -(lit(","));
subscript = lit(".") >> lit(".") >> lit(".")|test|-(test) >> lit(":") >> -(test) >> -(sliceop);
sliceop = lit(":") >> -(test);
exprlist = expr >> *(lit(",") >> expr) >> -(lit(","));
testlist = test >> *(lit(",") >> test) >> -(lit(","));
dictorsetmaker = ( (test >> lit(":") >> test >> (comp_for|*(lit(",") >> test >> lit(":") >> test) >> -(lit(","))))|(test >> (comp_for|*(lit(",") >> test) >> -(lit(",")))) );
classdef = lit("class") >> NAME >> -(lit("(") >> -(testlist) >> lit(")")) >> lit(":") >> suite;
arglist = *(argument >> lit(",")) >> (argument >> -(lit(","))|lit("*") >> test >> *(lit(",") >> argument) >> -(lit(",") >> lit("**") >> test)|lit("**") >> test);
argument = test >> -(comp_for)|test >> lit("=") >> test;
list_iter = list_for|list_if;
list_for = lit("for") >> exprlist >> lit("in") >> testlist_safe >> -(list_iter);
list_if = lit("if") >> old_test >> -(list_iter);
comp_iter = comp_for|comp_if;
comp_for = lit("for") >> exprlist >> lit("in") >> or_test >> -(comp_iter);
comp_if = lit("if") >> old_test >> -(comp_iter);
testlist1 = test >> *(lit(",") >> test);
encoding_decl = NAME.alias();
yield_expr = lit("yield") >> -(testlist);
}
// LEXEMS
qi::rule<Iterator, space_type> NEWLINE;
qi::rule<Iterator, space_type> INDENT;
qi::rule<Iterator, space_type> DEDENT;
qi::rule<Iterator, space_type> ENDMARKER;
qi::rule<Iterator, space_type> NAME;
qi::rule<Iterator, space_type> NUMBER;
qi::rule<Iterator, space_type> STRING;
// IDENTIFIER
qi::rule<Iterator, space_type> identifier;
// STRING LITERAL
qi::rule<Iterator, space_type> stringliteral;
qi::rule<Iterator, space_type> stringprefix;
qi::rule<Iterator, space_type> shortstring;
qi::rule<Iterator, space_type> longstring;
qi::rule<Iterator, space_type> shortstringitem;
qi::rule<Iterator, space_type> longstringitem;
qi::rule<Iterator, space_type> shortstringchar;
qi::rule<Iterator, space_type> longstringchar;
qi::rule<Iterator, space_type> escapeseq;
// INTEGER LITERAL
qi::rule<Iterator, space_type> longinteger;
qi::rule<Iterator, space_type> integer;
qi::rule<Iterator, space_type> decimalinteger;
qi::rule<Iterator, space_type> octinteger;
qi::rule<Iterator, space_type> hexinteger;
qi::rule<Iterator, space_type> bininteger;
qi::rule<Iterator, space_type> nonzerodigit;
qi::rule<Iterator, space_type> octdigit;
qi::rule<Iterator, space_type> bindigit;
qi::rule<Iterator, space_type> hexdigit;
// FLOAT LITERAL
qi::rule<Iterator, space_type> floatnumber;
qi::rule<Iterator, space_type> pointfloat;
qi::rule<Iterator, space_type> exponentfloat;
qi::rule<Iterator, space_type> intpart;
qi::rule<Iterator, space_type> fraction;
qi::rule<Iterator, space_type> exponent;
//IMAGINARY LITERAL
qi::rule<Iterator, space_type> imagnumber;
// PYTHON GRAMMAR
qi::rule<Iterator, space_type> single_input;
qi::rule<Iterator, space_type> file_input;
qi::rule<Iterator, space_type> eval_input;
qi::rule<Iterator, space_type> decorator;
qi::rule<Iterator, space_type> decorators;
qi::rule<Iterator, space_type> decorated;
qi::rule<Iterator, space_type> funcdef;
qi::rule<Iterator, space_type> parameters;
qi::rule<Iterator, space_type> varargslist;
qi::rule<Iterator, space_type> fpdef;
qi::rule<Iterator, space_type> fplist;
qi::rule<Iterator, space_type> stmt;
qi::rule<Iterator, space_type> simple_stmt;
qi::rule<Iterator, space_type> small_stmt;
qi::rule<Iterator, space_type> expr_stmt;
qi::rule<Iterator, space_type> augassign;
qi::rule<Iterator, space_type> print_stmt;
qi::rule<Iterator, space_type> del_stmt;
qi::rule<Iterator, space_type> pass_stmt;
qi::rule<Iterator, space_type> flow_stmt;
qi::rule<Iterator, space_type> break_stmt;
qi::rule<Iterator, space_type> continue_stmt;
qi::rule<Iterator, space_type> return_stmt;
qi::rule<Iterator, space_type> yield_stmt;
qi::rule<Iterator, space_type> raise_stmt;
qi::rule<Iterator, space_type> import_stmt;
qi::rule<Iterator, space_type> import_name;
qi::rule<Iterator, space_type> import_from;
qi::rule<Iterator, space_type> import_as_name;
qi::rule<Iterator, space_type> dotted_as_name;
qi::rule<Iterator, space_type> import_as_names;
qi::rule<Iterator, space_type> dotted_as_names;
qi::rule<Iterator, space_type> dotted_name;
qi::rule<Iterator, space_type> global_stmt;
qi::rule<Iterator, space_type> exec_stmt;
qi::rule<Iterator, space_type> assert_stmt;
qi::rule<Iterator, space_type> compound_stmt;
qi::rule<Iterator, space_type> if_stmt;
qi::rule<Iterator, space_type> while_stmt;
qi::rule<Iterator, space_type> for_stmt;
qi::rule<Iterator, space_type> try_stmt;
qi::rule<Iterator, space_type> with_stmt;
qi::rule<Iterator, space_type> with_item;
qi::rule<Iterator, space_type> except_clause;
qi::rule<Iterator, space_type> suite;
qi::rule<Iterator, space_type> testlist_safe;
qi::rule<Iterator, space_type> old_test;
qi::rule<Iterator, space_type> old_lambdef;
qi::rule<Iterator, space_type> test;
qi::rule<Iterator, space_type> or_test;
qi::rule<Iterator, space_type> and_test;
qi::rule<Iterator, space_type> not_test;
qi::rule<Iterator, space_type> comparison;
qi::rule<Iterator, space_type> comp_op;
qi::rule<Iterator, space_type> expr;
qi::rule<Iterator, space_type> xor_expr;
qi::rule<Iterator, space_type> and_expr;
qi::rule<Iterator, space_type> shift_expr;
qi::rule<Iterator, space_type> arith_expr;
qi::rule<Iterator, space_type> term;
qi::rule<Iterator, space_type> factor;
qi::rule<Iterator, space_type> power;
qi::rule<Iterator, space_type> atom;
qi::rule<Iterator, space_type> listmaker;
qi::rule<Iterator, space_type> testlist_comp;
qi::rule<Iterator, space_type> lambdef;
qi::rule<Iterator, space_type> trailer;
qi::rule<Iterator, space_type> subscriptlist;
qi::rule<Iterator, space_type> subscript;
qi::rule<Iterator, space_type> sliceop;
qi::rule<Iterator, space_type> exprlist;
qi::rule<Iterator, space_type> testlist;
qi::rule<Iterator, space_type> dictorsetmaker;
qi::rule<Iterator, space_type> classdef;
qi::rule<Iterator, space_type> arglist;
qi::rule<Iterator, space_type> argument;
qi::rule<Iterator, space_type> list_iter;
qi::rule<Iterator, space_type> list_for;
qi::rule<Iterator, space_type> list_if;
qi::rule<Iterator, space_type> comp_iter;
qi::rule<Iterator, space_type> comp_for;
qi::rule<Iterator, space_type> comp_if;
qi::rule<Iterator, space_type> testlist1;
qi::rule<Iterator, space_type> encoding_decl;
qi::rule<Iterator, space_type> yield_expr;
};
The problem is that when I try to parse simple file:
pass
witch after going through some lexer module is:
pass <NEWLINE> <EOT>
the parsing failed and stops at first character. When I tried to parse this file with pass_stmt rule everything is ok (except that we still have and remaining, but pass word was "consumed"). When I tried to parse it with rule one level up - small_stmt - parser stops at
> <EOT>
consuming
pass <NEWLINE
One level up - simple_stmt gives same result as file_input - parser stops at first character.
Before adding grammar defined in PYTHON GRAMMAR section (get from http://docs.python.org/reference/grammar.html) everything was working properly. Parser recognized identifiers, literals, numbers etc.
Does anybody have an idea what can be wrong here?
tools that can generate parsers usable from Python (and possibly from other languages) Python libraries to build parsers. Tools that can be used to generate the code for a parser are called parser generators or compiler compiler. Libraries that create parsers are known as parser combinators.
A modern parsing library for Python, implementing Earley & LALR(1) and an easy interface . Lark is a parser generator that works as a library. You write the grammar in a string or a file and then use it as an argument to dynamically generate the parser.
If you need to parse a language, or document, from Python there are fundamentally three ways to solve the problem: use an existing library supporting that specific language: for example a library to parse XML a tool or library to generate a parser: for example ANTLR, that you can use to build parsers for any language
A parse tree is a representation of the code closer to the concrete syntax. It shows many details of the implementation of the parser. For instance, usually a rule corresponds to the type of a node. A parse tree is usually transformed in an AST by the user, possibly with some help from the parser generator.
I'd suggest you enable debugging as explained here. This will give you insights what's actually happening. Generally, I suggest building grammars step by step and not trying to implement everything in one big leap.
The code you provided above is very difficult to understand for un-unitiated readers as it is very large and not commented. Writing grammars is very much like writing 'normal' code. Encapsulation is the key to success. Try building smaller grammars covering self-contained pieces and combine those sub-grammars as required. See here for some best practices.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With