I'm trying to write a parser for the following BNF rules using boost spirit
(Boost v1.64)
The rules are:
<numeric-literal>::= integer
<type-name> ::= "in" | "out" | "in_out"
<array-type-spec> ::= <type-spec> "[" [<numeric-literal>] "]"
<tuple-type-spec> ::= "(" <type-spec> ("," <type-spec>)+ ")"
<type-spec> ::= <type-name> | <array-type-spec> | <tuple-type-spec>
Below is my attempt, using boost::make_recursive_variant
It seems to work ok on the string in
But it fails on in[2]
.
Where is my mistake?
What would be an elegant solution?
namespace Ast {
enum class TypeName { IN, OUT, INOUT};
using NumericLiteral = int;
using TypeSpec = boost::make_recursive_variant
<
TypeName,
std::pair<boost::recursive_variant_, NumericLiteral>,
std::vector < boost::recursive_variant_ >
>::type;
}
//grammar:
namespace myGrammar {
namespace qi = boost::spirit::qi;
template <typename Iterator = char const*,typename Signature = Ast::TypeSpec()>
struct myRules : qi::grammar < Iterator, Signature> {
myRules() : myRules::base_type(start) {
fillSymbols();
rNumericLiteral = qi::int_;
rTypeName = sTypeName;
rTypeSpec = rTypeName | (rTypeSpec >> '[' >> rNumericLiteral >> ']') | ('(' >> qi::repeat(2, qi::inf)[(rTypeSpec % ',')] >> ')');
start = qi::skip(qi::space)[rTypeSpec];
}
private:
using Skipper = qi::space_type;
qi::rule<Iterator, Ast::TypeSpec()> start;
qi::rule<Iterator, Ast::NumericLiteral(), Skipper> rNumericLiteral;
qi::rule<Iterator, Ast::TypeName(), Skipper> rTypeName;
qi::rule<Iterator, Ast::TypeSpec(), Skipper> rTypeSpec;
//symbols
qi::symbols<char, Ast::TypeName>sTypeName;
void fillSymbols()
{
using namespace Ast;
sTypeName.add
("in", TypeName::IN)
("out", TypeName::OUT)
("in_out", TypeName::INOUT)
}
};
}
There's a problem translating this grammar 1:1 to a PEG grammar since left-recursion leads to infinite recursion.
You can still trivially rearrange the rules so left-recursion doesn't occur, but you will have more trouble synthesizing the AST you want.
Here's a halfway station that has half-decent test results:
Live On Coliru
//#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/fusion/adapted/std_pair.hpp>
/*
<numeric-literal> ::= integer
<type-name> ::= "in" | "out" | "in_out"
<array-type-spec> ::= <type-spec> "[" [<numeric-literal>] "]"
<tuple-type-spec> ::= "(" <type-spec> ("," <type-spec>)+ ")"
<type-spec> ::= <type-name> | <array-type-spec> | <tuple-type-spec>
*/
namespace Ast {
enum class TypeName { IN, OUT, INOUT };
static inline std::ostream& operator<<(std::ostream& os, TypeName tn) {
switch(tn) {
case TypeName::IN: return os << "IN";
case TypeName::OUT: return os << "OUT";
case TypeName::INOUT: return os << "INOUT";
}
return os << "?";
}
using NumericLiteral = int;
using TypeSpec = boost::make_recursive_variant<
TypeName,
std::pair<boost::recursive_variant_, NumericLiteral>,
std::vector<boost::recursive_variant_>
>::type;
using ArraySpec = std::pair<TypeSpec, NumericLiteral>;
using TupleSpec = std::vector<TypeSpec>;
}
// grammar:
namespace myGrammar {
namespace qi = boost::spirit::qi;
template <typename Iterator = char const *, typename Signature = Ast::TypeSpec()>
struct myRules : qi::grammar<Iterator, Signature> {
myRules() : myRules::base_type(start) {
rNumericLiteral = qi::int_;
rTypeName = sTypeName >> !qi::alpha;
rTupleSpec = '(' >> rTypeSpec >> +(',' >> rTypeSpec) >> ')';
rScalarSpec = rTypeName | rTupleSpec;
rArraySpec = rScalarSpec >> '[' >> rNumericLiteral >> ']';
rTypeSpec = rArraySpec | rScalarSpec;
start = qi::skip(qi::space)[rTypeSpec >> qi::eoi];
BOOST_SPIRIT_DEBUG_NODES((start)(rTypeSpec)(rTypeName)(rArraySpec)(rScalarSpec)(rTypeSpec)(rNumericLiteral))
}
private:
using Skipper = qi::space_type;
qi::rule<Iterator, Ast::TypeSpec()> start;
qi::rule<Iterator, Ast::NumericLiteral(), Skipper> rNumericLiteral;
qi::rule<Iterator, Ast::ArraySpec(), Skipper> rArraySpec;
qi::rule<Iterator, Ast::TypeSpec(), Skipper> rTypeSpec, rScalarSpec;
qi::rule<Iterator, Ast::TupleSpec(), Skipper> rTupleSpec;
// implicit lexeme
qi::rule<Iterator, Ast::TypeName()> rTypeName;
// symbols
struct TypeName_r : qi::symbols<char, Ast::TypeName> {
TypeName_r() {
using Ast::TypeName;
add ("in", TypeName::IN)
("out", TypeName::OUT)
("in_out", TypeName::INOUT);
}
} sTypeName;
};
}
static inline std::ostream& operator<<(std::ostream& os, Ast::TypeSpec tn) {
struct {
std::ostream& _os;
void operator()(Ast::TypeSpec const& ts) const {
apply_visitor(*this, ts);
}
void operator()(Ast::TypeName tn) const { std::cout << tn; }
void operator()(Ast::TupleSpec const& tss) const {
std::cout << "(";
for (auto const& ts: tss) {
(*this)(ts);
std::cout << ", ";
}
std::cout << ")";
}
void operator()(Ast::ArraySpec const& as) const {
(*this)(as.first);
std::cout << '[' << as.second << ']';
}
} const dumper{os};
dumper(tn);
return os;
}
int main() {
using It = std::string::const_iterator;
myGrammar::myRules<It> const parser;
std::string const test_ok[] = {
"in",
"out",
"in_out",
"(in, out)",
"(out, in)",
"(in, in, in, out, in_out)",
"in[13]",
"in[0]",
"in[-2]",
"in[1][2][3]",
"in[3][3][3]",
"(in[3][3][3], out, in_out[0])",
"(in[3][3][3], out, in_out[0])",
"(in, out)[13]",
"(in, out)[13][0]",
};
std::string const test_fail[] = {
"",
"i n",
"inout",
"()",
"(in)",
"(out)",
"(in_out)",
"IN",
};
auto expect = [&](std::string const& sample, bool expected) {
It f = sample.begin(), l = sample.end();
Ast::TypeSpec spec;
bool ok = parse(f, l, parser, spec);
std::cout << "Test passed:" << std::boolalpha << (expected == ok) << "\n";
if (expected || (expected != ok)) {
if (ok) {
std::cout << "Parsed: " << spec << "\n";
} else {
std::cout << "Parse failed\n";
}
}
if (f!=l) {
std::cout << "Remaining unparsed: '" << std::string(f,l) << "'\n";
}
};
for (std::string const sample : test_ok) expect(sample, true);
for (std::string const sample : test_fail) expect(sample, false);
}
Prints
Test passed:true
Parsed: IN
Test passed:true
Parsed: OUT
Test passed:true
Parsed: INOUT
Test passed:true
Parsed: (IN, OUT, )
Test passed:true
Parsed: (OUT, IN, )
Test passed:true
Parsed: (IN, IN, IN, OUT, INOUT, )
Test passed:true
Parsed: IN[13]
Test passed:true
Parsed: IN[0]
Test passed:true
Parsed: IN[-2]
Test passed:false
Parse failed
Remaining unparsed: 'in[1][2][3]'
Test passed:false
Parse failed
Remaining unparsed: 'in[3][3][3]'
Test passed:false
Parse failed
Remaining unparsed: '(in[3][3][3], out, in_out[0])'
Test passed:false
Parse failed
Remaining unparsed: '(in[3][3][3], out, in_out[0])'
Test passed:true
Parsed: (IN, OUT, )[13]
Test passed:false
Parse failed
Remaining unparsed: '(in, out)[13][0]'
Test passed:true
Test passed:true
Remaining unparsed: 'i n'
Test passed:true
Remaining unparsed: 'inout'
Test passed:true
Remaining unparsed: '()'
Test passed:true
Remaining unparsed: '(in)'
Test passed:true
Remaining unparsed: '(out)'
Test passed:true
Remaining unparsed: '(in_out)'
Test passed:true
Remaining unparsed: 'IN'
As you can see most things get parsed correctly, except for chained array dimensions like in[1][2]
. The trouble is that we resolved ambiguity by inducing a "precedence" in the rules:
rScalarSpec = rTypeName | rTupleSpec;
rArraySpec = rScalarSpec >> '[' >> rNumericLiteral >> ']';
rTypeSpec = rArraySpec | rScalarSpec;
This means we always try expecting an array dimension first, and only fallback to scalar type-spec if we failed to find one. This is because any array-spec would always be matched as a scalarspec first making it impossible to parse the array-dimension part.
To fix the multi-dimensional case, you could try asserting that [
doesn't follow the array-spec:
rArraySpec = rScalarSpec >> '[' >> rNumericLiteral >> ']' >> !qi::lit('[')
| rArraySpec >> '[' >> rNumericLiteral >> ']';
But -- BOOM -- we're back at left-recursion again (in case we enter the second branch, e.g. in[1][
).
Two thoughts cross my mind.
I'd say it would be very beneficial to remove the distinction between scalar/array spec in the AST. If a scalar were to be treated as a zero-rank array that would just mean we could always parse an optional dimension into the same resulting AST type.
The other thought more or less continues down the road shown above and would require backtracking all the way down if a presumed scalar spec was followed by a '[' character. This would lead to bad worst case behaviour in cases like (very long spec)[1][1][1][1][1][1][1][1][1][1]
.
Let me implement the first idea outlined after a coffee break :)
Here the TypeSpec always carries a (possibly empty) collection of dimensions:
namespace Ast {
enum class TypeName { IN, OUT, INOUT };
static inline std::ostream& operator<<(std::ostream& os, TypeName tn) {
switch(tn) {
case TypeName::IN: return os << "IN";
case TypeName::OUT: return os << "OUT";
case TypeName::INOUT: return os << "INOUT";
}
return os << "?";
}
struct TypeSpec;
using ScalarSpec = boost::make_recursive_variant<
TypeName,
std::vector<TypeSpec>
>::type;
struct TypeSpec {
ScalarSpec spec;
std::vector<unsigned> dim;
};
using TupleSpec = std::vector<TypeSpec>;
}
Note that we also improved by making dimensions unsigned. The grammar will check that it's not 0
for completeness. A number of "positive" test cases have moved to the "expected-to-fail" cases for this reason.
Now the grammar is a straightforward mimic of that:
rRank %= qi::uint_ [qi::_pass = (qi::_1 > 0)];
rTypeName = sTypeName;
rTupleSpec = '(' >> rTypeSpec >> +(',' >> rTypeSpec) >> ')';
rScalarSpec = rTypeName | rTupleSpec;
rTypeSpec = rScalarSpec >> *('[' >> rRank >> ']');
Note the semantic action using Phoenix to assert that the array dimension cannot be 0
And here's the live demo showing all testcases passing:
Live On Coliru
//#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/fusion/adapted.hpp>
/*
<numeric-literal> ::= integer
<type-name> ::= "in" | "out" | "in_out"
<array-type-spec> ::= <type-spec> "[" [<numeric-literal>] "]"
<tuple-type-spec> ::= "(" <type-spec> ("," <type-spec>)+ ")"
<type-spec> ::= <type-name> | <array-type-spec> | <tuple-type-spec>
*/
namespace Ast {
enum class TypeName { IN, OUT, INOUT };
static inline std::ostream& operator<<(std::ostream& os, TypeName tn) {
switch(tn) {
case TypeName::IN: return os << "IN";
case TypeName::OUT: return os << "OUT";
case TypeName::INOUT: return os << "INOUT";
}
return os << "?";
}
struct TypeSpec;
using ScalarSpec = boost::make_recursive_variant<
TypeName,
std::vector<TypeSpec>
>::type;
struct TypeSpec {
ScalarSpec spec;
std::vector<unsigned> dim;
};
using TupleSpec = std::vector<TypeSpec>;
}
BOOST_FUSION_ADAPT_STRUCT(Ast::TypeSpec, spec, dim)
// grammar:
namespace myGrammar {
namespace qi = boost::spirit::qi;
template <typename Iterator = char const *, typename Signature = Ast::TypeSpec()>
struct myRules : qi::grammar<Iterator, Signature> {
myRules() : myRules::base_type(start) {
rRank %= qi::uint_ [qi::_pass = (qi::_1 > 0)];
rTypeName = sTypeName;
rTupleSpec = '(' >> rTypeSpec >> +(',' >> rTypeSpec) >> ')';
rScalarSpec = rTypeName | rTupleSpec;
rTypeSpec = rScalarSpec >> *('[' >> rRank >> ']');
start = qi::skip(qi::space)[rTypeSpec >> qi::eoi];
BOOST_SPIRIT_DEBUG_NODES((start)(rTypeSpec)(rTypeName)(rScalarSpec)(rTypeSpec)(rRank))
}
private:
using Skipper = qi::space_type;
qi::rule<Iterator, Ast::TypeSpec()> start;
qi::rule<Iterator, Ast::ScalarSpec(), Skipper> rScalarSpec;
qi::rule<Iterator, Ast::TypeSpec(), Skipper> rTypeSpec;
qi::rule<Iterator, Ast::TupleSpec(), Skipper> rTupleSpec;
// implicit lexeme
qi::rule<Iterator, Ast::TypeName()> rTypeName;
qi::rule<Iterator, unsigned()> rRank;
// symbols
struct TypeName_r : qi::symbols<char, Ast::TypeName> {
TypeName_r() {
using Ast::TypeName;
add ("in", TypeName::IN)
("out", TypeName::OUT)
("in_out", TypeName::INOUT);
}
} sTypeName;
};
}
static inline std::ostream& operator<<(std::ostream& os, Ast::TypeSpec tn) {
struct {
std::ostream& _os;
void operator()(Ast::ScalarSpec const& ts) const {
apply_visitor(*this, ts);
}
void operator()(Ast::TypeName tn) const { std::cout << tn; }
void operator()(Ast::TupleSpec const& tss) const {
std::cout << "(";
for (auto const& ts: tss) {
(*this)(ts);
std::cout << ", ";
}
std::cout << ")";
}
void operator()(Ast::TypeSpec const& as) const {
(*this)(as.spec);
for (auto rank : as.dim)
std::cout << '[' << rank << ']';
}
} const dumper{os};
dumper(tn);
return os;
}
int main() {
using It = std::string::const_iterator;
myGrammar::myRules<It> const parser;
std::string const test_ok[] = {
"in",
"out",
"in_out",
"(in, out)",
"(out, in)",
"(in, in, in, out, in_out)",
"in[13]",
"in[1][2][3]",
"in[3][3][3]",
"(in[3][3][3], out, in_out[1])",
"(in[3][3][3], out, in_out[1])",
"(in, out)[13]",
"(in, out)[13][14]",
};
std::string const test_fail[] = {
"",
"i n",
"inout",
"()",
"(in)",
"(out)",
"(in_out)",
"IN",
"in[0]",
"in[-2]",
"(in[3][3][3], out, in_out[0])",
"(in[3][3][3], out, in_out[0])",
};
auto expect = [&](std::string const& sample, bool expected) {
It f = sample.begin(), l = sample.end();
Ast::TypeSpec spec;
bool ok = parse(f, l, parser, spec);
std::cout << "Test passed:" << std::boolalpha << (expected == ok) << "\n";
if (expected || (expected != ok)) {
if (ok) {
std::cout << "Parsed: " << spec << "\n";
} else {
std::cout << "Parse failed\n";
}
}
if (f!=l) {
std::cout << "Remaining unparsed: '" << std::string(f,l) << "'\n";
}
};
for (std::string const sample : test_ok) expect(sample, true);
for (std::string const sample : test_fail) expect(sample, false);
}
Prints
Test passed:true
Parsed: IN
Test passed:true
Parsed: OUT
Test passed:true
Parsed: INOUT
Test passed:true
Parsed: (IN, OUT, )
Test passed:true
Parsed: (OUT, IN, )
Test passed:true
Parsed: (IN, IN, IN, OUT, INOUT, )
Test passed:true
Parsed: IN[13]
Test passed:true
Parsed: IN[1][2][3]
Test passed:true
Parsed: IN[3][3][3]
Test passed:true
Parsed: (IN[3][3][3], OUT, INOUT[1], )
Test passed:true
Parsed: (IN[3][3][3], OUT, INOUT[1], )
Test passed:true
Parsed: (IN, OUT, )[13]
Test passed:true
Parsed: (IN, OUT, )[13][14]
Test passed:true
Test passed:true
Remaining unparsed: 'i n'
Test passed:true
Remaining unparsed: 'inout'
Test passed:true
Remaining unparsed: '()'
Test passed:true
Remaining unparsed: '(in)'
Test passed:true
Remaining unparsed: '(out)'
Test passed:true
Remaining unparsed: '(in_out)'
Test passed:true
Remaining unparsed: 'IN'
Test passed:true
Remaining unparsed: 'in[0]'
Test passed:true
Remaining unparsed: 'in[-2]'
Test passed:true
Remaining unparsed: '(in[3][3][3], out, in_out[0])'
Test passed:true
Remaining unparsed: '(in[3][3][3], out, in_out[0])'
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With