Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

boost spirit istream_iterator consumes too much from stream

Consider the following example extracted from a more complex code:

#include <boost/fusion/adapted.hpp>
#include <boost/fusion/include/std_pair.hpp>
#include <boost/phoenix.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/support_istream_iterator.hpp>
#include <map>
#include <string>

namespace qi  = boost::spirit::qi;
namespace phx = boost::phoenix;

// Plain data holder for an XML tag: its indentation level, its name and a
// variable number of attributes.
struct Tag
{
    // Type used for an XML name:
    typedef std::string name_type;

    // Type used for an XML value:
    typedef std::string value_type;

    // Type of a single XML attribute (name / value pair):
    typedef std::pair<
        name_type,
        value_type
    > attribute_type;

    // Type of the attribute list.
    // Note: We use a std::map to simplify the attribute search.
    typedef std::map<
        name_type,
        value_type
    > list_type;

    // Reset the instance to its initial state: no indentation, empty name, no attributes.
    // The indentation is reset as well so that a re-used instance never carries
    // over the value of a previously stored tag.
    void clear( )
    {
        m_indent = 0;
        m_name.clear( );
        m_attribute.clear( );
    }

    // Zero-initialised so that the value is well defined even if nothing ever assigns it.
    std::size_t m_indent = 0; // The tag shall be / is indented by m_indent number of tabs.
    name_type   m_name;       // Name of the tag.
    list_type   m_attribute;  // List of tag attributes.
};

// Adapt Tag as a Boost.Fusion sequence; the members are listed in the order
// m_indent, m_name, m_attribute:
BOOST_FUSION_ADAPT_STRUCT( Tag,
                         ( std::size_t   , m_indent    )
                         ( Tag::name_type, m_name      )
                         ( Tag::list_type, m_attribute ) )

// Skipper grammar used by the decoder. It matches a single character that is
// either whitespace (qi::space) or a control character (qi::cntrl).
// The template parameter deliberately avoids a leading underscore followed by
// an upper-case letter, because such identifiers are reserved for the
// implementation.
template < typename Iterator >
struct skipper : qi::grammar< Iterator >
{
    skipper( ) : skipper::base_type( m_skipper )
    {
        // Everything matched by this rule is skipped:
        m_skipper = qi::space | qi::cntrl;
    }

    // The start (and only) rule of this grammar:
    qi::rule< Iterator > m_skipper;
};

// Grammar used to parse an XML "begin tag".
// The expected format is as follows: <name a="xyz" b="xyz" ... N="xyz">
// An optional integer may directly follow the '<'.
// The template parameters deliberately avoid a leading underscore followed by
// an upper-case letter, because such identifiers are reserved for the
// implementation.
template < typename Iterator, typename Skipper >
struct tag_begin : qi::grammar< Iterator, Tag( ), Skipper >
{
    tag_begin( ) : tag_begin::base_type( m_tag )
    {
        // Names and values: a (possibly empty) run of the characters [a-zA-Z0-9_.:].
        // lexeme[] prevents the skipper from running between those characters, so
        // the run ends at the first character outside the set (e.g. ' ', '=', '"' or '>').
        m_string = qi::lexeme[ *( qi::char_( "a-zA-Z0-9_.:" ) ) ];

        // A single attribute has the format: name="value"
        m_attribute =    m_string
                      >> "=\""
                      >> m_string
                      >> '"';

        // Zero or more attributes; whatever stands between them is left to the skipper.
        // An attribute is not attempted when the next token is the closing '>'.
        m_list = *( m_attribute - '>' );

        // The complete begin tag: <[indent]name a="xyz" b="xyz" ... N="xyz">
        m_tag =     '<'
                 >> -qi::int_
                 >> m_string
                 >> m_list
                 >> '>';

        // Enable debug support for the used rules. To activate the debug output define macro BOOST_SPIRIT_DEBUG:
        BOOST_SPIRIT_DEBUG_NODES( ( m_string )( m_attribute )( m_list ) )
    }

    // The rules used within this grammar:
    qi::rule< Iterator, Tag::name_type( )     , Skipper > m_string;
    qi::rule< Iterator, Tag::attribute_type( ), Skipper > m_attribute;
    qi::rule< Iterator, Tag::list_type( )     , Skipper > m_list;
    qi::rule< Iterator, Tag( )                , Skipper > m_tag;
};

bool beginTag( std::istream& stream, Tag& tag )
{
    // Ensure that no whitespace characters are skipped:
    stream.unsetf( std::ios::skipws );

    // Create begin and end iterator for given stream:
    boost::spirit::istream_iterator begin( stream );
    boost::spirit::istream_iterator end;

    // Define the grammar skipper type:
    typedef skipper<
        boost::spirit::istream_iterator
    > skipper_type;

    // Create an instance of the used skipper:
    skipper_type sk;

    // Create an instance of the used grammar:
    tag_begin<
        boost::spirit::istream_iterator,
        skipper_type
    > gr;

    // Try to parse the data stored within the stream according the grammar and store the result in the tag variable:
    bool r = boost::spirit::qi::phrase_parse( begin,
                                              end,
                                              gr,
                                              sk,
                                              tag );

    char nextSym = 0;
    stream >> nextSym;

    for( auto i = tag.m_attribute.begin( ); i != tag.m_attribute.end( ); ++i )
    {
        std::cout << i->first << " : " << i->second << std::endl;
    }
    std::cout << "Next symbol: " << nextSym << std::endl;

    return r;
}

int main( )
{
    std::stringstream s;
    s << "<object cName=\"bool\" cVersion=\"1\" vName=\"bool\">       <value>0</value></object>";

    Tag t;
    beginTag( s, t );

    return 0;
}

I use the grammar to extract the xml tag content. In principle this works as expected and the results are as follows:

cName : bool
cVersion : 1
vName : bool
Next symbol: v

The problem is that the parser consumes too much data. My expectation is that the parser stops when the first tag is closed by '>'. But it seems that the parser also consumes the following spaces and the '<' symbol, so the next symbol read from the stream is 'v'. I would like to avoid this because subsequent parser calls expect the '<' symbol. Any ideas?

like image 556
Maik Avatar asked Sep 25 '22 11:09

Maik


1 Answers

There's no reliable way to achieve this.

The problem is that you're not re-using the istream_iterators across parse calls. The whole purpose of boost::spirit::istream_iterator is to provide a multi_pass-capable iterator on top of an InputIterator¹.

Because Spirit allows arbitrary grammars with arbitrary backtracking, you cannot prevent consuming more than the input that was actually successfully parsed.

The obvious solution here is to integrate all the subsequent steps into the same grammar and/or to reuse the iterators (so the iterator's stored backtrack buffer still contains the characters you need).


Demonstration / Proof of Concept

Here is a version that parses open tags in a loop

// Keep parsing with the SAME iterator pair for as long as another open tag matches:
while (boost::spirit::qi::phrase_parse(begin, end, gr, sk, tag)) {
    std::cout << "============\nParsed open tag '" << tag.m_name << "'\n";
    for (auto const& p: tag.m_attribute)
        std::cout << p.first << ": " << p.second << "\n";

    count += 1;
    tag.clear(); // empty the name and the attribute list before the next parse
};

// Print everything between the current iterator position and the end of the input:
std::cout << "Next symbol: ";
std::copy(begin, end, std::ostream_iterator<char>(std::cout));

And it prints:

============
Parsed open tag 'object'
cName: bool
cVersion: 1
vName: bool
============
Parsed open tag 'value'
Next symbol: 0</value>
        </object>

Live On Coliru

//#define BOOST_SPIRIT_DEBUG
#include <boost/fusion/adapted.hpp>
#include <boost/spirit/include/qi.hpp>
#include <map>

namespace qi = boost::spirit::qi;

// Plain data holder for an XML tag: its indentation level, its name and a
// variable number of attributes.
struct Tag {
    typedef std::string name_type;
    typedef std::string value_type;

    typedef std::pair<name_type, value_type> attribute_type;
    typedef std::map<name_type, value_type>  list_type;

    // Reset the instance to its initial state: no indentation, empty name, no
    // attributes. The indentation is reset too, so an instance that is re-used
    // for several parses never carries over the value of the previous tag.
    void clear() {
        m_indent = 0;
        m_name.clear();
        m_attribute.clear();
    }

    // Zero-initialised so that the value is well defined even if nothing ever assigns it.
    std::size_t m_indent = 0; // The tag shall be / is indented by m_indent number of tabs.
    name_type m_name;         // Name of the tag.
    list_type m_attribute;    // List of tag attributes.
};

// Adapt Tag as a Boost.Fusion sequence; the members are listed in the order
// m_indent, m_name, m_attribute:
BOOST_FUSION_ADAPT_STRUCT(Tag, m_indent, m_name, m_attribute)

// Grammar used to parse an XML-like begin tag.
// The expected format is as follows: <name a="xyz" b="xyz" ... N="xyz">
// An optional integer may directly follow the '<'.
template <typename Iterator, typename Skipper> struct tag_begin : qi::grammar<Iterator, Tag(), Skipper> {
    tag_begin() : tag_begin::base_type(m_tag) {
        // Names and values: a (possibly empty) run of the characters [a-zA-Z0-9_.:].
        m_string     = *qi::char_("a-zA-Z0-9_.:");
        // One attribute: name, '=', then the quoted value. The quotes and the value
        // sit inside lexeme[], so the skipper does not run between them.
        m_attribute  = m_string >> '=' >> qi::lexeme['"' >> m_string >> '"'];
        // Zero or more attributes:
        m_attributes = *m_attribute;
        // The complete begin tag: '<', optional integer, name, attributes, '>'.
        m_tag        = '<' >> -qi::int_ >> m_string >> m_attributes >> '>';

        // Debug output for the listed rules; to activate it define the macro BOOST_SPIRIT_DEBUG:
        BOOST_SPIRIT_DEBUG_NODES((m_string)(m_attribute)(m_attributes))
    }
  private:

    // The following variables define the rules used within this grammar:
    qi::rule<Iterator, Tag::attribute_type(), Skipper> m_attribute;
    qi::rule<Iterator, Tag::list_type(), Skipper> m_attributes;
    qi::rule<Iterator, Tag(), Skipper> m_tag;
    // lexemes: declared without a skipper type
    qi::rule<Iterator, Tag::name_type()> m_string;
};

bool beginTag(std::istream &stream, Tag &tag) {
    // Ensure that no whitespace characters are skipped:
    stream.unsetf(std::ios::skipws);

    typedef boost::spirit::istream_iterator It; 
    typedef qi::rule<It> skipper_type;

    skipper_type sk = qi::space | qi::cntrl;
    tag_begin<boost::spirit::istream_iterator, skipper_type> gr;

    It begin(stream), end;

    int count = 0;
    while (boost::spirit::qi::phrase_parse(begin, end, gr, sk, tag)) {
        std::cout << "============\nParsed open tag '" << tag.m_name << "'\n";
        for (auto const& p: tag.m_attribute)
            std::cout << p.first << ": " << p.second << "\n";

        count += 1;
        tag.clear();
    };

    std::cout << "Next symbol: ";
    std::copy(begin, end, std::ostream_iterator<char>(std::cout));

    return count > 0;
}

int main() {
    std::stringstream s;
    s << R"(
        <object cName="bool" cVersion="1" vName="bool">
            <value>0</value>
        </object>
    )";

    Tag t;
    beginTag(s, t);
}

¹ (which is strictly forward-only and cannot be repeatedly dereferenced)

like image 148
sehe Avatar answered Sep 28 '22 04:09

sehe