c++ - boost::spirit::lex 令牌是如何被识别的

Question

我正在学习使用 boost::spirit。为此，我想创建一些简单的词法分析器，将它们组合起来，然后开始使用 Spirit 进行解析。但结果很混乱：

这是词法分析器：

// #define BOOST_SPIRIT_LEXERTL_DEBUG
#define BOOST_VARIANT_MINIMIZE_SIZE

#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_statement.hpp>
#include <boost/spirit/include/phoenix_container.hpp>


#include <iostream>
#include <string>

using namespace boost::spirit;
using namespace boost::spirit::ascii;


// Token ids shared by the lexer rules (sip_token) and the grammar
// (sip_grammar). Numbering starts 10 above lex::min_token_id; the remaining
// enumerators take the following consecutive values.
enum tokenids
{
  // Id attached to the catch-all "." rule.
  IDANY = lex::min_token_id + 10,
  // Id attached to the {USER} rule.
  T_USER,
  // Id attached to the {DOMAINLABEL} rule.
  T_DOMAINLABEL,
  // Id attached to the "\r\n" rule.
  T_CRLF
};


// Lexer definition: registers a set of named regex patterns and then adds
// four token rules that refer to them as {NAME}.
//
// NOTE(review): rule order looks significant here. UNRESERVED includes
// ALPHANUM, so every string matched by {DOMAINLABEL} is also matched by
// {USER}, and {USER} is added first -- presumably T_DOMAINLABEL is therefore
// never produced. Confirm against the lexertl matching rules before
// reordering anything.
template <typename Lexer>
struct sip_token : lex::lexer<Lexer>
{
  sip_token()
  {
    // Named building blocks; later patterns are composed from earlier ones.
    this->self.add_pattern
      ("ALPHANUM", "[0-9a-zA-Z]")
      ("MARK", "[-_.!~*'()]")           
      ("UNRESERVED","{ALPHANUM}|{MARK}")            
      ("USER", "({UNRESERVED})+" ) 
      ("DOMAINLABEL", "({ALPHANUM})+")
      // ("DOMAINLABEL", "{ALPHANUM}|({ALPHANUM}({ALPHANUM}|-)*{ALPHANUM})") 
      ;     

    // Token rules, each paired with its id from tokenids.
    this->self.add
      ("{USER}",T_USER)
      ("{DOMAINLABEL}", T_DOMAINLABEL)          
      ("\r\n", T_CRLF)
      (".", IDANY)    // string literals will not be escaped by the library
      ;
  } 
};


// Token-level grammar: the start rule accepts one T_DOMAINLABEL token
// followed by one T_CRLF token. The semantic actions increment the public
// counters: c and l on the T_DOMAINLABEL token, w on the T_CRLF token.
template <typename Iterator>
struct sip_grammar : qi::grammar<Iterator>
{
  // tok is accepted for symmetry with the lexer definition; the rule below
  // refers to tokens by id only, so it is not read here.
  template <typename TokenDef>
  sip_grammar(TokenDef const& tok)
    : sip_grammar::base_type(start)
    , c(0), w(0), l(0)
  {
    namespace phx = boost::phoenix;

    start = qi::token(T_DOMAINLABEL)[++phx::ref(c), ++phx::ref(l)]
         >> qi::token(T_CRLF)[++phx::ref(w)];
  }

  // Counters updated by the semantic actions; all start at zero.
  std::size_t c, w, l;
  // Entry rule handed to the grammar base class.
  qi::rule<Iterator> start;
};



int main(int argc, char* argv[])
{
  typedef lex::lexertl::token<
  char const*, boost::mpl::vector<std::string>
  > token_type;

  typedef std::string::const_iterator str_iterator_type;
  typedef lex::lexertl::lexer<token_type> lexer_type;
  typedef sip_token<lexer_type>::iterator_type iterator_type;

  std::string str;
  while (std::getline(std::cin, str))
  {
    if (str.empty() || str[0] == 'q' || str[0] == 'Q')
      break;        
    else
      str += "\r\n";

    sip_token<lexer_type> siplexer;
    sip_grammar<iterator_type > g(siplexer);

    char const* first = str.c_str();
    char const* last = &first[str.size()];

    /*<  Parsing is done based on the the token stream, not the character
    stream read from the input. The function `tokenize_and_parse()` wraps
    the passed iterator range `[first, last)` by the lexical analyzer and
    uses its exposed iterators to parse the toke stream.
    >*/  
    unsigned result = 0;
    bool r = lex::tokenize_and_parse(first, last, siplexer, g);     

    if (r) {
      std::cout << "Parsing OK" << g.l << ", " << g.w
        << ", " << g.c << "\n";
    }
    else {
      std::string rest(first, last);
      std::cerr << "Parsing failed\n" << "stopped at: \""
        << rest << "\"\n";
    }

  }
  return 0;
}
//]

在代码中，我在“T_USER”之后添加了“T_DOMAINLABEL”，T_DOMAINLABEL总是解析失败。似乎词法分析器会首先匹配T_USER。这是为什么？这是否意味着我不能将这些相似的模式加在一起？

score 2 · Accepted Answer

好吧，T_USER 匹配：

  ("{USER}",T_USER)

  // which is defined as
  ("USER", "({UNRESERVED})+" ) 

  // which is defined as
  ("UNRESERVED","{ALPHANUM}|{MARK}")

所以，它会匹配任意一串字母数字字符（也包括“标记”字符，不过这一点在这里暂且无关紧要）。

T_DOMAINLABEL 匹配：

  ("{DOMAINLABEL}", T_DOMAINLABEL)          

  // which is defined as
  ("DOMAINLABEL", "({ALPHANUM})+")

如您所见，任何 T_DOMAINLABEL 令牌始终是有效的 T_USER 令牌。所以，它永远不可能得到 T_DOMAINLABEL。

这并不是因为令牌“无法匹配”，而是因为词法分析（令牌化）是急切（eager）进行的，并且不会在单个令牌的范围之外进行回溯。

c++ - boost::spirit::lex 令牌是如何被识别的

1 回答 1

Related

Reference