c++ - Boost tokenizer 将带引号的字符串视为一个标记

Question

有没有办法让 Boost 分词器在拆分下面的字符串时，不拆分其中带引号的部分？

string s = "1st 2nd \"3rd with some comment\" 4th";

Expected output:
1st
2nd
3rd with some comment
4th

score 3 · Accepted Answer

您可以使用 Boost.Tokenizer 库中的 escaped_list_separator。有关如何将其应用于您的问题的更多细节，请参阅此问题。

score 2 · Accepted Answer

C++11解决方案

#include <iostream>
#include <string>
#include <vector>

// Splits `str` on single spaces, keeping the contents of a double-quoted
// section together (spaces inside quotes are preserved, the quote characters
// themselves are dropped).
//
// - Runs of spaces outside quotes never produce empty tokens.
// - A closing quote always emits the text collected so far, so `""` yields an
//   empty token.
// - Text still pending at the end of the input (including an unterminated
//   quoted section) is emitted if it is non-empty.
std::vector<std::string> tokenize(const std::string& str) {
    std::vector<std::string> result;
    std::string current;
    bool quoted = false;

    for (const char ch : str) {
        switch (ch) {
        case '"':
            // A closing quote terminates the current token unconditionally.
            if (quoted) {
                result.push_back(current);
                current.clear();
            }
            quoted = !quoted;
            break;

        case ' ':
            if (quoted) {
                // Inside quotes a space is ordinary token content.
                current += ch;
            } else if (!current.empty()) {
                // Outside quotes a space ends the pending token, if any.
                result.push_back(current);
                current.clear();
            }
            break;

        default:
            current += ch;
            break;
        }
    }

    // Flush whatever was still being collected when the input ended.
    if (!current.empty()) {
        result.push_back(current);
    }

    return result;
}

int main() {
    std::string s = "1st 2nd \"3rd with some comment\" 4th";
    std::vector<std::string> tokens = tokenize(s);
    for (auto iter = tokens.cbegin(); iter != tokens.cend(); ++iter) {
        std::cout << *iter << "\n";
    }
}

score 1 · Accepted Answer

试试这段代码，这样你就可以避免使用 Boost.Tokenizer 和 Boost.Spirit 库

#include <vector>
#include <string>
#include <iostream>

// Characters that delimit tokens: space and 9 (the ASCII horizontal tab).
const char Separators[] = { ' ', 9 };

// Returns true when Ch is one of the characters listed in Separators.
bool Str_IsSeparator( const char Ch )
{
    for ( const char Candidate : Separators )
    {
        if ( Candidate == Ch )
        {
            return true;
        }
    }

    return false;
}

// Splits Str into tokens delimited by separator characters (see
// Str_IsSeparator) and stores tokens number FromToken..ToToken (1-based,
// inclusive) into Components.
//
// A token that starts with a double quote extends up to the next double quote
// (or the end of the string) and may contain separators; the quotes themselves
// are not stored. Empty tokens (e.g. `""`) are skipped and not counted.
//
// Storage is 1-based: token number FromToken is written to Components[1], the
// next one to Components[2], and so on; Components[0] is never written.
// Components is grown when it is too small to hold a slot; existing elements
// that are not overwritten are left untouched.
//
// FromToken is expected to be >= 1.
void SplitLine( size_t FromToken, size_t ToToken, const std::string& Str, std::vector<std::string>& Components /*, bool ShouldTrimSpaces*/ )
{
    size_t TokenNum = 0;
    const size_t Offset = FromToken - 1;

    const char* CStr = Str.c_str();

    while ( *CStr )
    {
        // bypass spaces & delimiting chars
        while ( *CStr && Str_IsSeparator( *CStr ) ) { CStr++; }

        if ( !*CStr ) { return; }

        const bool InsideQuotes = ( *CStr == '\"' );

        // step over the opening quote so it is not part of the token
        if ( InsideQuotes ) { CStr++; }

        // find the end of the token: the closing quote for a quoted token,
        // the next separator otherwise (or the end of the string in both cases)
        const char* CStrj = CStr;

        if ( InsideQuotes )
        {
            while ( *CStrj && *CStrj != '\"' ) { CStrj++; }
        }
        else
        {
            while ( *CStrj && !Str_IsSeparator( *CStrj ) ) { CStrj++; }
        }

        // extract token
        if ( CStr != CStrj )
        {
            TokenNum++;

            // store each token found
            if ( TokenNum >= FromToken )
            {
                const size_t Slot = TokenNum - Offset;

                // operator[] is unchecked: make sure the slot exists before writing
                if ( Slot >= Components.size() ) { Components.resize( Slot + 1 ); }

                Components[ Slot ].assign( CStr, CStrj );
                // if ( ShouldTrimSpaces ) { Str_TrimSpaces( &Components[ Slot ] ); }

                if ( TokenNum >= ToToken ) { return; }
            }
        }

        // Always move past the token and its terminating character (closing
        // quote or separator), even when the token was empty. Otherwise the
        // closing quote of `""` would be re-read as an opening quote and the
        // rest of the line would be swallowed into one bogus token.
        CStr = CStrj;

        // exclude last " from token, handle EOL
        if ( *CStr ) { CStr++; }
    }
}

// Demo: extracts tokens 1..4 of the sample string into a pre-sized vector and
// prints every element (including the unused slot 0) on its own line.
int main()
{
    const std::string test = "1st 2nd \"3rd with some comment\" 4th";

    // Five default-constructed (empty) strings: slot 0 stays unused, 1..4 hold tokens.
    std::vector<std::string> Out( 5 );
    SplitLine( 1, 4, test, Out );

    for ( const std::string& Item : Out )
    {
        std::cout << Item << std::endl;
    }

    return 0;
}

它使用预先分配的字符串数组（下标不是从零开始的，但很容易修改），而且非常简单。

c++ - Boost tokenizer 将带引号的字符串视为一个标记

3 回答 3

Related

Reference