c++ - 如何将包含 utf-16 编码文本的 std::string 转换为 utf-16 wstring？

Question

所以我们得到一个字符串 РќРѕРІР°СЏ РїР°РїРєР°，它是 utf-16 编码行（Новая папка在 utf-16 中）的 utf-8 表示，我们希望将此字符串转换为 wstring 而不更改编码.. 意思是从字面上将所有数据从字符串带到 wstring 而无需任何转换. 所以我们会得到 wstring 的Новая папка内容。怎么做这样的事情？

更新： 我的意思是 - 我们在字符串中拥有正确 utf-16 字符串的所有数据。我们所需要做的就是将该数据放入 wstring... 这意味着如果 wstring 包含 wchar 可能恰好是0000我们必须将 2 个字符串字符00一起00才能得到它。那就是我不知道该怎么做。

Update2 我是如何来到这里的——我有义务在我的服务器上使用的 C++ 库是 C 风格的解析器。它以 std::string 的形式返回我的用户请求地址。当我让我的客户以这种格式向我发送请求时。

url_encode(UTF16toUTF8(wstring)) //pseudocode.

在哪里

string UTF16toUTF8(const wstring & in)
{
    string out;
    unsigned int codepoint;
    bool completecode = false;
    for (wstring::const_iterator p = in.begin();  p != in.end();  ++p)
    {
        if (*p >= 0xd800 && *p <= 0xdbff)
        {
            codepoint = ((*p - 0xd800) << 10) + 0x10000;
            completecode = false;
        }
        else if (!completecode && *p >= 0xdc00 && *p <= 0xdfff)
        {
            codepoint |= *p - 0xdc00;
            completecode = true;
        }
        else
        {
            codepoint = *p;
            completecode = true;
        }
        if (completecode)
        {
            if (codepoint <= 0x7f)
                out.push_back(codepoint);
            else if (codepoint <= 0x7ff)
            {
                out.push_back(0xc0 | ((codepoint >> 6) & 0x1f));
                out.push_back(0x80 | (codepoint & 0x3f));
            }
            else if (codepoint <= 0xffff)
            {
                out.push_back(0xe0 | ((codepoint >> 12) & 0x0f));
                out.push_back(0x80 | ((codepoint >> 6) & 0x3f));
                out.push_back(0x80 | (codepoint & 0x3f));
            }
            else
            {
                out.push_back(0xf0 | ((codepoint >> 18) & 0x07));
                out.push_back(0x80 | ((codepoint >> 12) & 0x3f));
                out.push_back(0x80 | ((codepoint >> 6) & 0x3f));
                out.push_back(0x80 | (codepoint & 0x3f));
            }
        }
    }
    return out;
}

std::string url_encode( std::string sSrc )
{
    const char SAFE[256] =
    {
        /*      0 1 2 3  4 5 6 7  8 9 A B  C D E F */
        /* 0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* 1 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* 2 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* 3 */ 1,1,1,1, 1,1,1,1, 1,1,0,0, 0,0,0,0,

        /* 4 */ 0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        /* 5 */ 1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,
        /* 6 */ 0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        /* 7 */ 1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,

        /* 8 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* 9 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* A */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* B */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,

        /* C */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* D */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* E */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
    };
    const char DEC2HEX[16 + 1] = "0123456789ABCDEF";
    const unsigned char * pSrc = (const unsigned char *)sSrc.c_str();
    const int SRC_LEN = sSrc.length();
    unsigned char * const pStart = new unsigned char[SRC_LEN * 3];
    unsigned char * pEnd = pStart;
    const unsigned char * const SRC_END = pSrc + SRC_LEN;

    for (; pSrc < SRC_END; ++pSrc)
    {
        if (SAFE[*pSrc]) 
            *pEnd++ = *pSrc;
        else
        {
            // escape this char
            *pEnd++ = '%';
            *pEnd++ = DEC2HEX[*pSrc >> 4];
            *pEnd++ = DEC2HEX[*pSrc & 0x0F];
        }
    }

    std::string sResult((char *)pStart, (char *)pEnd);
    delete [] pStart;
    return sResult;
}

std::string url_decode( std::string sSrc )
{
    // Note from RFC1630:  "Sequences which start with a percent sign
    // but are not followed by two hexadecimal characters (0-9, A-F) are reserved
    // for future extension"

    const char HEX2DEC[256] = 
    {
        /*       0  1  2  3   4  5  6  7   8  9  A  B   C  D  E  F */
        /* 0 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 1 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 2 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 3 */  0, 1, 2, 3,  4, 5, 6, 7,  8, 9,-1,-1, -1,-1,-1,-1,

        /* 4 */ -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 5 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 6 */ -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 7 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,

        /* 8 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 9 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* A */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* B */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,

        /* C */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* D */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* E */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* F */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1
    };

    const unsigned char * pSrc = (const unsigned char *)sSrc.c_str();
    const int SRC_LEN = sSrc.length();
    const unsigned char * const SRC_END = pSrc + SRC_LEN;
    const unsigned char * const SRC_LAST_DEC = SRC_END - 2;   // last decodable '%' 

    char * const pStart = new char[SRC_LEN];
    char * pEnd = pStart;

    while (pSrc < SRC_LAST_DEC)
    {
        if (*pSrc == '%')
        {
            char dec1, dec2;
            if (-1 != (dec1 = HEX2DEC[*(pSrc + 1)])
                && -1 != (dec2 = HEX2DEC[*(pSrc + 2)]))
            {
                *pEnd++ = (dec1 << 4) + dec2;
                pSrc += 3;
                continue;
            }
        }

        *pEnd++ = *pSrc++;
    }

    // the last 2- chars
    while (pSrc < SRC_END)
        *pEnd++ = *pSrc++;

    std::string sResult(pStart, pEnd);
    delete [] pStart;
    return sResult;
}

当然我调用 url_decode，但我得到一个字符串..( 所以我希望现在我的问题更清楚了。

score 2 · Accepted Answer

这是我正在修补的解决您的问题的方法：

std::string wrong("РќРѕРІР°СЏ РїР°РїРєР°");
std::wstring correct( (wchar_t*)wrong.data() );

根据http://www.cplusplus.com/reference/string/string/data/ data() 成员函数应该给我们原始 char* 并且简单地转换为 (wchar_t*) 应该会导致它坚持 00 和正如您在示例中描述的那样，00 一起构成 0000。

我个人不喜欢这样选角，但这就是我目前所想到的。

编辑 - 您使用的是哪个库？它是否带有其他一些功能来扭转它所做的事情？

如果它很受欢迎，肯定有人以前遇到过这个问题。他们是如何解决的？

编辑2 - 这是一种令人作呕的方式，使用malloc，一些假设原始字符串中不会有任何半码点，以及另一个可怕的演员表。:(

std::string wrong("РќРѕРІР°СЏ РїР°РїРєР°");
wchar_t *lesswrong = (wchar_t*) malloc (wrong.size()/sizeof(wchar_t) + sizeof(wchar_t));
lesswrong = (wchar_t*)wrong.data();
lesswrong[wrong.size()] = '\0';
std::wstring correct( lesswrong );

这不可能是正确的。即使它有效，它也是如此丑陋。

编辑 3 - 像 Kerrick sadi 一样，这是一种更好的方法。

std::string wrong("РќРѕРІР°СЏ РїР°РїРєР°");
std::wstring correct( (wchar_t*)wrong.data(), wrong.size()/2 );

score 1 · Accepted Answer

如果我理解正确，您有一个std::string包含UTF-16编码字符串的对象，并且您希望在std::wstring不更改编码的情况下将其转换为 a。如果我是正确的，那么，您不必转换编码，也不必转换表示，而只需转换存储。

您还认为该字符串可能被错误地编码为UTF-8. 但是，UTF-8它是可变长度编码，但是您错误解释的数据的长度（РќРѕРІР°СЏ РїР°РїРєР°是 22 个字符长）正好是原始数据长度的两倍（Новая папка是 11 个字符长）。这就是为什么我怀疑这可能只是存储错误而不是编码错误的情况。

以下代码执行此操作：

std::wstring convert_utf16_string_to_wstring(const std::string& input) {
    assert((input.size() & 1) == 0);
    size_t len = input.size() / 2;
    std::wstring output;
    output.resize(len);

    for (size_t i = 0; i < len; ++i) {
        unsigned char chr1 = (unsigned char)input[2 * i];
        unsigned char chr2 = (unsigned char)input[2 * i + 1];

        // Note: this line suppose that you use `UTF-16-BE` both for
        // the std::string and the std::wstring. You'll have to swap
        // chr1 & chr2 if this is not the case.
        unsigned short val = (chr2 << 8)|(chr1);
        output[i] = (wchar_t)(val);
    }

    return output;
}

如果您知道在所有平台上您的目标sizeof(wchar_t)等于 2（这不是一个用于 64 位程序的 Mac OS 例如sizeof(wchar_t)等于 4），那么您可以使用简单的转换：

std::wstring convert_utf16_string_to_wstring(const std::string& input) {
    assert(sizeof(wchar_t) == 2); // A static assert would be better here
    assert((input.size() & 1) == 0);
    return input.empty()
        ? std::wstring()
        : std::wstring((wchar_t*)input[0], input.size() / 2);
}

c++ - 如何将包含 utf-16 编码文本的 std::string 转换为 utf-16 wstring？

2 回答 2

Related

Reference