2

在一个 C++ 应用程序中，字符 ú 以 UTF-8 编码 %C3%BA 的形式被接收，需要将其存储为对应的 Unicode 码点 %FA，应该如何进行转换？我只想知道如何编写代码来完成这一转换过程。

4

2 回答 2

8

我昨天刚刚写了一些代码来做到这一点......

我并不是说这是执行此操作的“完美”方式,但它似乎适用于我运行过的所有测试用例(为此我编写了两个方向)。

至于如何将“%NN”转换为整数值，就留给您自己完成了。

#include <iostream>
#include <deque>

// Encodes one Unicode code point as a UTF-8 byte sequence.
//
// charcode: the code point to encode. Values that are not Unicode scalar
//           values (negative, above 0x10FFFF, or inside the surrogate range
//           0xD800..0xDFFF) have no valid UTF-8 form and are encoded as
//           U+FFFD REPLACEMENT CHARACTER instead.
//
// Returns the encoded bytes, lead byte first. The result holds 1 to 4
// elements and every element is in the range 0x00..0xFF.
std::deque<int> unicode_to_utf8(int charcode)
{
    const int max_code_point = 0x10FFFF;
    const int replacement_char = 0xFFFD;
    const bool is_surrogate = (charcode >= 0xD800 && charcode <= 0xDFFF);
    if (charcode < 0 || charcode > max_code_point || is_surrogate)
    {
        charcode = replacement_char;
    }

    std::deque<int> d;
    if (charcode < 0x80)
    {
        // ASCII is encoded as itself.
        d.push_back(charcode);
        return d;
    }

    // Pick the sequence length from the magnitude of the code point. The lead
    // byte marker has one high bit set per byte in the sequence, followed by
    // a zero bit.
    int continuation_count = 3;
    int lead_marker = 0xF0;
    if (charcode < 0x800)
    {
        continuation_count = 1;
        lead_marker = 0xC0;
    }
    else if (charcode < 0x10000)
    {
        continuation_count = 2;
        lead_marker = 0xE0;
    }

    // Emit continuation bytes (10xxxxxx) from the least significant 6-bit
    // group upwards; push_front leaves them in transmission order.
    for (int i = 0; i < continuation_count; ++i)
    {
        d.push_front(0x80 | (charcode & 0x3F));
        charcode >>= 6;
    }

    // Whatever is left of the code point fits in the lead byte's payload bits.
    d.push_front(lead_marker | charcode);
    return d;
}


// Decodes one UTF-8 encoded code point from the front of `coded`.
//
// coded: byte values (each expected in 0x00..0xFF), lead byte first. The
//        elements that are decoded are removed from the front of the deque,
//        so the function can be called repeatedly to walk a longer sequence.
//
// Returns the decoded code point (0..0x10FFFF), or -1 when the input is
// empty, truncated, or not well-formed UTF-8 (stray continuation byte,
// invalid lead byte, overlong form, surrogate, or value above 0x10FFFF).
//
// On failure the lead element (if any) and the continuation bytes accepted so
// far have been removed; the element that caused the failure is left in place
// so the caller can resynchronise on it.
int utf8_to_unicode(std::deque<int> &coded)
{
    const int malformed = -1;

    if (coded.empty())
    {
        return malformed;
    }

    const int lead = coded.front();
    coded.pop_front();

    if (lead < 0 || lead > 0xFF)
    {
        // Not a byte value at all.
        return malformed;
    }
    if (lead < 0x80)
    {
        // ASCII decodes to itself.
        return lead;
    }

    // Classify the lead byte: number of continuation bytes that must follow,
    // the payload bits it carries, and the smallest code point that is
    // allowed to use this length (anything smaller is an overlong form).
    int continuation_count = 0;
    int charcode = 0;
    int min_value = 0;
    if ((lead & 0xE0) == 0xC0)
    {
        continuation_count = 1;
        charcode = lead & 0x1F;
        min_value = 0x80;
    }
    else if ((lead & 0xF0) == 0xE0)
    {
        continuation_count = 2;
        charcode = lead & 0x0F;
        min_value = 0x800;
    }
    else if ((lead & 0xF8) == 0xF0)
    {
        continuation_count = 3;
        charcode = lead & 0x07;
        min_value = 0x10000;
    }
    else
    {
        // 0x80..0xBF is a continuation byte without a lead byte;
        // 0xF8..0xFF never appear in UTF-8.
        return malformed;
    }

    for (int i = 0; i < continuation_count; ++i)
    {
        if (coded.empty())
        {
            return malformed; // sequence ends too early
        }
        const int next = coded.front();
        if (next < 0x80 || next > 0xBF)
        {
            return malformed; // not a 10xxxxxx continuation byte
        }
        coded.pop_front();
        charcode = (charcode << 6) | (next & 0x3F);
    }

    const bool is_surrogate = (charcode >= 0xD800 && charcode <= 0xDFFF);
    if (charcode < min_value || charcode > 0x10FFFF || is_surrogate)
    {
        return malformed;
    }
    return charcode;
}

// Interactive round-trip demo: repeatedly reads a code point from standard
// input (parsed as a decimal integer), prints the result of unicode_to_utf8
// as \xNN values, then feeds that result to utf8_to_unicode and prints the
// value in decimal and hex.
//
// The loop ends when a value can no longer be read (end of input or text that
// is not an integer); without that check a failed read would leave the stream
// in an error state and the loop would spin forever.
int main()
{
    int charcode;

    for(;;)
    {
        // std::endl flushes so the prompt is visible before blocking on input.
        std::cout << "Enter unicode value:" << std::endl;
        if (!(std::cin >> charcode))
        {
            break;
        }

        auto x = unicode_to_utf8(charcode);
        for(auto c : x)
        {
            std::cout << "\\x" << std::hex << c << " ";
        }
        std::cout << std::endl;

        // utf8_to_unicode consumes elements from x while decoding.
        int c = utf8_to_unicode(x);
        std::cout << "reversed:" << std::dec << c << std::hex << " in hex:" << c << std::endl;
    }
    return 0;
}
于 2013-08-30T14:00:21.670 回答
1

这实际上在标准库中:

#include <string>
#include <codecvt> // for std::codecvt_utf8
#include <locale>  // for std::wstring_convert


// Converter between UTF-8 byte strings and UTF-32 code point strings.
using Utf8Utf32Converter = std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t>;
Utf8Utf32Converter conv_utf8_utf32;


// Minimal demonstration: decode a UTF-8 std::string into a std::u32string
// holding one char32_t per code point.
int main() {

    // NOTE(review): the bytes stored here are whatever the compiler produces
    // for this narrow literal; the conversion below assumes they are UTF-8.
    const std::string utf8_bytes("ú");

    // from_bytes performs the UTF-8 -> UTF-32 direction.
    const std::u32string unicode_codepoints =
        conv_utf8_utf32.from_bytes(utf8_bytes);

    return 0;
}

反方向的转换（从 Unicode 码点到 UTF-8 字节）可以使用 conv_utf8_utf32.to_bytes。

使用 printf 以您所需的 %hex 格式打印的示例：

#include <string>
#include <codecvt> // for std::codecvt_utf8
#include <locale>  // for std::wstring_convert
#include <cstdio>


// Converter between UTF-8 byte strings and UTF-32 code point strings.
// Note: std::wstring_convert and <codecvt> are deprecated since C++17.
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv_utf8_utf32;


// Prints a UTF-8 string twice in "%XX" form: first byte by byte, then code
// point by code point after decoding it to UTF-32.
int main() {

    std::string utf8_bytes = "ú";
    // print the bytes in %hex format
    for (const char byte : utf8_bytes) {
        // Go through unsigned char first so bytes >= 0x80 are not
        // sign-extended, and zero-pad so values below 0x10 print as "%0A"
        // rather than "% A".
        printf("%%%02X", static_cast<unsigned int>(static_cast<unsigned char>(byte)));
    }
    printf("\n");


    std::u32string unicode_codepoints = conv_utf8_utf32.from_bytes(utf8_bytes);

    // print the code points in %hex format
    for (const char32_t chr : unicode_codepoints) {
        // %X expects an unsigned int argument, so convert explicitly.
        printf("%%%02X", static_cast<unsigned int>(chr));
    }
    printf("\n");


    return 0;
}
于 2020-01-14T09:37:25.130 回答