c++ - 如何在 C++ 中获取 UTF-8 的 1 字节字母的 UCS 代码？

Question

我需要检查一个字母（英语和俄语）是否按字母顺序排列。默认情况下，文件应该使用 UTF-8 编码。我发现，最好的解决方案是使用 UCS 代码。计算 2 字节编码字母的 UCS-code 的方法是

#include <stdio.h>
#include <stdlib.h>

char utf8len[256] = { 
  // len = utf8len[c] & 0x7  cont = utf8len[c] & 0x8 
  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, // 0  - 15
  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, // 16 - 31
  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, // 32 - 47
  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, // 48 - 63
  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, // 64 - 79
  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, // 80 - 95
  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, // 96 - 111
  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, // 112 - 127

  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, // 80 - 8f
  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, // 90 - 9f
  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, // a0 - af
  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, // b0 - bf

  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, // c0 - cf
  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, // d0 - df

  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3, // e0 - ef

  4,  4,  4,  4,  4,  4,  4,  4,  // f0 - f7

  5,  5,  5,  5,  // f8, f9, fa, fb

  6,  6,  // fc, fd

  0,  0   // fe, ff 
};

#define UTF8LEN(c) (utf8len[(unsigned char)(c)] & 0x7)
#define UTF8CONT(c) (utf8len[(unsigned char)(c)] & 0x8)

int main (int argc, char *argv[])
{
  char *s = "Б№1АГД"; //string which contains cyrillic symbols

  while (*s) {
    int ucode;

    printf ("[%s] %d\n", s, UTF8LEN(*s));
    if ((UTF8LEN(*s) == 2) && UTF8CONT(s[1])) {
      ucode = ((*s & 0x1f) << 6) | (s[1] & 0x3f); //! HERE I GET UCS CODE 
      printf ("ucode = 0x%x\n", ucode);
      s++;
    }
    s++;
  }

}

这是我正在寻找的解决方案的一半。这段代码只允许我使用西里尔符号（因为它们在 UTF-8 中用 2 个字节编码）。问题是，我也需要使用拉丁字母。那么我应该怎么做才能获得 1 字节符号的 UCS 代码（在我的情况下使用 UTF8LEN(c)=1）？

更新：可能，解决方案是：

ucode = *s

这行得通吗？

c++ - 如何在 C++ 中获取 UTF-8 的 1 字节字母的 UCS 代码？

0 回答 0

Related

Reference