我正在使用libunistring处理 C 中的 unicode 字符串。无法使用其他库。我的目标是从 unicode 字符串的索引位置读取单个字符,打印它,并将其与固定值进行比较。这应该很简单,但是...
这是我的尝试(完整的 C 程序):
/* This file must be UTF-8 encoded in order to work */
#include <locale.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unitypes.h>
#include <uniconv.h>
#include <unistdio.h>
#include <unistr.h>
#include <uniwidth.h>
int cmpchr(const char *label, const uint32_t charExpected, const uint32_t charActual) {
int result = u32_cmp(&charExpected, &charActual, 1);
if (result == 0) {
printf("%s is recognized as '%lc', good!\n", label, charExpected);
} else {
printf("%s is NOT recognized as '%lc'.\n", label, charExpected);
}
return result;
}
int main() {
setlocale(LC_ALL, ""); /* switch from default "C" encoding to system encoding */
const char *enc = locale_charset();
printf("Current locale charset: %s (should be UTF-8)\n\n", enc);
const char *buf = "foo 楽あり bébé";
const uint32_t *mbcs = u32_strconv_from_locale(buf);
printf("%s\n", u32_strconv_to_locale(mbcs));
uint32_t c0 = mbcs[0];
uint32_t c5 = mbcs[5];
uint32_t cLast = mbcs[u32_strlen(mbcs) - 1];
printf(" - char 0: %lc\n", c0);
printf(" - char 5: %lc\n", c5);
printf(" - last : %lc\n", cLast);
/* When this file is UTF-8-encoded, I'm passing a UTF-8 character
* as a uint32_t, which should be wrong! */
cmpchr("Char 0", 'f', c0);
cmpchr("Char 5", 'あ', c5);
cmpchr("Last char", 'é', cLast);
return 0;
}
为了运行这个程序:
- 将程序保存到名为ustridx.c的 UTF-8 编码文件
sudo apt-get install libunistring-dev
gcc -o ustridx.o -W -Wall -O -c ustridx.c ; gcc -o ustridx -lunistring ustridx.o
- 确保终端设置为 UTF-8 语言环境 (
locale
) - 运行它
./ustridx
输出:
Current locale charset: UTF-8 (should be UTF-8)
foo 楽あり bébé
- char 0: f
- char 5: あ
- last : é
Char 0 is recognized as 'f', good!
Char 5 is NOT recognized as '�����'.
Last char is NOT recognized as '쎩'.
期望的行为是正确识别char 5和last char,并在输出的最后两行正确打印。