c - 将通用字符名称转义为 C 中的相应字符

Question

新编辑： 基本上我提供了一个不正确的例子。在我的实际应用程序中，字符串当然不会总是“C:/Users/Familjen-Styren/Documents/V\u00E5gformer/20140104-0002/text.txt”。相反，我将在 java 中有一个输入窗口，然后我会将unicode 字符“转义”为通用字符名称。然后它将在 C中“未转义” （我这样做是为了避免将多字节字符从 java 传递到 c 时出现问题）。所以这里有一个例子，我实际上要求用户输入一个字符串（文件名）：

#include <stdio.h>
#include <string.h>

int func(const char *fname);

/*
 * Reads one whitespace-delimited file name from standard input, echoes it,
 * and reports whether func() could open that file.
 *
 * Returns the result of func() (1 if the file could be opened, 0 if not);
 * also returns 0 when no name could be read.
 */
int main()
{
   char src[100];

   /* Limit the read to 99 characters plus the terminating NUL, and pass the
      array itself: &src has type char (*)[100], which does not match %s. */
   if (scanf("%99s", src) != 1)
   {
      fprintf(stderr, "No file name could be read\n");
      return 0;
   }
   printf("%s\n", src);
   int exists = func(src);
   printf("Does the file exist? %d\n", exists);
   return exists;
}

/*
 * Reports whether the file named by fname can be opened for reading.
 *
 * fname: path handed to fopen() unchanged; NULL is treated as "not found".
 * Returns 1 if fopen(fname, "r") succeeds (the stream is closed again
 * before returning), 0 otherwise.
 */
int func(const char *fname)
{
    if (fname == NULL)
    {
        return 0;
    }

    FILE *file = fopen(fname, "r");
    if (file == NULL)
    {
        return 0;
    }
    fclose(file);
    return 1;
}

现在它会认为通用字符名称只是实际文件名的一部分。那么如何“取消转义”输入中包含的通用字符名称？

第一次编辑： 所以我像这样编译这个例子：“gcc -std=c99 read.c”，其中“read.c”是我的源文件。我需要 -std=c99 参数，因为我使用前缀 '\u' 作为我的通用字符名称。如果我将它更改为 '\x' 它工作正常，我可以删除 -std=c99 参数。但在我的实际应用程序中，输入不会使用前缀“\x”，而是使用前缀“\u”。那么我该如何解决这个问题呢？

这段代码给出了想要的结果，但对于我的实际应用程序，我不能真正使用 '\x'：

#include <stdio.h>
#include <string.h>

int func(const char *fname);

/*
 * Checks a hard-coded path whose non-ASCII letter is spelled with a
 * hexadecimal escape, prints the outcome, and returns the value that
 * func() produced for that path.
 */
int main()
{
   const char *path = "C:/Users/Familjen-Styren/Documents/V\x00E5gformer/20140104-0002/text.txt";
   const int found = func(path);

   printf("Does the file exist? %d\n", found);
   return found;
}

/*
 * Reports whether the file named by fname can be opened for reading.
 *
 * fname: path handed to fopen() unchanged; NULL is treated as "not found".
 * Returns 1 if fopen(fname, "r") succeeds (the stream is closed again
 * before returning), 0 otherwise.
 */
int func(const char *fname)
{
    if (fname == NULL)
    {
        return 0;
    }

    FILE *file = fopen(fname, "r");
    if (file == NULL)
    {
        return 0;
    }
    fclose(file);
    return 1;
}

原文： 我找到了一些如何在其他编程语言（如javascript ）中执行此操作的示例，但我找不到任何有关如何在 C 中执行此操作的示例。这是一个产生相同错误的示例代码：

#include <stdio.h>
#include <string.h>

int func(const char *fname);

/*
 * Copies a hard-coded path containing a universal character name into a
 * local buffer, prints the copy, and reports whether func() could open the
 * path. Returns the result of func().
 */
int main()
{
   const char *src = "C:/Users/Familjen-Styren/Documents/V\u00E5gformer/20140104-0002/text.txt";
   size_t len = strlen(src); /* Reported as 68 by the asker. */
   /* strlen() does not count the terminating NUL, so the copy needs one
      byte more than len. */
   char fname[len + 1];
   snprintf(fname, sizeof fname, "%s", src);
   int exists = func(src);
   printf("%s\n", fname);
   printf("Does the file exist? %d\n", exists); /* Outputs 'Does the file exist? 0' which means it doesn't exist. */
   return exists;
}

/*
 * Reports whether the file named by fname can be opened for reading.
 *
 * fname: path handed to fopen() unchanged; NULL is treated as "not found".
 * Returns 1 if fopen(fname, "r") succeeds (the stream is closed again
 * before returning), 0 otherwise.
 */
int func(const char *fname)
{
    if (fname == NULL)
    {
        return 0;
    }

    FILE *file = fopen(fname, "r");
    if (file == NULL)
    {
        return 0;
    }
    fclose(file);
    return 1;
}

如果我改为使用没有通用字符名称的相同字符串：

#include <stdio.h>
#include <string.h>

int func(const char *fname);

/*
 * Checks a hard-coded path in which the non-ASCII letter is typed directly
 * into the string literal, prints the outcome, and returns the value that
 * func() produced for that path.
 */
int main()
{
   const char *path = "C:/Users/Familjen-Styren/Documents/Vågformer/20140104-0002/text.txt";
   const int found = func(path);

   /* The asker observed 'Does the file exist? 1' here, i.e. the file was found. */
   printf("Does the file exist? %d\n", found);
   return found;
}

/*
 * Reports whether the file named by fname can be opened for reading.
 *
 * fname: path handed to fopen() unchanged; NULL is treated as "not found".
 * Returns 1 if fopen(fname, "r") succeeds (the stream is closed again
 * before returning), 0 otherwise.
 */
int func(const char *fname)
{
    if (fname == NULL)
    {
        return 0;
    }

    FILE *file = fopen(fname, "r");
    if (file == NULL)
    {
        return 0;
    }
    fclose(file);
    return 1;
}

它会输出 'Does the file exist? 1'，这意味着文件确实存在。但问题是我需要能够处理通用字符名称。那么如何对包含通用字符名称的字符串进行“取消转义”呢？

提前致谢。

score 1 · Accepted Answer

数组大小不正确（忘记了 .txt 和结尾的 \0，而且编码后的非 ASCII 字符占用超过 1 个字节）。

// Length of the path without the ".txt" suffix, counting the non-ASCII
// letter as a single character; the ruler below counts 63 characters.
// C:/Users/Familjen-Styren/Documents/Vågformer/20140104-0002/text
// 123456789012345678901234567890123456789012345678901234567890123
//          1         2         3         4         5         6
// int len = 63;

// The complete name is longer than the ruler above, so use a size that
// leaves headroom for the suffix and the terminating NUL.
// C:/Users/Familjen-Styren/Documents/Vågformer/20140104-0002/text.txt
int len = 100;


char *src = "C:/Users/Familjen-Styren/Documents/V\u00E5gformer/20140104-0002/text.txt";
// Option 1: a buffer of len bytes. Because len is not a constant here,
// this declaration is itself a variable-length array.
char fname[len];
// Option 2 (an alternative to the declaration above, not an addition):
// size the buffer from the string, plus one byte for the terminating NUL.
char fname[strlen(src)+1];

// Copy src, including its terminating NUL, into fname.
sprintf(fname, "%s", src);

score 1 · Accepted Answer

我重新编辑了答案，希望能更清楚。首先，我假设您已经熟悉这篇文章：http://www.joelonsoftware.com/articles/Unicode.html 。处理字符编码时需要这些背景知识。

现在我从一个在我的 Linux 机器上编写的简单测试程序 test.c 开始：

#include <stdio.h>
#include <string.h>
#include <wchar.h>
#define BUF_SZ 255
/*
 * Dumps the bytes of fname in hexadecimal, then creates a file with that
 * name and writes the same bytes into it.
 *
 * fname: NUL-terminated name, passed to fopen() unchanged.
 * Progress is printed to stdout; a failure to open or write the file is
 * reported on stderr.
 */
void test_fwrite_universal(const char *fname)
{
    /* Measure the name once instead of on every loop iteration. */
    const size_t len = strlen(fname);

    printf("test_fwrite_universal on %s\n", fname);
    /* A size_t must not be printed with %d; cast it for a conversion that
       every C runtime understands. */
    printf("In memory we have %lu bytes: ", (unsigned long)len);
    for (size_t i = 0; i < len; ++i) {
        printf("%x ", (unsigned)(unsigned char)fname[i]);
    }
    printf("\n");

    FILE *file = fopen(fname, "w");
    if (file == NULL) {
        fprintf(stderr, "Could not open %s for writing\n", fname);
        return;
    }

    const size_t written = fwrite(fname, 1, len, file);
    /* fclose() flushes buffered data, so its failure also means lost data. */
    if (fclose(file) != 0 || written != len) {
        fprintf(stderr, "Could not write to %s\n", fname);
        return;
    }
    printf("Wrote to file successfully\n");
}

/*
 * Runs test_fwrite_universal on three file names in turn: one spelled with
 * the universal character name for U+00E5, one with that letter typed
 * directly, and one spelled with the universal character name for U+0436.
 */
int main()
{
    const char *const names[] = {
        "file_\u00e5.txt",
        "file_å.txt",
        "file_\u0436.txt",
    };
    const size_t count = sizeof names / sizeof names[0];

    for (size_t idx = 0; idx < count; ++idx) {
        test_fwrite_universal(names[idx]);
    }
    return 0;
}

文本文件被编码为 UTF-8。在我的 linux 机器上，我的语言环境是 en_US.UTF-8 所以我像这样编译并运行程序：

gcc -std=c99 test.c -fexec-charset=UTF-8 -o test

./test

test_fwrite_universal on file_å.txt
In memory we have 11 bytes: 66 69 6c 65 5f c3 a5 2e 74 78 74 
Wrote to file successfully
test_fwrite_universal on file_å.txt
In memory we have 11 bytes: 66 69 6c 65 5f c3 a5 2e 74 78 74 
Wrote to file successfully
test_fwrite_universal on file_ж.txt
In memory we have 11 bytes: 66 69 6c 65 5f d0 b6 2e 74 78 74 
Wrote to file successfully

源文件是 UTF-8 编码，我的语言环境是 UTF-8，char 的执行字符集也是 UTF-8。在 main 中，我用不同的字符串调用函数 test_fwrite_universal 3 次。该函数逐字节打印字符串，然后创建一个以该字符串命名的文件，并把该字符串写入这个文件。

我们可以看到 "file_\u00e5.txt" 和 "file_å.txt" 是一样的：66 69 6c 65 5f c3 a5 2e 74 78 74。果然（参见 http://www.fileformat.info/info/unicode/char/e5/index.htm ），代码点 U+00E5 的 UTF-8 表示形式是 c3 a5。在最后一个示例中，我使用了 \u0436，它是俄语字符 ж（UTF-8 编码为 d0 b6）。

现在让我们在我的 Windows 机器上尝试同样的方法。在这里，我使用 mingw 并执行相同的代码：

C:\test>gcc -std=c99 test.c -fexec-charset=UTF-8 -o test.exe

C:\test>test

test_fwrite_universal on file_├Ñ.txt
In memory we have 11 bytes: 66 69 6c 65 5f c3 a5 2e 74 78 74
Wrote to file successfully
test_fwrite_universal on file_├Ñ.txt
In memory we have 11 bytes: 66 69 6c 65 5f c3 a5 2e 74 78 74
Wrote to file successfully
test_fwrite_universal on file_╨╢.txt
In memory we have 11 bytes: 66 69 6c 65 5f d0 b6 2e 74 78 74
Wrote to file successfully

所以看起来出现了严重的错误：printf 没有正确输出这些字符，磁盘上的文件名看起来也有问题。有两件事值得注意：就字节值而言，文件名在 Linux 和 Windows 中是相同的；用 Notepad++ 之类的编辑器打开时，文件的内容也是正确的。

问题的原因在于 Windows 上的 C 标准库和语言环境。在 Linux 上，系统语言环境是 UTF-8；而在 Windows 上，我的默认语言环境是 CP-437。当我调用 printf、fopen 之类的函数时，它们假设输入是 CP-437 编码，而在 CP-437 中 c3 a5 实际上是两个字符。

在查看合适的 Windows 解决方案之前，让我们先尝试解释一下为什么 file_å.txt 与 file_\u00e5.txt 会得到不同的结果。我相信关键在于源文件的编码。如果我把 test.c 转换为 CP-437 编码：

C:\test>iconv -f UTF-8 -t cp437 test.c > test_lcl.c

C:\test>gcc -std=c99 test_lcl.c -fexec-charset=UTF-8 -o test_lcl.exe

C:\test>test_lcl

test_fwrite_universal on file_├Ñ.txt
In memory we have 11 bytes: 66 69 6c 65 5f c3 a5 2e 74 78 74
Wrote to file successfully
test_fwrite_universal on file_å.txt
In memory we have 10 bytes: 66 69 6c 65 5f 86 2e 74 78 74
Wrote to file successfully
test_fwrite_universal on file_╨╢.txt
In memory we have 11 bytes: 66 69 6c 65 5f d0 b6 2e 74 78 74
Wrote to file successfully

我现在得到了 file_å 和 file_\u00e5 之间的区别。文件中的字符å实际上编码为0x86。请注意，这次第二个字符串是 10 个字符而不是 11 个字符。如果我们查看文件并告诉 Notepad++ 使用 UTF-8，我们会看到一个有趣的结果。写入文件的实际数据也是如此。

最后，如何让这该死的东西在 Windows 上工作。不幸的是，似乎不可能将标准库与 UTF-8 编码的字符串一起使用：在 Windows 上，您不能将 C 语言环境设置为 UTF-8。请参阅问题“en_US.UTF-8 区域设置的 Windows 等效项是什么？”。

但是我们可以使用宽字符来解决这个问题：

#include <stdio.h>
#include <string.h>
#include <windows.h>
#define BUF_SZ 255
/*
 * Converts fname from UTF-8 to wide characters, creates a file with the
 * converted name through _wfopen(), and writes the original bytes of fname
 * into that file.
 *
 * fname: NUL-terminated string, interpreted as UTF-8 by the conversion.
 * Progress and failures are reported with wprintf().
 */
void test_fopen_windows(const char *fname)
{
    const size_t len = strlen(fname);
    wchar_t buf[BUF_SZ] = {0};

    /* Only len bytes are converted, so the call writes no terminator;
       limiting the output to BUF_SZ-1 elements keeps the last
       zero-initialized element of buf as the terminating NUL. */
    int sz = MultiByteToWideChar(CP_UTF8, 0, fname, (int)len, buf, BUF_SZ-1);
    if (sz == 0) {
        /* A return value of 0 signals a failed conversion. */
        wprintf(L"Could not convert the file name to wide characters\n");
        return;
    }
    wprintf(L"converted %d characters\n", sz);
    /* %ls is the wide-string conversion in standard C as well as in
       Microsoft's runtime; plain %s means wchar_t* only in the latter. */
    wprintf(L"Converting to wide characters %ls\n", buf);

    FILE *file = _wfopen(buf, L"w");
    if (file == NULL) {
        wprintf(L"Could not open file %ls for writing\n", buf);
        return;
    }

    const size_t written = fwrite(fname, 1, len, file);
    /* fclose() flushes buffered data, so its failure also means lost data. */
    if (fclose(file) != 0 || written != len) {
        wprintf(L"Could not write file %ls\n", buf);
        return;
    }
    wprintf(L"Wrote file %ls successfully\n", buf);
}


/* Runs test_fopen_windows once, on a file name that contains U+00E5
   spelled as a universal character name. */
int main()
{
    const char *name = "file_\u00e5.txt";

    test_fopen_windows(name);
    return 0;
}

编译使用：

gcc -std=gnu99 -fexec-charset=UTF-8 test_wide.c -o test_wide.exe

_wfopen 不属于 ANSI 标准，而 -std=c99 实际上会定义 __STRICT_ANSI__，因此您应该改用 -std=gnu99 才能使用该函数。

c - 将通用字符名称转义为 C 中的相应字符

2 回答 2

Related

Reference