0

我需要将拉丁语 ISO-8859-1 字符编码为 UTF-8(以及反向操作)。

我第一次使用这个答案有没有办法从 UTF8 转换为 iso-8859-1?执行操作并且它工作;

现在我想使用提供所有转换机制的 libiconv,它应该可以帮助我保持代码更简单。

我遵循了此处提供的示例: https ://www.lemoda.net/c/iconv-example/iconv-example.html

我写了一个看起来像这样的方法:

char *iconvISO2UTF8(char *iso) {
    iconv_t iconvDesc = iconv_open ("ISO−8859-1", "UTF-8//TRANSLIT//IGNORE");

    if (iconvDesc == (iconv_t) - 1) {
        /* Something went wrong.  */
        if (errno == EINVAL)
            fprintf(stderr, "conversion from '%s' to '%s' not available", "ISO−8859−1", "UTF-8");           
        else
            fprintf(stderr, "LibIcon initialization failure");          

        return NULL;
    }

    size_t iconv_value;
    char * utf8;
    size_t len;
    size_t utf8len; 
    char * utf8start;

    int len_start;


    len = strlen (iso);
    if (! len) {        
        fprintf(stderr, "iconvISO2UTF8: input String is empty.");           
        return NULL;
    }

    /* Assign enough space to put the UTF-8. */
    utf8len = 2 * len;
    utf8 = calloc (utf8len, sizeof (char));
    if (! utf8) {
        fprintf(stderr, "iconvISO2UTF8: Calloc failed.");           
        return NULL;
    }
    /* Keep track of the variables. */
    utf8start = utf8;
    len_start = len;

    iconv_value = iconv (iconvDesc, & iso, & len, & utf8, & utf8len);
    /* Handle failures. */
    if (iconv_value == (size_t) - 1) {      
        switch (errno) {
                /* See "man 3 iconv" for an explanation. */
            case EILSEQ:
                fprintf(stderr, "iconv failed: Invalid multibyte sequence, in string '%s', length %d, out string '%s', length %d\n", iso, (int) len, utf8start, (int) utf8len);             
                break;
            case EINVAL:
                fprintf(stderr, "iconv failed: Incomplete multibyte sequence, in string '%s', length %d, out string '%s', length %d\n", iso, (int) len, utf8start, (int) utf8len);              
                break;
            case E2BIG:
                fprintf(stderr, "iconv failed: No more room, in string '%s', length %d, out string '%s', length %d\n", iso, (int)  len, utf8start, (int) utf8len);                              
                break;
            default:
                fprintf(stderr, "iconv failed, in string '%s', length %d, out string '%s', length %d\n", iso, (int) len, utf8start, (int) utf8len);                             
        }
        return NULL;
    }


    if(iconv_close (iconvDesc) != 0) {
        fprintf(stderr, "libicon close failed: %s", strerror (errno));          
    }

    return utf8start;

}

当我用普通的类似 ascii 的字符(如“abracadabra”)调用此函数时,iconv 有效。但是,一旦我向它发送强调字符,例如“éàèüöä”,那么 iconv() 调用就会失败,并显示 EILSEQ 代码:

iconv 失败:无效的多字节序列,在字符串 'éàèüöä',长度 6,输出字符串 '',长度 12

这是一个示例主程序,当存储在使用 ISO-8859-1 编码的源文件中并在使用 ISO-8859-1 作为默认字符集的 linux 系统上编译时会崩溃:

int main(int argc, char **argv) {
    char *iso1 = "abracadabra";
    char *utf = iconvISO2UTF8(iso1);
    puts(utf);
    free(utf);

    char *iso2 = "éàèüöä";
    utf = iconvISO2UTF8(iso2);
    puts(utf);
    free(utf);
}

是否可以使用 iconv 运行这种转换?如果是的话,这段代码有什么问题?

4

1 回答 1

5

请仔细阅读iconv_open(3)手册页:

iconv_t iconv_open(const char *tocode, const char *fromcode);

如果您要从 ISO 8859-1 转换为 UTF-8,那么这很矛盾:

iconv_t iconvDesc = iconv_open ("ISO−8859-1", "UTF-8//TRANSLIT//IGNORE");

应该说

iconv_t iconvDesc = iconv_open ("UTF-8//TRANSLIT//IGNORE", "ISO−8859-1");
于 2018-09-17T09:35:06.080 回答