c++ - 解析带有多行字段和转义双引号的 CSV 文件

Question

解析具有多行字段和转义引号的 CSV 文件的最佳方法是什么？

例如，这个 CSV

First field of first row,"This field is multiline

but that's OK because it's enclosed in double qoutes, and this
is an escaped "" double qoute" but this one "" is not
   "This is second field of second row, but it is not multiline
because it doesn't start 
with an immediate double quote"

在 Excel 中看起来像这样：

在此处输入图像描述

我是否只是以某种方式保留状态，有一个标志说明我现在正在阅读的字段是否以 qoute 等开头？另外，如果有双引号但不在字段开头的情况下会发生什么情况，即, "ABC"，或者,"item "" item" ""那些被认为无效的情况？此外，""如果不在带引号的字段内，引号是否会转义？Excel似乎没有。

还有其他我可能错过的极端案例吗？

score 3 · Accepted Answer

您的示例数据在 'double qoute' (sic) 之后有一个单双引号，它终止了字段的双引号开头，但您必须继续阅读直到下一个逗号或行尾。这是一个格式错误的多行字段，但您会看到 Excel 对它的作用。您可以在（优秀的）《编程实践》一书中找到对此的描述（死链接— Internet 存档副本；普林斯顿大学的实时链接），其中包括 C 中的 CSV 解析库和 C++ 中的重新实现，并讨论了这个级别的细节。RFC 4180 “Common Format and MIME Type for Comma-Separated Values”中还有一个CSV标准，你也可以学习维基百科就此主题而言。

在另一个答案中是一些示例代码，仍在测试中。它在其限制范围内非常有弹性。在这里，它被修改为 SSCCE（简短、自包含、正确的示例）。

#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>

extern const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline);

// Returns a pointer to the start of the next field,
// or zero if this is the last field in the CSV
// p is the start position of the field
// sep is the separator used, i.e. comma or semicolon
// newline says whether the field ends with a newline or with a comma
const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline)
{
    // Parse quoted sequences
    if ('"' == p[0]) {
        p++;
        while (1) {
            // Find next double-quote
            p = wcschr(p, L'"');
            // If we don't find it or it's the last symbol
            // then this is the last field
            if (!p || !p[1])
                return 0;
            // Check for "", it is an escaped double-quote
            if (p[1] != '"')
                break;
            // Skip the escaped double-quote
            p += 2;
        }
    }

    // Find next newline or comma.
    wchar_t newline_or_sep[4] = L"\n\r ";
    newline_or_sep[2] = sep;
    p = wcspbrk(p, newline_or_sep);

    // If no newline or separator, this is the last field.
    if (!p)
        return 0;

    // Check if we had newline.
    *newline = (p[0] == '\r' || p[0] == '\n');

    // Handle "\r\n", otherwise just increment
    if (p[0] == '\r' && p[1] == '\n')
        p += 2;
    else
        p++;

    return p;
}

static void dissect(const wchar_t *line)
{
    const wchar_t *start = line;
    const wchar_t *next;
    bool     eol;
    wprintf(L"Input: %d [%.*ls]\n", wcslen(line), wcslen(line)-1, line);
    while ((next = nextCsvField(start, L',', &eol)) != 0)
    {
        wprintf(L"Field: [%.*ls] (eol = %d)\n", (next - start - eol), start, eol);
        start = next;
    }
}

static const wchar_t multiline[] =
   L"First field of first row,\"This field is multiline\n"
    "\n"
    "but that's OK because it's enclosed in double quotes, and this\n"
    "is an escaped \"\" double quote\" but this one \"\" is not\n"
    "   \"This is second field of second row, but it is not multiline\n"
    "   because it doesn't start \n"
    "   with an immediate double quote\"\n"
    ;

int main(void)
{
    wchar_t line[1024];

    while (fgetws(line, sizeof(line)/sizeof(line[0]), stdin))
        dissect(line);
    dissect(multiline);

    return 0;
}

示例输出

$ cat csv.data
a,bb, c ,d""e,f
1,"2","",,"""",4
$ ./wcsv < csv.data
Input: 16 [a,bb, c ,d""e,f]
Field: [a,] (eol = 0)
Field: [bb,] (eol = 0)
Field: [ c ,] (eol = 0)
Field: [d""e,] (eol = 0)
Field: [f] (eol = 1)
Input: 17 [1,"2","",,"""",4]
Field: [1,] (eol = 0)
Field: ["2",] (eol = 0)
Field: ["",] (eol = 0)
Field: [,] (eol = 0)
Field: ["""",] (eol = 0)
Field: [4] (eol = 1)
Input: 296 [First field of first row,"This field is multiline

but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not
   "This is second field of second row, but it is not multiline
   because it doesn't start 
   with an immediate double quote"]
Field: [First field of first row,] (eol = 0)
Field: ["This field is multiline

but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not] (eol = 1)
Field: [   "This is second field of second row,] (eol = 0)
Field: [ but it is not multiline] (eol = 1)
Field: [   because it doesn't start ] (eol = 1)
Field: [   with an immediate double quote"] (eol = 1)
$

我说的是“在其范围内”；它的局限性是什么？

首先，它隔离原始字段，而不是转换后的字段。因此，必须修改它隔离的字段以产生“真实”值，去掉封闭的双引号，并将内部的双引号替换为单引号。将原始字段转换为实际值模仿了nextCsvField()函数中的大量代码。输入是字段的开头和字段的结尾（分隔符）。这是第二个带有额外函数的 SSCCE，上面显示csvFieldData()的dissect()函数已修改为调用它。解剖输出的格式略有不同，因此看起来更好：

#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>

extern const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline);

// Returns a pointer to the start of the next field,
// or zero if this is the last field in the CSV
// p is the start position of the field
// sep is the separator used, i.e. comma or semicolon
// newline says whether the field ends with a newline or with a comma
const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline)
{
    // Parse quoted sequences
    if ('"' == p[0]) {
        p++;
        while (1) {
            // Find next double-quote
            p = wcschr(p, L'"');
            // If we don't find it or it's the last symbol
            // then this is the last field
            if (!p || !p[1])
                return 0;
            // Check for "", it is an escaped double-quote
            if (p[1] != '"')
                break;
            // Skip the escaped double-quote
            p += 2;
        }
    }

    // Find next newline or comma.
    wchar_t newline_or_sep[4] = L"\n\r ";
    newline_or_sep[2] = sep;
    p = wcspbrk(p, newline_or_sep);

    // If no newline or separator, this is the last field.
    if (!p)
        return 0;

    // Check if we had newline.
    *newline = (p[0] == '\r' || p[0] == '\n');

    // Handle "\r\n", otherwise just increment
    if (p[0] == '\r' && p[1] == '\n')
        p += 2;
    else
        p++;

    return p;
}

static wchar_t *csvFieldData(const wchar_t *fld_s, const wchar_t *fld_e, wchar_t *buffer, size_t buflen)
{
    wchar_t *dst = buffer;
    wchar_t *end = buffer + buflen - 1;
    const wchar_t *src = fld_s;

    if (*src == L'"')
    {
        const wchar_t *p = src + 1;
        while (p < fld_e && dst < end)
        {
            if (p[0] == L'"' && p+1 < fld_s && p[1] == L'"')
            {
                *dst++ = p[0];
                p += 2;
            }
            else if (p[0] == L'"')
            {
                p++;
                break;
            }
            else
                *dst++ = *p++;
        }
        src = p;
    }
    while (src < fld_e && dst < end)
        *dst++ = *src++;
    if (dst >= end)
        return 0;
    *dst = L'\0';
    return(buffer);
}

static void dissect(const wchar_t *line)
{
    const wchar_t *start = line;
    const wchar_t *next;
    bool     eol;
    wprintf(L"Input %3zd: [%.*ls]\n", wcslen(line), wcslen(line)-1, line);
    while ((next = nextCsvField(start, L',', &eol)) != 0)
    {
        wchar_t buffer[1024];
        wprintf(L"Raw Field: [%.*ls] (eol = %d)\n", (next - start - eol), start, eol);
        if (csvFieldData(start, next-1, buffer, sizeof(buffer)/sizeof(buffer[0])) != 0)
            wprintf(L"Field %3zd: [%ls]\n", wcslen(buffer), buffer);
        start = next;
    }
}

static const wchar_t multiline[] =
   L"First field of first row,\"This field is multiline\n"
    "\n"
    "but that's OK because it's enclosed in double quotes, and this\n"
    "is an escaped \"\" double quote\" but this one \"\" is not\n"
    "   \"This is second field of second row, but it is not multiline\n"
    "   because it doesn't start \n"
    "   with an immediate double quote\"\n"
    ;

int main(void)
{
    wchar_t line[1024];

    while (fgetws(line, sizeof(line)/sizeof(line[0]), stdin))
        dissect(line);
    dissect(multiline);

    return 0;
}

示例输出

$ ./wcsv < csv.data
Input  16: [a,bb, c ,d""e,f]
Raw Field: [a,] (eol = 0)
Field   1: [a]
Raw Field: [bb,] (eol = 0)
Field   2: [bb]
Raw Field: [ c ,] (eol = 0)
Field   3: [ c ]
Raw Field: [d""e,] (eol = 0)
Field   4: [d""e]
Raw Field: [f] (eol = 1)
Field   1: [f]
Input  17: [1,"2","",,"""",4]
Raw Field: [1,] (eol = 0)
Field   1: [1]
Raw Field: ["2",] (eol = 0)
Field   1: [2]
Raw Field: ["",] (eol = 0)
Field   0: []
Raw Field: [,] (eol = 0)
Field   0: []
Raw Field: ["""",] (eol = 0)
Field   2: [""]
Raw Field: [4] (eol = 1)
Field   1: [4]
Input 296: [First field of first row,"This field is multiline

but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not
   "This is second field of second row, but it is not multiline
   because it doesn't start 
   with an immediate double quote"]
Raw Field: [First field of first row,] (eol = 0)
Field  24: [First field of first row]
Raw Field: ["This field is multiline

but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not] (eol = 1)
Field 140: [This field is multiline

but that's OK because it's enclosed in double quotes, and this
is an escaped " double quote" but this one "" is not]
Raw Field: [   "This is second field of second row,] (eol = 0)
Field  38: [   "This is second field of second row]
Raw Field: [ but it is not multiline] (eol = 1)
Field  24: [ but it is not multiline]
Raw Field: [   because it doesn't start ] (eol = 1)
Field  28: [   because it doesn't start ]
Raw Field: [   with an immediate double quote"] (eol = 1)
Field  34: [   with an immediate double quote"]
$

我没有测试过\r\n（或普通的\r）行尾。

score 1 · Accepted Answer

我编写了一个函数（我仍在测试），它试图尽可能地解析它。

// Returns a pointer to the start of the next field, or zero if this is the
// last field in the CSV p is the start position of the field sep is the
// separator used, i.e. comma or semicolon newline says whether the field ends
// with a newline or with a comma
static const wchar_t* nextCsvField(const wchar_t *p, wchar_t sep, bool *newline, const wchar_t **escapedEnd)
{
    *escapedEnd = 0;
    *newline = false;

    // Parse quoted sequences
    if ('"' == p[0]) {
        p++;
        while (1) {
            // Find next double-quote.
            p = wcschr(p, L'"');
            // If we didn't find it then this is the last field.
            if (!p)
                return 0;
            // Check for "", it is an escaped double-quote.
            if (p[1] != '"') {
                *escapedEnd = p;
                break;
            }
            // If it's the last symbol then this is the last field
            if (!p[1])
                return 0;
            // Skip the escaped double-quote
            p += 2;
        }
    }

    // Find next newline or comma.
    wchar_t newline_or_sep[4] = L"\n\r ";
    newline_or_sep[2] = sep;
    p = wcspbrk(p, newline_or_sep);

    // If no newline or separator, this is the last field.
    if (!p)
        return 0;

    // Check if we had newline.
    *newline = (p[0] == '\r' || p[0] == '\n');

    // Handle "\r\n", otherwise just increment
    if (p[0] == '\r' && p[1] == '\n')
        p += 2;
    else
        p++;

    return p;
}

c++ - 解析带有多行字段和转义双引号的 CSV 文件

2 回答 2

示例输出

示例输出

Related

Reference