您的示例数据在 'double qoute' (sic) 之后有一个单双引号,它终止了字段的双引号开头,但您必须继续阅读直到下一个逗号或行尾。这是一个格式错误的多行字段,但您会看到 Excel 对它的作用。您可以在(优秀的)《编程实践》一书中找到对此的描述(死链接— Internet 存档副本;普林斯顿大学的实时链接),其中包括 C 中的 CSV 解析库和 C++ 中的重新实现,并讨论了这个级别的细节。RFC 4180 “Common Format and MIME Type for Comma-Separated Values”中还有一个CSV标准,你也可以学习维基百科就此主题而言。
在另一个答案中是一些示例代码,仍在测试中。它在其限制范围内非常有弹性。在这里,它被修改为 SSCCE(简短、自包含、正确的示例)。
#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>
extern const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline);
// Returns a pointer to the start of the next field,
// or zero if this is the last field in the CSV
// p is the start position of the field
// sep is the separator used, i.e. comma or semicolon
// newline says whether the field ends with a newline or with a comma
const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline)
{
// Parse quoted sequences
if ('"' == p[0]) {
p++;
while (1) {
// Find next double-quote
p = wcschr(p, L'"');
// If we don't find it or it's the last symbol
// then this is the last field
if (!p || !p[1])
return 0;
// Check for "", it is an escaped double-quote
if (p[1] != '"')
break;
// Skip the escaped double-quote
p += 2;
}
}
// Find next newline or comma.
wchar_t newline_or_sep[4] = L"\n\r ";
newline_or_sep[2] = sep;
p = wcspbrk(p, newline_or_sep);
// If no newline or separator, this is the last field.
if (!p)
return 0;
// Check if we had newline.
*newline = (p[0] == '\r' || p[0] == '\n');
// Handle "\r\n", otherwise just increment
if (p[0] == '\r' && p[1] == '\n')
p += 2;
else
p++;
return p;
}
static void dissect(const wchar_t *line)
{
const wchar_t *start = line;
const wchar_t *next;
bool eol;
wprintf(L"Input: %d [%.*ls]\n", wcslen(line), wcslen(line)-1, line);
while ((next = nextCsvField(start, L',', &eol)) != 0)
{
wprintf(L"Field: [%.*ls] (eol = %d)\n", (next - start - eol), start, eol);
start = next;
}
}
static const wchar_t multiline[] =
L"First field of first row,\"This field is multiline\n"
"\n"
"but that's OK because it's enclosed in double quotes, and this\n"
"is an escaped \"\" double quote\" but this one \"\" is not\n"
" \"This is second field of second row, but it is not multiline\n"
" because it doesn't start \n"
" with an immediate double quote\"\n"
;
int main(void)
{
wchar_t line[1024];
while (fgetws(line, sizeof(line)/sizeof(line[0]), stdin))
dissect(line);
dissect(multiline);
return 0;
}
示例输出
$ cat csv.data
a,bb, c ,d""e,f
1,"2","",,"""",4
$ ./wcsv < csv.data
Input: 16 [a,bb, c ,d""e,f]
Field: [a,] (eol = 0)
Field: [bb,] (eol = 0)
Field: [ c ,] (eol = 0)
Field: [d""e,] (eol = 0)
Field: [f] (eol = 1)
Input: 17 [1,"2","",,"""",4]
Field: [1,] (eol = 0)
Field: ["2",] (eol = 0)
Field: ["",] (eol = 0)
Field: [,] (eol = 0)
Field: ["""",] (eol = 0)
Field: [4] (eol = 1)
Input: 296 [First field of first row,"This field is multiline
but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not
"This is second field of second row, but it is not multiline
because it doesn't start
with an immediate double quote"]
Field: [First field of first row,] (eol = 0)
Field: ["This field is multiline
but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not] (eol = 1)
Field: [ "This is second field of second row,] (eol = 0)
Field: [ but it is not multiline] (eol = 1)
Field: [ because it doesn't start ] (eol = 1)
Field: [ with an immediate double quote"] (eol = 1)
$
我说的是“在其范围内”;它的局限性是什么?
首先,它隔离原始字段,而不是转换后的字段。因此,必须修改它隔离的字段以产生“真实”值,去掉封闭的双引号,并将内部的双引号替换为单引号。将原始字段转换为实际值模仿了nextCsvField()
函数中的大量代码。输入是字段的开头和字段的结尾(分隔符)。这是第二个带有额外函数的 SSCCE,上面显示csvFieldData()
的dissect()
函数已修改为调用它。解剖输出的格式略有不同,因此看起来更好:
#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>
extern const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline);
// Returns a pointer to the start of the next field,
// or zero if this is the last field in the CSV
// p is the start position of the field
// sep is the separator used, i.e. comma or semicolon
// newline says whether the field ends with a newline or with a comma
const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline)
{
// Parse quoted sequences
if ('"' == p[0]) {
p++;
while (1) {
// Find next double-quote
p = wcschr(p, L'"');
// If we don't find it or it's the last symbol
// then this is the last field
if (!p || !p[1])
return 0;
// Check for "", it is an escaped double-quote
if (p[1] != '"')
break;
// Skip the escaped double-quote
p += 2;
}
}
// Find next newline or comma.
wchar_t newline_or_sep[4] = L"\n\r ";
newline_or_sep[2] = sep;
p = wcspbrk(p, newline_or_sep);
// If no newline or separator, this is the last field.
if (!p)
return 0;
// Check if we had newline.
*newline = (p[0] == '\r' || p[0] == '\n');
// Handle "\r\n", otherwise just increment
if (p[0] == '\r' && p[1] == '\n')
p += 2;
else
p++;
return p;
}
static wchar_t *csvFieldData(const wchar_t *fld_s, const wchar_t *fld_e, wchar_t *buffer, size_t buflen)
{
wchar_t *dst = buffer;
wchar_t *end = buffer + buflen - 1;
const wchar_t *src = fld_s;
if (*src == L'"')
{
const wchar_t *p = src + 1;
while (p < fld_e && dst < end)
{
if (p[0] == L'"' && p+1 < fld_s && p[1] == L'"')
{
*dst++ = p[0];
p += 2;
}
else if (p[0] == L'"')
{
p++;
break;
}
else
*dst++ = *p++;
}
src = p;
}
while (src < fld_e && dst < end)
*dst++ = *src++;
if (dst >= end)
return 0;
*dst = L'\0';
return(buffer);
}
static void dissect(const wchar_t *line)
{
const wchar_t *start = line;
const wchar_t *next;
bool eol;
wprintf(L"Input %3zd: [%.*ls]\n", wcslen(line), wcslen(line)-1, line);
while ((next = nextCsvField(start, L',', &eol)) != 0)
{
wchar_t buffer[1024];
wprintf(L"Raw Field: [%.*ls] (eol = %d)\n", (next - start - eol), start, eol);
if (csvFieldData(start, next-1, buffer, sizeof(buffer)/sizeof(buffer[0])) != 0)
wprintf(L"Field %3zd: [%ls]\n", wcslen(buffer), buffer);
start = next;
}
}
static const wchar_t multiline[] =
L"First field of first row,\"This field is multiline\n"
"\n"
"but that's OK because it's enclosed in double quotes, and this\n"
"is an escaped \"\" double quote\" but this one \"\" is not\n"
" \"This is second field of second row, but it is not multiline\n"
" because it doesn't start \n"
" with an immediate double quote\"\n"
;
int main(void)
{
wchar_t line[1024];
while (fgetws(line, sizeof(line)/sizeof(line[0]), stdin))
dissect(line);
dissect(multiline);
return 0;
}
示例输出
$ ./wcsv < csv.data
Input 16: [a,bb, c ,d""e,f]
Raw Field: [a,] (eol = 0)
Field 1: [a]
Raw Field: [bb,] (eol = 0)
Field 2: [bb]
Raw Field: [ c ,] (eol = 0)
Field 3: [ c ]
Raw Field: [d""e,] (eol = 0)
Field 4: [d""e]
Raw Field: [f] (eol = 1)
Field 1: [f]
Input 17: [1,"2","",,"""",4]
Raw Field: [1,] (eol = 0)
Field 1: [1]
Raw Field: ["2",] (eol = 0)
Field 1: [2]
Raw Field: ["",] (eol = 0)
Field 0: []
Raw Field: [,] (eol = 0)
Field 0: []
Raw Field: ["""",] (eol = 0)
Field 2: [""]
Raw Field: [4] (eol = 1)
Field 1: [4]
Input 296: [First field of first row,"This field is multiline
but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not
"This is second field of second row, but it is not multiline
because it doesn't start
with an immediate double quote"]
Raw Field: [First field of first row,] (eol = 0)
Field 24: [First field of first row]
Raw Field: ["This field is multiline
but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not] (eol = 1)
Field 140: [This field is multiline
but that's OK because it's enclosed in double quotes, and this
is an escaped " double quote" but this one "" is not]
Raw Field: [ "This is second field of second row,] (eol = 0)
Field 38: [ "This is second field of second row]
Raw Field: [ but it is not multiline] (eol = 1)
Field 24: [ but it is not multiline]
Raw Field: [ because it doesn't start ] (eol = 1)
Field 28: [ because it doesn't start ]
Raw Field: [ with an immediate double quote"] (eol = 1)
Field 34: [ with an immediate double quote"]
$
我没有测试过\r\n
(或普通的\r
)行尾。