这是基于 Kernighan 和 Plauger的 The Practice of Programming中的 CSV 代码的代码,该代码已经适应处理您奇怪的格式错误的 CSV 数据。(这并不难;我已经完成了主要代码的工作和打包,所以我只需要添加 CSV 输出函数并修改advquoted()
函数以处理这个问题中奇怪的格式。
csv2.h
/*
@(#)File: $RCSfile: csv2.h,v $
@(#)Version: $Revision: 2.1 $
@(#)Last changed: $Date: 2012/11/01 22:23:07 $
@(#)Purpose: Scanner for Comma Separated Variable (CSV) Data
@(#)Author: J Leffler
@(#)Origin: Kernighan & Pike, 'The Practice of Programming'
*/
/*TABSTOP=4*/
#ifndef CSV2_H
#define CSV2_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef MAIN_PROGRAM
#ifndef lint
/* Prevent over-aggressive optimizers from eliminating ID string */
const char jlss_id_csv2_h[] = "@(#)$Id: csv2.h,v 2.1 2012/11/01 22:23:07 jleffler Exp $";
#endif /* lint */
#endif /* MAIN_PROGRAM */
#include <stdio.h>
extern char *csvgetline(FILE *ifp); /* Read next input line */
extern char *csvgetfield(size_t n); /* Return field n */
extern size_t csvnfield(void); /* Return number of fields */
extern void csvreset(void); /* Release space used by CSV */
extern int csvputfield(FILE *ofp, const char *field);
extern int csvputline(FILE *ofp, char **fields, int nfields);
extern void csvseteol(const char *eol);
#ifdef __cplusplus
}
#endif
#endif /* CSV2_H */
csv2.c
/*
@(#)File: $RCSfile: csv2.c,v $
@(#)Version: $Revision: 2.1 $
@(#)Last changed: $Date: 2012/11/01 22:23:07 $
@(#)Purpose: Scanner for Comma Separated Variable (CSV) Data
@(#)Modification: Deal with specific malformed CSV
@(#)Author: J Leffler
@(#)Origin: Kernighan & Pike, 'The Practice of Programming'
*/
/*TABSTOP=4*/
#ifndef lint
/* Prevent over-aggressive optimizers from eliminating ID string */
const char jlss_id_csv2_c[] = "@(#)$Id: csv2.c,v 2.1 2012/11/01 22:23:07 jleffler Exp $";
#endif /* lint */
/*
** See RFC 4180 (http://www.ietf.org/rfc/rfc4180.txt).
**
** Specific malformed CSV - see SO 13183644 (http://stackoverflow.com/questions/13183644).
** Data contains malformed CSV fields like: OK,""this is a problem"",OK
** Two (but not three) field quotes at the start extract as "this is a problem" (with the quotes).
*/
#include "csv2.h"
#include <stdlib.h>
#include <string.h>
enum { NOMEM = -2 };
static char *line = 0; /* Input line */
static char *sline = 0; /* Split line */
static size_t maxline = 0; /* Size of line[] and sline[] */
static char **field = 0; /* Field pointers */
static size_t maxfield = 0; /* Size of field[] */
static size_t nfield = 0; /* Number of fields */
static char fieldsep[]= ","; /* Field separator characters */
static char fieldquote = '"'; /* Quote character */
static char eolstr[8] = "\n";
void csvreset(void)
{
free(line);
free(sline);
free(field);
line = 0;
sline = 0;
field = 0;
maxline = maxfield = nfield = 0;
}
static int endofline(FILE *ifp, int c)
{
int eol = (c == '\r' || c == '\n');
if (c == '\r')
{
c = getc(ifp);
if (c != '\n' && c != EOF)
ungetc(c, ifp);
}
return(eol);
}
/* Modified to deal with specific malformed CSV */
static char *advquoted(char *p)
{
size_t i;
size_t j;
if (p[0] == fieldquote && (p[1] != *fieldsep && p[1] != fieldquote))
{
/* Malformed CSV: ""some stuff"" --> "some stuff" */
/* Find "\"\"," or "\"\"\0" to mark end of field */
/* If we don't find it, drop through to 'regular' case */
char *eof = strstr(&p[2], "\"\"");
if (eof != 0 && (eof[2] == *fieldsep || eof[2] == '\0'))
{
p[eof + 1 - p] = '\0';
return(eof + 2);
}
}
for (i = j = 0; p[j] != '\0'; i++, j++)
{
if (p[j] == fieldquote && p[++j] != fieldquote)
{
size_t k = strcspn(p+j, fieldsep);
memmove(p+i, p+j, k); // 1 -> i fixing transcription error
i += k;
j += k;
break;
}
p[i] = p[j];
}
p[i] = '\0';
return(p + j);
}
static int split(void)
{
char *p;
char **newf;
char *sepp;
int sepc;
nfield = 0;
if (line[0] == '\0')
return(0);
strcpy(sline, line);
p = sline;
do
{
if (nfield >= maxfield)
{
maxfield *= 2;
newf = (char **)realloc(field, maxfield * sizeof(field[0]));
if (newf == 0)
return NOMEM;
field = newf;
}
if (*p == fieldquote)
sepp = advquoted(++p);
else
sepp = p + strcspn(p, fieldsep);
sepc = sepp[0];
sepp[0] = '\0';
field[nfield++] = p;
p = sepp + 1;
} while (sepc == ',');
return(nfield);
}
char *csvgetline(FILE *ifp)
{
size_t i;
int c;
if (line == NULL)
{
/* Allocate on first call */
maxline = maxfield = 1;
line = (char *)malloc(maxline); /*=C++=*/
sline = (char *)malloc(maxline); /*=C++-*/
field = (char **)malloc(maxfield*sizeof(field[0])); /*=C++=*/
if (line == NULL || sline == NULL || field == NULL)
{
csvreset();
return(NULL); /* out of memory */
}
}
for (i = 0; (c = getc(ifp)) != EOF && !endofline(ifp, c); i++)
{
if (i >= maxline - 1)
{
char *newl;
char *news;
maxline *= 2;
newl = (char *)realloc(line, maxline); /*=C++=*/
news = (char *)realloc(sline, maxline); /*=C++-*/
if (newl == NULL || news == NULL)
{
csvreset();
return(NULL); /* out of memory */
}
line = newl;
sline = news;
}
line[i] = c;
}
line[i] = '\0';
if (split() == NOMEM)
{
csvreset();
return(NULL);
}
return((c == EOF && i == 0) ? NULL : line);
}
char *csvgetfield(size_t n)
{
if (n >= nfield)
return(0);
return(field[n]);
}
size_t csvnfield(void)
{
return(nfield);
}
int csvputfield(FILE *ofp, const char *ofield)
{
const char escapes[] = "\",\r\n";
if (strpbrk(ofield, escapes) != 0)
{
size_t len = strlen(ofield) + 2;
const char *pos = ofield;
while ((pos = strchr(pos, '"')) != 0)
{
len++;
pos++;
}
char *space = malloc(len+1);
if (space == 0)
return EOF;
char *cpy = space;
pos = ofield;
*cpy++ = '"';
char c;
while ((c = *pos++) != '\0')
{
if (c == '"')
*cpy++ = c;
*cpy++ = c;
}
*cpy++ = '"';
*cpy = '\0';
int rc = fputs(space, ofp);
free(space);
return rc;
}
else
return fputs(ofield, ofp);
}
int csvputline(FILE *ofp, char **fields, int nfields)
{
for (int i = 0; i < nfields; i++)
{
if (i > 0)
putc(',', ofp);
if (csvputfield(ofp, fields[i]) == EOF)
return EOF;
}
return(fputs(eolstr, ofp));
}
void csvseteol(const char *eol)
{
size_t nbytes = strlen(eol);
if (nbytes >= sizeof(eolstr))
nbytes = sizeof(eolstr) - 1;
memmove(eolstr, eol, nbytes);
eolstr[nbytes] = '\0';
}
#ifdef TEST
int main(void)
{
char *in_line;
while ((in_line = csvgetline(stdin)) != 0)
{
size_t n = csvnfield();
char *fields[n]; /* C99 VLA */
printf("line = '%s'\n", in_line);
for (size_t i = 0; i < n; i++)
{
printf("field[%zu] = '%s'\n", i, csvgetfield(i));
printf("field[%zu] = [", i);
csvputfield(stdout, csvgetfield(i));
fputs("]\n", stdout);
fields[i] = csvgetfield(i);
}
printf("fields[0..%zu] = ", n-1);
csvputline(stdout, fields, n);
}
return(0);
}
#endif /* TEST */
编译代码以使用示例函数-DTEST
创建程序。main()
你需要一个 C99 编译器;中的代码main()
使用 VLA(可变长度数组)。您可以通过动态内存分配或悲观(过度杀伤)内存分配来避免这种情况(几千个指针的数组现在不会杀死大多数系统,但很少有 CSV 文件每行会有几千个字段)。
示例数据
密切基于问题中的数据。
009098,0981098094,"something","something else",""this one, well, is a problem"", "another thing"
123458,1234561007,"anything","nothing else",""this one, well, is a problem"","dohicky
503458,1234598094,"nothing","everything else","""this one, well, it isn't a problem""","abelone"
610078,1236100794,"everything","anything else","this ""isn't a problem"", he said.","Orcas Rule"
示例输出
line = '009098,0981098094,"something","something else",""this one, well, is a problem"", "another thing"'
field[0] = '009098'
field[0] = [009098]
field[1] = '0981098094'
field[1] = [0981098094]
field[2] = 'something'
field[2] = [something]
field[3] = 'something else'
field[3] = [something else]
field[4] = '"this one, well, is a problem"'
field[4] = ["""this one, well, is a problem"""]
field[5] = ' "another thing"'
field[5] = [" ""another thing"""]
fields[0..5] = 009098,0981098094,something,something else,"""this one, well, is a problem"""," ""another thing"""
line = '123458,1234561007,"anything","nothing else",""this one, well, is a problem"","dohicky'
field[0] = '123458'
field[0] = [123458]
field[1] = '1234561007'
field[1] = [1234561007]
field[2] = 'anything'
field[2] = [anything]
field[3] = 'nothing else'
field[3] = [nothing else]
field[4] = '"this one, well, is a problem"'
field[4] = ["""this one, well, is a problem"""]
field[5] = 'dohicky'
field[5] = [dohicky]
fields[0..5] = 123458,1234561007,anything,nothing else,"""this one, well, is a problem""",dohicky
line = '503458,1234598094,"nothing","everything else","""this one, well, it isn't a problem""","abelone"'
field[0] = '503458'
field[0] = [503458]
field[1] = '1234598094'
field[1] = [1234598094]
field[2] = 'nothing'
field[2] = [nothing]
field[3] = 'everything else'
field[3] = [everything else]
field[4] = '"this one, well, it isn't a problem"'
field[4] = ["""this one, well, it isn't a problem"""]
field[5] = 'abelone'
field[5] = [abelone]
fields[0..5] = 503458,1234598094,nothing,everything else,"""this one, well, it isn't a problem""",abelone
line = '610078,1236100794,"everything","anything else","this ""isn't a problem"", he said.","Orcas Rule"'
field[0] = '610078'
field[0] = [610078]
field[1] = '1236100794'
field[1] = [1236100794]
field[2] = 'everything'
field[2] = [everything]
field[3] = 'anything else'
field[3] = [anything else]
field[4] = 'this "isn't a problem", he said.'
field[4] = ["this ""isn't a problem"", he said."]
field[5] = 'Orcas Rule'
field[5] = [Orcas Rule]
fields[0..5] = 610078,1236100794,everything,anything else,"this ""isn't a problem"", he said.",Orcas Rule
字段打印两次,一次测试字段提取,一次测试字段打印。csvputline()
除了将文件从格式错误的 CSV 转换为格式正确的 CSV之外,您可以通过删除打印来简化输出。