我写了一个简单的程序,可以打开一个 csv 文件,读取它,创建一个新的 csv 文件,然后只写一些列(我不想要所有的列,希望删除一些会使文件更易于管理) . 该文件为1.15GB,但fopen()
没有问题。分段错误发生在第一个进度后不久的我的 while 循环中printf()
。
我只测试了 csv 的前几行,下面的逻辑就是我想要的。奇怪的部分何时index == 0
是由于最后一列的形式(xxx, yyy)\n
(,
逗号分隔值文件中的只是荒谬)。
这是代码,while循环是问题所在:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(int argc, char** argv) {
long size;
FILE* inF = fopen("allCrimes.csv", "rb");
if (!inF) {
puts("fopen() error");
return 0;
}
fseek(inF, 0, SEEK_END);
size = ftell(inF);
rewind(inF);
printf("In file size = %ld bytes.\n", size);
char* buf = malloc((size+1)*sizeof(char));
if (fread(buf, 1, size, inF) != size) {
puts("fread() error");
return 0;
}
fclose(inF);
buf[size] = '\0';
FILE *outF = fopen("lessColumns.csv", "w");
if (!outF) {
puts("fopen() error");
return 0;
}
int index = 0;
char* currComma = strchr(buf, ',');
fwrite(buf, 1, (int)(currComma-buf), outF);
int progress = 0;
while (currComma != NULL) {
index++;
index = (index%14 == 0) ? 0 : index;
progress++;
if (progress%1000 == 0) printf("%d\n", progress/1000);
int start = (int)(currComma-buf);
currComma = strchr(currComma+1, ',');
if (!currComma) break;
if ((index >= 3 && index <= 10) || index == 13) continue;
int end = (int)(currComma-buf);
int endMinusStart = end-start;
char* newEntry = malloc((endMinusStart+1)*sizeof(char));
strncpy(newEntry, buf+start, endMinusStart);
newEntry[end+1] = '\0';
if (index == 0) {
char* findNewLine = strchr(newEntry, '\n');
int newLinePos = (int)(findNewLine-newEntry);
char* modifiedNewEntry = malloc((strlen(newEntry)-newLinePos+1)*sizeof(char));
strcpy(modifiedNewEntry, newEntry+newLinePos);
fwrite(modifiedNewEntry, 1, strlen(modifiedNewEntry), outF);
}
else fwrite(newEntry, 1, end-start, outF);
}
fclose(outF);
return 0;
}
编辑:原来的问题是 csv 文件,
在我没有预料到的地方有导致逻辑失败的地方。我最终编写了一个新的解析器,它删除了逗号数量不正确的行。它删除了 243,875 行(约占文件的 4%)。我将发布该代码,因为它至少反映了一些关于free()
:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(int argc, char** argv) {
long size;
FILE* inF = fopen("allCrimes.csv", "rb");
if (!inF) {
puts("fopen() error");
return 0;
}
fseek(inF, 0, SEEK_END);
size = ftell(inF);
rewind(inF);
printf("In file size = %ld bytes.\n", size);
char* buf = malloc((size+1)*sizeof(char));
if (fread(buf, 1, size, inF) != size) {
puts("fread() error");
return 0;
}
fclose(inF);
buf[size] = '\0';
FILE *outF = fopen("uniformCommaCount.csv", "w");
if (!outF) {
puts("fopen() error");
return 0;
}
int numOmitted = 0;
int start = 0;
while (1) {
char* currNewLine = strchr(buf+start, '\n');
if (!currNewLine) {
puts("Done");
break;
}
int end = (int)(currNewLine-buf);
char* entry = malloc((end-start+2)*sizeof(char));
strncpy(entry, buf+start, end-start+1);
entry[end-start+1] = '\0';
int commaCount = 0;
char* commaPointer = entry;
for (; *commaPointer; commaPointer++) if (*commaPointer == ',') commaCount++;
if (commaCount == 14) fwrite(entry, 1, end-start+1, outF);
else numOmitted++;
free(entry);
start = end+1;
}
fclose(outF);
printf("Omitted %d lines\n", numOmitted);
return 0;
}