If you can't or don't want to restructure the program to write a whole group at a time, set a larger buffer for each of the "small" files (with setbuf or setvbuf). The effect is that buffer flushes to disk exhibit more "locality": instead of flushing an amount X of data 100 times to 100 different files, you flush 10 times that amount of data to 100 different files 10 times.
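The essential pattern is just a setvbuf call right after fopen. A minimal sketch (the helper name open_buffered and the buffer size are illustrative, and error checking is left out to match the style of the test below):

#include <stdio.h>
#include <stdlib.h>

/* Illustrative helper, not part of the test programs below: open a
   file for writing and switch it to a large, fully buffered stdio
   buffer.  setvbuf must be called before any other I/O on the
   stream. */
static FILE *
open_buffered (const char *name, size_t bufsize)
{
  FILE *f = fopen (name, "w");
  if (f)
    setvbuf (f, malloc (bufsize), _IOFBF, bufsize);
  return f;
}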
Test-case programs (error checking deliberately omitted):
--- hugefile.h ---

/* One record is exactly 1024 bytes on the test machine
   (4-byte group id + 1020-byte payload, no padding). */
struct record
{
  unsigned int group;
  char data [1020];
};
--- gen-hugefile.c ---

#include <stdio.h>
#include <stdlib.h>

#include "hugefile.h"

int
main (int argc, char **argv)
{
  unsigned int i, nrecords = strtol (argv [1], 0, 10);
  FILE *f;

  f = fopen ("hugefile.db", "w");
  for (i = 0; i < nrecords; ++i)
    {
      struct record r;

      /* Assign each record to one of 100 groups; the payload is
         left uninitialized, its contents don't matter here.  */
      r.group = rand () % 100;
      fwrite (&r, sizeof r, 1, f);
    }
  fclose (f);
  return 0;
}
--- read-hugefile.c ---

#include <stdio.h>
#include <stdlib.h>

#include "hugefile.h"

FILE *in;
FILE *out [100];

int
main ()
{
  int i;
  char name [128];

  in = fopen ("hugefile.db", "r");
#ifdef BUFFER
  setvbuf (in, malloc (2*BUFFER), _IOFBF, 2*BUFFER);
#endif
  for (i = 0; i < 100; ++i)
    {
      sprintf (name, "out/file%03d.db", i);
      out [i] = fopen (name, "w");
#ifdef BUFFER
      /* Give each output stream a BUFFER-byte buffer instead of
         the small stdio default.  */
      setvbuf (out [i], malloc (BUFFER), _IOFBF, BUFFER);
#endif
    }

  /* Scatter the records into one file per group.  */
  struct record r;
  while (fread (&r, sizeof r, 1, in) == 1)
    fwrite (&r, sizeof r, 1, out [r.group]);

  fflush (0);   /* flush all open output streams */
  return 0;
}
velco@sue:~/tmp/hugefile$ ls
gen-hugefile.c hugefile.h read-hugefile.c
velco@sue:~/tmp/hugefile$ gcc -O2 gen-hugefile.c -o gen-hugefile
velco@sue:~/tmp/hugefile$ ./gen-hugefile 1000000
velco@sue:~/tmp/hugefile$ ls -lh
total 978M
-rwxrwxr-x 1 velco velco 8.5K Dec 14 13:33 gen-hugefile
-rw-rw-r-- 1 velco velco 364 Dec 14 13:31 gen-hugefile.c
-rw-rw-r-- 1 velco velco 977M Dec 14 13:34 hugefile.db
-rw-rw-r-- 1 velco velco 61 Dec 14 12:56 hugefile.h
-rw-rw-r-- 1 velco velco 603 Dec 14 13:32 read-hugefile.c
velco@sue:~/tmp/hugefile$ gcc -O2 read-hugefile.c -o read-hugefile
velco@sue:~/tmp/hugefile$ gcc -O2 -DBUFFER=1048576 read-hugefile.c -o read-hugefile-buf
velco@sue:~/tmp/hugefile$ mkdir out
velco@sue:~/tmp/hugefile$ time ./read-hugefile
real 0m34.031s
user 0m0.716s
sys 0m6.204s
velco@sue:~/tmp/hugefile$ time ./read-hugefile
real 0m25.960s
user 0m0.600s
sys 0m6.320s
velco@sue:~/tmp/hugefile$ time ./read-hugefile-buf
real 0m20.756s
user 0m1.528s
sys 0m5.420s
velco@sue:~/tmp/hugefile$ time ./read-hugefile-buf
real 0m16.450s
user 0m1.324s
sys 0m5.012s
velco@sue:~/tmp/hugefile$

With 1 MiB buffers per output file, the warm-cache run drops from ~26 s to ~16.5 s, roughly a third faster.