一些实验:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <Windows.h>
/*
 * Creates n test files named c:\t\0.txt ... c:\t\<n-1>.txt.
 *
 * Every file receives 256 lines.  A line consists of 0..1023 'X' characters
 * (length taken from rand()) followed by the two bytes CR LF.
 *
 * The directory c:\t is not created here and must already exist.  A file
 * that cannot be opened or completely written is reported on stderr and
 * skipped; generation continues with the next file.
 */
void generateFiles(int n) {
    char fileName[32];
    char fileStr[1032];   /* longest line: 1023 'X' + CR + LF */

    for (int i = 0; i < n; i++) {
        snprintf(fileName, sizeof fileName, "c:\\t\\%i.txt", i);

        /*
         * Binary mode on purpose: the line terminator is written explicitly
         * as CR LF below.  A text-mode stream on Windows would expand the LF
         * again and leave CR CR LF in the file.
         */
        FILE *f = fopen(fileName, "wb");
        if (f == NULL) {
            perror(fileName);
            continue;
        }

        for (int j = 0; j < 256; j++) {
            size_t lineLen = (size_t)(rand() % 1024);
            memset(fileStr, 'X', lineLen);
            fileStr[lineLen] = 0x0D;
            fileStr[lineLen + 1] = 0x0A;

            size_t total = lineLen + 2;
            if (fwrite(fileStr, 1, total, f) != total) {
                perror(fileName);
                break;   /* no point in writing more lines to a failing stream */
            }
        }

        /* fclose flushes buffered data, so a failure here means lost output. */
        if (fclose(f) != 0) {
            perror(fileName);
        }
    }
}
/*
 * Sequentially reads the files c:\t\0.txt ... c:\t\<n-1>.txt.
 *
 * Each file is read in one fread call into a heap buffer sized from the
 * file length; the buffer is released immediately afterwards (the contents
 * are not used, the function exists to measure read cost).
 *
 * A file that cannot be opened, measured, or read is reported on stderr and
 * skipped; the remaining files are still processed.
 */
void readFiles(int n) {
    char fileName[32];

    for (int i = 0; i < n; i++) {
        snprintf(fileName, sizeof fileName, "c:\\t\\%i.txt", i);

        /*
         * Binary mode: the length obtained via fseek/ftell is a byte count,
         * which only matches what fread delivers when no newline translation
         * takes place.
         */
        FILE *f = fopen(fileName, "rb");
        if (f == NULL) {
            perror(fileName);
            continue;
        }

        /* ftell returns long and -1L on error; keep the full type. */
        long size = -1L;
        if (fseek(f, 0L, SEEK_END) == 0) {
            size = ftell(f);
        }
        if (size < 0 || fseek(f, 0L, SEEK_SET) != 0) {
            perror(fileName);
            fclose(f);
            continue;
        }

        if (size > 0) {
            char *data = (char *)malloc((size_t)size);
            if (data == NULL) {
                perror("malloc");
            } else {
                /* Element size 1 so the return value is the byte count read. */
                if (fread(data, 1, (size_t)size, f) != (size_t)size) {
                    fprintf(stderr, "%s: short read\n", fileName);
                }
                free(data);
            }
        }

        fclose(f);
    }
}
/*
 * Thread entry point: reads the single file c:\t\<k>.txt completely into a
 * temporary heap buffer and frees it again.
 *
 * lpParam must point to an int holding k; the int must stay valid until the
 * thread has read it.
 *
 * Returns 0 on success and 1 if the parameter is NULL or the file could not
 * be opened, measured, allocated for, or read in full (details go to
 * stderr).
 */
DWORD WINAPI readInThread( LPVOID lpParam )
{
    const int *number = (const int *)lpParam;
    if (number == NULL) {
        return 1;
    }

    char fileName[32];
    snprintf(fileName, sizeof fileName, "c:\\t\\%i.txt", *number);

    /*
     * Binary mode so that the fseek/ftell length equals the number of bytes
     * fread can deliver (no newline translation).
     */
    FILE *f = fopen(fileName, "rb");
    if (f == NULL) {
        perror(fileName);
        return 1;
    }

    DWORD status = 0;

    /* ftell returns long and -1L on error; keep the full type. */
    long size = -1L;
    if (fseek(f, 0L, SEEK_END) == 0) {
        size = ftell(f);
    }

    if (size < 0 || fseek(f, 0L, SEEK_SET) != 0) {
        perror(fileName);
        status = 1;
    } else if (size > 0) {
        char *data = (char *)malloc((size_t)size);
        if (data == NULL) {
            perror("malloc");
            status = 1;
        } else {
            /* Element size 1 so the return value is the byte count read. */
            if (fread(data, 1, (size_t)size, f) != (size_t)size) {
                fprintf(stderr, "%s: short read\n", fileName);
                status = 1;
            }
            free(data);
        }
    }

    fclose(f);
    return status;
}
/*
 * Benchmark driver.  Prints three wall-clock timings (GetTickCount based):
 *   1. generating 256 test files,
 *   2. reading them back sequentially,
 *   3. reading them back with one thread per file.
 *
 * The threaded measurement covers thread creation plus waiting for every
 * thread to finish.
 */
int main(int argc, char ** argv) {
    /* enum constants give true constant array bounds (no VLA needed). */
    enum { FILE_COUNT = 256, MAX_THREADS = 256 };

    (void)argc;
    (void)argv;

    DWORD t1 = GetTickCount();
    generateFiles(FILE_COUNT);
    /* Unsigned subtraction stays correct across a tick-counter wrap. */
    printf("Write: %li ms\n", (long)(GetTickCount() - t1));

    t1 = GetTickCount();
    readFiles(FILE_COUNT);
    printf("Read: %li ms\n", (long)(GetTickCount() - t1));

    t1 = GetTickCount();

    int pDataArray[MAX_THREADS];          /* per-thread file index */
    DWORD dwThreadIdArray[MAX_THREADS];
    HANDLE hThreadArray[MAX_THREADS];     /* only successfully created threads */
    int started = 0;

    for (int i = 0; i < MAX_THREADS; i++) {
        /* The thread reads its index through this pointer, so the slot must
           outlive the thread; the array lives until main returns. */
        pDataArray[i] = i;

        HANDLE h = CreateThread(
            NULL,
            0,
            readInThread,
            &pDataArray[i],
            0,
            &dwThreadIdArray[i]);
        if (h == NULL) {
            fprintf(stderr, "CreateThread failed for file %i (error %lu)\n",
                    i, (unsigned long)GetLastError());
            continue;
        }
        hThreadArray[started++] = h;
    }

    /*
     * WaitForMultipleObjects accepts at most MAXIMUM_WAIT_OBJECTS (64)
     * handles; called with 256 it fails immediately, so the timing would
     * cover thread creation only.  Waiting on each handle in turn is
     * equivalent to a wait-all and has no such limit.
     */
    for (int i = 0; i < started; i++) {
        WaitForSingleObject(hThreadArray[i], INFINITE);
    }

    printf("Read (threaded): %li ms\n", (long)(GetTickCount() - t1));

    /* Release the handles after the measurement so cleanup is not timed. */
    for (int i = 0; i < started; i++) {
        CloseHandle(hThreadArray[i]);
    }

    return 0;
}
第一个函数只是用来生成测试数据集的,写得很难看(我知道它可以写得更好,但老实说我没有时间)。
第一个实验:顺序读取;第二个实验:并行读取。
结果:
256 个文件:
Write: 250 ms
Read: 140 ms
Read (threaded): 78 ms
1024 个文件:
Write: 1250 ms
Read: 547 ms
Read (threaded): 843 ms
我认为第二次尝试清楚地表明,从长远来看,创建“愚蠢”的线程只会让事情变得更糟。当然,它还可以通过预先分配工作线程、使用线程池等方式加以改进,但我认为对于从磁盘读取 100-200k 这样的快速操作来说,把这部分功能移到线程中并没有真正的好处。我没有时间编写更“聪明”的解决方案,但我不认为它会更快,因为您还得为互斥锁等增加系统调用……
如果走极端,您还可以考虑预分配内存池等手段。但正如在代码之前提到的那样,您发布的代码本身是有问题的:这应该是几毫秒量级的事情,而绝不是几秒钟。
800 个文件(每行 20 个字符,256 行)
Write: 250 ms
Read: 63 ms
Read (threaded): 500 ms
结论:
答案是:
您的读取代码有问题:您读取文件的速度太慢,所以把任务改为并行运行之后速度才会显著提高。而在上面的代码中,读取本身实际上比创建线程的开销还要快。