
The input is a single file of about 70GB, each line of which contains customer information. A program reads this file and creates one output file per customer. There are 8000 customers at the moment, but we have to cater for 40000. Currently the UNIX sort command is used to sort the file by customer, and the customer files are then written out in order; that way the program only ever has one file handle open for creating the files. We would like to drop the sort command because it takes about 1.5 hours, but that would mean up to 8000 file handles staying open at once, and kernel parameters might need to be modified. Is it possible to open that many files without changing kernel parameters? I tried browsing the libevent website, but I'm not sure whether that is the right solution.


2 Answers


You don't necessarily need 8000 file handles open at the same time, nor do you need to sort the data. Sorting is a waste unless you need each client's lines sorted as well.

Presumably, you can identify the client by some item on the line. Let's say (for example) it's the first eight characters of each line; then your pseudo-code looks like this:

delete all files matching "*_out.dat"
for each line in file:
    key = left (line, 8)
    open file key + "_out.dat" for append
    write line to file
    close file

That's it. Simple. Only one file open at a time and no time wasted sorting.

Now there are further improvements that could be made to that, amongst them being:

  1. Don't close the file for the previous line unless the next line has a different key. This will catch situations where you have a hundred records in a row for the same key and will hold the file open in that case.

  2. Maintain a cache of open file handles, like a most recently used list of (say) 16 different keys. Again, this will prevent closure until a file handle has to be reused, but it will also handle the situation where records for the same customers arrive in loose clusters (such as customer 1,2,3,7,1,2,3,2,2,3,7,4,...). See the sketch below.

But the basic theory remains the same: don't try to open 8000 (or 40000) files at once when you can get by with fewer.
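
For what it's worth, here is a minimal sketch of the second improvement, the most-recently-used cache of file handles. It only illustrates the idea: the CACHE_SIZE of 16, the getHandle helper and the fixed four-character key are my own choices, not anything from the original program, and because the output files are opened for append, any *_out.dat files from a previous run should be deleted first (as in the pseudo-code's first step).

#include <stdio.h>
#include <string.h>
#include <errno.h>

#define CACHE_SIZE 16               /* how many output files stay open at once */
#define FOUT_STR "1234_out.dat"

typedef struct {
    char key[5];                    /* 4-character customer key */
    FILE *fp;                       /* NULL when the slot is unused */
    long lastUsed;                  /* line number of the last write */
} CacheEntry;

static CacheEntry cache[CACHE_SIZE];

/* Return an open handle for this key, evicting the least recently
   used entry when the cache is full. */
static FILE *getHandle (const char *key, long now) {
    char outFile[sizeof (FOUT_STR)];
    int i, victim = 0;

    for (i = 0; i < CACHE_SIZE; i++) {
        if (cache[i].fp != NULL && memcmp (cache[i].key, key, 4) == 0) {
            cache[i].lastUsed = now;
            return cache[i].fp;                 /* cache hit, reuse the open handle */
        }
        if (cache[i].fp == NULL)
            victim = i;                         /* prefer a free slot */
        else if (cache[victim].fp != NULL && cache[i].lastUsed < cache[victim].lastUsed)
            victim = i;                         /* otherwise remember the oldest entry */
    }

    if (cache[victim].fp != NULL)
        fclose (cache[victim].fp);

    memcpy (outFile, FOUT_STR, sizeof (FOUT_STR));
    memcpy (outFile, key, 4);
    memcpy (cache[victim].key, key, 4);
    cache[victim].key[4] = '\0';
    cache[victim].lastUsed = now;
    return cache[victim].fp = fopen (outFile, "a");  /* append, the key may come back later */
}

int main (void) {
    FILE *fIn, *fOut;
    char buff[1000];
    long lineNo = 0;
    int i;

    if ((fIn = fopen ("data.dat", "r")) == NULL) {
        printf ("Error %d opening 'data.dat'\n", errno);
        return 1;
    }

    while (fgets (buff, sizeof (buff), fIn) != NULL) {
        if ((fOut = getHandle (&(buff[4]), ++lineNo)) == NULL) {
            printf ("Error %d opening output file\n", errno);
            break;
        }
        fputs (buff, fOut);
    }

    for (i = 0; i < CACHE_SIZE; i++)
        if (cache[i].fp != NULL)
            fclose (cache[i].fp);
    fclose (fIn);
    return 0;
}

With a cache like this, the open/close cost is only paid when a customer falls out of the sixteen most recently seen, which is exactly the clustered-input case described in point 2.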


Alternatively, just process the data, stashing it all into a database and then creating each file with a series of queries. Whether that's faster than the solution above should be tested, as indeed should every suggestion given here. Measure, don't guess!
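
If the database route appeals, here is a bare-bones sketch of the loading half, purely as an illustration: it assumes SQLite is available (link with -lsqlite3), reuses the "customer key is characters 5 to 8" convention from above, and the database, table and index names are my own inventions. Writing the files would then be one SELECT line FROM lines WHERE cust = ? per customer, appended to that customer's file.

#include <stdio.h>
#include <string.h>
#include <sqlite3.h>

int main (void) {
    sqlite3 *db;
    sqlite3_stmt *ins;
    FILE *fIn;
    char buff[1000];

    if (sqlite3_open ("customers.db", &db) != SQLITE_OK) {
        printf ("Can't open database: %s\n", sqlite3_errmsg (db));
        return 1;
    }
    sqlite3_exec (db, "CREATE TABLE IF NOT EXISTS lines (cust TEXT, line TEXT)",
                  NULL, NULL, NULL);
    sqlite3_exec (db, "BEGIN", NULL, NULL, NULL);   /* one big transaction is much faster */

    if ((fIn = fopen ("data.dat", "r")) == NULL) {
        printf ("Error opening 'data.dat'\n");
        return 1;
    }
    if (sqlite3_prepare_v2 (db, "INSERT INTO lines VALUES (?, ?)", -1, &ins, NULL) != SQLITE_OK) {
        printf ("Prepare failed: %s\n", sqlite3_errmsg (db));
        return 1;
    }

    while (fgets (buff, sizeof (buff), fIn) != NULL) {
        sqlite3_bind_text (ins, 1, &(buff[4]), 4, SQLITE_TRANSIENT);  /* customer key, chars 5-8 */
        sqlite3_bind_text (ins, 2, buff, -1, SQLITE_TRANSIENT);       /* whole line, newline and all */
        sqlite3_step (ins);
        sqlite3_reset (ins);
    }

    sqlite3_finalize (ins);
    fclose (fIn);
    sqlite3_exec (db, "COMMIT", NULL, NULL, NULL);
    /* an index on cust keeps the per-customer SELECTs from scanning the whole table */
    sqlite3_exec (db, "CREATE INDEX IF NOT EXISTS lines_cust ON lines (cust)", NULL, NULL, NULL);
    sqlite3_close (db);
    return 0;
}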


Now, since I've invoked that optimisation mantra, let's do some timings, keeping in mind that this is specific to my hardware and may be different on yours.

Start with the following script, which generates a one-million-line file where the first eight characters of each line are a random number between 10000000 and 10032767 inclusive. We'll use characters 5 through 8 inclusive to give us the customer number, ten thousand customers at roughly a hundred lines per customer:

#!/bin/bash
line='the quick brown fox jumps over the lazy dog'
for p0 in 1 2 3 4 5 6 7 8 9 0 ; do
 for p1 in 1 2 3 4 5 6 7 8 9 0 ; do
  for p2 in 1 2 3 4 5 6 7 8 9 0 ; do
   for p3 in 1 2 3 4 5 6 7 8 9 0 ; do
    for p4 in 1 2 3 4 5 6 7 8 9 0 ; do
     for p5 in 1 2 3 4 5 6 7 8 9 0 ; do
      ((x = 10000000 + $RANDOM))
      echo "$x$line"
     done
    done
   done
  done
 done
done

The file produced is about 50M. We can scale it up to 100M by simply concatenating two copies of it into another file, and this gives us about two hundred lines per customer.


Now, examine the following program:

#include <stdio.h>
#include <string.h>
#include <errno.h>

#define FOUT_STR "1234_out.dat"

int main (void) {
    FILE *fIn, *fOut;
    char outFile[sizeof (FOUT_STR)];
    char buff[1000];

    if ((fIn = fopen ("data.dat", "r")) == NULL) {
        printf ("Error %d opening 'data.dat'\n", errno);
        return 1;
    }

    memcpy (outFile, FOUT_STR, sizeof (FOUT_STR));
    if ((fOut = fopen (outFile, "w")) == NULL) {
        printf ("Error %d opening '%s'\n", errno, outFile);
        fclose(fIn);
        return 1;
    }

    while (fgets (buff, sizeof (buff), fIn) != NULL) {
        fputs (buff, fOut);
    }

    fclose (fOut);
    fclose (fIn);
    return 0;
}

This gives a baseline figure for just writing all entries to a single file, and takes under a second to run.


Now let's have one which opens a new file every two hundred lines - this is the behaviour you'd see if the file was already sorted by customer:

#include <stdio.h>
#include <string.h>
#include <errno.h>

#define FOUT_STR "1234_out.dat"

int main (void) {
    FILE *fIn, *fOut;
    char outFile[sizeof (FOUT_STR)];
    char buff[1000];
    char custNum[5];
    int i = -1;

    if ((fIn = fopen ("data.dat", "r")) == NULL) {
        printf ("Error %d opening 'data.dat'\n", errno);
        return 1;
    }

    fOut = NULL;
    while (fgets (buff, sizeof (buff), fIn) != NULL) {
        i++;
        if ((i % 200) == 0) {
            if (fOut != NULL)
                fclose (fOut);
            sprintf (custNum, "%04d", i / 200);
            memcpy (outFile, FOUT_STR, sizeof (FOUT_STR));
            memcpy (outFile, custNum, 4);
            if ((fOut = fopen (outFile, "w")) == NULL) {
                printf ("Error %d opening '%s'\n", errno, outFile);
                break;
            }
        }
        fputs (buff, fOut);
    }
    if (fOut != NULL)
        fclose (fOut);

    fclose (fIn);
    return 0;
}

This takes about 2s (0:00:02) for the 100M file, and testing it with a 200M and 400M file indicates that it scales linearly. That means that with a sorted 70G file, you're looking at about 1400s or 0:23:20. Note that this would be on top of your sort cost which you have as 1.5h (1:30:00), giving you a total cost of 1:53:20.


Now let's implement our simplistic program which simply opens each file for append for every line:

#include <stdio.h>
#include <string.h>
#include <errno.h>

#define FOUT_STR "1234_out.dat"

int main (void) {
    FILE *fIn, *fOut;
    char outFile[sizeof (FOUT_STR)];
    char buff[1000];

    if ((fIn = fopen ("data.dat", "r")) == NULL) {
        printf ("Error %d opening 'data.dat'\n", errno);
        return 1;
    }

    while (fgets (buff, sizeof (buff), fIn) != NULL) {
        memcpy (outFile, FOUT_STR, sizeof (FOUT_STR));
        memcpy (outFile, &(buff[4]), 4);
        if ((fOut = fopen (outFile, "a")) == NULL) {
            printf ("Error %d opening '%s'\n", errno, outFile);
            break;
        }
        fputs (buff, fOut);
        fclose (fOut);
    }

    fclose (fIn);
    return 0;
}

When we run this with the 100M file, it takes 244s (0:04:04). And again, testing with a 200M and 400M file indicates linear scaling. So, for the 70G file, that's going to be 47:26:40, not really what you would call an improvement over your sub-two-hour sort-and-process option.


However, let's try a different tack with the following program, which keeps a hundred file handles open on each pass through the input file (and makes a hundred passes):

#include <stdio.h>
#include <string.h>
#include <errno.h>

#define FOUT_STR "1234_out.dat"

int main (void) {
    FILE *fIn, *fOut[100];
    char outFile[sizeof (FOUT_STR)];
    char buff[1000];
    int seg, cust;
    char segNum[3], custNum[3];

    for (seg = 0; seg < 100; seg++) {
        sprintf (segNum, "%02d", seg);

        if ((fIn = fopen ("data.dat", "r")) == NULL) {
            printf ("Error %d opening 'data.dat'\n", errno);
            return 1;
        }

        for (cust = 0; cust < 100; cust++) {
            sprintf (custNum, "%02d", cust);

            memcpy (outFile, FOUT_STR, sizeof (FOUT_STR));
            memcpy (outFile+0, segNum, 2);
            memcpy (outFile+2, custNum, 2);
            if ((fOut[cust] = fopen (outFile, "w")) == NULL) {
                printf ("Error %d opening '%s'\n", errno, outFile);
                return 1;
            }
        }

        while (fgets (buff, sizeof (buff), fIn) != NULL) {
            if (memcmp (&(buff[4]), segNum, 2) == 0) {
                cust = (buff[6] - '0') * 10 + buff[7] - '0';
                fputs (buff, fOut[cust]);
            }
        }

        for (cust = 0; cust < 100; cust++) {
            fclose (fOut[cust]);
        }

        fclose (fIn);
    }

    return 0;
}

This is a slight variation which actually processes the input file a hundred times, each time only processing the lines meant for a hundred individual output files.

When this is run on the 100M file, it takes about 28s (0:00:28). Once again, that seems to scale linearly for a 200M and 400M file, so the 70G file should take 5:26:40.

Still not even close to the sub-two-hour figure.


So what happens when we open a thousand output files at a time:

#include <stdio.h>
#include <string.h>
#include <errno.h>

#define FOUT_STR "1234_out.dat"

int main (void) {
    FILE *fIn, *fOut[1000];
    char outFile[sizeof (FOUT_STR)];
    char buff[1000];
    int seg, cust;
    char segNum[2], custNum[4];

    for (seg = 0; seg < 10; seg++) {
        sprintf (segNum, "%01d", seg);

        if ((fIn = fopen ("data.dat", "r")) == NULL) {
            printf ("Error %d opening 'data.dat'\n", errno);
            return 1;
        }

        for (cust = 0; cust < 1000; cust++) {
            sprintf (custNum, "%03d", cust);

            memcpy (outFile, FOUT_STR, sizeof (FOUT_STR));
            memcpy (outFile+0, segNum, 1);
            memcpy (outFile+1, custNum, 3);
            if ((fOut[cust] = fopen (outFile, "w")) == NULL) {
                printf ("Error %d opening '%s'\n", errno, outFile);
                return 1;
            }
        }

        while (fgets (buff, sizeof (buff), fIn) != NULL) {
            if (memcmp (&(buff[4]), segNum, 1) == 0) {
                cust = (buff[5] - '0') * 100 + (buff[6] - '0') * 10 + buff[7] - '0';
                fputs (buff, fOut[cust]);
            }
        }

        for (cust = 0; cust < 1000; cust++) {
            fclose (fOut[cust]);
        }

        fclose (fIn);
    }

    return 0;
}

That takes about 12s for the 100M file, and would give us 2:20:00, coming close to the sort but not quite there.


Unfortunately, when we take the next logical step and try to open all 10000 files in one hit, we see:

Error 24 opening '1020_out.dat'

meaning that we've finally hit the limit (error 24 is EMFILE, too many open files). With standard input, standard output, standard error, the input data file and the 1020 output files that did open, that indicates 1024 handles is about all we're allowed.

So maybe the sort-and-process method is the best way to go.
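
One hedged aside, since the question asked about kernel parameters: on most Unix systems that 1024 figure is the per-process soft limit on file descriptors (RLIMIT_NOFILE), not a kernel parameter, and a process can query it and usually raise it as far as the hard limit. A minimal sketch, which is my own addition rather than part of the timings above:

#include <stdio.h>
#include <sys/resource.h>

int main (void) {
    struct rlimit rl;

    /* RLIMIT_NOFILE is the per-process cap on open file descriptors. */
    if (getrlimit (RLIMIT_NOFILE, &rl) != 0) {
        perror ("getrlimit");
        return 1;
    }
    printf ("soft limit: %llu, hard limit: %llu\n",
            (unsigned long long) rl.rlim_cur, (unsigned long long) rl.rlim_max);

    /* The soft limit can usually be raised as far as the hard limit
       without touching any kernel parameters. */
    rl.rlim_cur = rl.rlim_max;
    if (setrlimit (RLIMIT_NOFILE, &rl) != 0)
        perror ("setrlimit");

    return 0;
}

The shell equivalent is ulimit -n. Whether raising the limit and opening all 10000 files at once actually beats the sort-and-process figure would still need to be measured, per the mantra above.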

Answered 2012-04-12T10:41:37.877

I don't know about the limits on Unix platforms, but on Windows you can open any number of files via the WinAPI, or use _setmaxstdio to raise the maximum number of open file handles, which defaults to 512 when using fopen.

Here is a similar solution that might help you!
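
For what it's worth, a minimal sketch of the _setmaxstdio route mentioned above, assuming the Microsoft CRT; the value 2048 is just an example within the CRT's allowed range:

#include <stdio.h>

int main (void) {
    /* _setmaxstdio raises the CRT's cap on simultaneously open FILE* streams
       (the stdio default is 512). It returns the new maximum, or -1 on failure. */
    int newMax = _setmaxstdio (2048);

    if (newMax == -1)
        printf ("could not raise the stdio limit\n");
    else
        printf ("stdio limit is now %d\n", newMax);

    return 0;
}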

Answered 2012-04-12T10:40:02.623