c++ - C++ 读取大数据，解析，然后写入数据

Question

我正在尝试读取一个大型数据集，按照我需要的方式对其进行格式化，然后将其写入另一个文件。我正在尝试使用 C++ 而不是 SAS 或 STATA 以获得速度优势。数据文件通常约为 10 GB。我当前的代码运行了一个多小时仍未结束（然后我终止了它，因为我确信我的代码效率很低）。

有没有更有效的方法来做到这一点？也许将文件读入内存，然后使用 switch 语句对其进行分析？（我有 32gb ram linux 64bit）。是否有可能读取，然后在循环中写入会减慢它，因为它一直在读取，然后写入？我试图从一个驱动器读取它，然后写入另一个驱动器以加快速度。

switch/case 语句是否会减慢速度？

我目前的处理过程是：使用 getline 读取数据，使用 switch 语句正确解析它，然后将其写入我的输出文件，并对 3 亿行重复这一过程。switch 语句中还有大约 10 个 case 分支，但为简洁起见，我没有贴出来。

代码全部写在 main 函数中，可能非常难看，但我想先让它能正常工作，然后再考虑代码的美观。

我试过使用 read() 但没有任何成功。如果我需要澄清任何事情，请告诉我。

感谢您的帮助！

 #include <iostream>
 #include <fstream>
 #include <string>
 #include <sstream>
 #include <stdio.h>
 //#include <cstring>
 //#include <boost/algorithm/string.hpp>

 #include <vector>

  using namespace std;
 //using namespace boost;


 struct dataline
{
char type[0];
double second;
short mill;
char event[1];
char ticker[6];
char marketCategory[1];
char financialStatus[1];
int roundLotSize;
short roundLotOnly;
char tradingState[1];
char reserved[1];
char reason[4];
char mpid[4];
char primaryMarketMaker[1];
char primaryMarketMode[1];
char marketParticipantState[1];
unsigned long orderNumber;
char buySell[0];
double shares;
float price;
int executedShares;
double matchNumber;
char printable[1];
double executionPrice;
int canceledShares;
double sharesBig;
double crossPrice;
char crossType[0];
double pairedShares;
double imbalanceShares;
char imbalanceDirection[1];
double fairPrice;
double nearPrice;
double currentReferencePrice;
char priceVariationIndicator[1];
};

  int main () 
{
string a; 
string b;
string c;
string d;
string e;
string f;
string g;
string h;
string k;
string l;
string times;
string smalltimes;

short time;     //counter to keep second filled
short smalltime;    //counter to keep millisecond filled
double N;
double NN;
double NNN;
int length;
char M; 
//vector<> fout;
string line;

ofstream fout ("/media/3tb/test.txt");
ifstream myfile;
myfile.open("S050508-v3.txt");

dataline oneline;

if (myfile.is_open())
    {
    while ( myfile.good() )
        {
        getline (myfile,line);
//      cout << line<<endl;;

        a=line.substr(0,1);
        stringstream ss(a);
        char type;
        ss>>type;


        switch (type)
            { 
            case 'T':
                {
                if (type == 'T')
                    {
                    times=line.substr(1,5);
                    stringstream s(times);
                    s>>time;
                    //oneline.second=time;
                    //oneline.second;
                    //cout<<time<<endl;
                    }
                else
                    {
                    time=time;
                    }
                break;
                }
            case 'M':
                {
                if (type == 'M')
                    {
                    smalltimes=line.substr(1,3);
                    stringstream ss(smalltimes);
                    ss>>smalltime;      //oneline.mill;
                //  cout<<smalltime<<endl;                            //smalltime=oneline.mill;
                    }
                else
                    {
                    smalltime=smalltime;
                    }
                break;
                }


            case 'R':
                {
                oneline.second=time;
                oneline.mill=smalltime;

                a=line.substr(0,1);
                stringstream ss(a);
                ss>>oneline.type;

                b=line.substr(1,6);
                stringstream sss(b);
                sss>>oneline.ticker;

                c=line.substr(7,1);
                stringstream ssss(c);
                ssss>>oneline.marketCategory;

                d=line.substr(8,1);
                stringstream sssss(d);
                sssss>>oneline.financialStatus;

                e=line.substr(9,6);
                stringstream ssssss(e);
                ssssss>>oneline.roundLotSize;

                f=line.substr(15,1);
                stringstream sssssss(f);
                sssssss>>oneline.roundLotOnly;

                *oneline.tradingState=0;
                *oneline.reserved=0;
                *oneline.reason=0;
                *oneline.mpid=0;
                *oneline.primaryMarketMaker=0;
                *oneline.primaryMarketMode=0;
                *oneline.marketParticipantState=0;
                oneline.orderNumber=0;
                *oneline.buySell=0;
                oneline.shares=0;
                oneline.price=0;
                oneline.executedShares=0;
                oneline.matchNumber=0;
                *oneline.printable=0;
                oneline.executionPrice=0;
                oneline.canceledShares=0;
                oneline.sharesBig=0;
                oneline.crossPrice=0;
                *oneline.crossType=0;
                oneline.pairedShares=0;
                oneline.imbalanceShares=0;
                *oneline.imbalanceDirection=0;
                oneline.fairPrice=0;
                oneline.nearPrice=0;
                oneline.currentReferencePrice=0;
                *oneline.priceVariationIndicator=0;

                break;
                }//End Case 
            }//End Switch
            }//end While
    myfile.close();

     }//End If
else cout << "Unable to open file"; 
cout<<"Junk"<<endl;

return 0;
}

更新：我一直在尝试使用内存映射，但现在遇到了分段错误。我参照了几个不同的示例，试图拼凑出适合我的代码。为什么我会遇到分段错误？我取了代码的第一部分，如下所示：

int main (int argc, char** path) 
 {
 long i;
 int fd;
 char *map;
 char *FILEPATH = path;
 unsigned long FILESIZE;
 FILE* fp = fopen(FILEPATH, "/home/brian/Desktop/S050508-v3.txt");
 fseek(fp, 0, SEEK_END);
 FILESIZE = ftell(fp);
 fseek(fp, 0, SEEK_SET);
 fclose(fp);
 fd = open(FILEPATH, O_RDONLY);

 map = (char *) mmap(0, FILESIZE, PROT_READ, MAP_SHARED, fd, 0);

 char z;
 stringstream ss;

 for (long i = 0; i <= FILESIZE; ++i) 
    {
    z = map[i];
    if (z != '\n') 
        {
        ss << z;
            }
    else 
        {
            // c style tokenizing
            ss.str("");
            }
        }
 if (munmap(map, FILESIZE) == -1) perror("Error un-mmapping the file");
 close(fd);

score 1 · Accepted Answer

数据文件通常约为 10 GB。...switch/case 语句是否会减慢速度？

几乎可以肯定不会，看起来您的程序受限于 I/O。但您应该考虑实际测量一下。现代 CPU 具有性能计数器，使用合适的工具很容易利用这些计数器。不过，让我们先把问题划分为几个主要方面：设备的 I/O、内存的加载/存储、CPU。您可以在代码中加入一些读取时钟的标记点，以了解每个操作需要多长时间。在 Linux 上，您可以使用 clock_gettime() 或 rdtsc 指令来访问比操作系统时钟滴答更精确的时钟。

考虑mmap/ CreateFileMapping，其中任何一个都可能为您正在访问的页面提供更好的效率/吞吐量。

如果要流式处理大量已经换入内存的数据，请考虑使用大页（large/huge pages）。

从手册中mmap()：

描述

mmap() 在调用进程的虚拟地址空间中创建一个新映射。新映射的起始地址在 addr 中指定。length 参数指定映射的长度。

这是一个mmap()例子：

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>

/* Path of the file that main() opens and maps. */
#define FILEPATH "/tmp/mmapped.bin"
/* Number of int elements read from the mapping. */
#define NUMINTS  (1000)
/* Length of the mapping in bytes: NUMINTS ints. */
#define FILESIZE (NUMINTS * sizeof(int))

int main(int argc, char *argv[])
{
    int i;
    int fd;
    int *map;  /* mmapped array of int's */

    fd = open(FILEPATH, O_RDONLY);
    if (fd == -1) {
    perror("Error opening file for reading");
    exit(EXIT_FAILURE);
    }

    map = mmap(0, FILESIZE, PROT_READ, MAP_SHARED, fd, 0);
    if (map == MAP_FAILED) {
    close(fd);
    perror("Error mmapping the file");
    exit(EXIT_FAILURE);
    }

    /* Read the file int-by-int from the mmap
     */
    for (i = 1; i <=NUMINTS; ++i) {
    printf("%d: %d\n", i, map[i]);
    }

    if (munmap(map, FILESIZE) == -1) {
    perror("Error un-mmapping the file");
    }
    close(fd);
    return 0;
}

c++ - C++ 读取大数据，解析，然后写入数据

1 回答 1

Related

Reference