我正在尝试读取一个大型数据集,按照我需要的方式对其进行格式化,然后将其写入另一个文件。我正在尝试使用 C++ 而不是 SAS 或 STATA 以获得速度优势。数据文件通常约为 10 GB。我当前的代码需要一个多小时才能运行(然后我杀死它,因为我确信我的代码效率很低。
有没有更有效的方法来做到这一点?也许将文件读入内存,然后使用 switch 语句对其进行分析?(我有 32gb ram linux 64bit)。是否有可能读取,然后在循环中写入会减慢它,因为它一直在读取,然后写入?我试图从一个驱动器读取它,然后写入另一个驱动器以加快速度。
开关盒是否会减慢速度?
我现在使用 getline 读取数据的过程,使用 switch 语句正确解析它,然后将其写入我的输出文件。并重复 3 亿行。switch 语句中还有大约 10 个案例,但为简洁起见,我没有复制。
代码在 main 函数中可能非常难看,但我想在我研究吸引力之前让它工作。
我试过使用 read() 但没有任何成功。如果我需要澄清任何事情,请告诉我。
感谢您的帮助!
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <stdio.h>
//#include <cstring>
//#include <boost/algorithm/string.hpp>
#include <vector>
using namespace std;
//using namespace boost;
struct dataline
{
char type[0];
double second;
short mill;
char event[1];
char ticker[6];
char marketCategory[1];
char financialStatus[1];
int roundLotSize;
short roundLotOnly;
char tradingState[1];
char reserved[1];
char reason[4];
char mpid[4];
char primaryMarketMaker[1];
char primaryMarketMode[1];
char marketParticipantState[1];
unsigned long orderNumber;
char buySell[0];
double shares;
float price;
int executedShares;
double matchNumber;
char printable[1];
double executionPrice;
int canceledShares;
double sharesBig;
double crossPrice;
char crossType[0];
double pairedShares;
double imbalanceShares;
char imbalanceDirection[1];
double fairPrice;
double nearPrice;
double currentReferencePrice;
char priceVariationIndicator[1];
};
int main ()
{
string a;
string b;
string c;
string d;
string e;
string f;
string g;
string h;
string k;
string l;
string times;
string smalltimes;
short time; //counter to keep second filled
short smalltime; //counter to keep millisecond filled
double N;
double NN;
double NNN;
int length;
char M;
//vector<> fout;
string line;
ofstream fout ("/media/3tb/test.txt");
ifstream myfile;
myfile.open("S050508-v3.txt");
dataline oneline;
if (myfile.is_open())
{
while ( myfile.good() )
{
getline (myfile,line);
// cout << line<<endl;;
a=line.substr(0,1);
stringstream ss(a);
char type;
ss>>type;
switch (type)
{
case 'T':
{
if (type == 'T')
{
times=line.substr(1,5);
stringstream s(times);
s>>time;
//oneline.second=time;
//oneline.second;
//cout<<time<<endl;
}
else
{
time=time;
}
break;
}
case 'M':
{
if (type == 'M')
{
smalltimes=line.substr(1,3);
stringstream ss(smalltimes);
ss>>smalltime; //oneline.mill;
// cout<<smalltime<<endl; //smalltime=oneline.mill;
}
else
{
smalltime=smalltime;
}
break;
}
case 'R':
{
oneline.second=time;
oneline.mill=smalltime;
a=line.substr(0,1);
stringstream ss(a);
ss>>oneline.type;
b=line.substr(1,6);
stringstream sss(b);
sss>>oneline.ticker;
c=line.substr(7,1);
stringstream ssss(c);
ssss>>oneline.marketCategory;
d=line.substr(8,1);
stringstream sssss(d);
sssss>>oneline.financialStatus;
e=line.substr(9,6);
stringstream ssssss(e);
ssssss>>oneline.roundLotSize;
f=line.substr(15,1);
stringstream sssssss(f);
sssssss>>oneline.roundLotOnly;
*oneline.tradingState=0;
*oneline.reserved=0;
*oneline.reason=0;
*oneline.mpid=0;
*oneline.primaryMarketMaker=0;
*oneline.primaryMarketMode=0;
*oneline.marketParticipantState=0;
oneline.orderNumber=0;
*oneline.buySell=0;
oneline.shares=0;
oneline.price=0;
oneline.executedShares=0;
oneline.matchNumber=0;
*oneline.printable=0;
oneline.executionPrice=0;
oneline.canceledShares=0;
oneline.sharesBig=0;
oneline.crossPrice=0;
*oneline.crossType=0;
oneline.pairedShares=0;
oneline.imbalanceShares=0;
*oneline.imbalanceDirection=0;
oneline.fairPrice=0;
oneline.nearPrice=0;
oneline.currentReferencePrice=0;
*oneline.priceVariationIndicator=0;
break;
}//End Case
}//End Switch
}//end While
myfile.close();
}//End If
else cout << "Unable to open file";
cout<<"Junk"<<endl;
return 0;
}
更新所以我一直在尝试使用内存映射,但现在我遇到了分段错误。我一直在尝试遵循不同的示例来拼凑一些对我有用的东西。为什么我会遇到分段错误?我采用了我的代码的第一部分,如下所示:
int main (int argc, char** path)
{
long i;
int fd;
char *map;
char *FILEPATH = path;
unsigned long FILESIZE;
FILE* fp = fopen(FILEPATH, "/home/brian/Desktop/S050508-v3.txt");
fseek(fp, 0, SEEK_END);
FILESIZE = ftell(fp);
fseek(fp, 0, SEEK_SET);
fclose(fp);
fd = open(FILEPATH, O_RDONLY);
map = (char *) mmap(0, FILESIZE, PROT_READ, MAP_SHARED, fd, 0);
char z;
stringstream ss;
for (long i = 0; i <= FILESIZE; ++i)
{
z = map[i];
if (z != '\n')
{
ss << z;
}
else
{
// c style tokenizing
ss.str("");
}
}
if (munmap(map, FILESIZE) == -1) perror("Error un-mmapping the file");
close(fd);