我有一个包含大量数据的文件,需要读取它并做一些概率方面的处理,因此需要统计整个文件中每个单词的出现次数,并在此基础上做进一步的计算。该文件包含约 150 万条记录,每条记录大约有 6 个字符串。我使用 vector 来保存这些数据,但程序在保存大约 8000 条记录后就崩溃了。有没有办法把这个 vector 保存到磁盘上,而不是放在程序的内存中?另外,我在搜索时看到一种叫做“符号表”(symbol table)的东西,但我不明白它是什么意思,也不知道该如何使用。
这个问题有什么解决办法吗?
这是主文件
#include <fstream>
#include <iostream>
#include <istream>
#include <string>
#include <unordered_map>
#include <vector>

#include "Tuple.h"
#include "VerbPair.h"
using namespace std;
// Path of the input file; main() passes it to train().
string filename = "verb-argument-tuples.txt";
// Tuples read from the input file; assigned by train().
vector<Tuple> mytuples;
// Per-verb frequency totals derived from mytuples; assigned by train().
vector<VerbPair> verbpairs;
vector<Tuple> readTupleFile(string filename)
{
cout << "Started parsing the file of tuples..." << endl;
vector<Tuple> mt;
string temp;
Tuple t;
ifstream infile;
infile.open(filename);
while(!(infile.eof()))
{
getline(infile,temp);
t.parseTuple(temp);
mt.push_back(t);
}
infile.close();
cout << "Done with reading tuples file..." << endl;
return mt;
}
vector<VerbPair> getVerbPairs(vector<Tuple> mytuples)
{
vector<VerbPair> pairs;
bool flag = false;
VerbPair temp;
for(int i=0;i<mytuples.size();i++)
{
flag = false;
for(int h=0;h<pairs.size();h++)
{
if (mytuples[i].verb.compare(pairs[h].verb) == 0)
{
pairs[h].freq += mytuples[i].count;
flag =true;
break;
}
}
if(! flag)
{
temp.verb = mytuples[i].verb;
temp.freq = mytuples[i].count;
pairs.push_back(temp);
}
}
return pairs;
}
/// Counts the lines in a text file.
///
/// @param filename Path of the file to inspect.
/// @return Number of lines successfully read; 0 for an empty file or one that
///         cannot be opened. A trailing newline does not add an extra line.
int numOfLines(const std::string& filename)
{
    std::ifstream infile(filename);

    int numLines = 0;
    std::string line;
    // Count only reads that succeed. Looping on !eof() would count one extra
    // line after a trailing newline and would never terminate if the file
    // failed to open, because a failed stream never reports EOF.
    while (std::getline(infile, line))
    {
        ++numLines;
    }
    return numLines;
}
void train(string filename)
{
mytuples = readTupleFile(filename);
verbpairs = getVerbPairs(mytuples);
}
// Placeholder: the body is currently empty, so `filename` is ignored and
// nothing is written.
void store(string filename)
{
}
// Placeholder: the body is currently empty, so `filename` is ignored and
// nothing is read.
void load(string filename)
{
}
int main()
{
cout << "Started Application..." << endl;
train(filename);
cout << "Size of verb pairs is " << verbpairs.size() << endl;
}
Tuple.h
#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <istream>
using namespace std;
/// One record of the tuples file: a count, a verb, a frame and any number of
/// trailing argument strings.
///
/// Fields on a line are separated by exactly one space or tab character.
class Tuple
{
public:
    int count = 0;                  ///< Leading numeric field of the record.
    std::string verb;               ///< Second field.
    std::string frame;              ///< Third field.
    std::vector<std::string> args;  ///< Every remaining field, in order.

    /// Parses one line and overwrites all four public members with its content.
    ///
    /// Missing fields are left empty (count becomes 0), so short or empty
    /// lines are safe to pass in. Two adjacent separators produce an empty
    /// field; a single trailing separator does not add an empty argument.
    ///
    /// @param s The line to parse, without its terminating newline.
    void parseTuple(const std::string& s)
    {
        std::cout << "parsing.... " << s << std::endl;

        std::string::size_type pos = 0;

        // Only leading decimal digits contribute to the count; anything else
        // in the field ends the number instead of being folded into it.
        const std::string number = nextField(s, pos);
        int value = 0;
        for (const char c : number)
        {
            if (c < '0' || c > '9')
            {
                break;
            }
            value = value * 10 + (c - '0');
        }
        count = value;

        verb = nextField(s, pos);
        frame = nextField(s, pos);

        // Start from an empty list: when the same object parses several lines,
        // arguments from earlier lines must not pile up in this record.
        args.clear();
        while (pos < s.length())
        {
            args.push_back(nextField(s, pos));
        }
    }

private:
    /// Returns the text from `pos` up to the next space/tab (or end of line)
    /// and advances `pos` past that single separator when one is present.
    static std::string nextField(const std::string& s, std::string::size_type& pos)
    {
        std::string::size_type end = s.find_first_of(" \t", pos);
        if (end == std::string::npos)
        {
            end = s.length();
        }
        const std::string field = s.substr(pos, end - pos);
        pos = (end < s.length()) ? end + 1 : end;
        return field;
    }
};
以及 VerbPair.h:
#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <istream>
using namespace std;
/// A verb paired with an integer frequency.
class VerbPair
{
public:
    std::string verb;  ///< The verb text.
    int freq = 0;      ///< Frequency; zero-initialised so a default-constructed
                       ///< object never holds an indeterminate value.
};