c++ - 在 C++ 的 vector 中保存大量数据

Question

我有一个包含大量数据的文件，需要读取它并做一些概率方面的处理，因此我需要统计整个文件中每个单词的出现次数，再在此基础上做进一步的计算。该文件包含约 150 万条记录，每条记录大约有 6 个字符串。我用 vector 来保存这些数据，但程序在保存了大约 8000 条记录后就崩溃了。有没有办法把这个 vector 保存到磁盘上，而不是放在程序的内存中？另外，我在搜索时看到过一种叫做符号表（symbol table）的东西，但我不明白它是什么意思，也不知道该如何使用。

这个问题有什么解决办法吗？

这是主文件

#include <fstream>
#include <iostream>
#include <istream>
#include <string>
#include <unordered_map>
#include <vector>

#include "Tuple.h"
#include "VerbPair.h"
using namespace std;

// Input file that main() passes to train().
string filename = "verb-argument-tuples.txt";
// Tuples returned by readTupleFile(); assigned in train().
vector<Tuple> mytuples;
// Per-verb totals returned by getVerbPairs(); assigned in train().
vector<VerbPair> verbpairs;

vector<Tuple> readTupleFile(string filename)
{
    cout << "Started parsing the file of tuples..." << endl;
    vector<Tuple> mt;
    string temp;
    Tuple t;

    ifstream infile;
    infile.open(filename);
    while(!(infile.eof()))
    {
        getline(infile,temp);
        t.parseTuple(temp);
        mt.push_back(t);
    }

    infile.close();
    cout << "Done with reading tuples file..." << endl;
    return mt;
}

vector<VerbPair> getVerbPairs(vector<Tuple> mytuples)
{
    vector<VerbPair> pairs;
    bool flag = false;
    VerbPair temp;
    for(int i=0;i<mytuples.size();i++)
    {
        flag = false;
        for(int h=0;h<pairs.size();h++)
        {
            if (mytuples[i].verb.compare(pairs[h].verb) == 0)
            {
                pairs[h].freq += mytuples[i].count;
                flag =true;
                break;
            }
        }
        if(! flag)
        {
            temp.verb = mytuples[i].verb;
            temp.freq = mytuples[i].count;
            pairs.push_back(temp);
        }
    }
    return pairs;
}

/**
 * Counts the lines of a text file.
 *
 * A final line without a trailing newline is counted; an empty file has zero
 * lines.
 *
 * @param filename Path of the file to inspect.
 * @return Number of lines read, or 0 when the file cannot be opened.
 */
int numOfLines(std::string filename)
{
    std::ifstream infile(filename);

    int numLines = 0;
    std::string line;
    // Count only successful reads. Looping on eof() counted the failed read at
    // end of file as an extra line and never terminated for a missing file.
    while (std::getline(infile, line))
    {
        ++numLines;
    }
    return numLines;
}

void train(string filename)
{
    mytuples = readTupleFile(filename);
    verbpairs = getVerbPairs(mytuples);
}
// Placeholder: the body is empty, so calling this currently does nothing.
// TODO: implement.
void store(string filename)
{

}
// Placeholder: the body is empty, so calling this currently does nothing.
// TODO: implement.
void load(string filename)
{

}

int main()
{
    cout << "Started Application..." << endl;
    train(filename);
    cout << "Size of verb pairs is " << verbpairs.size() << endl;
}

Tuple.h

#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <istream>
using namespace std;

/**
 * One record of the tuples file.
 *
 * A record is a single line whose fields are each terminated by exactly one
 * space or tab:
 *
 *     <count> <verb> <frame> [<arg> ...]
 */
class Tuple
{
public:
    int count = 0;                  // value of the first field
    std::string verb;               // second field
    std::string frame;              // third field
    std::vector<std::string> args;  // all remaining fields, in order

    /**
     * Replaces the contents of this tuple with the fields parsed from s.
     *
     * Missing fields are left empty (count becomes 0), so empty or truncated
     * lines are safe to pass. Two adjacent separators yield an empty field.
     * The count field is assumed to consist of decimal digits only; it is not
     * validated.
     *
     * @param s One line of the tuples file, without the line terminator.
     */
    void parseTuple(const std::string& s)
    {
        std::cout << "parsing.... " << s << '\n';

        std::size_t pos = 0;

        int value = 0;
        for (char digit : nextField(s, pos))
        {
            value = value * 10 + (digit - '0');
        }
        count = value;

        verb = nextField(s, pos);
        frame = nextField(s, pos);

        // Start from an empty list: without this, parsing a second line with
        // the same object appends to the arguments of the first line.
        args.clear();
        while (pos < s.length())
        {
            args.push_back(nextField(s, pos));
        }
    }

private:
    // True for the two characters that terminate a field.
    static bool isSeparator(char c)
    {
        return c == ' ' || c == '\t';
    }

    // Returns the characters from pos up to the next separator or the end of
    // s, then advances pos past that single separator. pos never moves beyond
    // s.length(), so repeated calls on an exhausted line return "".
    static std::string nextField(const std::string& s, std::size_t& pos)
    {
        const std::size_t start = pos;
        while (pos < s.length() && !isSeparator(s[pos]))
        {
            ++pos;
        }
        std::string field = s.substr(start, pos - start);
        if (pos < s.length())
        {
            ++pos;
        }
        return field;
    }
};

和 VerbPair.h

#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <istream>
using namespace std;

/** A verb together with an integer frequency total. */
class VerbPair
{
public:
    std::string verb;  // the verb text
    int freq = 0;      // starts at zero so a default-constructed pair is well-defined
};

score 1 · Accepted Answer

Try calling the vector's reserve function. Since you know in advance that the data is large, reserving capacity up front avoids repeated reallocation as the vector grows.

Also, use a map in this case: with a map you can count the number of occurrences easily.

For the crash, you will have to show us the code.

score 0 · Accepted Answer

既然有重复数据，为什么还要使用 vector？只需使用 map<string,int>：每遇到一个单词，就把 map 中对应的值加一。

score 0 · Accepted Answer

你的代码中有很多变量遮蔽（shadowing）：比如你把 filename 声明为全局变量，三行之后又把同一个名字用作局部名称（函数参数）。元组向量和动词对向量也有同样的问题。

也许一些封装会使您的调试任务更容易。

另一个样式问题是这样的函数：

// Builds one VerbPair per distinct verb, summing the counts of all tuples
// that share it. The parameter is taken by value, so the function works on
// its own copy of the vector, and its name matches the global `mytuples`.
vector<VerbPair> getVerbPairs(vector<Tuple> mytuples)
{
    vector<VerbPair> pairs;
    bool flag = false;  // true once the current tuple's verb was found in pairs
    VerbPair temp;
    for(int i=0;i<mytuples.size();i++)
    {
        flag = false;
        // Linear search through every pair collected so far.
        for(int h=0;h<pairs.size();h++)
        {
            if (mytuples[i].verb.compare(pairs[h].verb) == 0)
            {
                pairs[h].freq += mytuples[i].count;
                flag =true;
                break;
            }
        }
        // Verb not seen before: append a new pair seeded with this count.
        if(! flag)
        {
            temp.verb = mytuples[i].verb;
            temp.freq = mytuples[i].count;
            pairs.push_back(temp);
        }
    }
    return pairs;
}

有两件事让调试变得困难：第一是变量遮蔽，第二是你没有让编译器帮你检查。

// Same algorithm, but the input is taken by const reference: no copy of the
// vector is made, and any attempt to modify it inside the function is a
// compile error.
vector<VerbPair> getVerbPairs(const vector<Tuple>& mytuples)
{
  vector<VerbPair> pairs;
  bool flag = false;  // true once the current tuple's verb was found in pairs
  VerbPair temp;
  for(int i=0;i<mytuples.size();i++)
    {
      flag = false;
      // Linear search through every pair collected so far.
      for(int h=0;h<pairs.size();h++)
    {
      if (mytuples[i].verb.compare(pairs[h].verb) == 0)
        {
          pairs[h].freq += mytuples[i].count;
          flag =true;
          break;
        }
    }
      // Verb not seen before: append a new pair seeded with this count.
      if(! flag)
    {
      temp.verb = mytuples[i].verb;
      temp.freq = mytuples[i].count;
      pairs.push_back(temp);
    }
    }
  return pairs;
}

这样，如果你试图修改 mytuples 向量，编译器就会告诉你。

c++ - 在 C++ 的 vector 中保存大量数据

3 回答 3

Related

Reference