c++ - CPP：解析字符串流太慢

Question

我的 cpp 代码需要读取由空格分隔的浮点值组成的 7 MB 文本文件。将字符串值解析为浮点数组大约需要 6 秒，这对我的用例来说太多了。

我一直在网上查，人们说通常是物理 IO 需要时间。为了消除这种情况，我一次性将文件读入字符串流，并将其用于浮点解析。代码速度仍然没有提高。任何想法如何让它运行得更快？

这是我的代码（为简单起见，用 dummy_f 替换了数组条目）：

    #include "stdafx.h"
    #include <iostream>
    #include <fstream>
    #include "time.h"
    #include <sstream>
    using namespace std;

    // Benchmark from the question: loads test_file.txt into a stringstream and
    // then times how long it takes to extract floats from it with operator>>.
    int main()
    {
      ifstream testfile;
      string filename = "test_file.txt";
      testfile.open(filename.c_str());

      // Copy the whole file into an in-memory stream before the clock starts,
      // so the timed section below never touches the file stream.
      stringstream string_stream;
      string_stream << testfile.rdbuf();

      testfile.close();

      // Timing starts here, after the file contents are already in memory.
      clock_t begin = clock();
      // Every extraction overwrites this variable; the parsed values are discarded.
      float dummy_f;

      cout<<"started stream at time "<<(double) (clock() - begin) /(double) CLOCKS_PER_SEC<<endl;

      // Each outer iteration extracts one value and then 120 more,
      // i.e. 121 extractions per iteration, 6375 iterations in total.
      for(int t = 0; t < 6375; t++)
      {

           string_stream >> dummy_f;

           for(int t1 = 0; t1 < 120; t1++)
           {
               string_stream >> dummy_f;
           }
      }

      cout<<"finished stream at time "<<(double) (clock() - begin) /(double) CLOCKS_PER_SEC<<endl;

      // Replace the stream's buffered contents with an empty string.
      string_stream.str("");

      return 0;
     }

编辑：

这是 test_cases.txt 文件的链接：https://drive.google.com/file/d/0BzHKbgLzf282N0NBamZ1VW5QeFE/view?usp=sharing

使用此文件运行时，请将内层循环的次数改为 128（上面代码中的 120 是笔误）

编辑：找到了一种让它工作的方法。将 dummy_f 声明为字符串并从字符串流中读取为字符串单词。然后使用 atof 将字符串转换为浮点数。花费的时间是 0.4 秒，这对我来说已经足够了。

  string dummy_f;
  vector<float> my_vector;
  for(int t = 0; t < 6375; t++)
  {

       string_stream >> dummy_f;
       my_vector.push_back(atof(dummy_f.c_str()));
       for(int t1 = 0; t1 < 128; t1++)
       {
           string_stream >> dummy_f;
            my_vector.push_back(atof(dummy_f.c_str()));
       }
  }

score 0 · Accepted Answer

在我的 Linux 机器上它只需要 <0.3 秒，所以如果 OP 在 Debug/Release 构建中没有出错，那么它应该是 Windows 独有的问题：

hidden$ cat read-float.cpp 
#include <fstream>
#include <iostream>
#include <vector>
using namespace std;

// Reads up to 6375 * 129 whitespace-separated floats from /tmp/xx.txt and
// reports how many were actually parsed.
int main() {
  constexpr int kRows = 6375;
  constexpr int kCols = 129;
  constexpr vector<float>::size_type kTotal =
      static_cast<vector<float>::size_type>(kRows) * kCols;

  ifstream fs("/tmp/xx.txt");
  if (!fs) {
    // Without this check a missing file would still be reported as a full read.
    cerr << "Cannot open /tmp/xx.txt" << endl;
    return 1;
  }

  vector<float> v;
  v.reserve(kTotal);  // size is known up front: avoid reallocations while filling

  // Stop at the expected count or at the first failed extraction, so the
  // number printed below reflects values that were really parsed.
  float f;
  while (v.size() < kTotal && fs >> f) {
    v.emplace_back(f);
  }
  cout << "Read " << v.size() << " floats" << endl;
}
hidden$ g++ -std=c++11 read-float.cpp -O3
hidden$ time ./a.out 
Read 822375 floats

real    0m0.287s
user    0m0.279s
sys 0m0.008s

hidden$ g++ -v
Using built-in specs.
COLLECT_GCC=g++
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/4.8/lto-wrapper
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu 4.8.4-2ubuntu1~14.04' --with-bugurl=file:///usr/share/doc/gcc-4.8/README.Bugs --enable-languages=c,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-4.8 --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.8 --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --enable-gnu-unique-object --disable-libmudflap --enable-plugin --with-system-zlib --disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-4.8-amd64/jre --enable-java-home --with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-4.8-amd64 --with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-4.8-amd64 --with-arch-directory=amd64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
Thread model: posix
gcc version 4.8.4 (Ubuntu 4.8.4-2ubuntu1~14.04)

score 0 · Accepted Answer

下面粘贴了一个使用 atof 的替代实现，运行速度快约 3 倍。在我的笔记本电脑上，基于字符串流的原始版本需要 2.3 秒才能完成，而对于相同数量的浮点数，这个实现在 0.8 秒内完成。

// Scratch buffer for the whole input file plus one terminating '\0'.
// Input beyond sizeof(filecontents) - 1 bytes is not read.
static char filecontents[10*1024*1024];

/**
 * Reads "test_file.txt" into filecontents, splits it in place into tokens
 * separated by bytes that compare <= ' ' (space, '\r', '\n', other control
 * characters, NUL), and converts every token with atof.
 *
 * Prints the elapsed CPU time of the parsing phase and the token count.
 *
 * @return the number of tokens that were converted (0 if the file could not
 *         be read or is empty).
 */
int testfun2()
{
  std::ifstream testfile;
  std::string filename = "test_file.txt";
  testfile.open(filename.c_str());
  int numfloats=0;
  // Leave one byte free so the terminator written below always stays inside
  // the buffer, even when the file is as large as (or larger than) the buffer.
  testfile.read(filecontents, static_cast<std::streamsize>(sizeof(filecontents) - 1));
  std::size_t numBytesRead = static_cast<std::size_t>(testfile.gcount());
  filecontents[numBytesRead]='\0';
  testfile.close();

  // Only the parsing below is timed; the file is already in memory.
  std::clock_t begin = std::clock();
  float dummy_f = 0.0f;

  std::cout<<std::endl<<"started at time "<<(double) (std::clock() - begin) /(double) CLOCKS_PER_SEC<<std::endl;

  char* p= filecontents;
  char* pend = p + numBytesRead;
  while(p<pend)
  {
      // Skip separators. Bounding by pend (rather than stopping at '\0')
      // means a NUL byte inside the data is skipped like any other separator
      // instead of stalling the loop.
      while(p<pend && (*p <= ' '))
      {
         ++p; //skip leading white space ,\r, \n
      }
      if(p==pend)
      {
         break; // only separators were left
      }
      char* pvar = p;
      while(p<pend && (*p > ' '))
      {
        ++p; //skip over numbers
      }
      // p is now on the separator after the token, or on the terminator at
      // pend. Terminate the token in place: shorter input makes atof faster.
      *p = '\0';
      if(p<pend)
      {
        ++p;
      }
      dummy_f = static_cast<float>(std::atof(pvar));
      ++numfloats;
      //cout << endl << dummy_f;
  }

  std::cout<<std::endl<< "finished at time "<<(double) (std::clock() - begin) /(double) CLOCKS_PER_SEC<<std::endl;

  std::cout << std::endl << "numfloats= " << numfloats;
  return numfloats;
 }

score 0 · Accepted Answer

更新：在评论中与 @Mats 讨论后得出结论，加锁开销不太可能与此有关，所以我们又回到了原点：仍然需要解释为什么 Visual C++ 的库在解析浮点数时如此缓慢。您的示例测试文件中的数字看起来大多与 1.0 的数量级相差不远，没有什么异常之处。（根据 Agner Fog 的表格，英特尔 Sandybridge 及之后的 FPU 对非规格化数（denormal）本来就没有性能惩罚。）

正如其他人所说，是时候分析您的代码并找出哪个函数占用了所有 CPU 时间。此外，性能计数器可以告诉您分支错误预测或缓存未命中是否导致问题。

每次调用 cin >> dummy_f 都需要加锁，以确保另一个线程不会同时修改输入缓冲区。如果这是瓶颈所在，用 scanf("%f%f%f%f", &dummy_array[0], &dummy_array[1], ...) 一次读取 4 或 8 个浮点数会更有效率。（scanf 也不是一个很好的 API，因为它需要把每个数组元素的地址作为函数参数传入。但是，在一次 scanf 调用中使用多个转换说明符来展开循环，仍然能带来小幅的性能提升。）

您正在尝试使用 stringstream 解决此问题，这可能有效也可能无效。它是函数中的局部变量，所以如果编译器可以看到所有函数并内联它们，它就不用担心锁定。不能有任何其他线程可以访问此变量。

c++ - CPP：解析字符串流太慢

3 回答 3

Related

Reference