c++ - 提高字符串的分配性能

Question

我将一个 Java GC 测试程序移植到了 C++（见下面的代码）和 Python。Java 和 Python 版本的性能比 C++ 好得多，我认为这是因为 C++ 每次创建字符串都必须调用 new。我尝试过使用 Boost 的 fast_pool_allocator，但性能反而从 700ms 恶化到了 1200ms。是我使用分配器的方式不对，还是我应该采取其他做法？

编辑：编译命令为 g++ -O3 -march=native --std=c++11 garbage.cpp -lboost_system，g++ 版本为 4.8.1。Python 的一次迭代大约需要 300 毫秒，Java 大约需要 50 毫秒；C++ 使用 std::allocator 时大约需要 700 毫秒，使用 boost::fast_pool_allocator 时大约需要 1200 毫秒。

#include <string>
#include <vector>
#include <chrono>
#include <list>
#include <iostream>
#include <boost/pool/pool_alloc.hpp>
#include <memory>
//#include <gc/gc_allocator.h>


using namespace std;
#include <sstream>
typedef boost::fast_pool_allocator<char> c_allocator;
//typedef std::allocator<char> c_allocator;
typedef basic_string<char, char_traits<char>, c_allocator> pool_string;
namespace patch {
    /// Formats `in` with operator<< and returns the text as a pool_string.
    ///
    /// A new string stream (backed by c_allocator) is constructed on every
    /// call, written to once, and then discarded after its contents are copied
    /// out with str().
    ///
    /// @param in  value to format; must be streamable with operator<<.
    /// @return    the streamed representation of `in`.
    template <typename T>
    pool_string to_string(const T& in) {
        typedef std::basic_stringstream<char, std::char_traits<char>, c_allocator> pool_stream;

        pool_stream buffer;
        buffer << in;
        return buffer.str();
    }
}


#include "mytime.hpp"

/// Allocation benchmark: repeatedly fills a vector with freshly converted
/// number strings, clears it, and prints how long each round took.
class Garbage {
public:
    vector<pool_string> outer;   // refilled and cleared on every round of go()
    vector<pool_string> old;     // rebuilt by doOld(), kept alive between rounds
    const int nThreads = 1;      // divisor applied to the per-round string count

    /// Runs 10 rounds. Each round pushes 1000000/nThreads strings into `outer`,
    /// clears it, and prints the difference between two mytime::msecs()
    /// readings. Rounds whose index is a multiple of 100 (only round 0 here)
    /// first rebuild `old` and restart the measurement afterwards, so doOld()
    /// is excluded from the printed value.
    void go() {
        const int perRound = 1000000 / nThreads;
        auto previous = mytime::msecs();

        for (int round = 0; round < 10; ++round) {
            const bool refreshOld = (round % 100 == 0);
            if (refreshOld) {
                cout << "DOING AN OLD" << endl;
                doOld();
                previous = mytime::msecs();
            }

            int value = 0;
            while (value < perRound) {
                outer.push_back(patch::to_string(value));
                ++value;
            }

            outer.clear();

            auto current = mytime::msecs();
            cout << (current - previous) << endl;
            previous = current;
        }
    }

    /// Empties `old` and refills it with the strings for 0 .. 1000000/nThreads - 1.
    void doOld() {
        old.clear();

        const int total = 1000000 / nThreads;
        int value = 0;
        while (value < total) {
            old.push_back(patch::to_string(value));
            ++value;
        }
    }
};

int main() {
    Garbage().go();
}

score 5 · Accepted Answer

问题是您每次都使用新的字符串流来转换整数。

修复方法：

namespace patch {
    /// Converts `in` to a pool_string by delegating to boost::lexical_cast,
    /// so no string stream object is created in this function.
    ///
    /// @param in  value to convert.
    /// @return    the textual form of `in` as produced by boost::lexical_cast.
    template <typename T>
    pool_string to_string(const T& in) {
        pool_string converted = boost::lexical_cast<pool_string>(in);
        return converted;
    }
}

现在时间是：

DOING AN OLD
0.175462
0.0670085
0.0669926
0.0687969
0.0692518
0.0669318
0.0669196
0.0669187
0.0668962
0.0669185

real    0m0.801s
user    0m0.784s
sys 0m0.016s

在 Coliru 上在线查看（Live On Coliru）

完整代码供参考：

#include <boost/pool/pool_alloc.hpp>
#include <chrono>
#include <iostream>
#include <list>
#include <memory>
#include <sstream>
#include <string>
#include <vector>
#include <boost/lexical_cast.hpp>
//#include <gc/gc_allocator.h>

using string = std::string;

namespace patch {
    /// Converts `in` to a string by delegating to boost::lexical_cast.
    ///
    /// @param in  value to convert.
    /// @return    the textual form of `in` as produced by boost::lexical_cast.
    template <typename T>
    string to_string(const T& in) {
        using boost::lexical_cast;
        return lexical_cast<string>(in);
    }
}

/// Minimal stopwatch built on std::chrono::high_resolution_clock.
class Timer
{
    using clock_type = std::chrono::high_resolution_clock;

    clock_type::time_point _start;  // reading taken at construction or the last reset()

  public:
    /// Starts measuring from the moment of construction.
    Timer() : _start(clock_type::now()) {}

    /// Restarts the measurement from the current clock reading.
    void reset()
    {
        _start = now();
    }

    /// Returns the seconds elapsed since construction or the last reset().
    /// The interval is truncated to whole nanoseconds before being scaled.
    double elapsed()
    {
        const std::chrono::nanoseconds delta =
            std::chrono::duration_cast<std::chrono::nanoseconds>(now() - _start);
        return delta.count() * 1.0e-9;
    }

    /// Returns the current reading of the underlying clock.
    clock_type::time_point now() { return clock_type::now(); }
};


class Garbage {
    public:
        std::vector<string> outer;
        std::vector<string> old;
        const int nThreads = 1;

        void go() {
            outer.resize(1000000);
            //old.reserve(1000000);
            Timer timer;

            for (int i = 0; i < 10; ++i) {
                if (i % 100 == 0) {
                    std::cout << "DOING AN OLD" << std::endl;
                    doOld();
                }

                for (int j = 0; j < 1000000/nThreads; ++j)
                    outer.push_back(patch::to_string(j));

                outer.clear();
                std::cout << timer.elapsed() << std::endl;
                timer.reset();
            }
        }

        void doOld() {
            old.clear();
            for (int i = 0; i < 1000000/nThreads; ++i)
                old.push_back(patch::to_string(i));
        }
};

int main() {
    Garbage().go();
}

score 2 · Accepted Answer

由于我的机器上没有安装 boost，我简化了代码，改用标准 C++11 的 to_string（因此无意中“修复”了 sehe 发现的问题），并得到了这个：

#include <string>
#include <vector>
#include <chrono>
#include <list>
#include <iostream>
#include <memory>
//#include <gc/gc_allocator.h>
#include <sstream>
using namespace std;


/// Minimal stopwatch built on std::chrono::high_resolution_clock.
class Timer
{
    using clock_type = std::chrono::high_resolution_clock;

    clock_type::time_point _start;  // reading taken at construction or the last reset()

    public:
    /// Starts measuring from the moment of construction.
    Timer() : _start(clock_type::now()) {}

    /// Restarts the measurement from the current clock reading.
    void reset()
    {
        _start = now();
    }

    /// Returns the seconds elapsed since construction or the last reset().
    /// The interval is truncated to whole nanoseconds before being scaled.
    double elapsed()
    {
        typedef std::chrono::nanoseconds ns;
        const ns delta = std::chrono::duration_cast<ns>(now() - _start);
        return delta.count() * 1.0e-9;
    }

    /// Returns the current reading of the underlying clock.
    clock_type::time_point now() { return clock_type::now(); }
};


/// Allocation benchmark: repeatedly fills a vector with std::to_string
/// results, clears it, and prints the seconds each round took.
class Garbage {
public:
    vector<string> outer;      // refilled and cleared on every round of go()
    vector<string> old;        // rebuilt by doOld(), kept alive between rounds
    const int nThreads = 1;    // divisor applied to the per-round string count
    Timer timer;               // starts when the Garbage object is constructed

    /// Runs 10 rounds; each pushes 1000000/nThreads strings into `outer`,
    /// clears it, prints the timer reading and restarts the timer.
    /// The timer is not restarted after doOld(), so the first printed value
    /// also covers the rebuild of `old`.
    void go() {
        const int batch = 1000000 / nThreads;

        int round = 0;
        while (round < 10) {
            // With only 10 rounds this is true for round 0 alone.
            if (round % 100 == 0) {
                cout << "DOING AN OLD" << endl;
                doOld();
            }

            for (int value = 0; value < batch; ++value)
                outer.emplace_back(to_string(value));

            outer.clear();
            cout << timer.elapsed() << endl;
            timer.reset();
            ++round;
        }
    }

    /// Empties `old` and refills it with the strings for 0 .. 1000000/nThreads - 1.
    void doOld() {
        old.clear();

        const int batch = 1000000 / nThreads;
        for (int value = 0; value < batch; ++value)
            old.emplace_back(to_string(value));
    }
};

int main() {
    Garbage().go();
}

编译：

$ g++ -O3 -std=c++11 gc.cpp
$ ./a.out
DOING AN OLD
0.414637
0.189082
0.189143
0.186336
0.184449
0.18504
0.186302
0.186055
0.183123
0.186835

使用 2014 年 4 月 18 日星期五的源代码构建的 clang 3.5 使用相同的编译器选项给出了类似的结果。

我的处理器是 AMD Phenom(tm) II X4 965，运行频率为 3.6GHz（如果我没记错的话）。

c++ - 提高字符串的分配性能

2 回答 2

Related

Reference