c++ - 分配数组会减慢计算速度

Question

我正在研究使用余弦相似度尽快在两组中找到最相似向量的代码。
代码使用原始数组（为了速度和简单性），我开始注意到，当我分配更多数组时，程序变慢了，即使我根本没有改变我的计算。我设法将程序提炼到以下一百行左右，而不会丢失问题：

#include <iostream>

const int vec_len = 192;

struct fvec
{
    int64_t nvec;
    short int **vecs;
#ifdef PARTIALS
    int **partials;
#endif
    fvec(int size)
    {
        nvec = size;
        vecs = new short int *[nvec];
#ifdef PARTIALS
        partials = new int *[nvec];
#endif
        for (int64_t i = 0; i < nvec; i++)
        {
            vecs[i] = new short int[vec_len];
#ifdef PARTIALS
            partials[i] = new int[vec_len];
#endif
            for (int j = 0; j < vec_len; j++) vecs[i][j] = std::rand() * 10000 / RAND_MAX;
        }
    }
    ~fvec()
    {
        for (int64_t i = 0; i < nvec; i++)
        {
            delete[] vecs[i];
#ifdef PARTIALS
            delete[] partials[i];
#endif
        }
        delete[] vecs;
#ifdef PARTIALS
        delete[] partials;
#endif
    }
};

struct cvec
{
    int nvec;
    short int **vecs;
#ifdef PARTIALS
    int **partials;
#endif
    cvec(int size)
    {
        nvec = size;
        vecs = new short int *[nvec];
#ifdef PARTIALS
        partials = new int *[nvec];
#endif
        for (int nv = 0; nv < nvec; nv++)
        {
            vecs[nv] = new short int[vec_len];
#ifdef PARTIALS
            partials[nv] = new int[vec_len];
#endif
            for (int i = 0; i < vec_len; i++) vecs[nv][i] = std::rand() * 10000 / RAND_MAX;
        }
    }
    ~cvec()
    {
        for (int i = 0; i < nvec; i++)
        {
            delete[] vecs[i];
#ifdef PARTIALS
            delete[] partials[i];
#endif
        }
        delete[] vecs;
#ifdef PARTIALS
        delete[] partials;
#endif
    }
};

float sim(short int *a, short int *b)
{
    int ret = 0;
    for (int i = 0; i < vec_len; i++) ret += a[i] * b[i];
    return ret;
}

void iterative_nn(const cvec &c, const fvec &f, int *results)
{
    for (int64_t i = 0; i < f.nvec; i++)
    {
        results[i] = 0;
        for (int j = 0; j < c.nvec; j++)
        {
            float tmpsim = sim(f.vecs[i], c.vecs[j]);
            if (tmpsim > results[i]) results[i] = tmpsim;
        }
        if (i % 100 == 0) std::cout << "\r" << i << std::flush;
    }
}

int main(int argc, char **argv)
{
    int res[5000];
    iterative_nn(cvec{100000}, fvec{5000}, res);
    std::cout << "\n";
    return 0;
}

如您所见，我有两个类保存两组数组。我用随机值填充两组数组（用于演示），然后调用一个函数来遍历所有数组并计算它们的相似性。
当我通过在命令行上指定 -DPARTIALS 为每个类添加另一组数组时，程序会减慢到我计算机速度的一半左右。显然，该指令涉及的唯一行是附加数组的分配和释放！
此外，额外的时间并没有花在分配和释放上，在任何一种情况下都需要不到一秒钟的时间。额外的时间花在了迭代搜索上，指令没有影响（或者我认为是这样）。因此，我的问题是：仅仅分配使我的程序减慢一半的额外数组是什么？

上面的代码需要使用 -std=c++11 编译。如果我使用 -O3，它对我来说运行大约 25 秒或 1 分钟。

score 0 · Accepted Answer

导致性能下降的因素有两个：

当 CPU 在计算循环中从内存中加载数据时，会发生更多的缓存命中失败。
新的和删除的时间。

我已将以下代码移到一个单独的循环中，它显着提高了性能，我相信这是因为第 1 项。

#ifdef PARTIALS
            partials[nv] = new int[vec_len];
#endif

无部分原码：1m16s。
带部分的原始代码：1m40s。
没有部分的单独循环：1 分 16 秒。
带部分的单独循环：1m20s。

所以在我的情况下，#1 大约需要 4 秒。并且缓存未命中大约需要 20 秒。

更改后的代码如下（我是用 O3 而不是 c11 构建的）：

#include <iostream>

const int vec_len = 192;

struct fvec
{
    int64_t nvec;
    short int **vecs;
#ifdef PARTIALS
    int **partials;
#endif
    fvec(int size)
    {
        nvec = size;
        vecs = new short int *[nvec];
#ifdef PARTIALS
        partials = new int *[nvec];
#endif
#ifdef PARTIALS // <<<<< put it here in an separator loop.
        for (int64_t i = 0; i < nvec; i++)
        {
            partials[i] = new int[vec_len];
        }
#endif
        for (int64_t i = 0; i < nvec; i++)
        {
            vecs[i] = new short int[vec_len];
            for (int j = 0; j < vec_len; j++) vecs[i][j] = std::rand() * 10000 / RAND_MAX;
        }
    }
    ~fvec()
    {
        for (int64_t i = 0; i < nvec; i++)
        {
            delete[] vecs[i];
#ifdef PARTIALS
            delete[] partials[i];
#endif
        }
        delete[] vecs;
#ifdef PARTIALS
        delete[] partials;
#endif
    }
};

struct cvec
{
    int nvec;
    short int **vecs;
#ifdef PARTIALS
    int **partials;
#endif
    cvec(int size)
    {
        nvec = size;
        vecs = new short int *[nvec];
#ifdef PARTIALS
        partials = new int *[nvec];
#endif

#ifdef PARTIALS // <<<<< put it here in an separator loop.
        for (int nv = 0; nv < nvec; nv++)
        {
            partials[nv] = new int[vec_len];
        }
#endif

        for (int nv = 0; nv < nvec; nv++)
        {
            vecs[nv] = new short int[vec_len];
            for (int i = 0; i < vec_len; i++) vecs[nv][i] = std::rand() * 10000 / RAND_MAX;
        }
    }
    ~cvec()
    {
#ifdef PARTIALS 
        for (int i = 0; i < nvec; i++)
        {
            delete[] partials[i];
        }
#endif

        for (int i = 0; i < nvec; i++)
        {
            delete[] vecs[i];
        }
        delete[] vecs;
#ifdef PARTIALS
        delete[] partials;
#endif
    }
};

float sim(short int *a, short int *b)
{
    int ret = 0;
    for (int i = 0; i < vec_len; i++) ret += a[i] * b[i];
    return ret;
}

void iterative_nn(const cvec &c, const fvec &f, int *results)
{
    for (int64_t i = 0; i < f.nvec; i++)
    {
        results[i] = 0;
        for (int j = 0; j < c.nvec; j++)
        {
            float tmpsim = sim(f.vecs[i], c.vecs[j]);
            if (tmpsim > results[i]) results[i] = tmpsim;
        }
        if (i % 100 == 0) std::cout << "\r" << i << std::flush;
    }
}

int main(int argc, char **argv)
{
    int res[5000];
    iterative_nn(cvec(100000), fvec(5000), res);
    std::cout << "\n";
    return 0;
}

c++ - 分配数组会减慢计算速度

1 回答 1

Related

Reference