因此,使用以下代码,将参数的类型x
从更改const ull
为const ull&
(with typedef unsigned long long ull
) 会在使用 gcc 4.7.2 和 flags 编译时产生大约 25% 的加速-O3 -std=c++11 -g
,我不明白为什么这会产生如此大的差异。
static void inline single_mult(const std::vector<ull>::iterator& data,
const std::vector<ull>::const_iterator& rbegin,
const std::vector<ull>::const_iterator& rend,
const ull x) {
ull tmp=0, carry=0, i=0;
for (auto rhs_it = rbegin; rhs_it != rend; ++rhs_it) {
tmp = x*(*rhs_it) + data[i] + carry;
if (tmp >= imax) {
carry = tmp >> numbits;
tmp &= imax - 1;
} else {
carry = 0;
}
data[i++] = tmp;
}
data[i] += carry;
}
它在以下函数中调用(用于做课本长乘法)
static void naive(std::vector<ull>::iterator data,
std::vector<ull>::const_iterator cbegin,
std::vector<ull>::const_iterator cend ,
std::vector<ull>::const_iterator rbegin,
std::vector<ull>::const_iterator rend) {
for (auto data_it = cbegin;
data_it != cend; ++data_it) {
if (*data_it != 0) {
single_mult(data, rbegin, rend, *data_it);
}
++data;
}
}
计时是通过调用clock()
一个循环来测量它需要多长时间来完成的。不是最准确/最精确的方式,但我认为一致的 25% 差异意味着差异具有统计学意义。
完整的工作代码块:
#include <vector>
#include <limits>
#include <ctime>
#include <iostream>
typedef unsigned long long ull;
typedef unsigned int uint;
const ull numbits = (ull) std::numeric_limits<uint>::digits;
const ull imax = 1LL << numbits;
static void inline single_mult(const std::vector<ull>::iterator& data,
const std::vector<ull>::const_iterator& rbegin,
const std::vector<ull>::const_iterator& rend,
const ull x) {
ull tmp=0, carry=0, i=0;
for (auto rhs_it = rbegin; rhs_it != rend; ++rhs_it) {
tmp = x*(*rhs_it) + data[i] + carry;
if (tmp >= imax) {
carry = tmp >> numbits;
tmp &= imax - 1;
} else {
carry = 0;
}
data[i++] = tmp;
}
data[i] += carry;
}
static void naive(std::vector<ull>::iterator data,
std::vector<ull>::const_iterator cbegin,
std::vector<ull>::const_iterator cend ,
std::vector<ull>::const_iterator rbegin,
std::vector<ull>::const_iterator rend) {
for (auto data_it = cbegin; data_it != cend; ++data_it) {
if (*data_it != 0) {
single_mult(data, rbegin, rend, *data_it);
}
++data;
}
}
int main() {
int N = 10000;
std::vector<ull> vec(N);
for (int i=0; i<N; ++i) {
vec[i] = i;
}
auto s1 = clock();
int num = 10;
std::vector<ull> result(2*N);
for (int i=0; i<num; ++i) {
//Need to zero out the vector or the result will be different.
std::fill(result.begin(), result.end(), 0);
naive(result.begin(), vec.cbegin(), vec.cend(), vec.cbegin(), vec.cend());
}
s1 = (clock() - s1);
std::cout << "Took " << float(s1)/CLOCKS_PER_SEC << "seconds total." << std::endl;
}
和运行时(我命名了传递值value.cpp
和引用的文件reference.cpp
)
$ g++ -O3 -std=c++11 -g -o reference reference.cpp
$ g++ -O3 -std=c++11 -g -o value value.cpp
$ ./reference
Took 1.05seconds total.
$ ./value
Took 1.83seconds total.