自 C++17 起,标准库提供了并行算法,因此我尝试用下面的代码对一组数字求和,想看看性能是否有所提升。
#include <algorithm>
#include <chrono>
#include <execution>
#include <numeric>
#include <iostream>
#include <vector>
// Benchmark driver: fills a vector with 0..n-1, then sums it by splitting it
// into k contiguous batches that are reduced inside a parallel std::for_each.
// For k = 1, 3 and 5 it prints the batch count, the elapsed time in
// microseconds and the grand total.
int main() {
    const size_t n = 100000000;
    std::vector<size_t> vec(n);
    std::iota(vec.begin(), vec.end(), 0);

    // Runs one timed pass with k batches. rez starts out holding the batch
    // ids 0..k-1 and each slot is overwritten with that batch's partial sum.
    auto par_sum = [&](size_t k) {
        // steady_clock is monotonic, so the measured interval cannot be
        // distorted by wall-clock adjustments.
        const auto t1 = std::chrono::steady_clock::now();

        std::vector<size_t> rez(k);
        std::iota(rez.begin(), rez.end(), 0);
        const size_t batch = n / k + 1;

        std::for_each(std::execution::par_unseq, rez.begin(), rez.end(),
                      [&](size_t& slot) {
                          const size_t id = slot;
                          // Clamp both ends to n: the "+ 1" in batch can push
                          // the trailing batches past the end of vec.
                          const size_t first = std::min(id * batch, n);
                          const size_t last = std::min((id + 1) * batch, n);
                          slot = std::accumulate(vec.data() + first,
                                                 vec.data() + last, size_t{0});
                      });

        const auto t2 = std::chrono::steady_clock::now();
        const auto duration =
            std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();

        // The seed's type is the accumulator's type. size_t{0} matches the
        // element type, whereas a `0lu` seed would accumulate in
        // `unsigned long`, which may be narrower than size_t.
        std::cout << "n_worker = " << k
                  << ", time = " << duration
                  << ", rez = " << std::accumulate(rez.begin(), rez.end(), size_t{0})
                  << '\n';
    };

    par_sum(1);
    par_sum(3);
    par_sum(5);
}
编译命令:
g++ -std=c++17 -L/usr/local/lib -O3 -mavx -ltbb a.cpp
运行结果:
n_worker = 1, time = 51875, rez = 4999999950000000
n_worker = 3, time = 57616, rez = 4999999950000000
n_worker = 5, time = 63193, rez = 4999999950000000
问题:
- 与只用 1 个工作线程相比,使用更多工作线程并没有带来性能提升,为什么?