#include <iostream>
#include <vector>
#include <stdexcept>
#include <sstream>
#include <omp.h>
std::vector<int> col_sums(const std::vector<std::vector<short>>& data) {
unsigned int height = data.size(), width = data[0].size();
std::vector<int> totalSums(width, 0), threadSums(width, 0);
#pragma omp parallel firstprivate(threadSums)
{
#pragma omp parallel for
for (unsigned int i = 0; i < height; i++) {
threadSums.data()[0:width] += data[i].data()[0:width];
}
#pragma omp critical
{
totalSums.data()[0:width] += threadSums.data()[0:width];
}
}
return totalSums;
}
int main(int argc, char** argv) {
if (argc < 3) {
std::cout << "Run program as \"executable <rows> <columns>\n";
} else {
std::stringstream args;
args << argv[1] << " " << argv[2];
int rows, columns;
args >> rows >> columns;
std::vector<std::vector<short>> data(rows, std::vector<short>(columns));
std::vector<int> columnSums = col_sums(data);
}
}
export OMP_NUM_THREADS=4
icpc -Ofast -fopenmp -g dummy.cpp -o dummy
/usr/bin/time -v ./dummy 115000 20000
- CPU% = 225%(应该是 380%+)
我对 OpenMP 和 CilkPlus 相当有经验,但在这里扩展的障碍让我望而却步,这是一个相当初级的程序。我知道它必须是显而易见的,但我觉得我已经消除了所有的数据危害和控制危害。我完全被难住了。