1
#include <iostream>
#include <vector>
#include <stdexcept>
#include <sstream>
#include <omp.h>

std::vector<int> col_sums(const std::vector<std::vector<short>>& data) {
    unsigned int height = data.size(), width = data[0].size();
    std::vector<int> totalSums(width, 0), threadSums(width, 0);

    #pragma omp parallel firstprivate(threadSums)
    {
        #pragma omp parallel for
        for (unsigned int i = 0; i < height; i++) {
            threadSums.data()[0:width] += data[i].data()[0:width];
        }
        #pragma omp critical
        {
            totalSums.data()[0:width] += threadSums.data()[0:width];
        }
    }
    return totalSums;
}

int main(int argc, char** argv) {
    if (argc < 3) {
        std::cout << "Run program as \"executable <rows> <columns>\n";
    } else {
        std::stringstream args;
        args << argv[1] << " " << argv[2];
        int rows, columns;
        args >> rows >> columns;
        std::vector<std::vector<short>> data(rows, std::vector<short>(columns));
        std::vector<int> columnSums = col_sums(data);
    }
}
  • export OMP_NUM_THREADS=4
  • icpc -Ofast -fopenmp -g dummy.cpp -o dummy
  • /usr/bin/time -v ./dummy 115000 20000
  • CPU% = 225%(应该是 380%+)

我对 OpenMP 和 CilkPlus 相当有经验,但在这里扩展的障碍让我望而却步,这是一个相当初级的程序。我知道它必须是显而易见的,但我觉得我已经消除了所有的数据危害和控制危害。我完全被难住了。

4

0 回答 0