c++ - 优化 parallel_for 实现

Question

我有一些代码使用 Microsoft 的 PPL 来执行 parallel_for 循环，后来我不得不把它移植到 Linux 和 Mac 上，于是我自己实现了一个版本。它能正确完成工作，性能也不错，但在其他条件完全相同的情况下，它仍然比 PPL 的 parallel_for 循环慢约 20%。

我也许应该提一下，通常会执行 10000 到 100000 次迭代，但每次迭代只是几个平方根和乘法。但是，它必须运行得非常快，因为它是用于交互式应用程序的。

我对 C++11 还比较陌生，所以如果有经验的人能看一下我的实现，并就它为什么还达不到 PPL 的性能以及哪些地方可以改进给出一些反馈，我会非常感激。

/// Calls userLambda(i) once for every index i in the half-open range
/// [start, end), spreading the indices over asynchronous tasks.
///
/// The range is cut into contiguous blocks, one std::async task per block,
/// and the call returns only after every task has finished.
///
/// @tparam THREADS_PER_CORE multiplier applied to the reported hardware
///         concurrency to obtain the maximum number of tasks.
/// @param start      first index to process (inclusive).
/// @param end        one past the last index to process; when end <= start
///                   nothing is executed.
/// @param userLambda callable invoked with each index; it is called from
///                   several tasks concurrently.
/// @throws whatever userLambda throws: the exception stored in the first
///         failing future (in block order) is rethrown by get().
template<size_t THREADS_PER_CORE = 1>
void parallel_forMine(size_t start, size_t end, const std::function<void(size_t)> &userLambda)
{
    // Nothing to do for an empty or inverted range. Returning here also keeps
    // the unsigned subtraction below from wrapping around.
    if (end <= start)
        return;

    const size_t total = end - start;

    // hardware_concurrency() is allowed to return 0 when the value cannot be
    // determined; fall back to a single task instead of dividing by zero.
    size_t threadCount = static_cast<size_t>(std::thread::hardware_concurrency()) * THREADS_PER_CORE;
    if (threadCount == 0)
        threadCount = 1;
    // Never create more tasks than there are indices.
    if (threadCount > total)
        threadCount = total;

    // Ceiling division so that threadCount blocks always cover the whole range.
    const size_t blockSize = total / threadCount + (total % threadCount != 0 ? 1 : 0);

    std::vector<std::future<void>> futures;
    futures.reserve(threadCount);

    for (size_t blockStart = start; blockStart < end; )
    {
        // Clamp the last block to the end of the range without overflowing.
        const size_t remaining = end - blockStart;
        const size_t blockEnd = blockStart + (remaining < blockSize ? remaining : blockSize);

        // userLambda is captured by reference; that is safe because this
        // function does not return before every future has been waited on.
        futures.push_back(std::async(std::launch::async, [blockStart, blockEnd, &userLambda]
        {
            for (size_t i = blockStart; i < blockEnd; ++i)
            {
                userLambda(i);
            }
        }));

        blockStart = blockEnd;
    }

    for (std::future<void> &f : futures)
        f.get();
}

完整的测试代码如下。

#include "stdafx.h" //nothing in there in this test
#include <ppl.h>

#include <chrono>
#include <cmath>
#include <functional>
#include <future>
#include <iostream>
#include <thread>
#include <vector>

/// Calls userLambda(i) once for every index i in the half-open range
/// [start, end), spreading the indices over asynchronous tasks.
///
/// The range is cut into contiguous blocks, one std::async task per block,
/// and the call returns only after every task has finished.
///
/// @tparam THREADS_PER_CORE multiplier applied to the reported hardware
///         concurrency to obtain the maximum number of tasks.
/// @param start      first index to process (inclusive).
/// @param end        one past the last index to process; when end <= start
///                   nothing is executed.
/// @param userLambda callable invoked with each index; it is called from
///                   several tasks concurrently.
/// @throws whatever userLambda throws: the exception stored in the first
///         failing future (in block order) is rethrown by get().
template<size_t THREADS_PER_CORE = 1>
void parallel_forMine(size_t start, size_t end, const std::function<void(size_t)> &userLambda)
{
    // Nothing to do for an empty or inverted range. Returning here also keeps
    // the unsigned subtraction below from wrapping around.
    if (end <= start)
        return;

    const size_t total = end - start;

    // hardware_concurrency() is allowed to return 0 when the value cannot be
    // determined; fall back to a single task instead of dividing by zero.
    size_t threadCount = static_cast<size_t>(std::thread::hardware_concurrency()) * THREADS_PER_CORE;
    if (threadCount == 0)
        threadCount = 1;
    // Never create more tasks than there are indices.
    if (threadCount > total)
        threadCount = total;

    // Ceiling division so that threadCount blocks always cover the whole range.
    const size_t blockSize = total / threadCount + (total % threadCount != 0 ? 1 : 0);

    std::vector<std::future<void>> futures;
    futures.reserve(threadCount);

    for (size_t blockStart = start; blockStart < end; )
    {
        // Clamp the last block to the end of the range without overflowing.
        const size_t remaining = end - blockStart;
        const size_t blockEnd = blockStart + (remaining < blockSize ? remaining : blockSize);

        // userLambda is captured by reference; that is safe because this
        // function does not return before every future has been waited on.
        futures.push_back(std::async(std::launch::async, [blockStart, blockEnd, &userLambda]
        {
            for (size_t i = blockStart; i < blockEnd; ++i)
            {
                userLambda(i);
            }
        }));

        blockStart = blockEnd;
    }

    for (std::future<void> &f : futures)
        f.get();
}



// Benchmark driver: runs the same floating-point workload serially, through
// PPL's Concurrency::parallel_for, and through parallel_forMine, printing the
// elapsed time of each variant and "error" for any element whose results
// differ between the three.
int main()
{
    using Clock = std::chrono::high_resolution_clock;

    const size_t outerCount = 1000;   // number of independent result slots
    const int innerCount = 1000000;   // accumulation steps per slot

    // Elapsed time since `since` in microseconds. duration_cast pins the unit,
    // which a raw count() / 1000 does not: the clock's tick period is
    // implementation-defined.
    const auto elapsedMicroseconds = [](Clock::time_point since)
    {
        return std::chrono::duration_cast<std::chrono::microseconds>(Clock::now() - since).count();
    };

    // One term of the workload. The std:: qualified calls guarantee the
    // floating-point overloads are used; an unqualified abs() may resolve to
    // the C int abs(int) and truncate its argument.
    const auto sample = [](size_t i)
    {
        return std::sqrt(std::abs(std::cos(std::sin(std::sqrt(static_cast<double>(i))))));
    };

    //serial execution
    std::vector<double> valueSerial(outerCount);
    const auto startSerial = Clock::now();
    for (size_t i = 0; i < outerCount; ++i)
        for (int j = 0; j < innerCount; ++j)
            valueSerial[i] += sample(i);
    const auto durationSerial = elapsedMicroseconds(startSerial);
    std::cout << durationSerial << " Serial" << std::endl;

    //PPL parallel for
    std::vector<double> valueParallelForPPL(outerCount);
    const auto startParallelForPPL = Clock::now();
    Concurrency::parallel_for(size_t(0), outerCount, [&](size_t i)
    {
        for (int j = 0; j < innerCount; ++j)
            valueParallelForPPL[i] += sample(i);
    });
    const auto durationParallelForPPL = elapsedMicroseconds(startParallelForPPL);
    std::cout << durationParallelForPPL << " PPL parallel for" << std::endl;

    //my parallel for
    std::vector<double> valueParallelFor(outerCount);
    const auto startParallelFor = Clock::now();
    parallel_forMine(0, outerCount, [&](size_t i)
    {
        for (int j = 0; j < innerCount; ++j)
            valueParallelFor[i] += sample(i);
    });
    const auto durationParallelFor = elapsedMicroseconds(startParallelFor);
    std::cout << durationParallelFor << " My parallel for" << std::endl;

    //only really to make sure the compiler doesn't optimize everything away
    for (size_t i = 0; i < valueSerial.size(); ++i)
        if (valueSerial[i] != valueParallelFor[i] || valueParallelFor[i] != valueParallelForPPL[i])
            std::cout << "error";

    // Keep the console window open until the user presses Enter.
    std::cin.get();

    return 0;
}

c++ - 优化 parallel_for 实现

0 回答 0

Related

Reference