cuda - 用 Thrust 查找数组的最大值和最小值很慢？

Question

这是我的内核调用代码

inline void find_min_max(thrust::device_vector<Npp8u> dev_vec, Npp8u *min, Npp8u *max){
    // dev_vec is received by value, so every call works on its own copy of the vector.
    typedef thrust::device_vector<Npp8u>::iterator DevIter;

    // A single call yields both iterators: .first is written to *min, .second to *max.
    thrust::pair<DevIter, DevIter> extrema =
        thrust::minmax_element(dev_vec.begin(), dev_vec.end());

    // Read the two elements the iterators refer to into the caller's outputs.
    *min = *extrema.first;
    *max = *extrema.second;
}

我还用 map-reduce 范式编写了自己的原生 CUDA 内核，并写了一份简单的 CPU 代码，实现了相同的算法。测量结果显示，Thrust 是三者中最慢的。

为简洁起见，原生 CUDA 代码和 Thrust 代码我都使用 CUDA 事件来计时。只要事件适用于对 Thrust 做基准测试，我就相当确定执行时间的测量是正确的。

这是测量部分；

    ....
    // Time the Thrust path with CUDA events recorded on stream 0.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // Device vector allocation: image_dev is built from the N host elements.
    // This runs after `start` is recorded, so it is part of the measured time.
    thrust::device_vector<Npp8u> image_dev(imageHost, imageHost+N);

    find_min_max(image_dev,&min,&max);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);   // block until `stop` has completed before reading the timer
    float elapsedTime1;
    cudaEventElapsedTime(&elapsedTime1, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    totalTime1 = elapsedTime1/1000;   // cudaEventElapsedTime reports milliseconds; store seconds
....

我真正的问题是：除了直接使用 Thrust 中的 minmax_element 函数之外，是否有更好的方法？

我的机器规格：它是华硕 k55v 笔记本电脑，配备 GeForce 635M 和 i7 处理器。

以及 Thrust 代码和 CPU 代码的完整代码

score 8 · Accepted Answer

您没有给出任何用来与 Thrust 进行比较的代码，没有给出任何机器规格（GPU、CPU 等），也没有告诉我们实际测得的时间是多少。

尽管如此，我还是基于您的代码创建了一个测试用例，将 Thrust 与 STL 进行比较（因为您没有给出您的 CPU 代码或任何其他实现）：

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <algorithm>
#include <vector>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <thrust/host_vector.h>
#include <thrust/pair.h>

#define N 1000000
#define LOOPS 1000

// Writes the smallest and largest element of dev_vec to *min and *max.
// dev_vec is taken by reference, so the call does not copy the vector.
inline void find_min_max(thrust::device_vector<int> &dev_vec, int *min, int *max){
    typedef thrust::device_vector<int>::iterator Iter;

    const Iter first = dev_vec.begin();
    const Iter last  = dev_vec.end();

    // One call returns both positions: .first goes to *min, .second to *max.
    thrust::pair<Iter, Iter> bounds = thrust::minmax_element(first, last);

    *min = *bounds.first;
    *max = *bounds.second;
}


// Aborts with file/line and the CUDA error string when a runtime call fails,
// so a failed event call cannot silently produce a bogus timing.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Benchmark: average time per iteration of thrust::minmax_element (including
// the host->device copy) versus std::max_element + std::min_element on the
// same N random ints, each averaged over LOOPS iterations.
int main(){
    // Initialized so the printf below never reads indeterminate values.
    int minele = 0, maxele = 0;

    // Random host data, shared by the Thrust and the STL measurements.
    std::vector<int> a;
    a.reserve(N);   // avoid repeated reallocation while filling
    for (int i=0; i<N; i++)
      a.push_back(rand());
    thrust::host_vector<int> h_a(N);
    thrust::copy(a.begin(), a.end(), h_a.begin());

    // --- Thrust measurement, timed with CUDA events on stream 0 ---
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start, 0));
    for (int i=0; i < LOOPS; i++){
      // The host->device copy is inside the loop, so it is part of the measured time.
      thrust::device_vector<int> d_a = h_a;
      find_min_max(d_a,&minele,&maxele);
      }
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float elapsedTime1, totalTime1;
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime1, start, stop));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    // Milliseconds for all iterations -> seconds per iteration.
    totalTime1 = elapsedTime1/(1000*LOOPS);
    printf("thrust min element = %d, max element = %d\n", minele, maxele);
    printf("thrust time = %f\n", totalTime1);

    // --- STL measurement, timed with clock() ---
    clock_t t;
    t = clock();
    std::vector<int>::iterator resultmax, resultmin;
    for (int i = 0; i<LOOPS; i++){
      // Two separate passes over the data: one for the max, one for the min.
      resultmax = std::max_element(a.begin(), a.end());
      resultmin = std::min_element(a.begin(), a.end());
      }
    t = clock() - t;
    printf("STL min element = %d, max element = %d\n", *resultmin, *resultmax);
    // Clock ticks for all iterations -> seconds per iteration.
    printf("STL time = %f\n", ((float)t)/(CLOCKS_PER_SEC*LOOPS));
  return 0;
}

我使用 CUDA 5.0、RHEL 5.5、Xeon X5560 2.8GHz CPU 和 Quadro 5000 GPU 编译了这段代码，这是一个 cc 2.0 设备，比 M2050 慢一些（11 个 SM 对 14 个），结果如下：

thrust min element = 1210, max element = 2147480021
thrust time = 0.001741
STL min element = 1210, max element = 2147480021
STL time = 0.004520

即使考虑到我在 STL 中用了 2 次函数调用来分别获取最小值和最大值（C++11 标准提供了一次调用即可同时得到两者的 minmax_element），并把 STL 时间减半，Thrust 仍然更快。

如果您想讨论为什么您的案例可能很特别，请提供一个完整的、可编译的、简单的比较代码，类似于我提供的代码，以及您的机器规格和实际的计时结果。

另外提一个小的优化建议：如果把 device_vector 按引用（&）而不是按值传递给 find_min_max 函数，它会运行得更快一些。

在我的测试中，如果把 host->device_vector 的拷贝移出计时循环，Thrust 的时间从 0.001741 秒下降到 0.000387 秒，这表明 host->device 拷贝大约占 Thrust 总时间的 78%。

编辑：现在您已经发布了代码（尽管仍没有提到您测得的时间），我用 512x512 的 lena 灰度图像运行了它，在我的环境中得到以下结果：

$ ./cpu
        Version: P5
        Comment: # Created by Imlib
        Width: 512 Height: 512
        Max value: 255
ELAPSED TIME -AVG finding max and min: 0.0014437
ELAPSED TIME -AVG finding max and min: 0.0038715
$ ./thr
Load PGM file.
        Version: P5
        Comment: # Created by Imlib
        Width: 512 Height: 512
        Max value: 255
ELAPSED TIME -AVG for kernel 1: 0.000658944
ELAPSED TIME -AVG for kernel 2: 0.000179552
$

所以在我看来，即使运行的是您的代码，在我的环境中 Thrust 也更快。

cuda - 用 Thrust 查找数组的最大值和最小值很慢？

1 回答 1

Related

Reference