0

我只是一个 CUDA 初学者,并试图在我的程序上使用更快的并行减少开普勒,但我没有得到结果,下面是我正在做的函数,输出为 0,我将不胜感激知道我的错误是什么吗?

#ifndef __CUDACC__  
#define __CUDACC__
#endif

#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <iostream>
#include <cuda_runtime_api.h>
#include <device_functions.h>
#include <stdio.h>
#include <math.h>

__inline__ __device__
float warpReduceSum(float val) {
  for (int offset = warpSize/2; offset > 0; offset /= 2) 
    val += __shfl_down(val, offset);
  return val;
}

__inline__ __device__
float blockReduceSum(float val) {

  static __shared__ int shared[32]; // Shared mem for 32 partial sums
  int lane = threadIdx.x % warpSize;
  int wid = threadIdx.x / warpSize;

  val = warpReduceSum(val);     // Each warp performs partial reduction

  if (lane==0) shared[wid]=val; // Write reduced value to shared memory

  __syncthreads();              // Wait for all partial reductions

  //read from shared memory only if that warp existed
  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;

  if (wid==0) val = warpReduceSum(val); //Final reduce within first warp

  return val;
}

__global__ void deviceReduceKernel(float *in, float* out, size_t N)
{
  float sum = 0;
  //reduce multiple elements per thread
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) 
  {
    sum += in[i];
  }
  sum = blockReduceSum(sum);
  if (threadIdx.x==0)
    out[blockIdx.x]=sum;
}

int main()
{
    int n = 1000000;
    float *b = new float[1]();
    float *d = new float[1]();
    float *a ;


    int blocks = (n/512)+1;
    float *d_intermediate;

    cudaMalloc((void**)&d_intermediate, n*sizeof(float));
    cudaMalloc((void**)&a, n*sizeof(float));

    cudaMemset(a, 1, n*sizeof(float));

    deviceReduceKernel<<<blocks, 512>>>(a, d_intermediate, n);
    deviceReduceKernel<<<1, 1024>>>(d_intermediate, &b[0], blocks);
    cudaMemcpy(d, b, sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_intermediate);
    std::cout << d[0];
    return 0;

}
4

1 回答 1

4

您的代码存在各种问题:

  1. 任何时候您在使用 CUDA 代码时遇到问题,您都应该使用适当的 cuda 错误检查并使用 运行您的代码cuda-memcheck然后再向其他人寻求帮助。即使您不理解错误输出,它也会对其他试图帮助您的人有用。如果您使用此代码完成了该操作,您将被告知各种错误/问题

  2. 任何传递给 CUDA 内核的指针都应该是有效的 CUDA 设备指针。您的b指针是主机指针:

    float *b = new float[1]();
    

    所以你不能在这里使用它:

    deviceReduceKernel<<<1, 1024>>>(d_intermediate, &b[0], blocks);
                                                     ^
    

    因为你显然想用它来存储设备上的单个float数量,我们可以很容易地重用a指针。

  3. 出于类似的原因,这是不明智的:

    cudaMemcpy(d, b, sizeof(float), cudaMemcpyDeviceToHost);
    

    在这种情况下,两者b都是d主机指针。这不会将数据从设备复制到主机。

  4. 这可能不符合您的想法:

    cudaMemset(a, 1, n*sizeof(float));
    

    我想你认为这将填充一个float数量为 1 的数组,但它不会。 cudaMemset, like memset, 填充字节并采用字节数量。如果你用它来填充一个float数组,你实际上是在创建一个用0x01010101. 我不知道将位模式转换为float数量时会转换成什么值,但它不会给你一个float值 1。我们将通过用循环填充一个普通的主机数组来解决这个问题,然后传输它数据到要减少的设备。

这是一个修改后的代码,解决了上述问题,并为我正确运行:

$ cat t1290.cu
#include <iostream>
#include <stdio.h>
#include <math.h>

__inline__ __device__
float warpReduceSum(float val) {
  for (int offset = warpSize/2; offset > 0; offset /= 2)
    val += __shfl_down(val, offset);
  return val;
}

__inline__ __device__
float blockReduceSum(float val) {

  static __shared__ int shared[32]; // Shared mem for 32 partial sums
  int lane = threadIdx.x % warpSize;
  int wid = threadIdx.x / warpSize;

  val = warpReduceSum(val);     // Each warp performs partial reduction

  if (lane==0) shared[wid]=val; // Write reduced value to shared memory

  __syncthreads();              // Wait for all partial reductions

  //read from shared memory only if that warp existed
  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;

  if (wid==0) val = warpReduceSum(val); //Final reduce within first warp

  return val;
}

__global__ void deviceReduceKernel(float *in, float* out, size_t N)
{
  float sum = 0;
  //reduce multiple elements per thread
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
  {
    sum += in[i];
  }
  sum = blockReduceSum(sum);
  if (threadIdx.x==0)
    out[blockIdx.x]=sum;
}

int main()
{
        int n = 1000000;
        float b;
        float *a, *a_host;
        a_host = new float[n];

        int blocks = (n/512)+1;
        float *d_intermediate;

        cudaMalloc((void**)&d_intermediate, blocks*sizeof(float));
        cudaMalloc((void**)&a, n*sizeof(float));
        for (int i = 0; i < n; i++) a_host[i] = 1;
        cudaMemcpy(a, a_host, n*sizeof(float), cudaMemcpyHostToDevice);

        deviceReduceKernel<<<blocks, 512>>>(a, d_intermediate, n);
        deviceReduceKernel<<<1, 1024>>>(d_intermediate, a, blocks);
        cudaMemcpy(&b, a, sizeof(float), cudaMemcpyDeviceToHost);
        cudaFree(d_intermediate);
        std::cout << b << std::endl;
        return 0;
}
$ nvcc -arch=sm_35 -o t1290 t1290.cu
$ cuda-memcheck ./t1290
========= CUDA-MEMCHECK
1e+06
========= ERROR SUMMARY: 0 errors
$
于 2017-02-20T23:04:37.710 回答