上一个问题询问如何在 CUDA 中有效地找到数组的最大值:Finding max value in CUDA,最热门的回复提供了指向 NVIDIA 关于优化缩减内核的演示文稿的链接。
如果您使用的是 Visual Studio,只需删除标头引用以及CPU EXECUTION之间的所有内容。
我设置了一个找到最大值的变体,但它与 CPU 的发现不匹配:
// Returns the maximum value of
// an array of size n
float GetMax(float *maxes, int n)
{
int i = 0;
float max = -100000;
for(i = 0; i < n; i++)
{
if(maxes[i] > max)
max = maxes[i];
}
return max;
}
// Too obvious...
__device__ float MaxOf2(float a, float b)
{
if(a > b) return a;
else return b;
}
__global__ void MaxReduction(int n, float *g_idata, float *g_odata)
{
extern __shared__ float sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(BLOCKSIZE*2) + tid;
unsigned int gridSize = BLOCKSIZE*2*gridDim.x;
sdata[tid] = 0;
//MMX(index,i)
//MMX(index,i+blockSize)
// Final Optimized Kernel
while (i < n) {
sdata[tid] = MaxOf2(g_idata[i], g_idata[i+BLOCKSIZE]);
i += gridSize;
}
__syncthreads();
if (BLOCKSIZE >= 512) { if (tid < 256) { sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
if (BLOCKSIZE >= 256) { if (tid < 128) { sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
if (BLOCKSIZE >= 128) { if (tid < 64) { sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 64]); } __syncthreads(); }
if (tid < 32) {
if (BLOCKSIZE >= 64) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 32]);
if (BLOCKSIZE >= 32) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 16]);
if (BLOCKSIZE >= 16 ) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 8]);
if (BLOCKSIZE >= 8) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 4]);
if (BLOCKSIZE >= 4) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 2]);
if (BLOCKSIZE >= 2) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 1]);
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
我有一个巨大的设置来测试这个算法:
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <sys/time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include "book.h"
#define ARRAYSIZE 16384
#define GRIDSIZE 60
#define BLOCKSIZE 32
#define SIZEFLOAT 4
using namespace std;
// Function definitions
float GetMax(float *maxes, int n);
__device__ float MaxOf2(float a, float b);
__global__ void MaxReduction(int n, float *g_idata, float *g_odata);
// Returns random floating point number
float RandomReal(float low, float high)
{
float d;
d = (float) rand() / ((float) RAND_MAX + 1);
return (low + d * (high - low));
}
int main()
{
/*****************VARIABLE SETUP*****************/
// Pointer to CPU numbers
float *numbers;
// Pointer to GPU numbers
float *dev_numbers;
// Counter
int i = 0;
// Randomize
srand(time(0));
// Timers
// Kernel timers
cudaEvent_t start_kernel, stop_kernel;
float elapsedTime_kernel;
HANDLE_ERROR(cudaEventCreate(&start_kernel));
HANDLE_ERROR(cudaEventCreate(&stop_kernel));
// cudaMalloc timers
cudaEvent_t start_malloc, stop_malloc;
float elapsedTime_malloc;
HANDLE_ERROR(cudaEventCreate(&start_malloc));
HANDLE_ERROR(cudaEventCreate(&stop_malloc));
// CPU timers
struct timeval start, stop;
float elapsedTime = 0;
/*****************VARIABLE SETUP*****************/
/*****************CPU ARRAY SETUP*****************/
// Setup CPU array
HANDLE_ERROR(cudaHostAlloc((void**)&numbers, ARRAYSIZE * sizeof(float), cudaHostAllocDefault));
for(i = 0; i < ARRAYSIZE; i++)
numbers[i] = RandomReal(0, 50000.0);
/*****************CPU ARRAY SETUP*****************/
/*****************GPU ARRAY SETUP*****************/
// Start recording cuda malloc time
HANDLE_ERROR(cudaEventRecord(start_malloc,0));
// Allocate memory to GPU
HANDLE_ERROR(cudaMalloc((void**)&dev_numbers, ARRAYSIZE * sizeof(float)));
// Transfer CPU array to GPU
HANDLE_ERROR(cudaMemcpy(dev_numbers, numbers, ARRAYSIZE*sizeof(float), cudaMemcpyHostToDevice));
// An array to temporarily store maximum values on the GPU
float *dev_max;
HANDLE_ERROR(cudaMalloc((void**)&dev_max, GRIDSIZE * sizeof(float)));
// An array to hold grab the GPU max
float maxes[GRIDSIZE];
/*****************GPU ARRAY SETUP*****************/
/*****************KERNEL EXECUTION*****************/
// Start recording kernel execution time
HANDLE_ERROR(cudaEventRecord(start_kernel,0));
// Run kernel
MaxReduction<<<GRIDSIZE, BLOCKSIZE, SIZEFLOAT*BLOCKSIZE>>> (ARRAYSIZE, dev_numbers, dev_max);
// Transfer maxes over
HANDLE_ERROR(cudaMemcpy(maxes, dev_max, GRIDSIZE * sizeof(float), cudaMemcpyDeviceToHost));
// Print out the max
cout << GetMax(maxes, GRIDSIZE) << endl;
// Stop recording kernel execution time
HANDLE_ERROR(cudaEventRecord(stop_kernel,0));
HANDLE_ERROR(cudaEventSynchronize(stop_kernel));
// Retrieve recording data
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime_kernel, start_kernel, stop_kernel));
// Stop recording cuda malloc time
HANDLE_ERROR(cudaEventRecord(stop_malloc,0));
HANDLE_ERROR(cudaEventSynchronize(stop_malloc));
// Retrieve recording data
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime_malloc, start_malloc, stop_malloc));
// Print results
printf("%5.3f\t%5.3f\n", elapsedTime_kernel, elapsedTime_malloc);
/*****************KERNEL EXECUTION*****************/
/*****************CPU EXECUTION*****************/
// Capture the start time
gettimeofday(&start, NULL);
// Call generic P7Viterbi function
cout << GetMax(numbers, ARRAYSIZE) << endl;
// Capture the stop time
gettimeofday(&stop, NULL);
// Retrieve time elapsed in milliseconds
long int elapsed_sec = stop.tv_sec - start.tv_sec;
long int elapsed_usec = stop.tv_usec - start.tv_usec;
elapsedTime = (float)(1000.0f * elapsed_sec) + (float)(0.001f * elapsed_usec);
// Print results
printf("%5.3f\n", elapsedTime);
/*****************CPU EXECUTION*****************/
// Free memory
cudaFreeHost(numbers);
cudaFree(dev_numbers);
cudaFree(dev_max);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaEventDestroy(start_malloc);
cudaEventDestroy(stop_malloc);
// Exit program
return 0;
}
我在这个测试程序上运行了 cuda-memcheck,打开了 -g 和 -G 开关,它报告了 0 个问题。谁能发现这个问题?
注意:当您编译程序时,请确保在当前目录中有来自 CUDA by Example 的 book.h。源码链接在这里:http: //developer.nvidia.com/cuda-example-introduction-general-purpose-gpu-programming 下载源码,book.h会在common目录/文件夹下。