我创建了一个 CUDA 函数,用于使用其直方图计算图像的总和。
我正在尝试为多种计算能力编译内核和包装函数。
核心:
__global__ void calc_hist(unsigned char* pSrc, int* hist, int width, int height, int pitch)
{
int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
#if __CUDA_ARCH__ > 110 //Shared Memory For Devices Above Compute 1.1
__shared__ int shared_hist[256];
#endif
int global_tid = yIndex * pitch + xIndex;
int block_tid = threadIdx.y * blockDim.x + threadIdx.x;
if(xIndex>=width || yIndex>=height) return;
#if __CUDA_ARCH__ == 110 //Calculate Histogram In Global Memory For Compute 1.1
atomicAdd(&hist[pSrc[global_tid]],1); /*< Atomic Add In Global Memory */
#elif __CUDA_ARCH__ > 110 //Calculate Histogram In Shared Memory For Compute Above 1.1
shared_hist[block_tid] = 0; /*< Clear Shared Memory */
__syncthreads();
atomicAdd(&shared_hist[pSrc[global_tid]],1); /*< Atomic Add In Shared Memory */
__syncthreads();
if(shared_hist[block_tid] > 0) /* Only Write Non Zero Bins Into Global Memory */
atomicAdd(&(hist[block_tid]),shared_hist[block_tid]);
#else
return; //Do Nothing For Devices Of Compute Capabilty 1.0
#endif
}
包装功能:
int sum_8u_c1(unsigned char* pSrc, double* sum, int width, int height, int pitch, cudaStream_t stream = NULL)
{
#if __CUDA_ARCH__ == 100
printf("Compute Capability Not Supported\n");
return 0;
#else
int *hHist,*dHist;
cudaMalloc(&dHist,256*sizeof(int));
cudaHostAlloc(&hHist,256 * sizeof(int),cudaHostAllocDefault);
cudaMemsetAsync(dHist,0,256 * sizeof(int),stream);
dim3 Block(16,16);
dim3 Grid;
Grid.x = (width + Block.x - 1)/Block.x;
Grid.y = (height + Block.y - 1)/Block.y;
calc_hist<<<Grid,Block,0,stream>>>(pSrc,dHist,width,height,pitch);
cudaMemcpyAsync(hHist,dHist,256 * sizeof(int),cudaMemcpyDeviceToHost,stream);
cudaStreamSynchronize(stream);
(*sum) = 0.0;
for(int i=1; i<256; i++)
(*sum) += (hHist[i] * i);
printf("sum = %f\n",(*sum));
cudaFree(dHist);
cudaFreeHost(hHist);
return 1;
#endif
}
问题一:
编译时sm_10
,包装器和内核不应该执行。但事实并非如此。整个包装函数执行。输出显示sum = 0.0
。
我希望输出与Compute Capability Not Supported
我printf
在包装函数开头添加的语句一样。
如何防止包装器功能执行sm_10
?我不想添加任何运行时检查,例如 if 语句等。可以通过模板元编程来实现吗?
问题2:
编译大于时sm_10
,只有cudaStreamSynchronize
在内核调用之后添加,程序才能正确执行。但如果我不同步,则输出为sum = 0.0
. 为什么会这样?我希望该功能尽可能与主机异步。是否可以移动内核内的唯一循环?
我在 Windows 8 上使用 GTX460M、CUDA 5.0、Visual Studio 2008。