这是我的内核代码
typedef unsigned char Npp8u;
...
// Kernel Implementation
__device__ unsigned int min_device;
__device__ unsigned int max_device;
__global__ void findMax_Min(Npp8u * data, int numEl){
int index = blockDim.x*blockIdx.x + threadIdx.x;
int shared_index = threadIdx.x;
__shared__ Npp8u data_shared_min[BLOCKDIM];
__shared__ Npp8u data_shared_max[BLOCKDIM];
// check index condition
if(index < numEl){
data_shared_min[shared_index] = data[index]; //pass values from global to shared memory
__syncthreads();
data_shared_max[shared_index] = data[index]; //pass values from global to shared memory
for (unsigned int stride = BLOCKDIM/2; stride > 0; stride >>= 1) {
if(threadIdx.x < stride){
if(data_shared_max[threadIdx.x] < data_shared_max[threadIdx.x+stride]) data_shared_max[shared_index] = data_shared_max[shared_index+stride];
if(data_shared_min[threadIdx.x]> data_shared_min[threadIdx.x+stride]) data_shared_min[shared_index] = data_shared_min[shared_index+stride];
}
__syncthreads();
}
if(threadIdx.x == 0 ){
atomicMin(&(min_device), (unsigned int)data_shared_min[threadIdx.x ]);
//min_device =10;
__syncthreads();
atomicMax(&(max_device), (unsigned int)data_shared_max[threadIdx.x ]);
}
}else{
data_shared_min[shared_index] = 9999;
}
}
我有一个 512x512 的图像,我想找到最小和最大像素值。data
是图像的一维版本。此代码适用于最大值但不适用于最小值。正如我从 matlab 中检查的那样,最大值为 202,最小值为 10,但它发现最小值为 0。这是我的内核代码和 memcpy 调用
int main(){
// Host parameter declarations.
Npp8u * imageHost;
int nWidth, nHeight, nMaxGray;
// Load image to the host.
std::cout << "Load PGM file." << std::endl;
imageHost = LoadPGM("lena_before.pgm", nWidth, nHeight, nMaxGray);
// Device parameter declarations.
Npp8u * imageDevice;
unsigned int max, min;
size_t size = sizeof(Npp8u)*nWidth*nHeight;
cudaMalloc((Npp8u**)&imageDevice, size);
cudaMemcpy(imageDevice, imageHost, size, cudaMemcpyHostToDevice);
int numPixels = nWidth*nHeight;
dim3 numThreads(BLOCKDIM);
dim3 numBlocks(numPixels/BLOCKDIM + (numPixels%BLOCKDIM == 0 ? 0 : 1));
findMax_Min<<<numBlocks, numThreads>>>(imageDevice,numPixels);
cudaMemcpyFromSymbol(&max,max_device, sizeof(max_device), 0, cudaMemcpyDeviceToHost);
cudaMemcpyFromSymbol(&min,min_device, sizeof(min_device), 0, cudaMemcpyDeviceToHost);
printf("Min value for image : %i\n", min);
printf("Max value for image : %i\n", max);
...
另一个有趣的事情是在内核调用之后更改顺序cudaMemcpy
也会导致故障并且值都被读取为零。我没有看到问题。有没有人看到被阻塞的部分?