我有一个顺序平滑算法
void triangularSmooth(unsigned char *grayImage, unsigned char *smoothImage, const int width, const int height, const float *filter, NSTimer &timer, dim3 grid_size, dim3 block_size) {
for ( int y = 0; y < height; y++ ) {
for ( int x = 0; x < width; x++ ) {
unsigned int filterItem = 0;
float filterSum = 0.0f;
float smoothPix = 0.0f;
for ( int fy = y - 2; fy < y + 3; fy++ ) {
for ( int fx = x - 2; fx < x + 3; fx++ ) {
if ( ((fy < 0) || (fy >= height)) || ((fx < 0) || (fx >= width)) ) {
filterItem++;
continue;
}
smoothPix += grayImage[(fy * width) + fx] * filter[filterItem];
filterSum += filter[filterItem];
filterItem++;
}
}
smoothPix /= filterSum;
smoothImage[(y * width) + x] = static_cast< unsigned char >(smoothPix);
}
}
}
我在 CUDA 中实现并希望使用共享变量来保存 grayImage 中的像素。但是在此之前,我正在尝试按原样运行它。为此,我有内核代码:
__global__ void smooth(unsigned char *grayImage, unsigned char *smoothImage, const int width, const int height, const float *filter)
{
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int filterItem = 0;
float filterSum = 0.0f;
float smoothPix = 0.0f;
for ( int fy = y - 2; fy < y + 3; fy++ ) {
for ( int fx = x - 2; fx < x + 3; fx++ ) {
if ( ((fy < 0) || (fy >= height)) || ((fx < 0) || (fx >= width)) ) {
filterItem++;
continue;
}
smoothPix += grayImage[(fy * width) + fx] * filter[filterItem];
filterSum += filter[filterItem];
filterItem++;
}
}
smoothPix /= filterSum;
smoothImage[(y * width) + x] = static_cast< unsigned char >(smoothPix);
}
并调用:
const float filter[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 1.0f, 1.0f, 2.0f, 3.0f, 2.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
dim3 gridSize((width*height)/1024,(width*height)/1024,1);
dim3 blockSize(256,256,1);
smooth <<< gridSize, blockSize >>> (grayImage, smoothImage, width, height, filter);
cudaDeviceSynchronize();
问题在于,生成的平滑图像看起来像像素都在错误的其他位置(混合在一起)。这是来自网格和块的尺寸吗?我尝试了很多其他可能的尺寸。什么是正确的方法?
我正在使用 GTX480,版本 - 2.x,线程块网格的最大维度 - 3,线程块网格的最大 x、y 或 z 维度 - 65535,每个块的最大线程数 - 1024