I've been writing CUDA code for a while now, but I'm only just getting around to learning how to use the texture cache.
Using the simpleTexture example from the Nvidia SDK as inspiration, I wrote a simple example that uses the texture cache. The host copies the Lena image to the GPU and binds it as a texture. The kernel just copies the contents of the texture cache into an output array.
Strangely, the result (see the all-gray image below the code) doesn't match the input. Any ideas about what might be going wrong?
Code (look at texCache_dummyKernel):
texture<float, 2, cudaReadModeElementType> tex; //declare texture reference for 2D float texture
//note: tex is global, so no input ptr is needed
__global__ void texCache_dummyKernel(float* out, const int width, const int height){ //copy tex to output
int x = blockIdx.x*blockDim.x + threadIdx.x; //my index into "big image"
int y = blockIdx.y*blockDim.y + threadIdx.y;
int idx = y*width+x;
if(x < width && y < height)
out[idx] = tex2D(tex, y, x);
}
int main(int argc, char **argv){
cv::Mat img = getRawImage("./Lena.pgm");
img.convertTo(img, CV_32FC1);
float* hostImg = (float*)&img.data[0];
int width = img.cols; int height = img.rows;
dim3 grid; dim3 block;
block.x = 16; block.y = 16;
grid.x = width/block.x + 1;
grid.y = height/block.y + 1;
cudaArray *dImg; //cudaArray*, not float*
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
CHECK_CUDART(cudaMallocArray(&dImg, &channelDesc, width, height));
CHECK_CUDART(cudaMemcpyToArray(dImg, 0, 0, hostImg, width*height*sizeof(float), cudaMemcpyHostToDevice));
setTexCacheParams(); //defined below
CHECK_CUDART(cudaBindTextureToArray(tex, dImg, channelDesc)); //Bind the array to the texture
float* dResult; //device memory for output
CHECK_CUDART(cudaMalloc((void**)&dResult, sizeof(float)*width*height));
texCache_dummyKernel<<<grid, block>>>(dResult, width, height); //dImg isn't an input param, since 'tex' is a global variable
CHECK_CUDART(cudaGetLastError()); //make sure kernel didn't crash
float* hostResult = (float*)malloc(sizeof(float)*width*height);
CHECK_CUDART(cudaMemcpy(hostResult, dResult, sizeof(float)*width*height, cudaMemcpyDeviceToHost));
outputProcessedImage(hostResult, width, height, "result.png"); //defined below
    CHECK_CUDART(cudaUnbindTexture(tex)); //unbind texture, free device and host buffers
    CHECK_CUDART(cudaFreeArray(dImg));
    CHECK_CUDART(cudaFree(dResult));
    free(hostResult);
    return 0;
}
I should probably also provide the couple of helper functions I used above:
void setTexCacheParams(){ //configuration directly pulled from simpleTexture in nvidia sdk
tex.addressMode[0] = cudaAddressModeWrap;
tex.addressMode[1] = cudaAddressModeWrap;
tex.filterMode = cudaFilterModeLinear;
tex.normalized = true; // access with normalized texture coordinates
}
void outputProcessedImage(float* processedImg, int width, int height, string out_filename){
cv::Mat img = cv::Mat::zeros(height, width, CV_32FC1);
for(int i=0; i<height; i++)
for(int j=0; j<width; j++)
            img.at<float>(i,j) = processedImg[i*width + j]; //copy the float buffer into the cv::Mat
img.convertTo(img, CV_8UC1); //float to uchar
vector<int> compression_params;
compression_params.push_back(CV_IMWRITE_PNG_COMPRESSION);
compression_params.push_back(9);
cv::imwrite(out_filename, img, compression_params);
}
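(CHECK_CUDART and getRawImage aren't shown above; they're just a standard error-checking macro and a thin wrapper around cv::imread. A rough sketch of what they look like, in case it matters; this is not my exact code:)

#include <cstdio>
#include <cstdlib>
#include <opencv2/opencv.hpp>

//standard CUDA error-checking macro: abort with file/line info on any failure (sketch)
#define CHECK_CUDART(call) do { \
    cudaError_t err = (call); \
    if(err != cudaSuccess){ \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
} while(0)

//load an image from disk as 8-bit single-channel grayscale (sketch)
cv::Mat getRawImage(const std::string& filename){
    return cv::imread(filename, CV_LOAD_IMAGE_GRAYSCALE);
}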
Input: (the original Lena image)
Output: (an all-gray image)
- Sorry that this post is such a wall of code! I'd appreciate any suggestions for making code like this more concise.
- I used OpenCV for the file I/O above... hopefully that doesn't cause any confusion.
- When I change the kernel to read the input image from a plain 1D float* array, keeping almost everything else the same, I get the correct result. (A rough sketch of that variant is below.)
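For reference, here's approximately what that working variant looks like. This is a minimal sketch rather than my exact code; the kernel name noTexCache_dummyKernel and the parameter name in are just illustrative, and in this version the input is allocated with plain cudaMalloc/cudaMemcpy instead of a cudaArray:

__global__ void noTexCache_dummyKernel(const float* in, float* out, const int width, const int height){ //copy in to out
    int x = blockIdx.x*blockDim.x + threadIdx.x;
    int y = blockIdx.y*blockDim.y + threadIdx.y;
    int idx = y*width + x;
    if(x < width && y < height)
        out[idx] = in[idx]; //ordinary global-memory read; no texture involved
}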