我编写 CUDA 代码已经有一段时间了,但我现在才开始了解如何使用纹理缓存。

以 Nvidia SDK 中的 simpleTexture 示例为灵感,我编写了一个使用纹理缓存的简单示例。主机将 Lena 图像复制到 GPU 并将其绑定为纹理,内核只是把纹理的内容复制到一个输出数组中。



// File-scope 2D texture reference over float texels, read as stored (cudaReadModeElementType).
// NOTE(review): texture references are the legacy API; newer CUDA toolkits use texture objects
// (cudaTextureObject_t) instead -- confirm against the toolkit version in use.
texture<float, 2, cudaReadModeElementType> tex; //declare texture reference for 2D float texture

// Copies the contents of the file-scope texture `tex` into `out`, one thread per pixel.
// `tex` is global, so no input pointer is passed.
//   out    - destination buffer, written at row-major offset y*width + x
//   width  - number of columns to write
//   height - number of rows to write
// NOTE(review): setTexCacheParams() sets tex.normalized = true, yet the fetch below passes raw
// integer pixel indices, and in (y, x) order rather than (x, y). This looks like the cause of
// the wrong output of the texture version -- confirm.
__global__ void texCache_dummyKernel(float* out, const int width, const int height){ //copy tex to output
    int x = blockIdx.x*blockDim.x + threadIdx.x; //global column index of this thread
    int y = blockIdx.y*blockDim.y + threadIdx.y; //global row index of this thread
    int idx = y*width+x; //row-major offset into out

    if(x < width && y < height) //threads outside the image do nothing
        out[idx] = tex2D(tex, y, x);

// Host driver: loads the Lena image, uploads it into a cudaArray bound to the texture `tex`,
// runs texCache_dummyKernel to copy the texture into linear device memory, and writes the
// result to "result.png". Returns 0 on success, 1 if the input image is empty.
int main(int argc, char **argv){
    cv::Mat img = getRawImage("./Lena.pgm");
    if(img.empty())
        return 1; //nothing to upload; a 0x0 array allocation and launch would fail anyway
    img.convertTo(img, CV_32FC1);
    const float* hostImg = img.ptr<float>(0);
    const int width = img.cols;
    const int height = img.rows;
    const size_t numBytes = sizeof(float)*width*height;

    //ceil-div: covers the whole image without launching an extra, fully idle row/column
    //of blocks when the image size is an exact multiple of the block size
    dim3 block(16, 16);
    dim3 grid((width + block.x - 1)/block.x, (height + block.y - 1)/block.y);

    cudaArray *dImg; //cudaArray*, not float*
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    CHECK_CUDART(cudaMallocArray(&dImg, &channelDesc, width, height));
    //2D copy: uses the Mat's own row pitch (img.step), so it stays correct even if the rows
    //are not tightly packed in host memory
    CHECK_CUDART(cudaMemcpy2DToArray(dImg, 0, 0, hostImg, img.step, width*sizeof(float), height, cudaMemcpyHostToDevice));
    setTexCacheParams(); //texture parameters must be set before binding
    CHECK_CUDART(cudaBindTextureToArray(tex, dImg, channelDesc)); //Bind the array to the texture

    float* dResult; //device memory for output
    CHECK_CUDART(cudaMalloc((void**)&dResult, numBytes));

    texCache_dummyKernel<<<grid, block>>>(dResult, width, height); //dImg isn't an input param, since 'tex' is a global variable
    CHECK_CUDART(cudaGetLastError());      //catches launch/configuration errors
    CHECK_CUDART(cudaDeviceSynchronize()); //catches faults raised while the kernel was running

    float* hostResult = (float*)malloc(numBytes);
    CHECK_CUDART(cudaMemcpy(hostResult, dResult, numBytes, cudaMemcpyDeviceToHost));
    outputProcessedImage(hostResult, width, height, "result.png");

    //release everything acquired above
    CHECK_CUDART(cudaUnbindTexture(tex));
    CHECK_CUDART(cudaFreeArray(dImg));
    CHECK_CUDART(cudaFree(dResult));
    free(hostResult);
    return 0;
}


// Configures the file-scope texture reference `tex`; the values are taken from the
// simpleTexture sample in the Nvidia SDK.
void setTexCacheParams(){
    tex.addressMode[0] = cudaAddressModeWrap; // out-of-range coordinates wrap around in the first dimension
    tex.addressMode[1] = cudaAddressModeWrap; // ... and in the second dimension
    tex.filterMode = cudaFilterModeLinear;    // fetches interpolate linearly between neighbouring texels
    tex.normalized = true;                    // access with normalized texture coordinates, i.e. [0,1] per dimension
}

// Writes a single-channel float image to disk as an 8-bit image.
//   processedImg - host buffer of width*height floats, row-major (row i starts at i*width)
//   width        - number of columns
//   height       - number of rows
//   out_filename - destination path, passed to cv::imwrite
void outputProcessedImage(float* processedImg, int width, int height, string out_filename){
    //Mat header over the caller's buffer: no per-pixel copy, and the buffer is not modified
    const cv::Mat asFloat(height, width, CV_32FC1, processedImg);

    cv::Mat asBytes;
    asFloat.convertTo(asBytes, CV_8UC1); //float to uchar
    cv::imwrite(out_filename, asBytes);  //default encoder parameters
}





  • 对不起,这篇文章是这样一堵代码墙!对于使这样的代码更简洁的任何建议,我将不胜感激。
  • 我在上面的文件 I/O 中使用了 OpenCV……希望这不会造成混淆。
  • 当我把内核改为从一维 float* 数组读取输入图像,而其他部分几乎保持不变时,我得到了正确的结果。

1 回答 1


在您的原始代码中,您已将纹理初始化为使用归一化坐标。这意味着纹理在每个空间维度的 [0,1] 上进行寻址。所以你的内核应该是这样的:

// Copies the file-scope texture `tex` into `out` using normalized texture coordinates.
// Expects a 2D launch with one thread per pixel; threads outside the image do nothing.
//   out    - destination buffer, written at row-major offset y*width + x
//   width  - image width in pixels
//   height - image height in pixels
__global__ void texCache_dummyKernel(float* out, const int width, const int height)
{
    int x = blockIdx.x*blockDim.x + threadIdx.x; //my index into "big image"
    int y = blockIdx.y*blockDim.y + threadIdx.y;
    int idx = y*width+x;

    if(x < width && y < height) {
        //Map the pixel index to [0,1). The +0.5f samples at the texel centre, so a
        //linearly filtered fetch returns texel (x, y) itself rather than a blend with
        //its neighbour.
        float u = (float(x) + 0.5f)/float(width);
        float v = (float(y) + 0.5f)/float(height);
        out[idx] = tex2D(tex, u, v);
    }
}




// Presumably the alternative read for unnormalized coordinates (tex.normalized = false) -- confirm.
// The +0.5f offsets address the centre of texel (x, y).
out[idx] = tex2D(tex, float(x)+0.5f, float(y)+0.5f);

上面这一行适用于使用非归一化坐标(即 tex.normalized = false)的情况。纹理寻址总是使用浮点坐标,并且纹理数据以纹素(三维时为体素)为中心,因此需要给每个坐标加上 0.5,以确保读取位置落在纹理内每个插值区域(或体积)的中心。

您可以在 CUDA C 编程指南的一个附录中找到纹理过滤和寻址模式的描述及其对插值的影响。

