2

我是 CUDA 的新手。我已经弄清楚如何在 CUDA 中创建 1D 和 2D 纹理,但在使用 1D 分层纹理时遇到了困难。使用该纹理的内核输出全为零,这显然是不正确的,但我不确定自己哪里做错了。我很怀疑自己没有正确设置这个纹理,可是我在各处都检查了 CUDA 错误,却没有发现任何问题。有人可以告诉我如何正确设置并使用 1D 分层纹理吗?下面是我的代码。提前致谢:

// To Compile: nvcc backproj.cu -o backproj.out
// To Run: ./backproj.out

// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// Includes CUDA
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>

// pi as a textual macro: every use expands to a call to acos(-1). main casts
// the expansion to float where it is used.
#define pi acos(-1)

// File-scope texture reference for a 1D layered texture of float texels,
// fetched as raw element values (cudaReadModeElementType). main binds a CUDA
// array to it and interp1Kernel reads through it.
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef;

// Samples a single layer of texRef at a list of fractional positions, in the
// spirit of MATLAB's 1D interpolation.
//   d_output     - receives one sample per location (numlocations floats)
//   d_locations  - positions to sample, in texel units
//   numlocations - number of entries in d_locations / d_output
//   layer        - index of the texture layer to read
// Launch layout: 1D grid of 1D blocks, one thread per location.
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int layer) {
    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Surplus threads in the last block have no location to process.
    if (!(tid < numlocations)) {
        return;
    }

    // Shift the requested position by half a texel before fetching.
    const float coord = d_locations[tid] + 0.5f;

    // Fetch through the texture unit and store the sample in global memory.
    d_output[tid] = tex1DLayered(texRef, coord, layer);
}

// Host code (question listing): builds sample data, uploads it to a CUDA
// array, binds that array to texRef, runs interp1Kernel on layer 0 and prints
// the original samples next to the interpolated output.
// NOTE(review): this is the listing reported to print all zeros. According to
// the answers later in this file, a layered texture must be backed by an array
// created with cudaMalloc3DArray and the cudaArrayLayered flag; this listing
// allocates with cudaMallocArray instead. None of the CUDA return codes below
// are inspected, so a failing call would go unnoticed.
int main()
{
    // Setup h_data and locations to interpolate from
    const unsigned int len = 10;        // samples per layer
    const unsigned int numlayers = 3;   // number of layers
    const unsigned int upsamp = 3;      // query points per original sample interval
    // upsamp query points for each of the (len - 1) intervals, plus the last sample
    const unsigned int loclen = 1 + (len - 1) * upsamp;
    float idx_spacing = 1/(float)upsamp;
    // h_data is indexed [sample][layer], so consecutive samples of one layer
    // are numlayers floats apart in host memory.
    float h_data[len][numlayers], h_loc[loclen];
    for (int i = 0; i < len; i++) 
        for (int j = 0; j < numlayers; j++)
            h_data[i][j] = 1+cosf((float) pi*i/(j+1.0f));
    // Query positions 0, 1/upsamp, 2/upsamp, ..., len - 1
    for (int i = 0; i < loclen; i ++) 
        h_loc[i] = i*idx_spacing;

    // Copy the query positions to device memory
    float* d_loc;
    cudaMalloc(&d_loc, loclen * sizeof(float));
    cudaMemcpy(d_loc, h_loc, loclen*sizeof(float), cudaMemcpyHostToDevice);

    // Allocate CUDA array in device memory: one 32-bit float channel,
    // width = len, height = numlayers.
    // NOTE(review): no layered flag is passed here (compare the
    // cudaMalloc3DArray call in the answer listing).
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray* cuArray;
    cudaMallocArray(&cuArray, &channelDesc, len, numlayers);

    // Copy len * numlayers floats from h_data into the array, starting at offset (0, 0)
    cudaMemcpyToArray(cuArray, 0, 0, h_data, len * numlayers * sizeof(float), cudaMemcpyHostToDevice);

    // Set texture reference parameters: border addressing on the first
    // coordinate, linear filtering, unnormalized coordinates
    texRef.addressMode[0] = cudaAddressModeBorder;
    texRef.filterMode = cudaFilterModeLinear;
    texRef.normalized = false;

    // Bind the array to the texture reference
    cudaBindTextureToArray(texRef, cuArray, channelDesc);

    // Allocate result of transformation in device memory
    float* d_output;
    cudaMalloc(&d_output, loclen * sizeof(float));

    // Invoke kernel on layer 0; the block count is the integer quotient plus one
    int thdsPerBlk = 256;
    int blksPerGrid = (int) (loclen / thdsPerBlk) + 1;
    printf("Threads Per Block: %d, Blocks Per Grid: %d\n", thdsPerBlk, blksPerGrid);
    interp1Kernel <<<blksPerGrid, thdsPerBlk >>>(d_output, d_loc, loclen, 0);

    // Print Results
    printf("\n Original Indices \n");
    for (int i = 0; i < len; i++) printf("    %d ", i);
    printf("\n Original array \n");
    for (int i = 0; i < len; i++) printf("%5.3f ", h_data[i][0]);
    printf("\n Output Indices \n");
    for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
    printf("\n Output Array \n");
    // h_loc is reused as the host buffer for the kernel output; its query
    // positions have already been printed above.
    cudaMemcpy(h_loc, d_output, loclen * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
    printf("\n");

    // Free device memory (d_loc is not released in this listing)
    cudaFreeArray(cuArray);
    cudaFree(d_output);

    return 0;
}
4

2 回答 2

3

您必须使用 cudaMalloc3DArray 并设置 cudaArrayLayered 标志来为分层纹理分配内存。CUDA 工具包示例中有一个完整的分层纹理使用示例,您可以研究它来了解其工作方式。

于 2016-08-29T05:36:55.173 回答
3

不幸的是,CUDA SDK 只演示了 2D 分层纹理的做法。对于 1D 分层纹理,还有一些棘手的细节。事实证明,像下面这样用 make_cudaExtent 创建 extentDesc 时,必须把第二个参数设为 0:

cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers);  // <-- 0 height required for 1Dlayered

但是,当用 make_cudaExtent 为 cudaMemcpy3D 设置 mParams.extent 时,第二个参数仍然需要填 1:

mParams.extent = make_cudaExtent(len, 1, numlayers);  // <<-- non zero height required for memcpy to do anything

此外,还有一些其他不太明显的细节,例如 make_cudaPitchedPtr 的参数。因此,我在下面给出了 1D 分层纹理的完整可运行代码。我在任何地方都找不到这样的例子,希望它能帮助其他遇到同样问题的人:

// To Compile: nvcc layeredTexture1D.cu -o layeredTexture1D.out
// To Run: ./layeredTexture1D.out

// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// Includes CUDA
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>

// pi as a textual macro: every use expands to a call to acos(-1). main casts
// the expansion to float where it is used.
#define pi acos(-1)

// File-scope texture reference for a 1D layered texture of float texels,
// fetched as raw element values (cudaReadModeElementType). main binds the
// layered CUDA array to it and interp1Kernel reads every layer through it.
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef;

// Samples every layer of texRef at a list of fractional positions, in the
// spirit of MATLAB's 1D interpolation.
//   d_output     - receives numlayers rows of numlocations samples each,
//                  stored layer after layer
//   d_locations  - positions to sample, in texel units (shared by all layers)
//   numlocations - number of entries in d_locations
//   numlayers    - number of texture layers to read
// Launch layout: 2D grid of 2D blocks; x spans the locations, y spans the layers.
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int numlayers) {
    const unsigned int col = blockDim.x * blockIdx.x + threadIdx.x;  // which location
    const unsigned int row = blockDim.y * blockIdx.y + threadIdx.y;  // which layer

    // Threads that fall outside the locations-by-layers rectangle do nothing.
    if (col >= numlocations || row >= numlayers) {
        return;
    }

    // Shift the requested position by half a texel before fetching.
    const float coord = d_locations[col] + 0.5f;

    // Fetch through the texture unit and store into this layer's output row.
    d_output[col + row * numlocations] = tex1DLayered(texRef, coord, row);
}

// Abort with file/line context when a CUDA runtime call reports an error.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cudaCheckStatus_ = (call);                                \
        if (cudaCheckStatus_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cudaCheckStatus_));                    \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Host code: builds numlayers rows of sample data, uploads them into a 1D
// layered CUDA array, binds that array to texRef, interpolates every layer at
// the upsampled positions with interp1Kernel and prints inputs and outputs.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main()
{
    // Setup h_data and locations to interpolate from
    const unsigned int len = 7;         // samples per layer
    const unsigned int numlayers = 3;   // number of layers
    const unsigned int upsamp = 4;      // query points per original sample interval
    // upsamp query points for each of the (len - 1) intervals, plus the last sample
    const unsigned int loclen = 1 + (len - 1) * upsamp;
    float idx_spacing = 1 / (float)upsamp;
    // h_data is stored layer after layer: layer j occupies [len*j, len*(j+1))
    float h_data[numlayers*len], h_loc[loclen];
    for (int i = 0; i < len; i++)
        for (int j = 0; j < numlayers; j++)
            h_data[len*j + i] = 1 + cosf((float)pi*i / (j + 1.0f));
    for (int i = 0; i < loclen; i++)
        h_loc[i] = i*idx_spacing;

    // Copy the query positions to device memory
    float* d_loc;
    CUDA_CHECK(cudaMalloc(&d_loc, loclen * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_loc, h_loc, loclen*sizeof(float), cudaMemcpyHostToDevice));

    // Allocate the layered CUDA array in device memory
    cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers);  // <-- 0 height required for 1Dlayered
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray* cuArray;
    CUDA_CHECK(cudaMalloc3DArray(&cuArray, &channelDesc, extentDesc, cudaArrayLayered));

    // Describe and run the host -> array copy: each layer is one row of len floats
    cudaMemcpy3DParms mParams = { 0 };
    mParams.srcPtr = make_cudaPitchedPtr(h_data, len*sizeof(float), len, 1);
    mParams.kind = cudaMemcpyHostToDevice;
    mParams.extent = make_cudaExtent(len, 1, numlayers);  // <<-- non zero height required for memcpy to do anything
    mParams.dstArray = cuArray;
    CUDA_CHECK(cudaMemcpy3D(&mParams));

    // Set texture reference parameters: border addressing on the first
    // coordinate, linear filtering, unnormalized coordinates
    texRef.addressMode[0] = cudaAddressModeBorder;
    texRef.filterMode = cudaFilterModeLinear;
    texRef.normalized = false;

    // Bind the array to the texture reference
    CUDA_CHECK(cudaBindTextureToArray(texRef, cuArray, channelDesc));

    // Allocate result of transformation in device memory
    float *d_output;
    CUDA_CHECK(cudaMalloc(&d_output, loclen * numlayers * sizeof(float)));
    float h_output[loclen * numlayers];

    // Invoke kernel: x covers the locations, y covers the layers (ceil-div grid)
    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid((loclen + dimBlock.x - 1) / dimBlock.x,
        (numlayers + dimBlock.y - 1) / dimBlock.y, 1);
    interp1Kernel<<<dimGrid, dimBlock>>>(d_output, d_loc, loclen, numlayers);
    // A launch does not return a status; pick up configuration errors here.
    CUDA_CHECK(cudaGetLastError());

    // Print Results
    printf("\n Original Indices \n");
    for (int i = 0; i < len; i++) printf("    %d ", i);
    printf("\n Original array \n");
    for (int j = 0; j < numlayers; j++) {
        for (int i = 0; i < len; i++) {
            printf("%5.3f ", h_data[i + j*len]);
        }
        printf("\n");
    }
    printf("\n Output Indices \n");
    for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
    printf("\n Output Array \n");
    // Blocking copy: waits for the kernel and reports any error it raised.
    CUDA_CHECK(cudaMemcpy(h_output, d_output, loclen * numlayers * sizeof(float), cudaMemcpyDeviceToHost));
    for (int j = 0; j < numlayers; j++) {
        for (int i = 0; i < loclen; i++) {
            printf("%5.3f ", h_output[i + j*loclen]);
        }
        printf("\n");
    }
    printf("\n");

    // Release the texture binding before freeing the array it refers to,
    // then free every device allocation made above.
    CUDA_CHECK(cudaUnbindTexture(texRef));
    CUDA_CHECK(cudaFreeArray(cuArray));
    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaFree(d_loc));

    return 0;
}
于 2016-08-30T01:08:33.727 回答