cuda - 读取设备代码中的 cudaArray

Question

有没有办法在设备代码中读取 cudaArray 中的值，而不必将其包装在纹理引用/对象中？我看过的所有示例都只把 cudaArray 用于创建纹理。这是它唯一的用法吗，还是我可以做类似下面这样的事情：

// Sketch from the question (pseudocode, not compilable: the return type is missing and
// "..." stands for elided arguments). It asks whether a raw cudaArray* can be read
// directly in device code, with tex1D as a placeholder for the unknown accessor.
// NOTE(review): tex1D expects a texture reference/object rather than a cudaArray* — confirm
// against the CUDA texture API documentation.
__global__ kernel(cudaArray *arr, ...) {
    float x = tex1D<float>(arr, ...);
    ...
}

// Host side of the question's sketch (pseudocode; "..." stands for elided arguments):
// allocate a CUDA array, copy data into it, then hand the raw handle to the kernel.
cudaArray *arr;
cudaMallocArray(&arr, ...);
cudaMemcpyToArray(arr, ...);
kernel<<<...>>>(arr, ...);

所以基本上，那里应该用什么来代替 tex1D？此外，如果这是可行的，我很想知道是否有人认为这样做会带来性能上的好处，不过我也会自己运行测试来验证。

谢谢！

score 7 · Accepted Answer

cudaArray 是为纹理或表面内存目的而定义的。如此处所示：

CUDA 数组是针对纹理获取优化的不透明内存布局。它们是一维、二维或三维，由元素组成，每个元素有 1、2 或 4 个分量，可以是有符号或无符号 8、16 或 32 位整数、16 位浮点数或 32 位浮点数。CUDA 数组只能由内核通过纹理内存中描述的纹理获取或表面内存中描述的表面读取和写入来访问。

所以实际上你必须在内核中使用纹理函数或表面函数来访问 cudaArray 中的数据。

使用纹理可能带来多种性能优势。纹理可以隐含插值（即使用浮点坐标从纹理中读取）。任何需要这种数据插值的应用程序都可以从 GPU 纹理单元内的硬件插值引擎中受益。

另一个好处，也许是在任意 GPU 代码中使用纹理最重要的好处，是为存储在全局内存中的纹理提供支持的纹理缓存。纹理读取是只读操作，但如果您有一组只读数据，纹理缓存可能会提高或扩展您快速访问数据的能力。这通常意味着，访问纹理数据的函数中必须存在数据局部性/数据重用。检索到的纹理数据不会干扰 L1 缓存中的任何内容，因此这种数据划分/优化通常是更大的数据缓存策略的一部分。如果 L1 缓存没有其他负担，那么纹理机制/缓存提供的数据访问速度并不会比数据已在 L1 中时更快。

score 5 · Accepted Answer

Robert Crovella 已经回答了您的问题。我相信为后来的用户提供纹理和表面这两种方案的可运行示例会很有用。

#include <stdio.h>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

// --- 2D float texture reference (file scope). main() binds a cudaArray to it with
//     cudaBindTextureToArray, and cudaArrayPrintoutTexture reads through it with tex2D.
texture<float, cudaTextureType2D, cudaReadModeElementType> texRef;

// --- 2D surface reference (file scope). main() binds a cudaArray to it with
//     cudaBindSurfaceToArray, and cudaArrayPrintoutSurface reads through it with surf2Dread.
surface<void, 2> surf2D;

/********************/
/* CUDA ERROR CHECK */
/********************/
// Wraps a CUDA runtime call and reports a failure together with its call site.
// The do { } while (0) wrapper makes the macro expand to exactly one statement, so
// `if (c) gpuErrchk(call); else ...` parses as intended.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic to stderr when `code` is not cudaSuccess.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site (const: __FILE__ is a string literal)
//   line  - source line of the call site
//   abort - when true (the default), exit the process using `code` as the exit status
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess) return;

    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
}

/*************************************/
/* cudaArray PRINTOUT TEXTURE KERNEL */
/*************************************/
// Prints the value fetched through texRef for each texel of a width x height texture.
//   width, height - texture extent in texels
// Launch layout: 2D grid of 2D blocks, one thread per texel. Threads that fall outside
// the width x height extent return without fetching, so the grid may overshoot.
// main() sets texRef.normalized = true, so coordinates are normalized to [0, 1):
// the centre of texel i along an axis of N texels is (i + 0.5) / N.
__global__ void cudaArrayPrintoutTexture(int width, int height)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against empty extents and against threads beyond the texture.
    if (width <= 0 || height <= 0) return;
    if (x >= (unsigned int)width || y >= (unsigned int)height) return;

    // Sample at the texel centre. Adding 0.5f after the division would shift the
    // sample by half the texture in normalized coordinates.
    const float u = (x + 0.5f) / (float)width;
    const float v = (y + 0.5f) / (float)height;

    // x and y are unsigned, hence %u.
    printf("Thread index: (%u, %u); cudaArray = %f\n", x, y, tex2D(texRef, u, v));
}

/*************************************/
/* cudaArray PRINTOUT SURFACE KERNEL */
/*************************************/
// Prints the value read through surf2D for each element of a width x height float array.
//   width, height - array extent in elements
// Launch layout: 2D grid of 2D blocks, one thread per element. Threads that fall outside
// the width x height extent return without reading, so the grid may overshoot and no
// out-of-range surface access is issued.
__global__ void cudaArrayPrintoutSurface(int width, int height)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against empty extents and against threads beyond the array.
    if (width <= 0 || height <= 0) return;
    if (x >= (unsigned int)width || y >= (unsigned int)height) return;

    float temp;

    // The x coordinate passed to surf2Dread is a byte offset, hence the scaling by
    // the element size; y is a row index.
    surf2Dread(&temp, surf2D, (int)(x * sizeof(float)), (int)y);

    // x and y are unsigned, hence %u.
    printf("Thread index: (%u, %u); cudaArray = %f\n", x, y, temp);
}

/********/
/* MAIN */
/********/
// Demonstrates the two ways device code can read a cudaArray:
//   1. through a texture reference (texRef, read with tex2D), and
//   2. through a surface reference (surf2D, read with surf2Dread).
// Each part allocates a width x height float array, fills it from host data, binds it,
// launches the matching printout kernel and releases the array.
// Returns 0 on success; any failing CUDA call terminates the process via gpuErrchk.
int main()
{
    const int width = 3, height = 3;

    thrust::host_vector<float> h_data(width*height, 3.f);
    const float *h_ptr = thrust::raw_pointer_cast(h_data.data());

    // Host rows are tightly packed, so the source pitch equals the row width in bytes.
    const size_t rowBytes = width * sizeof(float);

    // --- Single-channel 32-bit float format for the CUDA array
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

    cudaArray* cuArray;

    // --- One thread per array element, in a single block
    dim3 dimBlock(width, height);
    dim3 dimGrid(1, 1);

    /*******************/
    /* TEXTURE BINDING */
    /*******************/
    gpuErrchk(cudaMallocArray(&cuArray, &channelDesc, width, height));

    // --- Copy host data to the CUDA array (cudaMemcpyToArray is deprecated)
    gpuErrchk(cudaMemcpy2DToArray(cuArray, 0, 0, h_ptr, rowBytes, rowBytes, height, cudaMemcpyHostToDevice));

    // --- Set texture parameters
    texRef.addressMode[0] = cudaAddressModeWrap;
    texRef.addressMode[1] = cudaAddressModeWrap;
    texRef.filterMode = cudaFilterModeLinear;
    texRef.normalized = true;

    // --- Bind the array to the texture reference
    gpuErrchk(cudaBindTextureToArray(texRef, cuArray, channelDesc));

    // --- Invoking printout kernel
    cudaArrayPrintoutTexture<<<dimGrid, dimBlock>>>(width, height);
    // Check the launch and wait for the kernel before unbinding and freeing the array
    // it reads from; this also surfaces any error raised while the kernel runs.
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaUnbindTexture(texRef));

    gpuErrchk(cudaFreeArray(cuArray));

    /******************/
    /* SURFACE MEMORY */
    /******************/
    // The cudaArraySurfaceLoadStore flag is passed so the array can be bound to a surface.
    gpuErrchk(cudaMallocArray(&cuArray, &channelDesc, width, height, cudaArraySurfaceLoadStore));

    // --- Copy host data to the CUDA array
    gpuErrchk(cudaMemcpy2DToArray(cuArray, 0, 0, h_ptr, rowBytes, rowBytes, height, cudaMemcpyHostToDevice));

    // --- Bind the array to the surface reference
    gpuErrchk(cudaBindSurfaceToArray(surf2D, cuArray));

    cudaArrayPrintoutSurface<<<dimGrid, dimBlock>>>(width, height);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaFreeArray(cuArray));

    return 0;
}

cuda - 读取设备代码中的 cudaArray

2 回答 2

Related

Reference