我正在实现一个简单的 boxcar(矩形窗)滤波器,目的只是借此比较二维纹理内存访问与全局内存访问的速度差异。
具体来说,.cu 文件如下:
#include <cstdio>
#include <cuda.h>
#include <cuda_runtime.h>
#include "cufft.h"
#include "Kernels_Test_Texture_Float.cuh"
#define BLOCK_SIZE_x 16
#define BLOCK_SIZE_y 16
/**********************/
/* TEST TEXTURE FLOAT */
/**********************/
/*
 * Runs the 5x5 boxcar sum through the 2D texture path.
 *
 * data       : host pointer to N1 * N2 floats, rows of N1 elements stored back to back.
 * dev_result : device pointer with room for N1 * N2 floats; zeroed here, then
 *              written by Kernel_Test_Texture_Float.
 * N1, N2     : width and height of the image in elements.
 *
 * The input is uploaded into a pitched device buffer, bound to data_d_texture,
 * and released again before returning. The call blocks until the kernel has
 * finished so that execution errors are reported here and the temporary buffer
 * can be freed safely. Any CUDA failure is reported on stderr.
 *
 * NOTE(review): the texture-reference API used here (cudaBindTexture2D) is
 * deprecated in recent toolkits; migrating to texture objects would require
 * changing the kernel signature, so it is not done here.
 */
extern "C" void Function_Test_Texture_Float(float* data, float* dev_result, int N1, int N2){
    // Nothing to filter; a zero-sized grid would also be an invalid launch.
    if (N1 <= 0 || N2 <= 0) return;

    const size_t rowBytes = sizeof(float) * N1;
    size_t pitch = 0;
    float* data_d = 0;
    bool textureBound = false;

    // Each step only runs if everything before it succeeded; the first
    // failure is kept in `status` and reported once at the end.
    cudaError_t status = cudaMallocPitch((void**)&data_d, &pitch, rowBytes, N2);

    if (status == cudaSuccess)
        status = cudaMemcpy2D(data_d, pitch, data, rowBytes, rowBytes, N2, cudaMemcpyHostToDevice);

    if (status == cudaSuccess) {
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
        status = cudaBindTexture2D(0, &data_d_texture, data_d, &desc, N1, N2, pitch);
        textureBound = (status == cudaSuccess);
    }

    if (status == cudaSuccess)
        status = cudaMemset(dev_result, 0, rowBytes * N2);

    if (status == cudaSuccess) {
        dim3 dimBlock(BLOCK_SIZE_x, BLOCK_SIZE_y);
        // Ceil-division per axis; the y axis must use the y block size.
        dim3 dimGrid((N1 + BLOCK_SIZE_x - 1) / BLOCK_SIZE_x,
                     (N2 + BLOCK_SIZE_y - 1) / BLOCK_SIZE_y);
        Kernel_Test_Texture_Float<<<dimGrid,dimBlock>>>(dev_result, N1, N2);
        status = cudaGetLastError();          // launch-configuration errors
        if (status == cudaSuccess)
            status = cudaDeviceSynchronize(); // errors raised while the kernel ran
    }

    // Release the per-call resources on every path.
    if (textureBound) cudaUnbindTexture(&data_d_texture);
    cudaFree(data_d);

    if (status != cudaSuccess)
        fprintf(stderr, "Function_Test_Texture_Float: CUDA error: %s\n", cudaGetErrorString(status));
}
/**************/
/* TEST FLOAT */
/**************/
/*
 * Runs the 5x5 boxcar sum through plain global-memory loads.
 *
 * data        : host pointer to N1 * N2 floats, rows of N1 elements stored back to back.
 * dev_result2 : device pointer with room for N1 * N2 floats; zeroed here, then
 *               written by Kernel_Test_Float.
 * N1, N2      : width and height of the image in elements.
 *
 * The input is uploaded into a temporary dense device buffer which is freed
 * before returning. The call blocks until the kernel has finished so that
 * execution errors are reported here and the buffer can be freed safely.
 * Any CUDA failure is reported on stderr.
 */
extern "C" void Function_Test_Float(float* data, float* dev_result2, int N1, int N2){
    // Nothing to filter; a zero-sized grid would also be an invalid launch.
    if (N1 <= 0 || N2 <= 0) return;

    const size_t totalBytes = sizeof(float) * N1 * N2;
    float* data_d = 0;

    // Each step only runs if everything before it succeeded; the first
    // failure is kept in `status` and reported once at the end.
    cudaError_t status = cudaMalloc((void**)&data_d, totalBytes);

    if (status == cudaSuccess)
        status = cudaMemcpy(data_d, data, totalBytes, cudaMemcpyHostToDevice);

    if (status == cudaSuccess)
        status = cudaMemset(dev_result2, 0, totalBytes);

    if (status == cudaSuccess) {
        dim3 dimBlock(BLOCK_SIZE_x, BLOCK_SIZE_y);
        // Ceil-division per axis; the y axis must use the y block size.
        dim3 dimGrid((N1 + BLOCK_SIZE_x - 1) / BLOCK_SIZE_x,
                     (N2 + BLOCK_SIZE_y - 1) / BLOCK_SIZE_y);
        Kernel_Test_Float<<<dimGrid,dimBlock>>>(dev_result2, data_d, N1, N2);
        status = cudaGetLastError();          // launch-configuration errors
        if (status == cudaSuccess)
            status = cudaDeviceSynchronize(); // errors raised while the kernel ran
    }

    // Release the temporary input copy on every path.
    cudaFree(data_d);

    if (status != cudaSuccess)
        fprintf(stderr, "Function_Test_Float: CUDA error: %s\n", cudaGetErrorString(status));
}
.cuh 文件如下:
// File-scope 2D texture reference with float texels. The host wrapper binds a
// pitched device buffer to it via cudaBindTexture2D, and
// Kernel_Test_Texture_Float reads it through tex2D.
texture<float,2> data_d_texture;
/**************************/
/* 2D TEXTURE TEST KERNEL */
/**************************/
/*
 * 5x5 boxcar sum read through the 2D texture data_d_texture.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per output element
 * (x covers N1, y covers N2); extra threads in the grid tail do nothing.
 *
 * dev_result : device buffer of N1 * N2 floats, row-major with row stride N1.
 * N1, N2     : image width and height in elements.
 *
 * Element (i, j) receives the sum of the texels (i..i+4, j..j+4). Elements
 * whose index does not pass the bounds test below are left untouched.
 */
__global__ void Kernel_Test_Texture_Float(float* dev_result, int N1, int N2)
{
    const int size_x = 5;
    const int size_y = 5;

    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    const int j = threadIdx.y + blockDim.y * blockIdx.y;

    // NOTE(review): this bound also skips column N1 - size_x and row
    // N2 - size_y, whose windows would still fit inside the image. Kept
    // unchanged so the output matches the existing behaviour - confirm intent.
    if (i >= N1 - size_x || j >= N2 - size_y) return;

    float accumulator = 0.0f;
    for (int k = 0; k < size_x; k++)
        for (int l = 0; l < size_y; l++)
            accumulator += tex2D(data_d_texture, i + k, j + l);

    // The output buffer holds N1 floats per row. Using the launched grid
    // width (blockDim.x * gridDim.x) as the stride is only correct when N1 is
    // a multiple of the block width; otherwise rows land at the wrong offset
    // and can run past the end of the buffer.
    dev_result[(size_t)j * N1 + i] = accumulator;
}
/******************/
/* 2D TEST KERNEL */
/******************/
/*
 * 5x5 boxcar sum read directly from global memory.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per output element
 * (x covers N1, y covers N2); extra threads in the grid tail do nothing.
 *
 * dev_result2 : device buffer of N1 * N2 floats, row-major with row stride N1.
 * data_d      : device buffer of N1 * N2 input floats, same layout.
 * N1, N2      : image width and height in elements.
 *
 * Element (i, j) receives the sum of data_d over (i..i+4, j..j+4). Elements
 * whose index does not pass the bounds test below are left untouched.
 */
__global__ void Kernel_Test_Float(float* dev_result2, float* data_d, int N1, int N2)
{
    const int size_x = 5;
    const int size_y = 5;

    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    const int j = threadIdx.y + blockDim.y * blockIdx.y;

    // NOTE(review): this bound also skips column N1 - size_x and row
    // N2 - size_y, whose windows would still fit inside the image. Kept
    // unchanged so the output matches the existing behaviour - confirm intent.
    if (i >= N1 - size_x || j >= N2 - size_y) return;

    // Both buffers hold N1 floats per row. Using the launched grid width
    // (blockDim.x * gridDim.x) as the stride is only correct when N1 is a
    // multiple of the block width; otherwise the wrong elements are read and
    // accesses can run past the end of the buffers.
    const size_t stride = (size_t)N1;

    float accumulator = 0.0f;
    for (int k = 0; k < size_x; k++)
        for (int l = 0; l < size_y; l++)
            accumulator += data_d[(size_t)(j + l) * stride + (i + k)];

    dev_result2[(size_t)j * stride + i] = accumulator;
}
但是,结果显示全局内存内核比纹理内存内核快得多(94 µs 对 615 µs;计时来自 Visual Profiler,显卡为 GeForce GT 540M)。
是我使用纹理内存的方式有问题,还是全局内存确实比带缓存的纹理内存更快?
提前感谢您的任何评论。