我正在测量标准全局内存访问与一维纹理(1D texture)内存访问之间的性能差异。为此,我创建了两个内核:
/**
 * Copies up to `size` floats into `doarray`, reading each input element
 * through the texture reference `texreference` (declared outside this block)
 * with tex1Dfetch.
 *
 * Expects a 1D grid of 1D blocks; each thread handles the single element at
 * its flat global index.
 *
 * @param doarray  output array; element i receives tex1Dfetch(texreference, i)
 * @param size     number of elements to copy
 */
__global__ void texture1D(float* doarray,int size)
{
    //flat global index of this thread
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    //the grid is rounded up to whole blocks, so threads past the end of the
    //data must not touch memory
    if (index < size)
    {
        //fetch global memory through texture reference
        doarray[index] = tex1Dfetch(texreference, index);
    }
}
/**
 * Copies up to `size` floats from `diarray` to `doarray` using ordinary
 * pointer loads and stores.
 *
 * Expects a 1D grid of 1D blocks; each thread handles the single element at
 * its flat global index.
 *
 * @param diarray  input array
 * @param doarray  output array; element i receives diarray[i]
 * @param size     number of elements to copy
 */
__global__ void standard1D(float* diarray, float* doarray, int size)
{
    //flat global index of this thread
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    //the grid is rounded up to whole blocks, so threads past the end of the
    //data must not touch memory
    if (index < size)
    {
        //plain load from the input pointer (no texture path involved)
        doarray[index] = diarray[index];
    }
}
然后,我分别调用每个内核,并测量其执行所花费的时间:
//copy array from host to device memory
checkCuda( cudaMemcpy(diarray,harray,sizeof(float)*size,cudaMemcpyHostToDevice) );
//number of blocks needed to cover `size` elements: integer ceiling division,
//which avoids the rounding error of going through float for large sizes
const int numBlocks = (size + threadSize - 1) / threadSize;
//bind texture reference with linear memory; done BEFORE the start event so
//that the bind cost is not counted as kernel time
checkCuda( cudaBindTexture(0,texreference,diarray,sizeof(float)*size) );
//NOTE(review): the first launch in a process may also carry one-time start-up
//cost; consider an untimed warm-up launch before measuring -- confirm with a profiler
checkCuda( cudaEventCreate(&startEvent) );
checkCuda( cudaEventCreate(&stopEvent) );
checkCuda( cudaEventRecord(startEvent, 0) );
//execute device kernel (the only work between the two events)
texture1D<<<numBlocks,threadSize>>>(doarray,size);
//kernel launches return no status; pick up launch-configuration errors here
checkCuda( cudaGetLastError() );
checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );
//unbind texture reference to free resource; done AFTER the stop event so that
//the unbind cost is not counted as kernel time
checkCuda( cudaUnbindTexture(texreference) );
//copy result array from device to host memory
checkCuda( cudaMemcpy(horray,doarray,sizeof(float)*size,cudaMemcpyDeviceToHost) );
//check result
checkResutl(horray, harray, size);
//second measurement: the same copy through plain global-memory loads
cudaEvent_t startEvent2, stopEvent2;
checkCuda( cudaEventCreate(&startEvent2) );
checkCuda( cudaEventCreate(&stopEvent2) );
checkCuda( cudaEventRecord(startEvent2, 0) );
standard1D<<<numBlocks,threadSize>>>(diarray,doarray,size);
checkCuda( cudaGetLastError() );
checkCuda( cudaEventRecord(stopEvent2, 0) );
checkCuda( cudaEventSynchronize(stopEvent2) );
//copy back to CPU
checkCuda( cudaMemcpy(horray,doarray,sizeof(float)*size,cudaMemcpyDeviceToHost) );
并打印结果:
//elapsed time of each timed region; cudaEventElapsedTime reports milliseconds
float time;
float time2;
checkCuda( cudaEventElapsedTime(&time, startEvent, stopEvent) );
checkCuda( cudaEventElapsedTime(&time2, startEvent2, stopEvent2) );
//bytes * 1e-6 / milliseconds == bytes * 1e-9 / seconds, i.e. GB/s
const double textureBandwidth = bytes * 1e-6 / time;
const double standardBandwidth = bytes * 1e-6 / time2;
printf("Texture bandwidth (GB/s): %f\n", textureBandwidth);
printf("Standard bandwidth (GB/s): %f\n", standardBandwidth);
结果表明,无论我分配的数组大小(size)是多少,标准访问的带宽总是高得多。这是正常现象,还是我在某个环节出了错?我原本的理解是,纹理(texture)内存访问可以加速对全局内存的访问。