当我阅读编程指南时,我感觉共享内存总会提高性能,但似乎并非如此。我有两个功能:
const int Ntimes=1;
__global__ void testgl(float *A, float *C, int numElements){
int ti = threadIdx.x;
int b0 = blockDim.x*blockIdx.x;
if (b0+ti < numElements){
for(int i=0;i<Ntimes;i++){
A[b0+ti]=A[b0+ti]*A[b0+ti]*10-2*A[b0+ti]+1;
}
C[b0+ti] = A[b0+ti]*A[b0+ti];
}
}
__global__ void testsh(float *A, float *C, int numElements){
int ti = threadIdx.x;
int b0 = blockDim.x*blockIdx.x;
__shared__ float a[1024];
if (b0+ti < numElements){
a[ti]=A[b0+ti];
}
__syncthreads();
if (b0+ti < numElements){
for(int i=0;i<Ntimes;i++){
a[ti]=a[ti]*a[ti]*10-2*a[ti]+1;
}
C[b0+ti] = a[ti]*a[ti];
}
}
int main(void){
int numElements = 500000;
size_t size = numElements * sizeof(float);
// Allocate the host input
float *h_A = (float *)malloc(size);
float *h_B = (float *)malloc(size);
// Allocate the host output
float *h_C = (float *)malloc(size);
float *h_D = (float *)malloc(size);
// Initialize the host input
for (int i = 0; i < numElements; i++){
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = h_A[i];
}
// Allocate the device input
float *d_A = NULL; cudaMalloc((void **)&d_A, size);
float *d_B = NULL; cudaMalloc((void **)&d_B, size);
float *d_C = NULL; cudaMalloc((void **)&d_C, size);
float *d_D = NULL; cudaMalloc((void **)&d_D, size);
//Copy to Device
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 1024;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
testgl<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, numElements);
testsh<<<blocksPerGrid, threadsPerBlock>>>(d_B, d_D, numElements);
// Copy the device resultto the host
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
cudaMemcpy(h_D, d_D, size, cudaMemcpyDeviceToHost);
// Free device global memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaFree(d_D);
// Free host memory
free(h_A);
free(h_B);
free(h_C);
free(h_D);
// Reset the device and exit
cudaDeviceReset();
return 0;
}
如果 Ntimes 设置为 1,testgl 花费 49us,testsh 花费 97us。如果 Ntimes 设置为 100,testgl 花费 9.7ms,testsh 花费 8.9ms。
我不知道为什么它长了 100 多倍。
所以看起来共享内存只有在我们想在设备中做很多事情时才有帮助,对吗?
这里使用的卡是GTX680。
提前致谢。