一个问题涉及对存储在计算能力 1.3 GPU 的全局内存中的无符号字符数组的跨步访问。为了绕过全局内存的合并要求,线程顺序访问全局内存并将数组复制到共享内存,仅使用 2 个内存事务,例如:
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
__global__ void kernel ( unsigned char *d_text, unsigned char *d_out ) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
extern __shared__ unsigned char s_array[];
uint4 *uint4_text = ( uint4 * ) d_text;
uint4 var;
//memory transaction
var = uint4_text[0];
uchar4 c0 = *reinterpret_cast<uchar4 *>(&var.x);
uchar4 c4 = *reinterpret_cast<uchar4 *>(&var.y);
uchar4 c8 = *reinterpret_cast<uchar4 *>(&var.z);
uchar4 c12 = *reinterpret_cast<uchar4 *>(&var.w);
s_array[threadIdx.x*16 + 0] = c0.x;
s_array[threadIdx.x*16 + 1] = c0.y;
s_array[threadIdx.x*16 + 2] = c0.z;
s_array[threadIdx.x*16 + 3] = c0.w;
s_array[threadIdx.x*16 + 4] = c4.x;
s_array[threadIdx.x*16 + 5] = c4.y;
s_array[threadIdx.x*16 + 6] = c4.z;
s_array[threadIdx.x*16 + 7] = c4.w;
s_array[threadIdx.x*16 + 8] = c8.x;
s_array[threadIdx.x*16 + 9] = c8.y;
s_array[threadIdx.x*16 + 10] = c8.z;
s_array[threadIdx.x*16 + 11] = c8.w;
s_array[threadIdx.x*16 + 12] = c12.x;
s_array[threadIdx.x*16 + 13] = c12.y;
s_array[threadIdx.x*16 + 14] = c12.z;
s_array[threadIdx.x*16 + 15] = c12.w;
d_out[idx] = s_array[threadIdx.x*16];
}
int main ( void ) {
unsigned char *d_text, *d_out;
unsigned char *h_out = ( unsigned char * ) malloc ( 32 * sizeof ( unsigned char ) );
unsigned char *h_text = ( unsigned char * ) malloc ( 32 * sizeof ( unsigned char ) );
int i;
for ( i = 0; i < 32; i++ )
h_text[i] = 65 + i;
cudaMalloc ( ( void** ) &d_text, 32 * sizeof ( unsigned char ) );
cudaMalloc ( ( void** ) &d_out, 32 * sizeof ( unsigned char ) );
cudaMemcpy ( d_text, h_text, 32 * sizeof ( unsigned char ), cudaMemcpyHostToDevice );
kernel<<<1,32,16128>>>(d_text, d_out );
cudaMemcpy ( h_out, d_out, 32 * sizeof ( unsigned char ), cudaMemcpyDeviceToHost );
for ( i = 0; i < 32; i++ )
printf("%c\n", h_out[i]);
return 0;
}
问题是在将数据复制到共享内存时会发生银行冲突(由 nvprof 报告的上述示例的 384 冲突)导致线程的序列化访问。
共享内存被划分为 16 个(或在较新的设备架构上为 32 个)32 位存储库,以便同时为同一个半扭曲的 16 个线程提供服务。数据在 bank 之间交错,第 i 个 32 位字始终存储在 i % 16 - 1 共享内存 bank 中。
由于每个线程通过一个内存事务读取 16 个字节,因此字符将以跨步方式存储到共享内存中。这会导致线程 0、4、8、12 之间发生冲突;1、5、9、13;2、6、10、14;3、7、11、15 个相同的半经线。消除银行冲突的一种简单方法是使用 if/else 分支以类似于以下的循环方式将数据存储到共享内存,但会导致一些严重的线程分歧:
int tid16 = threadIdx.x % 16;
if ( tid16 < 4 ) {
s_array[threadIdx.x * 16 + 0] = c0.x;
s_array[threadIdx.x * 16 + 1] = c0.y;
s_array[threadIdx.x * 16 + 2] = c0.z;
s_array[threadIdx.x * 16 + 3] = c0.w;
s_array[threadIdx.x * 16 + 4] = c4.x;
s_array[threadIdx.x * 16 + 5] = c4.y;
s_array[threadIdx.x * 16 + 6] = c4.z;
s_array[threadIdx.x * 16 + 7] = c4.w;
s_array[threadIdx.x * 16 + 8] = c8.x;
s_array[threadIdx.x * 16 + 9] = c8.y;
s_array[threadIdx.x * 16 + 10] = c8.z;
s_array[threadIdx.x * 16 + 11] = c8.w;
s_array[threadIdx.x * 16 + 12] = c12.x;
s_array[threadIdx.x * 16 + 13] = c12.y;
s_array[threadIdx.x * 16 + 14] = c12.z;
s_array[threadIdx.x * 16 + 15] = c12.w;
} else if ( tid16 < 8 ) {
s_array[threadIdx.x * 16 + 4] = c4.x;
s_array[threadIdx.x * 16 + 5] = c4.y;
s_array[threadIdx.x * 16 + 6] = c4.z;
s_array[threadIdx.x * 16 + 7] = c4.w;
s_array[threadIdx.x * 16 + 8] = c8.x;
s_array[threadIdx.x * 16 + 9] = c8.y;
s_array[threadIdx.x * 16 + 10] = c8.z;
s_array[threadIdx.x * 16 + 11] = c8.w;
s_array[threadIdx.x * 16 + 12] = c12.x;
s_array[threadIdx.x * 16 + 13] = c12.y;
s_array[threadIdx.x * 16 + 14] = c12.z;
s_array[threadIdx.x * 16 + 15] = c12.w;
s_array[threadIdx.x * 16 + 0] = c0.x;
s_array[threadIdx.x * 16 + 1] = c0.y;
s_array[threadIdx.x * 16 + 2] = c0.z;
s_array[threadIdx.x * 16 + 3] = c0.w;
} else if ( tid16 < 12 ) {
s_array[threadIdx.x * 16 + 8] = c8.x;
s_array[threadIdx.x * 16 + 9] = c8.y;
s_array[threadIdx.x * 16 + 10] = c8.z;
s_array[threadIdx.x * 16 + 11] = c8.w;
s_array[threadIdx.x * 16 + 12] = c12.x;
s_array[threadIdx.x * 16 + 13] = c12.y;
s_array[threadIdx.x * 16 + 14] = c12.z;
s_array[threadIdx.x * 16 + 15] = c12.w;
s_array[threadIdx.x * 16 + 0] = c0.x;
s_array[threadIdx.x * 16 + 1] = c0.y;
s_array[threadIdx.x * 16 + 2] = c0.z;
s_array[threadIdx.x * 16 + 3] = c0.w;
s_array[threadIdx.x * 16 + 4] = c4.x;
s_array[threadIdx.x * 16 + 5] = c4.y;
s_array[threadIdx.x * 16 + 6] = c4.z;
s_array[threadIdx.x * 16 + 7] = c4.w;
} else {
s_array[threadIdx.x * 16 + 12] = c12.x;
s_array[threadIdx.x * 16 + 13] = c12.y;
s_array[threadIdx.x * 16 + 14] = c12.z;
s_array[threadIdx.x * 16 + 15] = c12.w;
s_array[threadIdx.x * 16 + 0] = c0.x;
s_array[threadIdx.x * 16 + 1] = c0.y;
s_array[threadIdx.x * 16 + 2] = c0.z;
s_array[threadIdx.x * 16 + 3] = c0.w;
s_array[threadIdx.x * 16 + 4] = c4.x;
s_array[threadIdx.x * 16 + 5] = c4.y;
s_array[threadIdx.x * 16 + 6] = c4.z;
s_array[threadIdx.x * 16 + 7] = c4.w;
s_array[threadIdx.x * 16 + 8] = c8.x;
s_array[threadIdx.x * 16 + 9] = c8.y;
s_array[threadIdx.x * 16 + 10] = c8.z;
s_array[threadIdx.x * 16 + 11] = c8.w;
}
任何人都可以提出更好的解决方案吗?我已经研究过 SDK 的缩减示例,但我不确定它是否适用于我的问题。