通常有两种技术可以提高计算能力 1.3 的 GPU 上 CUDA 内核访问全局内存的吞吐量:内存访问合并,以及让每个线程每次访问至少 4 字节的字。使用第一种技术时,同一个半线程束(half-warp)中的线程对同一内存段的访问会被合并为更少的内存事务;而当每个线程访问至少 4 字节的字时,该内存段的大小实际上从 32 字节增加到 128 字节。
更新:以下是基于 talonmies 的回答给出的解决方案。当全局内存中存储的是 unsigned char 数据时,若想每次访问 16 字节而不是 1 字节,通常的做法是把该数组的指针转换为 uint4 *,以 uint4 向量的形式读取。要从 uint4 向量中取出各个字节,可以把它的各个分量重新解释(reinterpret_cast)为 uchar4,如下所示:
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Demonstrates a single 16-byte vector load from global memory followed by
 * byte extraction through a uchar4 view.
 *
 * Every thread loads the first 16 bytes of d_text as one uint4, reinterprets
 * the first 32-bit component (.x) as four bytes, and stores the second of
 * those bytes (i.e. d_text[1]) into d_out at the thread's global index.
 *
 * Parameters:
 *   d_text  device pointer to the input bytes; only the first 16 are read.
 *   d_out   device pointer to the output bytes; one byte written per thread.
 *
 * Preconditions (not checked -- the kernel takes no size argument):
 *   - d_text refers to at least 16 readable bytes and is 16-byte aligned,
 *     because it is dereferenced as a uint4.
 *   - d_out holds at least gridDim.x * blockDim.x bytes.
 *
 * Launch layout: 1D grid of 1D blocks. No shared memory is used.
 */
__global__ void kernel ( unsigned char *d_text, unsigned char *d_out ) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // One 16-byte memory transaction instead of sixteen 1-byte reads.
    const uint4 *uint4_text = reinterpret_cast<const uint4 *>( d_text );
    const uint4 uint4_var = uint4_text[0];

    // View the first 32-bit word as four individual bytes (input bytes 0..3).
    // The .y, .z and .w words can be unpacked the same way to reach
    // bytes 4..7, 8..11 and 12..15 respectively.
    const uchar4 c0 = *reinterpret_cast<const uchar4 *>( &uint4_var.x );

    d_out[idx] = c0.y;
}
// Prints a diagnostic and terminates the process if a CUDA runtime call failed.
static void checkCuda ( cudaError_t status, const char *what ) {
    if ( status != cudaSuccess ) {
        fprintf ( stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString ( status ) );
        exit ( EXIT_FAILURE );
    }
}

/*
 * Host driver: fills a 16-byte buffer with the consecutive values 65..80
 * ('A'..'P' in ASCII), copies it to the device, launches `kernel` with one
 * block of 16 threads, copies the 16 result bytes back and prints each one
 * on its own line.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA runtime call fails.
 */
int main ( void ) {
    // Exactly one uint4 worth of data; also the number of threads launched.
    const int N = 16;
    const size_t bytes = N * sizeof ( unsigned char );

    unsigned char *h_text = ( unsigned char * ) malloc ( bytes );
    unsigned char *h_out = ( unsigned char * ) malloc ( bytes );
    if ( h_text == NULL || h_out == NULL ) {
        fprintf ( stderr, "host allocation failed\n" );
        free ( h_text );
        free ( h_out );
        return EXIT_FAILURE;
    }

    for ( int i = 0; i < N; i++ )
        h_text[i] = ( unsigned char ) ( 65 + i );

    unsigned char *d_text = NULL;
    unsigned char *d_out = NULL;
    checkCuda ( cudaMalloc ( ( void** ) &d_text, bytes ), "cudaMalloc(d_text)" );
    checkCuda ( cudaMalloc ( ( void** ) &d_out, bytes ), "cudaMalloc(d_out)" );
    checkCuda ( cudaMemcpy ( d_text, h_text, bytes, cudaMemcpyHostToDevice ),
                "cudaMemcpy(host to device)" );

    kernel<<<1, N>>>( d_text, d_out );
    // A kernel launch returns nothing; configuration errors are reported here.
    checkCuda ( cudaGetLastError (), "kernel launch" );

    // Blocking copy: any fault raised while the kernel ran is reported by this call.
    checkCuda ( cudaMemcpy ( h_out, d_out, bytes, cudaMemcpyDeviceToHost ),
                "cudaMemcpy(device to host)" );

    for ( int i = 0; i < N; i++ )
        printf ( "%c\n", h_out[i] );

    // Release device and host buffers.
    checkCuda ( cudaFree ( d_text ), "cudaFree(d_text)" );
    checkCuda ( cudaFree ( d_out ), "cudaFree(d_out)" );
    free ( h_text );
    free ( h_out );
    return 0;
}