我终于找到了自己问题的答案。char4 数组的声明和分配方式是:
// Allocate the database buffer. The byte count SIZE * sizeof(char4) / 4 works
// out to SIZE bytes, i.e. one byte per character, viewed as packed char4
// elements.
// NOTE(review): the status returned by cudaMalloc is not checked here.
char4 *database = NULL;
const size_t databaseBytes = SIZE * sizeof(char4) / 4;
cudaMalloc(reinterpret_cast<void **>(&database), databaseBytes);
不需要使用纹理。使用 char4 的内核确实把速度提高了三倍;但如果我进行循环展开,提速幅度会降到两倍。为了完整起见,我的 char 版本内核是:
// Counts, for each job, how many of its 1000 characters equal 'A'.
//
// One thread handles one job. Thread `id` reads database[jobs_todo * i + id]
// for i = 0..999, so within one iteration neighbouring threads read
// neighbouring bytes. The count is stored in results[id] as a float.
//
// Parameters:
//   jobs_todo - number of jobs; threads whose global index is >= jobs_todo
//               return without touching memory.
//   database  - input characters; the indexing requires at least
//               1000 * jobs_todo elements.
//   results   - output; one float per job (at least jobs_todo elements).
//
// Only the .x components of threadIdx/blockIdx/blockDim are used, so a 1D
// launch is expected.
__global__ void kernel(unsigned int jobs_todo, char* database, float* results ) {
    // Do all index math in size_t: in 32-bit unsigned arithmetic
    // jobs_todo * i wraps around once jobs_todo * 1000 exceeds 2^32 and the
    // thread would silently read the wrong bytes.
    const size_t id = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= jobs_todo) {
        return;
    }

    const size_t stride = jobs_todo;
    // Integer counter; at most 1000, so the final conversion to float is exact.
    int count = 0;
    for (int i = 0; i < 1000; ++i) {
        if (database[stride * i + id] == 'A') {
            ++count;
        }
    }
    results[id] = (float)count;
}
而 char4 版本的内核是:
// char4 variant: counts, for each job, how many of its 1000 characters equal
// 'A', loading four characters per memory access.
//
// One thread handles one job. Thread `id` reads database[jobs_todo * i + id]
// for i = 0..249 (1000 / 4 packed elements) and tests the x, y, z and w
// components of each element. The count is stored in results[id] as a float.
//
// Parameters:
//   jobs_todo - number of jobs; threads whose global index is >= jobs_todo
//               return without touching memory.
//   database  - input characters packed as char4; the indexing requires at
//               least 250 * jobs_todo elements.
//   results   - output; one float per job (at least jobs_todo elements).
//
// Only the .x components of threadIdx/blockIdx/blockDim are used, so a 1D
// launch is expected.
__global__ void kernel4(unsigned int jobs_todo, char4* database, float* results ) {
    // Do all index math in size_t: in 32-bit unsigned arithmetic
    // jobs_todo * i wraps around for very large jobs_todo and the thread
    // would silently read the wrong elements.
    const size_t id = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= jobs_todo) {
        return;
    }

    const size_t stride = jobs_todo;
    // Integer counter; at most 1000, so the final conversion to float is exact.
    int count = 0;
    for (int i = 0; i < 1000 / 4; ++i) {
        const char4 packed = database[stride * i + id];
        if (packed.x == 'A') ++count;
        if (packed.y == 'A') ++count;
        if (packed.z == 'A') ++count;
        if (packed.w == 'A') ++count;
    }
    results[id] = (float)count;
}
我也试过 int4,但它只比 char4 快了 0.0002 秒。