我有一个使用了几个 CUDA 内核的程序,其中这个内核需要 50-100 毫秒才能运行完,而其他内核只需要 0-5 毫秒。我猜测这与内核中大量的分支有关,但我不确定该如何减少这些分支。我是针对计算能力 2.1 的设备进行编译的。如果有人能为我指明正确的方向,那就太好了。
// Number of threads per block that ScoreBindingSites assumes when it computes
// the first sequence index handled by each block. The kernel must therefore be
// launched with exactly this many threads per block.
// The value was chosen using the occupancy spreadsheet.
#define SCORE_THREADS_PER_BLOCK 448
// Fetches a single element from a pitched 2D matrix of doubles.
//
//   scoring_matrix  base pointer of the matrix
//   pitch           distance between the starts of consecutive rows, in BYTES
//   row, column     zero-based coordinates of the element to read
//
// Returns the value stored at (row, column). No bounds checking is performed.
__device__ double ScoringMatrixVal(double *scoring_matrix, size_t pitch, unsigned int row, unsigned int column) {
    // The pitch is expressed in bytes, so the row offset has to be applied to
    // a byte pointer before the result is reinterpreted as a row of doubles.
    char *row_bytes = (char*) scoring_matrix + row * pitch;
    double *row_values = (double*) row_bytes;
    return row_values[column];
}
// Maps a nucleotide character to its forward-strand scoring-matrix column:
// A/a -> 0, C/c -> 1, G/g -> 2, T/t -> 3, anything else -> 4.
static __device__ __forceinline__ int ScoreBaseToColumn(char base) {
    switch (base) {
        case 'A': case 'a': return 0;
        case 'C': case 'c': return 1;
        case 'G': case 'g': return 2;
        case 'T': case 't': return 3;
        default:            return 4;
    }
}

// Scores one candidate binding site per thread on both strands and records
// hits as bits in results[].
//
// Launch layout: 2D grid of 2D blocks. Each block must contain exactly
// SCORE_THREADS_PER_BLOCK threads, because that constant (not blockDim) is
// used to compute the first sequence index owned by a block. No shared memory
// is used and there are no barriers, so threads may exit early.
//
//   input_sequence  nucleotide characters, is_length entries
//   is_length       number of characters in input_sequence
//   rvd_sequence    rs_len row indices into scoring_matrix
//   rs_len          length of the site being scored
//   cutoff          a strand is a hit when its summed score is < cutoff
//   rvd_num         selects the output bits: bit 2*rvd_num for the forward
//                   strand, bit 2*rvd_num + 1 for the reverse strand
//   scoring_matrix  pitched 2D matrix of doubles; column 4 is used for any
//                   character other than A/C/G/T (either case)
//   sm_pitch        row pitch of scoring_matrix in bytes
//   results         one byte per sequence position; hit bits are OR-ed in
//
// NOTE(review): results[] holds unsigned char, so hit bits at position >= 8
// (rvd_num >= 4) are silently truncated away by the store, exactly as in the
// previous implementation. Confirm callers keep rvd_num in [0, 3].
__global__ void ScoreBindingSites(char *input_sequence, unsigned long is_length, unsigned int *rvd_sequence, unsigned int rs_len, double cutoff, unsigned int rvd_num, double *scoring_matrix, size_t sm_pitch, unsigned char *results) {
    // Index arithmetic is done in size_t: a 32-bit int would wrap for long
    // sequences even though is_length itself is an unsigned long.
    const size_t block_id = (size_t) blockIdx.y * gridDim.x + blockIdx.x;
    const size_t thread_id = (size_t) blockDim.x * threadIdx.y + threadIdx.x;
    const size_t seq_index = block_id * SCORE_THREADS_PER_BLOCK + thread_id;

    // Position 0 has no upstream base, and the site plus its downstream base
    // must fit inside the sequence (same bounds as before).
    if (seq_index < 1 || seq_index >= is_length || seq_index + rs_len >= is_length - 1) return;

    // The forward strand is only scored when the site is preceded by T; the
    // reverse strand only when it is followed by A.
    const char upstream = input_sequence[seq_index - 1];
    const char downstream = input_sequence[seq_index + rs_len];
    const bool score_forward = (upstream == 'T' || upstream == 't');
    const bool score_reverse = (downstream == 'A' || downstream == 'a');
    if (!score_forward && !score_reverse) return;

    // Single pass over the site: each base is loaded and decoded once and fed
    // to whichever strands are active. Each accumulator still sums its terms
    // in the original order, so the floating-point results are unchanged.
    double forward_score = 0.0;
    double reverse_score = 0.0;
    for (unsigned int i = 0; i < rs_len; ++i) {
        const int col = ScoreBaseToColumn(input_sequence[seq_index + i]);
        if (score_forward) {
            forward_score += ScoringMatrixVal(scoring_matrix, sm_pitch, rvd_sequence[i], col);
        }
        if (score_reverse) {
            // Reverse strand: walk the RVDs backwards and use the
            // complementary base column (A<->T, C<->G); unknown stays 4.
            const int comp_col = (col < 4) ? 3 - col : 4;
            reverse_score += ScoringMatrixVal(scoring_matrix, sm_pitch, rvd_sequence[rs_len - 1 - i], comp_col);
        }
    }

    // Collect the hit bits in a register and touch global memory at most
    // once; OR-ing in zero was a no-op read-modify-write.
    unsigned long hit_bits = 0UL;
    if (score_forward && forward_score < cutoff) hit_bits |= 1UL << (2 * rvd_num);
    if (score_reverse && reverse_score < cutoff) hit_bits |= 1UL << (2 * rvd_num + 1);
    if (hit_bits != 0UL) results[seq_index] |= hit_bits;
}
ScoreBindingSites 启动时每个线程块的维度为 (32, 14)(即 32 × 14 = 448 个线程,与 SCORE_THREADS_PER_BLOCK 一致),并且线程块的数量足以覆盖整个输入序列。如果有帮助,可以在此处找到完整的源代码。