0

我有一个使用几个 CUDA 内核的程序,这个程序需要 50-100 毫秒才能运行,而其他程序需要 0-5 毫秒。我希望这与所有分支有关,但我不确定如何减少它。我正在为计算能力 2.1 设备进行编译。如果有人能指出我正确的方向,那就太好了。

// chosen using occupancy spreadsheet
#define SCORE_THREADS_PER_BLOCK 448

__device__ double ScoringMatrixVal(double *scoring_matrix, size_t pitch, unsigned int row, unsigned int column) {
  return *((double*)((char*) scoring_matrix + row * pitch) + column);
}

__global__ void ScoreBindingSites(char *input_sequence, unsigned long is_length, unsigned int *rvd_sequence, unsigned int rs_len, double cutoff, unsigned int rvd_num, double *scoring_matrix, size_t sm_pitch, unsigned char *results) {

  int block_seq_index = SCORE_THREADS_PER_BLOCK * (blockIdx.y * gridDim.x + blockIdx.x);
  int thread_id = (blockDim.x * threadIdx.y) + threadIdx.x;
  int seq_index = block_seq_index + thread_id;

  if (seq_index < 1 || seq_index >= is_length || seq_index + rs_len >= is_length - 1) return;

  if (input_sequence[seq_index - 1] == 'T' || input_sequence[seq_index - 1] == 't') {

    double thread_result = 0;

    for (int i = 0; i < rs_len; i++) {

      int rvd_index = i;

      int sm_col = 4;

      char base = input_sequence[seq_index + i];

      if (base == 'A' || base == 'a')    
        sm_col = 0;
      if (base == 'C' || base == 'c')
        sm_col = 1;
      if (base == 'G' || base == 'g')
        sm_col = 2;
      if (base == 'T' || base == 't')
        sm_col = 3;

      thread_result += ScoringMatrixVal(scoring_matrix, sm_pitch, rvd_sequence[rvd_index], sm_col);

    }

    results[seq_index] |= (thread_result < cutoff ? 1UL : 0UL) << (2 * rvd_num);

  } 

  if (input_sequence[seq_index + rs_len] == 'A' || input_sequence[seq_index + rs_len] == 'a') {

    double thread_result = 0;

    for (int i = 0; i < rs_len; i++) {

      int rvd_index = rs_len - i - 1;

      int sm_col = 4;

      char base = input_sequence[seq_index + i];

      if (base == 'A' || base == 'a')    
        sm_col = 3;
      if (base == 'C' || base == 'c')
        sm_col = 2;
      if (base == 'G' || base == 'g')
        sm_col = 1;
      if (base == 'T' || base == 't')
        sm_col = 0;

      thread_result += ScoringMatrixVal(scoring_matrix, sm_pitch, rvd_sequence[rvd_index], sm_col);

    }

    results[seq_index] |= (thread_result < cutoff ? 1UL : 0UL) << (2 * rvd_num + 1);

  }

}

ScoreBindingSites 启动时每个块有 (32, 14) 个线程,并且有足够的块来覆盖输入序列。如果有帮助,可以在此处找到完整的源代码。

4

2 回答 2

1

您可以做一些事情来改进此代码:

  • 如上所述,合并两个循环 for'T''A'。这可能是分支分歧的最大来源,因为if循环内的小级联语句很可能会被编译为谓词指令(请参阅NVidia CUDA C 编程指南的第 5.4.2 节)。

  • 字节大小的全局内存访问是一个糟糕的主意。相反,我建议在主循环的每次迭代中声明input_sequence,resultsbaseas and ,为 ,和的char4每个值做你的事情。base.xbase.ybase.zbase.w

  • 您可能还想仔细看看ScoringMatrixVal正在做什么。它只是从内存中读取值吗?如果是这样,您可以用恒定内存替换它吗?还是质感?

更新

根据要求,这就是我对第二点的意思。不过,我还没有测试过代码,所以请随时保留您发现的任何错误或拼写错误。请注意,为简单起见,我假设这rs_len是四的倍数。

// chosen using occupancy spreadsheet
#define SCORE_THREADS_PER_BLOCK 448

__device__ double ScoringMatrixVal(double *scoring_matrix, size_t pitch, unsigned int row, unsigned int column) {
  return scoring_matrix[ row*pitch/sizeof(double) + column ];
}

__global__ void ScoreBindingSites(char4 *input_sequence, unsigned long is_length, unsigned int *rvd_sequence, unsigned int rs_len, double cutoff, unsigned int rvd_num, double *scoring_matrix, size_t sm_pitch, unsigned char *results) {

  int block_seq_index = SCORE_THREADS_PER_BLOCK * (blockIdx.y * gridDim.x + blockIdx.x);
  int thread_id = (blockDim.x * threadIdx.y) + threadIdx.x;
  int seq_index = block_seq_index + thread_id;

  if (seq_index < 1 || seq_index >= is_length || seq_index + rs_len >= is_length - 1) return;

  if (input_sequence[seq_index - 1] == 'T' || input_sequence[seq_index - 1] == 't') {

    double4 thread_result = make_double4( 0 );

    for (int i = 0; i < rs_len/4; i++) {

      int rvd_index = 4*i;

      int4 sm_col = make_int4( 4 );

      char4 base = input_sequence[seq_index + i];

      if (base.x == 'A' || base.x == 'a')    
        sm_col.x = 0;
      else if (base.x == 'C' || base.x == 'c')
        sm_col.x = 1;
      else if (base.x == 'G' || base.x == 'g')
        sm_col.x = 2;
      else if (base.x == 'T' || base.x == 't')
        sm_col.x = 3;
      thread_result.x += ScoringMatrixVal(scoring_matrix, sm_pitch, rvd_sequence[rvd_index + 0], sm_col.x);

      if (base.y == 'A' || base.y == 'a')    
        sm_col.y = 0;
      else if (base.y == 'C' || base.y == 'c')
        sm_col.y = 1;
      else if (base.y == 'G' || base.y == 'g')
        sm_col.y = 2;
      else if (base.y == 'T' || base.y == 't')
        sm_col.y = 3;
      thread_result.y += ScoringMatrixVal(scoring_matrix, sm_pitch, rvd_sequence[rvd_index + 1], sm_col.y);

      if (base.z == 'A' || base.z == 'a')    
        sm_col.z = 0;
      else if (base.z == 'C' || base.z == 'c')
        sm_col.z = 1;
      else if (base.z == 'G' || base.z == 'g')
        sm_col.z = 2;
      else if (base.z == 'T' || base.z == 't')
        sm_col.z = 3;
      thread_result.z += ScoringMatrixVal(scoring_matrix, sm_pitch, rvd_sequence[rvd_index + 2], sm_col.z);

      if (base.w == 'A' || base.w == 'a')    
        sm_col.w = 0;
      else if (base.w == 'C' || base.w == 'c')
        sm_col.w = 1;
      else if (base.w == 'G' || base.w == 'g')
        sm_col.w = 2;
      else if (base.w == 'T' || base.w == 't')
        sm_col.w = 3;
      thread_result.w += ScoringMatrixVal(scoring_matrix, sm_pitch, rvd_sequence[rvd_index + 3], sm_col.w);

    }

    double acc_thread_result = thread_result.x + thread_result.y + thead_result.z + thread_result.w;

    results[seq_index] |= (acc_thread_result < cutoff ? 1UL : 0UL) << (2 * rvd_num);

  }

  if (input_sequence[seq_index + rs_len] == 'A' || input_sequence[seq_index + rs_len] == 'a') {

    ...

  }

}

几点注意事项:

  • 我已经重写了(希望是正确的)您的函数ScoringMatrixVal以使用常规数组访问,因为指针算术的整个混乱可能会使编译器关闭。
  • 我已将您的if- 语句转换为级联 -if-elseif语句,因为它们似乎相互排斥。我猜编译器将使用谓词指令并将这四个if-elseif块交错。
  • 您可以考虑将所有这些char[256]都替换为除, , ,等处4的字符代码之外的所有内容...AaCc
  • 如果您将if-elseif-statements 转换为表查找,您可以使用两个不同的表 forinput_sequence[seq_index - 1] == 'T'input_sequence[seq_index + rs_len] == 'A',从而将其全部保存在一个循环中。

我希望我没有把代码弄乱太多,这会有所帮助!

于 2012-07-09T09:49:46.403 回答
0

据我在您的内核中了解,每个线程最多读取 32 个字符并检查每个字符并输出一些数据。

您可以通过使用不同的块方法和不同的索引(如果在您的情况下可能的话)隐式模拟它来完全删除循环。

每个块有 32 个线程,每个线程计算一次循环迭代的结果。

我不知道它是否更快但值得测试。

应该明确考虑佩德罗使用表格查找来替换您的条件的答案。

小改动:

是否input_sequence[seq_index - 1] == 'T' || input_sequence[seq_index - 1] == 't'优化为仅读取一次内存?

通过仅删除一次使用的变量来保存寄存器threadId

于 2012-07-10T13:19:36.990 回答