cuda - CUDA 分支优化

Question

我有一个使用了几个 CUDA 内核的程序，其中这个内核需要 50-100 毫秒才能运行，而其他内核只需要 0-5 毫秒。我猜这与其中大量的分支有关，但我不确定如何减少分支。我正在为计算能力 2.1 的设备进行编译。如果有人能为我指出正确的方向，那就太好了。

// Number of sequence positions covered by one thread block; the kernel
// multiplies the linear block index by this value to find the block's first
// position. Chosen using the occupancy spreadsheet; it equals the (32, 14)
// block shape the kernel is launched with (32 * 14 = 448).
#define SCORE_THREADS_PER_BLOCK 448

// Reads one element of a pitched 2-D matrix of doubles.
//   scoring_matrix  base address of the matrix
//   pitch           distance between consecutive rows, in bytes
//   row, column     element to read; column counts doubles within the row
__device__ double ScoringMatrixVal(double *scoring_matrix, size_t pitch, unsigned int row, unsigned int column) {
  // Rows are `pitch` bytes apart, so step to the row in bytes first ...
  char *row_start = (char*) scoring_matrix + row * pitch;
  // ... then index within the row in units of double.
  double *row_values = (double*) row_start;
  return row_values[column];
}

// Scores, for every sequence position, the window of rs_len bases starting at
// that position against rvd_sequence, and records the outcome as bits in
// results[seq_index].
//
// Each thread handles one position: seq_index is the flattened 2-D block index
// times SCORE_THREADS_PER_BLOCK plus the flattened 2-D thread index.
//
//   input_sequence  sequence characters; upper- and lower-case A/C/G/T are
//                   recognised, anything else maps to matrix column 4
//   is_length       length used for the bounds checks below
//   rvd_sequence    rs_len row indices into the scoring matrix
//   rs_len          number of bases in the scored window
//   cutoff          a window "hits" when its summed score is below this value
//   rvd_num         selects which bit pair of results[seq_index] is written
//   scoring_matrix  pitched matrix of doubles, read through ScoringMatrixVal
//   sm_pitch        row pitch of scoring_matrix in bytes
//   results         one byte per sequence position; bits are OR-ed in
//
// NOTE(review): the index variables are int while is_length is unsigned long;
// positions beyond INT_MAX would not be representable — confirm the maximum
// sequence length with the caller.
__global__ void ScoreBindingSites(char *input_sequence, unsigned long is_length, unsigned int *rvd_sequence, unsigned int rs_len, double cutoff, unsigned int rvd_num, double *scoring_matrix, size_t sm_pitch, unsigned char *results) {

  // Flatten the 2-D grid and 2-D block into a single sequence position.
  int block_seq_index = SCORE_THREADS_PER_BLOCK * (blockIdx.y * gridDim.x + blockIdx.x);
  int thread_id = (blockDim.x * threadIdx.y) + threadIdx.x;
  int seq_index = block_seq_index + thread_id;

  // Skip position 0 (the code below reads seq_index - 1) and any position whose
  // window, plus the base just after it, would not fit inside the sequence.
  if (seq_index < 1 || seq_index >= is_length || seq_index + rs_len >= is_length - 1) return;

  // First orientation: only scored when the base just before the window is T/t.
  if (input_sequence[seq_index - 1] == 'T' || input_sequence[seq_index - 1] == 't') {

    double thread_result = 0;

    for (int i = 0; i < rs_len; i++) {

      // Window position i is scored against RVD i.
      int rvd_index = i;

      // Column 4 is used for any character that is not A/C/G/T.
      int sm_col = 4;

      char base = input_sequence[seq_index + i];

      // Column mapping: A -> 0, C -> 1, G -> 2, T -> 3.
      if (base == 'A' || base == 'a')    
        sm_col = 0;
      if (base == 'C' || base == 'c')
        sm_col = 1;
      if (base == 'G' || base == 'g')
        sm_col = 2;
      if (base == 'T' || base == 't')
        sm_col = 3;

      thread_result += ScoringMatrixVal(scoring_matrix, sm_pitch, rvd_sequence[rvd_index], sm_col);

    }

    // Set bit 2 * rvd_num when the summed score is below the cutoff.
    // NOTE(review): results holds unsigned char, so only shift amounts below 8
    // survive the store — presumably rvd_num stays in 0..3; confirm with the caller.
    results[seq_index] |= (thread_result < cutoff ? 1UL : 0UL) << (2 * rvd_num);

  } 

  // Second orientation: only scored when the base just after the window is A/a.
  // The RVDs are applied in reverse order and the column mapping is mirrored —
  // presumably this scores the reverse-complement strand; confirm.
  if (input_sequence[seq_index + rs_len] == 'A' || input_sequence[seq_index + rs_len] == 'a') {

    double thread_result = 0;

    for (int i = 0; i < rs_len; i++) {

      // Window position i is scored against the RVDs taken back to front.
      int rvd_index = rs_len - i - 1;

      // Column 4 is used for any character that is not A/C/G/T.
      int sm_col = 4;

      char base = input_sequence[seq_index + i];

      // Mirrored column mapping: A -> 3, C -> 2, G -> 1, T -> 0.
      if (base == 'A' || base == 'a')    
        sm_col = 3;
      if (base == 'C' || base == 'c')
        sm_col = 2;
      if (base == 'G' || base == 'g')
        sm_col = 1;
      if (base == 'T' || base == 't')
        sm_col = 0;

      thread_result += ScoringMatrixVal(scoring_matrix, sm_pitch, rvd_sequence[rvd_index], sm_col);

    }

    // Set bit 2 * rvd_num + 1 when the summed score is below the cutoff.
    results[seq_index] |= (thread_result < cutoff ? 1UL : 0UL) << (2 * rvd_num + 1);

  }

}

ScoreBindingSites 启动时每个块有 (32, 14) 个线程，并且有足够的块来覆盖输入序列。如果有帮助，可以在此处找到完整的源代码。

score 1 · Accepted Answer

您可以做一些事情来改进此代码：

如上所述，将针对 'T' 和 'A' 的两个循环合并为一个。这可能是分支分歧的最大来源，因为循环内那一小串级联的 if 语句很可能会被编译为谓词指令（请参阅 NVIDIA CUDA C 编程指南的第 5.4.2 节）。
字节大小的全局内存访问是个糟糕的主意。我建议改为将 input_sequence、results 和 base 声明为 char4，并在主循环的每次迭代中，分别对 base.x、base.y、base.z 和 base.w 的每个值执行相应的处理。
您可能还想仔细看看 ScoringMatrixVal 到底在做什么。它只是从内存中读取值吗？如果是这样，您能否改用常量内存来存放这些数据？或者用纹理内存？

更新

根据要求，下面就是我所说的第二点的具体含义。不过，我还没有测试过这段代码，所以请随时保留您发现的任何错误或拼写错误。请注意，为简单起见，我假设 rs_len 是四的倍数。

// Number of sequence positions covered by one thread block; the kernel
// multiplies the linear block index by this value to find the block's first
// position. Chosen using the occupancy spreadsheet; it equals the (32, 14)
// block shape the kernel is launched with (32 * 14 = 448).
#define SCORE_THREADS_PER_BLOCK 448

// Returns the element at (row, column) of a matrix of doubles whose rows are
// `pitch` bytes apart, using plain array indexing in units of double.
// Assumes pitch is a multiple of sizeof(double) — TODO confirm against the
// allocation; otherwise the integer division below truncates.
__device__ double ScoringMatrixVal(double *scoring_matrix, size_t pitch, unsigned int row, unsigned int column) {
  // Byte offset of the row, converted to a count of doubles, plus the column.
  const size_t element_offset = row * pitch / sizeof(double) + column;
  return scoring_matrix[element_offset];
}

// Variant of ScoreBindingSites that handles four bases per loop iteration and
// keeps four independent partial sums (one per char4 component), which are
// added together before the cutoff test.
//
// Each thread handles one sequence position: seq_index is the flattened 2-D
// block index times SCORE_THREADS_PER_BLOCK plus the flattened 2-D thread index.
//
// Precondition: rs_len is a multiple of 4. The loop runs rs_len / 4 times, so a
// remainder would be silently left unscored.
//
//   input_sequence  sequence characters, typed as char4 but addressed by byte
//                   position throughout (see sequence_bytes below)
//   is_length       length, in bases, used for the bounds checks
//   rvd_sequence    rs_len row indices into the scoring matrix
//   rs_len          number of bases in the scored window
//   cutoff          a window "hits" when its summed score is below this value
//   rvd_num         selects which bit pair of results[seq_index] is written
//   scoring_matrix  pitched matrix of doubles, read through ScoringMatrixVal
//   sm_pitch        row pitch of scoring_matrix in bytes
//   results         one byte per sequence position; bits are OR-ed in
__global__ void ScoreBindingSites(char4 *input_sequence, unsigned long is_length, unsigned int *rvd_sequence, unsigned int rs_len, double cutoff, unsigned int rvd_num, double *scoring_matrix, size_t sm_pitch, unsigned char *results) {

  // Flatten the 2-D grid and 2-D block into a single sequence position.
  int block_seq_index = SCORE_THREADS_PER_BLOCK * (blockIdx.y * gridDim.x + blockIdx.x);
  int thread_id = (blockDim.x * threadIdx.y) + threadIdx.x;
  int seq_index = block_seq_index + thread_id;

  // Skip position 0 (the code below reads seq_index - 1) and any position whose
  // window, plus the base just after it, would not fit inside the sequence.
  if (seq_index < 1 || seq_index >= is_length || seq_index + rs_len >= is_length - 1) return;

  // seq_index counts bases (bytes), not char4 elements, so every read of the
  // sequence goes through this byte view. Indexing the char4 pointer with a
  // byte position would read four times too far into the buffer.
  const char *sequence_bytes = (const char*) input_sequence;

  // First orientation: only scored when the base just before the window is T/t.
  if (sequence_bytes[seq_index - 1] == 'T' || sequence_bytes[seq_index - 1] == 't') {

    // One partial sum per char4 component; the make_* helpers take one
    // argument per component.
    double4 thread_result = make_double4( 0.0, 0.0, 0.0, 0.0 );

    for (int i = 0; i < rs_len/4; i++) {

      int rvd_index = 4*i;

      // Column 4 is used for any character that is not A/C/G/T.
      int4 sm_col = make_int4( 4, 4, 4, 4 );

      // Gather the four bases of this step byte by byte. A single char4 load
      // needs a 4-byte-aligned address, and seq_index + 4*i is not aligned for
      // most threads because each thread starts at a different byte.
      int window_pos = seq_index + 4*i;
      char4 base = make_char4( sequence_bytes[window_pos + 0],
                               sequence_bytes[window_pos + 1],
                               sequence_bytes[window_pos + 2],
                               sequence_bytes[window_pos + 3] );

      if (base.x == 'A' || base.x == 'a')
        sm_col.x = 0;
      else if (base.x == 'C' || base.x == 'c')
        sm_col.x = 1;
      else if (base.x == 'G' || base.x == 'g')
        sm_col.x = 2;
      else if (base.x == 'T' || base.x == 't')
        sm_col.x = 3;
      thread_result.x += ScoringMatrixVal(scoring_matrix, sm_pitch, rvd_sequence[rvd_index + 0], sm_col.x);

      if (base.y == 'A' || base.y == 'a')
        sm_col.y = 0;
      else if (base.y == 'C' || base.y == 'c')
        sm_col.y = 1;
      else if (base.y == 'G' || base.y == 'g')
        sm_col.y = 2;
      else if (base.y == 'T' || base.y == 't')
        sm_col.y = 3;
      thread_result.y += ScoringMatrixVal(scoring_matrix, sm_pitch, rvd_sequence[rvd_index + 1], sm_col.y);

      if (base.z == 'A' || base.z == 'a')
        sm_col.z = 0;
      else if (base.z == 'C' || base.z == 'c')
        sm_col.z = 1;
      else if (base.z == 'G' || base.z == 'g')
        sm_col.z = 2;
      else if (base.z == 'T' || base.z == 't')
        sm_col.z = 3;
      thread_result.z += ScoringMatrixVal(scoring_matrix, sm_pitch, rvd_sequence[rvd_index + 2], sm_col.z);

      if (base.w == 'A' || base.w == 'a')
        sm_col.w = 0;
      else if (base.w == 'C' || base.w == 'c')
        sm_col.w = 1;
      else if (base.w == 'G' || base.w == 'g')
        sm_col.w = 2;
      else if (base.w == 'T' || base.w == 't')
        sm_col.w = 3;
      thread_result.w += ScoringMatrixVal(scoring_matrix, sm_pitch, rvd_sequence[rvd_index + 3], sm_col.w);

    }

    // Combine the four partial sums into the score of the whole window.
    double acc_thread_result = thread_result.x + thread_result.y + thread_result.z + thread_result.w;

    // Set bit 2 * rvd_num when the summed score is below the cutoff.
    results[seq_index] |= (acc_thread_result < cutoff ? 1UL : 0UL) << (2 * rvd_num);

  }

  // Second orientation: only scored when the base just after the window is A/a.
  if (sequence_bytes[seq_index + rs_len] == 'A' || sequence_bytes[seq_index + rs_len] == 'a') {

    // Body elided in the original sketch; presumably the counterpart of the
    // block above with the RVD order reversed and the column mapping mirrored,
    // setting bit 2 * rvd_num + 1 — to be filled in.
    ...

  }

}

几点注意事项：

我已经重写了（希望是正确的）您的函数 ScoringMatrixVal，改为使用常规的数组访问，因为那一整套混乱的指针运算可能会让编译器无所适从，妨碍其优化。
我已将您的 if 语句转换为级联的 if-else if 语句，因为这些条件似乎是互斥的。我猜编译器会使用谓词指令，并将这四个 if-else if 块交错执行。
您可以考虑用一个 char[256] 查找表来替换所有这些判断：该表除了在 A、a、C、c 等字符代码对应的位置之外，其余位置的值均为 4。
如果您将 if-else if 语句转换为查表，就可以针对 input_sequence[seq_index - 1] == 'T' 和 input_sequence[seq_index + rs_len] == 'A' 这两种情况使用两张不同的表，从而把所有处理都放在同一个循环中。

希望我没有把代码改得太乱，也希望这些能有所帮助！

score 0 · Accepted Answer

据我在您的内核中了解，每个线程最多读取 32 个字符并检查每个字符并输出一些数据。

您可以通过使用不同的块方法和不同的索引（如果在您的情况下可能的话）隐式模拟它来完全删除循环。

每个块有 32 个线程，每个线程计算一次循环迭代的结果。

我不知道它是否更快但值得测试。

Pedro 的回答中用查表来替换条件判断的建议绝对值得考虑。

小改动：

input_sequence[seq_index - 1] == 'T' || input_sequence[seq_index - 1] == 't' 是否会被优化为只读取一次内存？

去掉只使用了一次的变量 thread_id，可以节省一个寄存器。

cuda - CUDA 分支优化

2 回答 2

Related

Reference