
I wrote two separate programs for the same computation, one for the CPU (C++) and one in CUDA, and I don't know why the CUDA code is slower than the CPU code.
I have three matrices H, E, and F, and the operations are performed on them. The runtime of the CPU code is 0.004 s and that of the CUDA code is 0.006 s, where the matrix dimensions are 32*32. In the kernel I defined three shared memory arrays, matrix_H, matrix_E, and matrix_F, and copied the dev_H, dev_E, and dev_F values from global memory into shared memory to speed up memory access; finally the computed shared memory values are copied back to global memory.
Is this because of the large number of parameters in the kernel call, or is the problem elsewhere?

__global__ void kernel_ScoreMatrix(char *dev_seqA, char *dev_seqB, 
   int *dev_H, int *dev_E, int *dev_F, int *dev_i_side, int *dev_j_side,
   int *dev_lenA, int *dev_idx_array, int *dev_array_length)
{
   __shared__ int matrix_H[1024];
   __shared__ int matrix_E[1024];
   __shared__ int matrix_F[1024];

   int x= threadIdx.x;
   int y= threadIdx.y;

   //calculate the current_cell handled by this thread
   int current_cell = *(dev_lenA)*(y) + x;

   matrix_H[current_cell]=dev_H[current_cell];
   matrix_E[current_cell]=dev_E[current_cell];
   matrix_F[current_cell]=dev_F[current_cell];

   int index=0;

   int scoreMatrix[4];

   //determine the cells that must be computed in this step
   for (int i = 0; i < *(dev_array_length); i++)
      if (current_cell == dev_idx_array[i]) {
         scoreMatrix[0] = H_Matrix(current_cell, x, y, matrix_H, dev_seqA, dev_seqB, dev_lenA);
         scoreMatrix[1] = E_Matrix(current_cell, matrix_E, matrix_H, dev_lenA);
         scoreMatrix[2] = F_Matrix(current_cell, matrix_F, matrix_H, dev_lenA);
         scoreMatrix[3] = 0;
         dev_H[current_cell] = findMax(scoreMatrix, 4, index);
      }
}

In the main function:

dim3 threadsPerBlock(32, 32);
kernel_ScoreMatrix<<<1,threadsPerBlock>>>(dev_seqA, dev_seqB, dev_H, dev_E, dev_F, 
        dev_i_side, dev_j_side, dev_lenA, dev_idx_array, dev_array_length);
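
The question does not show how the 0.006 s figure was measured. For reference, a minimal sketch of timing the kernel with CUDA events, using the launch configuration from the question (kernelTime_ms is a placeholder name, not from the original code); kernel launches are asynchronous, so a synchronization point is needed before reading the timer:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
kernel_ScoreMatrix<<<1, threadsPerBlock>>>(dev_seqA, dev_seqB, dev_H, dev_E, dev_F,
        dev_i_side, dev_j_side, dev_lenA, dev_idx_array, dev_array_length);
cudaEventRecord(stop);
cudaEventSynchronize(stop);   // wait for the kernel to finish before reading the timer

float kernelTime_ms = 0.0f;
cudaEventElapsedTime(&kernelTime_ms, start, stop);

cudaEventDestroy(start);
cudaEventDestroy(stop);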

1 Answer


By definition, a threadblock executes on a single SM. Therefore, no matter how many threads the threadblock contains, the only execution resources available to execute that particular threadblock are the ones in that (single) SM. Since nearly all NVIDIA GPUs contain more than one SM, in order to keep the GPU busy (which is necessary to get the highest performance), it is necessary to launch grids with more than one threadblock. A reasonable rule of thumb is to have at least 2-4 times as many threadblocks as SMs, and having more threadblocks than that usually does no harm.

But if you launch a kernel with only 1 threadblock, you are limited to 1 SM. As a result, you get roughly 1/(number of SMs in your GPU) of the available performance of the machine. The number of threads in that threadblock does not affect this factor.
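
To illustrate the rule of thumb, here is a minimal sketch (the kernel, buffer names, and problem size are hypothetical, not the asker's code) that queries the SM count and launches a grid with enough blocks to keep every SM busy:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void scale(int *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)              // guard: surplus threads simply do nothing
        data[i] *= 2;
}

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    const int n = 1 << 20;
    int *d_data;
    cudaMalloc(&d_data, n * sizeof(int));

    const int threadsPerBlock = 256;
    const int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;

    // rule of thumb from above: aim for at least 2-4 blocks per SM
    printf("GPU has %d SMs; launching %d blocks (suggested minimum: %d)\n",
           prop.multiProcessorCount, blocks, 2 * prop.multiProcessorCount);

    scale<<<blocks, threadsPerBlock>>>(d_data, n);
    cudaDeviceSynchronize();

    cudaFree(d_data);
    return 0;
}

With a single 32x32 block, as in the question, the grid cannot spread across SMs no matter how the threads inside the block are arranged.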

answered 2013-05-14 at 22:09