I have written two separate implementations of the same program, one for the CPU (C++) and one in CUDA, and I don't understand why the CUDA code runs slower than the CPU code.
I have three matrices, H, E, and F, and the operations are performed on them. The run time of the CPU code is 0.004 s and of the CUDA code 0.006 s, with 32*32 matrices. In the kernel I defined three shared-memory arrays, matrix_H, matrix_E, and matrix_F, copied the values of dev_H, dev_E, and dev_F from global memory into shared memory to speed up memory access, and finally wrote the computed shared-memory values back to global memory.
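For reference, the kernel time on its own could be measured with CUDA events wrapped around the launch shown at the bottom, something like this (the standard pattern, not my actual timing code):

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
kernel_ScoreMatrix<<<1, threadsPerBlock>>>(dev_seqA, dev_seqB, dev_H, dev_E, dev_F,
    dev_i_side, dev_j_side, dev_lenA, dev_idx_array, dev_array_length);
cudaEventRecord(stop);

cudaEventSynchronize(stop);
float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, start, stop); // elapsed kernel time in milliseconds

cudaEventDestroy(start);
cudaEventDestroy(stop);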
Is this because of the large number of parameters in the kernel call, or does the problem lie elsewhere?
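If the argument count were the issue, the pointers could be packed into a single struct passed by value, for example (KernelArgs and kernel_ScoreMatrix_packed are just illustrative names, not something in my code):

// Hypothetical: bundle the ten kernel arguments into one struct passed
// by value, so the launch carries a single parameter.
struct KernelArgs {
    char *seqA, *seqB;
    int *H, *E, *F;
    int *i_side, *j_side;
    int *lenA, *idx_array, *array_length;
};

__global__ void kernel_ScoreMatrix_packed(KernelArgs a)
{
    int x = threadIdx.x;
    int y = threadIdx.y;
    // same body as kernel_ScoreMatrix below, reading a.H, a.E, a.F
    // instead of the separate pointer arguments
    int current_cell = *(a.lenA) * y + x;
    a.H[current_cell] += 0; // placeholder; the real computation goes here
}

The kernel code: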
__global__ void kernel_ScoreMatrix(char *dev_seqA, char *dev_seqB,
    int *dev_H, int *dev_E, int *dev_F, int *dev_i_side, int *dev_j_side,
    int *dev_lenA, int *dev_idx_array, int *dev_array_length)
{
    __shared__ int matrix_H[1024];
    __shared__ int matrix_E[1024];
    __shared__ int matrix_F[1024];

    int x = threadIdx.x;
    int y = threadIdx.y;

    // calculate the cell this thread works on
    int current_cell = *(dev_lenA) * y + x;

    // stage the matrices from global into shared memory
    matrix_H[current_cell] = dev_H[current_cell];
    matrix_E[current_cell] = dev_E[current_cell];
    matrix_F[current_cell] = dev_F[current_cell];
    __syncthreads(); // make the staged values visible to the whole block

    int index = 0;
    int scoreMatrix[4];

    // determine which cells must be computed in this pass
    for (int i = 0; i < *(dev_array_length); i++)
        if (current_cell == dev_idx_array[i]) {
            scoreMatrix[0] = H_Matrix(current_cell, x, y, matrix_H, dev_seqA, dev_seqB, dev_lenA);
            scoreMatrix[1] = E_Matrix(current_cell, matrix_E, matrix_H, dev_lenA);
            scoreMatrix[2] = F_Matrix(current_cell, matrix_F, matrix_H, dev_lenA);
            scoreMatrix[3] = 0;
            dev_H[current_cell] = findMax(scoreMatrix, 4, index);
        }
}
In the main function:
dim3 threadsPerBlock(32, 32);
kernel_ScoreMatrix<<<1,threadsPerBlock>>>(dev_seqA, dev_seqB, dev_H, dev_E, dev_F,
dev_i_side, dev_j_side, dev_lenA, dev_idx_array, dev_array_length);
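The device buffers are allocated and copied up front in the usual way; a simplified sketch for dev_H alone (host_H is a placeholder name, and this is not my verbatim setup code):

const int cells = 32 * 32;
int host_H[cells] = {0}; // placeholder host data

int *dev_H = NULL;
cudaMalloc((void**)&dev_H, cells * sizeof(int));
cudaMemcpy(dev_H, host_H, cells * sizeof(int), cudaMemcpyHostToDevice);

// dev_E, dev_F, dev_idx_array, etc. are prepared the same way,
// then the kernel is launched as shown above

cudaMemcpy(host_H, dev_H, cells * sizeof(int), cudaMemcpyDeviceToHost); // copy result back
cudaFree(dev_H);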