I am trying to implement a parallel merging algorithm in CUDA. The algorithm is designed to be executed in a single thread block. Its basic idea is to compute the global rank of each element in two input sequences: since both sequences are sorted, an element's global rank is equal to its index in its original sequence plus its rank in the other sequence, which is computed by binary search. I think the best strategy for implementing such an algorithm is to load the two sequences into shared memory to reduce global memory reads. However, when I compared two versions of the implementation, one using shared memory and one without, I saw no performance improvement. I wonder if I am doing something wrong.
Hardware: GeForce GTX 285, Linux x86_64. The time to merge two sequences of 1024 elements is about 0.068672 ms for both implementations.
// Merge two sorted int sequences A (length numA) and B (length numB) into
// C (length numA + numB), within a single thread block.
//
// Each output position is computed directly: an element's final index in C
// is its index in its own sequence plus the number of elements of the OTHER
// sequence that precede it, found by binary search over shared memory.
// Ties are broken so A's elements land before equal B elements, which makes
// every computed output index unique (no write conflicts in C).
//
// Launch contract:
//   - exactly one block (<<<1, T, smem>>>), any thread count T;
//   - dynamic shared memory of (numA + numB) * sizeof(int) bytes.
//
// Safe for numA == 0 and/or numB == 0: the half-open binary searches never
// dereference an element of an empty sequence (the original edge-case
// branches read tempB[0]/tempB[numB-1] etc. unconditionally and would fault).
__global__ void localMerge(int * A, int numA, int * B, int numB, int * C) {
    extern __shared__ int temp[];   // staging: copy of A, then copy of B
    int *tempA = temp;
    int *tempB = temp + numA;
    const int tx = threadIdx.x;
    const int stride = blockDim.x;  // block-stride loops: works for any T

    // Stage both inputs in shared memory so the O(log n) search probes
    // below hit fast on-chip memory instead of global memory.
    for (int i = tx; i < numA; i += stride) tempA[i] = A[i];
    for (int i = tx; i < numB; i += stride) tempB[i] = B[i];
    __syncthreads();  // staging must be complete before any thread searches

    // Rank of A[i] = i + (count of B elements strictly less than A[i]).
    // Strict '<' places A before equal B elements (lower-bound search).
    // The half-open range [0, numB] subsumes both boundary cases
    // (count == 0 and count == numB) with no special-case branches.
    for (int i = tx; i < numA; i += stride) {
        int lo = 0, hi = numB;
        while (lo < hi) {
            int mid = (lo + hi) / 2;  // lo+hi <= numB fits in int; no overflow here
            if (tempB[mid] < tempA[i]) lo = mid + 1;
            else                       hi = mid;
        }
        C[i + lo] = tempA[i];
    }

    // Rank of B[i] = i + (count of A elements less than OR EQUAL to B[i]).
    // '<=' places B after equal A elements (upper-bound search), the exact
    // complement of the rule above, so ties never collide in C.
    for (int i = tx; i < numB; i += stride) {
        int lo = 0, hi = numA;
        while (lo < hi) {
            int mid = (lo + hi) / 2;
            if (tempA[mid] <= tempB[i]) lo = mid + 1;
            else                        hi = mid;
        }
        C[i + lo] = tempB[i];
    }
}