我写了一个简单的代码来理解动态并行。从打印的值中,我看到子内核已正确执行,但是当我回到父内核时,我看到使用了错误的值来代替在子内核中正确更新的临时数组。当我尝试更新“d_cin 数组”时,它给了我错误的值。这些是正在使用的编译标志:
nvcc -m64 -dc -gencode arch=compute_35,code=sm_35 -I/opt/apps/cuda/5.5/include -I. -I.. -I../../common/inc -o simple.o -c simple.cu
nvcc -m64 -gencode arch=compute_35,code=sm_35 -o simple simple.o -L/opt/apps/cuda/5.5/lib64 -lcudadevrt
有人能帮我吗 ?这是代码。
#include <stdio.h>
#include "cuPrintf.cu"
#include "cuPrintf.cuh"
__global__ void innerKernel(double *I,double *d_temp,int parentIndex){
int index=threadIdx.x+blockIdx.x*blockDim.x;
d_temp[parentIndex*3+index]=I[parentIndex];
}
__global__ void kernel(double *d_I,double *d_temp,double *d_cin){
int index=threadIdx.x+blockIdx.x*blockDim.x;
int i;
double res=0.0;
if(index<30){
cudaStream_t s;
cudaStreamCreateWithFlags( &s, cudaStreamNonBlocking );
dim3 dimBlock(3,1,1);
dim3 dimGrid(1,1,1);
innerKernel<<<dimGrid,dimBlock>>>(d_I,d_temp,index);
__syncthreads();
if(index==0){
for(i=0;i<90;i++)
cuPrintf("temp[%d]: %f\n",i,d_temp[i]);
}
for (i=0;i<3;i++){
res=res+d_temp[index*3+i];
}
__syncthreads();
d_cin[index]=res;
cudaStreamDestroy(s);
}
}
int main(int argc,char **argv){
double I[30]={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30};
double *d_I;
double *d_temp;
double *d_cin;
double cout[30];
cudaMalloc(&d_I,30*sizeof(double));
cudaMemcpy(d_I,I,30*sizeof(double),cudaMemcpyHostToDevice);
cudaMalloc(&d_temp,3*30*sizeof(double));
cudaMalloc(&d_cin,30*sizeof(double));
dim3 dimBlock(8,1,1);
dim3 dimGrid(4,1,1);
/*LAUNCH THE KERNEL*/
printf("Before the kernel\n");
cudaPrintfInit();
kernel<<<dimGrid,dimBlock>>>(d_I,d_temp,d_cin);
//cudaThreadSynchronize();
cudaPrintfDisplay(stdout,true);
cudaPrintfEnd();
printf("After the kernel\n");
cudaMemcpy(cout,d_cin,30*sizeof(double),cudaMemcpyDeviceToHost);
int i;
for(i=0;i<30;i++)
printf("%f\n",cout[i]);
}