终于启动并运行了动态并行,我现在正尝试用它来实现我的模型。我花了一段时间才发现需要使用 cudaDeviceSynchronize() 使父内核等待子内核完成导致了一些奇怪的输出。
我定义为 arrAdd 的设备函数似乎有问题。这是 k2 父内核中每个子内核之前和之后的输出表。
Initially : k1 = { -1 0 0 0 0 }
Post arrInit : temp = { .25 .25 .25 .25 .25}
Post arrMult : temp = {-.25 0 0 0 0 }
post arrAdd : temp = { -8 0 0 0 0 }
Expected : temp = {-.50 0 0 0 0 }
__global__ void k2(double* concs, int* maxlength, double* k1s, double* k2s, double * temp, double* tempsum)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
double a21 = .25;
arrInit<<< 1, *maxlength >>>(temp, a21); //temp = a21
cudaDeviceSynchronize();
arrMult<<< 1, *maxlength >>>(k1s, temp, temp); //temp = a21*k1
cudaDeviceSynchronize();
arrAdd<<< 1, *maxlength >>>(temp, temp, temp); //temp = 2*a21*k1
cudaDeviceSynchronize();
}
__global__ void arrAdd(double* a, double* b, double* c)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
c[idx]=a[idx]+b[idx];
}
__global__ void arrMult(double* a, double* b, double* c)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
c[idx]=a[idx]*b[idx];
}
__global__ void arrInit(double* a, double b)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
a[idx]=b;
}