在 CUDA 内核(设备端)中调用 CUB 的设备级函数之后,是否需要调用 cudaDeviceSynchronize?从设备端使用 DeviceReduce::Sum() 时,据称存在隐式的内存拷贝,会阻塞设备使其无法继续执行;但在 GPU 上调用以下代码之后,我遇到了一些不稳定的情况:
/**
 * Device-side helper (uses dynamic parallelism): launches arrInitToLengths on
 * `lengths`, then arrMult on (fluxes, lengths, lengths), and finally reduces
 * `lengths` with cub::DeviceReduce::Sum, writing the aggregate back through
 * the `lengths` pointer (i.e. into lengths[0]).
 *
 * @param fluxes   Device array passed as the first operand to arrMult.
 * @param lengths  Device array that is initialised, passed to arrMult as both
 *                 second and third argument (presumably multiplied in place --
 *                 TODO confirm against arrMult), and used as both input and
 *                 output of the reduction.
 * @param dt       Unused by this function; kept for interface compatibility.
 *
 * Relies on numBlocks, numThreads and maxlength being defined elsewhere.
 * Original author's note on the signature: "temp2 temp1".
 *
 * If the temp-storage size query or the device-side allocation fails, the
 * function returns early and `lengths` is left un-reduced.
 *
 * NOTE(review): the reduction reads from and writes to the same buffer
 * (d_in == d_out == lengths). Confirm against the CUB documentation for the
 * version in use that overlapping input/output is permitted; otherwise reduce
 * into a separate one-element buffer.
 */
__device__ void calcMonomerFlux(double* fluxes, double* lengths, double* dt) //temp2 temp1
{
    void*  d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;

    arrInitToLengths<<< numBlocks, numThreads >>>(lengths);
    cudaDeviceSynchronize();   // arrMult consumes the output of arrInitToLengths
    arrMult<<< numBlocks, numThreads >>>(fluxes, lengths, lengths);
    cudaDeviceSynchronize();   // the reduction consumes the output of arrMult

    // First call with a NULL temp pointer only queries the scratch size;
    // no reduction work is performed.
    if (cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                               lengths, lengths, maxlength) != cudaSuccess)
    {
        return;
    }

    // Device-side cudaMalloc draws from the (limited) device heap and can fail.
    // A NULL pointer must not reach the second Sum call: CUB would treat it as
    // another size query and silently skip the reduction.
    if (cudaMalloc(&d_temp_storage, temp_storage_bytes) != cudaSuccess ||
        d_temp_storage == NULL)
    {
        return;
    }

    // Second call launches the actual reduction as child kernel(s); the launch
    // is asynchronous with respect to this thread.
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                           lengths, lengths, maxlength);

    // Required: wait for the child kernels before releasing their scratch
    // buffer (otherwise use-after-free) and before the caller reads the result.
    cudaDeviceSynchronize();

    cudaFree(d_temp_storage);
}