我有几次执行 CUDA 内核的问题。我的代码中的环境有问题。第一次代码正常工作,第二次在第三次调用之前清理环境期间出现随机崩溃。我认为由于某种原因我有内存损坏。有时会在 CUDA 驱动程序中发生崩溃,有时会发生简单的 printf 崩溃或便宜的 kernel32.dll。我想我的代码中的内存管理有问题。
在再次执行内核之前应该做什么?
此代码在我执行一次时有效。我正在使用 CURAND 来初始化随机生成器。这是我的代码:
#define GRID_BLOCK 64
#define GRID_THREAD 8
#define CITIES 100
#define CIPOW2 101
int lenghtPaths = GRID_BLOCK*GRID_THREAD;
int cities = CITIES;
//prepare CURAND
curandState *devStates;
CUDA_CALL(cudaMalloc((void **)&devStates, GRID_BLOCK*GRID_THREAD*sizeof(curandState)));
/* Setup prng states */
setup_kernel<<<GRID_BLOCK ,GRID_THREAD>>>(devStates);
CUDA_CALL(cudaDeviceSynchronize());
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
fprintf(stderr, "CURAND preparation failed: %s\n", cudaGetErrorString(cudaStatus));
//copy distance grid to constant memory
cudaMemcpyToSymbol(cdist, dist, sizeof(int) *CIPOW2*CIPOW2);
CUDA_CALL(cudaMalloc((void**)&dev_pathsForThreads, lenghtPaths * cities * sizeof(int)));
CUDA_CALL(cudaMalloc((void**)&d_results, GRID_BLOCK*GRID_THREAD * sizeof(int)));
for (int k = 0; k < 5; k++){
int* pathsForThreads;
pathsForThreads = (int*)malloc(lenghtPaths * cities * sizeof(int));
pathsForThreads = PreaparePaths(Path, lenghtPaths, cities);
CUDA_CALL(cudaMemcpy(dev_pathsForThreads, pathsForThreads, lenghtPaths *cities*sizeof(int), cudaMemcpyHostToDevice));
GPUAnnealing<<<GRID_BLOCK ,GRID_THREAD >>>(dev_pathsForThreads, devStates, iterationLimit,temperature, coolingRate, absoluteTemperature, cities,d_results);
CUDA_CALL(cudaDeviceSynchronize());
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
fprintf(stderr, "GPUAnnealing launch failed: %s\n", cudaGetErrorString(cudaStatus));
h_results = (int*) malloc(GRID_BLOCK*GRID_THREAD * sizeof(int));
//Copy lenght of each path to CPU
CUDA_CALL(cudaMemcpy(h_results, d_results, GRID_BLOCK*GRID_THREAD * sizeof(int),cudaMemcpyDeviceToHost));
//Copy paths to CPU
CUDA_CALL(cudaMemcpy(pathsForThreads, dev_pathsForThreads, lenghtPaths *cities*sizeof(int), cudaMemcpyDeviceToHost));
//check the shortest path
shortestPath = FindTheShortestPath(h_results);
fprintf (stdout, "Shortest path on index = %d value = %d \n", shortestPath, h_results[shortestPath]);
for (int i = 0; i < GRID_BLOCK*GRID_BLOCK ; i++)
Path[i] = pathsForThreads[shortestPath*CITIES +i];
free(pathsForThreads);
free(h_results);
}
CUDA_CALL(cudaFree(dev_pathsForThreads));
CUDA_CALL(cudaFree(d_results));
CUDA_CALL(cudaFree(devStates));
CUDA_CALL(cudaDeviceReset());