cuda - 多次执行 CUDA 内核

Question

我有几次执行 CUDA 内核的问题。我的代码中的环境有问题。第一次代码正常工作，第二次在第三次调用之前清理环境期间出现随机崩溃。我认为由于某种原因我有内存损坏。有时会在 CUDA 驱动程序中发生崩溃，有时会发生简单的 printf 崩溃或便宜的 kernel32.dll。我想我的代码中的内存管理有问题。

在再次执行内核之前应该做什么？

此代码在我执行一次时有效。我正在使用 CURAND 来初始化随机生成器。这是我的代码：

    #define GRID_BLOCK 64
    #define GRID_THREAD 8
    #define CITIES 100
    #define CIPOW2 101
    int lenghtPaths = GRID_BLOCK*GRID_THREAD;
    int cities = CITIES; 
    //prepare CURAND 
    curandState *devStates;
    CUDA_CALL(cudaMalloc((void **)&devStates, GRID_BLOCK*GRID_THREAD*sizeof(curandState)));
    /* Setup prng states */
    setup_kernel<<<GRID_BLOCK ,GRID_THREAD>>>(devStates);
    CUDA_CALL(cudaDeviceSynchronize());
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) 
        fprintf(stderr, "CURAND preparation failed: %s\n", cudaGetErrorString(cudaStatus));
    //copy distance grid to constant memory 
    cudaMemcpyToSymbol(cdist, dist, sizeof(int) *CIPOW2*CIPOW2);
    CUDA_CALL(cudaMalloc((void**)&dev_pathsForThreads, lenghtPaths * cities * sizeof(int)));
    CUDA_CALL(cudaMalloc((void**)&d_results, GRID_BLOCK*GRID_THREAD * sizeof(int)));
    for (int k = 0; k < 5; k++){
        int* pathsForThreads;
        pathsForThreads = (int*)malloc(lenghtPaths * cities * sizeof(int));
        pathsForThreads = PreaparePaths(Path, lenghtPaths, cities);
        CUDA_CALL(cudaMemcpy(dev_pathsForThreads, pathsForThreads, lenghtPaths *cities*sizeof(int), cudaMemcpyHostToDevice));
        GPUAnnealing<<<GRID_BLOCK ,GRID_THREAD >>>(dev_pathsForThreads, devStates, iterationLimit,temperature, coolingRate, absoluteTemperature, cities,d_results);
        CUDA_CALL(cudaDeviceSynchronize());
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) 
            fprintf(stderr, "GPUAnnealing launch failed: %s\n", cudaGetErrorString(cudaStatus));
        h_results = (int*) malloc(GRID_BLOCK*GRID_THREAD * sizeof(int));
        //Copy lenght of each path to CPU 
        CUDA_CALL(cudaMemcpy(h_results, d_results,  GRID_BLOCK*GRID_THREAD * sizeof(int),cudaMemcpyDeviceToHost));
        //Copy paths to CPU 
        CUDA_CALL(cudaMemcpy(pathsForThreads, dev_pathsForThreads, lenghtPaths *cities*sizeof(int), cudaMemcpyDeviceToHost));
        //check the shortest path                       
        shortestPath = FindTheShortestPath(h_results);
        fprintf (stdout, "Shortest path on index = %d value = %d \n", shortestPath, h_results[shortestPath]);
        for (int i = 0; i < GRID_BLOCK*GRID_BLOCK ; i++)
            Path[i] = pathsForThreads[shortestPath*CITIES +i]; 
        free(pathsForThreads);
        free(h_results);
    }
    CUDA_CALL(cudaFree(dev_pathsForThreads));
    CUDA_CALL(cudaFree(d_results));
    CUDA_CALL(cudaFree(devStates));
    CUDA_CALL(cudaDeviceReset());

score 1 · Accepted Answer

这是一个坏主意：

    pathsForThreads = (int*)malloc(lenghtPaths * cities * sizeof(int));
    pathsForThreads = PreaparePaths(Path, lenghtPaths, cities);

如果调用PreaparePaths分配了一些其他值而pathsForThreads不是malloc操作分配给它的值，那么稍后当您执行此操作时：

    free(pathsForThreads);

你会得到不可预知的结果。

您不应将随后要传递给free其他值的指针重新分配。免费的手册页显示：

  free() frees the memory space pointed to by ptr, which must  have  been
   returned by a previous call to malloc(), calloc() or realloc().

因此，如果您打算将指针传递给其他对象，则不允许将其重新分配给其他对象free

cuda - 多次执行 CUDA 内核

1 回答 1

Related

Reference