
I am currently working on interpolating a map and am having some trouble with multithreading. The code is supposed to read in a map, represented by a 2D matrix, and then interpolate it to increase the number of points by a factor of 100. It works fine when I use a for loop in the kernel.

Before interpolation: http://bildr.no/view/OWV1UDRO

After interpolation: http://bildr.no/view/eTlmNmpo

When I try to replace the for loop with threads, it produces some strange results: instead of numbers, it fills the result matrix with -1.#QNAN.

Here is my working code with the for loop in the kernel:

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <fstream>
#include "cuda.h"

using namespace std;

float Z[41][41];

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaThreadSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

texture<float, 2, cudaReadModeElementType> tex;


__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare) 
{
    int k = sqrt(numberOfInterpolationsPerSquare);


    for (float i=0; i<n*k; i++)
    {
        for (float j=0; j<m*k; j++)
        {
            f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f);
        }
    }

}

int main (void)
{
    // Start timer
    clock_t tStart = clock();

    // Size of map
    int n=41;
    int m=41;

    int g = 0;

    float numberOfInterpolationsPerSquare = 100;
    float numberOfElements = pow(sqrt(numberOfInterpolationsPerSquare)*n,2);

    size_t pitch, tex_ofs;
    float *f;
    float *r;
    float *map_d = 0;

    // Build read-Streams
    ifstream map;   

    //Create and open a txt file for MATLAB
    ofstream file;

    // Open data
    map.open("Map.txt", ios_base::in); 
    file.open("Bilinear.txt");

    // Store the map in a 2D array
    for (int i=0; i<n; i++)
    {
        for (int j=0; j<m; j++)
        {
            map >> Z[i][j];
        }
    }

    // Allocate memory on host and device
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&map_d,&pitch,n*sizeof(*map_d),m));
    CUDA_SAFE_CALL(cudaMalloc((void**)&f, numberOfElements*sizeof(float)));
    r = (float*)malloc(numberOfElements*sizeof(float));

    // Copy map from host to device
    CUDA_SAFE_CALL(cudaMemcpy2D(map_d, pitch, Z, n*sizeof(Z[0][0]), n*sizeof(Z[0][0]),m,cudaMemcpyHostToDevice));

    // Set texture mode to bilinear interpolation
    tex.normalized = false;
    tex.filterMode = cudaFilterModeLinear;

    // Bind the map to texture
    CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, map_d, &tex.channelDesc, n, m, pitch));

    // Checking for offset
    if (tex_ofs !=0) {
        printf ("tex_ofs = %zu\n", tex_ofs);
        return EXIT_FAILURE;
    }

    // Launch Kernel
    kernel <<< 1,1 >>> (m, n, f, numberOfInterpolationsPerSquare);
    CHECK_LAUNCH_ERROR();    
    CUDA_SAFE_CALL (cudaDeviceSynchronize());

    // Copy result from device to host
    cudaMemcpy(r, f, numberOfElements*sizeof(float), cudaMemcpyDeviceToHost);

    // Write results to file
    for(int h=0;h<numberOfElements;h++)
    {
        if(g==sqrt(numberOfElements))
        {
            file << endl;
            g=0;
        }
        file << r[h] << " ";
        g++;
    }

    // Free memory
    CUDA_SAFE_CALL (cudaUnbindTexture (tex));
    CUDA_SAFE_CALL (cudaFree (map_d));
    CUDA_SAFE_CALL (cudaFree (f));
    free( r );

    // Print out execution time
    printf("Time taken: %.3fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);

    return EXIT_SUCCESS;
}

Here is the kernel with multithreading, which does not work:

__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare) 
{
    int k = sqrt(numberOfInterpolationsPerSquare);

    int i= blockIdx.x * blockDim.x + threadIdx.x;
    int j= blockIdx.y * blockDim.y + threadIdx.y;


    if(i>=n*k || j>=m*k)
        return;

    f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f);

}

Does anyone know why the multithreaded version does not work?

Regards,

Sondre


1 Answer


In the second kernel, i and j are declared as int instead of float, so j/k and i/k in the tex2D call result in integer division. Declare k as float to avoid the integer division.
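
For clarity, here is a minimal sketch of the second kernel with only that one change applied; everything else is kept exactly as in the question, including the global texture reference tex:

__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare)
{
    // Declaring k as float means j/k and i/k below are floating-point divisions
    float k = sqrt(numberOfInterpolationsPerSquare);

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    // Skip threads that fall outside the interpolated map
    if (i >= n*k || j >= m*k)
        return;

    f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f);
}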

Originally, the kernel was launched with the following configuration:

//Find number of blocks 
int nthreads = 1024;
int blocksize = 512; 
int nblocks = ceil( (n*m*numberOfInterpolationsPerSquare) / nthreads); 

// Launch Kernel 
kernel <<< nblocks,blocksize >>> (m, n, f, numberOfInterpolationsPerSquare);

The problem with the above code is that it launches a 1D grid of 1D blocks, while inside the kernel 2D indexing is used. The kernel needs a 2D grid/block configuration to work correctly. From the looks of the kernel code, the following grid/block configuration should work:

float k = sqrt(numberOfInterpolationsPerSquare);

const int threads_x = (int)ceil(n * k);
const int threads_y = (int)ceil(m * k);

const dim3 dimBlock(16,16);

dim3 dimGrid;
dimGrid.x = (threads_x + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (threads_y + dimBlock.y - 1)/dimBlock.y;

kernel<<<dimGrid,dimBlock>>>(m, n, f, numberOfInterpolationsPerSquare);
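
With the sizes from the question (n = m = 41 and numberOfInterpolationsPerSquare = 100, so k = 10), threads_x and threads_y both come out to 410, and dimGrid becomes 26 x 26 blocks of 16 x 16 threads; the extra threads in the last row and column of blocks are filtered out by the bounds check in the kernel.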