c++ - CUDA C++：数字 64 有什么特别之处吗？

Question

我在使用 C++ 和 CUDA 编程以在我的 GPU 上运行的数值积分器时遇到了一些病态行为。我的积分器目前使用固定的步长，当我将要积分的点数（numpoints）设置为 65（使步长变为 1/65 和我存储的二维数组的宽度）计算的数据为 65），我的积分器不起作用，似乎某处的函数返回零。大于 64 双倍宽的二维数组有问题吗？

我已经尝试实现 Talonmies 在什么是使用 CUDA 运行时 API 检查错误的规范方法中编写的宏？显然，我的内核出现了问题，并且将内核计算的数据复制回主机。“GPU 断言：无效参数。” 我不确定如何解释这些错误或从这里开始。

我的怀疑是它与 2D 阵列的宽度大于 64 或间距以及我如何将东西存储在设备上的 2D 阵列中有关。以下代码是否正确填充了二维数组的列？

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <iostream>
#include <iomanip>                      //display 2 decimal places
using namespace std;

__global__ void rkf5(double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, int*, int*, size_t, double*, double*, double*);
__global__ void calcK(double*, double*, double*);
__global__ void k1(double*, double*, double*);
__global__ void k2(double*, double*, double*);
__global__ void k3(double*, double*, double*);
__global__ void k4(double*, double*, double*);
__global__ void k5(double*, double*, double*);
__global__ void k6(double*, double*, double*);
__global__ void arrAdd(double*, double*, double*);
__global__ void arrMult(double*, double*, double*);
__global__ void arrInit(double*, double);
__device__ void setup(double , double*, double*, double*, double*, int*);
__device__ double flux(int, double*) ;
__global__ void storeConcs(double*, size_t, double*, int);
__global__ void takeFourthOrderStep(double*, double*, double*, double*, double*, double*, double*);
__global__ void takeFifthOrderStep(double*, double*, double*, double*, double*, double*, double*);

//Error checking that I don't understand yet.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

//Main program.
int main(int argc, char** argv)
{

    //std::cout << std::fixed;          //display 2 decimal places
    //std::cout << std::setprecision(16);   //display 2 decimal places
    const int maxlength = 1;            //Number of discrete concentrations we are tracking.
    double concs[maxlength];            //Meant to store the current concentrations 
    double temp[maxlength];             //Used as a bin to store products of Butcher's tableau and k values.
    double tempsum[maxlength];          //Used as a bin to store cumulative sum of tableau and k values
    double k1s[maxlength];
    double k2s[maxlength];
    double k3s[maxlength];
    double k4s[maxlength];
    double k5s[maxlength];
    double k6s[maxlength];
    const int numpoints = 64;       
    double to = 0;
    double tf = .5;
    //double dt = static_cast<double>(.5)/static_cast<double>(64);
    double dt = (tf-to)/static_cast<double>(numpoints);
    double mo = 1;
    double concStorage[maxlength][numpoints];   //Stores concs vs. time                     

    //Initialize all the arrays on the host to ensure arrays of 0's are sent to the device.
    //Also, here is where we can seed the system.
    std::cout<<dt;
    std::cout<<"\n";
    concs[0]=mo;
    std::cout<<concs[0];
    std::cout<<" ";
    for (int i=0; i<maxlength; i++)
    {
        for (int j=0; j<numpoints; j++)
            concStorage[i][j]=0;
        concs[i]=0;
        temp[i]=0;
        tempsum[i]=0;
        k1s[i]=0;
        k2s[i]=0;
        k3s[i]=0;
        k4s[i]=0;
        k5s[i]=0;
        k6s[i]=0;
        std::cout<<concs[i];
        std::cout<<" ";
    }
    concs[0]=mo;
    std::cout<<"\n";

    //Define all the pointers to device array memory addresses. These contain the on-GPU
    //addresses of all the data we're generating/using.
    double *d_concs;
    double *d_temp;
    double *d_tempsum;
    double *d_k1s;
    double *d_k2s;
    double *d_k3s;
    double *d_k4s;
    double *d_k5s;
    double *d_k6s;
    double *d_dt;
    int *d_maxlength;
    int *d_numpoints;
    double *d_to;
    double *d_tf;
    double *d_concStorage;

    //Calculate all the sizes of the arrays in order to allocate the proper amount of memory on the GPU.
    size_t size_concs = sizeof(concs);
    size_t size_temp = sizeof(temp);
    size_t size_tempsum = sizeof(tempsum);
    size_t size_ks = sizeof(k1s);
    size_t size_maxlength = sizeof(maxlength);
    size_t size_numpoints = sizeof(numpoints);
    size_t size_dt = sizeof(dt);
    size_t size_to = sizeof(to);
    size_t size_tf = sizeof(tf);
    size_t h_pitch = numpoints*sizeof(double);
    size_t d_pitch;

    //Calculate the "pitch" of the 2D array.  The pitch is basically the length of a 2D array's row.  IT's larger 
    //than the actual row full of data due to hadware issues.  We thusly will use the pitch instead of the data 
    //size to traverse the array.
    gpuErrchk(cudaMallocPitch( (void**)&d_concStorage, &d_pitch, maxlength * sizeof(double), numpoints)); 

    //Allocate memory on the GPU for all the arrrays we're going to use in the integrator.
    cudaMalloc((void**)&d_concs, size_concs);
    cudaMalloc((void**)&d_temp, size_temp);
    cudaMalloc((void**)&d_tempsum, size_tempsum);
    cudaMalloc((void**)&d_k1s, size_ks);
    cudaMalloc((void**)&d_k2s, size_ks);
    cudaMalloc((void**)&d_k3s, size_ks);
    cudaMalloc((void**)&d_k4s, size_ks);
    cudaMalloc((void**)&d_k5s, size_ks);
    cudaMalloc((void**)&d_k6s, size_ks);
    cudaMalloc((void**)&d_maxlength, size_maxlength);
    cudaMalloc((void**)&d_numpoints, size_numpoints);
    cudaMalloc((void**)&d_dt, size_dt);
    cudaMalloc((void**)&d_to, size_to);
    cudaMalloc((void**)&d_tf, size_tf);

    //Copy all initial values of arrays to GPU.
    cudaMemcpy2D(d_concStorage, d_pitch, concStorage, h_pitch, numpoints*sizeof(double), maxlength, cudaMemcpyHostToDevice);
    cudaMemcpy(d_concs, &concs, size_concs, cudaMemcpyHostToDevice);
    cudaMemcpy(d_temp, &temp, size_temp, cudaMemcpyHostToDevice);
    cudaMemcpy(d_tempsum, &tempsum, size_tempsum, cudaMemcpyHostToDevice);
    cudaMemcpy(d_k1s, &k1s, size_ks, cudaMemcpyHostToDevice);
    cudaMemcpy(d_k2s, &k2s, size_ks, cudaMemcpyHostToDevice);
    cudaMemcpy(d_k3s, &k3s, size_ks, cudaMemcpyHostToDevice);
    cudaMemcpy(d_k4s, &k4s, size_ks, cudaMemcpyHostToDevice);
    cudaMemcpy(d_k5s, &k5s, size_ks, cudaMemcpyHostToDevice);
    cudaMemcpy(d_k6s, &k6s, size_ks, cudaMemcpyHostToDevice);
    cudaMemcpy(d_maxlength, &maxlength, size_maxlength, cudaMemcpyHostToDevice);
    cudaMemcpy(d_numpoints, &numpoints, size_numpoints, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dt, &dt, size_dt, cudaMemcpyHostToDevice);
    cudaMemcpy(d_to, &to, size_to, cudaMemcpyHostToDevice);
    cudaMemcpy(d_tf, &tf, size_tf, cudaMemcpyHostToDevice);

    //Run the integrator.
    rkf5<<<1,1>>>(d_concs, d_concStorage, d_temp, d_tempsum, d_k1s, d_k2s, d_k3s, d_k4s, d_k5s, d_k6s, d_maxlength, d_numpoints, d_pitch, d_dt, d_to, d_tf);
    //gpuErrchk( cudaPeekAtLastError() );
    //gpuErrchk( cudaDeviceSynchronize() );
    cudaDeviceSynchronize();

    //Copy concentrations from GPU to Host.  Almost defunct now that transferring the 2D array works.
    cudaMemcpy(concs, d_concs, size_concs, cudaMemcpyDeviceToHost);
    //Copy 2D array of concentrations vs. time from GPU to Host.
    gpuErrchk( cudaMemcpy2D(concStorage, h_pitch, d_concStorage, d_pitch, numpoints*sizeof(double), maxlength, cudaMemcpyDeviceToHost) );   

    //Print concentrations after the integrator kernel runs.  Used to test that data was transferring to and from GPU correctly.
    std::cout << "\n";
    for (int i=0; i<maxlength; i++)
    {
        std::cout<<concs[i];
        std::cout<<" ";
    }

    //Print out the concStorage array after the kernel runs.  Used to test that the 2D array transferred correctly from host to GPU and back.
    std::cout << "\n";
    for (int i=0; i<maxlength; i++)
    {
        for(int j=0; j<numpoints; j++)
        {
            std::cout<<concStorage[i][j];
            std::cout<<"   ";
        }
        std::cout << "\n";
    }
    std::cout << "\n";

    cudaDeviceReset();  //Clean up all memory.

    return 0;
}
//Main kernel.  This is mean to be run as a master thread that calls all the other functions and thusly "runs" the integrator.
__global__ void rkf5(double* concs, double* concStorage, double* temp, double* tempsum, double* k1s, double* k2s, double* k3s, double* k4s, double* k5s, double* k6s, int* maxlength, int* numpoints, size_t pitch, double* dt, double* to, double* tf)
{
    /*
    axy variables represent the coefficients in the Butcher's tableau where x represents the order of the step and the y value corresponds to the ky value 
    the coefficient gets multiplied by.  Have to cast them all as doubles, or the ratios evaluate as integers.
    e.g. a21 -> a21 * k1
    e.g. a31 -> a31 * k1 + a32 * k2
    */
    double a21 = static_cast<double>(.25);

    double a31 = static_cast<double>(3)/static_cast<double>(32);
    double a32 = static_cast<double>(9)/static_cast<double>(32);

    double a41 = static_cast<double>(1932)/static_cast<double>(2197);
    double a42 = static_cast<double>(-7200)/static_cast<double>(2197);
    double a43 = static_cast<double>(7296)/static_cast<double>(2197);

    double a51 = static_cast<double>(439)/static_cast<double>(216);
    double a52 = static_cast<double>(-8);
    double a53 = static_cast<double>(3680)/static_cast<double>(513);
    double a54 = static_cast<double>(-845)/static_cast<double>(4104);

    double a61 = static_cast<double>(-8)/static_cast<double>(27);
    double a62 = static_cast<double>(2);
    double a63 = static_cast<double>(-3544)/static_cast<double>(2565);
    double a64 = static_cast<double>(1859)/static_cast<double>(4104);
    double a65 = static_cast<double>(-11)/static_cast<double>(40);

    //for loop that integrates over the specified number of points. Actually, might have to make it a do-while loop for adaptive step sizes 
    for(int k = 0; k < *numpoints; k++)
    {

        calcK<<< 1, *maxlength >>>(concs, k1s, dt);         //k1 = dt * flux (concs)
        cudaDeviceSynchronize(); //Sync here because kernel continues onto next line before k1 finished

        setup(a21, temp, tempsum, k1s, concs, maxlength);   //tempsum = a21*k1
        arrAdd<<< 1, *maxlength >>>(concs, temp, tempsum);  //tempsum = concs + a21*k1
        cudaDeviceSynchronize();

        calcK<<< 1, *maxlength >>>(tempsum, k2s, dt);       //k2 = dt * flux (concs + a21*k1)
        cudaDeviceSynchronize();

        setup(a31, temp, tempsum, k1s, concs, maxlength);   //tempsum = a31*k1
        setup(a32, temp, tempsum, k2s, concs, maxlength);   //tempsum = a31*k1 + a32*k2
        arrAdd<<< 1, *maxlength >>>(concs, temp, tempsum);  //tempsum = concs + a31*k1 + a32*k2
        cudaDeviceSynchronize();

        calcK<<< 1, *maxlength >>>(tempsum, k3s, dt);           //k3 = dt * flux (concs + a31*k1 + a32*k2)
        cudaDeviceSynchronize();

        setup(a41, temp, tempsum, k1s, concs, maxlength);   //tempsum = a41*k1
        setup(a42, temp, tempsum, k2s, concs, maxlength);   //tempsum = a41*k1 + a42*k2
        setup(a43, temp, tempsum, k3s, concs, maxlength);   //tempsum = a41*k1 + a42*k2 + a43*k3
        arrAdd<<< 1, *maxlength >>>(concs, temp, tempsum);  //tempsum = concs + a41*k1 + a42*k2 + a43*k3
        cudaDeviceSynchronize();

        calcK<<< 1, *maxlength >>>(tempsum, k4s, dt);           //k4 = dt * flux (concs + a41*k1 + a42*k2 + a43*k3)
        cudaDeviceSynchronize();

        setup(a51, temp, tempsum, k1s, concs, maxlength);   //tempsum = a51*k1
        setup(a52, temp, tempsum, k2s, concs, maxlength);   //tempsum = a51*k1 + a52*k2
        setup(a53, temp, tempsum, k3s, concs, maxlength);   //tempsum = a51*k1 + a52*k2 + a53*k3
        setup(a54, temp, tempsum, k4s, concs, maxlength);   //tempsum = a51*k1 + a52*k2 + a53*k3 + a54*k4
        arrAdd<<< 1, *maxlength >>>(concs, temp, tempsum);  //tempsum = concs + a51*k1 + a52*k2 + a53*k3 + a54*k4
        cudaDeviceSynchronize();

        calcK<<< 1, *maxlength >>>(tempsum, k5s, dt);           //k5 = dt * flux (concs + a51*k1 + a52*k2 + a53*k3 + a54*k4)
        cudaDeviceSynchronize();

        setup(a61, temp, tempsum, k1s, concs, maxlength);   //tempsum = a61*k1
        setup(a62, temp, tempsum, k2s, concs, maxlength);   //tempsum = a61*k1 + a62*k2
        setup(a63, temp, tempsum, k3s, concs, maxlength);   //tempsum = a61*k1 + a62*k2 + a63*k3
        setup(a64, temp, tempsum, k4s, concs, maxlength);   //tempsum = a61*k1 + a62*k2 + a63*k3 + a64*k4
        setup(a65, temp, tempsum, k4s, concs, maxlength);   //tempsum = a61*k1 + a62*k2 + a63*k3 + a64*k4 + a65*k5
        arrAdd<<< 1, *maxlength >>>(concs, temp, tempsum);  //tempsum = concs + a61*k1 + a62*k2 + a63*k3 + a64*k4 + a65*k5
        cudaDeviceSynchronize();

        calcK<<< 1, *maxlength >>>(tempsum, k6s, dt);           //k6 = dt * flux (concs + a61*k1 + a62*k2 + a63*k3 + a64*k4 + a65*k5)
        cudaDeviceSynchronize();

        //At this point, temp and tempsum are maxlength dimension arrays that are able to be used for other things.

        /*
        //All this is is a way of printing all my k values in a 2D array.  No bearing on actual program.
        for (int i = 0; i < *maxlength; i++)
        {
            switch (j)
            {
            case 0: concs[i]=k1s[i];
                break;
            case 1: concs[i]=k2s[i];
                break;
            case 2: concs[i]=k3s[i];
                break;
            case 3: concs[i]=k4s[i];
                break;
            case 4: concs[i]=k5s[i];
                break;
            }
        }
        */
        //calcStepSize
        takeFifthOrderStep<<< 1, *maxlength >>>(concs, k1s, k2s, k3s, k4s, k5s, k6s);
        cudaDeviceSynchronize();
        storeConcs<<< 1, *maxlength >>>(concStorage, pitch, k1s, k);
        cudaDeviceSynchronize();

    }
}
//calcStepSize will take in an error tolerance, the current concentrations and the k values and calculate the resulting step size according to the following equation
//e[n+1]=y4[n+1] - y5[n+1]
//__global__ void calcStepSize(double *y5, double* y4)


//takeFourthOrderStep is going to overwrite the old temp array with the new array of concentrations that result from a 4th order step.  This kernel is meant to be launched 
// with as many threads as there are discrete concentrations to be tracked.
__global__ void takeFourthOrderStep(double* concs, double* k1s, double* k2s,double* k3s, double* k4s, double* k5s)
{
    double b41 = static_cast<double>(25)/static_cast<double>(216);
    double b42 = static_cast<double>(0);
    double b43 = static_cast<double>(1408)/static_cast<double>(2565);
    double b44 = static_cast<double>(2197)/static_cast<double>(4104);
    double b45 = static_cast<double>(-1)/static_cast<double>(5);
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    concs[idx] = concs[idx] + b41 * k1s[idx] + b42 * k2s[idx] + b43 * k3s[idx] + b44 * k4s[idx] + b45 * k5s[idx];
}
//takeFifthOrderStep is going to overwrite the old array of concentrations with the new array of concentrations.  As of now, this will be the 5th order step.  Another function can be d
//defined that will take a fourth order step if that is interesting for any reason.  This kernel is meant to be launched with as many threads as there are discrete concentrations
//to be tracked.
//Store b values in register? Constants?
__global__ void takeFifthOrderStep(double* concs, double* k1s, double* k2s,double* k3s, double* k4s, double* k5s, double* k6s)
{
    double b51 = static_cast<double>(16)/static_cast<double>(135);
    double b52 = static_cast<double>(0);
    double b53 = static_cast<double>(6656)/static_cast<double>(12825);
    double b54 = static_cast<double>(28561)/static_cast<double>(56430);
    double b55 = static_cast<double>(-9)/static_cast<double>(50);
    double b56 = static_cast<double>(2)/static_cast<double>(55);
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    concs[idx] = concs[idx] + b51 * k1s[idx] + b52 * k2s[idx] + b53 * k3s[idx] + b54 * k4s[idx] + b55 * k5s[idx] + b56 * k6s[idx];
}
//storeConcs takes the current array of concentrations and stores it in the cId'th column of the 2D concStorage array
//pitch = memory size of a row
__global__ void storeConcs(double* cS, size_t pitch, double* concs, int cId)
{
    int tIdx = threadIdx.x;
    //cS is basically the memory address of the first element of the flattened (1D) 2D array.
    double* row = (double*)((char*)cS + tIdx * pitch);
    row[cId] = concs[tIdx];
}
//Perhaps I can optimize by using shared memory to hold conc values.
__global__ void calcK(double* concs, double* ks, double* dt)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    ks[idx]=(*dt)*flux(idx, concs);
}
//Adds two arrays (a and b) element by element and stores the result in array c.
__global__ void arrAdd(double* a, double* b, double* c)
{                                                 
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    c[idx]=a[idx]+b[idx];
}
//Multiplies two arrays (a and b) element by element and stores the result in array c.
__global__ void arrMult(double* a, double* b, double* c)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    c[idx]=a[idx]*b[idx];
}
//Initializes an array a to double value b.
__global__ void arrInit(double* a, double b)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    a[idx]=b;
}
//Placeholder function for the flux calculation.  It will take the size of the oligomer and current concentrations as inputs.
__device__ double flux(int r, double *concs) 
{
    return -concs[r];
}
//This function multiplies a tableau value by the corresponding k array and adds the result to tempsum.  Used to
//add all the a*k terms.
__device__ void setup(double tableauValue, double *temp, double *tempsum, double *ks, double *concs, int *maxlength) 
{
    //Sets tempsum to tabVal * k
    arrInit<<< 1, *maxlength >>>(temp, tableauValue);       //Set [temp] to tableau value
    cudaDeviceSynchronize();
    arrMult<<< 1, *maxlength >>>(ks, temp, temp);           //Multiply tableau value by appropriate [k]
    cudaDeviceSynchronize();
    arrAdd<<< 1, *maxlength >>>(tempsum, temp, tempsum);    //Move tabVal*k to [tempsum]
    cudaDeviceSynchronize();
}

/*
__device__ double knowles_flux(int r, double *conc, double *params)
{

    const double nc = params[0];
    const double ka = params[1];
    //const float kb = params[2];
    //const float kp = params[3];
    const double km = params[4];
    const double kn = params[5];
    //const float n2 = params[6];
    //const float kn2 = params[7];
    const int maxlength = params[8];


    const int r = blockIdx.x*blockDim.x + threadIdx.x;

    double frag_term = 0;
    double flux = 0;
    if (r == (maxlength-1))
        {
        flux = -km*(r)*conc[r]+2*ka*conc[r-1]*conc[0];
        }
    else if (r > (nc-1))
        {
        for (int s = r+1; s < maxlength; s++)
            {
            frag_term += conc[s];
            }
        //double frag_term = thrust::reduce(conc, conc);
        flux = -km*(r)*conc[r] + 2*km*frag_term - 2*ka*conc[r]*conc[0] + 2*ka*conc[r-1]*conc[0];
        }
    else if (r == (nc-1))
        {
        for (int s = r+1; s < maxlength; s++)
            {
            frag_term += conc[s];
            }
        //double frag_term = thrust::reduce(conc, conc);
        flux = kn*pow(conc[0],nc) + 2*km*frag_term - 2*ka*conc[r]*conc[0];
        }
    else if (r < (nc-1))
        {
        flux[r] = 0;
        }
}
*/

/*
Encountered Errors :
1. nvlink - undefined reference : there's a mismatch between function protoypes and function declaration, possibly the number of arguments. 
"nvlink : error : Undefined reference to '_Z2k1PdS_' in 'x64/Debug/RKF5 Prototype 2.cu.obj'" - fixed by making sure func def had same  parameters as proto

2.1>nvlink : error : Undefined reference to '_Z2k1PdS_' in 'x64/Debug/RKF5 Prototype 2.cu.obj'
1>nvlink : error : Undefined reference to '_Z2k2PdS_' in 'x64/Debug/RKF5 Prototype 2.cu.obj'
1>nvlink : error : Undefined reference to '_Z2k3PdPiS_S_S_S_S_' in 'x64/Debug/RKF5 Prototype 2.cu.obj'
1>nvlink : error : Undefined reference to '_Z2k4PdPiS_S_S_S_S_S_' in 'x64/Debug/RKF5 Prototype 2.cu.obj'
1>nvlink : error : Undefined reference to '_Z2k5PdPiS_S_S_S_S_S_S_' in 'x64/Debug/RKF5 Prototype 2.cu.obj'

This is caused by there being references defined in the prototype that don't exist in the actual function definition.
*/

score 4 · Accepted Answer

首先，您还没有实施正确的 cuda 错误检查。请重新阅读，学习，做任何你必须做的事情来理解 talonmies 的帖子。您需要在每次cuda API 调用之后放置错误检查宏，这意味着在each cudaMalloc等之后，cudaMemcpy您应该放置以下行：

gpuErrchk( cudaPeekAtLastError() );

或者，您可以简单地将每个调用包装在错误检查宏中：

gpuErrchk(cudaMemcpy(concs, d_concs, size_concs, cudaMemcpyDeviceToHost));

如果您不这样做，是的，错误消息会令人困惑，因为它们通常会应用于引发错误的其他（先前）代码行。

一旦你进行了可靠的错误检查，你会发现无效参数是由这行代码引起的：

cudaMemcpy2D(d_concStorage, d_pitch, concStorage, h_pitch, numpoints*sizeof(double), maxlength, cudaMemcpyHostToDevice);

要了解原因，我们需要将其与创建的相应 malloc 操作进行比较d_concStorage，即这一行：

gpuErrchk(cudaMallocPitch( (void**)&d_concStorage, &d_pitch, maxlength * sizeof(double), numpoints));

请注意，在您的原始变量定义中，numpoints是一个宽度参数。但是，在cudaMallocPitch上面，您将numpoints作为最后一个参数传递，即height参数。请参阅 cuda API文档。

为了使这些调用彼此和谐，我们需要对应的高度和宽度参数。我认为正确的做法是修复cudaMallocPitch线路，使其与cudaMemcpy2D操作相匹配，如下所示：

gpuErrchk(cudaMallocPitch( (void**)&d_concStorage, &d_pitch, numpoints * sizeof(double), maxlength));

当我进行更改时，我可以设置numpoints为 65 并编译和运行您的代码，而不会出现任何 API 错误。

c++ - CUDA C++：数字 64 有什么特别之处吗？

1 回答 1

Related

Reference