2

更清楚地说,我想要的是传递指针和它们指向设备的所有数据。为了测试如何实现这个目标,我编写了一个简单的类:

class vecarray{
    public:
        int * vecptr[N];                //array of pointers pointing to array
        int dim[N];                     //store length of each array pointed to
        __device__ __host__ vecarray(); //constructor
        __device__ __host__ int sum();  //sum up all the elements in the array being              
                                       //pointed to
}

vecarray::vecarray(){
    for(int i = 0; i<N; i++)
    {
        vecptr[i] = NULL;
        dim[i] = 0;
    }
}

int vecarray::sum(){
    int i=0, j=0, s=0;
    for (i=0; i<N; i++)
        for(j=0; j < dim[i]; j++)
            s += vecptr[i][j];
    return s;
}

然后我在下面的代码中使用这个类:

#define N 2
__global__ void addvecarray( vecarray * v, int *s){
    *s = v->sum();
}

int main(){                                 //copy *V to device, do sum() and pass back 
    vecarray *v, *dev_v;                    //the result by dev_v
    v = new vecarray;
    dev_v = new vecarray;
    int a[3] = {1,2,3};                     //initialize v manually
    int b[4] = {4,5,6,7};
    int result = 0;
    int * dev_result;
    v->vecptr[0] = a;
    v->vecptr[1] = b;
    v->dim[0] = 3; v->dim[1] = 4;


    cudaMalloc((void**)&dev_v, sizeof(vecarray));      

    cudaMemcpy(dev_v, v, sizeof(vecarray),cudaMemcpyHostToDevice); //copy class object 

    for(int i = 0; i < N; i++){
        cudaMalloc((void**)&(dev_v->vecptr[i]), v->dim[i]*sizeof(int));
    }

    for(int i = 0; i<N; i++ ){                   //copy arrays
    cudaMemcpy(dev_v->vecptr[i], v->vecptr[i], v->dim[i]*sizeof(int), cudaMemcpyHostToDevice));
    }
    addvecarray<<<1,1>>>(dev_v, dev_result);

    cudaMemcpy(&result, dev_result, sizeof(int), cudaMemcpyDeviceToHost);
    printf("the result is %d\n", result);
}

该代码通过了 nvcc 编译器,但在运行时因分段错误而失败。我检查了问题在于 for 循环中的两个 cudaMalloc 和 cudaMemcpy 操作。所以我的问题是我应该如何将这个对象传递给 CUDA?提前致谢。

4

1 回答 1

4

您的代码中有几个错误。正如我在评论中提到的,关键错误之一是如何为类中指针引用的数据区域分配内存。那里的关键错误是您传递了一个指向已经存在于设备内存中的 cudaMalloc 的指针。我们可以通过创建一组额外的指针来解决这个问题,我们将使用这些指针为类中指向的数组分配所需的设备存储空间。此外还有一些其他错误,例如您没有为dev_result. 以下代码修复了我能找到的所有错误,我相信会给出正确的结果。我还添加了一个 cuda 错误检查的参考形式,您可能会发现在您的项目中使用它很有用:

#include <stdio.h>

#define N 2
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

using namespace std;

class vecarray{
    public:
        int *vecptr[N];                //array of pointers pointing to array
        int dim[N];                     //store length of each array pointed to

        __device__ __host__ vecarray(); //constructor
        __device__ __host__ int sum();  //sum up all the elements in the array being
                                       //pointed to
};

vecarray::vecarray(){
    for(int i = 0; i<N; i++)
    {
        vecptr[i] = NULL;
        dim[i] = 0;
    }
}

__device__ __host__ int vecarray::sum(){
    int i=0, j=0, s=0;
    for (i=0; i<N; i++)
        for(j=0; j < dim[i]; j++)
            s += vecptr[i][j];
    return s;
}

__global__ void addvecarray( vecarray * v, int *s){
    *s = v->sum();
}

int main(){                                 //copy *V to device, do sum() and pass back
    vecarray *v, *dev_v;                    //the result by dev_v
    v = new vecarray;
    int a[3] = {1,2,3};                     //initialize v manually
    int b[4] = {4,5,6,7};
    int result = 0;
    int *dev_result;
    v->vecptr[0] = a;
    v->vecptr[1] = b;
    v->dim[0] = 3; v->dim[1] = 4;
    int *vptr[N];

    cudaMalloc((void**)&dev_v, sizeof(vecarray));
    cudaCheckErrors("cudaMalloc1 fail");
    cudaMemcpy(dev_v, v, sizeof(vecarray),cudaMemcpyHostToDevice); //copy class object
    cudaCheckErrors("cudaMemcpy1 fail");

    for(int i = 0; i < N; i++){
        cudaMalloc((void**)&(vptr[i]), v->dim[i]*sizeof(int));
        cudaCheckErrors("cudaMalloc2 fail");
        cudaMemcpy(&(dev_v->vecptr[i]), &vptr[i], sizeof(int*), cudaMemcpyHostToDevice);
        cudaCheckErrors("cudaMemcpy2 fail");
    }

    for(int i = 0; i<N; i++ ){                   //copy arrays
        cudaMemcpy(vptr[i], v->vecptr[i], v->dim[i]*sizeof(int), cudaMemcpyHostToDevice);
        cudaCheckErrors("cudaMemcpy3 fail");
    }
    cudaMalloc((void **)&dev_result, sizeof(int));
    cudaCheckErrors("cudaMalloc3 fail");
    addvecarray<<<1,1>>>(dev_v, dev_result);

    cudaMemcpy(&result, dev_result, sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy4 fail");
    printf("the result is %d\n", result);
    return 0;
}
于 2013-02-09T21:49:26.850 回答