更清楚地说,我想要的是传递指针和它们指向设备的所有数据。为了测试如何实现这个目标,我编写了一个简单的类:
class vecarray{
public:
int * vecptr[N]; //array of pointers pointing to array
int dim[N]; //store length of each array pointed to
__device__ __host__ vecarray(); //constructor
__device__ __host__ int sum(); //sum up all the elements in the array being
//pointed to
}
vecarray::vecarray(){
for(int i = 0; i<N; i++)
{
vecptr[i] = NULL;
dim[i] = 0;
}
}
int vecarray::sum(){
int i=0, j=0, s=0;
for (i=0; i<N; i++)
for(j=0; j < dim[i]; j++)
s += vecptr[i][j];
return s;
}
然后我在下面的代码中使用这个类:
#define N 2
__global__ void addvecarray( vecarray * v, int *s){
*s = v->sum();
}
int main(){ //copy *V to device, do sum() and pass back
vecarray *v, *dev_v; //the result by dev_v
v = new vecarray;
dev_v = new vecarray;
int a[3] = {1,2,3}; //initialize v manually
int b[4] = {4,5,6,7};
int result = 0;
int * dev_result;
v->vecptr[0] = a;
v->vecptr[1] = b;
v->dim[0] = 3; v->dim[1] = 4;
cudaMalloc((void**)&dev_v, sizeof(vecarray));
cudaMemcpy(dev_v, v, sizeof(vecarray),cudaMemcpyHostToDevice); //copy class object
for(int i = 0; i < N; i++){
cudaMalloc((void**)&(dev_v->vecptr[i]), v->dim[i]*sizeof(int));
}
for(int i = 0; i<N; i++ ){ //copy arrays
cudaMemcpy(dev_v->vecptr[i], v->vecptr[i], v->dim[i]*sizeof(int), cudaMemcpyHostToDevice));
}
addvecarray<<<1,1>>>(dev_v, dev_result);
cudaMemcpy(&result, dev_result, sizeof(int), cudaMemcpyDeviceToHost);
printf("the result is %d\n", result);
}
该代码通过了 nvcc 编译器,但在运行时因分段错误而失败。我检查了问题在于 for 循环中的两个 cudaMalloc 和 cudaMemcpy 操作。所以我的问题是我应该如何将这个对象传递给 CUDA?提前致谢。