我正在使用 CUDA 优化一些代码。我不确定是否应该在 `__global__` 函数 (fun1) 中使用 cudaMalloc(x 不是已经在 GPU 的内存上分配好了吗?):
// Kernel: each block writes one element of y, selected by its block index.
// y is dereferenced in device code, so it must point to device-accessible memory.
// NOTE(review): there is no bounds check on i; presumably the launch uses no
// more blocks than y has elements -- confirm against the callers.
__global__ void fun2(double *y)
{
// The index is taken from blockIdx.x only; threadIdx is not consulted here.
int i=blockIdx.x;
// The right-hand side is elided in this snippet.
y[i]=...;
}
// Kernel that receives the buffer x and launches fun2 on that same buffer from
// device code (a kernel launch issued inside a kernel).
//
// x is allocated with cudaMalloc in main before fun1 is launched, so it is
// already a device pointer and can be handed to fun2 as-is; no additional
// allocation is needed in this kernel just to pass it on.
//
// NOTE(review): launching a kernel from device code presumably requires
// building with relocatable device code (e.g. -rdc=true) and a GPU that
// supports dynamic parallelism -- confirm the build flags and target arch.
// NOTE(review): every thread that runs fun1 performs this launch, so with the
// fun1<<<N,1>>> configuration used in main, fun2 is launched N times on the
// same buffer -- confirm that this is intended.
__global__ void fun1(double *x)
{
    // Fixed: the execution configuration must be closed with ">>>".
    fun2<<<N,1>>>(x);
    ...
}
// Host entry point: allocates a device buffer of N doubles and launches fun1
// on it with N blocks of one thread each.
// Returns 1 if the allocation or the kernel launch reports an error.
int main(){
    double *x;
    ...
    // cudaMalloc reports failure through its return code; without this check a
    // failed allocation would leave x unusable for the launch below.
    if (cudaMalloc((void**)&x, N*sizeof(double)) != cudaSuccess) {
        return 1;
    }
    fun1<<<N,1>>>(x);
    // A kernel launch returns no status itself; launch-configuration errors
    // are picked up with cudaGetLastError().
    if (cudaGetLastError() != cudaSuccess) {
        cudaFree(x);
        return 1;
    }
    ...
}