CUDA 编程指南指出:“通过 malloc() 分配的内存可以使用运行时进行复制(即,调用 Device Memory 一节中的任何内存复制函数)”,但不知为何我无法重现这一功能。代码如下:
#include <cstdio>
// Device-global pointer: assigned inside the allocate_p kernel and read back
// on the host in main() via cudaMemcpyFromSymbol.
__device__ int* p;
// Allocates a buffer with in-kernel malloc(), stores its address in the
// __device__ global p, and prints that address from the GPU.
// main() launches this with a single thread (<<<1,1>>>); with more threads
// each one would overwrite p with its own allocation.
__global__ void allocate_p() {
    // Room for 10 ints (40 bytes on the usual 4-byte int), so the allocation
    // is as large as the 40-byte transfer the host attempts into it.
    p = (int*) malloc(10 * sizeof(int));
    // In-kernel malloc() can return NULL when the device heap is exhausted;
    // the pointer value is printed either way.
    printf("p = %p (seen by GPU)\n", p);
}
// Host driver for the reproduction:
//   1. launches allocate_p so the GPU stores a malloc()'d address in p,
//   2. reads that address back to the host with cudaMemcpyFromSymbol,
//   3. attempts a host-to-device cudaMemcpy into it and prints the status.
// Returns 0 after printing the cudaMemcpy status, or 1 if an earlier step
// (host allocation, kernel launch/execution, symbol read-back) fails.
int main() {
    const size_t kCount = 10;
    const size_t kBytes = kCount * sizeof(int);  // 40 bytes with 4-byte int

    // The host buffer must be at least kBytes long because cudaMemcpy reads
    // kBytes from it; calloc also gives the bytes a defined value.
    int* localp = (int*) calloc(kCount, sizeof(int));
    if (localp == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    allocate_p<<<1,1>>>();
    // A launch reports configuration errors through cudaGetLastError() and
    // execution errors at the next synchronizing call.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "allocate_p failed: %s\n", cudaGetErrorString(err));
        free(localp);
        return 1;
    }

    //Getting pointer to device-allocated memory
    int* tmpp = NULL;
    // Copy the whole pointer object: sizeof(tmpp) is 8 on 64-bit hosts, so a
    // hard-coded 4 would transfer only half of the address.
    err = cudaMemcpyFromSymbol(&tmpp, p, sizeof(tmpp));
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpyFromSymbol failed: %s\n", cudaGetErrorString(err));
        free(localp);
        return 1;
    }
    printf("p = %p (seen by CPU)\n", tmpp);

    // Alternative kept from the original experiment: allocate from the host instead.
    //cudaMalloc((void**)&tmpp, 40);

    // NOTE(review): the CUDA C++ Programming Guide (Dynamic Global Memory
    // Allocation) says memory obtained from in-kernel malloc() cannot be used
    // in runtime API calls such as cudaMemcpy; that matches the
    // invalid-argument status this call reports. Confirm against the guide
    // version in use.
    err = cudaMemcpy(tmpp, localp, kBytes, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    printf(" err:%i %s", (int)err, cudaGetErrorString(err));

    // localp came from calloc, so it must be released with free(), not delete.
    free(localp);
    return 0;
}
崩溃并输出:
p = 0x601f920 (seen by GPU)
p = 0x601f920 (seen by CPU)
err:11 invalid argument
据我判断,主机看到了设备上的正确地址,但不知为何不接受这个来自 malloc() 的地址。
如果我事先通过 cudaMalloc((void**)&np, 40); 进行分配,然后将指针 np 作为参数传递给 kernel allocate_p,并在其中将它赋值给 p(而不是使用 malloc()),那么代码运行正常。
我哪里做错了?我们该如何在主机端函数中使用由 malloc() 分配的设备内存?