下面是自定义分配器和内存池的一个粗略草图,它把在主机和设备上使用该类的一些机制隐藏了起来。
我并不认为它是优秀编程的典范;这只是我认为可能涉及的步骤的大致轮廓,其中肯定有不少错误。代码里没有包含,但我想你还会需要一个用于获取 size 的公共方法。
#include <iostream>
#include <assert.h>
// Checks the CUDA runtime's last-error state and aborts the process on failure.
//
// The macro queries cudaGetLastError(), so it has to be placed directly after
// the API call or kernel launch it is meant to guard. When an error is pending
// it prints `msg`, the CUDA error string and the call site (__FILE__/__LINE__)
// to stderr and terminates with exit(1). The do { } while (0) wrapper makes the
// expansion behave as a single statement.
//
// The temporary deliberately avoids a leading double underscore: identifiers
// of that form are reserved for the implementation.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    (msg), cudaGetErrorString(cuda_check_err_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
// Element type used for the pool storage.
typedef float mytype;

// Device-global pool state. All three start at zero and are filled in from
// the host via cudaMemcpyToSymbol in init_pool().
//   pool_allocated : number of elements handed out so far
//   pool_size      : capacity of the pool, in elements
//   pool           : base pointer of the pool storage
__device__ unsigned int pool_allocated = 0;
__device__ unsigned int pool_size = 0;
__device__ mytype *pool = 0;

// Reserves `size` consecutive elements from the pool and returns the element
// offset (relative to `pool`) at which the reservation starts.
//
// The reservation itself is a single atomicAdd on pool_allocated, so each
// caller receives a distinct range. The capacity check is performed on the
// value returned by the atomic rather than on a separate, non-atomic read of
// pool_allocated, and it validates the END of the range (offset + size), not
// just its start. A request that exactly fills the remaining space is accepted.
//
// Failure is reported through assert(), i.e. it traps the kernel in builds
// where assertions are enabled. Nothing here gives pool space back.
__device__ unsigned int pool_reserve(size_t size){
    // A request larger than the whole pool can never succeed; this also
    // guarantees the narrowing conversion below is lossless.
    assert(size <= pool_size);
    const unsigned int count = static_cast<unsigned int>(size);

    const unsigned int offset = atomicAdd(&pool_allocated, count);

    // Written as a subtraction so the comparison cannot wrap around.
    assert(offset <= pool_size - count);
    return offset;
}
// Allocates the device pool and publishes it to the device-global symbols.
//
// psize: capacity of the pool in elements of `mytype` (not bytes).
//
// Steps: cudaMalloc the storage, then copy the base pointer into `pool`, the
// capacity into `pool_size`, and zero into `pool_allocated` so that a pool
// set up by this call always starts empty. Every CUDA call is followed by
// cudaCheckErrors, which aborts the process on failure.
//
// NOTE(review): a buffer installed by an earlier call is not released here;
// calling this more than once leaks the previous allocation.
__host__ void init_pool(size_t psize){
    // pool_size is stored as unsigned int on the device; reject capacities
    // that would be silently truncated.
    assert(psize <= static_cast<size_t>(~0u));
    const unsigned int element_count = static_cast<unsigned int>(psize);
    const unsigned int nothing_allocated = 0;

    mytype *device_buffer = 0;
    cudaMalloc((void **)&device_buffer, psize*sizeof(mytype));
    cudaCheckErrors("init pool cudaMalloc fail");

    cudaMemcpyToSymbol(pool, &device_buffer, sizeof(mytype *));
    cudaCheckErrors("init pool cudaMemcpyToSymbol 1 fail");

    cudaMemcpyToSymbol(pool_size, &element_count, sizeof(unsigned int));
    cudaCheckErrors("init pool cudaMemcpyToSymbol 2 fail");

    cudaMemcpyToSymbol(pool_allocated, &nothing_allocated, sizeof(unsigned int));
    cudaCheckErrors("init pool cudaMemcpyToSymbol 3 fail");
}
// Array-of-mytype wrapper usable from both host and device code.
//
// Each method is compiled twice (__host__ __device__) and selects its
// behavior with __CUDA_ARCH__:
//   - `data` is the pointer the current side is expected to dereference;
//   - `d_data` is the device copy (pool storage when created in a kernel,
//     cudaMalloc storage when created on the host);
//   - `h_data` is the host-side buffer.
// Objects are moved between host and device as raw bytes (see copyobj), so the
// member layout must stay trivially copyable. The class never frees anything.
class A{
public:
    // Buffer for the side currently using the object; 0 until allocate() or
    // update() has run.
    mytype *data;

    // Creates the device copy of `data` and stores it in d_data.
    // Preconditions: allocate() has been called (size != 0, data != 0) and no
    // device copy exists yet (d_data == 0).
    // Device: reserves `size` elements from the pool and memcpy's into them.
    // Host:   cudaMalloc's a buffer and copies host-to-device.
    __host__ __device__ void pool_allocate_and_copy() {
        assert(d_data == 0);
        assert(size != 0);
        assert(data != 0);   // source of the copy below
#ifdef __CUDA_ARCH__
        const unsigned int offset = pool_reserve(size);
        d_data = pool + offset;
        memcpy(d_data, data, size*sizeof(mytype));
#else
        cudaMalloc((void **)&d_data, size*sizeof(mytype));
        cudaCheckErrors("pool_allocate_and_copy cudaMalloc fail");
        cudaMemcpy(d_data, data, size*sizeof(mytype), cudaMemcpyHostToDevice);
        cudaCheckErrors("pool_allocate_and_copy cudaMemcpy fail");
#endif /* __CUDA_ARCH__ */
    }

    // Makes `data` valid for the side executing this call.
    // Device: points `data` at the device copy.
    // Host:   ensures a host buffer exists (malloc'd on first use), points
    //         `data` at it and copies the device copy into it.
    __host__ __device__ void update(){
#ifdef __CUDA_ARCH__
        // The device copy is what gets adopted, so that is what must exist.
        // Whatever `data` held before (possibly a pointer from the other side)
        // is irrelevant and is overwritten.
        assert(d_data != 0);
        data = d_data;
#else
        // NOTE(review): h_data is reused whenever it is non-zero; after a
        // byte-wise copyobj it holds whatever the source object carried.
        // Confirm it can never be a stale pointer owned by another object.
        if (h_data == 0) h_data = (mytype *)malloc(size*sizeof(mytype));
        data = h_data;
        assert(data != 0);
        cudaMemcpy(data, d_data, size*sizeof(mytype), cudaMemcpyDeviceToHost);
        cudaCheckErrors("update cudaMempcy fail");
#endif
    }

    // Allocates storage for `asize` elements with malloc() on the calling side
    // and records the element count. Must be called at most once per object
    // (data == 0 on entry). On the host the buffer is also remembered in
    // h_data.
    __host__ __device__ void allocate(size_t asize) {
        assert(data == 0);
        // `size` is an unsigned int; refuse counts it cannot represent.
        assert(asize <= static_cast<size_t>(~0u));
        data = (mytype *)malloc(asize*sizeof(mytype));
        assert(data != 0);
#ifndef __CUDA_ARCH__
        h_data = data;
#endif
        size = static_cast<unsigned int>(asize);
    }

    // Overwrites *this with a byte-wise copy of *obj, then calls update() so
    // that `data` is usable on the calling side.
    // Device: plain memcpy. Host: cudaMemcpy with cudaMemcpyDefault, so `obj`
    // may be a host or a device pointer.
    __host__ __device__ void copyobj(A *obj){
        assert(obj != 0);
#ifdef __CUDA_ARCH__
        memcpy(this, obj, sizeof(A));
#else
        cudaMemcpy(this, obj, sizeof(A), cudaMemcpyDefault);
        cudaCheckErrors("copy cudaMempcy fail");
#endif
        this->update();
    }

    __host__ __device__ A();

private:
    unsigned int size;   // element count set by allocate()
    mytype *d_data;      // device copy, 0 until pool_allocate_and_copy()
    mytype *h_data;      // host buffer, 0 until allocate()/update() on host
};
// Default constructor: produces an empty object with every pointer null and
// an element count of zero. Initialisers are listed in member declaration
// order.
__host__ __device__ A::A()
    : data(0),
      size(0),
      d_data(0),
      h_data(0)
{
}
// Demonstration kernel for class A.
//
// obj: object prepared on the host (passed by value); its device copy is
//      adopted into a kernel-local object and its first element is printed.
// res: device pointer to an A that receives a copy of a second, kernel-built
//      object whose payload lives in the pool.
//
// Launch layout: written for a single thread (the host launches <<<1,1>>>).
// Every thread would write *res, so a wider launch would race on it.
__global__ void mykernel(A obj, A *res){
    A mylocal;
    mylocal.copyobj(&obj);

    A mylocal2;
    mylocal2.allocate(24);
    mylocal2.data[0]=45;
    mylocal2.pool_allocate_and_copy();
    res->copyobj(&mylocal2);

    printf("kernel data %f\n", mylocal.data[0]);

    // allocate() obtained this staging buffer with in-kernel malloc(). Its
    // contents were copied into the pool above and *res refers to the pool
    // copy, so the staging buffer is released here instead of being leaked.
    free(mylocal2.data);
    mylocal2.data = 0;
}
// Host driver: builds an A on the host, mirrors it to the device, runs
// mykernel once, copies the kernel's result object back and prints its first
// element. Any CUDA failure aborts through cudaCheckErrors.
int main(){
    A my_obj;
    A *d_result, h_result;

    my_obj.allocate(32);
    my_obj.data[0] = 12;

    init_pool(1048576);
    my_obj.pool_allocate_and_copy();

    cudaMalloc((void **)&d_result, sizeof(A));
    cudaCheckErrors("main cudaMalloc fail");

    mykernel<<<1,1>>>(my_obj, d_result);
    // Launch-configuration errors are reported here; faults raised while the
    // kernel runs surface at the synchronize below.
    cudaCheckErrors("kernel launch fail");
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");

    h_result.copyobj(d_result);
    printf("host data %f\n", h_result.data[0]);

    // Release what this function can reach: the result slot on the device and
    // the two host buffers obtained through malloc() inside class A.
    cudaFree(d_result);
    cudaCheckErrors("main cudaFree fail");
    free(h_result.data);
    free(my_obj.data);
    return 0;
}