I have the following problem (please bear in mind that I am fairly new to programming with CUDA).
I have a class called vec3f that is like the float3 data type but with overloaded operators and other vector functions. These functions are prefixed with __ device __ __ host __ (I added the spaces because otherwise those words get bolded). Then, in my kernel, I do a nested for loop over the block_x and block_y indices and do something like,
//set up shared memory block
extern __shared__ vec3f share[];
vec3f *sh_pos = share;
vec3f *sh_velocity = &sh_pos[blockDim.x*blockDim.y];
sh_pos[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].position();
sh_velocity[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].velocity();
__syncthreads();
In the code above, oldParticles is a pointer to a class called particles that is passed in to the kernel. oldParticles is actually the underlying pointer of a thrust::device_vector (I am not sure if that has anything to do with it). Everything compiles, but when I run it I get the error
libc++abi.dylib: terminate called throwing an exception
Abort trap: 6
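For context, vec3f is shaped roughly like this (a trimmed-down sketch of my class, not the full thing; the real one has more overloads):

#include <ostream>

class vec3f {
public:
    float x, y, z;

    __host__ __device__ vec3f() : x(0), y(0), z(0) {}
    __host__ __device__ vec3f(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}

    //overloaded operators, usable on both host and device
    __host__ __device__ vec3f operator*(float s) const { return vec3f(x*s, y*s, z*s); }
    __host__ __device__ vec3f operator+(const vec3f& o) const { return vec3f(x+o.x, y+o.y, z+o.z); }

    //host-side stream output (used later to print positions)
    friend std::ostream& operator<<(std::ostream& os, const vec3f& v) {
        return os << "(" << v.x << "," << v.y << "," << v.z << ")";
    }
};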
Thanks for the reply. I think the error had to do with me not allocating space for the arguments that get passed to the kernel. Doing the following in my host code fixed that error,
particle* particle_ptrs[2];
particle_ptrs[0] = thrust::raw_pointer_cast(&d_old_particles[0]);
particle_ptrs[1] = thrust::raw_pointer_cast(&d_new_particles[0]);
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) ) );
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) ) );
Then the kernel call is,
force_kernel<<< grid,block,sharedMemSize >>>(particle_ptrs[0],particle_ptrs[1],time_step);
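For completeness, given the two shared arrays in the first snippet, I size the dynamic shared memory like this (a sketch of what I believe the layout requires; block is my dim3 block dimension):

//one vec3f per thread for positions plus one per thread for velocities
size_t sharedMemSize = 2 * block.x * block.y * sizeof(vec3f);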
The problem I am having now seems to be that I can't get the data copied back from the device to the host. I think it has to do with me not being familiar with Thrust.
I am doing a series of copies as follows,
//make a host vector; assume it has been initialized
thrust::host_vector<particle> h_particles;
thrust::device_vector<particle> d_old_particles, d_new_particles;
d_old_particles = h_particles;
//launch kernel as shown above
//with the thrust vectors having been cast into their underlying pointers
//particle_ptrs[1] gets modified, so shouldn't d_new_particles be modified too?
//copy back
h_particles = d_new_particles;
So I guess my question is: can I modify a Thrust device vector passed into a kernel (in this case particle_ptrs[0]), have the kernel save the modifications into another Thrust device vector (in this case particle_ptrs[1]), and then, once I exit the kernel, copy that vector into a host vector?
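To make it concrete, here is the round trip I am assuming should work (a sketch; the kernel body is omitted, and sizing d_new_particles before the cast is my assumption about what Thrust requires):

//host data, assume initialized
thrust::host_vector<particle> h_particles;
//host -> device copy; d_new_particles is sized up front so its pointer is valid
thrust::device_vector<particle> d_old_particles = h_particles;
thrust::device_vector<particle> d_new_particles(h_particles.size());
particle* old_ptr = thrust::raw_pointer_cast(d_old_particles.data());
particle* new_ptr = thrust::raw_pointer_cast(d_new_particles.data());
force_kernel<<< grid, block, sharedMemSize >>>(old_ptr, new_ptr, time_step);
cudaDeviceSynchronize(); //wait for the kernel before copying back
//device -> host copy
h_particles = d_new_particles;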
I still can't get this to work. I made a shorter example and I run into the same problem,
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include "vec3f.h"

const int BLOCK_SIZE = 8;
const int max_particles = 64;
const float dt = 0.01;

using namespace std;

//particle class
class particle {
public:
    particle() :
        _velocity(vec3f(0,0,0)), _position(vec3f(0,0,0)), _density(0.0) {
    }
    particle(const vec3f& pos, const vec3f& vel) :
        _position(pos), _velocity(vel), _density(0.0) {
    }

    vec3f _velocity;
    vec3f _position;
    float _density;
};

//forward declaration of kernel func
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt);

//global thrust vectors
thrust::host_vector<particle> h_parts;
thrust::device_vector<particle> old_parts, new_parts;
particle* particle_ptrs[2];

int main() {
    //load host vector
    for (int i = 0; i < max_particles; i++) {
        h_parts.push_back(particle(vec3f(0.5,0.5,0.5), vec3f(10,10,10)));
    }

    particle_ptrs[0] = thrust::raw_pointer_cast(&old_parts[0]);
    particle_ptrs[1] = thrust::raw_pointer_cast(&new_parts[0]);
    cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) );
    cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) );

    //copy host particles to old device particles...
    old_parts = h_parts;

    //kernel block and grid dimensions
    dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1);
    dim3 grid(int(sqrt(float(max_particles) / (float(block.x*block.y)))), int(sqrt(float(max_particles) / (float(block.x*block.y)))), 1);
    kernel_func<<<block,grid>>>(particle_ptrs[0], particle_ptrs[1], dt);

    //copy new device particles back to host particles
    h_parts = new_parts;
    for (int i = 0; i < max_particles; i++) {
        particle temp1 = h_parts[i];
        cout << temp1._position << endl;
    }

    //delete thrust device vectors
    old_parts.clear();
    old_parts.shrink_to_fit();
    new_parts.clear();
    new_parts.shrink_to_fit();
    return 0;
}

//kernel function
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt) {
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    //get array position for 2d grid...
    unsigned int arr_pos = y*blockDim.x*gridDim.x + x;

    new_parts[arr_pos]._velocity = old_parts[arr_pos]._velocity * 10.0 * dt;
    new_parts[arr_pos]._position = old_parts[arr_pos]._position * 10.0 * dt;
    new_parts[arr_pos]._density  = old_parts[arr_pos]._density  * 10.0 * dt;
}
So the host vector starts out with a position of (0.5,0.5,0.5) for all 64 particles. The kernel is then supposed to multiply that by 10 to give (5,5,5) as the position of all the particles. But I don't see that when I cout the data; it is still just (0.5,0.5,0.5). Is there a problem with how I am allocating the memory? Is there a problem with the lines:
//copy new device particles back to host particles
h_parts = new_parts;
What could be the problem? Thank you.
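For reference, this is my mental model of the Thrust assignment semantics I am relying on, as a minimal self-contained sketch (no custom kernel, just copies and one element write):

#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

int main() {
    thrust::host_vector<float> h(4, 0.5f);   //host data
    thrust::device_vector<float> d = h;      //host -> device copy
    d[0] = 5.0f;                             //element write through the device_vector
    thrust::host_vector<float> h2 = d;       //device -> host copy
    std::cout << h2[0] << " " << h2[1] << std::endl; //prints: 5 0.5
    return 0;
}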