I have the following problem (keep in mind that I am fairly new to programming with CUDA):

I have a class called vec3f that is similar to the float3 data type, but with overloaded operators and other vector functions. These functions are prefixed with __device__ __host__ (a trimmed-down sketch of the class appears after the kernel snippet below). Then, in my kernel, I do a nested for loop over the block_x and block_y indices and do something like the following:

//set up shared memory block
extern __shared__ vec3f share[];
vec3f *sh_pos = share;
vec3f *sh_velocity = &sh_pos[blockDim.x*blockDim.y];
sh_pos[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].position();
sh_velocity[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].velocity();
__syncthreads();
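
For reference, here is the trimmed-down vec3f sketch I mentioned; it is illustrative only, since the real header has more functions:

//minimal sketch of a vec3f-style class with __host__ __device__ members
class vec3f {
public:
  float x, y, z;
  __host__ __device__ vec3f() : x(0), y(0), z(0) {}
  __host__ __device__ vec3f(float a, float b, float c) : x(a), y(b), z(c) {}
  //operator overloads usable from both host and device code
  __host__ __device__ vec3f operator+(const vec3f& v) const {
    return vec3f(x + v.x, y + v.y, z + v.z);
  }
  __host__ __device__ vec3f operator*(float s) const {
    return vec3f(x * s, y * s, z * s);
  }
};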

In the kernel snippet, oldParticles is a pointer to a class called particle that gets passed to the kernel. oldParticles is actually the underlying pointer of a thrust::device_vector (I'm not sure if that has anything to do with this). Everything compiles, but when I run it I get the error:

libc++abi.dylib: terminate called throwing an exception
Abort trap: 6

Thanks for the replies. I think the error had to do with not allocating space for the arguments being passed to the kernel. Doing the following in my host code fixed the error:

particle* particle_ptrs[2];
particle_ptrs[0] = thrust::raw_pointer_cast(&d_old_particles[0]);
particle_ptrs[1] = thrust::raw_pointer_cast(&d_new_particles[0]);
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) ) );
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) ) );

Then the kernel call is:

force_kernel<<< grid,block,sharedMemSize  >>>(particle_ptrs[0],particle_ptrs[1],time_step);
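
For the dynamic shared memory size, since the kernel splits the extern __shared__ block into two vec3f arrays (positions and velocities), I compute sharedMemSize along these lines (paraphrasing my actual code):

//room for two vec3f arrays of blockDim.x * blockDim.y elements each
size_t sharedMemSize = 2 * block.x * block.y * sizeof(vec3f);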

The problem I seem to have now is that I can't copy the data back from the device to the host. I think this has to do with my unfamiliarity with thrust.

I do a series of copies as follows:

//make a host vector; assume this is initialized
thrust::host_vector<particle> h_particles;
thrust::device_vector<particle> d_old_particles, d_new_particles;
d_old_particles = h_particles;
//launch kernel as shown above,
//with the thrust vectors having been cast to their underlying pointers
//particle_ptrs[1] gets modified, so shouldn't d_new_particles be too?
//copy back
h_particles = d_new_particles;

So I guess my question is: can I modify a thrust device vector through its raw pointer inside the kernel (in this case particle_ptrs[0]), save the modification into another thrust device vector inside the kernel (in this case particle_ptrs[1]), and then, once I exit the kernel, copy it to a host vector?

I still can't get this to work, so I made a shorter example that shows the same problem:

#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include "vec3f.h"
const int BLOCK_SIZE = 8;
const int max_particles = 64;
const float dt = 0.01;

using namespace std;
//particle class
class particle {
public:
  particle() : 
    _velocity(vec3f(0,0,0)), _position(vec3f(0,0,0)), _density(0.0) {
  };
  particle(const vec3f& pos, const vec3f& vel) :
    _position(pos), _velocity(vel), _density(0.0) {
  };

  vec3f _velocity;
  vec3f _position;
  float _density;
};

//forward declaration of kernel func
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt);

//global thrust vectors
thrust::host_vector<particle> h_parts;
thrust::device_vector<particle> old_parts, new_parts;
particle* particle_ptrs[2];

int main() {
  //load host vector
  for (int i =0; i<max_particles; i++) {
    h_parts.push_back(particle(vec3f(0.5,0.5,0.5),vec3f(10,10,10)));
  }

  particle_ptrs[0] = thrust::raw_pointer_cast(&old_parts[0]);
  particle_ptrs[1] = thrust::raw_pointer_cast(&new_parts[0]);
  cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) );
  cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) );
  //copy host particles to old device particles...
  old_parts = h_parts;
  //kernel block and grid dimensions
  dim3 block(BLOCK_SIZE,BLOCK_SIZE,1);
  dim3 grid(int(sqrt(float(max_particles) / (float(block.x*block.y)))), int(sqrt(float(max_particles) / (float(block.x*block.y)))), 1);
  kernel_func<<<block,grid>>>(particle_ptrs[0],particle_ptrs[1],dt);
  //copy new device particles back to host particles
  h_parts = new_parts;
  for (int i =0; i<max_particles; i++) {
    particle temp1 = h_parts[i];
    cout << temp1._position << endl;
  }  
  //delete thrust device vectors
  old_parts.clear();
  old_parts.shrink_to_fit();
  new_parts.clear();
  new_parts.shrink_to_fit();
  return 0;
}

//kernel function
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt) {
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
  //get array position for 2d grid...
  unsigned int arr_pos = y*blockDim.x*gridDim.x + x;

  new_parts[arr_pos]._velocity = old_parts[arr_pos]._velocity * 10.0 * dt;
  new_parts[arr_pos]._position = old_parts[arr_pos]._position * 10.0 * dt;
  new_parts[arr_pos]._density = old_parts[arr_pos]._density * 10.0 * dt;
}

So the host vector starts with a position of (0.5,0.5,0.5) for all 64 particles. The kernel then tries to multiply that by 10 to give (5,5,5) as the position of every particle. But I don't see that when I "cout" the data; it is still just (0.5,0.5,0.5). Is there a problem with how I am allocating the memory? Is there a problem with these lines:

  //copy new device particles back to host particles
  h_parts = new_parts;

What might the problem be? Thank you.

1 Answer

There are various problems with the code you posted:

  • You had your block and grid variables reversed in your kernel call. grid comes first.
  • You should do cuda error checking on your kernel calls and runtime API calls (see the sketch after this list).
  • Your method of allocating storage with cudaMalloc, using pointers raw-cast from empty device vectors, is not sensible. The vector containers have no knowledge that you did this "under the hood." Instead, you can allocate storage for a device vector directly when you instantiate it, like:

    thrust::device_vector<particle> old_parts(max_particles), new_parts(max_particles);
    
  • You say you were expecting 5,5,5, but your kernel multiplies by 10 and then by dt, which is 0.01, so I believe the correct output is 0.05, 0.05, 0.05 (e.g. 0.5 * 10 * 0.01 = 0.05).
  • Your grid computation (int(sqrt...)) was not guaranteed, for arbitrary max_particles, to produce enough blocks (converting a float to int truncates, i.e. rounds down) and might also produce extra blocks (if it rounds up). The round-down case is the bad one; we should handle it by using the ceil function or another grid computation method. The round-up case (which is what ceil will give) is OK, but we then have to handle the fact that the grid may launch extra blocks/threads, which we do with a thread check in the kernel. There were other problems with the grid computation as well: we want to take the square root of max_particles and then divide it by the block dimension in a particular direction to get the grid dimension in that direction. (For max_particles = 64 and an 8x8 block, sqrt(64) = 8 and 8/8 = 1, so a 1x1 grid.)
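
As a sketch of the kind of error checking I mean (one common pattern; requires <cstdio> and <cstdlib>):

//check the result of the most recent CUDA runtime call or kernel launch
#define cudaCheckErrors(msg) \
  do { \
    cudaError_t __err = cudaGetLastError(); \
    if (__err != cudaSuccess) { \
      fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
        msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
      exit(1); \
    } \
  } while (0)

//usage, e.g. after the kernel launch:
//  kernel_func<<<grid,block>>>(particle_ptrs[0],particle_ptrs[1],dt);
//  cudaDeviceSynchronize();
//  cudaCheckErrors("kernel failed");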

Here is some code that I've modified with these changes, and it seems to produce the correct output (0.05, 0.05, 0.05). Note that I had to make some other changes because I don't have your "vec3f.h" header file handy, so I used float3 instead:

#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <vector_functions.h>

const int BLOCK_SIZE = 8;
const int max_particles = 64;
const float dt = 0.01;

using namespace std;
//particle class
class particle {
public:
  particle() :
    _velocity(make_float3(0,0,0)), _position(make_float3(0,0,0)), _density(0.0)
 {
  };
  particle(const float3& pos, const float3& vel) :
    _position(pos), _velocity(vel), _density(0.0)
 {
  };

  float3 _velocity;
  float3 _position;
  float _density;
};

//forward declaration of kernel func
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt);


int main() {
  //global thrust vectors
  thrust::host_vector<particle> h_parts;
  particle* particle_ptrs[2];
  //load host vector
  for (int i =0; i<max_particles; i++) {
    h_parts.push_back(particle(make_float3(0.5,0.5,0.5),make_float3(10,10,10)));
  }

  //copy host particles to old device particles...
  thrust::device_vector<particle> old_parts = h_parts;
  thrust::device_vector<particle> new_parts(max_particles);
  particle_ptrs[0] = thrust::raw_pointer_cast(&old_parts[0]);
  particle_ptrs[1] = thrust::raw_pointer_cast(&new_parts[0]);
  //kernel block and grid dimensions
  dim3 block(BLOCK_SIZE,BLOCK_SIZE,1);
  dim3 grid((int)ceil(sqrt(float(max_particles)) / (float(block.x))), (int)ceil(sqrt(float(max_particles)) / (float(block.y))), 1);
  cout << "grid x: " << grid.x << "  grid y: "  << grid.y << endl;
  kernel_func<<<grid,block>>>(particle_ptrs[0],particle_ptrs[1],dt);
  //copy new device particles back to host particles
  cudaDeviceSynchronize();
  h_parts = new_parts;
  for (int i =0; i<max_particles; i++) {
    particle temp1 = h_parts[i];
    cout << temp1._position.x << "," << temp1._position.y << "," << temp1._position.z << endl;
  }
  //delete thrust device vectors
  old_parts.clear();
  old_parts.shrink_to_fit();
  new_parts.clear();
  new_parts.shrink_to_fit();

  return 0;
}

//kernel function
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt) {
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
  //get array position for 2d grid...
  unsigned int arr_pos = y*blockDim.x*gridDim.x + x;
  if (arr_pos < max_particles) {

    new_parts[arr_pos]._velocity.x = old_parts[arr_pos]._velocity.x * 10.0 * dt;
    new_parts[arr_pos]._velocity.y = old_parts[arr_pos]._velocity.y * 10.0 * dt;
    new_parts[arr_pos]._velocity.z = old_parts[arr_pos]._velocity.z * 10.0 * dt;
    new_parts[arr_pos]._position.x = old_parts[arr_pos]._position.x * 10.0 * dt;
    new_parts[arr_pos]._position.y = old_parts[arr_pos]._position.y * 10.0 * dt;
    new_parts[arr_pos]._position.z = old_parts[arr_pos]._position.z * 10.0 * dt;
    new_parts[arr_pos]._density = old_parts[arr_pos]._density * 10.0 * dt;
  }
}
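
If you want to try it, an ordinary nvcc build should be all that's needed (assuming the file is saved as t.cu):

nvcc -o t t.cu
./t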
answered 2013-05-25T00:32:20.637