c - 如何在内核中动态分配数组？

Question

我需要在内核函数中动态分配一些数组。我怎么能这样做？

我的代码是这样的：

__global__ func(float *grid_d,int n, int nn){  
    int i,j;  
    float x[n],y[nn];  
    //Do some really cool and heavy computations here that takes hours.  
}

但这行不通。如果这是在主机代码中，我可以使用 malloc。cudaMalloc 需要主机上的指针和设备上的其他指针。在内核函数内部，我没有主机指针。

所以我该怎么做？

如果需要太长时间（几秒钟）来分配所有数组（我需要大约 4 个大小为 n 和 5 个大小为 nn），这不会是一个问题。因为内核可能至少会运行 20 分钟。

score 29 · Accepted Answer

动态内存分配仅在计算能力 2.x 和更新的硬件上受支持。您可以在内核中使用 C++ new 关键字或 malloc，因此您的示例可以变为：

__global__ func(float *grid_d,int n, int nn){  
    int i,j;  
    float *x = new float[n], *y = new float[nn];   
}

这会在具有上下文生命周期的本地内存运行时堆上分配内存，因此如果您不打算再次使用内存，请确保在内核完成运行后释放内存。您还应该注意，运行时堆内存不能直接从主机 API 访问，因此您不能将在内核中分配的指针作为参数传递cudaMemcpy给例如。

score 14 · Accepted Answer

@talonmies 回答了您关于如何在内核中动态分配内存的问题。这旨在作为补充答案，解决__device__ malloc()您可能要考虑的性能和替代方案。

在内核中动态分配内存可能很诱人，因为它允许 GPU 代码看起来更像 CPU 代码。但它会严重影响性能。我写了一个独立的测试并将其包含在下面。该测试启动了大约 260 万个线程。每个线程使用从线程索引派生的一些值填充 16 个整数的全局内存，然后对这些值求和并返回总和。

该测试实现了两种方法。第一种方法使用__device__ malloc()，第二种方法使用在内核运行之前分配的内存。

在我的 2.0 设备上，内核在使用时运行时间为 1500__device__ malloc()毫秒，在使用预分配内存时运行时间为 27 毫秒。换句话说，当内存在内核中动态分配时，测试的运行时间要长 56 倍。时间包括外部循环cudaMalloc()/ cudaFree()，它不是内核的一部分。如果使用相同数量的线程多次启动同一个内核（通常是这种情况），则cudaMalloc()/的成本将cudaFree()在所有内核启动中分摊。这使差异更大，达到 60 倍左右。

推测，我认为性能下降部分是由隐式序列化引起的。GPU 可能必须序列化所有同时调用__device__ malloc()，以便为每个调用者提供单独的内存块。

不使用的版本会__device__ malloc()在运行内核之前分配所有的 GPU 内存。指向内存的指针被传递给内核。每个线程计算一个索引到先前分配的内存中，而不是使用__device__ malloc().

预先分配内存的潜在问题是，如果只有一些线程需要分配内存，并且不知道这些线程是哪些线程，则有必要为所有线程分配内存。如果没有足够的内存，那么减少每个内核调用的线程数可能比使用__device__ malloc(). 其他解决方法可能最终会重新实现 __device__ malloc()在后台执行的操作，并且会看到类似的性能损失。

测试性能__device__ malloc()：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

const int N_ITEMS(16);

#define USE_DYNAMIC_MALLOC

__global__ void test_malloc(int* totals)
{
  int tx(blockIdx.x * blockDim.x + threadIdx.x);

  int* s(new int[N_ITEMS]);

  for (int i(0); i < N_ITEMS; ++i) {
    s[i] = tx * i;
  }

  int total(0);
  for (int i(0); i < N_ITEMS; ++i) {
    total += s[i];
  }

  totals[tx] = total;

  delete[] s;
}

__global__ void test_malloc_2(int* items, int* totals)
{
  int tx(blockIdx.x * blockDim.x + threadIdx.x);

  int* s(items + tx * N_ITEMS);

  for (int i(0); i < N_ITEMS; ++i) {
    s[i] = tx * i;
  }

  int total(0);
  for (int i(0); i < N_ITEMS; ++i) {
    total += s[i];
  }

  totals[tx] = total;
}

int main()
{
  cudaError_t cuda_status;

  cudaSetDevice(0);

  int blocks_per_launch(1024 * 10);
  int threads_per_block(256);

  int threads_per_launch(blocks_per_launch * threads_per_block);

  int* totals_d;
  cudaMalloc((void**)&totals_d, threads_per_launch * sizeof(int));

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaDeviceSynchronize();
  cudaEventRecord(start, 0);

#ifdef USE_DYNAMIC_MALLOC
  cudaDeviceSetLimit(cudaLimitMallocHeapSize, threads_per_launch * N_ITEMS * sizeof(int));

  test_malloc<<<blocks_per_launch, threads_per_block>>>(totals_d);
#else
  int* items_d;
  cudaMalloc((void**)&items_d, threads_per_launch * sizeof(int) * N_ITEMS);

  test_malloc_2<<<blocks_per_launch, threads_per_block>>>(items_d, totals_d);

  cudaFree(items_d);
#endif

  cuda_status = cudaDeviceSynchronize();
  if (cuda_status != cudaSuccess) {
    printf("Error: %d\n", cuda_status);
    exit(1);
  }

  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  float elapsedTime;
  cudaEventElapsedTime(&elapsedTime, start, stop);

  printf("Elapsed: %f\n", elapsedTime);

  int* totals_h(new int[threads_per_launch]);
  cuda_status = cudaMemcpy(totals_h, totals_d, threads_per_launch * sizeof(int), cudaMemcpyDeviceToHost);
  if (cuda_status != cudaSuccess) {
    printf("Error: %d\n", cuda_status);
    exit(1);
  }

  for (int i(0); i < 10; ++i) {
    printf("%d ", totals_h[i]);
  }
  printf("\n");

  cudaFree(totals_d);
  delete[] totals_h;

  return cuda_status;
}

输出：

C:\rd\projects\test_cuda_malloc\Release>test_cuda_malloc.exe
Elapsed: 27.311169
0 120 240 360 480 600 720 840 960 1080

C:\rd\projects\test_cuda_malloc\Release>test_cuda_malloc.exe
Elapsed: 1516.711914
0 120 240 360 480 600 720 840 960 1080

score 2 · Accepted Answer

如果在调用内核之前 n 和 nn 的值是已知的，那么为什么不 cudaMalloc 主机端的内存并将设备内存指针传递给内核呢？

score 2 · Accepted Answer

根据@rogerdahl 帖子中的概念进行实验。假设：

以 64B 块分配的 4MB 内存。
该块中有 1 个 GPU 块和 32 个扭曲线程
在 P100 上运行

GPU 本地的 malloc+free 调用似乎比cudaMalloc+cudaFree调用快得多。程序的输出：

Starting timer for cuda malloc timer
Stopping timer for cuda malloc timer
         timer for cuda malloc timer took 1.169631s
Starting timer for device malloc timer
Stopping timer for device malloc timer
         timer for device malloc timer took 0.029794s

我省略了timer.hand的代码timer.cpp，但这里是测试本身的代码：

#include "cuda_runtime.h"
#include <stdio.h>
#include <thrust/system/cuda/error.h>

#include "timer.h"

static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)

const int BLOCK_COUNT = 1;
const int THREADS_PER_BLOCK = 32;
const int ITERATIONS = 1 << 12;
const int ITERATIONS_PER_BLOCKTHREAD = ITERATIONS / (BLOCK_COUNT * THREADS_PER_BLOCK);

const int ARRAY_SIZE = 64;


void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err) {
    if (err == cudaSuccess)
        return;
    std::cerr << statement<<" returned " << cudaGetErrorString(err) << "("<<err<< ") at "<<file<<":"<<line << std::endl;
    exit (1);
}

__global__ void mallocai() {
    for (int i = 0; i < ITERATIONS_PER_BLOCKTHREAD; ++i) {
        int * foo;
        foo = (int *) malloc(sizeof(int) * ARRAY_SIZE);
        free(foo);
    }
}

int main() {

    Timer cuda_malloc_timer("cuda malloc timer");

    for (int i = 0; i < ITERATIONS; ++ i) {
        if (i == 1) cuda_malloc_timer.start(); // let it warm up one cycle
        int * foo;
        cudaMalloc(&foo, sizeof(int) * ARRAY_SIZE);
        cudaFree(foo);
    }
    cuda_malloc_timer.stop_and_report();
    CUDA_CHECK_RETURN(cudaDeviceSynchronize());

    Timer device_malloc_timer("device malloc timer");
    device_malloc_timer.start();
    mallocai<<<BLOCK_COUNT, THREADS_PER_BLOCK>>>();
    CUDA_CHECK_RETURN(cudaDeviceSynchronize());
    device_malloc_timer.stop_and_report();
}

如果您发现错误，请在评论中 lmk，我会尽力修复它们。

我用更大的东西再次运行它们：

const int BLOCK_COUNT = 56;
const int THREADS_PER_BLOCK = 1024;
const int ITERATIONS = 1 << 18;
const int ITERATIONS_PER_BLOCKTHREAD = ITERATIONS / (BLOCK_COUNT * THREADS_PER_BLOCK);

const int ARRAY_SIZE = 1024;

cudaMalloc 仍然慢了很多：

Starting timer for cuda malloc timer
Stopping timer for cuda malloc timer
         timer for cuda malloc timer took 74.878016s
Starting timer for device malloc timer
Stopping timer for device malloc timer
         timer for device malloc timer took 0.167331s

score 0 · Accepted Answer

也许你应该测试

cudaMalloc(&foo,sizeof(int) * ARRAY_SIZE * ITERATIONS);
cudaFree(foo);

反而

for (int i = 0; i < ITERATIONS; ++ i) {
    if (i == 1) cuda_malloc_timer.start(); // let it warm up one cycle
    int * foo;
    cudaMalloc(&foo, sizeof(int) * ARRAY_SIZE);
    cudaFree(foo);
}

c - 如何在内核中动态分配数组？

5 回答 5

Related

Reference