matrix - 矩阵的零碎处理 - CUDA

Question

好的，假设我有一个 ( N x N ) 矩阵要处理。这个矩阵对于我的计算机来说非常大，如果我尝试一次将它全部发送到设备，我会收到“内存不足错误”。

那么有没有办法将矩阵的各个部分发送到设备？我可以看到的一种方法是在主机上复制矩阵的部分，然后将这些可管理的复制部分从主机发送到设备，然后最后将它们放回原处。

这是我尝试过的方法，但 for 循环中的 cudaMemcpy 返回错误代码 11，“无效参数”。

int h_N = 10000;
size_t h_size_m = h_N*sizeof(float);
h_A  = (float*)malloc(h_size_m*h_size_m);

int d_N = 2500;
size_t d_size_m = d_N*sizeof(float);

InitializeMatrices(h_N);

int i;
int iterations = (h_N*h_N)/(d_N*d_N);

for( i = 0; i < iterations; i++ ) 
{
    float* h_array_ref = h_A+(i*d_N*d_N);
    cudasafe( cudaMemcpy(d_A, h_array_ref, d_size_m*d_size_m, cudaMemcpyHostToDevice), "cudaMemcpy");
    cudasafe( cudaFree(d_A), "cudaFree(d_A)" );
}

我试图用上面的代码完成的是：我没有将整个矩阵发送到设备，而是发送一个指向该矩阵中某个位置的指针，并在设备上保留足够的空间来完成这项工作，然后使用循环的下一次迭代将指针在矩阵中向前移动，等等。

score 4 · Accepted Answer

您不仅可以做到这一点（假设您的问题很容易以这种方式分解为子数组），而且对性能来说也是一件非常有用的事情；一旦您获得了您描述的基本方法，您就可以开始使用异步内存副本和双缓冲来重叠一些内存传输时间与计算已经在卡上的时间所花费的时间。

但是第一个让简单的事情起作用。下面是一个 1d 示例（将向量乘以标量并添加另一个标量），但使用线性化的 2d 数组将是相同的；关键部分是

CHK_CUDA( cudaMalloc(&xd, batchsize*sizeof(float)) );
CHK_CUDA( cudaMalloc(&yd, batchsize*sizeof(float)) );
tick(&gputimer);

int nbatches = 0;
for (int nstart=0; nstart < n; nstart+=batchsize) {

    int size=batchsize;
    if ((nstart + batchsize) > n) size = n - nstart;

    CHK_CUDA( cudaMemcpy(xd, &(x[nstart]), size*sizeof(float), cudaMemcpyHostToDevice) );

    blocksize = (size+nblocks-1)/nblocks;
    cuda_saxpb<<<nblocks, blocksize>>>(xd, a, b, yd, size);

    CHK_CUDA( cudaMemcpy(&(ycuda[nstart]), yd, size*sizeof(float), cudaMemcpyDeviceToHost) );

    nbatches++;
}
gputime = tock(&gputimer);

CHK_CUDA( cudaFree(xd) );
CHK_CUDA( cudaFree(yd) );

您在开始时分配缓冲区，然后循环直到完成，每次都进行复制，启动内核，然后再复制回来。你最后自由了。

完整的代码是

#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <cuda.h>
#include <sys/time.h>
#include <math.h>

#define CHK_CUDA(e) {if (e != cudaSuccess) {fprintf(stderr,"Error: %s\n", cudaGetErrorString(e)); exit(-1);}}

__global__ void cuda_saxpb(const float *xd, const float a, const float b,
                           float *yd, const int n) {

    int i = threadIdx.x + blockIdx.x*blockDim.x;
    if (i<n) {
        yd[i] = a*xd[i]+b;
    }
    return;
}

void cpu_saxpb(const float *x, float a, float b, float *y, int n) {

    int i;
    for (i=0;i<n;i++) {
        y[i] = a*x[i]+b;
    }
    return;
}

int get_options(int argc, char **argv, int *n, int *s, int *nb, float *a, float *b);
void tick(struct timeval *timer);
double tock(struct timeval *timer);

int main(int argc, char **argv) {
    int n=1000;
    int nblocks=10;
    int batchsize=100;
    float a = 5.;
    float b = -1.;
    int err;
    float *x, *y, *ycuda;
    float *xd, *yd;
    double abserr;
    int blocksize;
    int i;
    struct timeval cputimer;
    struct timeval gputimer;
    double cputime, gputime;

    err = get_options(argc, argv, &n, &batchsize, &nblocks, &a, &b);
    if (batchsize > n) {
        fprintf(stderr, "Resetting batchsize to size of vector, %d\n", n);
        batchsize = n;
    }
    if (err) return 0;

    x = (float *)malloc(n*sizeof(float));
    if (!x) return 1;

    y = (float *)malloc(n*sizeof(float));
    if (!y) {free(x); return 1;}

    ycuda = (float *)malloc(n*sizeof(float));
    if (!ycuda) {free(y); free(x); return 1;}

    /* run CPU code */

    tick(&cputimer);
    cpu_saxpb(x, a, b, y, n);
    cputime = tock(&cputimer);

    /* run GPU code */

    /* only have to allocate once */
    CHK_CUDA( cudaMalloc(&xd, batchsize*sizeof(float)) );
    CHK_CUDA( cudaMalloc(&yd, batchsize*sizeof(float)) );
    tick(&gputimer);

    int nbatches = 0;
    for (int nstart=0; nstart < n; nstart+=batchsize) {

        int size=batchsize;
        if ((nstart + batchsize) > n) size = n - nstart;

        CHK_CUDA( cudaMemcpy(xd, &(x[nstart]), size*sizeof(float), cudaMemcpyHostToDevice) );

        blocksize = (size+nblocks-1)/nblocks;
        cuda_saxpb<<<nblocks, blocksize>>>(xd, a, b, yd, size);

        CHK_CUDA( cudaMemcpy(&(ycuda[nstart]), yd, size*sizeof(float), cudaMemcpyDeviceToHost) );

        nbatches++;
    }
    gputime = tock(&gputimer);

    CHK_CUDA( cudaFree(xd) );
    CHK_CUDA( cudaFree(yd) );

    abserr = 0.;
    for (i=0;i<n;i++) {
        abserr += fabs(ycuda[i] - y[i]);
    }

    printf("Y = a*X + b, problemsize = %d\n", n);
    printf("CPU time = %lg millisec.\n", cputime*1000.);
    printf("GPU time = %lg millisec (done with %d batches of %d).\n",
                  gputime*1000., nbatches, batchsize);
    printf("CUDA and CPU results differ by %lf\n", abserr);

    free(x);
    free(y);
    free(ycuda);
    return 0;
}


int get_options(int argc, char **argv, int *n, int *s, int *nb, float *a, float *b) {

  const struct option long_options[] = {
    {"nvals"     , required_argument, 0, 'n'},
    {"nblocks"   , required_argument, 0, 'B'},
    {"batchsize" , required_argument, 0, 's'},
    {"a", required_argument, 0, 'a'},
    {"b", required_argument, 0, 'b'},
    {"help",      no_argument, 0, 'h'},
    {0, 0, 0, 0}};

  char c;
  int option_index;
  int tempint;

  while (1) {
    c = getopt_long(argc, argv, "n:B:a:b:s:h", long_options, &option_index);
    if (c == -1) break;

    switch(c) {
      case 'n': tempint = atoi(optarg);
          if (tempint < 1 || tempint > 500000) {
            fprintf(stderr,"%s: Cannot use number of points %s;\n  Using %d\n", argv[0], optarg, *n);
          } else {
            *n = tempint;
          }
          break;

      case 's': tempint = atoi(optarg);
          if (tempint < 1 || tempint > 50000) {
            fprintf(stderr,"%s: Cannot use number of points %s;\n  Using %d\n", argv[0], optarg, *s);
          } else {
            *s = tempint;
          }
          break;

      case 'B': tempint = atoi(optarg);
          if (tempint < 1 || tempint > 1000 || tempint > *n) {
            fprintf(stderr,"%s: Cannot use number of blocks %s;\n  Using %d\n", argv[0], optarg, *nb);
          } else {
            *nb = tempint;
          }
          break;

      case 'a': *a = atof(optarg);
          break;

      case 'b': *b = atof(optarg);
          break;

      case 'h':
          puts("Calculates y[i] = a*x[i] + b on the GPU.");
          puts("Options: ");
          puts("    --nvals=N      (-n N): Set the number of values in y,x.");
          puts("    --batchsize=N  (-s N): Set the number of values to transfer at a time.");
          puts("    --nblocks=N    (-B N): Set the number of blocks used.");
          puts("    --a=X          (-a X): Set the parameter a.");
          puts("    --b=X          (-b X): Set the parameter b.");
          puts("    --niters=N     (-I X): Set number of iterations to calculate.");
          puts("");
          return +1;
        }
    }

    return 0;
}

void tick(struct timeval *timer) {
    gettimeofday(timer, NULL);
}

double tock(struct timeval *timer) {
    struct timeval now;
    gettimeofday(&now, NULL);
    return (now.tv_usec-timer->tv_usec)/1.0e6 + (now.tv_sec - timer->tv_sec);
}

运行这个得到：

$  ./batched-saxpb --nvals=10240 --batchsize=10240 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.072 millisec.
GPU time = 0.117 millisec (done with 1 batches of 10240).
CUDA and CPU results differ by 0.000000

$ ./batched-saxpb --nvals=10240 --batchsize=5120 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.066 millisec.
GPU time = 0.133 millisec (done with 2 batches of 5120).
CUDA and CPU results differ by 0.000000

$ ./batched-saxpb --nvals=10240 --batchsize=2560 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.067 millisec.
GPU time = 0.167 millisec (done with 4 batches of 2560).
CUDA and CPU results differ by 0.000000

在这种情况下，GPU 时间会增加（我们正在做更多的内存副本），但答案保持不变。

已编辑：此代码的原始版本有一个选项可以运行内核的多次迭代以用于计时目的，但这在这种情况下会造成不必要的混淆，因此已将其删除。

matrix - 矩阵的零碎处理 - CUDA

1 回答 1

Related

Reference