0

我正在编写一段存在数组依赖的 OpenACC 代码:内层循环的每次迭代都可能更新数组的同一位置。代码如下:

    long unsigned int digits[d + 11];
    for (long unsigned int digit = 0; digit < d + 11; ++digit)
            digits[digit] = 0;

    for (long unsigned int i = 1; i <= n; ++i) {
            long unsigned int remainder = 1;
            for (long unsigned int digit = 0; digit < d + 11 && remainder; ++digit) {
                    long unsigned int div = remainder / i;
                    long unsigned int mod = remainder % i;
                    digits[digit] += div; // here
                    remainder = mod * 10;
            }
    }

OpenMP 版本的写法如下:

    /* Parallel region: the shared loop index i is made private to each thread. */
    #pragma omp parallel private(i)
    {
            /* Declared inside the region, so each thread gets its own
             * accumulator array; it is zeroed before use. */
            long unsigned int digit_local[d+11];
            for(i=0;i<d+11;i++)
                    digit_local[i] = 0;

            /* Work-sharing loop: the iterations i = 1..n are divided among
             * the threads, each adding into its own digit_local. */
            #pragma omp for
            for (i = 1; i <= n; ++i) {
                    long unsigned int remainder = 1;
                    /* Long division of 1 by i; stops once the remainder is 0. */
                    for (long unsigned int digit = 0; digit < d + 11 && remainder; ++digit) {
                            long unsigned int div = remainder / i;
                            long unsigned int mod = remainder % i;
                            digit_local[digit] += div;
                            remainder = mod * 10;
                    }
            }

            /* Merge step: the critical construct applies to the whole loop
             * below, so one thread at a time adds its partial sums into the
             * shared digits array. */
            #pragma omp critical
            for(long unsigned int digit = 0; digit < d+11; ++digit)
                    digits[digit] += digit_local[digit];

    }

在 OpenACC 中,private 关键字可以用于数组,但我不知道如何把各个私有数组的结果合并回全局数组。

谢谢。

4

1 回答 1

0

您可以使用 OpenACC 的 atomic update(原子更新)指令。

            /* Perform the += on the shared element as an atomic update so that
             * concurrent iterations targeting the same digits[digit] do not race. */
            #pragma acc atomic update
            digits[digit] += div; // here

或者,您可以执行与您的 OpenMP 版本类似的操作。

    /*
     * Race-free alternative to the atomic update: each iteration i of the
     * first kernel accumulates into its own column of a scratch matrix, and a
     * second kernel sums every row of that matrix into digits[].
     *
     * Column (i - 1) belongs to divisor i, so the n columns 0..n-1 cover
     * i = 1..n without indexing past the end of the array.
     *
     * NOTE(review): digit_local is still declared as a variable-length array
     * of (d + 11) * n elements; for large n a heap allocation may be needed --
     * confirm against the real problem sizes.
     */
    long unsigned int digit_local[d+11][n];
    #pragma acc data create(digit_local) copyout(digits)
    {

    #pragma acc parallel loop gang vector
    for (long unsigned int i = 1; i <= n; ++i) {
            const long unsigned int col = i - 1;

            /* Clear the whole column first: the division loop below can stop
             * early and would otherwise leave trailing entries uninitialized. */
            for (long unsigned int j = 0; j < d + 11; ++j)
                    digit_local[j][col] = 0;

            long unsigned int remainder = 1;
            for (long unsigned int digit = 0; digit < d + 11 && remainder; ++digit) {
                    long unsigned int div = remainder / i;
                    long unsigned int mod = remainder % i;
                    digit_local[digit][col] += div;
                    remainder = mod * 10;
            }
    }

    /* One gang iteration per digit position; the vector lanes reduce that
     * position's row across all n columns. */
    #pragma acc parallel loop gang
    for (long unsigned int digit = 0; digit < d + 11; ++digit) {
            long unsigned int dsum = 0;
            #pragma acc loop vector reduction(+:dsum)
            for (long unsigned int col = 0; col < n; ++col) {
                    dsum += digit_local[digit][col];
            }
            digits[digit] = dsum;
    }
    }

不过,我不确定这种做法是否能带来任何加速。

希望这对您有所帮助,Mat

于 2016-09-21T15:28:47.600 回答