0

我想通过 OpenACC 优化此代码,但输出计算量为零。我很感激有机会以这种方式帮助我,并使用您的指导来取得成功并解决我的问题。

国王问候,穆罕默德

  #include <stdio.h>
  #include <math.h>
  #include <stdlib.h>
  #include <assert.h>
  #include <openacc.h>
  #include<time.h>
  #include <string.h>
  #include <malloc.h>

  #define NX 4
  #define NY 4
  #define NZ 4
  int main(void)
  {
  int i, j,p, k,m;
  static double A[NX][NY][NZ]={0.} ,B[NX][NY][NZ]={0.},C[NX][NY][NZ]={0.},D[NX][NY][NZ]={0.};
  FILE *file;
  file = fopen("B-and-A.csv", "w");
#pragma acc data copyin(C,D),copy(A,B)
 {
for (p = 0; p <=5; p++) {
    #pragma acc kernels
    for ( i = 1; i < NX - 1; i++ ) {
            for ( j = 0; j < NY - 1; j++ ) {
                for ( k = 0; k < NZ - 1; k++ ) {
                 A[i][j][k] = A[i][j][k] + 1.*( B[i][j+1][k] + D[i][j][k] );
                        }
                    }
                }
#pragma acc kernels
 for ( i = 1; i < NX - 1; i++ ) {
        for ( j = 0; j < NY - 1; j++ ) {
            for ( k = 0; k < NZ - 1; k++ ) {
             B[i][j][k] = B[i][j][k]+ 1.*( A[i][j+1][k] + D[i][j][k] );
            }
        }
    }

    for (m = 0; m < NZ - 1; m++) {
        A[0][m][m] = -25. ;
        A[2][m][m] = 52. ;
        B[0][m][m] = 15. ;
        B[2][m][m] = -55. ;
                                }
#pragma acc update self(B)
fprintf(file,"%e\n",B[2][2][2]);
printf("%e\n",B[2][2][2]);
}
}
  fclose(file);
}
4

1 回答 1

0

当您执行“更新自身(B)”时,您将 B 的设备值复制到 B 的主机副本。在此之前您对主机副本所做的任何更改都将丢失。

对于此代码,您需要在主机上执行循环之前更新 B,然后在主机循环之后更新设备副本。或者,您可以将循环卸载到 GPU,以便所有计算都在设备上完成。

选项1:

// Update the host copies before executing on the host
#pragma acc update self(A,B)
                        // This loop is executed on the host
                        for (m = 0; m < NZ - 1; m++) {
                                A[0][m][m] = -25. ;
                                A[2][m][m] = 52. ;
                                B[0][m][m] = 15. ;
                                B[2][m][m] = -55. ;
                        }
// To keep the device and host copies coherent, update the device copies
#pragma acc update device(A,B)

选项#2:

// Or offload the loop to the device
#pragma acc kernels
                        for (m = 0; m < NZ - 1; m++) {
                                A[0][m][m] = -25. ;
                                A[2][m][m] = 52. ;
                                B[0][m][m] = 15. ;
                                B[2][m][m] = -55. ;
                        }
#pragma acc update self(B)
于 2018-03-21T15:53:49.007 回答