1

我有以下简单的代码,由 4 个文件组成:

//Data.h:
#ifndef DATA_H
#define DATA_H

// Number of elements in each of the global arrays below.
constexpr int N=10000000;
// Global arrays shared with the other files through this header.
// NOTE(review): these are definitions, not declarations, so including this
// header from more than one translation unit would define them twice — confirm
// it is only ever pulled into a single .cpp file.
namespace data{
  float particles[N];    // read and written by ProcessImpl::Get1/Get2 and by main()
  float outputArray[N];  // listed in main()'s `acc data create` clause
}
#endif

//Process.h:
#ifndef PROCESS_H
#define PROCESS_H

/// Thin wrapper that runs a per-index implementation over an index range
/// inside OpenACC parallel loops.
///
/// @tparam ProcessImpl  Type providing `float Get1(int, float*) const` and
///                      `void Get2(int) const`; each is called once per index.
template <typename ProcessImpl>
class Process{
public:
  using Base_t = ProcessImpl;
  /// Stores a copy of @p arg; that copy is what Get1/Get2 call into.
  Process(ProcessImpl arg):fProcessImpl(arg){}
  /// Calls ProcessImpl::Get1 for every index in [0, N) and stores each
  /// returned value in outputArray[index].
  void Get1(int N, float * outputArray) const;
  /// Calls ProcessImpl::Get2 for every index in [0, N).
  void Get2(int N) const;
private:
  ProcessImpl fProcessImpl;
};

/// @param N            Number of indices to process.
/// @param outputArray  Destination array with at least N elements; the
///                     `present` clause states it is already on the device.
template <class ProcessImpl>
void Process<ProcessImpl>::Get1(int N, float * outputArray) const
{
#pragma acc parallel loop gang vector present(outputArray)
  for(int ind=0; ind < N; ++ind){
    // The implementation receives outputArray as well and may write to it;
    // the value it returns is stored afterwards, so that is what the slot
    // finally holds.
    const float value = fProcessImpl.Get1(ind, outputArray);
    outputArray[ind] = value;
  }
}

/// @param N  Number of indices to process.
template <class ProcessImpl>
void Process<ProcessImpl>::Get2(int N) const
{
#pragma acc parallel loop gang vector
  for (int ind = 0; ind < N; ++ind){fProcessImpl.Get2(ind);}
}
#endif

//ProcessImpl.h:
#ifndef PROCESSIMPL_H
#define PROCESSIMPL_H

#include "Data.h"
using namespace data;
// Per-index operations on the global array `particles`, which is reached
// through the `using namespace data;` directive that precedes this class.
class ProcessImpl
{
public:
  // Copies particles[ind] into outputArray[ind] and returns the following
  // element, particles[ind+1].
  // NOTE(review): for ind == N-1 the returned element is particles[N], one
  // past the end of the array — confirm callers never pass the last index.
  inline float Get1(int ind, float * outputArray) const
  {
    float * const slot = outputArray + ind;
    *slot = particles[ind];
    return particles[ind+1];
  }

  // Doubles particles[ind] in place.
  inline void Get2(int ind) const
  {
    particles[ind] *= 2;
  }
};
#endif

//main.cpp:
#include <iostream>
#include "Data.h"
#include "Process.h"
#include "ProcessImpl.h"

#include <accelmath.h>
#include <openacc.h>

using namespace data;
using Process_t = Process<ProcessImpl>;
// Global processor object; main() calls its Get2 over all N indices.
Process_t process{Process_t::Base_t{}};

// Fills particles on the device, doubles every element through
// process.Get2, and prints the first 100 values before and after.
int main(int argc, char **argv)
{
  // Writes particles[0..99] from host memory, space-separated, then ends
  // (and flushes) the line.
  const auto printFirstHundred = []
  {
    int shown = 0;
    while(shown < 100)
    {
      std::cout<<particles[shown]<<" ";
      ++shown;
    }
    std::cout<<std::endl;
  };

#pragma acc data create(particles,outputArray)
  {
    // Initialise particles[i] = i inside the compute region.
  #pragma acc parallel loop gang vector present(particles)
    for(int i=0; i<N; ++i) particles[i]=static_cast<float>(i);
    // Refresh the host copy before printing it.
  #pragma acc update host(particles)
    printFirstHundred();

    process.Get2(N);

    // Refresh the host copy again to show the doubled values.
  #pragma acc update host(particles)
    printFirstHundred();
  }
  return 0;
}

这段代码用 PGI 19.4 编译器在 CPU 上可以正常运行,但我的任务是让它在 GPU 上运行。我使用的是 PGI 19.4 + OpenACC,通过一个简单的 CMakeLists.txt 文件和下面的配置命令进行编译(GPU 为 Nvidia GeForce 650 Ti,计算能力 3.0):

cmake . -DCMAKE_C_COMPILER=pgcc -DCMAKE_CXX_COMPILER=pgc++
-DCMAKE_C_FLAGS="-acc -Minfo=acc -mcmodel=medium -ta=tesla:cc30"
-DCMAKE_CXX_FLAGS="-acc -Minfo=acc -mcmodel=medium -ta=tesla:cc30"

编译失败:

> Scanning dependencies of target Test
[ 50%] Building CXX object CMakeFiles/Test.dir/main.cpp.o
main:
     16, Generating create(_ZN4data11outputArrayE[:])
         Generating present(_ZN4data9particlesE[:])
         Generating create(_ZN4data9particlesE[:])
         Generating Tesla code
         18, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     18, Generating update self(_ZN4data9particlesE[:])
     23, Generating update self(_ZN4data9particlesE[:])
         PGCC-W-0155-External and Static variables are not supported in acc routine - 
         _ZN4data9particlesE (/home/70-gaa/source/13OpenAccTest/main.cpp: 19)
         ProcessImpl::Get2(int) const:
      4, include "ProcessImpl.h"
         18, Generating implicit acc routine seq
             Process<ProcessImpl>::Get2(int) const:
      3, include "Process.h"
         25, Generating Tesla code
             27, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
         25, Generating implicit copyin(this[:])
PGCC/x86-64 Linux 19.4-0: compilation completed with warnings
[100%] Linking CXX executable Test
nvlink error   : Undefined reference to '_ZNK11ProcessImpl4Get2Ei' in 
'CMakeFiles/Test.dir/main.cpp.o'
pgacclnk: child process exit status 2: /opt/pgi/linux86-64-llvm/19.4/bin/pgnvd
CMakeFiles/Test.dir/build.make:83: recipe for target 'Test' failed
make[2]: *** [Test] Error 2
CMakeFiles/Makefile2:72: recipe for target 'CMakeFiles/Test.dir/all' failed
make[1]: *** [CMakeFiles/Test.dir/all] Error 2
Makefile:83: recipe for target 'all' failed
make: *** [all] Error 2

使用 pggdecode 发现,“_ZNK11ProcessImpl4Get2Ei”是 ProcessImpl::Get2(int) const 的修饰名(mangled name)。我从 ProcessImpl.h 中删除了 inline 关键字,并尝试在 main() 中把 copyin(process) 添加到 #pragma acc data create(particles,outputArray) 上,但都没有帮助。使用 gcc 5.3.1 在 Fedora 23 上工作。

在完整的代码中,为了避免数组 particles 和 outputArray 被多次定义,我只在单个 .cpp 文件中包含它们的定义,因为 OpenACC 不允许使用 extern 关键字。这种做法可能不太好(如果您知道更好的办法,请提出建议),但它确实可行。

问题是:

如何正确地将 GPU 版本的数组 particles 和 outputArray 传递给 Process.h 中的 Get1()、Get2(),并使 ProcessImpl.h 中的 Get1()、Get2() 使用在 GPU 上分配的数组?以及如何让这段代码通过编译?

OpenAcc 如何允许直接访问在 OpenAcc 计算区域的代码中复制到 GPU 的全局分配数组,而无需将指向它们的指针作为调用函数的参数传递?

谢谢你。

4

1 回答 1

1

出现未定义的引用,是因为下面这条警告导致 Get2 的设备版本没有生成:

PGCC-W-0155-External and Static variables are not supported in acc routine - _ZN4data9particlesE (/home/70-gaa/source/13OpenAccTest/main.cpp: 19)

问题是直接在设备例程中访问的全局变量需要在链接时定义设备版本,以便链接器可以在两者之间建立关联。一种选择是将“particles”作为参数传递,但更简单的选择是将“particles”放入“declare create”指令中。

“declare”指令会创建一个数据区域,其作用域与该指令所在的作用域单元相同。因此,把它用于具有全局作用域的变量时,该变量在设备上同样具有全局作用域。

% cat Data.h
//Data.h:
#ifndef DATA_H
#define DATA_H

constexpr int N=10000000;
namespace data{
  float particles[N];
  float outputArray[N];
// Gives particles a device copy with the same (global) scope as the host
// array, so device routines that access it directly can be linked.
#pragma acc declare create(particles[:N])
}
#endif

% pgc++ -I. main.cpp -ta=tesla -Minfo=accel
main:
     17, Generating create(_ZN4data11outputArrayE[:]) [if not already present]
         Generating Tesla code
         19, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     19, Generating update self(_ZN4data9particlesE[:])
     24, Generating update self(_ZN4data9particlesE[:])
ProcessImpl::Get2(int) const:
      5, include "ProcessImpl.h"
          19, Generating implicit acc routine seq
              Generating acc routine seq
              Generating Tesla code
Process<ProcessImpl>::Get2(int) const:
      4, include "Process.h"
          23, Generating Tesla code
              25, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
          23, Generating implicit copyin(this[:]) [if not already present]
% a.out
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74 76 78 80 82 84 86 88 90 92 94 96 98 100 102 104 106 108 110 112 114 116 118 120 122 124 126 128 130 132 134 136 138 140 142 144 146 148 150 152 154 156 158 160 162 164 166 168 170 172 174 176 178 180 182 184 186 188 190 192 194 196 198
于 2020-03-10T20:52:01.003 回答