I want to run an old N-body simulation that uses OpenCL.
I have 2 NVIDIA A6000 cards connected by NVLink, a bridge that couples the 2 GPU cards from a hardware (and maybe software?) point of view.
But at execution time, I get the following results:
Here is the kernel code used (I have added the pragma that I believe is useful for NVIDIA cards):
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

__kernel
void
nbody_sim(
    __global double4* pos,
    __global double4* vel,
    int numBodies,
    double deltaTime,
    double epsSqr,
    __local double4* localPos,
    __global double4* newPosition,
    __global double4* newVelocity)
{
    unsigned int tid = get_local_id(0);
    unsigned int gid = get_global_id(0);
    unsigned int localSize = get_local_size(0);

    // Gravitational constant
    double G_constant = 227.17085e-74;

    // Number of tiles we need to iterate
    unsigned int numTiles = numBodies / localSize;

    // position of this work-item
    double4 myPos = pos[gid];
    double4 acc = (double4) (0.0f, 0.0f, 0.0f, 0.0f);

    for(int i = 0; i < numTiles; ++i)
    {
        // load one tile into local memory
        int idx = i * localSize + tid;
        localPos[tid] = pos[idx];

        // Synchronize to make sure data is available for processing
        barrier(CLK_LOCAL_MEM_FENCE);

        // Calculate acceleration effect due to each body
        // a[i->j] = m[j] * r[i->j] / (r^2 + epsSqr)^(3/2)
        for(int j = 0; j < localSize; ++j)
        {
            // Calculate acceleration caused by particle j on particle i
            double4 r = localPos[j] - myPos;
            double distSqr = r.x * r.x + r.y * r.y + r.z * r.z;
            double invDist = 1.0f / sqrt(distSqr + epsSqr);
            double invDistCube = invDist * invDist * invDist;
            double s = G_constant * localPos[j].w * invDistCube;

            // accumulate effect of all particles
            acc += s * r;
        }

        // Synchronize so that next tile can be loaded
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    double4 oldVel = vel[gid];

    // updated position and velocity
    double4 newPos = myPos + oldVel * deltaTime + acc * 0.5f * deltaTime * deltaTime;
    newPos.w = myPos.w;
    double4 newVel = oldVel + acc * deltaTime;

    // write to global memory
    newPosition[gid] = newPos;
    newVelocity[gid] = newVel;
}
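Since the kernel relies on the cl_khr_fp64 pragma above, here is a minimal host-side sketch (my own assumption, not part of the original project code) to verify that the selected device really exposes this extension; it assumes `device` holds a valid cl_device_id and that <stdio.h>, <stdlib.h>, <string.h> and <CL/cl.h> are included:

    // Sketch: check that the device advertises cl_khr_fp64 before building the kernel
    size_t extSize = 0;
    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &extSize);
    char *extensions = (char *)malloc(extSize);
    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, extSize, extensions, NULL);
    if (strstr(extensions, "cl_khr_fp64") == NULL)
        fprintf(stderr, "Warning: device does not report cl_khr_fp64\n");
    free(extensions);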
The part of the code that sets up the kernel is the following:
int NBody::setupCL()
{
    cl_int status = CL_SUCCESS;
    cl_event writeEvt1, writeEvt2;

    // The block is to move the declaration of prop closer to its use
    cl_command_queue_properties prop = 0;
    commandQueue = clCreateCommandQueue(
        context,
        devices[current_device],
        prop,
        &status);
    CHECK_OPENCL_ERROR( status, "clCreateCommandQueue failed.");

    ...

    // create a CL program using the kernel source
    const char *kernelName = "NBody_Kernels.cl";
    FILE *fp = fopen(kernelName, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    char *source = (char*)malloc(10000);
    int sourceSize = fread( source, 1, 10000, fp);
    fclose(fp);

    // Create a program from the kernel source
    program = clCreateProgramWithSource(context, 1, (const char **)&source, (const size_t *)&sourceSize, &status);

    // Build the program
    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);

    // get a kernel object handle for a kernel with the given name
    kernel = clCreateKernel(
        program,
        "nbody_sim",
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");

    status = waitForEventAndRelease(&writeEvt1);
    CHECK_ERROR(status, NBODY_SUCCESS, "WaitForEventAndRelease(writeEvt1) Failed");

    status = waitForEventAndRelease(&writeEvt2);
    CHECK_ERROR(status, NBODY_SUCCESS, "WaitForEventAndRelease(writeEvt2) Failed");

    return NBODY_SUCCESS;
}
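One thing I am not sure about in this setup is the fixed 10000-byte buffer used to read NBody_Kernels.cl. A sketch of a safer loading (only an idea on my side, reusing the same kernelName/source/program variables as above) would size the buffer from the real file length, null-terminate it and let OpenCL rely on the terminator:

    // Sketch: read the whole kernel file and null-terminate the buffer
    FILE *fp = fopen(kernelName, "r");
    if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); }
    fseek(fp, 0, SEEK_END);
    long fileSize = ftell(fp);
    rewind(fp);
    char *source = (char *)malloc(fileSize + 1);
    size_t sourceSize = fread(source, 1, fileSize, fp);
    source[sourceSize] = '\0';
    fclose(fp);

    // With a null-terminated string, the lengths argument can be NULL
    program = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &status);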
So the error occurs when the kernel is created. Is there a way to have the 2 GPUs seen as one unique GPU through the NVLINK component, I mean from a software point of view?
How can I fix this error when creating the kernel?
UPDATE 1
I) I deliberately limited the number of GPU devices to a single GPU by modifying the loop below (in practice it only performs one iteration); an alternative way to restrict the run to one device is sketched right after this code block:
    // Print device index and device names
    //for(cl_uint i = 0; i < deviceCount; ++i)
    for(cl_uint i = 0; i < 1; ++i)
    {
        char deviceName[1024];
        status = clGetDeviceInfo(deviceIds[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL);
        CHECK_OPENCL_ERROR(status, "clGetDeviceInfo failed");
        std::cout << "Device " << i << " : " << deviceName << " Device ID is " << deviceIds[i] << std::endl;
    }

    // Set id = 0 for currentDevice with deviceType
    *currentDevice = 0;

    free(deviceIds);
    return NBODY_SUCCESS;
}
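The alternative sketch mentioned above simply asks clGetDeviceIDs for a single GPU entry instead of filtering afterwards (assuming `platform` is the selected cl_platform_id):

    // Sketch: request only the first GPU of the platform instead of filtering later
    cl_device_id singleDevice;
    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &singleDevice, NULL);
    CHECK_OPENCL_ERROR(status, "clGetDeviceIDs failed");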
and after that, I kept the classic call:
status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
But the error remains; the message is below:
II) If I don't modify this loop and apply the suggested solution, i.e. passing devices[current_device] instead of devices, I get the following compilation error:
In file included from NBody.hpp:8,
from NBody.cpp:1:
/opt/AMDAPPSDK-3.0/include/CL/cl.h:863:16: note: initializing argument 3 of ‘cl_int clBuildProgram(cl_program, cl_uint, _cl_device_id* const*, const char*, void (*)(cl_program, void*), void*)’
const cl_device_id * /* device_list */,
How can I get around this compilation problem?
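From the compiler note above, argument 3 of clBuildProgram expects a const cl_device_id *, so I suppose the call has to pass the address of the array element rather than the element itself, something like:

    // devices[current_device] is a cl_device_id; clBuildProgram wants a pointer to it
    status = clBuildProgram(program, 1, &devices[current_device], NULL, NULL, NULL);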
UPDATE 2
I printed the value of the status variable in this part of the code:
I got status = -44. From CL/cl.h, this corresponds to a CL_INVALID_PROGRAM error:
Then, when I execute the application, I get:
I wonder whether I am missing a special pragma to add in the kernel code, since I am using OpenCL on NVIDIA cards, am I not?
By the way, what is the type of the devices variable? I can't manage to print it correctly.
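From CL/cl.h, cl_device_id seems to be an opaque pointer type (struct _cl_device_id *), so printing it with %d truncates it; a sketch of how it could be printed instead:

    // Print the device handle as a pointer, not as an int
    printf("devices[current_device] = %p\n", (void *)devices[current_device]);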
UPDATE 3
I added the lines in question, but the -44 error is still there at execution. Rather than pasting all the relevant code here, I provide the following links to download the source file: http://31.207.36.11/NBody.cpp and the Makefile used for compilation: http://31.207.36.11/Makefile. Maybe someone will spot some errors, but I would mostly like to know why I get this error -44.
UPDATE 4
I am taking this project up again.
Here is the result of the clinfo command:
$ clinfo
Number of platforms: 1
Platform Profile: FULL_PROFILE
Platform Version: OpenCL 3.0 CUDA 11.4.94
Platform Name: NVIDIA CUDA
Platform Vendor: NVIDIA Corporation
Platform Extensions: cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_copy_opts cl_khr_gl_event cl_nv_create_buffer cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_nv_kernel_attribute cl_khr_device_uuid cl_khr_pci_bus_info
Platform Name: NVIDIA CUDA
Number of devices: 2
Device Type: CL_DEVICE_TYPE_GPU
Vendor ID: 10deh
Max compute units: 84
Max work items dimensions: 3
Max work items[0]: 1024
Max work items[1]: 1024
Max work items[2]: 64
Max work group size: 1024
Preferred vector width char: 1
Preferred vector width short: 1
Preferred vector width int: 1
Preferred vector width long: 1
Preferred vector width float: 1
Preferred vector width double: 1
Native vector width char: 1
Native vector width short: 1
Native vector width int: 1
Native vector width long: 1
Native vector width float: 1
Native vector width double: 1
Max clock frequency: 1800Mhz
Address bits: 64
Max memory allocation: 12762480640
Image support: Yes
Max number of images read arguments: 256
Max number of images write arguments: 32
Max image 2D width: 32768
Max image 2D height: 32768
Max image 3D width: 16384
Max image 3D height: 16384
Max image 3D depth: 16384
Max samplers within kernel: 32
Max size of kernel argument: 4352
Alignment (bits) of base address: 4096
Minimum alignment (bytes) for any datatype: 128
Single precision floating point capability
Denorms: Yes
Quiet NaNs: Yes
Round to nearest even: Yes
Round to zero: Yes
Round to +ve and infinity: Yes
IEEE754-2008 fused multiply-add: Yes
Cache type: Read/Write
Cache line size: 128
Cache size: 2408448
Global memory size: 51049922560
Constant buffer size: 65536
Max number of constant args: 9
Local memory type: Scratchpad
Local memory size: 49152
Max pipe arguments: 0
Max pipe active reservations: 0
Max pipe packet size: 0
Max global variable size: 0
Max global variable preferred total size: 0
Max read/write image args: 0
Max on device events: 0
Queue on device max size: 0
Max on device queues: 0
Queue on device preferred size: 0
SVM capabilities:
Coarse grain buffer: Yes
Fine grain buffer: No
Fine grain system: No
Atomics: No
Preferred platform atomic alignment: 0
Preferred global atomic alignment: 0
Preferred local atomic alignment: 0
Kernel Preferred work group size multiple: 32
Error correction support: 0
Unified memory for Host and Device: 0
Profiling timer resolution: 1000
Device endianess: Little
Available: Yes
Compiler available: Yes
Execution capabilities:
Execute OpenCL kernels: Yes
Execute native function: No
Queue on Host properties:
Out-of-Order: Yes
Profiling : Yes
Queue on Device properties:
Out-of-Order: No
Profiling : No
Platform ID: 0x1e97440
Name: NVIDIA RTX A6000
Vendor: NVIDIA Corporation
Device OpenCL C version: OpenCL C 1.2
Driver version: 470.57.02
Profile: FULL_PROFILE
Version: OpenCL 3.0 CUDA
Extensions: cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_copy_opts cl_khr_gl_event cl_nv_create_buffer cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_nv_kernel_attribute cl_khr_device_uuid cl_khr_pci_bus_info
Device Type: CL_DEVICE_TYPE_GPU
Vendor ID: 10deh
Max compute units: 84
Max work items dimensions: 3
Max work items[0]: 1024
Max work items[1]: 1024
Max work items[2]: 64
Max work group size: 1024
Preferred vector width char: 1
Preferred vector width short: 1
Preferred vector width int: 1
Preferred vector width long: 1
Preferred vector width float: 1
Preferred vector width double: 1
Native vector width char: 1
Native vector width short: 1
Native vector width int: 1
Native vector width long: 1
Native vector width float: 1
Native vector width double: 1
Max clock frequency: 1800Mhz
Address bits: 64
Max memory allocation: 12762578944
Image support: Yes
Max number of images read arguments: 256
Max number of images write arguments: 32
Max image 2D width: 32768
Max image 2D height: 32768
Max image 3D width: 16384
Max image 3D height: 16384
Max image 3D depth: 16384
Max samplers within kernel: 32
Max size of kernel argument: 4352
Alignment (bits) of base address: 4096
Minimum alignment (bytes) for any datatype: 128
Single precision floating point capability
Denorms: Yes
Quiet NaNs: Yes
Round to nearest even: Yes
Round to zero: Yes
Round to +ve and infinity: Yes
IEEE754-2008 fused multiply-add: Yes
Cache type: Read/Write
Cache line size: 128
Cache size: 2408448
Global memory size: 51050315776
Constant buffer size: 65536
Max number of constant args: 9
Local memory type: Scratchpad
Local memory size: 49152
Max pipe arguments: 0
Max pipe active reservations: 0
Max pipe packet size: 0
Max global variable size: 0
Max global variable preferred total size: 0
Max read/write image args: 0
Max on device events: 0
Queue on device max size: 0
Max on device queues: 0
Queue on device preferred size: 0
SVM capabilities:
Coarse grain buffer: Yes
Fine grain buffer: No
Fine grain system: No
Atomics: No
Preferred platform atomic alignment: 0
Preferred global atomic alignment: 0
Preferred local atomic alignment: 0
Kernel Preferred work group size multiple: 32
Error correction support: 0
Unified memory for Host and Device: 0
Profiling timer resolution: 1000
Device endianess: Little
Available: Yes
Compiler available: Yes
Execution capabilities:
Execute OpenCL kernels: Yes
Execute native function: No
Queue on Host properties:
Out-of-Order: Yes
Profiling : Yes
Queue on Device properties:
Out-of-Order: No
Profiling : No
Platform ID: 0x1e97440
Name: NVIDIA RTX A6000
Vendor: NVIDIA Corporation
Device OpenCL C version: OpenCL C 1.2
Driver version: 470.57.02
Profile: FULL_PROFILE
Version: OpenCL 3.0 CUDA
Extensions: cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_copy_opts cl_khr_gl_event cl_nv_create_buffer cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_nv_kernel_attribute cl_khr_device_uuid cl_khr_pci_bus_info
So I have one platform with 2 A6000 GPU cards.
Given that I want to run the original version of my code (i.e. using a single GPU card), I have to select only one device ID in the source NBody.cpp (I will see later how to manage the case with 2 GPU cards, but that is for afterwards). So I just made the following modification in this source.
Instead of:
// Print device index and device names
for(cl_uint i = 0; i < deviceCount; ++i)
{
    char deviceName[1024];
    status = clGetDeviceInfo(deviceIds[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL);
    CHECK_OPENCL_ERROR(status, "clGetDeviceInfo failed");
    std::cout << "Device " << i << " : " << deviceName << " Device ID is " << deviceIds[i] << std::endl;
}
I did:
// Print device index and device names
//for(cl_uint i = 0; i < deviceCount; ++i)
for(cl_uint i = 0; i < 1; ++i)
{
    char deviceName[1024];
    status = clGetDeviceInfo(deviceIds[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL);
    CHECK_OPENCL_ERROR(status, "clGetDeviceInfo failed");
    std::cout << "Device " << i << " : " << deviceName << " Device ID is " << deviceIds[i] << std::endl;
}
As you can see, I have forced it to take into account deviceIds[0], i.e. the single GPU card.
A key point is also the part where the program is built.
// create a CL program using the kernel source
const char *kernelName = "NBody_Kernels.cl";
FILE *fp = fopen(kernelName, "r");
if (!fp) {
    fprintf(stderr, "Failed to load kernel.\n");
    exit(1);
}
char *source = (char*)malloc(10000);
int sourceSize = fread( source, 1, 10000, fp);
fclose(fp);

// Create a program from the kernel source
program = clCreateProgramWithSource(context, 1, (const char **)&source, (const size_t *)&sourceSize, &status);

// Build the program
//status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
status = clBuildProgram(program, 1, &devices[current_device], NULL, NULL, NULL);
printf("status1 = %d\n", status);
//printf("devices = %d\n", devices[current_device]);

// get a kernel object handle for a kernel with the given name
kernel = clCreateKernel(
    program,
    "nbody_sim",
    &status);
printf("status2 = %d\n", status);
CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");
At execution, I get the following values for status1 and status2:
Selected Platform Vendor : NVIDIA Corporation
deviceCount = 2/nDevice 0 : NVIDIA RTX A6000 Device ID is 0x55c38207cdb0
status1 = -44
devices = -2113661720
status2 = -44
clCreateKernel failed.
clSetKernelArg failed. (updatedPos)
clEnqueueNDRangeKernel failed.
clEnqueueNDRangeKernel failed.
clEnqueueNDRangeKernel failed.
clEnqueueNDRangeKernel failed.
The first error is the failure to create the kernel. Here is the source of my NBody_Kernels.cl:
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

__kernel
void
nbody_sim(
    __global double4* pos,
    __global double4* vel,
    int numBodies,
    double deltaTime,
    double epsSqr,
    __local double4* localPos,
    __global double4* newPosition,
    __global double4* newVelocity)
{
    unsigned int tid = get_local_id(0);
    unsigned int gid = get_global_id(0);
    unsigned int localSize = get_local_size(0);

    // Gravitational constant
    double G_constant = 227.17085e-74;

    // Number of tiles we need to iterate
    unsigned int numTiles = numBodies / localSize;

    // position of this work-item
    double4 myPos = pos[gid];
    double4 acc = (double4) (0.0f, 0.0f, 0.0f, 0.0f);

    for(int i = 0; i < numTiles; ++i)
    {
        // load one tile into local memory
        int idx = i * localSize + tid;
        localPos[tid] = pos[idx];

        // Synchronize to make sure data is available for processing
        barrier(CLK_LOCAL_MEM_FENCE);

        // Calculate acceleration effect due to each body
        // a[i->j] = m[j] * r[i->j] / (r^2 + epsSqr)^(3/2)
        for(int j = 0; j < localSize; ++j)
        {
            // Calculate acceleration caused by particle j on particle i
            double4 r = localPos[j] - myPos;
            double distSqr = r.x * r.x + r.y * r.y + r.z * r.z;
            double invDist = 1.0f / sqrt(distSqr + epsSqr);
            double invDistCube = invDist * invDist * invDist;
            double s = G_constant * localPos[j].w * invDistCube;

            // accumulate effect of all particles
            acc += s * r;
        }

        // Synchronize so that next tile can be loaded
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    double4 oldVel = vel[gid];

    // updated position and velocity
    double4 newPos = myPos + oldVel * deltaTime + acc * 0.5f * deltaTime * deltaTime;
    newPos.w = myPos.w;
    double4 newVel = oldVel + acc * deltaTime;

    // write to global memory
    newPosition[gid] = newPos;
    newVelocity[gid] = newVel;
}
The modified source code can be found here:
I can't figure out how to fix this kernel-creation failure, nor the meaning of the values status1 = -44 and status2 = -44 that come with it.
UPDATE 5
I added a call to clGetProgramBuildInfo in the code, with the snippet below, in order to see what is behind the clCreateKernel failed error:
// Create a program from the kernel source
program = clCreateProgramWithSource(context, 1, (const char **)&source, (const size_t *)&sourceSize, &status);

if (clBuildProgram(program, 1, devices, NULL, NULL, NULL) != CL_SUCCESS)
{
    // Determine the size of the log
    size_t log_size;
    clGetProgramBuildInfo(program, devices[current_device], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

    // Allocate memory for the log
    char *log = (char *) malloc(log_size);
    cout << "size log =" << log_size << endl;

    // Get the log
    clGetProgramBuildInfo(program, devices[current_device], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

    // Print the log
    printf("%s\n", log);
}

// get a kernel object handle for a kernel with the given name
kernel = clCreateKernel(
    program,
    "nbody_sim",
    &status);
CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");
// get a kernel object handle for a kernel with the given name
kernel = clCreateKernel(
program,
"nbody_sim",
&status);
CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");
Unfortunately, this clGetProgramBuildInfo call only gives the following output:
Selected Platform Vendor : NVIDIA Corporation
Device 0 : NVIDIA RTX A6000 Device ID is 0x562857930980
size log =16
log =
clCreateKernel failed.
How can I print the content of the log value?
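One idea (just a sketch on my side, reusing program and devices[current_device] from above) is to also query CL_PROGRAM_BUILD_STATUS and to null-terminate the returned log before printing it:

    // Sketch: query the build status and print a null-terminated build log
    cl_build_status buildStatus;
    clGetProgramBuildInfo(program, devices[current_device], CL_PROGRAM_BUILD_STATUS,
                          sizeof(buildStatus), &buildStatus, NULL);
    printf("build status = %d\n", (int)buildStatus);

    size_t log_size = 0;
    clGetProgramBuildInfo(program, devices[current_device], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
    char *log = (char *)malloc(log_size + 1);
    clGetProgramBuildInfo(program, devices[current_device], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
    log[log_size] = '\0';
    printf("log = %s\n", log);
    free(log);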
UPDATE 6
If I add a printf:
// Create a program from the kernel source
program = clCreateProgramWithSource(context, 1, (const char **)&source, (const size_t *)&sourceSize, &status);
printf("status clCreateProgramWithSourceContext = %d\n", status);
I get status = -6, which corresponds to CL_OUT_OF_HOST_MEMORY. Which leads could help to fix this issue?
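To narrow it down, one lead I see (only a sketch, with the same context/source/sourceSize variables as above) is to check what is actually handed to clCreateProgramWithSource and whether the context handle itself is still valid:

    // Sketch: sanity-check the arguments passed to clCreateProgramWithSource
    if (source == NULL || sourceSize <= 0)
        fprintf(stderr, "kernel source was not read correctly (size %d)\n", sourceSize);

    cl_uint refCount = 0;
    cl_int ctxStatus = clGetContextInfo(context, CL_CONTEXT_REFERENCE_COUNT,
                                        sizeof(refCount), &refCount, NULL);
    printf("clGetContextInfo status = %d (0 means the context handle is valid)\n", ctxStatus);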
PARTIAL SOLUTION
By compiling with the Intel compilers (icc and icpc), the build goes through and the code runs fine. I don't understand why it doesn't work with the GNU gcc/g++-8 compilers. If anyone has an idea...