我在尝试使用 nvidia(在 Visual Studio 2010 下)编译 opencl 矢量加法内核时遇到了这种奇怪的行为。我得到以下输出:
C[0]=1=0+1
C[1]=1.#INF=1+2
C[2]=-5.87747e-039=2+3
C[3]=-1.76324e-038=3+4
C[4]=-2.93874e-038=4+5
C[5]=-4.11423e-038=5+6
C[6]=-5.87747e-038=6+7
C[7]=-8.22846e-038=7+8
C[8]=-1.05794e-037=8+9
C[9]=-1.29304e-037=9+10
如果我替换内核中的操作
C[i] = A[i] + B[i];
通过将 C 分配给 A 或 B,无需算术运算,如下所示:
C[i] = A[i];
返回的输出将是正确的。我想这意味着它可以正确读取和写入缓冲区。
此外,当我通过为 C 分配常量值来替换操作时,它返回了奇怪的值,例如:
C[i] = 999;
这是我的主机程序:
#include "stdafx.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <cstdlib>
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
int main(int argc, char *argv[]) {
try {
std::cout<<"C"<<"\n";
std::ifstream ifs("vector_add.cl");
std::string kernelSource( (std::istreambuf_iterator<char>(ifs) ),
(std::istreambuf_iterator<char>() ) );
cl_uint N = 10;
std::vector<cl::Platform> platform1;
cl::Platform::get(&platform1);
std::vector<cl::Device> device1;
//platform1.front().getDevices(CL_DEVICE_TYPE_ALL, &device1);
platform1.front().getDevices(CL_DEVICE_TYPE_GPU, &device1);
cl::Context context1 = cl::Context(device1);
cl::Program::Sources src_code1;
src_code1.push_back(std::make_pair(kernelSource.c_str(),kernelSource.size()));
cl::Program program1 = cl::Program(context1, src_code1);
program1.build(device1);
cl::Kernel kernel1(program1, "vector_add");
// Create a command queue and use the first device
cl::Device& device1_addr = device1.front();
cl::CommandQueue queue1(context1, device1_addr);
float A[10], B[10], C[10];
for (int i=0;i<10;i++) {
A[i]=float(i);
B[i]=1.0+i;
}
cl::Buffer A_buff(context1,CL_MEM_READ_ONLY,sizeof(cl_float)*N);
cl::Buffer B_buff(context1,CL_MEM_READ_ONLY,sizeof(cl_float)*N);
cl::Buffer C_buff(context1,CL_MEM_WRITE_ONLY,sizeof(cl_float)*N);
queue1.enqueueWriteBuffer(A_buff, CL_TRUE, 0, sizeof(cl_float)*N, A);
queue1.enqueueWriteBuffer(B_buff, CL_TRUE, 0, sizeof(cl_float)*N, B);
kernel1.setArg(0,A_buff);
kernel1.setArg(1,B_buff);
kernel1.setArg(2,C_buff);
queue1.enqueueNDRangeKernel(kernel1, 0, cl::NDRange(10), cl::NDRange(1));
queue1.enqueueReadBuffer(C_buff, CL_TRUE, 0, sizeof(cl_float)*N, C);
for (int i=0;i<10;i++) {
std::cout<<"C["<<i<<"]="<<C[i]<<"="<<A[i]<<"+"<<B[i]<<"\n";
}
} catch(cl::Error& err) {
std::cerr << "OpenCL error: " << err.what() << "(" << err.err() <<
")" << std::endl;
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
而这是我的内核:
// Element-wise vector addition: C[i] = A[i] + B[i].
// FIX: the pointer types must be float, not int — the host allocates and
// fills the buffers with cl_float. With int* the kernel reinterprets the
// float bit patterns as integers (and the integer results are read back as
// floats), which is exactly the 1.#INF / denormal garbage observed.
__kernel void vector_add(__global const float *A, __global const float *B, __global float *C) {
    // Get the index of the current element to be processed
    int i = get_global_id(0);
    // Do the operation
    C[i] = A[i] + B[i];
    //C[i] = B[i];
    //C[i] = 999;
}