1

在花了几个小时阅读和理解之后,我终于写出了第一个真正能做点事情的 OpenCL 程序:它将两个向量相加,并把结果输出到一个文件。

#include <iostream>
#include <vector>
#include <cstdlib>
#include <string>
#include <fstream>

#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>

int main(int argc, char *argv[])
{
    try
    {
// get platforms, devices and display their info.
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);

        std::vector<cl::Platform>::iterator i=platforms.begin();

        std::cout<<"OpenCL \tPlatform : "<<i->getInfo<CL_PLATFORM_NAME>()<<std::endl;
        std::cout<<"\tVendor: "<<i->getInfo<CL_PLATFORM_VENDOR>()<<std::endl;
        std::cout<<"\tVersion : "<<i->getInfo<CL_PLATFORM_VERSION>()<<std::endl;
        std::cout<<"\tExtensions : "<<i->getInfo<CL_PLATFORM_EXTENSIONS>()<<std::endl;

// get devices
        std::vector<cl::Device> devices;
        i->getDevices(CL_DEVICE_TYPE_ALL,&devices);
        int o=99;
        std::cout<<"\n\n";
// iterate over available devices
        for(std::vector<cl::Device>::iterator j=devices.begin(); j!=devices.end(); j++)
        {
            std::cout<<"\tOpenCL\tDevice : " << j->getInfo<CL_DEVICE_NAME>()<<std::endl;
            std::cout<<"\t\t Type : " << j->getInfo<CL_DEVICE_TYPE>()<<std::endl;
            std::cout<<"\t\t Vendor : " << j->getInfo<CL_DEVICE_VENDOR>()<<std::endl;
            std::cout<<"\t\t Driver : " << j->getInfo<CL_DRIVER_VERSION>()<<std::endl;
            std::cout<<"\t\t Global Mem : " << j->getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()/(1024*1024)<<" MBytes"<<std::endl;
            std::cout<<"\t\t Local Mem : " << j->getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()/1024<<" KBbytes"<<std::endl;
            std::cout<<"\t\t Compute Unit : " << j->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()<<std::endl;
            std::cout<<"\t\t Clock Rate : " << j->getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>()<<" MHz"<<std::endl;
        }
        std::cout<<"\n\n\n";





 //MAIN CODE BEGINS HERE


 //get Kernel
        std::ifstream ifs("vector_add_kernel.cl");
        std::string kernelSource((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
        std::cout<<kernelSource;

   //Create context, select device and command queue.
        cl::Context context(devices);
        cl::Device &device=devices.front();
        cl::CommandQueue cmdqueue(context,device);

  // Generate Source vector and push the kernel source in it.
        cl::Program::Sources sourceCode;
        sourceCode.push_back(std::make_pair(kernelSource.c_str(), kernelSource.size()));

  //Generate program using sourceCode
        cl::Program program=cl::Program(context, sourceCode);

  //Build program..
        try
        {
            program.build(devices);
        }
        catch(cl::Error &err)
        {
            std::cerr<<"Building failed, "<<err.what()<<"("<<err.err()<<")"
                     <<"\nRetrieving build log"
                     <<"\n Build Log Follows \n"
                     <<program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices.front());
        }


  //Declare and initialize vectors
        std::vector<cl_float>B(993448,1.3);
        std::vector<cl_float>C(993448,1.3);
        std::vector<cl_float>A(993448,1.3);
        cl_int N=A.size();

  //Declare and intialize proper work group size and global size. Global size raised to the nearest multiple of workGroupSize.
        int workGroupSize=128;
        int GlobalSize;
        if(N%workGroupSize) GlobalSize=N - N%workGroupSize + workGroupSize;
        else GlobalSize=N;
  //Declare buffers.
        cl::Buffer vecA(context, CL_MEM_READ_WRITE, sizeof(cl_float)*N);
        cl::Buffer vecB(context, CL_MEM_READ_ONLY , (B.size())*sizeof(cl_float));
        cl::Buffer vecC(context, CL_MEM_READ_ONLY , (C.size())*sizeof(cl_float));
  //Write vectors into buffers
        cmdqueue.enqueueWriteBuffer(vecB, 0, 0, (B.size())*sizeof(cl_float), &B[0] );
        cmdqueue.enqueueWriteBuffer(vecB, 0, 0, (C.size())*sizeof(cl_float), &C[0] );

 //Executing kernel
        cl::Kernel kernel(program, "vector_add");
        cl::KernelFunctor kernel_func=kernel.bind(cmdqueue, cl::NDRange(GlobalSize), cl::NDRange(workGroupSize));

        kernel_func(vecA, vecB, vecC, N);

 //Reading back values into vector A
        cmdqueue.enqueueReadBuffer(vecA,true,0,N*sizeof(cl_float), &A[0]);
        cmdqueue.finish();


  //Saving into file.
        std::ofstream output("vectorAdd.txt");
        for(int i=0;i<N;i++) output<<A[i]<<"\n";

    }
    catch(cl::Error& err)
    {
        std::cerr << "OpenCL error: " << err.what() << "(" << err.err() <<
                  ")" << std::endl;

        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}

问题是:对于较小的 N 值,我能得到正确的结果 2.6;但对于较大的值(例如上面代码中的 993448),输出的却是在 1 到 2.4 之间变化的垃圾值。

这是内核代码:

/* Element-wise vector addition: each work-item whose global id is below N
 * stores C[id] + B[id] into A[id]. */
__kernel void vector_add(__global float *A, __global float *B, __global float *C, int N) {
    /* Index of the element handled by this work-item. */
    const int gid = get_global_id(0);

    /* Work-items at or beyond N have no element to process. */
    if (gid >= N) {
        return;
    }

    A[gid] = C[gid] + B[gid];
}

更新:好的,看来代码现在可以工作了。我修复了上面代码中的几个小错误:1)修正了初始化 GlobalSize 的部分;2)修正了 enqueueWriteBuffer 中的低级错误(传入的参数有误)。现在对于较大的 N 值也能输出正确的结果。

4

1 回答 1

-1

尝试将数据类型从 float 改为 double 等。

于 2013-04-24T10:27:27.150 回答