为了加快程序中的一些数学例程,我希望尝试将一些数组划分转移到 GPU 上。我正在开发的环境是一个使用monodevelop和使用OpenTk库的linux(Arch),它只是opencl C库的P/Invoke包装器。
具体来说,我能够完成所有启动、构建内核而不会出现错误或日志中报告的任何内容、设置内存缓冲区并设置内核参数。当我尝试使用“clEnqueueTask”方法对任务进行排队时,就会出现问题,此时程序出现段错误。
在这一点上,我不确定如何调试程序,因为没有抛出异常(它一直在抛出更普通的错误,比如缺少 dll),所以我很茫然,我正在寻找一些建议。
非常感谢!
这是代码...
string vecDivision = @"__kernel void floatDivision(__global float * v1, __global float * v2){
// Vector element index
int i = get_global_id(0);
v1[i] = v1[i] / v2[0];
}";
try{
fixed(float* srcA = &data[0]){
ErrorCode err = ErrorCode.Success;
IntPtr deviceId = IntPtr.Zero;
int numDevices = -1;
// get the platform to create the context from
IntPtr[] platforms = new IntPtr[3];
uint[] numPlatforms = new uint[3];
err = (ErrorCode)CL.GetPlatformIDs(3, platforms, numPlatforms);
// create the context
IntPtr[] properties = new IntPtr[]{new IntPtr((int)ContextProperties.ContextPlatform), platforms[0], IntPtr.Zero};
IntPtr context = CL.CreateContextFromType(properties, DeviceTypeFlags.DeviceTypeGpu, IntPtr.Zero, IntPtr.Zero, new ErrorCode[0]);
if(context == IntPtr.Zero){
throw new Exception("Count not create CL Context");
}
// get the device id
err = (ErrorCode)CL.GetDeviceIDs(platforms[0],DeviceTypeFlags.DeviceTypeDefault, 1, ref deviceId, ref numDevices);
if(err != ErrorCode.Success){
throw new Exception("Count not get the CL device");
}
// create the command queue
IntPtr queue = CL.CreateCommandQueue(context, deviceId, 0, out err);
if(err != ErrorCode.Success){
throw new Exception("Count not create CL command queue");
}
// create the memory buffers
IntPtr ptrA = CL.CreateBuffer(context, MemFlags.MemReadWrite | MemFlags.MemUseHostPtr, new IntPtr(sizeof(float)*length), (IntPtr)srcA , out err);
if(err != ErrorCode.Success){
throw new Exception("Count not create CL buffer");
}
IntPtr ptrB = CL.CreateBuffer(context, MemFlags.MemReadOnly | MemFlags.MemUseHostPtr, new IntPtr(sizeof(float)) , (IntPtr)divisor, out err);
if(err != ErrorCode.Success){
throw new Exception("Count not create CL buffer");
}
// create the program using the source
IntPtr lengths = IntPtr.Zero;
IntPtr program = CL.CreateProgramWithSource(context, 1,new string[]{vecDivision}, ref lengths, out err);
if(err != ErrorCode.Success){
throw new Exception("Count not create CL program from source");
}
// build the program from source
err = (ErrorCode)CL.BuildProgram(program, 1, new IntPtr[]{deviceId}, "", IntPtr.Zero, IntPtr.Zero);
if(err != ErrorCode.Success){
throw new Exception("Count not build the CL program");
}
// get the build log
char[] log = new char[2000];
IntPtr returnSize = IntPtr.Zero;
fixed(char* logp = &log[0]){
err = (ErrorCode)CL.GetProgramBuildInfo(program, deviceId, ProgramBuildInfo.ProgramBuildLog, new IntPtr(log.Length), new IntPtr(logp), out returnSize);
}
MsgHandling.LogMessage(new string(log).Substring(0,(int)returnSize), string.Empty);
if(err != ErrorCode.Success){
throw new Exception("Count not create CL kernel");
}
// create the kernel object
IntPtr kernel = CL.CreateKernel(program, "floatDivision", out err);
if(err != ErrorCode.Success){
throw new Exception("Count not create CL kernel");
}
// set the arguments for the program
err = (ErrorCode)CL.SetKernelArg(kernel, 0, new IntPtr(sizeof(float*)), new IntPtr(&ptrA));
if(err != ErrorCode.Success){
throw new Exception("Count not set CL first argument");
}
err = (ErrorCode)CL.SetKernelArg(kernel, 1, new IntPtr(sizeof(float*)), new IntPtr(&ptrB));
if(err != ErrorCode.Success){
throw new Exception("Count not set CL second argument");
}
// queue up the worker
IntPtr globalWorkOffset = IntPtr.Zero;
IntPtr globalWorkSize = new IntPtr(length);
IntPtr localWorkSize = new IntPtr(64);
IntPtr eventWaitList = IntPtr.Zero;
IntPtr eventItem = IntPtr.Zero;
//err = (ErrorCode)CL.EnqueueNDRangeKernel(queue, kernel, 1, null, new IntPtr[]{globalWorkSize}, new IntPtr[]{localWorkSize}, 0, null, new IntPtr[]{eventItem});
// I DIE HERE!
err = (ErrorCode)CL.EnqueueTask(queue, kernel, 0,null, new IntPtr[]{eventItem});
// I DIE HERE!
if(err != ErrorCode.Success){
throw new Exception("Count not queue CL job");
}
// wait for it to finish
err = (ErrorCode)CL.Finish(queue);
if(err != ErrorCode.Success){
throw new Exception("Count not wait for CL job to finish");
}
}
更新:
我想到了。问题是我如何创建缓冲区。显然,如果您使用 MemUseHostPtr 选项,则必须使用另一种方法来排队读取缓冲区,因此我使用了将内存直接复制到 GPU 内存的选项,从长远来看,这可能是一个更好的选择。