对于我尝试使用 ALEA 库将结构数组传递给 NVIDIA 内核的代码,我收到“Fody/Alea.CUDA:clrobj(cGPU) 没有 llvm”构建错误。这是我的代码的简化版本。我删除了输出收集功能以保持代码简单。我现在只需要能够将结构数组发送到 GPU。
using Alea.CUDA;
using Alea.CUDA.Utilities;
using Alea.CUDA.IL;
namespace GPUProgramming
{
public class cGPU
{
public int Slice;
public float FloatValue;
}
[AOTCompile(AOTOnly = true)]
public class TestModule : ILGPUModule
{
public TestModule(GPUModuleTarget target) : base(target)
{
}
const int blockSize = 64;
[Kernel]
public void Kernel2(deviceptr<cGPU> Data, int n)
{
var start = blockIdx.x * blockDim.x + threadIdx.x;
int ind = threadIdx.x;
var sharedSlice = __shared__.Array<int>(64);
var sharedFloatValue = __shared__.Array<float>(64);
if (ind < n && start < n)
{
sharedSlice[ind] = Data[start].Slice;
sharedFloatValue[ind] = Data[start].FloatValue;
Intrinsic.__syncthreads();
}
}
public void Test2(deviceptr<cGPU> Data, int n, int NumOfBlocks)
{
var GridDim = new dim3(NumOfBlocks, 1);
var BlockDim = new dim3(64, 1);
try
{
var lp = new LaunchParam(GridDim, BlockDim);
GPULaunch(Kernel2, lp, Data, n);
}
catch (CUDAInterop.CUDAException x)
{
var code = x.Data0;
Console.WriteLine("ErrorCode = {0}", code);
}
}
public void Test2(cGPU[] Data)
{
int NumOfBlocks = Common.divup(Data.Length, blockSize);
using (var d_Slice = GPUWorker.Malloc(Data))
{
try
{
Test_Kernel2(d_Slice.Ptr, Data.Length, NumOfBlocks);
}
catch (CUDAInterop.CUDAException x)
{
var code = x.Data0;
Console.WriteLine("ErrorCode = {0}", x.Data0);
}
}
}
}
}