I'm trying to develop an ML-enabled plugin for a real-time image processing application. The host software provides the image data on the GPU as a cudaArray_t, but because it locks me into an older CUDA version, I'd like to use DirectML (the software is Windows-only anyway).
For latency reasons I want to avoid any unnecessary GPU-CPU-GPU round trips. To do that, I believe I need to map the CUDA data to a D3D12 resource, which I can then use to create the input and output tensors to bind to the model. I found a sample here that uses the CUDA External Resource Interoperability API to map a cudaArray_t to an ID3D12Resource, and I'm trying to use it as the basis for my code. Since I don't need to render anything, I assumed I could simply create the heap and resource and then copy the incoming cudaArray_t into the interop cudaArray_t as shown below, without creating any kind of command queue. The code I've omitted is identical to the code in the GitHub repository linked above, so I've left it out for brevity.
This approach doesn't work, but I'm not sure how to debug it, since I'm new to Direct3D and to GPU programming in general. I'm using the official Direct3D 12 documentation as a reference, but it's a bit overwhelming, so some direction on what needs to be fixed here would be much appreciated :) I suspect I need some kind of synchronization via semaphores, but I'm not sure whether that's even possible without creating some sort of command queue.
bool initD3d12() {
    // setup the d3d12 device
    UINT dxgiFactoryFlags = 0;
    winrt::com_ptr<IDXGIFactory4> factory;
    winrt::check_hresult(CreateDXGIFactory2(dxgiFactoryFlags, IID_PPV_ARGS(factory.put())));
    winrt::com_ptr<IDXGIAdapter1> hardwareAdapter;
    GetHardwareAdapter(factory.get(), hardwareAdapter.put());
    winrt::check_hresult(D3D12CreateDevice(hardwareAdapter.get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(m_d3d12Device.put())));
    DXGI_ADAPTER_DESC1 desc;
    hardwareAdapter->GetDesc1(&desc);
    m_dx12deviceluid = desc.AdapterLuid;
    return true;
}
void initCuda() {
    // setup the cuda device: pick the CUDA device whose LUID matches the D3D12 adapter
    int num_cuda_devices = 0;
    checkCudaErrors(cudaGetDeviceCount(&num_cuda_devices));
    if (!num_cuda_devices) {
        throw std::exception("No CUDA Devices found");
    }
    for (int devId = 0; devId < num_cuda_devices; devId++) {
        cudaDeviceProp devProp;
        checkCudaErrors(cudaGetDeviceProperties(&devProp, devId));
        if ((memcmp(&m_dx12deviceluid.LowPart, devProp.luid,
                    sizeof(m_dx12deviceluid.LowPart)) == 0) &&
            (memcmp(&m_dx12deviceluid.HighPart,
                    devProp.luid + sizeof(m_dx12deviceluid.LowPart),
                    sizeof(m_dx12deviceluid.HighPart)) == 0)) {
            checkCudaErrors(cudaSetDevice(devId));
            m_cudaDeviceID = devId;
            m_nodeMask = devProp.luidDeviceNodeMask;
            checkCudaErrors(cudaStreamCreate(&m_streamToRun));
            printf("CUDA Device Used [%d] %s\n", devId, devProp.name);
            break;
        }
    }
}
void copyArrayToResource(cudaArray_t cudaArray) {
    // copy the incoming cudaArray into m_cudaArray, which is the mapped form of the D3D12 texture
    checkCudaErrors(cudaMemcpy2DArrayToArray(
        m_cudaArray,                            // dst array
        0, 0,                                   // dst offsets
        cudaArray, 0, 0,                        // src array + offsets
        m_width * 4 * sizeof(float), m_height,  // width in bytes, height in rows
        cudaMemcpyDeviceToDevice));             // kind
}
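In case it matters, my understanding is that the same copy could also be issued on the stream created in initCuda and then waited on, roughly like the untested sketch below (as far as I can tell there is no stream variant of cudaMemcpy2DArrayToArray, so it goes through cudaMemcpy3DAsync instead):
void copyArrayToResourceAsync(cudaArray_t cudaArray) {
    // array-to-array copy on m_streamToRun via cudaMemcpy3DAsync
    cudaMemcpy3DParms copyParms{};
    copyParms.srcArray = cudaArray;
    copyParms.dstArray = m_cudaArray;
    copyParms.extent = make_cudaExtent(m_width, m_height, 1); // extent in array elements (float4 texels)
    copyParms.kind = cudaMemcpyDeviceToDevice;
    checkCudaErrors(cudaMemcpy3DAsync(&copyParms, m_streamToRun));
    // block until the copy is done before anything on the D3D12 side reads the resource
    checkCudaErrors(cudaStreamSynchronize(m_streamToRun));
}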
void createResource(size_t width, size_t height, ID3D12Resource** d3d12Resource) {
    // Create a d3d12 resource of the desired size and map it to a cudaArray
    m_width = width;
    m_height = height;
    // Create D3D12 2D texture; assume a 32-bit float RGBA image
    const auto channels = 4;
    const auto textureSurface = width * height;
    const auto texturePixels = textureSurface * channels;
    const auto textureSizeBytes = sizeof(float) * texturePixels;
    const auto texFormat = channels == 4 ? DXGI_FORMAT_R32G32B32A32_FLOAT : DXGI_FORMAT_R32G32B32_FLOAT;
    const auto texDesc = CD3DX12_RESOURCE_DESC::Tex2D(texFormat, width, height, 1, 1, 1, 0, D3D12_RESOURCE_FLAG_ALLOW_SIMULTANEOUS_ACCESS);
    D3D12_HEAP_PROPERTIES heapProperties = {
        D3D12_HEAP_TYPE_DEFAULT,
        D3D12_CPU_PAGE_PROPERTY_UNKNOWN,
        D3D12_MEMORY_POOL_UNKNOWN,
        0,
        0};
    winrt::check_hresult(m_d3d12Device->CreateCommittedResource(
        &heapProperties,
        D3D12_HEAP_FLAG_SHARED,
        &texDesc,
        D3D12_RESOURCE_STATE_COMMON,
        nullptr,
        IID_PPV_ARGS(d3d12Resource)));
    // Create CUDA external resource
    HANDLE sharedHandle;
    WindowsSecurityAttributes windowsSecurityAttributes{};
    LPCWSTR name = NULL;
    winrt::check_hresult(m_d3d12Device->CreateSharedHandle(
        *d3d12Resource, &windowsSecurityAttributes, GENERIC_ALL, 0,
        &sharedHandle));
    D3D12_RESOURCE_ALLOCATION_INFO d3d12ResourceAllocationInfo;
    d3d12ResourceAllocationInfo = m_d3d12Device->GetResourceAllocationInfo(
        m_nodeMask, 1, &texDesc);
    size_t actualSize = d3d12ResourceAllocationInfo.SizeInBytes;
    size_t alignment = d3d12ResourceAllocationInfo.Alignment;
    cudaExternalMemoryHandleDesc externalMemoryHandleDesc;
    memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
    externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
    externalMemoryHandleDesc.handle.win32.handle = sharedHandle;
    externalMemoryHandleDesc.size = actualSize;
    externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
    checkCudaErrors(
        cudaImportExternalMemory(&m_externalMemory, &externalMemoryHandleDesc));
    cudaExternalMemoryMipmappedArrayDesc cuExtmemMipDesc{};
    cuExtmemMipDesc.extent = make_cudaExtent(width, height, 0);
    cuExtmemMipDesc.formatDesc = cudaCreateChannelDesc<float4>();
    cuExtmemMipDesc.numLevels = 1;
    cuExtmemMipDesc.flags = cudaArrayDefault;
    cudaMipmappedArray_t cuMipArray{};
    checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(&cuMipArray, m_externalMemory, &cuExtmemMipDesc));
    checkCudaErrors(cudaGetMipmappedArrayLevel(&m_cudaArray, cuMipArray, 0));
}
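On the synchronization point: the sample I linked shares an ID3D12Fence with CUDA and signals/waits on it around a command queue. Below is a rough, untested sketch of what I think the CUDA side of that would look like (it assumes a fence created with D3D12_FENCE_FLAG_SHARED, a shared handle for it, and members m_externalSemaphore and m_streamToRun). What I don't understand is whether anything like this is needed at all when I never record or submit D3D12 command lists myself.
void importFenceAsExternalSemaphore(HANDLE sharedFenceHandle) {
    // import a shared ID3D12Fence so that CUDA can signal/wait on it
    cudaExternalSemaphoreHandleDesc extSemDesc{};
    extSemDesc.type = cudaExternalSemaphoreHandleTypeD3D12Fence;
    extSemDesc.handle.win32.handle = sharedFenceHandle;
    checkCudaErrors(cudaImportExternalSemaphore(&m_externalSemaphore, &extSemDesc));
}
void signalFenceFromCuda(uint64_t fenceValue) {
    // signal the imported fence on the CUDA stream after the copy is enqueued,
    // so D3D12/DirectML work can wait on it (e.g. via ID3D12CommandQueue::Wait)
    cudaExternalSemaphoreSignalParams signalParams{};
    signalParams.params.fence.value = fenceValue;
    checkCudaErrors(cudaSignalExternalSemaphoresAsync(
        &m_externalSemaphore, &signalParams, 1, m_streamToRun));
}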
Finally, if the mapping to an ID3D12Resource works, I assume I can use the ITensorStaticsNative interface to create a tensor to bind to the input or output of a LearningModel.
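For reference, this is roughly what I had in mind for that last step, based on the ITensorStaticsNative documentation (untested sketch; the NCHW shape {1, 4, height, width} is just my guess for an RGBA float image, and I'm not sure whether a Texture2D resource is even acceptable here or whether it has to be a buffer):
#include <d3d12.h>
#include <winrt/Windows.AI.MachineLearning.h>
#include <windows.ai.machinelearning.native.h>

winrt::Windows::AI::MachineLearning::TensorFloat tensorFromResource(
    ID3D12Resource* d3d12Resource, int64_t height, int64_t width) {
    // get the native tensor factory for TensorFloat
    auto tensorFactory = winrt::get_activation_factory<
        winrt::Windows::AI::MachineLearning::TensorFloat, ITensorStaticsNative>();
    // shape is an assumption: batch 1, 4 channels (RGBA), height, width
    int64_t shape[4] = {1, 4, height, width};
    winrt::com_ptr<::IUnknown> unknownTensor;
    winrt::check_hresult(tensorFactory->CreateFromD3D12Resource(
        d3d12Resource, shape, 4, unknownTensor.put()));
    // convert back to the projected TensorFloat so it can be bound to the LearningModel
    return unknownTensor.as<winrt::Windows::AI::MachineLearning::TensorFloat>();
}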