I'm trying to develop an ML-enabled plugin for a real-time image processing application. The host software provides the image data on the GPU as a cudaArray_t, but because it locks me into an older CUDA version, I'd like to use DirectML (the software is Windows-only anyway).
For latency reasons I want to avoid any unnecessary GPU-CPU-GPU round trips. To do that, I believe I need to map the CUDA data to a D3D12 resource, which I can then use to create the input and output tensors to bind to the model. I found a sample here that uses the CUDA External Resource Interoperability API to map a cudaArray_t to an ID3D12Resource, and I'm trying to use it as the basis for my code. Since I don't need to render anything, I assumed I could simply create the heap and resource and then copy the incoming cudaArray_t into the interop cudaArray_t as shown below, without creating any kind of command queue. The code I've omitted is identical to the code in the GitHub repository linked above, so I've left it out for brevity.
This approach doesn't work, but I'm not sure how to debug it, since I'm new to Direct3D and to GPU programming in general. I'm using the official Direct3D 12 documentation as a reference, but it's a bit overwhelming, so some direction on what needs to be fixed here would be much appreciated :) I suspect I need some kind of synchronization via semaphores, but I'm not sure whether that's even possible without creating some sort of command queue.
bool initD3d12() {
    // setup the d3d12 device
    UINT dxgiFactoryFlags = 0;
    winrt::com_ptr<IDXGIFactory4> factory;
    winrt::check_hresult(CreateDXGIFactory2(dxgiFactoryFlags, IID_PPV_ARGS(factory.put())));
    winrt::com_ptr<IDXGIAdapter1> hardwareAdapter;
    GetHardwareAdapter(factory.get(), hardwareAdapter.put());
    winrt::check_hresult(D3D12CreateDevice(hardwareAdapter.get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(m_d3d12Device.put())));
    DXGI_ADAPTER_DESC1 desc;
    hardwareAdapter->GetDesc1(&desc);
    m_dx12deviceluid = desc.AdapterLuid;
    return true;
}
void initCuda() {
    // setup the cuda device: pick the CUDA device whose LUID matches the D3D12 adapter
    int num_cuda_devices = 0;
    checkCudaErrors(cudaGetDeviceCount(&num_cuda_devices));
    if (!num_cuda_devices) {
        throw std::exception("No CUDA Devices found");
    }
    for (int devId = 0; devId < num_cuda_devices; devId++) {
        cudaDeviceProp devProp;
        checkCudaErrors(cudaGetDeviceProperties(&devProp, devId));
        if ((memcmp(&m_dx12deviceluid.LowPart, devProp.luid,
                    sizeof(m_dx12deviceluid.LowPart)) == 0) &&
            (memcmp(&m_dx12deviceluid.HighPart,
                    devProp.luid + sizeof(m_dx12deviceluid.LowPart),
                    sizeof(m_dx12deviceluid.HighPart)) == 0)) {
            checkCudaErrors(cudaSetDevice(devId));
            m_cudaDeviceID = devId;
            m_nodeMask = devProp.luidDeviceNodeMask;
            checkCudaErrors(cudaStreamCreate(&m_streamToRun));
            printf("CUDA Device Used [%d] %s\n", devId, devProp.name);
            break;
        }
    }
}
void copyArrayToResource(cudaArray_t cudaArray) {
    // copy the incoming cudaArray into m_cudaArray, which is the mapped form of the D3D12 texture
    checkCudaErrors(cudaMemcpy2DArrayToArray(
        m_cudaArray,                            // dst array
        0, 0,                                   // dst offsets
        cudaArray, 0, 0,                        // src array + offsets
        m_width * 4 * sizeof(float), m_height,  // width in bytes, height in rows
        cudaMemcpyDeviceToDevice));             // kind
}
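In case it matters, my understanding is that the same copy could also be issued on the stream created in initCuda and then waited on, roughly like the untested sketch below (as far as I can tell there is no stream variant of cudaMemcpy2DArrayToArray, so it goes through cudaMemcpy3DAsync instead):
void copyArrayToResourceAsync(cudaArray_t cudaArray) {
    // array-to-array copy on m_streamToRun via cudaMemcpy3DAsync
    cudaMemcpy3DParms copyParms{};
    copyParms.srcArray = cudaArray;
    copyParms.dstArray = m_cudaArray;
    copyParms.extent = make_cudaExtent(m_width, m_height, 1); // extent in array elements (float4 texels)
    copyParms.kind = cudaMemcpyDeviceToDevice;
    checkCudaErrors(cudaMemcpy3DAsync(&copyParms, m_streamToRun));
    // block until the copy is done before anything on the D3D12 side reads the resource
    checkCudaErrors(cudaStreamSynchronize(m_streamToRun));
}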
void createResource(size_t width, size_t height, ID3D12Resource** d3d12Resource) {
    // Create a d3d12 resource of the desired size and map it to a cudaArray
    m_width = width;
    m_height = height;
    // Create D3D12 2D texture; assume a 32-bit float RGBA image
    const auto channels = 4;
    const auto textureSurface = width * height;
    const auto texturePixels = textureSurface * channels;
    const auto textureSizeBytes = sizeof(float) * texturePixels;
    const auto texFormat = channels == 4 ? DXGI_FORMAT_R32G32B32A32_FLOAT : DXGI_FORMAT_R32G32B32_FLOAT;
    const auto texDesc = CD3DX12_RESOURCE_DESC::Tex2D(texFormat, width, height, 1, 1, 1, 0, D3D12_RESOURCE_FLAG_ALLOW_SIMULTANEOUS_ACCESS);
    D3D12_HEAP_PROPERTIES heapProperties = {
        D3D12_HEAP_TYPE_DEFAULT,
        D3D12_CPU_PAGE_PROPERTY_UNKNOWN,
        D3D12_MEMORY_POOL_UNKNOWN,
        0,
        0};
    winrt::check_hresult(m_d3d12Device->CreateCommittedResource(
        &heapProperties,
        D3D12_HEAP_FLAG_SHARED,
        &texDesc,
        D3D12_RESOURCE_STATE_COMMON,
        nullptr,
        IID_PPV_ARGS(d3d12Resource)));
    // Create CUDA external resource
    HANDLE sharedHandle;
    WindowsSecurityAttributes windowsSecurityAttributes{};
    LPCWSTR name = NULL;
    winrt::check_hresult(m_d3d12Device->CreateSharedHandle(
        *d3d12Resource, &windowsSecurityAttributes, GENERIC_ALL, 0,
        &sharedHandle));
    D3D12_RESOURCE_ALLOCATION_INFO d3d12ResourceAllocationInfo;
    d3d12ResourceAllocationInfo = m_d3d12Device->GetResourceAllocationInfo(
        m_nodeMask, 1, &texDesc);
    size_t actualSize = d3d12ResourceAllocationInfo.SizeInBytes;
    size_t alignment = d3d12ResourceAllocationInfo.Alignment;
    cudaExternalMemoryHandleDesc externalMemoryHandleDesc;
    memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
    externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
    externalMemoryHandleDesc.handle.win32.handle = sharedHandle;
    externalMemoryHandleDesc.size = actualSize;
    externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
    checkCudaErrors(
        cudaImportExternalMemory(&m_externalMemory, &externalMemoryHandleDesc));
    cudaExternalMemoryMipmappedArrayDesc cuExtmemMipDesc{};
    cuExtmemMipDesc.extent = make_cudaExtent(width, height, 0);
    cuExtmemMipDesc.formatDesc = cudaCreateChannelDesc<float4>();
    cuExtmemMipDesc.numLevels = 1;
    cuExtmemMipDesc.flags = cudaArrayDefault;
    cudaMipmappedArray_t cuMipArray{};
    checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(&cuMipArray, m_externalMemory, &cuExtmemMipDesc));
    checkCudaErrors(cudaGetMipmappedArrayLevel(&m_cudaArray, cuMipArray, 0));
}
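On the synchronization point: the sample I linked shares an ID3D12Fence with CUDA and signals/waits on it around a command queue. Below is a rough, untested sketch of what I think the CUDA side of that would look like (it assumes a fence created with D3D12_FENCE_FLAG_SHARED, a shared handle for it, and members m_externalSemaphore and m_streamToRun). What I don't understand is whether anything like this is needed at all when I never record or submit D3D12 command lists myself.
void importFenceAsExternalSemaphore(HANDLE sharedFenceHandle) {
    // import a shared ID3D12Fence so that CUDA can signal/wait on it
    cudaExternalSemaphoreHandleDesc extSemDesc{};
    extSemDesc.type = cudaExternalSemaphoreHandleTypeD3D12Fence;
    extSemDesc.handle.win32.handle = sharedFenceHandle;
    checkCudaErrors(cudaImportExternalSemaphore(&m_externalSemaphore, &extSemDesc));
}
void signalFenceFromCuda(uint64_t fenceValue) {
    // signal the imported fence on the CUDA stream after the copy is enqueued,
    // so D3D12/DirectML work can wait on it (e.g. via ID3D12CommandQueue::Wait)
    cudaExternalSemaphoreSignalParams signalParams{};
    signalParams.params.fence.value = fenceValue;
    checkCudaErrors(cudaSignalExternalSemaphoresAsync(
        &m_externalSemaphore, &signalParams, 1, m_streamToRun));
}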
Finally, if the mapping to an ID3D12Resource works, I assume I can use the ITensorStaticsNative interface to create a tensor to bind to the input or output of a LearningModel.
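For reference, this is roughly what I had in mind for that last step, based on the ITensorStaticsNative documentation (untested sketch; the NCHW shape {1, 4, height, width} is just my guess for an RGBA float image, and I'm not sure whether a Texture2D resource is even acceptable here or whether it has to be a buffer):
#include <d3d12.h>
#include <winrt/Windows.AI.MachineLearning.h>
#include <windows.ai.machinelearning.native.h>

winrt::Windows::AI::MachineLearning::TensorFloat tensorFromResource(
    ID3D12Resource* d3d12Resource, int64_t height, int64_t width) {
    // get the native tensor factory for TensorFloat
    auto tensorFactory = winrt::get_activation_factory<
        winrt::Windows::AI::MachineLearning::TensorFloat, ITensorStaticsNative>();
    // shape is an assumption: batch 1, 4 channels (RGBA), height, width
    int64_t shape[4] = {1, 4, height, width};
    winrt::com_ptr<::IUnknown> unknownTensor;
    winrt::check_hresult(tensorFactory->CreateFromD3D12Resource(
        d3d12Resource, shape, 4, unknownTensor.put()));
    // convert back to the projected TensorFloat so it can be bound to the LearningModel
    return unknownTensor.as<winrt::Windows::AI::MachineLearning::TensorFloat>();
}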