c++ - 如何将CUDA代码分成多个文件

Question

我正在尝试将一个 CUDA 程序分成两个单独的 .cu 文件，以便更接近于用 C++ 编写一个真正的应用程序。我有一个简单的小程序：

在主机和设备上分配内存。
将主机数组初始化为一系列数字；将主机数组复制到设备数组；使用设备内核计算数组中所有元素的平方；将设备数组复制回主机数组；打印结果。

如果我把所有代码放在一个 .cu 文件中并运行，它工作得很好。当我将它分成两个单独的文件时，就开始出现链接错误。就像我最近的所有问题一样，我知道这是一件小事，但问题出在哪里？

KernelSupport.cu

#ifndef _KERNEL_SUPPORT_
#define _KERNEL_SUPPORT_

#include <iostream>
#include <MyKernel.cu>

// Prints a diagnostic on stderr when a CUDA runtime call did not succeed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << "\n";
    return false;
}

// Fills a 16-element host array with 1..16, squares every element on the
// device with TestDevice, copies the result back and prints the array before
// and after. Returns 0 on success and 1 if an allocation, copy or the kernel
// launch fails.
int main( int argc, char** argv)
{
    const int arrayLength = 16;
    const unsigned int memSize = sizeof(int) * arrayLength;

    // TestDevice performs no bounds check, so the launch must create exactly
    // arrayLength threads; derive the block count from the array length
    // instead of hard-coding both numbers independently.
    const int threadsPerBlock = 4;
    const int blockCount = arrayLength / threadsPerBlock;

    int* hostArray = (int*)malloc(memSize);
    if (hostArray == NULL)
    {
        std::cerr << "malloc failed\n";
        return 1;
    }

    int* deviceArray = NULL;
    if (!cudaSucceeded(cudaMalloc((void**) &deviceArray, memSize), "cudaMalloc"))
    {
        free(hostArray);
        return 1;
    }

    std::cout << "Before device\n";
    for(int i=0;i<arrayLength;i++)
    {
        hostArray[i] = i+1;
        std::cout << hostArray[i] << "\n";
    }
    std::cout << "\n";

    bool ok = cudaSucceeded(
        cudaMemcpy(deviceArray, hostArray, memSize, cudaMemcpyHostToDevice),
        "cudaMemcpy (host to device)");

    if (ok)
    {
        TestDevice <<< blockCount, threadsPerBlock >>> (deviceArray);
        // A kernel launch returns nothing; configuration errors are only
        // visible through cudaGetLastError().
        ok = cudaSucceeded(cudaGetLastError(), "TestDevice launch");
    }

    if (ok)
    {
        // This blocking copy waits for the kernel to finish, so errors raised
        // while the kernel executed are reported here as well.
        ok = cudaSucceeded(
            cudaMemcpy(hostArray, deviceArray, memSize, cudaMemcpyDeviceToHost),
            "cudaMemcpy (device to host)");
    }

    if (ok)
    {
        std::cout << "After device\n";
        for(int i=0;i<arrayLength;i++)
        {
            std::cout << hostArray[i] << "\n";
        }
    }

    // Release both buffers on the success path and on every failure path.
    cudaFree(deviceArray);
    free(hostArray);

    if (!ok)
    {
        return 1;
    }

    std::cout << "Done\n";
    return 0;
}

#endif

MyKernel.cu

#ifndef _MY_KERNEL_
#define _MY_KERNEL_

// Squares one element of deviceArray in place. Each thread handles the
// element at its flat 1D global index (block index * block size + thread
// index). There is no bounds check, so the launch must not create more
// threads than the array has elements.
__global__ void TestDevice(int *deviceArray)
{
    const int element = threadIdx.x + blockDim.x * blockIdx.x;
    const int value = deviceArray[element];
    deviceArray[element] = value * value;
}


#endif

构建日志：

1>------ Build started: Project: CUDASandbox, Configuration: Debug x64 ------
1>Compiling with CUDA Build Rule...
1>"C:\CUDA\bin64\nvcc.exe"    -arch sm_10 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\bin"    -Xcompiler "/EHsc /W3 /nologo /O2 /Zi   /MT  "  -maxrregcount=32  --compile -o "x64\Debug\KernelSupport.cu.obj" "d:\Stuff\Programming\Visual Studio 2008\Projects\CUDASandbox\CUDASandbox\KernelSupport.cu" 
1>KernelSupport.cu
1>tmpxft_000016f4_00000000-3_KernelSupport.cudafe1.gpu
1>tmpxft_000016f4_00000000-8_KernelSupport.cudafe2.gpu
1>tmpxft_000016f4_00000000-3_KernelSupport.cudafe1.cpp
1>tmpxft_000016f4_00000000-12_KernelSupport.ii
1>Linking...
1>KernelSupport.cu.obj : error LNK2005: __device_stub__Z10TestDevicePi already defined in MyKernel.cu.obj
1>KernelSupport.cu.obj : error LNK2005: "void __cdecl TestDevice__entry(int *)" (?TestDevice__entry@@YAXPEAH@Z) already defined in MyKernel.cu.obj
1>D:\Stuff\Programming\Visual Studio 2008\Projects\CUDASandbox\x64\Debug\CUDASandbox.exe : fatal error LNK1169: one or more multiply defined symbols found
1>Build log was saved at "file://d:\Stuff\Programming\Visual Studio 2008\Projects\CUDASandbox\CUDASandbox\x64\Debug\BuildLog.htm"
1>CUDASandbox - 3 error(s), 0 warning(s)
========== Build: 0 succeeded, 1 failed, 0 up-to-date, 0 skipped ==========

我在 Windows 7 64 位上运行 Visual Studio 2008。

编辑：

我想我需要稍微详细说明一下。我想要的最终结果是一个普通的 C++ 应用程序：由类似 Main.cpp 的文件提供 int main() 入口，程序从那里开始运行。在我的 .cpp 代码中，我希望能够引用 CUDA 部分。所以我的想法（如果这里有更标准的约定，请纠正我）是把 CUDA 内核代码放入各自的 .cu 文件中，然后用一个辅助的 .cu 文件来负责与设备通信、调用内核函数等等。

score 13 · Accepted Answer

您在 kernelsupport.cu 中包含了 mykernel.cu，因此在链接时 mykernel.cu 中的定义会出现两次。您必须创建一个声明 TestDevice 的头文件，并改为包含这个头文件。

重新评论：

像这样的东西应该工作

// MyKernel.h
#ifndef mykernel_h
#define mykernel_h
// Declaration only: the kernel body stays in MyKernel.cu, so it is compiled
// exactly once and other .cu files can include this header instead of the
// .cu file itself. Because the prototype uses __global__, this header is
// meant for files compiled by nvcc, not for plain .cpp files.
__global__ void TestDevice(int* devicearray);
#endif

然后将包含文件更改为

//KernelSupport.cu
#ifndef _KERNEL_SUPPORT_
#define _KERNEL_SUPPORT_

#include <iostream>
#include <MyKernel.h>
// ...

重新编辑

只要您在 c++ 代码中使用的标头没有任何 cuda 特定的东西（__kernel__,__global__等），您应该可以很好地链接 c++ 和 cuda 代码。

score 4 · Accepted Answer

如果您查看 CUDA SDK 的代码示例，会发现它们使用 extern "C" 声明来引用从 .cu 文件编译出的函数。这样，.cu 文件由 nvcc 编译，之后只需链接到主程序中，而 .cpp 文件则照常编译。

例如，在 marchingCubes_kernel.cu 中有函数体：

// Host-side launcher for the classifyVoxel kernel. It is given C linkage
// (extern "C") and lives in the .cu file, so a .cpp file can call it by
// declaring the same prototype, without ever seeing the <<<...>>> syntax.
//
// grid / threads  launch configuration, passed straight to the kernel launch
// remaining args  forwarded unchanged to classifyVoxel, in the same order
extern "C" void
launch_classifyVoxel( dim3 grid, dim3 threads, uint* voxelVerts, uint *voxelOccupied, uchar *volume,
                      uint3 gridSize, uint3 gridSizeShift, uint3 gridSizeMask, uint numVoxels,
                      float3 voxelSize, float isoValue)
{
    // calculate the number of vertices needed per voxel
    classifyVoxel<<<grid, threads>>>(voxelVerts, voxelOccupied, volume, 
                                     gridSize, gridSizeShift, gridSizeMask, 
                                     numVoxels, voxelSize, isoValue);
    // Report a failure of the launch above; cutilCheckMsg comes from the SDK's
    // cutil helper library.
    cutilCheckMsg("classifyVoxel failed");
}

而在 marchingCubes.cpp（main() 所在的位置）中只有一个声明：

// Declaration of the launcher defined in marchingCubes_kernel.cu. The
// extern "C" linkage must match the definition so that the linker resolves
// this call to the nvcc-compiled object.
extern "C" void
launch_classifyVoxel( dim3 grid, dim3 threads, uint* voxelVerts, uint *voxelOccupied, uchar *volume,
                      uint3 gridSize, uint3 gridSizeShift, uint3 gridSizeMask, uint numVoxels,
                      float3 voxelSize, float isoValue);

您也可以将它们放入 .h 文件中。

score 3 · Accepted Answer

获得分离实际上非常简单，请查看此答案以了解如何设置。然后，您只需将主机代码放在 .cpp 文件中，将设备代码放在 .cu 文件中，构建规则会告诉 Visual Studio 如何将它们链接到最终的可执行文件中。

代码中的直接问题是 __global__ TestDevice 函数被定义了两次：一次是在 #include MyKernel.cu 时，一次是在独立编译 MyKernel.cu 时。

您还需要在 .cu 文件中放一个包装函数——目前您是在 main 函数中用 TestDevice<<<>>> 调用内核，但当您把 main 移到 .cpp 文件后，它将由不理解 <<<>>> 语法的 cl.exe 编译。因此，您只需在 .cpp 文件中调用 TestDeviceWrapper(griddim, blockdim, params)，并在 .cu 文件中提供这个函数。

如果你想要一个例子，SDK 中的 SobolQRNG 示例实现了很好的分离，尽管它仍然使用 cutil，我总是建议避免使用 cutil。

score -3 · Accepted Answer

简单的解决方案是关闭 MyKernel.cu 文件的构建。

属性 -> 常规 -> 从构建中排除

更好的解决方案 imo 是将内核拆分为一个 cu 和一个 cuh 文件，并将其包括在内，例如：

//kernel.cu
#include "kernel.cuh"
#include <cstdio>
#include <cuda_runtime.h>

// Adds one to the element of vals selected by this thread's x index within
// its block. Only threadIdx.x is used for addressing, so threads with the
// same threadIdx.x in different blocks would touch the same element.
__global__ void increment_by_one_kernel(int* vals) {
  const unsigned int slot = threadIdx.x;
  vals[slot] = vals[slot] + 1;
}

// Increments the single int pointed to by `a` on the GPU: allocates a device
// copy, uploads *a, runs increment_by_one_kernel with one thread, and
// downloads the result back into *a.
//
// a  host pointer to one int; updated in place on success.
//
// On any CUDA failure the remaining steps are skipped, a message is printed
// to stderr, and *a is left as it was before the failing step.
void increment_by_one(int* a) {
  // cudaMalloc and cudaMemcpy take sizes in bytes, so a whole int has to be
  // requested; a size of 1 would move only a single byte of the value.
  const size_t bytes = sizeof(int);
  int* a_d = NULL;

  cudaError_t status = cudaMalloc(&a_d, bytes);
  if (status == cudaSuccess) {
    status = cudaMemcpy(a_d, a, bytes, cudaMemcpyHostToDevice);
  }
  if (status == cudaSuccess) {
    increment_by_one_kernel<<<1, 1>>>(a_d);
    // A launch has no return value; fetch configuration errors explicitly.
    status = cudaGetLastError();
  }
  if (status == cudaSuccess) {
    // Blocking copy: also waits for the kernel and reports its errors.
    status = cudaMemcpy(a, a_d, bytes, cudaMemcpyDeviceToHost);
  }
  if (status != cudaSuccess) {
    std::fprintf(stderr, "increment_by_one: %s\n", cudaGetErrorString(status));
  }

  // Safe even when the allocation failed: freeing a null pointer does nothing.
  cudaFree(a_d);
}

//kernel.cuh
#pragma once

void increment_by_one(int* a);

//main.cpp
#include "kernel.cuh"

// Host-only entry point: passes a one-element array to increment_by_one,
// which is declared in kernel.cuh, then exits with status 0.
int main() {
  int value[1] = {1};

  increment_by_one(&value[0]);

  return 0;
}

c++ - 如何将CUDA代码分成多个文件

4 回答 4

Related

Reference