cuda - CUDA 函数指针

Question

我试图在CUDA中做这样的事情（实际上我需要编写一些集成函数）

#include <iostream>
using namespace std;

float f1(float x) {
    return x * x;
}

float f2(float x) {
    return x;
}

void tabulate(float p_f(float)) {
    for (int i = 0; i != 10; ++i) {
        std::cout << p_f(i) << ' ';
    }
    std::cout << std::endl;
}

int main() {
    tabulate(f1);
    tabulate(f2);
    return 0;
}

输出：

0 1 4 9 16 25 36 49 64 81
0 1 2 3 4 5 6 7 8 9

我尝试了以下但只得到了错误

错误：sm_1x 不支持函数指针和函数模板参数。

float f1(float x) {
    return x;
}

__global__ void tabulate(float lower, float upper, float p_function(float), float* result) {
    for (lower; lower < upper; lower++) {
        *result = *result + p_function(lower);
    }
}

int main() {
    float res;
    float* dev_res;

    cudaMalloc( (void**)&dev_res, sizeof(float) ) ;

    tabulate<<<1,1>>>(0.0, 5.0, f1, dev_res);
    cudaMemcpy(&res, dev_res, sizeof(float), cudaMemcpyDeviceToHost);

    printf("%f\n", res);
    /************************************************************************/
    scanf("%s");

    return 0;
}

score 10 · Accepted Answer

要消除编译错误，您必须-gencode arch=compute_20,code=sm_20在编译代码时将其用作编译器参数。但是你可能会遇到一些运行时问题：

摘自 CUDA 编程指南http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#functions

主机代码支持指向函数的函数指针__global__，但设备代码不支持。__device__仅在为计算能力 2.x 及更高版本的设备编译的设备代码中支持指向函数的函数指针。

不允许__device__在主机代码中获取函数的地址。

所以你可以有这样的东西（改编自“FunctionPointers”示例）：

//your function pointer type - returns unsigned char, takes parameters of type unsigned char and float
typedef unsigned char(*pointFunction_t)(unsigned char, float);

//some device function to be pointed to
__device__ unsigned char
Threshold(unsigned char in, float thresh)
{
   ...
}

//pComputeThreshold is a device-side function pointer to your __device__ function
__device__ pointFunction_t pComputeThreshold = Threshold;
//the host-side function pointer to your __device__ function
pointFunction_t h_pointFunction;

//in host code: copy the function pointers to their host equivalent
cudaMemcpyFromSymbol(&h_pointFunction, pComputeThreshold, sizeof(pointFunction_t))

然后，您可以将h_pointFunction作为参数传递给内核，内核可以使用它来调用您的__device__函数。

//your kernel taking your __device__ function pointer as a parameter
__global__ void kernel(pointFunction_t pPointOperation)
{
    unsigned char tmp;
    ...
    tmp = (*pPointOperation)(tmp, 150.0)
    ...
}

//invoke the kernel in host code, passing in your host-side __device__ function pointer
kernel<<<...>>>(h_pointFunction);

希望这有点道理。总而言之，看起来您必须将 f1 函数更改为__device__函数并遵循类似的过程（typedef 不是必需的，但它们确实使代码更好）才能将其作为主机上的有效函数指针-side 传递给你的内核。我还建议给 FunctionPointers CUDA 示例看看

score 1 · Accepted Answer

即使您可以编译此代码（请参阅@Robert Crovella 的答案），此代码也不起作用。您不能从主机代码传递函数指针，因为主机编译器无法确定函数地址。

score 1 · Accepted Answer

这是一个简单的函数指针类，可以从我根据这个问题编写的内核中调用：

template <typename T>
struct cudaCallableFunctionPointer
{
public:
  cudaCallableFunctionPointer(T* f_)
  {
    T* host_ptr = (T*)malloc(sizeof(T));
    cudaMalloc((void**)&ptr, sizeof(T));

    cudaMemcpyFromSymbol(host_ptr, *f_, sizeof(T));
    cudaMemcpy(ptr, host_ptr, sizeof(T), cudaMemcpyHostToDevice);
    
    cudaFree(host_ptr)
  }

  ~cudaCallableFunctionPointer()
  {
    cudaFree(ptr);
  }

  T* ptr;
};

你可以像这样使用它：

__device__ double func1(double x)
{
    return x + 1.0f;
}

typedef double (*func)(double x);
__device__ func f_ = func1;



__global__ void test_kernel(func* f)
{
    double x = (*f)(2.0);
    printf("%g\n", x);
}



int main()
{
    cudaCallableFunctionPointer<func> f(&f_);

    test_kernel << < 1, 1 >> > (f.ptr);
}

输出：

cuda - CUDA 函数指针

3 回答 3

Related

Reference