1

在我的程序中,我想使用一个包含常量变量的结构体,并让它一直保留在设备上,直到程序执行完毕。

我有几个头文件,其中包含 `__global__` 函数的声明,而这些函数的定义放在各自对应的 “.cu” 文件中。我保留了这个方案,因为它可以帮助我把相似的代码集中放在一处。例如,完成 'KERNEL_1' 所需的所有 `__device__` 函数及其内核定义,与完成 'KERNEL_2' 所需的 `__device__` 函数及其内核定义是分开存放的。

在编译和链接期间,这个方案一直没有任何问题,直到我用到常量变量。我想在所有内核和设备函数中使用同一个常量变量,但它似乎不起作用。

##########################################################################
                                CODE EXAMPLE
###########################################################################
filename: 'common.h'
--------------------------------------------------------------------------
// Plain-data parameter record: two double fields (height, weight) and one
// int field (age). No units are stated anywhere in this example.
// NOTE(review): identifiers containing a double underscore are reserved for
// the implementation in C++; the name is kept because every snippet uses it.
typedef struct {
    double height;
    double weight;
    int age;
} __CONSTANTS;

// Constant-memory instance of the record above.
// NOTE(review): this is a definition (not an extern declaration) placed in a
// header that both main.cu and gpukernels.cu include, so each translation
// unit defines its own d_const. Presumably the copy written by
// cudaMemcpyToSymbol in main.cu is not the one test_kernel reads -- confirm
// against the build mode (whole-program vs. -rdc=true).
__constant__ __CONSTANTS d_const;

---------------------------------------------------------------------------
filename: main.cu
---------------------------------------------------------------------------
#include "common.h"
#include "gpukernels.h"
// Host entry point: builds a host-side record, uploads it to the d_const
// symbol, then runs test_kernel on one block of 16 threads and waits for it.
// argc and argv are not used.
int main(int argc, char **argv) {

    // Aggregate initialization in declaration order: height, weight, age.
    __CONSTANTS host_params = { 1.79, 73.2, 26 };

    cudaMemcpyToSymbol(d_const, &host_params, sizeof(host_params));

    const int num_blocks = 1;
    const int threads_per_block = 16;
    test_kernel <<< num_blocks, threads_per_block >>>();

    // Wait for the kernel to finish before the program exits.
    cudaDeviceSynchronize();
}

---------------------------------------------------------------------------
filename: gpukernels.h
---------------------------------------------------------------------------
// Kernel prototype; takes no arguments. The definition is in gpukernels.cu.
__global__ void test_kernel();

---------------------------------------------------------------------------
filename: gpukernels.cu
---------------------------------------------------------------------------
#include <stdio.h>
#include "gpukernels.h"
#include "common.h"

// Debug kernel: every thread prints its x index within the block together
// with the height and weight fields of d_const. The age field is not printed.
__global__ void test_kernel() {
    const double height = d_const.height;
    const double weight = d_const.weight;
    printf("Id: %d, height: %f, weight: %f\n", threadIdx.x, height, weight);
}

当我执行此代码时,内核执行,显示线程 ID,但常量值显示为零。我怎样才能解决这个问题?

建议的修改

filename: gpukernels.h
----------------------------------------------------------------------

// Kernel prototype; takes no arguments. The definition is in gpukernels.cu.
__global__ void test_kernel();

----------------------------------------------------------------------
filename: gpukernels.cu
----------------------------------------------------------------------

#include <stdio.h>
#include "common.h"
#include "gpukernels.h"

// Declaration (no storage) of the d_const object that main.cu defines.
// NOTE(review): main.cu defines d_const without extern "C", so the language
// linkage declared here differs from the definition. The answer further down
// uses a plain `extern __constant__` declaration together with -rdc=true --
// confirm which form the toolchain requires.
extern "C" __constant__ __CONSTANTS d_const;

// Debug kernel: every thread prints its x index within the block together
// with the height and weight fields of d_const. The age field is not printed.
__global__ void test_kernel() {
    printf("Id: %d, Height: %f, Weight: %f\n", threadIdx.x, d_const.height, d_const.weight);
}

----------------------------------------------------------------------
filename: common.h
----------------------------------------------------------------------

// Plain-data parameter record shared by main.cu and gpukernels.cu: two double
// fields (height, weight) and one int field (age). This version of the header
// only declares the type; it no longer defines any variable.
typedef struct {
    double height;
    double weight;
    int age;
} __CONSTANTS;

----------------------------------------------------------------------
filename: main.cu
----------------------------------------------------------------------
#include "common.h"
#include "gpukernels.h"

// The single definition of the constant-memory record in this version of the
// example; gpukernels.cu refers to it through its own extern declaration.
__constant__ __CONSTANTS d_const;

// Host entry point: fills a host-side record, copies it into the d_const
// symbol, launches test_kernel with 1 block of 16 threads, waits for it to
// finish and returns 0. argc and argv are not used.
int main(int argc, char **argv) {

    __CONSTANTS host_values;
    host_values.age    = 26;
    host_values.weight = 73.2;
    host_values.height = 1.79;

    // Upload the whole record in one copy.
    const size_t bytes = sizeof(host_values);
    cudaMemcpyToSymbol(d_const, &host_values, bytes);

    const dim3 grid(1);
    const dim3 block(16);
    test_kernel <<< grid, block >>> ();

    // Wait for the kernel to finish before the program exits.
    cudaDeviceSynchronize();

    return 0;
}

于是我按照建议修改并运行了代码,但仍然无法正常工作。我是不是遗漏了什么?

4

1 回答 1

2

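下面给出对我有效的解决方案。请注意,您使用的是分离编译(separate compilation),因此不要忘记启用 Generate Relocatable Device Code(即 -rdc=true 选项)。d_const 在 main.cu 中定义,在 kernel.cu 中用 extern 声明;只有启用可重定位设备代码并进行设备端链接,这两处才会指向同一个常量变量。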

文件 main.cu

#include <stdio.h>

#include <cuda.h>
#include <cuda_runtime.h>

// Plain-data parameter record: two double fields (height, weight) and one
// int field (age). kernel.cu repeats this exact definition, so the two copies
// have to be kept in sync by hand.
typedef struct {
    double height;
    double weight;
    int age;
} __CONSTANTS;

// The single definition of the constant-memory record; kernel.cu refers to
// it through an `extern __constant__` declaration.
__constant__ __CONSTANTS d_const;

// Kernel prototype; takes no arguments. The definition is in kernel.cu.
__global__ void test_kernel();

#include <conio.h>
// Host entry point of the separate-compilation example.
//
// Fills a host-side record, copies it into the d_const symbol, launches
// test_kernel (defined in kernel.cu) with 1 block of 16 threads and waits for
// it to finish. Every CUDA runtime call is checked; the first failure is
// reported on stderr and the remaining CUDA steps are skipped.
//
// argc and argv are not used.
// Returns 0 when all CUDA calls succeeded, 1 otherwise.
int main(int argc, char **argv) {

    __CONSTANTS T;
    T.height   = 1.79;
    T.weight   = 73.2;
    T.age      = 26;

    // If the upload fails the kernel would read whatever d_const already
    // holds, so the return code must not be dropped.
    cudaError_t status = cudaMemcpyToSymbol(d_const, &T, sizeof(__CONSTANTS));

    if (status == cudaSuccess) {
        test_kernel <<< 1, 16 >>>();
        // The launch expression yields no status; launch errors are
        // retrieved with cudaGetLastError().
        status = cudaGetLastError();
    }

    if (status == cudaSuccess) {
        // Errors raised while the kernel runs are reported by the next
        // synchronizing call.
        status = cudaDeviceSynchronize();
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    // Wait for a key press before exiting (getch comes from the non-standard
    // <conio.h> header); done after the error report so the message is seen.
    getch();
    return (status == cudaSuccess) ? 0 : 1;
}

文件 kernel.cu

#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>

// Plain-data parameter record: two double fields (height, weight) and one
// int field (age). Duplicates the definition in main.cu; the two copies have
// to be kept in sync by hand.
typedef struct {
    double height;
    double weight;
    int age;
} __CONSTANTS;

// Declaration only: the object itself is defined in main.cu. The accompanying
// answer text says the build needs -rdc=true for this cross-file reference.
extern __constant__ __CONSTANTS d_const;

// Debug kernel: every thread prints its x index within the block together
// with the height and weight fields of d_const. The age field is not printed.
// Takes no arguments and writes no memory; its only effect is the printf output.
__global__ void test_kernel() {
    // threadIdx.x is an unsigned int, so it is printed with %u rather than %d.
    printf("Id: %u, height: %f, weight: %f\n", threadIdx.x, d_const.height, d_const.weight);
}
于 2013-10-25T12:54:45.780 回答