1

我正在实现一个使用 NVML 库的示例程序,参考的是 https://devtalk.nvidia.com/default/topic/504951/how-to-call-nvml-apis-/ 中给出的示例。

程序如下:

#include <stdio.h>

#include <nvidia/gdk/nvml.h>



/*
 * Translate an NVML compute mode into a short printable label.
 *
 * mode: the compute mode value to describe.
 *
 * Returns a pointer to a string literal; any value that is not one of the
 * four modes checked below yields "Unknown".
 */
const char * convertToComputeModeString(nvmlComputeMode_t mode)
{
    const char *label = "Unknown";

    if (mode == NVML_COMPUTEMODE_DEFAULT)
    {
        label = "Default";
    }
    else if (mode == NVML_COMPUTEMODE_EXCLUSIVE_THREAD)
    {
        label = "Exclusive_Thread";
    }
    else if (mode == NVML_COMPUTEMODE_PROHIBITED)
    {
        label = "Prohibited";
    }
    else if (mode == NVML_COMPUTEMODE_EXCLUSIVE_PROCESS)
    {
        label = "Exclusive Process";
    }

    return label;
}



/*
 * List every device NVML reports, printing its index, name and PCI bus id,
 * then try to switch each device's compute mode to "Prohibited" and, if that
 * succeeded, switch it back to the mode it had before.
 *
 * Returns 0 when the whole walk completed, 1 as soon as an NVML call fails
 * with an error that is not handled explicitly below. On both paths the
 * program prints a prompt and waits for a character on stdin before exiting.
 */
int main(void)
{
    nvmlReturn_t result;
    unsigned int device_count, i;
    int exit_code = 1;   /* pessimistic default; cleared once the loop finishes */

    // First initialize NVML library
    result = nvmlInit();
    if (NVML_SUCCESS != result)
    {
        printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));

        // Nothing was initialized, so there is nothing to shut down here.
        printf("Press ENTER to continue...\n");
        getchar();
        return 1;
    }

    result = nvmlDeviceGetCount(&device_count);
    if (NVML_SUCCESS != result)
    {
        printf("Failed to query device count: %s\n", nvmlErrorString(result));
        goto Cleanup;
    }
    // device_count and i are unsigned int, so they are printed with %u.
    printf("Found %u device%s\n\n", device_count, device_count != 1 ? "s" : "");

    printf("Listing devices:\n");
    for (i = 0; i < device_count; i++)
    {
        nvmlDevice_t device;
        char name[64];
        nvmlPciInfo_t pci;
        nvmlComputeMode_t compute_mode;

        // Query for device handle to perform operations on a device
        // You can also query device handle by other features like:
        // nvmlDeviceGetHandleBySerial
        // nvmlDeviceGetHandleByPciBusId
        result = nvmlDeviceGetHandleByIndex(i, &device);
        if (NVML_SUCCESS != result)
        {
            printf("Failed to get handle for device %u: %s\n", i, nvmlErrorString(result));
            goto Cleanup;
        }

        result = nvmlDeviceGetName(device, name, sizeof(name)/sizeof(name[0]));
        if (NVML_SUCCESS != result)
        {
            printf("Failed to get name of device %u: %s\n", i, nvmlErrorString(result));
            goto Cleanup;
        }

        // pci.busId is very useful to know which device physically you're talking to
        // Using PCI identifier you can also match nvmlDevice handle to CUDA device.
        result = nvmlDeviceGetPciInfo(device, &pci);
        if (NVML_SUCCESS != result)
        {
            printf("Failed to get pci info for device %u: %s\n", i, nvmlErrorString(result));
            goto Cleanup;
        }

        printf("%u. %s [%s]\n", i, name, pci.busId);

        // This is a simple example on how you can modify GPU's state
        result = nvmlDeviceGetComputeMode(device, &compute_mode);
        if (NVML_ERROR_NOT_SUPPORTED == result)
        {
            // Not fatal: report it and move on to the next device.
            printf("\t This is not CUDA capable device\n");
        }
        else if (NVML_SUCCESS != result)
        {
            printf("Failed to get compute mode for device %u: %s\n", i, nvmlErrorString(result));
            goto Cleanup;
        }
        else
        {
            // try to change compute mode
            printf("\t Changing device's compute mode from '%s' to '%s'\n",
                    convertToComputeModeString(compute_mode),
                    convertToComputeModeString(NVML_COMPUTEMODE_PROHIBITED));

            result = nvmlDeviceSetComputeMode(device, NVML_COMPUTEMODE_PROHIBITED);
            if (NVML_ERROR_NO_PERMISSION == result)
            {
                // Not fatal: the mode was left untouched, so nothing to restore.
                printf("\t\t Need root privileges to do that: %s\n", nvmlErrorString(result));
            }
            else if (NVML_ERROR_NOT_SUPPORTED == result)
            {
                printf("\t\t Compute mode prohibited not supported. You might be running on\n"
                       "\t\t windows in WDDM driver model or on non-CUDA capable GPU.\n");
            }
            else if (NVML_SUCCESS != result)
            {
                printf("\t\t Failed to set compute mode for device %u: %s\n", i, nvmlErrorString(result));
                goto Cleanup;
            }
            else
            {
                // The change went through, so put the original mode back.
                printf("\t Restoring device's compute mode back to '%s'\n",
                        convertToComputeModeString(compute_mode));
                result = nvmlDeviceSetComputeMode(device, compute_mode);
                if (NVML_SUCCESS != result)
                {
                    printf("\t\t Failed to restore compute mode for device %u: %s\n", i, nvmlErrorString(result));
                    goto Cleanup;
                }
            }
        }
    }

    exit_code = 0;

Cleanup:
    // Shared epilogue for the success path and every failure after nvmlInit().
    result = nvmlShutdown();
    if (NVML_SUCCESS != result)
    {
        // A failed shutdown is reported but does not change the exit code.
        printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result));
    }
    if (0 == exit_code)
    {
        printf("All done.\n");
    }
    printf("Press ENTER to continue...\n");
    getchar();
    return exit_code;
}

Makefile 如下:

# Build the NVML example program.
#
# The directory searched for libnvidia-ml is chosen from the word size
# reported by `getconf LONG_BIT`.
ARCH   := $(shell getconf LONG_BIT)

ifeq (${ARCH},32)
  NVML_LIB := ../lib/
else ifeq (${ARCH},64)
  NVML_LIB := /usr/lib/nvidia-340/
else
 $(error Unknown architecture!)
endif

CFLAGS  := -I ../inc

# LDFLAGS carries only the library search path. The library itself lives in
# LDLIBS so that it is placed AFTER the object file on the link line: the
# linker resolves symbols left to right, and a library named before the
# objects that need it can be skipped, giving "undefined reference" errors.
LDFLAGS := -L $(NVML_LIB)
LDLIBS  := -lnvidia-ml

.PHONY: clean

# Recipe lines must start with a TAB character, not spaces.
example: example.o
	$(CC) $(LDFLAGS) $< $(LDLIBS) -o $@

clean:
	-@rm -f example.o
	-@rm -f example

我得到的错误是:

cc -lnvidia-ml -L /usr/src/gdk/nvml/lib/ example.o -o example
example.o: In function `main':
example.c:(.text+0x5f): undefined reference to `nvmlInit_v2'
example.c:(.text+0x7b): undefined reference to `nvmlErrorString'
example.c:(.text+0xb5): undefined reference to `nvmlDeviceGetCount_v2'
example.c:(.text+0xd1): undefined reference to `nvmlErrorString'
example.c:(.text+0x149): undefined reference to `nvmlDeviceGetHandleByIndex_v2'
example.c:(.text+0x165): undefined reference to `nvmlErrorString'
example.c:(.text+0x19f): undefined reference to `nvmlDeviceGetName'
example.c:(.text+0x1bb): undefined reference to `nvmlErrorString'
example.c:(.text+0x1f3): undefined reference to `nvmlDeviceGetPciInfo_v2'
example.c:(.text+0x20f): undefined reference to `nvmlErrorString'
example.c:(.text+0x269): undefined reference to `nvmlDeviceGetComputeMode'
example.c:(.text+0x29d): undefined reference to `nvmlErrorString'
example.c:(.text+0x2ff): undefined reference to `nvmlDeviceSetComputeMode'
example.c:(.text+0x31b): undefined reference to `nvmlErrorString'
example.c:(.text+0x360): undefined reference to `nvmlErrorString'
example.c:(.text+0x3b5): undefined reference to `nvmlDeviceSetComputeMode'
example.c:(.text+0x3d1): undefined reference to `nvmlErrorString'
example.c:(.text+0x40c): undefined reference to `nvmlShutdown'
example.c:(.text+0x428): undefined reference to `nvmlErrorString'
example.c:(.text+0x45f): undefined reference to `nvmlShutdown'
example.c:(.text+0x47b): undefined reference to `nvmlErrorString'
collect2: error: ld returned 1 exit status
make: *** [example] Error 1
pranjal@PCL:~/nvidia$ make
cc -lnvidia-ml -L /usr/lib/nvidia-340/ example.o -o example
example.o: In function `main':
example.c:(.text+0x5f): undefined reference to `nvmlInit_v2'
example.c:(.text+0x7b): undefined reference to `nvmlErrorString'
example.c:(.text+0xb5): undefined reference to `nvmlDeviceGetCount_v2'
example.c:(.text+0xd1): undefined reference to `nvmlErrorString'
example.c:(.text+0x149): undefined reference to `nvmlDeviceGetHandleByIndex_v2'
example.c:(.text+0x165): undefined reference to `nvmlErrorString'
example.c:(.text+0x19f): undefined reference to `nvmlDeviceGetName'
example.c:(.text+0x1bb): undefined reference to `nvmlErrorString'
example.c:(.text+0x1f3): undefined reference to `nvmlDeviceGetPciInfo_v2'
example.c:(.text+0x20f): undefined reference to `nvmlErrorString'
example.c:(.text+0x269): undefined reference to `nvmlDeviceGetComputeMode'
example.c:(.text+0x29d): undefined reference to `nvmlErrorString'
example.c:(.text+0x2ff): undefined reference to `nvmlDeviceSetComputeMode'
example.c:(.text+0x31b): undefined reference to `nvmlErrorString'
example.c:(.text+0x360): undefined reference to `nvmlErrorString'
example.c:(.text+0x3b5): undefined reference to `nvmlDeviceSetComputeMode'
example.c:(.text+0x3d1): undefined reference to `nvmlErrorString'
example.c:(.text+0x40c): undefined reference to `nvmlShutdown'
example.c:(.text+0x428): undefined reference to `nvmlErrorString'
example.c:(.text+0x45f): undefined reference to `nvmlShutdown'
example.c:(.text+0x47b): undefined reference to `nvmlErrorString'
collect2: error: ld returned 1 exit status
make: *** [example] Error 1

如能提供任何帮助,我将不胜感激。谢谢。

4

1 回答 1

2

这是我在 linux CUDA 7.5 设置上所做的:

  1. 将 GPU 驱动程序更新为 352.79。就我而言,这是通过此处的 runfile(.run 文件)安装程序完成的。如果您以前是通过包管理器方式(例如 .deb)安装的 GPU 驱动程序,那么不应使用 runfile 安装方式。

  2. 获取最新版本的 GDK(参见下面的注释),此时恰好针对 352.79,并包括 nvml:

    wget --no-check-certificate http://developer.download.nvidia.com/compute/cuda/7.5/Prod/gdk/gdk_linux_amd64_352_79_release.run
    
  3. 安装 GDK:

    sh gdk_linux_amd64_352_79_release.run
    
  4. 验证相应的库是否已更新:

    ls /usr/lib64/libnv*
    

    (你应该看到libnvidia-ml.so.352.79等)

  5. 编译示例文件:

    g++ -I./gdk352_79/usr/include -L/usr/lib64 -lnvidia-ml example.c -o example
    

当我运行example可执行文件时,我得到:

$ ./example
Found 2 devices

Listing devices:
0. Quadro 5000 [0000:02:00.0]
         Changing device's compute mode from 'Default' to 'Prohibited'
                 Need root privileges to do that: Insufficient Permissions
1. GeForce GT 640 [0000:03:00.0]
         Changing device's compute mode from 'Default' to 'Prohibited'
                 Need root privileges to do that: Insufficient Permissions
All done.
Press ENTER to continue...

$

希望这能帮助您继续推进。我假设您不需要他人帮助就能对 Makefile 做必要的修改。如果您的 Makefile 不工作,请继续修改它,直到它执行的编译命令与我在步骤 5 中列出的命令完全一致。

注意:从 CUDA 8.0 开始,GDK 不是一个单独的实体,而是随 CUDA 8.0 工具包一起安装的。不需要单独安装 GDK。

于 2016-03-21T19:21:51.483 回答