2

这是某种并行归约/求极值内核的一部分。我已将其精简为仍会使 clBuildProgram 崩溃的最小代码(请注意,它是真的崩溃了,而不仅仅是返回错误代码):

编辑:当 local_value 被声明为 global 而不是 local 时,似乎也会出现同样的问题。

EDIT2 / SOLUTION:问题是有一个无限循环。我应该写remaining_items >>= 1而不是remaining_items >> 1。正如答案中所说,nvidia 编译器在编译/优化错误方面似乎不是很健壮。

// Reproducer kernel from the question, intentionally left exactly as posted.
// It sketches a max-style reduction over local_value: each work-item compares
// its own slot with the slot remaining_items positions to the right and keeps
// the larger value.
kernel void testkernel(local float *local_value)
{
    size_t thread_id = get_local_id(0);

    int remaining_items = 1024;

    while (remaining_items > 1)
    {
        // throw away the right half of the threads
        // NOTE: `>>` only computes a value and discards it; remaining_items is
        // never modified, so the loop condition stays true forever. The
        // intended operator is `>>=`.
        remaining_items >> 1; // <-- SPOTTED THE BUG
        // Work-items whose id exceeds remaining_items leave the kernel here and
        // therefore never reach the barrier at the bottom of the loop.
        if (thread_id > remaining_items)
        {
            return;
        }

        // look for a greater value in the right half of the memory space
        int right_index = thread_id + remaining_items;
        float right_value = local_value[right_index];
        // keep the larger of the two values in this work-item's own slot
        if (right_value > local_value[thread_id])
        {
            local_value[thread_id] = right_value;
        }

        // Only the work-items that did not return above execute this barrier.
        barrier(CLK_GLOBAL_MEM_FENCE);
    }
}

删除这些行return;和/或local_value[thread_id] = right_value;导致 clBuildProgram 成功完成。

我可以在我的所有计算机上重现这个问题(NVIDIA GTX 560、GT 555M、GT 540M,它们都是 Fermi 2.1 架构)。在使用 x64 或 x86 库时,在 NVIDIA CUDA Toolkit SDK 版本 4.0、4.1 和 4.2 上很明显。

有谁知道可能是什么问题?

是否有可能编译器自动假定本地(又名共享)内存的大小是 (WORK_GROUP_SIZE) * sizeof(its_base_type)?这可以解释为什么当我上面提到的行被删除时它会起作用。


用于复制的最小主机代码(C99 兼容):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

// RETURN_THROW: for calls that return their status directly. Evaluates
// `expression` into a local cl_int; if it is non-zero, prints the expression
// text together with the code and terminates the process with exit status 1.
#define RETURN_THROW(expression) do { cl_int ret = expression; if (ret) { printf(#expression " FAILED: %d\n" , ret); exit(1); } } while (0)
// REF_THROW: for calls that report their status through an out-parameter.
// Declares a local `cl_int ret` which `expression` is expected to fill in
// (e.g. by passing `&ret`); a non-zero value prints the expression text and
// the code, then exits with status 1. `ret` is not initialized by the macro,
// so the expression must always write to it.
#define REF_THROW(expression) do { cl_int ret; expression; if (ret) { printf(#expression " FAILED: %d\n" , ret); exit(1); } } while (0)

int main(int argc, char **argv)
{
    // Load the kernel source code into the array source_str
    FILE *fp;

    fp = fopen("testkernel.cl", "rb");
    if (!fp)
    {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    fseek(fp, 0, SEEK_END);
    int filesize = ftell(fp);
    rewind(fp);
    char *source_str = (char*)calloc(filesize, sizeof(char));
    size_t bytes_read = fread(source_str, 1, filesize, fp);
    source_str[bytes_read] = 0;
    fclose(fp);

    // Get platform information
    cl_uint num_platforms;
    RETURN_THROW(clGetPlatformIDs(0, NULL, &num_platforms));

    cl_platform_id *platform_ids = (cl_platform_id *)calloc(num_platforms, sizeof(cl_platform_id));
    RETURN_THROW(clGetPlatformIDs(num_platforms, platform_ids, NULL));

    cl_device_id selected_device_id = NULL;

    printf("available platforms:\n");
    for (cl_uint i = 0; i < num_platforms; i++)
    {
        char platform_name[50];
        RETURN_THROW(clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, 50, platform_name, NULL));
        printf("%s\n", platform_name);

        // get devices for this platform
        cl_uint num_devices;
        RETURN_THROW(clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices));

        cl_device_id *device_ids = (cl_device_id *)calloc(num_devices, sizeof(cl_device_id));
        RETURN_THROW(clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, num_devices, device_ids, NULL));

        // select first nvidia device
        if (strstr(platform_name, "NVIDIA"))        // ADAPT THIS ACCORDINGLY
        {
            selected_device_id = device_ids[0];
        }
    }

    if (selected_device_id == NULL)
    {
        printf("No NVIDIA device found\n");
        exit(1);
    }

    // Create an OpenCL context
    cl_context context;
    REF_THROW(context = clCreateContext(NULL, 1, &selected_device_id, NULL, NULL, &ret));

    // Create a program from the kernel source
    cl_program program;
    REF_THROW(program = clCreateProgramWithSource(context, 1, (const char **)&source_str, NULL, &ret));

    // Build the program
    cl_int ret = clBuildProgram(program, 1, &selected_device_id, NULL, NULL, NULL);
    if (ret)
    {
        printf("BUILD ERROR\n");
        // build error - get build log and display it
        size_t build_log_size;
        ret = clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size);
        char *build_log = new char[build_log_size];
        ret = clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL);
        printf("%s\n", build_log);
        exit(1);
    }

    printf("build finished successfully\n");
    return 0;
}
4

1 回答 1

1

根据我的经验,nvidia 编译器在处理构建错误时不是很健壮,因此您可能在某处遇到编译错误。

我认为您的问题确实出在 return 上,或者更确切地说,出在它与 barrier 的组合上。OpenCL 规范中关于屏障的说明如下:

在处理器上执行内核的工作组中的所有工作项必须执行此功能,然后才能允许任何工作项继续执行越过障碍。执行内核的工作组中的所有工作项都必须遇到此函数。

如果屏障在条件语句内,则如果任何工作项进入条件语句并执行屏障,则所有工作项都必须输入条件语句。

如果屏障(barrier)在循环内,则所有工作项都必须为循环的每次迭代执行屏障,然后才能允许任何工作项继续执行超出屏障。

所以我认为你的问题可能是很多线程会在到达障碍之前返回,使这段代码无效。也许你应该尝试这样的事情:

// Work-group maximum reduction over local memory. Each pass halves the active
// range and folds the right half into the left half, so the largest value
// moves towards local_value[0].
// NOTE(review): assumes local_value holds at least 1024 floats and that the
// work-group is large enough to cover the first pass — confirm against the host.
kernel void testkernel(local float *local_value) {
    size_t thread_id = get_local_id(0);
    int remaining_items = 1024;
    while (remaining_items > 1) {
        remaining_items >>= 1; // throw away the right half of the threads
        // Strict '<': only ids 0 .. remaining_items-1 are active. With '<=' the
        // work-item at id == remaining_items would read one element past the
        // active range and write a slot that work-item 0 reads in this pass.
        if (thread_id < (size_t)remaining_items) {
            // look for a greater value in the right half of the memory space
            size_t right_index = thread_id + (size_t)remaining_items;
            float right_value = local_value[right_index];
            if (right_value > local_value[thread_id])
                local_value[thread_id] = right_value;
        }
        // Reached by every work-item on every iteration (no early return).
        // The data is in local memory, so a local fence is needed to make the
        // writes visible to the next pass.
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}

Edit: Furthermore as noted in the comments it needs to be remaining_items>>=1 instead of remaining_items>>1 in order to avoid producing an infinite loop.

于 2012-07-02T12:33:17.563 回答