2

当我尝试从 OpenCL in Action 的源代码中运行缩减程序时遇到问题。

我使用 Visual Studio 2008。这是错误:

Reduction.exe 中 0x013526a7 中的未处理异常:0xC00000FD:堆栈溢出。

在 asm 文件中,光标是

测试 dword ptr [eax],eax ; 探测页面。

我试图调试它,但是当我在 main 函数中放置一个断点时,调试开始了,但程序并没有继续运行。

我不知道真正的问题是什么。

这些是源文件:reduction.cpp

#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "reduction_complete.cl"

#define ARRAY_SIZE 1048576
#define KERNEL_1 "reduction_vector"
#define KERNEL_2 "reduction_complete"

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() {

   cl_platform_id platform;
   cl_device_id dev;
   int err;

   /* Identify a platform */
   err = clGetPlatformIDs(1, &platform, NULL);
   if(err < 0) {
      perror("Couldn't identify a platform");
      exit(1);
   } 

   /* Access a device */
   err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
   if(err == CL_DEVICE_NOT_FOUND) {
      err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
   }
   if(err < 0) {
      perror("Couldn't access any devices");
      exit(1);   
   }

   return dev;
}

/* Create program from a file and compile it */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {

   cl_program program;
   FILE *program_handle;
   char *program_buffer, *program_log;
   size_t program_size, log_size;
   int err;

   /* Read program file and place content into buffer */
   program_handle = fopen(filename, "r");
   if(program_handle == NULL) {
      perror("Couldn't find the program file");
      exit(1);
   }
   fseek(program_handle, 0, SEEK_END);
   program_size = ftell(program_handle);
   rewind(program_handle);
   program_buffer = (char*)malloc(program_size + 1);
   program_buffer[program_size] = '\0';
   fread(program_buffer, sizeof(char), program_size, program_handle);
   fclose(program_handle);

   /* Create program from file */
   program = clCreateProgramWithSource(ctx, 1, 
      (const char**)&program_buffer, &program_size, &err);
   if(err < 0) {
      perror("Couldn't create the program");
      exit(1);
   }
   free(program_buffer);

   /* Build program */
   err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
   if(err < 0) {

      /* Find size of log and print to std output */
      clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 
            0, NULL, &log_size);
      program_log = (char*) malloc(log_size + 1);
      program_log[log_size] = '\0';
      clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 
            log_size + 1, program_log, NULL);
      printf("%s\n", program_log);
      free(program_log);
      exit(1);
   }

   return program;
}

int main() {

   /* OpenCL structures */
   cl_device_id device;
   cl_context context;
   cl_program program;
   cl_kernel vector_kernel, complete_kernel;
   cl_command_queue queue;
   cl_event start_event, end_event;
   cl_int i, err;
   size_t local_size, global_size;

   /* Data and buffers */
   float data[ARRAY_SIZE];
   float sum, actual_sum;
   cl_mem data_buffer, sum_buffer;
   cl_ulong time_start, time_end, total_time;

   /* Initialize data */
   for(i=0; i<ARRAY_SIZE; i++) {
      data[i] = 1.0f*i;
   }

   /* Create device and determine local size */
   device = create_device();
   err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,     
         sizeof(local_size), &local_size, NULL);    
   if(err < 0) {
      perror("Couldn't obtain device information");
      exit(1);   
   }

   /* Create a context */
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn't create a context");
      exit(1);   
   }

   /* Build program */
   program = build_program(context, device, PROGRAM_FILE);

   /* Create data buffer */
   data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
         CL_MEM_USE_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err);
   sum_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
         sizeof(float), NULL, &err);
   if(err < 0) {
      perror("Couldn't create a buffer");
      exit(1);   
   };

   /* Create a command queue */
   queue = clCreateCommandQueue(context, device, 
         CL_QUEUE_PROFILING_ENABLE, &err);
   if(err < 0) {
      perror("Couldn't create a command queue");
      exit(1);   
   };

   /* Create kernels */
   vector_kernel = clCreateKernel(program, KERNEL_1, &err);
   complete_kernel = clCreateKernel(program, KERNEL_2, &err);
   if(err < 0) {
      perror("Couldn't create a kernel");
      exit(1);
   };

   /* Set arguments for vector kernel */
   err = clSetKernelArg(vector_kernel, 0, sizeof(cl_mem), &data_buffer);
   err |= clSetKernelArg(vector_kernel, 1, local_size * 4 * sizeof(float), NULL);

   /* Set arguments for complete kernel */
   err = clSetKernelArg(complete_kernel, 0, sizeof(cl_mem), &data_buffer);
   err |= clSetKernelArg(complete_kernel, 1, local_size * 4 * sizeof(float), NULL);
   err |= clSetKernelArg(complete_kernel, 2, sizeof(cl_mem), &sum_buffer);
   if(err < 0) {
      perror("Couldn't create a kernel argument");
      exit(1);   
   }

   /* Enqueue kernels */
   global_size = ARRAY_SIZE/4;
   err = clEnqueueNDRangeKernel(queue, vector_kernel, 1, NULL, &global_size, 
         &local_size, 0, NULL, &start_event);
   if(err < 0) {
      perror("Couldn't enqueue the kernel");
      exit(1);   
   }
   printf("Global size = %zu\n", global_size);

   /* Perform successive stages of the reduction */
   while(global_size/local_size > local_size) {
      global_size = global_size/local_size;
      err = clEnqueueNDRangeKernel(queue, vector_kernel, 1, NULL, &global_size, 
            &local_size, 0, NULL, NULL);
      printf("Global size = %zu\n", global_size);
      if(err < 0) {
         perror("Couldn't enqueue the kernel");
         exit(1);   
      }
   }
   global_size = global_size/local_size;
   err = clEnqueueNDRangeKernel(queue, complete_kernel, 1, NULL, &global_size, 
         NULL, 0, NULL, &end_event);
   printf("Global size = %zu\n", global_size);

   /* Finish processing the queue and get profiling information */
   clFinish(queue);
   clGetEventProfilingInfo(start_event, CL_PROFILING_COMMAND_START,
         sizeof(time_start), &time_start, NULL);
   clGetEventProfilingInfo(end_event, CL_PROFILING_COMMAND_END,
         sizeof(time_end), &time_end, NULL);
   total_time = time_end - time_start;

   /* Read the result */
   err = clEnqueueReadBuffer(queue, sum_buffer, CL_TRUE, 0, 
      sizeof(float), &sum, 0, NULL, NULL);
   if(err < 0) {
      perror("Couldn't read the buffer");
      exit(1);   
   }

   /* Check result */
   actual_sum = 1.0f * (ARRAY_SIZE/2)*(ARRAY_SIZE-1);
   if(fabs(sum - actual_sum) > 0.01*fabs(sum))
      printf("Check failed.\n");
   else
      printf("Check passed.\n");
   printf("Total time = %lu\n", total_time);

   /* Deallocate resources */
   clReleaseEvent(start_event);
   clReleaseEvent(end_event);
   clReleaseMemObject(sum_buffer);
   clReleaseMemObject(data_buffer);
   clReleaseKernel(vector_kernel);
   clReleaseKernel(complete_kernel);
   clReleaseCommandQueue(queue);
   clReleaseProgram(program);
   clReleaseContext(context);
   return 0;
}

reduction_complete.cl

__kernel void reduction_vector(__global float4* data, 
      __local float4* partial_sums) {

   int lid = get_local_id(0);
   int group_size = get_local_size(0);

   partial_sums[lid] = data[get_global_id(0)];
   barrier(CLK_LOCAL_MEM_FENCE);

   for(int i = group_size/2; i>0; i >>= 1) {
      if(lid < i) {
         partial_sums[lid] += partial_sums[lid + i];
      }
      barrier(CLK_LOCAL_MEM_FENCE);
   }

   if(lid == 0) {
      data[get_group_id(0)] = partial_sums[0];
   }
}

__kernel void reduction_complete(__global float4* data, 
      __local float4* partial_sums, __global float* sum) {

   int lid = get_local_id(0);
   int group_size = get_local_size(0);

   partial_sums[lid] = data[get_local_id(0)];
   barrier(CLK_LOCAL_MEM_FENCE);

   for(int i = group_size/2; i>0; i >>= 1) {
      if(lid < i) {
         partial_sums[lid] += partial_sums[lid + i];
      }
      barrier(CLK_LOCAL_MEM_FENCE);
   }

   if(lid == 0) {
      *sum = partial_sums[0].s0 + partial_sums[0].s1 +
             partial_sums[0].s2 + partial_sums[0].s3;
   }
}

我不知道是什么导致stackoverflow...

4

1 回答 1

2

我没有看到任何递归,所以我的猜测是float data[ARRAY_SIZE];#define ARRAY_SIZE 10485764MB 放在堆栈上,这非常大。尝试将其更改为动态分配。

于 2012-04-07T17:50:12.760 回答