有没有人在 NVIDIA GPU 上运行 AMD (http://developer.amd.com/libraries/appmathlibs/pages/default.aspx) 的 OpenCL FFT 库?
我正在尝试将现有算法从 CUDA(使用最新的 CUFFT)移植到 OpenCL。新代码在 AMD GPU 上运行良好,但在我的 NVIDIA GPU 上却不行:NVIDIA GPU 能被正确识别,但结果数组全部为零,而且没有报告任何错误。顺便说一句,代码在 Intel Core i3 CPU 上也能正常运行,所以我的代码本身似乎没有问题。
AMD 和 NVIDIA 似乎都不愿意就这个问题提供支持。
有任何想法吗?
编辑:
我的环境是 Windows 7 Professional x64 操作系统,使用 Visual Studio C++ Professional IDE 及其内置的 x86 编译器。NVIDIA GPU 是 GeForce GTX 560 Ti(MSI N560GTX-Ti Twin Frozr II/OC 1GB)。能够正常运行的设备是 CPU Intel Core i3-2100 (2x3.1GHz) 和 GPU Radeon HD 6850 (Sapphire Radeon HD 6850 1GB)。我分别针对 AMD、NVIDIA 和 Intel 的最新 OpenCL 版本编译了代码,结果都相同;最新的开发者驱动程序也已安装。
这是我非常基本的示例代码...
#include <stdio.h>
#include <stdlib.h>
#include <complex>
#include <clAmdFft.h>
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
// Make every name from namespace std visible unqualified for the rest of this file.
using namespace std;
// Complex sample type (single-precision real and imaginary parts) used for
// the host-side field buffers.
typedef std::complex<float> cl_compl_flt;
int main(int argc, char* argv[])
{
cl_uint width = 1024, height = 1024; // Field dimensions
cl_uint cl_platformsN = 0; // Platform count
cl_platform_id *cl_platformIDs = NULL; // IDs of OpenCL platforms
cl_uint cl_deviceCount = 0; // Device count
cl_device_id *cl_devices = NULL; // Device IDs
cl_int cl_err = 0; // Buffer for error informations
cl_context cl_dev_context; // Context
cl_command_queue cl_queue; // Queue
clAmdFftSetupData fftSetupData; // FFT setup data
clAmdFftPlanHandle fftPlan; // FFT plan
clAmdFftDim fftDim = CLFFT_2D; // FFT dimension
size_t fftSize[2]; // FFT size
fftSize[0] = width;
fftSize[1] = height;
cl_mem d_data; // Device level data
cl_compl_flt* h_src; // Host level input data
cl_compl_flt* h_res; // Host level output data
// Allocate host memory
h_src = (cl_compl_flt*)malloc(width*height*sizeof(cl_compl_flt));
h_res = (cl_compl_flt*)malloc(width*height*sizeof(cl_compl_flt));
// Get source field
createPinholeField( h_src, width, height, 5 );
// Get FFT version
checkCL( clAmdFftInitSetupData(&fftSetupData) );
printf("Using clAmdFft %u.%u.%u\n",fftSetupData.major,fftSetupData.minor,fftSetupData.patch);
// Get available platforms
checkCL( clGetPlatformIDs ( 0, NULL, &cl_platformsN));
cl_platformIDs = (cl_platform_id*) malloc( cl_platformsN * sizeof(cl_platform_id));
checkCL( clGetPlatformIDs( cl_platformsN, cl_platformIDs, NULL) );
// Loop over platforms
for( cl_uint i = 0; i < cl_platformsN; i++)
{
// Get number of available devices for this platform
checkCL( clGetDeviceIDs( cl_platformIDs[i], CL_DEVICE_TYPE_ALL, NULL, NULL, &cl_deviceCount));
// Skip platform if no device available
if(cl_deviceCount < 1)
continue;
// Get available device IDs for this platform
cl_devices = (cl_device_id*) malloc( cl_deviceCount * sizeof(cl_device_id));
checkCL( clGetDeviceIDs( cl_platformIDs[i], CL_DEVICE_TYPE_ALL, cl_deviceCount, cl_devices, NULL));
// Print platform name
char platform_name[1024];
checkCL( clGetPlatformInfo( cl_platformIDs[i], CL_PLATFORM_NAME, 1024, &platform_name, NULL) );
printf("\nCompute using OpenCl platfrom #%i [ %s ]\n", i,platform_name);
// Loop over devices
for( cl_uint j = 0; j < cl_deviceCount; j++)
{
// Print device name and type
cl_device_type device_type;
char device_name[1024];
checkCL( clGetDeviceInfo( cl_devices[j], CL_DEVICE_NAME, 1024, &device_name, NULL) );
checkCL( clGetDeviceInfo( cl_devices[j],CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL) );
printf("\n\tUsing OpenCl device #%i [ %s -- %s ]\n", j, device_name, getDevTypeString(device_type));
// Create OpenCL context
cl_context_properties cps[3] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)cl_platformIDs[i],
0
};
cl_dev_context = clCreateContext( cps, cl_deviceCount, cl_devices, NULL, NULL, &cl_err);
checkCL( cl_err);
// Create command queue
cl_queue = clCreateCommandQueue( cl_dev_context, cl_devices[j], CL_QUEUE_PROFILING_ENABLE, &cl_err);
checkCL( cl_err);
// Create device buffer
d_data = clCreateBuffer( cl_dev_context, CL_MEM_READ_WRITE, width*height*sizeof(cl_compl_flt), NULL, &cl_err);
checkCL( cl_err);
// Setup FFT
checkCL( clAmdFftSetup(&fftSetupData) );
// Create FFT plan
checkCL( clAmdFftCreateDefaultPlan( &fftPlan, cl_dev_context, fftDim, fftSize) );
// Copy data from host to device
clEnqueueWriteBuffer( cl_queue, d_data, CL_TRUE, 0, width*height*sizeof(cl_compl_flt), h_src, 0, NULL, NULL);
// Execute FFT
checkCL( clAmdFftEnqueueTransform( fftPlan, CLFFT_FORWARD, 1, &cl_queue, 0, NULL, NULL, &d_data, NULL, NULL) );
clFinish( cl_queue);
// Copy result from device to host
checkCL( clEnqueueReadBuffer(cl_queue, d_data, CL_TRUE, 0, width*height*sizeof(cl_compl_flt), h_res, 0, NULL, NULL) );
clFinish( cl_queue);
// Save result
char filename[512];
sprintf( filename, "raw/result_%u_%u_in.raw",i,j);
printf("\tSave result to \"%s\" ", filename);
saveRawData( h_res, filename, width, height, true);
printf("\n");
// Free FFT plan
checkCL( clAmdFftDestroyPlan( &fftPlan) );
// Free FFT
checkCL( clAmdFftTeardown() );
// Free device memory
checkCL( clReleaseMemObject(d_data) );
// Release OpenCL context and queue
checkCL( clReleaseCommandQueue( cl_queue ) );
checkCL( clReleaseContext( cl_dev_context) );
}
// Free OpenCL devices
free( cl_devices);
}
free( h_src);
free( h_res);
printf("\n\nPress any key ...");
getchar();
return 0;
}
以及用到的其他函数...
// Generate a pinhole: fills `data` (row-major, width*height complex samples)
// with real value 255 inside a disc centred at (width/2, height/2) and 0
// everywhere else; all imaginary parts are set to 0.
//
// data   - caller-owned destination buffer of at least width*height elements.
//          If NULL the function does nothing: the pointer is passed by value,
//          so a buffer allocated here could never be handed back to the caller
//          (the previous version allocated one and leaked it).
// width  - field width in samples.
// height - field height in samples.
// radius - disc radius in samples; 0 selects half of the smaller dimension.
void createPinholeField( cl_compl_flt* data, cl_uint width, cl_uint height, cl_uint radius)
{
    if (data == NULL)
        return;

    if (radius < 1)
        radius = (width > height) ? height / 2 : width / 2;

    const cl_float min_val = 0.0f;
    const cl_float max_val = 255.0f;

    // Centre coordinates in double precision so the subtraction below cannot
    // wrap around as unsigned arithmetic would.
    const double center_x = width / 2.;
    const double center_y = height / 2.;

    for (cl_uint y = 0; y < height; y++)
    {
        const double dy = y - center_y;
        // Row offset computed in size_t to avoid 32-bit overflow on large fields
        const size_t row = (size_t)y * width;

        for (cl_uint x = 0; x < width; x++)
        {
            const double dx = x - center_x;
            const bool inside = ceil( sqrt( dx * dx + dy * dy )) <= radius;

            data[row + x] = cl_compl_flt( inside ? max_val : min_val, 0.f);
        }
    }
}
// Save a cl_compl_flt array as an unsigned char raw image file.
//
// The magnitude of every sample is computed, then mapped linearly from
// [min, 0.01*max] to [0, 255] (values above 1% of the maximum saturate at
// 255) and written as width*height bytes.
//
// char_array   - width*height complex samples, row-major.
// filepath     - destination file; opened in binary write mode.
// width/height - field dimensions in samples.
// print_minmax - when true, prints the unscaled min/max magnitude to stdout.
//
// Nothing is written when the input is NULL/empty or memory runs out; I/O
// failures are reported on stderr. When the magnitude range is empty (for
// example an all-zero field) the image is written as all zeros instead of
// dividing by zero.
void saveRawData( cl_compl_flt* char_array, const char* filepath, cl_uint width, cl_uint height, bool print_minmax )
{
    const size_t count = (size_t)width * height;
    if (char_array == NULL || filepath == NULL || count == 0)
        return;

    cl_float* abs_v = (cl_float*) malloc(count * sizeof(cl_float));
    cl_uchar* temp = (cl_uchar*) malloc(count * sizeof(cl_uchar));
    if (abs_v == NULL || temp == NULL)
    {
        fprintf(stderr, "saveRawData: out of memory\n");
        free(abs_v);
        free(temp);
        return;
    }

    // Magnitudes and their extrema in a single pass
    cl_float min = std::abs(char_array[0]);
    cl_float max = min;
    for (size_t i = 0; i < count; i++)
    {
        abs_v[i] = std::abs(char_array[i]);
        if (abs_v[i] < min)
            min = abs_v[i];
        if (abs_v[i] > max)
            max = abs_v[i];
    }

    if (print_minmax)
        printf(" [min=%f , max=%f] ",min,max);

    // Stretch contrast: everything above 1% of the peak saturates
    max *= .01f;
    const cl_float range = max - min;

    for (size_t i = 0; i < count; i++)
    {
        cl_float scaled = 0.0f;
        if (range > 0.0f)
            scaled = 255.0f * (abs_v[i] - min) / range;

        // Clamp before narrowing; the negated comparison also catches NaN
        if (!(scaled >= 0.0f))
            scaled = 0.0f;
        else if (scaled > 255.0f)
            scaled = 255.0f;

        temp[i] = (cl_uchar)scaled;
    }

    FILE *pFile = fopen(filepath, "wb");
    if (pFile == NULL)
    {
        fprintf(stderr, "saveRawData: could not open \"%s\" for writing\n", filepath);
    }
    else
    {
        if (fwrite(temp, 1, count, pFile) != count)
            fprintf(stderr, "saveRawData: short write to \"%s\"\n", filepath);
        if (fclose(pFile) != 0)
            fprintf(stderr, "saveRawData: could not close \"%s\"\n", filepath);
    }

    free(abs_v);
    free(temp);
}
// Validate a status code returned by an OpenCL (or clAmdFft) call.
//
// Returns true when the code equals CL_SUCCESS. For any other code the error
// ID is printed, the program waits for ENTER and then terminates via exit()
// with that code as exit status, so the function does not return in that case.
bool checkCL( cl_int oclErrorCode)
{
    if( oclErrorCode != CL_SUCCESS)
    {
        printf("\n\nAn OpenCL related error occured!\nError ID #%d\nPress ENTER to exit the program...\n\n", oclErrorCode);
        getchar();
        exit( oclErrorCode);
    }

    return true;
}
// Map an OpenCL device type to a short printable label.
//
// Returns "CPU", "GPU" or "ACCELERATOR" when `type` equals exactly the
// matching CL_DEVICE_TYPE_* constant, and "DEFAULT" for every other value.
// The returned pointer refers to a string literal: do not modify or free it.
char* getDevTypeString(cl_device_type type)
{
    if( type == CL_DEVICE_TYPE_CPU)
        return "CPU";

    if( type == CL_DEVICE_TYPE_GPU)
        return "GPU";

    if( type == CL_DEVICE_TYPE_ACCELERATOR)
        return "ACCELERATOR";

    return "DEFAULT";
}
我希望这有助于缩小问题的范围。
PS:图片可以在这里看到:http://devgurus.amd.com/thread/159149