我是 OpenCL 的新手,我在 clCreateKernel 中遇到问题,它抛出 CL_INVALID_PROGRAM_EXECUTABLE,任何人都可以帮忙,代码基于http://www.cs.bris.ac.uk/home/simonm/workshops/OpenCL_lecture3.pdf,最后优化
这是代码:
#define ORDER 10 // Order of the square matrices A, B, and C
#define AVAL 3.0 // A elements are constant and equal to AVAL
#define BVAL 5.0 // B elements are constant and equal to BVAL
#define TOL (0.001) // tolerance used in floating point comparisons
#define DIM 2 // Max dim for NDRange
#define COUNT 1 // number of times to do each multiplication
#define SUCCESS 1
#define FAILURE 0
// Funciones Auxiliares
void initmat(int Mdim, int Ndim, int Pdim, float *A, float *B, float *C)
{
int i, j;
/* Initialize matrices */
for (i = 0; i < Ndim; i++)
for (j = 0; j < Pdim; j++)
A[i*Ndim+j] = AVAL;
for (i = 0; i < Pdim; i++)
for (j = 0; j < Mdim; j++)
B[i*Pdim+j] = BVAL;
for (i = 0; i < Ndim; i++)
for (j = 0; j < Mdim; j++)
C[i*Ndim+j] = 0.0f;
}
// Definicion de la funcion:
char * readKernel(void)
{
size_t *source_length;
FILE *fp = fopen("kernel.cl", "r");
if (fp == NULL)
{
printf("Cannot Open Kernel.cl\n");
}
else
{
printf("Kernel.cl Opened\n");
}
fseek(fp, 0, SEEK_END);
source_length[0] = ftell(fp);
if (source_length[0] == 0)
{
printf("Kernel.cl is empty\n");
}
else
{
printf("Kernel.cl length: %zu bytes\n", source_length[0]);
}
char *source = (char*) calloc(source_length[0] + 1, 1);
if (source == 0)
{
printf("Memory allocation failed");
}
fseek(fp, 0, SEEK_SET);
fread(source, 1, source_length[0], fp);
printf("Kernel.cl Read\n");
return source;
}
int main(int argc, char **argv)
{
// Declare and iniciate data
float *A, *B, *C;
int Mdim, Ndim, Pdim;
int err, szA, szB, szC;
size_t global[DIM];
size_t local[DIM];
cl_device_id device_id;
cl_context context;
cl_command_queue commands;
cl_program program;
cl_kernel kernel;
cl_uint nd;
cl_mem a_in, b_in, c_out;
Ndim = ORDER;
Pdim = ORDER;
Mdim = ORDER;
szA = Ndim*Pdim;
szB = Pdim*Mdim;
szC = Ndim*Mdim;
A = (float *)malloc(szA*sizeof(float));
B = (float *)malloc(szB*sizeof(float));
C = (float *)malloc(szC*sizeof(float));
const char* C_elem_KernelSource =
"__kernel \n"
"void mmul( \n"
" const int Mdim, \n"
" const int Ndim, \n"
" const int Pdim, \n"
" __global float* A, \n"
" __global float* B, \n"
" __global float* C, \n"
" __local float* Bwrk) \n"
"{ \n"
" int k,j; \n"
" int i = get_global_id(0); \n"
" int iloc = get_local_id(0); \n"
" int nloc = get_local_size(0); \n"
" float Awrk[10]; \n"
" float tmp; \n"
" for (k=0; k<Pdim; k++) \n"
" Awrk[k] = A[i*Ndim+k]; \n"
" for (j=0; j<Mdim; j++){ \n"
" for (k=iloc; k<Pdim; k=k+nloc) \n"
" Bwrk[k] = B[k*Pdim+j]; \n"
" barrier(CLK_LOCAL_MEM_FENCE); \n"
" tmp = 0.0f; \n"
" for (k=0; k<Pdim; k++) \n"
" tmp += Awrk[k] * Bwrk[k]; \n"
" C[i*Ndim+j] += tmp; \n"
"} \n"
;
initmat(Mdim, Ndim, Pdim, A, B, C);
// Setup the plataform
cl_uint num_platforms;
if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS)
{
printf("Unable to get platform!\n");
}else{
printf("Plataformas Disponibles: %u \n", num_platforms);
}
//Identificador
cl_platform_id platform_id;
clGetPlatformIDs(1, &platform_id, &num_platforms);
printf("Plataformas creada\n");
err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
if (err==CL_SUCCESS){
printf("Device creado \n");
}else {
printf("Error %d \n", err);
}
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err);
if (err==CL_SUCCESS){
printf("Contexto creado \n");
}else {
printf("Error creando contexto \n");
}
commands = clCreateCommandQueue(context, device_id, 0, &err);
if (err==CL_SUCCESS){
printf("cola de comandos creadas \n");
}else {
printf("Error creando cola de comandos \n");
}
// Setup buffers and write A and B matrices to the device memory
a_in = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * szA, NULL, NULL);
b_in = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * szB, NULL, NULL);
c_out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC, NULL, NULL);
err = clEnqueueWriteBuffer(commands, a_in, CL_TRUE, 0, sizeof(float) * szA, A, 0, NULL, NULL);
err = clEnqueueWriteBuffer(commands, b_in, CL_TRUE, 0, sizeof(float) * szB, B, 0, NULL, NULL);
// Build the program, define the kernel and setup arguments
program = clCreateProgramWithSource(context, 1, (const char **) &C_elem_KernelSource, NULL, &err);
if (err==CL_SUCCESS){
printf("programa creado \n");
}else {
printf("Error generado %d creando programa\n", err);
}
//Compila el programa en el dispositivo elegido
clBuildProgram(program, 1, &device_id, NULL, NULL, NULL );
if (err==CL_SUCCESS){
printf("programa compilado 1\n");
}else {
printf("Error generado %d compilando programa 1\n", err);
}
kernel = clCreateKernel(program, "mmul", &err);
if (err==CL_SUCCESS){
printf("Kernel creado \n");
}else {
printf("Error generado %d creando kernel\n", err);
}
err = clSetKernelArg(kernel, 0, sizeof(int), &Mdim);
err |= clSetKernelArg(kernel, 1, sizeof(int), &Ndim);
err |= clSetKernelArg(kernel, 2, sizeof(int), &Pdim);
err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &a_in);
err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_in);
err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &c_out);
err |= clSetKernelArg(kernel, 6, sizeof(float)*Pdim, NULL);
if (err==CL_SUCCESS){
printf("Argumentos del Kernel configurados \n");
}else {
printf("Error configurando argumentos del kernel \n");
}
//Run the kernel and collect results
// 1D ND Range set to dimensions of C matrix
//Local Dim set to 250 so number of work-groups match number of
//compute units (4 in this case) for our order 1000 matrices
//Pass local memory to kernels. This requires a change to the kernel
//argument list … a new call to clSetKernelArg is needed
printf("Encolando Kernel:\n");
global[0] = (size_t) Ndim; global[1] = (size_t) Mdim; local[0] = (size_t) 2;
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, global, local, 0, NULL, NULL);
if (err==CL_SUCCESS){
printf("Kernel enviado a device \n");
}else {
printf("Error enviando kernel a device \n");
}
clFinish(commands);
err = clEnqueueReadBuffer(commands, c_out, CL_TRUE, 0, sizeof(float) * szC, C, 0, NULL, NULL );
//test_results(A, B, c_out);
}
谢谢