1
__kernel void kmeans_kernel(__global float* data, int points, 
                            __global float* centroids, int clusters, 
                             int dimensions)
{           

    //extern __shared__ float storage_space[];
     __local float storage_space[];

    __local int iterations;
    __local float *means; 
    __local float *index; 
    __local float *mindist;
    __local float *s_data;


    iterations = points / ( get_global_size(0)) + 1;    


    if( get_local_id(0) == 0 ){
        s_data[get_local_id(0)] = data[get_local_id(0)];
        means=&storage_space[0];
        index=&storage_space[get_local_size(0)];
        mindist=&storage_space[2*get_local_size(0)];
    }

    //data = &data[blockDim.x * blockIdx.x + threadIdx.x];
    data = &data[get_global_id(0)];         

    while( iterations )
    {
        mindist[get_local_id(0)] = 3.402823466e+38F;                
        index[get_local_id(0)] = 0;             

        for( short j = 0; j < clusters; j++ )
        {                       
            if( get_local_id(0) <= dimensions )
                //means[get_local_id(0)] = centroids[get_local_id(0)+j*c_pitch];
                means[get_local_id(0)] = centroids[get_local_id(0)+j];
            //__syncthreads();      
            barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);

            if( !(data[get_local_id(0)] - s_data[get_local_id(0)] > points - 1) )
            {
                float dist = distance_gpu_transpose( means, data, dimensions);
                if( dist < mindist[get_local_id(0)] )
                {
                    mindist[get_local_id(0)] = dist;
                    index[get_local_id(0)] = j;
                }                   
            }           
            //__syncthreads();          
            barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
        }

        if( !(data[get_local_id(0)] - s_data[get_local_id(0)] > points - 1) )                               
                data[0] = index[get_local_id(0)];

        data += (get_global_size(0));               
        if( get_local_id(0) == 0 )
            --iterations;
        //__syncthreads();
        barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
    }
}

我正在使用带有 ATI 3200 显卡的 AMD 处理器,它不支持 openCL,但其余代码在 CPU 本身上运行良好。

这次我的代码问题对我来说相当复杂。内核执行后,我无法使用clEnqueueReadBuffer. 在调试时它在这一点上被打破并说,

Unhandled exception at 0x10001098 in CL_kmeans.exe: 0xC000001D: Illegal Instruction.

当我在这里按下休息时,它给出了

No symbols are loaded for any call stack frame. The source code cannot be displayed.

这里可能有什么问题?请建议我一些解决方案。我的内核代码如上所示,我用来读取数据的语句是,

ret = clEnqueueReadBuffer(command_queue, gpu_data, CL_TRUE, 0,sizeof( float ) * instances->cols* 1 , instances->data, 0, NULL, NULL);

我如何在这里检查可能的错误?

4

0 回答 0