问题已解决(如果您有兴趣,可以查看分隔线下方的第二部分)。现在我有一个新问题:为什么 #define BLOCK_DIM 16;
会导致下面的函数出错?直接使用字面量 16 就没有问题。
这是错误
expected a "]"
__local float2 block[BLOCK_DIM * (BLOCK_DIM + 1)] ;
^
line 110: error:
expected a ")"
__local float2 block[BLOCK_DIM * (BLOCK_DIM + 1)] ;
^
line 110: error: operand
of "*" must be a pointer
__local float2 block[BLOCK_DIM * (BLOCK_DIM + 1)] ;
error:
expected a ";"
int Idout = get_local_id(0)*(BLOCK_DIM+1)+get_local_id(1);
^
函数如下:
/*
 * Transposes a matrix of float2 elements by staging one tile per work-group
 * in local memory.
 *
 * dataout : destination buffer, written as dataout[row * height + col]
 * datain  : source buffer, read as datain[row * width + col]
 * width   : N (signal length)
 * height  : batch_size (number of signals in a batch)
 *
 * NOTE(review): the tile origin is derived from get_group_id(..) * 32, so the
 * kernel presumably expects a 32x32 work-group -- confirm against host code.
 */
__kernel void transpose(
    __global float2* dataout,
    __global float2* datain,
    int width, int height)
{
    // Local staging tile; every tile row is 32 + 1 elements wide
    // (the extra column is presumably padding against bank conflicts).
    __local float2 tile[32 * (32 + 1)];

    // Stage 1: each in-range work-item copies one source element into the tile.
    const unsigned int src_col = get_global_id(0);
    const unsigned int src_row = get_global_id(1);
    if ((src_col < width) && (src_row < height))
    {
        const unsigned int src_offset = src_row * width + src_col;
        const int load_slot = get_local_id(1) * (32 + 1) + get_local_id(0);
        tile[load_slot] = datain[src_offset];
    }

    // Kept outside the conditionals so that every work-item of the group
    // reaches it; the tile is only read after all stores are done.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Stage 2: group ids are swapped to address the transposed tile, and the
    // local ids are swapped when reading the tile back.
    const unsigned int dst_col = get_group_id(1) * 32 + get_local_id(0);
    const unsigned int dst_row = get_group_id(0) * 32 + get_local_id(1);
    if ((dst_col < height) && (dst_row < width))
    {
        const unsigned int dst_offset = dst_row * height + dst_col;
        const int store_slot = get_local_id(0) * (32 + 1) + get_local_id(1);
        dataout[dst_offset] = tile[store_slot];
    }
}
================================
我正在努力提高图像 2D FFT 的性能。经过基准测试,我发现转置函数是拖慢程序的原因,所以用一个更优化的版本替换了它。
但在那之后,之前正常工作的所有函数都返回了错误码 CL_INVALID_KERNEL_NAME。
除了转置函数以及宿主代码中的 clSetKernelArg 调用之外,我没有改动其他任何东西,所以我毫无头绪。希望大家能帮帮我 :)
更新:下面是错误信息。请不要在意行号 :) 这些行在我看来很正常,到底哪里有问题呢?
错误:
expected a "]"
__local float2 block[BLOCK_DIM * (BLOCK_DIM + 1)] ;
^
line 110: error:
expected a ")"
__local float2 block[BLOCK_DIM * (BLOCK_DIM + 1)] ;
^
line 110: error: operand
of "*" must be a pointer
__local float2 block[BLOCK_DIM * (BLOCK_DIM + 1)] ;
error:
expected a ";"
int Idout = get_local_id(0)*(BLOCK_DIM+1)+get_local_id(1);
^
这是内核文件(新的版本):
// Tile edge length used by the transpose kernel below.
#define BLOCK_DIM 16
/*
 * Transposes a matrix of float2 elements by staging one tile per work-group
 * in local memory.
 *
 * dataout : destination buffer, written as dataout[row * height + col]
 * datain  : source buffer, read as datain[row * width + col]
 * width   : N (signal length)
 * height  : batch_size (number of signals in a batch)
 *
 * NOTE(review): the tile origin is derived from get_group_id(..) * BLOCK_DIM,
 * so the kernel presumably expects a BLOCK_DIM x BLOCK_DIM work-group --
 * confirm against host code.
 */
__kernel void transpose(
    __global float2* dataout,
    __global float2* datain,
    int width, int height)
{
    // Local staging tile; every tile row is BLOCK_DIM + 1 elements wide
    // (the extra column is presumably padding against bank conflicts).
    __local float2 tile[BLOCK_DIM * (BLOCK_DIM + 1)];

    // Position of this work-item inside its work-group.
    const size_t lx = get_local_id(0);
    const size_t ly = get_local_id(1);

    // Load phase: each in-range work-item stores one source element in the tile.
    const unsigned int in_x = get_global_id(0);
    const unsigned int in_y = get_global_id(1);
    if ((in_x < width) && (in_y < height))
    {
        const unsigned int in_pos = in_y * width + in_x;
        const int load_slot = ly * (BLOCK_DIM + 1) + lx;
        tile[load_slot] = datain[in_pos];
    }

    // Kept outside the conditionals so that every work-item of the group
    // reaches it; the tile is only read after all stores are done.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Store phase: group ids are swapped to address the transposed tile, and
    // the local ids are swapped when reading the tile back.
    const unsigned int out_x = get_group_id(1) * BLOCK_DIM + lx;
    const unsigned int out_y = get_group_id(0) * BLOCK_DIM + ly;
    if ((out_x < height) && (out_y < width))
    {
        const unsigned int out_pos = out_y * height + out_x;
        const int store_slot = lx * (BLOCK_DIM + 1) + ly;
        dataout[out_pos] = tile[store_slot];
    }
}