我使用两个 1D 高斯滤波器(高斯可分性)在 OpenCL 上实现了 2D 高斯滤波器。
我实现了我的卷积的 2 个版本: - 第一个使用一个内核,在行上应用一维滤波器,然后转置图像(大约需要 20 毫秒)
- 第二个使用两个内核:一个应用卷积,一个转置图像(大约需要 7ms - 卷积和大约 1ms - 转置图像)
我评估了两种实现的计算时间,我发现使用两个内核的实现比使用一个内核的实现要快(注意转置内核必须等待卷积内核)。
你能帮我弄清楚为什么使用一个内核的实现会更慢,即使一个内核的设置时间应该比两个内核的设置时间快。
在下面找到这两种实现的 OpenCL 源代码:
一个内核实现
__kernel void ConvolutionKernel(__read_only image2d_t srcBuffer,__constant int4 *par, __constant float *filter,__local float4 *cache, __local float4 *temp,__write_only image2d_t dstBuffer) { int width = par[0].x; int height = par[0].y; int widthG = par[0].z; int heightG = par[0].w; int gaussFilterWidth = par[1].x; int margin = 1 + (gaussFilterWidth>>2); int2 location; float4 focus,set; float *s = &set; int m,start; int widthGM = widthG + (margin<<1); const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST; int i = get_global_id(0); int j = get_global_id(1); int iG = get_local_id(0); int jG = get_local_id(1); location.x = i; location.y = j; focus = read_imagef(srcBuffer, sampler, location); cache[(iG+margin) + jG*widthGM] = focus; if(iG == 0) { if(i == 0) { cache[0 + jG*widthGM].x = focus.x; cache[0 + jG*widthGM].y = focus.x; cache[0 + jG*widthGM].z = focus.x; cache[0 + jG*widthGM].w = focus.x; cache[1 + jG*widthGM].x = focus.x; cache[1 + jG*widthGM].y = focus.x; cache[1 + jG*widthGM].z = focus.x; cache[1 + jG*widthGM].w = focus.x; } else { location.x = i-margin; location.y = j; cache[jG*widthGM] = read_imagef(srcBuffer, sampler, location); location.x = i-(margin-1); location.y = j; cache[1 + jG*widthGM] = read_imagef(srcBuffer, sampler, location); } } if(iG == ((widthGM-1)-(margin<<1))) { if(i == (width-1)) { cache[(widthGM-2) + jG*widthGM].x = focus.w; cache[(widthGM-2) + jG*widthGM].y = focus.w; cache[(widthGM-2) + jG*widthGM].z = focus.w; cache[(widthGM-2) + jG*widthGM].w = focus.w; cache[(widthGM-1) + jG*widthGM].x = focus.w; cache[(widthGM-1) + jG*widthGM].y = focus.w; cache[(widthGM-1) + jG*widthGM].z = focus.w; cache[(widthGM-1) + jG*widthGM].w = focus.w; } else { location.x = i+margin; location.y = j; cache[(widthGM-1) + jG*widthGM] = read_imagef(srcBuffer, sampler, location); location.x = i+(margin-1); location.y = j; cache[(widthGM-2) + jG*widthGM] = read_imagef(srcBuffer, sampler, location); } } barrier(CLK_LOCAL_MEM_FENCE); float4 bar[10],barX[10],barY[10],barZ[10],barW[10]; float4 *p = &bar, *pX = &barX, *pY = &barY, *pZ = &barZ, *pW = &barW; float *f = &bar, *fX = &barX, *fY = &barY, *fZ = &barZ, *fW = &barW; float4 gauss[4]; float *gf = &gauss; float4 acc; gf[0] = filter[0]; gf[1] = filter[1]; gf[2] = filter[2]; gf[3] = filter[3]; gf[4] = filter[4]; gf[5] = filter[5]; gf[6] = filter[6]; gf[7] = filter[7]; gf[8] = filter[8]; gf[9] = filter[9]; gf[10]= filter[10]; gf[11]= filter[11]; gf[12]= filter[12]; gf[13]= filter[13]; gf[14]= filter[14]; gf[15]= 0.0f; start = iG + jG*widthGM; fX[0] = cache[start+0].y; fX[1] = fY[0] = cache[start+0].z; fX[2] = fY[1] = fZ[0] = cache[start+0].w; fX[3] = fY[2] = fZ[1] = fW[0] = cache[start+1].x; fX[4] = fY[3] = fZ[2] = fW[1] = cache[start+1].y; fX[5] = fY[4] = fZ[3] = fW[2] = cache[start+1].z; fX[6] = fY[5] = fZ[4] = fW[3] = cache[start+1].w; fX[7] = fY[6] = fZ[5] = fW[4] = cache[start+2].x; fX[8] = fY[7] = fZ[6] = fW[5] = cache[start+2].y; fX[9] = fY[8] = fZ[7] = fW[6] = cache[start+2].z; fX[10]= fY[9] = fZ[8] = fW[7] = cache[start+2].w; fX[11]= fY[10]= fZ[9] = fW[8] = cache[start+3].x; fX[12]= fY[11]= fZ[10]= fW[9] = cache[start+3].y; fX[13]= fY[12]= fZ[11]= fW[10]= cache[start+3].z; fX[14]= fY[13]= fZ[12]= fW[11]= cache[start+3].w; fX[15]= fY[14]= fZ[13]= fW[12]= cache[start+4].x; fX[15]= fY[15]= fZ[14]= fW[13]= cache[start+4].y; fX[15]= fY[15]= fZ[15]= fW[14]= cache[start+4].z; fX[15]= fY[15]= fZ[15]= fW[15]= 0.0f; acc.x = fX[0]*gf[0] + fX[1]*gf[1] + fX[2]*gf[2] + fX[3]*gf[3] + fX[4]*gf[4] + fX[5]*gf[5] + fX[6]*gf[6] + fX[7]*gf[7] + fX[8]*gf[8] + fX[9]*gf[9] + fX[10]*gf[10] + fX[11]*gf[11] + fX[12]*gf[12] + fX[13]*gf[13] + fX[14]*gf[14] + fX[15]*gf[15]; acc.y = fY[0]*gf[0] + fY[1]*gf[1] + fY[2]*gf[2] + fY[3]*gf[3] + fY[4]*gf[4] + fY[5]*gf[5] + fY[6]*gf[6] + fY[7]*gf[7] + fY[8]*gf[8] + fY[9]*gf[9] + fY[10]*gf[10] + fY[11]*gf[11] + fY[12]*gf[12] + fY[13]*gf[13] + fY[14]*gf[14] + fY[15]*gf[15]; acc.z = fZ[0]*gf[0] + fZ[1]*gf[1] + fZ[2]*gf[2] + fZ[3]*gf[3] + fZ[4]*gf[4] + fZ[5]*gf[5] + fZ[6]*gf[6] + fZ[7]*gf[7] + fZ[8]*gf[8] + fZ[9]*gf[9] + fZ[10]*gf[10] + fZ[11]*gf[11] + fZ[12]*gf[12] + fZ[13]*gf[13] + fZ[14]*gf[14] + fZ[15]*gf[15]; acc.w = fW[0]*gf[0] + fW[1]*gf[1] + fW[2]*gf[2] + fW[3]*gf[3] + fW[4]*gf[4] + fW[5]*gf[5] + fW[6]*gf[6] + fW[7]*gf[7] + fW[8]*gf[8] + fW[9]*gf[9] + fW[10]*gf[10] + fW[11]*gf[11] + fW[12]*gf[12] + fW[13]*gf[13] + fW[14]*gf[14] + fW[15]*gf[15]; temp[iG + jG*widthG] = acc; barrier(CLK_LOCAL_MEM_FENCE); int I,S; I = j >> 2; S = j & 3; if(S == 0) { set.x = temp[iG + (jG+0)*widthG].x; set.y = temp[iG + (jG+1)*widthG].x; set.z = temp[iG + (jG+2)*widthG].x; set.w = temp[iG + (jG+3)*widthG].x; } if(S == 1) { set.x = temp[iG + (jG-1)*widthG].y; set.y = temp[iG + (jG+0)*widthG].y; set.z = temp[iG + (jG+1)*widthG].y; set.w = temp[iG + (jG+2)*widthG].y; } if(S == 2) { set.x = temp[iG + (jG-2)*widthG].z; set.y = temp[iG + (jG-1)*widthG].z; set.z = temp[iG + (jG+0)*widthG].z; set.w = temp[iG + (jG+1)*widthG].z; } if(S == 3) { set.x = temp[iG + (jG-3)*widthG].w; set.y = temp[iG + (jG-2)*widthG].w; set.z = temp[iG + (jG-1)*widthG].w; set.w = temp[iG + (jG+0)*widthG].w; } location.x = I; location.y = (i*4 + S); write_imagef(dstBuffer, location , set); }
两个内核实现
卷积:
__kernel void ConvolutionKernel(__read_only image2d_t srcBuffer, __constant int4 *par, __constant float *filter, __local float4 *cache, __local float4 *temp, __write_only image2d_t dstBuffer) { int width = par[0].x; int height = par[0].y; int widthG = par[0].z; int heightG = par[0].w; int gaussFilterWidth = par[1].x; int margin = 1 + (gaussFilterWidth>>2); int2 location; float4 focus,set; float *s = &set; int m,start; int widthGM = widthG + (margin<<1); const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST; int i = get_global_id(0); int j = get_global_id(1); int iG = get_local_id(0); int jG = get_local_id(1); location.x = i; location.y = j; focus = read_imagef(srcBuffer, sampler, location); cache[(iG+margin) + jG*widthGM] = focus; if(iG == 0) { if(i == 0) { cache[0 + jG*widthGM].x = focus.x; cache[0 + jG*widthGM].y = focus.x; cache[0 + jG*widthGM].z = focus.x; cache[0 + jG*widthGM].w = focus.x; cache[1 + jG*widthGM].x = focus.x; cache[1 + jG*widthGM].y = focus.x; cache[1 + jG*widthGM].z = focus.x; cache[1 + jG*widthGM].w = focus.x; } else { location.x = i-margin; location.y = j; cache[jG*widthGM] = read_imagef(srcBuffer, sampler, location); location.x = i-(margin-1); location.y = j; cache[1 + jG*widthGM] = read_imagef(srcBuffer, sampler, location); } } if(iG == ((widthGM-1)-(margin<<1))) { if(i == (width-1)) { cache[(widthGM-2) + jG*widthGM].x = focus.w; cache[(widthGM-2) + jG*widthGM].y = focus.w; cache[(widthGM-2) + jG*widthGM].z = focus.w; cache[(widthGM-2) + jG*widthGM].w = focus.w; cache[(widthGM-1) + jG*widthGM].x = focus.w; cache[(widthGM-1) + jG*widthGM].y = focus.w; cache[(widthGM-1) + jG*widthGM].z = focus.w; cache[(widthGM-1) + jG*widthGM].w = focus.w; } else { location.x = i+margin; location.y = j; cache[(widthGM-1) + jG*widthGM] = read_imagef(srcBuffer, sampler, location); location.x = i+(margin-1); location.y = j; cache[(widthGM-2) + jG*widthGM] = read_imagef(srcBuffer, sampler, location); } } barrier(CLK_LOCAL_MEM_FENCE); float4 bar[10],barX[10],barY[10],barZ[10],barW[10]; float4 *p = &bar, *pX = &barX, *pY = &barY, *pZ = &barZ, *pW = &barW; float *f = &bar, *fX = &barX, *fY = &barY, *fZ = &barZ, *fW = &barW; float4 gauss[4]; float *gf = &gauss; float4 acc; gf[0] = filter[0]; gf[1] = filter[1]; gf[2] = filter[2]; gf[3] = filter[3]; gf[4] = filter[4]; gf[5] = filter[5]; gf[6] = filter[6]; gf[7] = filter[7]; gf[8] = filter[8]; gf[9] = filter[9]; gf[10]= filter[10]; gf[11]= filter[11]; gf[12]= filter[12]; gf[13]= filter[13]; gf[14]= filter[14]; gf[15]= 0.0f; start = iG + jG*widthGM; fX[0] = cache[start+0].y; fX[1] = fY[0] = cache[start+0].z; fX[2] = fY[1] = fZ[0] = cache[start+0].w; fX[3] = fY[2] = fZ[1] = fW[0] = cache[start+1].x; fX[4] = fY[3] = fZ[2] = fW[1] = cache[start+1].y; fX[5] = fY[4] = fZ[3] = fW[2] = cache[start+1].z; fX[6] = fY[5] = fZ[4] = fW[3] = cache[start+1].w; fX[7] = fY[6] = fZ[5] = fW[4] = cache[start+2].x; fX[8] = fY[7] = fZ[6] = fW[5] = cache[start+2].y; fX[9] = fY[8] = fZ[7] = fW[6] = cache[start+2].z; fX[10]= fY[9] = fZ[8] = fW[7] = cache[start+2].w; fX[11]= fY[10]= fZ[9] = fW[8] = cache[start+3].x; fX[12]= fY[11]= fZ[10]= fW[9] = cache[start+3].y; fX[13]= fY[12]= fZ[11]= fW[10]= cache[start+3].z; fX[14]= fY[13]= fZ[12]= fW[11]= cache[start+3].w; fX[15]= fY[14]= fZ[13]= fW[12]= cache[start+4].x; fX[15]= fY[15]= fZ[14]= fW[13]= cache[start+4].y; fX[15]= fY[15]= fZ[15]= fW[14]= cache[start+4].z; fX[15]= fY[15]= fZ[15]= fW[15]= 0.0f; acc.x = fX[0]*gf[0] + fX[1]*gf[1] + fX[2]*gf[2] + fX[3]*gf[3] + fX[4]*gf[4] + fX[5]*gf[5] + fX[6]*gf[6] + fX[7]*gf[7] + fX[8]*gf[8] + fX[9]*gf[9] + fX[10]*gf[10] + fX[11]*gf[11] + fX[12]*gf[12] + fX[13]*gf[13] + fX[14]*gf[14] + fX[15]*gf[15]; acc.y = fY[0]*gf[0] + fY[1]*gf[1] + fY[2]*gf[2] + fY[3]*gf[3] + fY[4]*gf[4] + fY[5]*gf[5] + fY[6]*gf[6] + fY[7]*gf[7] + fY[8]*gf[8] + fY[9]*gf[9] + fY[10]*gf[10] + fY[11]*gf[11] + fY[12]*gf[12] + fY[13]*gf[13] + fY[14]*gf[14] + fY[15]*gf[15]; acc.z = fZ[0]*gf[0] + fZ[1]*gf[1] + fZ[2]*gf[2] + fZ[3]*gf[3] + fZ[4]*gf[4] + fZ[5]*gf[5] + fZ[6]*gf[6] + fZ[7]*gf[7] + fZ[8]*gf[8] + fZ[9]*gf[9] + fZ[10]*gf[10] + fZ[11]*gf[11] + fZ[12]*gf[12] + fZ[13]*gf[13] + fZ[14]*gf[14] + fZ[15]*gf[15]; acc.w = fW[0]*gf[0] + fW[1]*gf[1] + fW[2]*gf[2] + fW[3]*gf[3] + fW[4]*gf[4] + fW[5]*gf[5] + fW[6]*gf[6] + fW[7]*gf[7] + fW[8]*gf[8] + fW[9]*gf[9] + fW[10]*gf[10] + fW[11]*gf[11] + fW[12]*gf[12] + fW[13]*gf[13] + fW[14]*gf[14] + fW[15]*gf[15]; location.x = i; location.y = j; write_imagef(dstBuffer, location , acc); }
转置:
__kernel void TransponseKernel(__read_only image2d_t srcBuffer, __constant int4 *par, __local float4 *temp, __write_only image2d_t dstBuffer) { int width = par[0].x; int height = par[0].y; int widthG = par[0].z; int heightG = par[0].w; int gaussFilterWidth = par[1].x; int margin = 1 + (gaussFilterWidth>>2); int widthGM = widthG + (margin<<1); int2 location; float4 focus,set; const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST; int i = get_global_id(0); int j = get_global_id(1); int iG = get_local_id(0); int jG = get_local_id(1); location.x = i; location.y = j; focus = read_imagef(srcBuffer, sampler, location); temp[iG + jG*widthG] = focus; barrier(CLK_LOCAL_MEM_FENCE); int I,S; I = j >> 2; S = j & 3; if(S == 0) { set.x = temp[iG + (jG+0)*widthG].x; set.y = temp[iG + (jG+1)*widthG].x; set.z = temp[iG + (jG+2)*widthG].x; set.w = temp[iG + (jG+3)*widthG].x; } if(S == 1) { set.x = temp[iG + (jG-1)*widthG].y; set.y = temp[iG + (jG+0)*widthG].y; set.z = temp[iG + (jG+1)*widthG].y; set.w = temp[iG + (jG+2)*widthG].y; } if(S == 2) { set.x = temp[iG + (jG-2)*widthG].z; set.y = temp[iG + (jG-1)*widthG].z; set.z = temp[iG + (jG+0)*widthG].z; set.w = temp[iG + (jG+1)*widthG].z; } if(S == 3) { set.x = temp[iG + (jG-3)*widthG].w; set.y = temp[iG + (jG-2)*widthG].w; set.z = temp[iG + (jG-1)*widthG].w; set.w = temp[iG + (jG+0)*widthG].w; } location.x = I; location.y = (i*4 + S); write_imagef(dstBuffer, location , set); }