我正在使用OpenCL
. 人脸检测算法基于 Viola Jones 算法。我试图制作 Cascade 分类步骤内核代码。我在级联阶段中设置classifier data
了级联阶段 1,local memory(__local)
因为分类器数据用于所有工作项。
但是,不使用本地内存(使用全局内存)的内核分析时间比使用本地内存更快。
编辑:
我上传了完整的代码。
带本地内存版本
__kernel void CASCADE(__read_only image2d_t input_image, __write_only image2d_t output_image,__constant float* classifierMem,__constant int* idxNumValStageArray, int numTotStage, __constant int* vecSkin){
int cascadeLocalSize = get_local_size(0);
__local float localS1F1[42];
int localIdx = get_local_id(1)*cascadeLocalSize + get_local_id(0);
if(localIdx<42)
{
int stage1Idx = localIdx + idxNumValStageArray[0]+4;
localS1F1[localIdx] = classifierMem[stage1Idx];
}
barrier(CLK_LOCAL_MEM_FENCE);
float resizeFactor = 1.0;
int2 im_dim = get_image_dim(input_image);
unsigned int srcWidth = im_dim.x*(float)resizeFactor;
unsigned int srcHeight = im_dim.y*(float)resizeFactor;
int gx = get_global_id(0);
int gy = get_global_id(1);
int skinX=0;
int skinY=0;
int coordi=vecSkin[512*gy+gx];
skinX = coordi%im_dim.x;
skinY = coordi/im_dim.x;
if( skinX >= 10 && skinY >= 10 )
{
skinX -= 10;
skinY -= 10;
}
int type = gx%3;
unsigned int windowWidth = classifierMem[0];
unsigned int windowHeight = classifierMem[1];
unsigned int stageIndex;
float stageThres;
float numFeatures;
unsigned int featureIndex;
float featureValue;
if(skinX<srcWidth-windowWidth-1 && skinY<srcHeight-windowHeight-1){
bool stagePass = true;
unsigned int index = 0;
for(unsigned int i=numTotStage; i>0;i--){
if(stagePass){
if(index == 0){
stageIndex = idxNumValStageArray[0];
stageThres = classifierMem[stageIndex+2];
numFeatures = classifierMem[stageIndex+3];
featureIndex = 0;
featureValue = 0.0;
}
else{
stageIndex = idxNumValStageArray[index];
stageThres = classifierMem[stageIndex+2];
numFeatures = classifierMem[stageIndex+3];
featureIndex = stageIndex+4;
featureValue = 0.0;
}
float featureThres;
float succVal;
float failVal;
unsigned int numRegions;
float regionValue;
if(type ==0 && index==0)
{
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
succVal=localS1F1[featureIndex++];
failVal=localS1F1[featureIndex++];
numRegions = localS1F1[featureIndex++];
regionValue =0.0;
float4 rectValue;
int4 regionP;
for(unsigned int k=numRegions; k>0;k--){
regionP.x=(int)(localS1F1[featureIndex])+skinX;
regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];
featureIndex+=5;
}
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}// end of if(stagePass)
}// end of for(unsigned int j=numFeatures; j>0;j--)
index++;
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
else if(type ==1 && index ==0)
{
featureIndex +=14;
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
if(j==1)
featureIndex -= 42;
featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
succVal=localS1F1[featureIndex++];
failVal=localS1F1[featureIndex++];
numRegions = localS1F1[featureIndex++];
regionValue =0.0;
float4 rectValue;
int4 regionP;
for(unsigned int k=numRegions; k>0;k--){
regionP.x=(int)(localS1F1[featureIndex])+skinX;
regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];
featureIndex+=5;
}
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}
}
index++;
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
else if(index == 0)
{
featureIndex +=28;
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
if(j==2) featureIndex -= 42;
featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
succVal=localS1F1[featureIndex++];
failVal=localS1F1[featureIndex++];
numRegions = localS1F1[featureIndex++];
regionValue =0.0;
float4 rectValue;
int4 regionP;
for(unsigned int k=numRegions; k>0;k--){
regionP.x=(int)(localS1F1[featureIndex])+skinX;
regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];
featureIndex+=5;
}// end of for(unsigned int k=numRegions; k>0;k--)
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}// end of if(stagePass)
}//end of for(unsigned int j=numFeatures; j>0;j--)
index++;
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
//stage
else{
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
featureThres=classifierMem[featureIndex++]*(windowWidth*windowHeight);
succVal=classifierMem[featureIndex++];
failVal=classifierMem[featureIndex++];
numRegions = classifierMem[featureIndex++];
regionValue =0.0;
float4 rectValue;
int4 regionP;
for(unsigned int k=numRegions; k>0;k--){
regionP.x=(int)(classifierMem[featureIndex])+skinX;
regionP.y=(int)(classifierMem[featureIndex+1])+skinY;
regionP.z=(int)(classifierMem[featureIndex+2])+regionP.x;
regionP.w=(int)(classifierMem[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*classifierMem[featureIndex+4];
featureIndex+=5;
}
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}
}
index++;
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
}
}
}else return;
}
原始版本(没有本地内存)
__kernel void CASCADE(__read_only image2d_t input_image, __write_only image2d_t output_image,__constant float* classifierMem,__constant int* idxNumValStageArray, int numTotStage, __constant int* vecSkin){
float resizeFactor = 1.0;
int2 im_dim = get_image_dim(input_image);
unsigned int srcWidth = im_dim.x*(float)resizeFactor;
unsigned int srcHeight = im_dim.y*(float)resizeFactor;
int gx = get_global_id(0);
int gy = get_global_id(1);
int skinX=0;
int skinY=0;
int coordi=vecSkin[512*gy+gx];
skinX = coordi%im_dim.x;
skinY = coordi/im_dim.x;
if( skinX >= 10 && skinY >= 10 )
{
skinX -= 10;
skinY -= 10;
}
unsigned int windowWidth = classifierMem[0];
unsigned int windowHeight = classifierMem[1];
if(gx<srcWidth-windowWidth-1 && gy<srcHeight-windowHeight-1){
bool stagePass = true;
unsigned int index = 0;
for(unsigned int i=numTotStage; i>0;i--){
if(stagePass){
unsigned int stageIndex = idxNumValStageArray[index++];
float stageThres = classifierMem[stageIndex+2];
float numFeatures = classifierMem[stageIndex+3];
unsigned int featureIndex = stageIndex+4;
float featureValue = 0.0;
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
float featureThres=classifierMem[featureIndex++]*(windowWidth*windowHeight);
float succVal=classifierMem[featureIndex++];
float failVal=classifierMem[featureIndex++];
unsigned int numRegions = classifierMem[featureIndex++];
float regionValue =0.0;
for(unsigned int k=numRegions; k>0;k--){
float4 rectValue;
int4 regionP;
regionP.x=(int)(classifierMem[featureIndex])+skinX;
regionP.y=(int)(classifierMem[featureIndex+1])+skinY;
regionP.z=(int)(classifierMem[featureIndex+2])+regionP.x;
regionP.w=(int)(classifierMem[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*classifierMem[featureIndex+4];
featureIndex+=5;
}
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}
}
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
}
}else return;
}
分析时间:原始版本(不带本地内存):24ms 修改版本(带本地内存):28ms
编辑:实际上 localWorkSize NULL 因为 globalWorkSize 总是因放置 NDRangeKernel 的向量大小而异。当放置特定的localWorkSize时,人脸检测率下降......所以我尝试将localWorkSize NUll,然后人脸检测率很好。所以我想知道原因。
这是主机代码:
//localWorkSize[0] = 16;
//localWorkSize[1] = 12;
numThreadsX=512;
globalWorkSize[0] = numThreadsX;
globalWorkSize[1] = vecCoordinate.size()% numThreadsX == 0 ? vecCoordinate.size()/ numThreadsX :(vecCoordinate.size()/ numThreadsX) + 1;
errNum = clEnqueueWriteBuffer(commandQueue,classifierMem,CL_TRUE,0,sizeof(float)*cntValArray,stageValArray,0,NULL,NULL);
errNum |= clEnqueueWriteBuffer(commandQueue,idxStageMem,CL_TRUE,0,sizeof(int)*haar.numStages,idxNumValStageArray,0,NULL,NULL);
errNum |= clSetKernelArg(kHaar_Cascade, 0, sizeof(memObjBuffer_Haar22), &memObjBuffer_Haar22);
errNum |= clSetKernelArg(kHaar_Cascade, 1, sizeof(memObjBuffer22), &memObjBuffer22);
errNum |= clSetKernelArg(kHaar_Cascade, 2, sizeof(cl_mem), &classifierMem);
errNum |= clSetKernelArg(kHaar_Cascade, 3, sizeof(cl_mem), &idxStageMem);
errNum |= clSetKernelArg(kHaar_Cascade, 4, sizeof(cl_int), &haar.numStages);
errNum |= clSetKernelArg(kHaar_Cascade, 5, sizeof(cl_mem), &memVecCoordi);
errNum = clEnqueueNDRangeKernel(commandQueue, kHaar_Cascade, 2, NULL,globalWorkSize, NULL,0, NULL, &event[3]);