I am trying to "convolve" a featWidth * featHeight * 31 cube with another modelWidth * modelHeight * 31 cube. The problem is that this kernel is quite slow (well, I do manage to beat the sequential CPU code, but it is only as fast as the OpenMP version). I am using a Quadro FX 1800 (yes, only 64 CUDA cores...).
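For reference, the operation is essentially a "valid" cross-correlation summed over the 31 channels. A minimal sequential sketch of what I mean is below, assuming column-major storage (the height dimension contiguous) and featHeight == scoreHeight + modelHeight - 1; this is not my exact CPU code:

void matchCPU(float *score, const float *feat, const float *model,
              int featWidth, int featHeight,
              int modelWidth, int modelHeight,
              int scoreWidth, int scoreHeight)
{
    for(int x = 0; x < scoreWidth; ++x)
    {
        for(int y = 0; y < scoreHeight; ++y)
        {
            float value = 0.f;
            // Sum over the 31 channels, each correlated with a modelWidth x modelHeight window.
            for(int z = 0; z < 31; ++z)
                for(int xx = 0; xx < modelWidth; ++xx)
                    for(int yy = 0; yy < modelHeight; ++yy)
                        value += model[(z * modelWidth + xx) * modelHeight + yy]
                               * feat[(z * featWidth + (x + xx)) * featHeight + (y + yy)];
            score[x * scoreHeight + y] = value;
        }
    }
}

The kernel below is meant to compute the same score matrix.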
// Feature cube read through a 1D texture; texFeatures is bound to the device buffer on the host side.
texture<float, 1, cudaReadModeElementType> texFeatures;

__constant__ float d_model[31*22*22];

#define IMUL(a,b) ( __mul24((a), (b)) )
#define IMAD(a,b,c) ( __mul24((a), (b)) + (c) )

__global__ void dMatch(float *score, const int featWidth, const int featHeight, const int modelWidth, const int modelHeight, const int scoreWidth, const int scoreHeight)
{
    const int x = IMAD(blockIdx.x, blockDim.x, threadIdx.x);
    const int y = IMAD(blockIdx.y, blockDim.y, threadIdx.y);
    if(x < scoreWidth && y < scoreHeight)
    {
        const int scoreIdx = IMAD(x, scoreHeight, y);
        score[scoreIdx] = 0.f;
        const int baseFeatIdx = IMUL(x, scoreHeight) + IMAD(modelHeight-1, x, y);
        for(int z = 0; z < 31; ++z)
        {
            // Index positioning
            int featIdx = IMAD(z, IMUL(featWidth, featHeight), baseFeatIdx);
            int modelIdx = IMUL(z, IMUL(modelWidth, modelHeight));
            float value = 0.f;
            // filter
            for(int xx = 0; xx < modelWidth; xx++)
            {
                const int xxmodelIdx = IMAD(xx, modelHeight, modelIdx);
                const int xxfeatIdx = IMAD(xx, featHeight, featIdx);
                for(int yy = 0; yy < modelHeight; yy++)
                {
                    value += d_model[xxmodelIdx+yy] * tex1Dfetch(texFeatures, xxfeatIdx+yy);
                }
            }
            score[scoreIdx] += value;
        }
    }
}
Anyway, I launch this kernel with 8*8 threads per block and a grid of size (scoreWidth/8)*(scoreHeight/8) (scoreWidth and scoreHeight are the dimensions of the resulting matrix). I would like to know whether you can spot what is wrong with my code, or what makes it slow.
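For completeness, this is roughly how I upload the data and launch it; the host-side names (hFeat, hModel, dFeat, dScore, launchMatch) are only for illustration:

void launchMatch(const float *hFeat, const float *hModel, float *dScore,
                 int featWidth, int featHeight,
                 int modelWidth, int modelHeight,
                 int scoreWidth, int scoreHeight)
{
    // Copy the model into the __constant__ array d_model (it must fit in 31*22*22 floats).
    cudaMemcpyToSymbol(d_model, hModel, sizeof(float) * 31 * modelWidth * modelHeight);

    // Upload the feature cube and bind it to the 1D texture read with tex1Dfetch.
    float *dFeat;
    const size_t featBytes = sizeof(float) * 31 * featWidth * featHeight;
    cudaMalloc((void**)&dFeat, featBytes);
    cudaMemcpy(dFeat, hFeat, featBytes, cudaMemcpyHostToDevice);
    cudaBindTexture(0, texFeatures, dFeat, featBytes);

    // 8*8 threads per block, one thread per output element
    // (assumes scoreWidth and scoreHeight are multiples of 8).
    dim3 threads(8, 8);
    dim3 grid(scoreWidth / 8, scoreHeight / 8);
    dMatch<<<grid, threads>>>(dScore, featWidth, featHeight,
                              modelWidth, modelHeight, scoreWidth, scoreHeight);
}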
EDIT:
A faster version (the 480 ms run is down to 150 ms!), thanks to tera:
__global__ void dMatch(float *score, const int featWidth, const int featHeight, const int modelWidth, const int modelHeight, const int scoreWidth, const int scoreHeight)
{
    // Each thread now computes 4 consecutive outputs along y,
    // reusing texture reads through a rolling window (feat..feat3).
    const int y = IMUL(4, IMAD(blockIdx.x, blockDim.x, threadIdx.x));
    const int x = IMAD(blockIdx.y, blockDim.y, threadIdx.y);
    if(x < scoreWidth && y < scoreHeight)
    {
        const int scoreIdx = IMAD(x, scoreHeight, y);
        const int baseFeatIdx = IMUL(x, scoreHeight) + IMAD(modelHeight-1, x, y);
        float value = 0.f, value1 = 0.f, value2 = 0.f, value3 = 0.f;
        float feat, feat1, feat2, feat3;
        // Index positioning
        int featIdx = 0;
        int modelIdx = 0;
        int xxmodelIdx;
        int xxfeatIdx;
        float val;
        for(int z = 0; z < 31; ++z)
        {
            featIdx = IMAD(z, IMUL(featWidth, featHeight), baseFeatIdx);
            modelIdx = IMUL(z, IMUL(modelWidth, modelHeight));
            // filter
            for(int xx = 0; xx < modelWidth; xx++)
            {
                xxmodelIdx = IMAD(xx, modelHeight, modelIdx);
                xxfeatIdx = IMAD(xx, featHeight, featIdx);
                feat  = tex1Dfetch(texFeatures, xxfeatIdx+0);
                feat1 = tex1Dfetch(texFeatures, xxfeatIdx+1);
                feat2 = tex1Dfetch(texFeatures, xxfeatIdx+2);
                feat3 = tex1Dfetch(texFeatures, xxfeatIdx+3);
                for(int yy = 0; yy < modelHeight; yy++)
                {
                    val = d_model[xxmodelIdx+yy];
                    value  += val * feat;
                    value1 += val * feat1;
                    value2 += val * feat2;
                    value3 += val * feat3;
                    // Shift the window and prefetch the next feature value.
                    feat = feat1;
                    feat1 = feat2;
                    feat2 = feat3;
                    feat3 = tex1Dfetch(texFeatures, xxfeatIdx+yy+4);
                }
            }
        }
        score[scoreIdx] = value;
        if(y+1 < scoreHeight)
            score[scoreIdx+1] = value1;
        if(y+2 < scoreHeight)
            score[scoreIdx+2] = value2;
        if(y+3 < scoreHeight)
            score[scoreIdx+3] = value3;
    }
}
It is launched with dim3 threads(16,16); dim3 grid(divup(scoreHeight,64), divup(scoreWidth,16));.
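divup is just the usual integer ceiling-division helper; for clarity, a sketch of the full launch (launchMatchUnrolled is only an illustrative wrapper name):

static int divup(int a, int b) { return (a + b - 1) / b; }

void launchMatchUnrolled(float *dScore, int featWidth, int featHeight,
                         int modelWidth, int modelHeight,
                         int scoreWidth, int scoreHeight)
{
    // blockDim.x now walks the y dimension (4 outputs per thread, hence /64),
    // blockDim.y walks the x dimension.
    dim3 threads(16, 16);
    dim3 grid(divup(scoreHeight, 64), divup(scoreWidth, 16));
    dMatch<<<grid, threads>>>(dScore, featWidth, featHeight,
                              modelWidth, modelHeight, scoreWidth, scoreHeight);
}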