// Block-level expansion of an active state set (kernel body fragment).
//
// Phases, separated by block-wide barriers:
//   1. each thread loads one active state and reserves room for its outgoing
//      arcs in a shared-memory queue;
//   2. one queued arc per thread is propagated with atomicMax on the
//      destination entry;
//   3. destinations whose previous value was ILZERO are collected in shared
//      memory and appended to the global output set.
//
// Dynamic shared memory layout (4-byte words), as indexed below:
//   [0,       Q_LEN)    stateSet_s - newly activated destination states
//   [Q_LEN,   2*Q_LEN)  arcSet_s   - queued arc indices
//   [2*Q_LEN, 3*Q_LEN)  scores_s   - source cost stored next to each queued arc
// The launch therefore has to provide at least 3*Q_LEN words of dynamic
// shared memory.
//
// Every __syncthreads() below sits at the top level of the fragment so that
// all threads of the block reach it.
int tx = threadIdx.x;
extern __shared__ int smem[];
int *stateSet_s = smem;                       // new active set
int *arcSet_s = &(smem[Q_LEN]);               // local shared queue
float *scores_s = (float*)&(smem[2*Q_LEN]);   // cost of the source state per queued arc
__shared__ int arcCnt;    // number of queue slots requested by the block
__shared__ int stateCnt;  // number of entries written to stateSet_s
__shared__ int gPos;      // offset of this block's slice in activeSetOut_g

// One thread resets both counters; the barrier publishes the zeros before any
// thread issues an atomicAdd on them.
if ( tx == 0 )
{
    arcCnt = 0;
    stateCnt = 0;
}
__syncthreads();

// load state index from compacted list of state indexes
// NOTE(review): gtx is not defined in this fragment; presumably the global
// thread index computed before this point, and presumably already checked
// against the size of activeSetIn_g - confirm.
int stateId = activeSetIn_g[gtx];
float srcCost = scores_g[ stateId ];
int startId = outputArcStartIds_g[stateId];
int nArcs = outputArcCounts_g[stateId];   // number of outgoing arcs to be propagated (2-3 to thousands)

/// prepare arc set
// Reserve nArcs consecutive queue slots for this thread, then fill them with
// the arc indices startId+nArcs-1 down to startId. Slots at or beyond Q_LEN
// are not written, so arcs that do not fit into the queue are dropped here.
int myPos = atomicAdd ( &arcCnt, nArcs );
while ( ( nArcs > 0 ) && ( myPos < Q_LEN ) )
{
    scores_s[myPos] = srcCost;
    arcSet_s[myPos] = startId + nArcs - 1;
    // move on to the next slot and the next arc; the loop cannot terminate otherwise
    ++myPos;
    --nArcs;
}
// The queue is written by some threads and read by others.
__syncthreads();

/// parallel propagate arc set
// Only slots [0, min(arcCnt, Q_LEN)) were written above; restricting tx to
// that range avoids reading uninitialized shared memory.
// NOTE(review): a queued arc index of 0 is skipped by the "> 0" test - confirm
// that index 0 is never a real arc.
// NOTE(review): only slot tx is processed, so queue entries at or beyond
// blockDim.x are never propagated - confirm Q_LEN <= blockDim.x.
int queued = ( arcCnt < Q_LEN ) ? arcCnt : Q_LEN;
if ( ( tx < queued ) && ( arcSet_s[tx] > 0 ) )
{
    FstArc arc = arcs_g[ arcSet_s[tx] ];
    float srcCost_ = scores_s[tx];
    DoSomeJob ( &srcCost_ );
    int *dst = &(transitionData_g[arc.dst]);
    int old = atomicMax( dst, FloatToInt ( srcCost_ ) );
    //// new active set
    // Only a thread whose atomicMax returned ILZERO records the destination.
    if ( old == ILZERO )
    {
        int pos = atomicAdd ( &stateCnt, 1 );
        stateSet_s[ pos ] = arc.dst;
    }
}
// stateCnt and stateSet_s must be complete before they are read below.
__syncthreads();

/// transfer new active set from smem to gmem
// A single global atomic per block reserves the output range.
if ( tx == 0 )
    gPos = atomicAdd ( activeSetOutSz_g, stateCnt );
// gPos is written by thread 0 and read by every thread.
__syncthreads();

if ( tx < stateCnt )
    activeSetOut_g[gPos + tx] = stateSet_s[tx];
但它运行得非常慢——我的意思是,比完全不使用活动集(即活动集 = 所有状态)时还要慢,尽管活动集只占所有状态的 10% 至 15%。寄存器压力大幅上升,占用率很低,但我认为对此无能为力。
线程之间是否有更高效的工作分担方式?我考虑过计算能力 3.0 上的 warp-shuffle 操作,但我必须使用 2.x 设备。