我将首先进行一些上下文化。我正在尝试使用 CUDA 中的双端队列实现非阻塞工作窃取方法。双端队列 (aDeques) 位于全局内存中的块分段数组中,并且 popWork() 设备函数的目标是弹出工作以提供线程。除了全局双端队列之外,每个块在共享内存(aLocalStack)中都有一个堆栈,它可以在本地工作。流行音乐发生在 3 个级别。第一次尝试在共享堆栈中,第二次尝试在块拥有的双端队列中,第三次尝试是窃取其他双端队列。每个双端队列都有全局底部和弹出指针,它们位于全局内存数组(aiDequesBottoms 和 auiDequesAges)中。我的问题是,当一个块更改全局双端队列指针时,当我在 GTS450 中测试代码时,其他块看不到更改。好像缓存没有被更新。我也在GT520卡上测试过,没有出现这个问题。我在使用 aiDequeFlags 数组时遇到过类似的问题。通过将其声明为 volatile 可以解决这些问题。不幸的是,我不能对双端队列指针数组做同样的事情,因为我以后需要对它们使用原子函数。很抱歉没有把问题放在一个更简单的例子中,但我无法重现这种行为。第一个片段解释了 popWork() 接口。很抱歉没有把问题放在一个更简单的例子中,但我无法重现这种行为。第一个片段解释了 popWork() 接口。很抱歉没有把问题放在一个更简单的例子中,但我无法重现这种行为。第一个片段解释了 popWork() 接口。
template <int iDequeSize> //Size of each segment in aDeques
bool __inline__ __device__ popWork(
volatile int *aiDequeFlags , //Flags that indicates if a deque is active (has work)
int *aiDequesBottoms , //Deque bottom pointers
unsigned int *auiDequesAges , //Deque top pointers (29 higher bits) +
//Tag bits(3 lower bits).
const Int2Array *aDeques , //Deques (Int2Array is an interface for 2 int arrays)
int &uiStackBot , //Shared memory stack pointer
int2 *aLocalStack , //Shared memory local stack
const int &iTid , //threadIdx.x
const int &iBid , //blockIdx.x
//All other parameters are output
unsigned int &uiPopDequeIdx , //Choosen deque for pop
int2 *popStartIdxAndSize , //Arrays of pop start index and sizes
bool *bPopFlag , //Array of flags for pop in each level
unsigned int &uiActiveDequesIdx , //Flag to indicate pop failed (no more work)
int2 &work //Actual acquired thread work)
第二个片段具有完整的功能。使用该函数的内核以 8 个块、64 个线程启动,一开始只有 deque 0 有 1 个工作,而所有其他 deque 都是空的。有一些调试 printf 调用来生成日志,这将在下一个片段中显示。
template <int iDequeSize>
bool __inline__ __device__ popWork(volatile int *aiDequeFlags , int *aiDequesBottoms , unsigned int *auiDequesAges ,
const Int2Array *aDeques , int &uiStackBot , int2 *aLocalStack , const int &iTid , const int &iBid ,
unsigned int &uiPopDequeIdx , int2 *popStartIdxAndSize , bool *bPopFlag , unsigned int &uiActiveDequesIdx , int2 &work)
{
//Pop from local stack
if(iTid == 0)
{
unsigned int uiAge = 0;
bPopFlag[0] = popBottom(uiStackBot , uiAge , popStartIdxAndSize[iBid]);
bPopFlag[3] = bPopFlag[0];
}
__syncthreads();
if(bPopFlag[0])
{
if(iTid < popStartIdxAndSize[iBid].y)
{
work = aLocalStack[popStartIdxAndSize[iBid].x + iTid];
}
}
else
{
if(iTid == 0)
{ //Try to pop from block deque
bPopFlag[1] = popBottom(aiDequesBottoms[iBid] , auiDequesAges[iBid] , popStartIdxAndSize[iBid]);
if(bPopFlag[1])
{
uiPopDequeIdx = iBid;
//Debug
if(iBid == 0)
{
printf("Block %d pop global deque. Bottom=%d\n" , iBid , aiDequesBottoms[iBid]);
}
//
}
else
{
aiDequeFlags[iBid] = 0;
popStartIdxAndSize[iBid].x = INFTY;
uiPopDequeIdx = INFTY;
}
bPopFlag[3] = bPopFlag[1];
bPopFlag[2] = false;
}
__syncthreads();
if(!bPopFlag[1])
{
//Verify if lazy steal can be done.
if(iTid < NDEQUES)
{
if(popStartIdxAndSize[iTid].x != INFTY && iTid != iBid)
{
atomicMin(&uiPopDequeIdx , iTid);
bPopFlag[2] = true;
bPopFlag[3] = true;
}
}
__syncthreads();
if(iTid == uiPopDequeIdx)
{
popStartIdxAndSize[iBid] = popStartIdxAndSize[iTid];
popStartIdxAndSize[iTid].x = INFTY;
}
while(!bPopFlag[3])
{ //No more work, try to steal some!
__syncthreads();
if(iTid == 0)
{
uiActiveDequesIdx = 0;
}
__syncthreads();
if(iTid < NDEQUES)
{
if(aiDequeFlags[iTid] == 1)
{
uiActiveDequesIdx = 1;
//Debug
printf("Block %d steal attempt on block %d. Victim bottom=%d\n" , blockIdx.x , threadIdx.x , aiDequesBottoms[iTid]);
//
if(popTop(aiDequesBottoms , auiDequesAges , iTid , popStartIdxAndSize[iTid]))
{
aiDequeFlags[iBid] = 1;
atomicMin(&uiPopDequeIdx , iTid);
bPopFlag[3] = true;
//Debug
//printf("%d ss %d %d %d\n" , iBid , iTid , popStartIdxAndSize[iTid].x , popStartIdxAndSize[iTid].y);
//
}
}
}
__syncthreads();
if(uiActiveDequesIdx == 0)
{ //No more work to steal. End.
break;
}
if(iTid == uiPopDequeIdx)
{
popStartIdxAndSize[iBid] = popStartIdxAndSize[iTid];
popStartIdxAndSize[iTid].x = INFTY;
}
__syncthreads();
}
}
__syncthreads();
if(bPopFlag[3] && iTid < popStartIdxAndSize[iBid].y) //assuming number of threads >= WORK_SIZE
{
aDeques->getElement(work , uiPopDequeIdx*iDequeSize + popStartIdxAndSize[iBid].x + iTid);
}
}
return bPopFlag[3];
}
最后一个片段是生成的日志。推送线(“Block X push.Bottom=Y”)由此处未显示的推送功能生成。请记住,一开始,只有块 0 有 1 个工作。
Block 0 pop global deque. Bottom=0
Block 4 steal attempt on block 0. Victim bottom=0
Block 2 steal attempt on block 0. Victim bottom=1
Block 6 steal attempt on block 0. Victim bottom=1
Block 1 steal attempt on block 0. Victim bottom=1
Block 5 steal attempt on block 0. Victim bottom=1
Block 3 steal attempt on block 0. Victim bottom=1
Block 7 steal attempt on block 0. Victim bottom=1
Block 2 steal attempt on block 0. Victim bottom=1
Block 6 steal attempt on block 0. Victim bottom=1
Block 3 steal attempt on block 0. Victim bottom=1
Block 1 steal attempt on block 0. Victim bottom=1
Block 5 steal attempt on block 0. Victim bottom=1
Block 4 steal attempt on block 0. Victim bottom=0
Block 7 steal attempt on block 0. Victim bottom=1
Block 0 push. Bottom=448
Block 2 steal attempt on block 0. Victim bottom=1
Block 2 steal attempt on block 0. Victim bottom=1
Block 6 steal attempt on block 0. Victim bottom=1
Block 1 steal attempt on block 0. Victim bottom=1
Block 5 steal attempt on block 0. Victim bottom=1
Block 3 steal attempt on block 0. Victim bottom=1
Block 7 steal attempt on block 0. Victim bottom=1
Block 2 steal attempt on block 0. Victim bottom=1
Block 2 steal attempt on block 4. Victim bottom=0
Block 1 steal attempt on block 0. Victim bottom=1
Block 1 steal attempt on block 4. Victim bottom=0
Block 5 steal attempt on block 0. Victim bottom=1
Block 5 steal attempt on block 4. Victim bottom=0
Block 4 push. Bottom=384
可以看出,只有block 4可以看到block 0 deque底部指针的变化。我尝试在指针发生任何更改后添加一些 __threadfence() 调用,但没有成功。感谢关注!