caching - 如何在不将指针声明为易失性的情况下强制执行 CUDA 全局内存一致性？

Question

我将首先进行一些上下文化。我正在尝试使用 CUDA 中的双端队列实现非阻塞工作窃取方法。双端队列 (aDeques) 位于全局内存中的块分段数组中，并且 popWork() 设备函数的目标是弹出工作以提供线程。除了全局双端队列之外，每个块在共享内存（aLocalStack）中都有一个堆栈，它可以在本地工作。流行音乐发生在 3 个级别。第一次尝试在共享堆栈中，第二次尝试在块拥有的双端队列中，第三次尝试是窃取其他双端队列。每个双端队列都有全局底部和弹出指针，它们位于全局内存数组（aiDequesBottoms 和 auiDequesAges）中。我的问题是，当一个块更改全局双端队列指针时，当我在 GTS450 中测试代码时，其他块看不到更改。好像缓存没有被更新。我也在GT520卡上测试过，没有出现这个问题。我在使用 aiDequeFlags 数组时遇到过类似的问题。通过将其声明为 volatile 可以解决这些问题。不幸的是，我不能对双端队列指针数组做同样的事情，因为我以后需要对它们使用原子函数。很抱歉没有把问题放在一个更简单的例子中，但我无法重现这种行为。第一个片段解释了 popWork() 接口。很抱歉没有把问题放在一个更简单的例子中，但我无法重现这种行为。第一个片段解释了 popWork() 接口。很抱歉没有把问题放在一个更简单的例子中，但我无法重现这种行为。第一个片段解释了 popWork() 接口。

template <int iDequeSize> //Size of each segment in aDeques 
bool __inline__ __device__ popWork(
    volatile int *aiDequeFlags , //Flags that indicates if a deque is active (has work)
    int *aiDequesBottoms , //Deque bottom pointers
    unsigned int *auiDequesAges , //Deque top pointers (29 higher bits) + 
                                  //Tag bits(3 lower bits).
    const Int2Array *aDeques , //Deques (Int2Array is an interface for 2 int arrays)
    int &uiStackBot , //Shared memory stack pointer
    int2 *aLocalStack , //Shared memory local stack
    const int &iTid , //threadIdx.x
    const int &iBid , //blockIdx.x

    //All other parameters are output

unsigned int &uiPopDequeIdx , //Choosen deque for pop
    int2 *popStartIdxAndSize , //Arrays of pop start index and sizes
    bool *bPopFlag , //Array of flags for pop in each level
unsigned int &uiActiveDequesIdx , //Flag to indicate pop failed (no more work)
    int2 &work //Actual acquired thread work)

第二个片段具有完整的功能。使用该函数的内核以 8 个块、64 个线程启动，一开始只有 deque 0 有 1 个工作，而所有其他 deque 都是空的。有一些调试 printf 调用来生成日志，这将在下一个片段中显示。

template <int iDequeSize>
bool __inline__ __device__ popWork(volatile int *aiDequeFlags , int *aiDequesBottoms , unsigned int *auiDequesAges ,
const Int2Array *aDeques , int &uiStackBot , int2 *aLocalStack , const int &iTid , const int &iBid ,
unsigned int &uiPopDequeIdx , int2 *popStartIdxAndSize , bool *bPopFlag , unsigned int &uiActiveDequesIdx , int2 &work)
{
//Pop from local stack
if(iTid == 0)
{
    unsigned int uiAge = 0;
    bPopFlag[0] = popBottom(uiStackBot , uiAge , popStartIdxAndSize[iBid]); 
    bPopFlag[3] = bPopFlag[0];
}

__syncthreads();

if(bPopFlag[0])
{
    if(iTid < popStartIdxAndSize[iBid].y)
    {
        work = aLocalStack[popStartIdxAndSize[iBid].x + iTid];
    }
}
else
{
    if(iTid == 0)
    {   //Try to pop from block deque

        bPopFlag[1] = popBottom(aiDequesBottoms[iBid] , auiDequesAges[iBid] , popStartIdxAndSize[iBid]);

        if(bPopFlag[1])
        {
            uiPopDequeIdx = iBid;
            //Debug
            if(iBid == 0)
            {
                printf("Block %d pop global deque. Bottom=%d\n" , iBid , aiDequesBottoms[iBid]);
            }
            //
        }
        else
        {
            aiDequeFlags[iBid] = 0;
            popStartIdxAndSize[iBid].x = INFTY;
            uiPopDequeIdx = INFTY;
        }
        bPopFlag[3] = bPopFlag[1];
        bPopFlag[2] = false;
    }
    __syncthreads();

    if(!bPopFlag[1])
    {
        //Verify if lazy steal can be done.
        if(iTid < NDEQUES)
        {
            if(popStartIdxAndSize[iTid].x != INFTY && iTid != iBid)
            {
                atomicMin(&uiPopDequeIdx , iTid);
                bPopFlag[2] = true;
                bPopFlag[3] = true;
            }
        }

        __syncthreads();

        if(iTid == uiPopDequeIdx)
        {
            popStartIdxAndSize[iBid] = popStartIdxAndSize[iTid];
            popStartIdxAndSize[iTid].x = INFTY;
        }

        while(!bPopFlag[3])
        {   //No more work, try to steal some!
            __syncthreads();

            if(iTid == 0)
            {
                uiActiveDequesIdx = 0;
            }
            __syncthreads();

            if(iTid < NDEQUES)
            {
                if(aiDequeFlags[iTid] == 1)
                {
                    uiActiveDequesIdx = 1;

                    //Debug
                    printf("Block %d steal attempt on block %d. Victim bottom=%d\n" , blockIdx.x , threadIdx.x , aiDequesBottoms[iTid]);
                    //

                    if(popTop(aiDequesBottoms , auiDequesAges , iTid , popStartIdxAndSize[iTid]))
                    {
                        aiDequeFlags[iBid] = 1;
                        atomicMin(&uiPopDequeIdx , iTid);
                        bPopFlag[3] = true;

                        //Debug
                        //printf("%d ss %d %d %d\n" , iBid , iTid , popStartIdxAndSize[iTid].x , popStartIdxAndSize[iTid].y);
                        //
                    }
                }
            }

            __syncthreads();

            if(uiActiveDequesIdx == 0)
            { //No more work to steal. End.
                break;
            }

            if(iTid == uiPopDequeIdx)
            {
                popStartIdxAndSize[iBid] = popStartIdxAndSize[iTid];
                popStartIdxAndSize[iTid].x = INFTY;
            }

            __syncthreads();
        }
    }

    __syncthreads();

    if(bPopFlag[3] && iTid < popStartIdxAndSize[iBid].y) //assuming number of threads >= WORK_SIZE
    {
        aDeques->getElement(work , uiPopDequeIdx*iDequeSize + popStartIdxAndSize[iBid].x + iTid);
    }
}

return bPopFlag[3];

}

最后一个片段是生成的日志。推送线（“Block X push.Bottom=Y”）由此处未显示的推送功能生成。请记住，一开始，只有块 0 有 1 个工作。

Block 0 pop global deque. Bottom=0
Block 4 steal attempt on block 0. Victim bottom=0
Block 2 steal attempt on block 0. Victim bottom=1
Block 6 steal attempt on block 0. Victim bottom=1
Block 1 steal attempt on block 0. Victim bottom=1
Block 5 steal attempt on block 0. Victim bottom=1
Block 3 steal attempt on block 0. Victim bottom=1
Block 7 steal attempt on block 0. Victim bottom=1
Block 2 steal attempt on block 0. Victim bottom=1
Block 6 steal attempt on block 0. Victim bottom=1
Block 3 steal attempt on block 0. Victim bottom=1
Block 1 steal attempt on block 0. Victim bottom=1
Block 5 steal attempt on block 0. Victim bottom=1
Block 4 steal attempt on block 0. Victim bottom=0
Block 7 steal attempt on block 0. Victim bottom=1
Block 0 push. Bottom=448
Block 2 steal attempt on block 0. Victim bottom=1
Block 2 steal attempt on block 0. Victim bottom=1
Block 6 steal attempt on block 0. Victim bottom=1
Block 1 steal attempt on block 0. Victim bottom=1
Block 5 steal attempt on block 0. Victim bottom=1
Block 3 steal attempt on block 0. Victim bottom=1
Block 7 steal attempt on block 0. Victim bottom=1
Block 2 steal attempt on block 0. Victim bottom=1
Block 2 steal attempt on block 4. Victim bottom=0
Block 1 steal attempt on block 0. Victim bottom=1
Block 1 steal attempt on block 4. Victim bottom=0
Block 5 steal attempt on block 0. Victim bottom=1
Block 5 steal attempt on block 4. Victim bottom=0
Block 4 push. Bottom=384

可以看出，只有block 4可以看到block 0 deque底部指针的变化。我尝试在指针发生任何更改后添加一些 __threadfence() 调用，但没有成功。感谢关注！

score 3 · Accepted Answer

根据评论，唯一可行的解决方案似乎是关闭 L1 缓存。这可以通过在编译时将以下开关传递给 nvcc 在程序范围内完成：

–Xptxas –dlcm=cg

L1 缓存是 SM 的属性/资源，而不是整个设备。由于线程块在特定的 SM 上执行，一个线程块在其 L1 缓存中的活动可能与另一个线程块及其 L1 缓存的活动不一致（假设它恰好在不同的 SM 上运行），即使它们都引用相同的全局内存中的位置。不同 SM 中的 L1 缓存相互之间没有联系，不保证相互一致。

请注意，L2 缓存是设备范围的，因此从各个线程块的角度来看是“连贯的”。关闭 L1 缓存对 L2 缓存没有影响，因此仍然有可能获得一些缓存优势，但是满足 L2 外的请求所需的时间比满足 L1 外的请求所需的时间长，因此关闭程序范围内的 L1 缓存是一个相当大的锤子，可以尝试让事情正常工作。

变量定义前面的volatile关键字应该具有告诉编译器在加载时跳过 L1 缓存的效果（根据我的理解）。但是 volatile 本身并不能解决写入路径，因此一个 SM 中的一个线程块可以进行volatile读取，从 L2 中拉出一个值，修改该值，然后将其写回，最终在 L1 中（直到它被驱逐）。如果另一个线程块读取相同的全局值，它可能看不到更新的效果。

勤奋地使用__threadfence()虽然很乏味，但应该强制任何此类更新从 L1 进入 L2，以便其他线程块可以读取它们。但是，从写入值到其他 SM/线程块可以观察到值，这仍然会留下同步间隙。

（全局）原子还应该具有直接进入“全局内存”以读取和写入使用的值的效果。

还可以检查代码以确保正确处理从全局同步位置的每个可能读取（例如使用volatile或使用原子）并且正确处理对全局同步位置的所有可能写入（例如使用__threadfence()或原子），并检查不同块之间的竞争条件。

如发现的那样，在 GPU 中创建稳定的全局同步环境的过程并非易事。这些其他问题也可能是有趣的（例如关于开普勒）（例如讨论全局信号量）。

编辑：要回答评论中发布的问题，我会这样说：

也许没有问题。但是__threadfence()，不能保证（据我所知）最长完成时间。因此，在对全局位置进行更新时，只有与正在执行的线程块/SM 关联的 L1 被更新。然后我们击中__threadfence(). 大概 threadfence 需要一些时间才能完成，在此期间，另一个 threadblock 可能驻留在同一个 SM 上，被带入执行（而前一个线程/warp/block 在 threadfence 处停止），并“看到”更新的全局值在与该 SM 关联的（本地）L1 中。在其他 SM 中执行的其他线程块将看到“陈旧”值，直到__threadfence()完成。这就是我所说的可能的“同步间隙”。两个不同的块在短时间内仍然可以看到两个不同的值。这是否重要将取决于全局值如何用于块之间的同步（因为这是正在讨论的主题。）因此 atomics + volatile 可能是比 volatile + threadfence 更好的选择，尝试涵盖两者阅读并写入同步路径。

编辑#2：从评论看来，结合使用原子加上volatile也解决了这个问题。

score 0 · Accepted Answer

坦率地说，我发现你的代码过于复杂，而且 - 更重要的是 - 不完整。怎么做popBottom和popTop作用？此外，push操作是如何实现的？这两个必须精心设计才能正常工作并确保不会发生某些同步问题。

例如：当一个块试图将某些东西推送到它的全局内存队列，而另一个块试图在同一时刻从中读取时会发生什么？这非常重要，如果没有正确完成，它可能会在一些非常罕见的情况下崩溃，例如，您可能会从尚未写入的数据单元中弹出。

当我实现一个类似的东西时——一个在所有块之间共享的单个全局内存双端队列，我另外将每个数据单元标记为：空的、占用的和死的。在伪代码中，算法或多或少像这样工作：

/* Objects of this class should reside in CUDA global memory */
template <typename T, size_t size>
class WorkQueue {
private:
    size_t head, tail;
    size_t status[size];
    T data[size];

    enum {
        FieldFree = 0,
        FieldDead = 1,
        FieldTaken = 2
    };      

public:
    /* 
       This construction should actually be done by host on the device,
       before the actual kernel using it is launched!
       Zeroing the memory should suffice.
    */
    WorkQueue() : head(0), tail(0) {
        for (size_t i=0; i<size; ++i)
            status[i]=FieldFree;
    }   

    __device__ bool isEmpty() { return head==tail; }

    /* single thread of a block should call this */
    __device__ bool push(const T& val) {
        size_t oldFieldStatus;
        do {
            size_t cell = atomicInc(&tail,size-1);
            data[cell]=val;
            __threadfence(); //wait untill all blocks see the above change
            oldFieldStatus=atomicCAS(&status[cell],FieldFree,FieldTaken); //mark the cell as occupied
        } while (oldFieldStatus!=FieldFree); 
        return true;
    }

    /* single thread of a block should call this */
    __device__ bool pop(T& out) {
        size_t cellStatus;
        size_t cell;
        do {
            cell=atomicInc(&head,size-1);
            cellStatus=atomicCAS(&status[cell],FieldFree,FieldDead);
            //If cell was free, make it dead - any data stored there will not be processed. Ever.
        } while (cellStatus==FieldDead);
        if (cellStatus!=FieldTaken)
            return false;
        out = data[cell];
        status[cell]=FieldFree;
        return true;
    }
};

如果没有单元格状态，我看不到实现它的可靠方法 - 否则，如果来自两个不同块的两个线程尝试推送/弹出到同一个出列单元格，则会发生不好的事情。使用上述方法，最坏的情况可能发生，弹出线程将无法弹出，返回 false 并将单元格标记为dead，并且推送线程将重新尝试推送到下一个单元格。背后的想法是，如果弹出线程无法弹出，那么无论如何可能没有太多工作要做，并且块可以终止。使用这种方法，您只会“杀死”与并行运行的块一样多的单元。

注意，在上面的代码中我不检查溢出！

caching - 如何在不将指针声明为易失性的情况下强制执行 CUDA 全局内存一致性？

2 回答 2

Related

Reference