cuda - Cuda原子锁：顺序线程

Question

我有一段代码需要作为临界区执行。我为这段代码使用了一个锁，使内核的每个线程（每个块只启动一个线程）原子地执行它。让我困扰的是线程的执行顺序——我需要线程按照它们的索引（实际上就是按照 blockIdx 的顺序）依次执行，从 0 到 9（而不是随机顺序，例如 5、8、3、0……）。这有可能做到吗？

这是一个示例代码：

#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<math_functions.h>
#include<time.h>
#include<cuda.h>
#include<cuda_runtime.h>

// number of blocks
#define nob 10

// Device-wide spin lock backed by a single int in device memory
// (0 = free, 1 = held).
//
// The struct is handed to kernels by value, so every copy shares the same
// `mutex` pointer.
// NOTE(review): the destructor calls cudaFree, so each host-side copy that is
// destroyed frees the same pointer again. The result of cudaFree is therefore
// deliberately not treated as fatal. Confirm the intended ownership before
// reusing this type elsewhere.
struct Lock{
  int *mutex;

  // Allocates the lock word on the device and initialises it to 0 (free).
  // Aborts with a diagnostic if the allocation or the copy fails, instead of
  // silently leaving `mutex` unusable.
  Lock(void){
    int state = 0;
    mutex = NULL;
    cudaError_t err = cudaMalloc((void**) &mutex, sizeof(int));
    if(err == cudaSuccess){
      err = cudaMemcpy(mutex, &state, sizeof(int), cudaMemcpyHostToDevice);
    }
    if(err != cudaSuccess){
      fprintf(stderr, "Lock: CUDA error: %s\n", cudaGetErrorString(err));
      abort();
    }
  }

  // Releases the lock word (see the ownership note on the struct).
  ~Lock(void){
    cudaFree(mutex);
  }

  // Spins until this thread atomically changes the lock word from 0 to 1.
  // Which of several competing threads wins is not specified.
  // NOTE(review): presumably meant for at most one contending thread per
  // warp (the kernel in this file launches one thread per block) -- confirm
  // before using it with several threads of the same warp.
  __device__ void lock(void){
    while(atomicCAS(mutex, 0, 1) != 0);
    // Order the critical section's memory accesses after the acquisition.
    __threadfence();
  }

  // Marks the lock word as free again.
  __device__ void unlock(void){
    // Make the critical section's writes visible before the lock is released.
    __threadfence();
    atomicExch(mutex, 0);
  }
};


// Demo kernel: every block (launched with a single thread) acquires the lock,
// prints its block index and releases the lock again.
// Nothing in this kernel constrains which block gets the lock first.
__global__ void theKernel(Lock myLock){
  // One thread per block, so the block index identifies the thread.
  const int blockId = blockIdx.x;

  // (independent, parallel work would go here)

  // Begin critical section -- the goal is for block 0 to enter first,
  // followed by block 1, and so on.
  myLock.lock();
  printf("Thread with index=%i inside critical section now...\n", blockId);
  myLock.unlock();
  // End critical section.
}

// Launches `nob` single-thread blocks of theKernel and waits for them.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main(void)
{
  Lock myLock;
  theKernel<<<nob, 1>>>(myLock);

  // A kernel launch returns no status itself; a bad launch is reported here.
  cudaError_t err = cudaGetLastError();
  if(err == cudaSuccess){
    // Launches are asynchronous: wait for the kernel so that execution errors
    // surface and the device-side printf buffer is flushed before exiting.
    err = cudaDeviceSynchronize();
  }
  if(err != cudaSuccess){
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
  }
  return 0;
}

这给出了以下结果：

Thread with index=1 inside critical section now...
Thread with index=0 inside critical section now...                                                                                                                                   
Thread with index=5 inside critical section now...                                                                                                                                            
Thread with index=9 inside critical section now...
Thread with index=7 inside critical section now...
Thread with index=6 inside critical section now...
Thread with index=3 inside critical section now...
Thread with index=2 inside critical section now...
Thread with index=8 inside critical section now...
Thread with index=4 inside critical section now...

我希望这些索引从 0 开始，按顺序依次执行到 9。

我认为修改 Lock 以实现此目的的一种方法如下：

// Attempted ordered variant of Lock: `*indexAllow` holds the index of the
// thread that is currently allowed into the critical section.
struct Lock{
  int *indexAllow;
  // Allocates the counter on the device and initialises it to 0, so the
  // thread passing index 0 is the first one allowed through.
  // The return values of cudaMalloc/cudaMemcpy are not checked.
  Lock(void){
    int startVal = 0;
    cudaMalloc((void**) &indexAllow, sizeof(int));
    cudaMemcpy(indexAllow, &startVal, sizeof(int), cudaMemcpyHostToDevice);
  }
  // Frees the device counter.
  ~Lock(void){
    cudaFree(indexAllow);
  }
  // Busy-waits until *indexAllow equals `index`.
  // NOTE(review): the loop reads *indexAllow through a plain (non-volatile,
  // non-atomic) pointer, so the compiler may be free to load the value once
  // and never observe the atomicAdd performed by another block. This looks
  // like a plausible cause of the hang described in the text -- confirm.
  __device__ void lock(int index){
    while(index!=*indexAllow);
  }
  // Atomically increments the counter, which lets the thread with the next
  // index leave its wait loop.
  __device__ void unlock(void){
    atomicAdd(indexAllow,1);
  }
};

然后在加锁时将索引作为参数传入：

myLock.lock(index);

但这让我的电脑停了下来……我可能遗漏了一些明显的东西。

如果有人可以提供帮助，我将不胜感激！

谢谢！！！

score 2 · Accepted Answer

我稍微改变了你的代码。现在它会产生您想要的输出：

#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<math_functions.h>
#include<time.h>
#include<cuda.h>
#include<cuda_runtime.h>

// number of blocks
#define nob 10

// Ordered lock: threads enter the critical section in increasing order of
// the value they pass to lock(), starting at 0.
//
// The lock word `*mutex` holds either the value that may enter next, or -1
// while some thread is inside the critical section.
//
// The struct is handed to kernels by value, so every copy shares the same
// `mutex` pointer.
// NOTE(review): the destructor calls cudaFree, so each host-side copy that is
// destroyed frees the same pointer again. The result of cudaFree is therefore
// deliberately not treated as fatal -- confirm the intended ownership.
// NOTE(review): a waiting thread spins until every lower value has passed
// through unlock(). This presumably requires all participating blocks to be
// resident on the device at the same time -- confirm for larger grids.
struct Lock{
  int *mutex;

  // Allocates the lock word on the device and initialises it to 0, so the
  // thread passing 0 is the first one allowed in.
  // Aborts with a diagnostic if the allocation or the copy fails.
  Lock(void){
    int state = 0;
    mutex = NULL;
    cudaError_t err = cudaMalloc((void**) &mutex, sizeof(int));
    if(err == cudaSuccess){
      err = cudaMemcpy(mutex, &state, sizeof(int), cudaMemcpyHostToDevice);
    }
    if(err != cudaSuccess){
      fprintf(stderr, "Lock: CUDA error: %s\n", cudaGetErrorString(err));
      abort();
    }
  }

  // Releases the lock word (see the ownership note on the struct).
  ~Lock(void){
    cudaFree(mutex);
  }

  // Spins until the lock word equals `compare`, then atomically marks the
  // lock as held.
  // `unsigned int` replaces the non-standard `uint` alias; the explicit casts
  // perform the conversions the int overload of atomicCAS applied implicitly.
  __device__ void lock(unsigned int compare){
    const int expected = static_cast<int>(compare);
    // "Held" marker: the 0xFFFFFFFF bit pattern, which is -1 in the int lock
    // word. A non-negative index can never be equal to it.
    const int held = -1;
    while(atomicCAS(mutex, expected, held) != expected);
    // Order the critical section's memory accesses after the acquisition.
    __threadfence();
  }

  // Hands the lock to the thread waiting for `val + 1`.
  __device__ void unlock(unsigned int val){
    // Make the critical section's writes visible before passing the lock on.
    __threadfence();
    atomicExch(mutex, static_cast<int>(val + 1u));
  }
};


// Demo kernel: every block (launched with a single thread) waits for its turn
// on the lock, prints its block index and then hands the lock on.
__global__ void theKernel(Lock myLock){
  // One thread per block, so the block index identifies the thread.
  const int blockId = blockIdx.x;

  // (independent, parallel work would go here)

  // Begin critical section -- block 0 goes first, followed by block 1, etc.
  myLock.lock(blockId);

  printf("Thread with index=%i inside critical section now...\n", blockId);

  // Fence placed after the printf by the original author, who was unsure
  // whether it guarantees the order in which printf output appears.
  __threadfence_system();

  // End critical section: pass the lock on using this block's index.
  myLock.unlock(blockId);
}

// Launches `nob` single-thread blocks of theKernel and waits for them.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main(void)
{
  Lock myLock;
  theKernel<<<nob, 1>>>(myLock);

  // A kernel launch returns no status itself; a bad launch is reported here.
  cudaError_t err = cudaGetLastError();
  if(err == cudaSuccess){
    // Launches are asynchronous: wait for the kernel so that execution errors
    // surface and the device-side printf buffer is flushed before exiting.
    err = cudaDeviceSynchronize();
  }
  if(err != cudaSuccess){
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
  }
  return 0;
}

lock() 函数接受 compare 作为参数，并检查它是否等于 mutex 中已有的值。如果相等，则向 mutex 写入 0xFFFFFFFF，表明锁已被某个线程获取。由于 mutex 在构造函数中被初始化为 0，只有块 ID 为 0 的线程才能首先成功获取锁。在 unlock() 中，我们把下一个块的索引写入 mutex，从而保证您想要的顺序。另外，因为您在 CUDA 内核中使用了 printf()，我认为您需要调用 __threadfence_system()，才能在输出中看到相同的顺序。

cuda - Cuda原子锁：顺序线程

1 回答 1

Related

Reference