c - 循环内的 OpenMP 同步

Question

我有一个生成数据的大循环。例如，每次迭代需要 1 秒并产生一大块数据。我需要以正确的顺序将所有块写入文件。

如果我只是想并行化循环，我可以写这样的东西（高度简化）：

    // First attempt: split the iterations over 4 outer-loop slots and write
    // each result to the file as soon as it has been computed.
    FILE* f = fopen("output.txt", "w");
    omp_lock_t lock;
    omp_init_lock(&lock);
    int nIterations = 1000000;
#pragma omp parallel for
    for(int thread=0; thread<4; thread++)
    {
        // a, b, c are per-slot state handed to do_computations on every call.
        int a=0, b=0, c=0;
        // Slot `thread` handles n = thread, thread+4, thread+8, ...
        for(int n=thread; n<nIterations; n+=4)
        {
            int value = do_computations(&a, &b, &c);
            // The lock keeps fprintf calls from overlapping, but nothing here
            // orders them by n, so lines may reach the file in any order.
            omp_set_lock(&lock);
            fprintf(f, "%d\n", value);
            omp_unset_lock(&lock);
        }
    }
#pragma omp barrier
    fclose(f);
    omp_destroy_lock(&lock);

这会将我的输出放入文件中，但不能保证条目的顺序。

我想同步执行，以便所有线程执行它们的任务，然后主线程写入文件，然后线程恢复。换句话说，我想要这样的东西：

    // Desired scheme: every slot computes one value, slot 0 writes the four
    // values in order, then all slots continue with the next round.
    #pragma omp parallel for
        for(int thread=0; thread<4; thread++)
        {
            int a=0, b=0, c=0;
            // NOTE(review): values has 4 elements but is indexed by n below,
            // and n runs up to nIterations. It is also declared inside the
            // outer loop body, so presumably each slot gets its own copy.
            int values[4];
            for(int n=thread; n<nIterations; n+=4)
            {
                values[n] = do_computations(&a, &b, &c);
                // Intended: wait until all four slots have stored their value.
#pragma omp barrier
                if(thread == 0)
                {
                      for(int i=0; i<4; i++)
                        fprintf(f, "%d\n", values[i]);
                }
                // Intended: hold the other slots until slot 0 has finished writing.
#pragma omp barrier
            }
        }
#pragma omp barrier

只不过出于某种莫名其妙的原因，OpenMP 规范禁止这样做。

或者我可以试试

    // Alternative attempt: wrap the write in an ordered region.
    #pragma omp parallel for
        for(int thread=0; thread<4; thread++)
        {
            int a=0, b=0, c=0;
            for(int n=thread; n<nIterations; n+=4)
            {
                int value = do_computations(&a, &b, &c);
                // The ordered region sits in the inner loop, so a single
                // iteration of the parallel `thread` loop reaches it once
                // per inner iteration, i.e. many times.
#pragma omp ordered
                {
                    fprintf(f, "%d\n", value);
                }
            }
        }
    #pragma omp barrier
        fclose(f);

但这也行不通，因为“带有 for 构造的循环迭代......不能执行多个有序指令。”

我不想将代码重写为单个循环，也不想交换循环。

没有其他线程/同步工具，有没有一种干净的方法可以用 OpenMP 做到这一点？

score 3 · Accepted Answer

您正在尝试做两件事 - 计算和 IO。计算可以并行化，但 IO 必须是串行的。但是通过将 IO 放在与计算相同的循环中，您也在强制对计算进行序列化，这是没有意义的。

你最好先做所有的计算，然后做 IO。即使在串行中，这几乎肯定会更快，特别是如果您可以将数据以二进制形式写入一大块而不是通过 fprintfs 循环。

    // Two phases: compute every value in parallel, then write them serially
    // so the file order matches the iteration order.
    FILE* f = fopen("output.txt", "w");
    const int nIterations = 1000000;
    // NOTE: one million ints as an automatic array; move it to the heap if
    // the stack of the target platform is too small for it.
    int values[nIterations];

    // Phase 1: each iteration writes only its own slot, so no locking is needed.
#pragma omp parallel for
    for (int n = 0; n < nIterations; n++)
    {
        int a=0, b=0, c=0;
        values[n] = do_computations(&a, &b, &c);
    }

    // Phase 2: serial output in index order.
    for (int n = 0; n < nIterations; n++)
    {
        fprintf(f, "%d\n", values[n]);
    }

    fclose(f);

当然，这需要更多内存，但是速度与内存是一个常见的权衡。如果这种权衡的极端不起作用，您总是可以在可调整大小的块中进行计算：

    // Chunked variant: compute at most chunkSize values in parallel, write
    // them serially, then reuse the same buffer for the next chunk.
    // f is assumed to be an already-open output FILE*.
    const int nIterations = 1000000;
    const int chunkSize   = 10000;
    int values[chunkSize];

    for (int start = 0; start < nIterations; start += chunkSize) {

        // The final chunk is shorter when chunkSize does not divide nIterations.
        const int chunkLeft = (start + chunkSize > nIterations)
                                  ? nIterations - start
                                  : chunkSize;

    #pragma omp parallel for
        for (int n = start; n < start + chunkLeft; n++)
        {
            int a=0, b=0, c=0;
            // Store relative to the chunk start so the buffer index stays in range.
            values[n-start] = do_computations(&a, &b, &c);
        }

        // Serial write keeps the chunk's values in iteration order.
        for (int n = 0; n < chunkLeft; n++)
        {
            fprintf(f, "%d\n", values[n]);
        }

    }
    fclose(f);

score 0 · Accepted Answer

我将尝试提出以前的答案中尚未出现的解决方案：

#include <stdio.h>
#include <assert.h>
#include <unistd.h>

#define NITER 100

/*
 * Demo: iterations run in parallel in arbitrary order, but each one writes
 * its result at a file offset derived from its index, so the file ends up
 * in iteration order. Returns 0 on success, 1 if the file cannot be opened
 * or a write fails.
 */
int main(void) {

  // Binary mode: the file holds raw ints at fixed offsets, so the bytes must
  // not go through any newline translation.
  FILE * f = fopen("output.bin", "wb+");
  if (f == NULL) {
    perror("output.bin");
    return 1;
  }

  // Only written inside the FILEWRITE critical section, read after the
  // parallel region has ended.
  int write_failed = 0;

#pragma omp parallel
  {
#pragma omp for schedule(runtime)
    for (int ii = 0; ii < NITER; ++ii) {
      sleep(1); // Simulate computation
      printf("%d\n",ii); // Just to be convinced that the loop is not evaluated in serial order
#pragma omp critical (FILEWRITE)
      {
        // Seek and write form one unit: the stream position is shared by
        // every thread using f.
        if (fseek(f, (long)(sizeof(ii) * ii), SEEK_SET) != 0 ||
            fwrite(&ii, sizeof(ii), 1, f) != 1) {
          write_failed = 1;
        }
      }
    }
  }

  if (write_failed) {
    fprintf(stderr, "output.bin: write failed\n");
    fclose(f);
    return 1;
  }

  // Check serially that the file is written in the right order
  fseek(f,0,SEEK_SET);
  for (int ii = 0; ii < NITER; ++ii) {
    int value = -1;
    size_t nread = fread(&value, sizeof(value), 1, f);
    assert( nread == 1 && value == ii );
  }

  fclose(f);
  return 0;
}

这种策略仅适用于每个块的大小都明确已知的情况：这样，只要知道当前计算的是哪一次迭代，就可以推算出它相对于文件开头的偏移量。

也就是说，在您提供的代码片段中存在许多错误，表明您必须查看 OpenMP 的基础知识。例如：

#pragma omp parallel for
    for(int thread=0; thread<4; thread++)
    { // No need to unroll the loop by hand: the OpenMP runtime
      // maps iterations onto threads according to the scheduling policy
        int a=0, b=0, c=0;
        for(int n=thread; n<nIterations; n+=4)
        {
            int value = do_computations(&a, &b, &c);
            // No need for an explicit lock when a critical construct suffices
            omp_set_lock(&lock); 
            fprintf(f, "%d\n", value);
            omp_unset_lock(&lock);
        }
    } // Implicit barrier at the end of the parallel for
#pragma omp barrier 
// Why a barrier here, where only one thread is executing?

score 0 · Accepted Answer

来晚了，不过如果有人偶然来到这里寻找答案：您需要的是 #pragma omp single；另外也请参阅与 @jonathan-dursi 的讨论。

c - 循环内的 OpenMP 同步

3 回答 3

Related

Reference