0

我正在尝试用 c 语言编写多线程 Nagel–Schreckenberg 模型模拟,当线程访问尚未计算的数据时遇到一些问题。

这是一个仅并行化每行速度计算的工作代码:

#define L 3000         // number of cells in row
#define num_iters 3000 // number of iterations
#define density 0.48  // how many positives
#define vmax 2
#define p 0.2

    for (int i = 0; i < num_iters - 1; i++)
    {
            int temp[L] = {0};

            #pragma omp parallel for
            for (int x = 0; x < L; x++)
            {
                if (iterations[i][x] > -1)
                {
                    int vi = iterations[i][x]; // velocity of previews iteration
                    int d = 1;                 // index of the next vehicle

                    while (iterations[i][(x + d) % L] < 0)
                        d++;

                    int vtemp = min(min(vi + 1, d - 1), vmax);    // increase speed, but avoid hitting the next car
                    int v = r2() < p ? max(vtemp - 1, 0) : vtemp; // stop the vehicle with probability p
                    temp[x] = v;
                }
            }
            
            for (int x = 0; x < L; x++) // write the velocities to the next line
            {
                if (iterations[i][x] > -1)
                {
                    int v = temp[x];
                    iterations[i + 1][(x + v) % L] = v;
                }
            }
    }

工作代码

这工作正常,但速度不够快。我正在尝试使用卷积来提高性能,但由于尚未计算,它有一半的时间无法读取相邻线程的数据。这是我使用的代码:

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <sys/time.h>

#define L 4000         // number of cells in row
#define num_iters 4000 // number of iterations
#define density 0.48  // how many positives
#define vmax 2
#define p 0.2
#define BLOCKS_Y 4
#define BLOCKS_X 4
#define BLOCKSIZEY (L / BLOCKS_Y)
#define BLOCKSIZEX (L / BLOCKS_X)

time_t t;

#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif

#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif

void shuffle(int *array, size_t n)
{
  if (n > 1)
  {
    size_t i;
    for (i = 0; i < n - 1; i++)
    {
      size_t j = i + rand() / (RAND_MAX / (n - i) + 1);
      int t = array[j];
      array[j] = array[i];
      array[i] = t;
    }
  }
}

double r2()
{
  return (double)rand() / (double)RAND_MAX;
}

void writeImage(int *iterations[], char filename[])
{
    int h = L;
    int w = num_iters;
    FILE *f;
    unsigned char *img = NULL;
    int filesize = 54 + 3 * w * h;

    img = (unsigned char *)malloc(3 * w * h);
    memset(img, 0, 3 * w * h);

    for (int i = 0; i < w; i++)
    {
        for (int j = 0; j < h; j++)
        {
            int x = i;
            int y = (h - 1) - j;
            int color = iterations[i][j] == 0 ? 0 : 255;
            img[(x + y * w) * 3 + 2] = (unsigned char)(color);
            img[(x + y * w) * 3 + 1] = (unsigned char)(color);
            img[(x + y * w) * 3 + 0] = (unsigned char)(color);
        }
    }

    unsigned char bmpfileheader[14] = {'B', 'M', 0, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0};
    unsigned char bmpinfoheader[40] = {40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0};
    unsigned char bmppad[3] = {0, 0, 0};

    bmpfileheader[2] = (unsigned char)(filesize);
    bmpfileheader[3] = (unsigned char)(filesize >> 8);
    bmpfileheader[4] = (unsigned char)(filesize >> 16);
    bmpfileheader[5] = (unsigned char)(filesize >> 24);

    bmpinfoheader[4] = (unsigned char)(w);
    bmpinfoheader[5] = (unsigned char)(w >> 8);
    bmpinfoheader[6] = (unsigned char)(w >> 16);
    bmpinfoheader[7] = (unsigned char)(w >> 24);
    bmpinfoheader[8] = (unsigned char)(h);
    bmpinfoheader[9] = (unsigned char)(h >> 8);
    bmpinfoheader[10] = (unsigned char)(h >> 16);
    bmpinfoheader[11] = (unsigned char)(h >> 24);

    f = fopen(filename, "wb");
    fwrite(bmpfileheader, 1, 14, f);
    fwrite(bmpinfoheader, 1, 40, f);
    for (int i = 0; i < h; i++)
    {
        fwrite(img + (w * (h - i - 1) * 3), 3, w, f);
        fwrite(bmppad, 1, (4 - (w * 3) % 4) % 4, f);
    }

    free(img);
    fclose(f);
}

void simulation()
{
    printf("L=%d, num_iters=%d\n", L, num_iters);
    int z = 0;
    z++;
    int current_index = 0;
    int success_moves = 0;

    const int cars_num = (int)(density * L);

    int **iterations = (int **)malloc(num_iters * sizeof(int *));
    for (int i = 0; i < num_iters; i++)
        iterations[i] = (int *)malloc(L * sizeof(int));

    for (int i = 0; i < L; i++)
    {
        iterations[0][i] = i <= cars_num ? 0 : -1;
    }
    shuffle(iterations[0], L);

    for (int i = 0; i < num_iters - 1; i++)
        for (int x = 0; x < L; x++)
            iterations[i + 1][x] = -1;

    double *randoms = (double *)malloc(L * num_iters * sizeof(double));
    for (int i = 0; i < L * num_iters; i++) {
        randoms[i] = r2();
    }

    #pragma omp parallel for collapse(2)
    for (int blocky = 0; blocky < BLOCKS_Y; blocky++)
    {
        for (int blockx = 0; blockx < BLOCKS_X; blockx++)
        {
            int ystart = blocky * BLOCKSIZEY;
            int yend = ystart + BLOCKSIZEY;
            int xstart = blockx * BLOCKSIZEX;
            int xend = xstart + BLOCKSIZEX;

            for (int y = ystart; y < yend; y++)
            {
                for (int x = xstart; x < xend; x++)
                {
                    if (iterations[y][x] > -1)
                    {
                        int vi = iterations[y][x];
                        int d = 1;

                        int start = (x + d) % L;
                        int i;
                        for (i = start; i < L && iterations[y][i] < 0; ++i);
                        d += i - start;
                        if (i == L)
                        {
                            for (i = 0; i < start && iterations[y][i] < 0; ++i);
                            d += i;
                        }

                        int vtemp = min(min(vi + 1, d - 1), vmax);
                        int v = randoms[x * y] < p ? max(vtemp - 1, 0) : vtemp;
                        iterations[y + 1][(x + v) % L] = v;
                    }
                }
            }
        }
    }

    if (L <= 4000)
        writeImage(iterations, "img.bmp");
    free(iterations);
}

void main() {
    srand((unsigned)time(&t));
    simulation();
}

问题

正如你所看到的,当第二个块被计算出来时,第一个块可能还没有计算出来,它产生了那个空白空间。

我认为可以通过卷积来解决这个问题,但我只是做错了,我不确定是什么。如果您能就如何解决此问题提供任何建议,我将不胜感激。

4

1 回答 1

0

第二个代码中存在竞争条件,因为iterations可以由一个线程读取并由另一个线程写入。更具体地说,设置另一个线程在检查或两个线程处理连续值(两个连续值)iterations[y + 1][(x + v) % L] = v时应读取的值。iterations[y][x]iterations[y][(x + d) % L]yblocky

此外,r2函数必须是线程安全的。它似乎是一个随机数生成器 (RNG),但这种随机函数通常是使用通常不是线程安全的全局变量来实现的。一种简单而有效的解决方案是改用thread_local变量。另一种解决方案是将参数显式传递给随机函数的可变状态。当您设计并行应用程序时,后者是一种很好的做法,因为它使内部状态的变化可见,并提供了更好地控制 RNG 确定性的方法。

除此之外,请注意模数通常很昂贵,特别是如果L它不是编译时常数。您可以通过在循环之前预先计算余数或拆分循环来删除其中的一些,以便仅在边界附近执行检查。这是一个(未经测试的)示例while

int start = (x + d) % L;
int i;

for(i=start ; i < L && iterations[y][i] < 0 ; ++i);
d += i - start;

if(i == L) {
    for(i=0 ; i < start && iterations[y][i] < 0 ; ++i);
    d += i;
}

最后,请注意块应该能被 4 整除。否则,当前代码无效(可能需要最小/最大钳位)。

于 2021-12-29T22:18:26.807 回答