我正在尝试用 c 语言编写多线程 Nagel–Schreckenberg 模型模拟,当线程访问尚未计算的数据时遇到一些问题。
这是一个仅并行化每行速度计算的工作代码:
#define L 3000 // number of cells in row
#define num_iters 3000 // number of iterations
#define density 0.48 // how many positives
#define vmax 2
#define p 0.2
for (int i = 0; i < num_iters - 1; i++)
{
int temp[L] = {0};
#pragma omp parallel for
for (int x = 0; x < L; x++)
{
if (iterations[i][x] > -1)
{
int vi = iterations[i][x]; // velocity of previews iteration
int d = 1; // index of the next vehicle
while (iterations[i][(x + d) % L] < 0)
d++;
int vtemp = min(min(vi + 1, d - 1), vmax); // increase speed, but avoid hitting the next car
int v = r2() < p ? max(vtemp - 1, 0) : vtemp; // stop the vehicle with probability p
temp[x] = v;
}
}
for (int x = 0; x < L; x++) // write the velocities to the next line
{
if (iterations[i][x] > -1)
{
int v = temp[x];
iterations[i + 1][(x + v) % L] = v;
}
}
}
这工作正常,但速度不够快。我正在尝试使用卷积来提高性能,但由于尚未计算,它有一半的时间无法读取相邻线程的数据。这是我使用的代码:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <sys/time.h>
#define L 4000 // number of cells in row
#define num_iters 4000 // number of iterations
#define density 0.48 // how many positives
#define vmax 2
#define p 0.2
#define BLOCKS_Y 4
#define BLOCKS_X 4
#define BLOCKSIZEY (L / BLOCKS_Y)
#define BLOCKSIZEX (L / BLOCKS_X)
time_t t;
#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif
void shuffle(int *array, size_t n)
{
if (n > 1)
{
size_t i;
for (i = 0; i < n - 1; i++)
{
size_t j = i + rand() / (RAND_MAX / (n - i) + 1);
int t = array[j];
array[j] = array[i];
array[i] = t;
}
}
}
double r2()
{
return (double)rand() / (double)RAND_MAX;
}
void writeImage(int *iterations[], char filename[])
{
int h = L;
int w = num_iters;
FILE *f;
unsigned char *img = NULL;
int filesize = 54 + 3 * w * h;
img = (unsigned char *)malloc(3 * w * h);
memset(img, 0, 3 * w * h);
for (int i = 0; i < w; i++)
{
for (int j = 0; j < h; j++)
{
int x = i;
int y = (h - 1) - j;
int color = iterations[i][j] == 0 ? 0 : 255;
img[(x + y * w) * 3 + 2] = (unsigned char)(color);
img[(x + y * w) * 3 + 1] = (unsigned char)(color);
img[(x + y * w) * 3 + 0] = (unsigned char)(color);
}
}
unsigned char bmpfileheader[14] = {'B', 'M', 0, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0};
unsigned char bmpinfoheader[40] = {40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0};
unsigned char bmppad[3] = {0, 0, 0};
bmpfileheader[2] = (unsigned char)(filesize);
bmpfileheader[3] = (unsigned char)(filesize >> 8);
bmpfileheader[4] = (unsigned char)(filesize >> 16);
bmpfileheader[5] = (unsigned char)(filesize >> 24);
bmpinfoheader[4] = (unsigned char)(w);
bmpinfoheader[5] = (unsigned char)(w >> 8);
bmpinfoheader[6] = (unsigned char)(w >> 16);
bmpinfoheader[7] = (unsigned char)(w >> 24);
bmpinfoheader[8] = (unsigned char)(h);
bmpinfoheader[9] = (unsigned char)(h >> 8);
bmpinfoheader[10] = (unsigned char)(h >> 16);
bmpinfoheader[11] = (unsigned char)(h >> 24);
f = fopen(filename, "wb");
fwrite(bmpfileheader, 1, 14, f);
fwrite(bmpinfoheader, 1, 40, f);
for (int i = 0; i < h; i++)
{
fwrite(img + (w * (h - i - 1) * 3), 3, w, f);
fwrite(bmppad, 1, (4 - (w * 3) % 4) % 4, f);
}
free(img);
fclose(f);
}
void simulation()
{
printf("L=%d, num_iters=%d\n", L, num_iters);
int z = 0;
z++;
int current_index = 0;
int success_moves = 0;
const int cars_num = (int)(density * L);
int **iterations = (int **)malloc(num_iters * sizeof(int *));
for (int i = 0; i < num_iters; i++)
iterations[i] = (int *)malloc(L * sizeof(int));
for (int i = 0; i < L; i++)
{
iterations[0][i] = i <= cars_num ? 0 : -1;
}
shuffle(iterations[0], L);
for (int i = 0; i < num_iters - 1; i++)
for (int x = 0; x < L; x++)
iterations[i + 1][x] = -1;
double *randoms = (double *)malloc(L * num_iters * sizeof(double));
for (int i = 0; i < L * num_iters; i++) {
randoms[i] = r2();
}
#pragma omp parallel for collapse(2)
for (int blocky = 0; blocky < BLOCKS_Y; blocky++)
{
for (int blockx = 0; blockx < BLOCKS_X; blockx++)
{
int ystart = blocky * BLOCKSIZEY;
int yend = ystart + BLOCKSIZEY;
int xstart = blockx * BLOCKSIZEX;
int xend = xstart + BLOCKSIZEX;
for (int y = ystart; y < yend; y++)
{
for (int x = xstart; x < xend; x++)
{
if (iterations[y][x] > -1)
{
int vi = iterations[y][x];
int d = 1;
int start = (x + d) % L;
int i;
for (i = start; i < L && iterations[y][i] < 0; ++i);
d += i - start;
if (i == L)
{
for (i = 0; i < start && iterations[y][i] < 0; ++i);
d += i;
}
int vtemp = min(min(vi + 1, d - 1), vmax);
int v = randoms[x * y] < p ? max(vtemp - 1, 0) : vtemp;
iterations[y + 1][(x + v) % L] = v;
}
}
}
}
}
if (L <= 4000)
writeImage(iterations, "img.bmp");
free(iterations);
}
void main() {
srand((unsigned)time(&t));
simulation();
}
正如你所看到的,当第二个块被计算出来时,第一个块可能还没有计算出来,它产生了那个空白空间。
我认为可以通过卷积来解决这个问题,但我只是做错了,我不确定是什么。如果您能就如何解决此问题提供任何建议,我将不胜感激。