我是 OpenMP 的新手,我正在尝试并行化这个循环:
int ii,jj,kk; /* generic counters */
const double c_sq = 1.0/3.0; /* square of speed of sound */
const double w0 = 4.0/9.0; /* weighting factor */
const double w1 = 1.0/9.0; /* weighting factor */
const double w2 = 1.0/36.0; /* weighting factor */
double u_x,u_y; /* av. velocities in x and y directions */
double u[NSPEEDS]; /* directional velocities */
double d_equ[NSPEEDS]; /* equilibrium densities */
double u_sq; /* squared velocity */
double local_density; /* sum of densities in a particular cell */
/* loop over the cells in the grid
** NB the collision step is called after
** the propagate step and so values of interest
** are in the scratch-space grid */
//#pragma omp parallel for private (ii, jj, kk, d_equ) shared (cells, tmp_cells)
for(ii=0;ii<params.ny;ii++) {
for(jj=0;jj<params.nx;jj++) {
/* don't consider occupied cells */
if(!obstacles[ii*params.nx + jj]) {
/* compute local density total */
local_density = 0.0;
for(kk=0;kk<NSPEEDS;kk++) {
local_density += tmp_cells[ii*params.nx + jj].speeds[kk];
}
/* compute x velocity component */
u_x = (tmp_cells[ii*params.nx + jj].speeds[1] +
tmp_cells[ii*params.nx + jj].speeds[5] +
tmp_cells[ii*params.nx + jj].speeds[8]
- (tmp_cells[ii*params.nx + jj].speeds[3] +
tmp_cells[ii*params.nx + jj].speeds[6] +
tmp_cells[ii*params.nx + jj].speeds[7]))
/ local_density;
u_y = (tmp_cells[ii*params.nx + jj].speeds[2] +
tmp_cells[ii*params.nx + jj].speeds[5] +
tmp_cells[ii*params.nx + jj].speeds[6]
- (tmp_cells[ii*params.nx + jj].speeds[4] +
tmp_cells[ii*params.nx + jj].speeds[7] +
tmp_cells[ii*params.nx + jj].speeds[8]))
/ local_density;
/* velocity squared */
u_sq = u_x * u_x + u_y * u_y;
/* directional velocity components */
u[1] = u_x; /* east */
u[2] = u_y; /* north */
u[3] = - u_x; /* west */
u[4] = - u_y; /* south */
u[5] = u_x + u_y; /* north-east */
u[6] = - u_x + u_y; /* north-west */
u[7] = - u_x - u_y; /* south-west */
u[8] = u_x - u_y; /* south-east */
/* equilibrium densities */
/* zero velocity density: weight w0 */
d_equ[0] = w0 * local_density * (1.0 - u_sq / (2.0 * c_sq));
/* axis speeds: weight w1 */
d_equ[1] = w1 * local_density * (1.0 + u[1] / c_sq
+ (u[1] * u[1]) / (2.0 * c_sq * c_sq)
- u_sq / (2.0 * c_sq));
d_equ[2] = w1 * local_density * (1.0 + u[2] / c_sq
+ (u[2] * u[2]) / (2.0 * c_sq * c_sq)
- u_sq / (2.0 * c_sq));
d_equ[3] = w1 * local_density * (1.0 + u[3] / c_sq
+ (u[3] * u[3]) / (2.0 * c_sq * c_sq)
- u_sq / (2.0 * c_sq));
d_equ[4] = w1 * local_density * (1.0 + u[4] / c_sq
+ (u[4] * u[4]) / (2.0 * c_sq * c_sq)
- u_sq / (2.0 * c_sq));
/* diagonal speeds: weight w2 */
d_equ[5] = w2 * local_density * (1.0 + u[5] / c_sq
+ (u[5] * u[5]) / (2.0 * c_sq * c_sq)
- u_sq / (2.0 * c_sq));
d_equ[6] = w2 * local_density * (1.0 + u[6] / c_sq
+ (u[6] * u[6]) / (2.0 * c_sq * c_sq)
- u_sq / (2.0 * c_sq));
d_equ[7] = w2 * local_density * (1.0 + u[7] / c_sq
+ (u[7] * u[7]) / (2.0 * c_sq * c_sq)
- u_sq / (2.0 * c_sq));
d_equ[8] = w2 * local_density * (1.0 + u[8] / c_sq
+ (u[8] * u[8]) / (2.0 * c_sq * c_sq)
- u_sq / (2.0 * c_sq));
/* relaxation step */
for(kk=0;kk<NSPEEDS;kk++) {
cells[ii*params.nx + jj].speeds[kk] = (tmp_cells[ii*params.nx + jj].speeds[kk]
+ params.omega *
(d_equ[kk] - tmp_cells[ii*params.nx + jj].speeds[kk]));
}
}
}
}
params.nx = 300 & params.ny = 200 对于 300x200 d2q9 LB 立方体...我评论的 pragma 语句只会导致增加的减少并且还会使雷诺数乱序...我尝试将 2 for 循环合并到通过使其看起来像这样来避免可能的错误共享:
c=params.nx*params.ny;
#pragma omp for private (ii,jj,kk,d_equ) shared (cells, tmp_cells)
for(ii=0;ii<c;ii++) {
/* don't consider occupied cells */
if(obstacles[ii]) {
/* called after propagate, so taking values from scratch space
** mirroring, and writing into main grid */
cells[ii].speeds[1] = tmp_cells[ii].speeds[3];
cells[ii].speeds[2] = tmp_cells[ii].speeds[4];
......
....
}
编译指示仍然无法帮助我加快速度.. 不过我在这里得到了正确的结果.. 自上周六以来我一直在做这件事,并没有在任何我需要的地方结束.. 在网上找不到太多有用的资源。 .我真的很感激帮助...