9

Donzis & Aditya 的一篇论文表明,可以使用可能在模板中有延迟的有限差分方案。这是什么意思?FD 方案可用于求解热方程并读取(或对其进行一些简化)

u[t+1,i] = u[t,i] + c (u[t,i-1]-u[t,i+1])

意思是,下一个时间步的值取决于上一个时间步的相同位置及其邻居的值。

通过将(在我们的案例中为一维)域拆分到不同的处理器上,可以轻松解决这个问题。但是,在处理器上计算边界节点时,我们需要通信,因为该元素u[t,i+-1]仅在另一个处理器上可用。

下图说明了该问题,该图取自引用的论文。

在此处输入图像描述

MPI 实现可能使用MPI_SendMPI_Recv进行同步计算。由于计算本身相当容易,因此通信可能会成为瓶颈。

论文中给出了该问题的解决方案:

而不是同步过程,只需使用可用的边界注释,尽管它可能是较早时间步的值。然后该方法仍然收敛(在某些假设下)

对于我的工作,我想实现异步 MPI 案例(这不是论文的一部分)。同步部分使用MPI_Send并且MPI_Recv工作正常。我将内存扩展了两个元素作为相邻元素的幽灵单元,并通过发送和接收发送所需的值。下面的代码基本上是上图的实现,在计算前的每个时间步执行。

MPI_Send(&u[NpP],1,MPI_DOUBLE,RIGHT,rank,MPI_COMM_WORLD);
MPI_Recv(&u[0],1,MPI_DOUBLE,LEFT,LEFT,MPI_COMM_WORLD,MPI_STATUS_IGNORE);

MPI_Send(&u[1],1,MPI_DOUBLE,LEFT,rank,MPI_COMM_WORLD);
MPI_Recv(&u[NpP+1],1,MPI_DOUBLE,RIGHT,RIGHT,MPI_COMM_WORLD,MPI_STATUS_IGNORE);

现在,我绝不是 MPI 专家。我想通了,这MPI_Put可能是我需要的异步案例和阅读一点点,我想出了以下实现。

在时间循环之前:

MPI_Win win;
double *boundary;
MPI_Alloc_mem(sizeof(double) * 2, MPI_INFO_NULL, &boundary);
MPI_Info info;
MPI_Info_create(&info);
MPI_Info_set(info,"no_locks","true");
MPI_Win_create(boundary, 2*sizeof(double), sizeof(double), info, MPI_COMM_WORLD, &win);

在时间循环内:

MPI_Put(&u[1],1,MPI_DOUBLE,LEFT,1,1,MPI_DOUBLE,win);
MPI_Put(&u[NpP],1,MPI_DOUBLE,RIGHT,0,1,MPI_DOUBLE,win);
MPI_Win_fence(0,win);
u[0] = boundary[0];
u[NpP+1] = boundary[1];

它将所需的元素放在窗口中,即boundary(具有两个元素的数组)放在相邻的处理器上,u[0]u[NpP+1]boundary数组本身获取值。这个实现是有效的,我得到的结果与MPI_Send/Recv. 但是,这并不是真正的异步,因为我仍在使用MPI_Win_fence,据我了解,这确保了同步。

问题是:如果我取出MPI_Win_fence里面的值boundary永远不会更新并保持初始值。我的理解是,如果没有您,将采用其中可能(或可能不会)已由相邻处理器更新的MPI_Win_fence任何可用值。boundary

有没有人知道在MPI_Win_fence解决问题的同时避免使用内部的值boundary永远不会更新?

我也不确定我提供的代码是否足以理解我的问题或给出任何提示。如果是这种情况,请随时询问,因为我会尝试添加所有缺少的部分。

4

2 回答 2

2

The following works seems to work for me, in the sense of correct execution - a small 1d heat equation taken from one of our tutorials, using for the RMA stuff:

MPI_Win_lock( MPI_LOCK_EXCLUSIVE, left, 0, rightwin );
MPI_Put(&(temperature[current][1]),         1, MPI_FLOAT, left,  0, 1, MPI_FLOAT, rightwin);
MPI_Win_unlock( left, rightwin );

MPI_Win_lock( MPI_LOCK_EXCLUSIVE, right, 0, leftwin );
MPI_Put(&(temperature[current][locpoints]), 1, MPI_FLOAT, right, 0, 1, MPI_FLOAT, leftwin);
MPI_Win_unlock( right, leftwin );

MPI_Win_lock( MPI_LOCK_EXCLUSIVE, rank, 0, leftwin );
temperature[current][0]           = *leftgc;
MPI_Win_unlock( rank, leftwin );

MPI_Win_lock( MPI_LOCK_EXCLUSIVE, rank, 0, rightwin );
temperature[current][locpoints+1] = *rightgc;
MPI_Win_unlock( rank, rightwin );

In the code I have even ranks wait an extra 10ms each time step to try to make sure that things get out of sync; but looking at traces it actually looks like things remain pretty synced up. I don't know if that high degree of synchrony can be fixed by tweaking the code, or is a restriction of the implementation (IntelMPI 5.0.1), or just happens because the amount of time passing in computation is too little and communication time is dominating (but as to the last, cranking up the usleep interval doesn't seem to have an effect).

#define _BSD_SOURCE     /* usleep */

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>


int main(int argc, char **argv) {
    /* simulation parameters */
    const int totpoints=1000;
    int locpoints;
    const float xleft = -12., xright = +12.;
    float locxleft, locxright;
    const float kappa = 1.;

    const int nsteps=100;

    /* data structures */
    float *x;
    float **temperature;

    /* parameters of the original temperature distribution */
    const float ao=1., sigmao=1.;

    float fixedlefttemp, fixedrighttemp;

    int current, new;
    int step, i;
    float time;
    float dt, dx;
    float rms;

    int rank, size;
    int start,end;
    int left, right;
    int lefttag=1, righttag=2;

    /* MPI Initialization */
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD,&size);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);

    locpoints = totpoints/size;
    start = rank*locpoints;
    end   = (rank+1)*locpoints - 1;
    if (rank == size-1)
        end = totpoints-1;
    locpoints = end-start+1;

    left = rank-1;
    if (left < 0) left = MPI_PROC_NULL;
    right= rank+1;
    if (right >= size) right = MPI_PROC_NULL;

    #ifdef ONESIDED
    if (rank == 0)
        printf("Onesided: Allocating windows\n");
    MPI_Win leftwin, rightwin;
    float *leftgc, *rightgc;
    MPI_Win_allocate(sizeof(float), sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD, &leftgc,  &leftwin);
    MPI_Win_allocate(sizeof(float), sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD, &rightgc, &rightwin);
    #endif
    /* set parameters */

    dx = (xright-xleft)/(totpoints-1);
    dt = dx*dx * kappa/10.;

    locxleft = xleft + start*dx;
    locxright = xleft + end*dx;

    x      = (float *)malloc((locpoints+2)*sizeof(float));
    temperature = (float **)malloc(2 * sizeof(float *));
    temperature[0] = (float *)malloc((locpoints+2)*sizeof(float));
    temperature[1] = (float *)malloc((locpoints+2)*sizeof(float));
    current = 0;
    new = 1;

    /* setup initial conditions */

    time = 0.;
    for (i=0; i<locpoints+2; i++) {
        x[i] = locxleft + (i-1)*dx;
        temperature[current][i] = ao*exp(-(x[i]*x[i]) / (2.*sigmao*sigmao));
    }
    fixedlefttemp = ao*exp(-(locxleft-dx)*(locxleft-dx) / (2.*sigmao*sigmao));
    fixedrighttemp= ao*exp(-(locxright+dx)*(locxright+dx)/(2.*sigmao*sigmao));
    #ifdef ONESIDED
    *leftgc  = fixedlefttemp;
    *rightgc = fixedrighttemp;
    #endif

    /* evolve */
    for (step=0; step < nsteps; step++) {
        /* boundary conditions: keep endpoint temperatures fixed. */

        #ifdef ONESIDED
            MPI_Win_lock( MPI_LOCK_EXCLUSIVE, left, 0, rightwin );
            MPI_Put(&(temperature[current][1]),         1, MPI_FLOAT, left,  0, 1, MPI_FLOAT, rightwin);
            MPI_Win_unlock( left, rightwin );

            MPI_Win_lock( MPI_LOCK_EXCLUSIVE, right, 0, leftwin );
            MPI_Put(&(temperature[current][locpoints]), 1, MPI_FLOAT, right, 0, 1, MPI_FLOAT, leftwin);
            MPI_Win_unlock( right, leftwin );

            MPI_Win_lock( MPI_LOCK_EXCLUSIVE, rank, 0, leftwin );
            temperature[current][0]           = *leftgc;
            MPI_Win_unlock( rank, leftwin );

            MPI_Win_lock( MPI_LOCK_EXCLUSIVE, rank, 0, rightwin );
            temperature[current][locpoints+1] = *rightgc;
            MPI_Win_unlock( rank, rightwin );
        #else
            temperature[current][0] = fixedlefttemp;
            temperature[current][locpoints+1] = fixedrighttemp;

            /* send data rightwards */
            MPI_Sendrecv(&(temperature[current][locpoints]), 1, MPI_FLOAT, right, righttag,
                         &(temperature[current][0]), 1, MPI_FLOAT, left,  righttag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

            /* send data leftwards */
            MPI_Sendrecv(&(temperature[current][1]), 1, MPI_FLOAT, left, lefttag,
                         &(temperature[current][locpoints+1]), 1, MPI_FLOAT, right,  lefttag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        #endif

        for (i=1; i<locpoints+1; i++) {
            temperature[new][i] = temperature[current][i] + dt*kappa/(dx*dx) *
                (temperature[current][i+1] - 2.*temperature[current][i] +
                 temperature[current][i-1]) ;
        }

        time += dt;

        if ((rank % 2) == 0)
            usleep(10000u);

        current = new;
        new = 1 - current;
    }

    rms  = 0.;
    for (i=1;i<locpoints+1;i++) {
        rms += (temperature[current][i])*(temperature[current][i]);
    }
    float totrms;
    MPI_Reduce(&rms, &totrms, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        totrms = sqrt(totrms/totpoints);
        printf("Step = %d, Time = %g, RMS value = %g\n", step, time, totrms);
    }


    #ifdef ONESIDED
    MPI_Win_free(&leftwin);
    MPI_Win_free(&rightwin);
    #endif

    free(temperature[1]);
    free(temperature[0]);
    free(temperature);
    free(x);

    MPI_Finalize();
    return 0;
}
于 2014-10-13T01:19:11.667 回答
1

这是 Jonathen Dursi 帖子的克隆,但对 MPI-3 RMA 同步进行了更改...

#define _BSD_SOURCE     /* usleep */

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>


int main(int argc, char **argv) {
    /* simulation parameters */
    const int totpoints=1000;
    int locpoints;
    const float xleft = -12., xright = +12.;
    float locxleft, locxright;
    const float kappa = 1.;

    const int nsteps=100;

    /* data structures */
    float *x;
    float **temperature;

    /* parameters of the original temperature distribution */
    const float ao=1., sigmao=1.;

    float fixedlefttemp, fixedrighttemp;

    int current, new;
    int step, i;
    float time;
    float dt, dx;
    float rms;

    int rank, size;
    int start,end;
    int left, right;
    int lefttag=1, righttag=2;

    /* MPI Initialization */
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD,&size);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);

    locpoints = totpoints/size;
    start = rank*locpoints;
    end   = (rank+1)*locpoints - 1;
    if (rank == size-1)
        end = totpoints-1;
    locpoints = end-start+1;

    left = rank-1;
    if (left < 0) left = MPI_PROC_NULL;
    right= rank+1;
    if (right >= size) right = MPI_PROC_NULL;

    #ifdef ONESIDED
    if (rank == 0)
        printf("Onesided: Allocating windows\n");
    MPI_Win leftwin, rightwin;
    float *leftgc, *rightgc;
    MPI_Win_allocate(sizeof(float), sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD, &leftgc,  &leftwin);
    MPI_Win_allocate(sizeof(float), sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD, &rightgc, &rightwin);
    MPI_Win_lock_all(MPI_MODE_NOCHECK, leftwin);
    MPI_Win_lock_all(MPI_MODE_NOCHECK, rightwin);
    #endif
    /* set parameters */

    dx = (xright-xleft)/(totpoints-1);
    dt = dx*dx * kappa/10.;

    locxleft = xleft + start*dx;
    locxright = xleft + end*dx;

    x      = (float *)malloc((locpoints+2)*sizeof(float));
    temperature = (float **)malloc(2 * sizeof(float *));
    temperature[0] = (float *)malloc((locpoints+2)*sizeof(float));
    temperature[1] = (float *)malloc((locpoints+2)*sizeof(float));
    current = 0;
    new = 1;

    /* setup initial conditions */

    time = 0.;
    for (i=0; i<locpoints+2; i++) {
        x[i] = locxleft + (i-1)*dx;
        temperature[current][i] = ao*exp(-(x[i]*x[i]) / (2.*sigmao*sigmao));
    }
    fixedlefttemp = ao*exp(-(locxleft-dx)*(locxleft-dx) / (2.*sigmao*sigmao));
    fixedrighttemp= ao*exp(-(locxright+dx)*(locxright+dx)/(2.*sigmao*sigmao));
    #ifdef ONESIDED
    *leftgc  = fixedlefttemp;
    *rightgc = fixedrighttemp;
    #endif

    /* evolve */
    for (step=0; step < nsteps; step++) {
        /* boundary conditions: keep endpoint temperatures fixed. */

        /* RMA code assumes no conflicts in updates via MPI_Put.
           If that is wrong, hopefully it is fine to use MPI_Accumulate
           with MPI_SUM to accumulate the result. */
        #ifdef ONESIDED
            MPI_Put(&(temperature[current][1]),         1, MPI_FLOAT, left,  0, 1, MPI_FLOAT, rightwin);
            MPI_Win_flush( left, rightwin );

            MPI_Put(&(temperature[current][locpoints]), 1, MPI_FLOAT, right, 0, 1, MPI_FLOAT, leftwin);
            MPI_Win_flush( right, leftwin );

            temperature[current][0]           = *leftgc;
            MPI_Win_flush( rank, leftwin );

            temperature[current][locpoints+1] = *rightgc;
            MPI_Win_flush( rank, rightwin );
        #else
        #error Define ONESIDED...
        #endif

        for (i=1; i<locpoints+1; i++) {
            temperature[new][i] = temperature[current][i] + dt*kappa/(dx*dx) *
                (temperature[current][i+1] - 2.*temperature[current][i] +
                 temperature[current][i-1]) ;
        }

        time += dt;

        if ((rank % 2) == 0)
            usleep(10000u);

        current = new;
        new = 1 - current;
    }

    rms  = 0.;
    for (i=1;i<locpoints+1;i++) {
        rms += (temperature[current][i])*(temperature[current][i]);
    }
    float totrms;
    MPI_Reduce(&rms, &totrms, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        totrms = sqrt(totrms/totpoints);
        printf("Step = %d, Time = %g, RMS value = %g\n", step, time, totrms);
    }


    #ifdef ONESIDED
    MPI_Win_unlock_all(leftwin);
    MPI_Win_unlock_all(rightwin);
    MPI_Win_free(&leftwin);
    MPI_Win_free(&rightwin);
    #endif

    free(temperature[1]);
    free(temperature[0]);
    free(temperature);
    free(x);

    MPI_Finalize();
    return 0;
}
于 2015-04-05T00:06:18.927 回答