The following seems to work for me, in the sense of correct execution - a small 1D heat equation taken from one of our tutorials, with this for the RMA part:
/* Ghost-value exchange with passive-target RMA: every access below is wrapped
 * in its own exclusive MPI_Win_lock / MPI_Win_unlock pair on one target rank. */

/* Put element 1 of the current array at displacement 0 of rightwin on rank `left`. */
MPI_Win_lock( MPI_LOCK_EXCLUSIVE, left, 0, rightwin );
MPI_Put(&(temperature[current][1]), 1, MPI_FLOAT, left, 0, 1, MPI_FLOAT, rightwin);
MPI_Win_unlock( left, rightwin );
/* Put element `locpoints` of the current array at displacement 0 of leftwin on rank `right`. */
MPI_Win_lock( MPI_LOCK_EXCLUSIVE, right, 0, leftwin );
MPI_Put(&(temperature[current][locpoints]), 1, MPI_FLOAT, right, 0, 1, MPI_FLOAT, leftwin);
MPI_Win_unlock( right, leftwin );
/* Lock this rank's own target in leftwin while copying *leftgc into element 0. */
MPI_Win_lock( MPI_LOCK_EXCLUSIVE, rank, 0, leftwin );
temperature[current][0] = *leftgc;
MPI_Win_unlock( rank, leftwin );
/* Likewise for rightwin: copy *rightgc into element locpoints+1. */
MPI_Win_lock( MPI_LOCK_EXCLUSIVE, rank, 0, rightwin );
temperature[current][locpoints+1] = *rightgc;
MPI_Win_unlock( rank, rightwin );
In the code I have even ranks wait an extra 10 ms each time step to try to make sure that things get out of sync; but looking at traces, it actually looks like things remain pretty synced up. I don't know if that high degree of synchrony can be fixed by tweaking the code, or is a restriction of the implementation (Intel MPI 5.0.1), or just happens because too little time is spent in computation and communication time dominates (although, as for that last possibility, cranking up the usleep interval doesn't seem to have an effect). The full program follows:
#define _BSD_SOURCE /* usleep */
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>
/*
 * Explicit finite-difference solver for the 1-D heat equation, with the grid
 * split across MPI ranks.
 *
 * Each rank owns totpoints/size consecutive grid points (the last rank also
 * takes the remainder) plus one ghost cell at each end of its local array.
 * Every step the ghost cells are refreshed from the neighbouring ranks:
 *   - default build:  two MPI_Sendrecv calls;
 *   - -DONESIDED:     MPI_Put into one-float windows on the neighbours, each
 *                     access wrapped in an exclusive MPI_Win_lock/unlock.
 * Even ranks sleep 10 ms per step.  After nsteps steps rank 0 prints the RMS
 * of the temperature over all points.
 *
 * Returns 0 on success, EXIT_FAILURE when there are more ranks than grid
 * points.
 */
int main(int argc, char **argv) {
    /* simulation parameters */
    const int totpoints = 1000;
    int locpoints;
    const float xleft = -12., xright = +12.;
    float locxleft, locxright;
    const float kappa = 1.;
    const int nsteps = 100;

    /* data structures */
    float *x;
    float **temperature;

    /* parameters of the original temperature distribution */
    const float ao = 1., sigmao = 1.;
    float fixedlefttemp, fixedrighttemp;

    int current, new;
    int step, i;
    float time;
    float dt, dx;
    float rms;

    int rank, size;
    int start, end;
    int left, right;
#ifndef ONESIDED
    /* message tags are only needed by the Sendrecv variant */
    int lefttag = 1, righttag = 2;
#endif

    /* MPI Initialization */
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* With more ranks than points, totpoints/size is 0 and most ranks would
     * own no interior point; the ghost exchange below would then ship ghost
     * cells as if they were data.  The condition is the same on every rank,
     * so all of them can leave together. */
    if (size > totpoints) {
        if (rank == 0) {
            fprintf(stderr, "Need at least one grid point per rank: %d points, %d ranks\n",
                    totpoints, size);
        }
        MPI_Finalize();
        return EXIT_FAILURE;
    }

    /* block decomposition; the last rank absorbs the remainder */
    locpoints = totpoints/size;
    start = rank*locpoints;
    end   = (rank+1)*locpoints - 1;
    if (rank == size-1) {
        end = totpoints-1;
    }
    locpoints = end-start+1;

    /* neighbours; the outermost ranks talk to MPI_PROC_NULL */
    left = rank-1;
    if (left < 0) {
        left = MPI_PROC_NULL;
    }
    right = rank+1;
    if (right >= size) {
        right = MPI_PROC_NULL;
    }

#ifdef ONESIDED
    if (rank == 0) {
        printf("Onesided: Allocating windows\n");
    }
    /* One float per rank in each window: leftgc receives the value for the
     * left ghost cell, rightgc the value for the right ghost cell. */
    MPI_Win leftwin, rightwin;
    float *leftgc, *rightgc;
    MPI_Win_allocate(sizeof(float), sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD, &leftgc, &leftwin);
    MPI_Win_allocate(sizeof(float), sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD, &rightgc, &rightwin);
#endif

    /* set parameters */
    dx = (xright-xleft)/(totpoints-1);
    dt = dx*dx * kappa/10.;

    locxleft  = xleft + start*dx;
    locxright = xleft + end*dx;

    /* locpoints interior values plus one ghost cell on each side */
    x = malloc((locpoints+2) * sizeof *x);
    temperature = malloc(2 * sizeof *temperature);
    temperature[0] = malloc((locpoints+2) * sizeof *temperature[0]);
    temperature[1] = malloc((locpoints+2) * sizeof *temperature[1]);
    current = 0;
    new = 1;

    /* setup initial conditions */
    time = 0.;
    for (i = 0; i < locpoints+2; i++) {
        x[i] = locxleft + (i-1)*dx;
        temperature[current][i] = ao*exp(-(x[i]*x[i]) / (2.*sigmao*sigmao));
    }
    fixedlefttemp  = ao*exp(-(locxleft-dx)*(locxleft-dx) / (2.*sigmao*sigmao));
    fixedrighttemp = ao*exp(-(locxright+dx)*(locxright+dx)/(2.*sigmao*sigmao));

#ifdef ONESIDED
    /* Seed the window memory inside a lock epoch on our own rank (the same
     * way the loop reads it), then hold everyone at a barrier: otherwise a
     * faster neighbour's first MPI_Put could land before these stores and be
     * overwritten by them. */
    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, leftwin);
    *leftgc = fixedlefttemp;
    MPI_Win_unlock(rank, leftwin);

    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, rightwin);
    *rightgc = fixedrighttemp;
    MPI_Win_unlock(rank, rightwin);

    MPI_Barrier(MPI_COMM_WORLD);
#endif

    /* evolve */
    for (step = 0; step < nsteps; step++) {
        /* boundary conditions: keep endpoint temperatures fixed. */
#ifdef ONESIDED
        /* NOTE(review): nothing in this branch makes a rank wait for its
         * neighbour's Put of the current step, so the ghost value read below
         * may belong to a different step of the neighbour -- confirm that
         * this is acceptable for the experiment. */

        /* push our first interior value into the left neighbour's rightgc */
        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, left, 0, rightwin);
        MPI_Put(&(temperature[current][1]), 1, MPI_FLOAT, left, 0, 1, MPI_FLOAT, rightwin);
        MPI_Win_unlock(left, rightwin);

        /* push our last interior value into the right neighbour's leftgc */
        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, right, 0, leftwin);
        MPI_Put(&(temperature[current][locpoints]), 1, MPI_FLOAT, right, 0, 1, MPI_FLOAT, leftwin);
        MPI_Win_unlock(right, leftwin);

        /* copy whatever is in our own windows into the ghost cells; the
         * outermost ranks never receive a Put on their outer side, so they
         * keep reading the fixed boundary value stored before the loop */
        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, leftwin);
        temperature[current][0] = *leftgc;
        MPI_Win_unlock(rank, leftwin);

        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, rightwin);
        temperature[current][locpoints+1] = *rightgc;
        MPI_Win_unlock(rank, rightwin);
#else
        temperature[current][0] = fixedlefttemp;
        temperature[current][locpoints+1] = fixedrighttemp;

        /* send data rightwards */
        MPI_Sendrecv(&(temperature[current][locpoints]), 1, MPI_FLOAT, right, righttag,
                     &(temperature[current][0]), 1, MPI_FLOAT, left, righttag,
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);

        /* send data leftwards */
        MPI_Sendrecv(&(temperature[current][1]), 1, MPI_FLOAT, left, lefttag,
                     &(temperature[current][locpoints+1]), 1, MPI_FLOAT, right, lefttag,
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);
#endif

        /* explicit update of the interior points */
        for (i = 1; i < locpoints+1; i++) {
            temperature[new][i] = temperature[current][i] + dt*kappa/(dx*dx) *
                (temperature[current][i+1] - 2.*temperature[current][i] +
                 temperature[current][i-1]);
        }

        time += dt;

        /* even ranks are delayed by 10 ms per step */
        if ((rank % 2) == 0) {
            usleep(10000u);
        }

        /* swap the roles of the two time levels */
        current = new;
        new = 1 - current;
    }

    /* local sum of squares, reduced onto rank 0 for the global RMS */
    rms = 0.;
    for (i = 1; i < locpoints+1; i++) {
        rms += (temperature[current][i])*(temperature[current][i]);
    }
    float totrms;
    MPI_Reduce(&rms, &totrms, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
    if (rank == 0) {
        totrms = sqrt(totrms/totpoints);
        printf("Step = %d, Time = %g, RMS value = %g\n", step, time, totrms);
    }

#ifdef ONESIDED
    MPI_Win_free(&leftwin);
    MPI_Win_free(&rightwin);
#endif

    free(temperature[1]);
    free(temperature[0]);
    free(temperature);
    free(x);

    MPI_Finalize();
    return 0;
}