Outline of MPI parallelization  (for 2D r,z grid)

MPI functions have the same names and argument lists in C and Fortran, except for
the error code: C returns it as the function's value, while Fortran passes it back
through a final ierr argument (so the C versions have no ierr argument).
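In Fortran the returned ierr can be tested against MPI_SUCCESS; a
minimal sketch (buf, count, dest, tag, jerr are illustrative names):

	call MPI_SEND( buf,count,MPI_DOUBLE_PRECISION,dest,tag,MPI_COMM_WORLD,ierr)
	if( ierr .NE. MPI_SUCCESS ) call MPI_ABORT( MPI_COMM_WORLD, ierr, jerr )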

MPI basic functions:		(here COMM stands for MPI_COMM_WORLD)
blocking:
	MPI_BARRIER( COMM, ierr)
	MPI_BCAST( buf,count,datatype,root,COMM,ierr)	!!root = 0 (the MR) throughout this code
	MPI_GATHER( buf,count,type,recvbuf,recvcount,recvtype,root,COMM,ierr)
	MPI_REDUCE( sendbuf,recvbuf,count,datatype,op,root,COMM,ierr)
	tag = 1		!!message tag: any integer, must match between SEND and RECV
	MPI_SEND( buf,count,datatype,dest,  tag,COMM,ierr)
	MPI_RECV( buf,count,datatype,source,tag,COMM,status,ierr)
non-blocking:
	MPI_ISEND( buf,count,datatype,dest,tag,COMM,request,ierr)
	MPI_IRECV( buf,count,datatype,source,tag,COMM,request,ierr)
other:
	MPI_Wtime()	!!wall-clock timer, returns double precision seconds

	MPI_ABORT( COMM, errorcode, ierr)
	MPI_FINALIZE(ierr)

Every file with routines that call MPI functions must:
	include 'mpif.h'	( #include <mpi.h>  in C )
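As a quick self-contained example of one of the collectives listed above,
a global sum with MPI_REDUCE (a minimal sketch; reduce_demo, sumLoc and
sumGlob are illustrative names, not part of the code below):

	program reduce_demo
	include 'mpif.h'
	integer myID, nPROC, ierr
	double precision sumLoc, sumGlob
	call MPI_INIT( ierr )
	call MPI_COMM_RANK( MPI_COMM_WORLD, myID, ierr )
	call MPI_COMM_SIZE( MPI_COMM_WORLD, nPROC, ierr )
	sumLoc = dble( myID )	!!each process's partial result
	call MPI_REDUCE( sumLoc,sumGlob,1,MPI_DOUBLE_PRECISION,MPI_SUM,0,MPI_COMM_WORLD,ierr)
	if( myID .EQ. 0 ) print*,'global sum = ', sumGlob	!!valid on root only
	call MPI_FINALIZE( ierr )
	end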
________________________________________________________________________
running:
	on a local machine:  make run	(see Makefile)
	on a cluster (via a PBS script):  make pbs	(see Makefile)

========================================================================
_____________Explanation of variables for MPI  (all integers)___________
   nPROC    = number of PROCesses = nWRs+1 to use in this run
   nWRs     = number of workers   = nPROC-1
   mster    = master rank (=0)
   myID     = rank of a process (=0,1,2,...,nWRs)
   Me       = worker's number (rank)  (=1,2,...,nWRs)
   NodeUP   = rank of neighbor UP from Me
   NodeDN   = rank of neighbor DN from Me
   ierr     = MPI error flag

Since the parallelization is only in the z-direction,
           ALWAYS USE an Mz that is EVEN and DIVISIBLE by nWRs
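With that restriction every WR owns an equal slab of rows; a sketch of
the resulting bookkeeping (MzLoc and j0 are illustrative names):

	MzLoc  = Mz / nWRs	!!interior rows per WR
	j0     = ( Me-1 )*MzLoc	!!global row just below Me's slab
	!!...WR Me owns global rows j0+1 ... j0+MzLoc
	NodeDN = Me - 1		!!neighbor below (none for Me=1; rank 0 is the MR)
	NodeUP = Me + 1		!!neighbor above (none for Me=nWRs)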

Message passing should be coded in 3 routines:
	(to keep things clean, put them in one file 'messaging.f')
	RECV_output_MPI(....)  for MR
	SEND_output_MPI(....)  for WR
	EXCHANGE_bry_MPI(....)  for WRs exchanging their "bry" values

========================================================================
____________in main.f:	 main program
        include 'mpif.h'	( #include <mpi.h>  in C )
startup:
        call MPI_INIT( ierr )
        call MPI_COMM_SIZE( MPI_COMM_WORLD,nPROC,ierr)	!returns nPROC
        call MPI_COMM_RANK( MPI_COMM_WORLD,myID,ierr)	!assigns myID
	mster = 0 ; nWRs = nPROC - 1
        IF( myID .EQ. mster ) THEN
		print*,'>>>main>>> running on ', nWRs, ' WRs'
		tt0 = MPI_Wtime()	!...start wall-clock timer on MR
              call MASTER( nWRs, mster, ... )
		tt1 = MPI_Wtime()	!...end timer
	   print*,'>>main>> MR timing= ', tt1-tt0,' sec on ',nWRs,' WRs'
        ELSE
              call WORKER( nWRs, myID, ... )	!!WORKER's Me = myID
        ENDIF

termination:
        call MPI_FINALIZE(ierr)
	END

========================================================================
____________in mainMR.f: subroutine MASTER( nWRs, mster, ... )
Tasks of MASTER:
1. Read run-time parameters from data file, print them in o.out
2. Pack integers into the iparms array and reals into the parms array
   (a packing sketch follows this list), then broadcast to all:
   call MPI_BCAST( iparms, Niparms, MPI_INTEGER, 0,MPI_COMM_WORLD, ierr)
   call MPI_BCAST( parms, Nparms, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
3. Receive output from WRs when time=tout
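A sketch of the packing in task 2, with illustrative slot assignments
(the actual layout is up to you, as long as MR and WRs agree):

	iparms(1) = Mr
	iparms(2) = Mz		!!global Mz; each WR divides it by nWRs
	iparms(3) = nsteps
	 parms(1) = dt
	 parms(2) = dtout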

timestepping loop:
	WRs do computation and send output to MR every dtout, 
	MR receives from each WR its section of output array(s)
  at each timestep:
	!............synchronize everyone.............!
              call MPI_BARRIER( MPI_COMM_WORLD, ierr )
	if( time .GE. tout ) call RECV_output_MPI( nWRs, Mr, Mz, U )
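	The MR must advance tout in step with the WRs, or its receives
	stop matching their sends.  A sketch of the MR loop body
	(Ntotal, the total number of timesteps, is an illustrative name):

	DO  nsteps = 1, Ntotal
              call MPI_BARRIER( MPI_COMM_WORLD, ierr )
	      time = nsteps*dt
	      IF( time .GE. tout ) THEN
	         call RECV_output_MPI( nWRs, Mr, Mz, U )
	         tout = tout + dtout
	      ENDIF
	ENDDO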

========================================================================
____________in mainWR.f: subroutine WORKER( nWRs, Me, ... )
Tasks of WORKER:
1. Make the matching MPI_BCAST calls (sketched below this list) and
   unpack the iparms and parms arrays; local Mz = Mz / nWRs
2. Exchange "boundary" values with neighbors 
3. Do timestepping computation 
4. Send output to MR every dtout
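Since MPI_BCAST is a collective, every WR must make the matching calls
in task 1 before unpacking; a sketch (the slot layout follows the
packing sketch in mainMR.f above):

	call MPI_BCAST( iparms, Niparms, MPI_INTEGER, 0,MPI_COMM_WORLD, ierr)
	call MPI_BCAST( parms, Nparms, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
	Mr = iparms(1)
	Mz = iparms(2) / nWRs	!!local Mz
	dt = parms(1)
	dtout = parms(2)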

timestepping loop:
	WRs do computation and send output to MR every dtout, 
	MR receives from each WR its section of output array(s)
 at each timestep:
	!............synchronize everyone.............!
              call MPI_BARRIER( MPI_COMM_WORLD, ierr )
	!--------------- Exchange "boundary" values ----------------!
        call EXCHANGE_bry_MPI( nWRs, Me, NodeUP, NodeDN, Mr, Mz, U )

              time = nsteps*dt	!! = time + dt
         call FLUX( nWRs, Me, Mr,Mz, U, ... , AFr, AFz )
         call PDE ( nWRs, Me, Mr,Mz, Vol, AFr, AFz, ... , U )
         IF( time .GE. tout) THEN
	   !-------------- send  output to the  MASTER ----------!
            call SEND_output_MPI( Me, NodeUP, NodeDN, Mr,Mz, U )
            tout = tout + dtout
         ENDIF

========================================================================
____________in messaging.f  (or in io.f):	for 2D (r,z) grid

      subroutine RECV_output_MPI( nWRs, Mr, Mz, U )
	!............ only MR does this ..........!
	!....here Mz is the per-WR (local) Mz = global Mz / nWRs....!
	include "mpif.h"
	integer status(MPI_STATUS_SIZE)
                I2 = Mr + 2	!!length of one grid row U(0:Mr+1,j)
        call MPI_TYPE_CONTIGUOUS(I2,MPI_DOUBLE_PRECISION,Mytype,ierr)
        call MPI_TYPE_COMMIT ( Mytype, ierr )
	!.........receive values from everybody for output...........
                J2 = Mz + 2	!!rows in one WR slab, incl. its 2 ghost rows
        DO  i = 1, nWRs
              Jme = ( i-1 )*Mz	!!global row of WR i's lower ghost row
             msgtag = 1000 + i	!!the tag selects WR i; arrival order is free
           call MPI_RECV(U(0,Jme),J2,Mytype,MPI_ANY_SOURCE,msgtag,MPI_COMM_WORLD,status,ierr)
        ENDDO
       call MPI_TYPE_FREE( Mytype, ierr )
       RETURN
       END  subroutine RECV_output_MPI

!----------------------------------------------------------------------!
      subroutine SEND_output_MPI( Me, NodeUP, NodeDN, Mr, Mz, U )
	!.................. every WR does this .................!
	include "mpif.h"
                I2 = Mr + 2
        call MPI_TYPE_CONTIGUOUS( I2,MPI_DOUBLE_PRECISION,Mytype,ierr)
        call MPI_TYPE_COMMIT ( Mytype, ierr )

	!.....every WR sends its whole local slab (incl. both ghost
	!     rows) to the Master for output.....
             mster = 0
                J2 = Mz + 2
            msgtag = 1000 + Me	!!matches the tag expected in RECV_output_MPI
       call MPI_SEND(U(0,0),J2,Mytype, mster,msgtag,MPI_COMM_WORLD,ierr)

       call MPI_TYPE_FREE( Mytype, ierr )
       RETURN
       END  subroutine SEND_output_MPI
!----------------------------------------------------------------------!
 
       subroutine EXCHANGE_bry_MPI( nWRs, Me,NodeUP,NodeDN, Mr,Mz, U)
	!..........Exchange "boundary" values btn neighbors.........!
	!.................... every WR does this ...................!
	include "mpif.h" 
               Jup = Mz
              Jup1 = Jup + 1
             msgUP = 10
             msgDN = 20
                I2 = Mr + 2
        call MPI_TYPE_CONTIGUOUS( I2,MPI_DOUBLE_PRECISION,Mytype,ierr)
        call MPI_TYPE_COMMIT( Mytype, ierr )

	!..........send bottom interior row U(0,1) to neighbor down:
      IF( Me .NE. 1 ) THEN
          msgtag = msgDN
        call MPI_SEND( U(0,1),1,Mytype,NodeDN,msgtag,MPI_COMM_WORLD,ierr)
      END IF

	!.....receive bottom row from neighbor up and save as upper bry:
      IF( Me .NE. nWRs ) THEN
          msgtag = msgDN
        call MPI_RECV(U(0,Jup1),1,Mytype,NodeUP,msgtag,MPI_COMM_WORLD,status,ierr)
      END IF

	!....................send the top row to neighbor up:
      IF( Me .NE. nWRs ) THEN
          msgtag = msgUP
        call MPI_SEND(U(0,Jup),1,Mytype,NodeUP,msgtag,MPI_COMM_WORLD,ierr)
      END IF

	!......receive top row from neighbor down and save as lower bry:
      IF( Me .NE. 1 ) THEN
          msgtag = msgUP
        call MPI_RECV( U(0,0),1,Mytype,NodeDN,msgtag,MPI_COMM_WORLD,status,ierr)
      END IF

      call MPI_TYPE_FREE( Mytype, ierr )
      RETURN
      END  subroutine EXCHANGE_bry_MPI
!______________________________________________________________________!
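An alternative sketch of the same exchange using the non-blocking
MPI_ISEND/MPI_IRECV from the list at the top.  Posting both receives
before the sends lets all four transfers proceed at once, instead of the
send-down / receive / send-up chain above.  It assumes NodeUP/NodeDN
were set to MPI_PROC_NULL where a WR has no neighbor (communication with
MPI_PROC_NULL completes immediately), so the IF guards disappear; the
subroutine name EXCHANGE_bry_nb is illustrative:

      subroutine EXCHANGE_bry_nb( nWRs, Me,NodeUP,NodeDN, Mr,Mz, U)
	!...same exchange as EXCHANGE_bry_MPI, non-blocking version...!
	include "mpif.h"
	double precision U(0:Mr+1,0:Mz+1)
	integer reqs(4), stats(MPI_STATUS_SIZE,4)
               Jup = Mz
              Jup1 = Jup + 1
             msgUP = 10
             msgDN = 20
                I2 = Mr + 2
        call MPI_TYPE_CONTIGUOUS( I2,MPI_DOUBLE_PRECISION,Mytype,ierr)
        call MPI_TYPE_COMMIT( Mytype, ierr )
	!...post both receives first, then both sends, then wait on all four
        call MPI_IRECV( U(0,Jup1),1,Mytype,NodeUP,msgDN,MPI_COMM_WORLD,reqs(1),ierr)
        call MPI_IRECV( U(0,0),1,Mytype,NodeDN,msgUP,MPI_COMM_WORLD,reqs(2),ierr)
        call MPI_ISEND( U(0,1),1,Mytype,NodeDN,msgDN,MPI_COMM_WORLD,reqs(3),ierr)
        call MPI_ISEND( U(0,Jup),1,Mytype,NodeUP,msgUP,MPI_COMM_WORLD,reqs(4),ierr)
        call MPI_WAITALL( 4, reqs, stats, ierr )
        call MPI_TYPE_FREE( Mytype, ierr )
        RETURN
        END  subroutine EXCHANGE_bry_nb
!______________________________________________________________________!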