/* SCCS version-identification string for this file; left out when the
   preprocessor symbol "lint" is defined. */
#ifndef lint
static char SCCSid[] = "@(#) ./blkcm/mesh/gtol.c 07/23/93";
#endif

#include <math.h>
#include <stdio.h>                /*I <stdio.h> I*/
#include "tools.h"
#include "blkcm/bc.h"
#include "blkcm/mesh.h"
#include "comm/comm.h"

/* Forward declaration (old-style, without a parameter list):
   BCBuildArrayPGM calls this routine before its definition appears
   later in this file. */
void BCAddToArrayPGM();

/*@
    BCGlobalToLocalArray - Given a global array, return the local portion

    Input Parameters:
.    nd    - number of dimensions of the array
.    sz    - array of decomposition information (typedef BCArrayPart).
            On input, the fields mdim and is_parallel should be set; for a
            parallel dimension, ndim should be set as well (if ndim is < 1,
            a default value will be chosen).
            On output, the fields start, end, and loc are filled in, and
            is_parallel and ndim may have been adjusted.
.    nproc - total number of partitions to use
.    myid  - number of partition.  Must be in [0,nproc-1].  The usual value
            of these parameters is PInumtids and PImytid, but others
            may be used.

    Notes:
    A parallel dimension that ends up with only one processor is changed
    to a sequential dimension.  If a requested ndim does not divide the
    number of processors still available, a message is written to stderr
    and the decomposition continues.
@*/
void BCGlobalToLocalArray( nd, sz, nproc, myid )
int         nd, nproc, myid;
BCArrayPart sz[];
{
BCArrayPart *p;
int         i, nparallel, nassigned, remaining, nside;

/* Pass 1: count the parallel dimensions.  A sequential dimension is held
   in full by the single processor in that direction. */
nparallel = 0;
for (i=0; i<nd; i++) {
    p = sz + i;
    if (p->is_parallel) {
        nparallel++;
        continue;
        }
    p->start = 0;
    p->end   = p->mdim - 1;
    p->loc   = 0;
    p->ndim  = 1;
    }

/* Pass 2: choose the number of processors along each parallel dimension.
   The default is a "square" decomposition: the k'th root of the processors
   still available, where k is the number of parallel dimensions not yet
   handled, reduced until it divides the available count evenly. */
remaining = nproc;
nassigned = 0;
for (i=0; i<nd; i++) {
    p = sz + i;
    if (!p->is_parallel) continue;

    if (p->ndim <= 0) {
        /* No size given; find one */
        nside = pow( (double)remaining,
                     1.0/(double)(nparallel-nassigned) ) + 0.5;
        if (nside < 1) nside = 1;
        /* Step down to a divisor of the remaining processors */
        while (nside > 1 && (remaining % nside) != 0) nside--;
        p->ndim    = nside;
        remaining /= nside;
        }
    else if ((remaining % p->ndim) == 0) {
        remaining /= p->ndim;
        }
    else {
        /* Error, the number of processors doesn't match */
        fprintf( stderr, "Number of processors doesn't divide sizes\n" );
        }
    nassigned++;
    }

/* Pass 3: locate this processor in the processor array and compute the
   index range it owns.  To keep sz consistent, a parallel dimension with
   only one processor is first marked as sequential. */
for (i=0; i<nd; i++) {
    p = sz + i;
    if (!p->is_parallel) continue;

    if (p->ndim == 1) {
        p->is_parallel = 0;
        p->start       = 0;
        p->end         = p->mdim - 1;
        p->loc         = 0;
        continue;
        }

    /* Peel this dimension's coordinate off of myid */
    nside  = p->ndim;
    p->loc = myid % nside;
    myid  /= nside;

    p->start = p->loc * p->mdim / nside;
    p->end   = (p->loc + 1) * p->mdim / nside - 1;
    if (p->end >= p->mdim) p->end = p->mdim - 1;
    }
}


/*@
    BCPrintArrayPart - Prints an array partition

    Input Parameters:
.    fp    - pointer to FILE structure (stdout is used if fp is null)
.    nd    - number of dimensions of the array
.    sz    - array of decomposition information (typedef BCArrayPart)

     Notes:
     This routine produces a concise description of the elements of the
     sz array.  The output is of the form
$          mdim ? strt  end lc pdim dclr
$          values
$    where
.    mdim - dimension of the global array
.    ?    - P for a parallel dimension, S for a sequential dimension
.    strt - the local part of the array has this start index
.    end  - the local part of the array has this end index
.    lc   - the relative location of this processor in the processor array
.    pdim - the number of processors in this dimension
.    dclr - the declared index range (low:high) of the local array in this
            dimension (includes ghost points).

     A more verbose description may be generated by BCPrintArrayPartVerbose.
@*/
void BCPrintArrayPart( fp, nd, sz )
FILE        *fp;
int         nd;
BCArrayPart sz[];
{
BCArrayPart *p;
int         d;
char        kind;

if (!fp) fp = stdout;

/* Column headings, then one line per dimension */
fputs( "mdim ? strt  end lc pdim dclr\n", fp );
d = 0;
while (d < nd) {
    p    = sz + d;
    kind = p->is_parallel ? 'P' : 'S';
    fprintf( fp, "%4d %c %4d %4d %2d %3d (%d:%d)\n",
             p->mdim, kind, p->start, p->end, p->loc, p->ndim,
             p->start - p->sg, p->end + p->eg );
    d++;
    }

}

/*@
    BCPrintArrayPartVerbose - Prints an array partition in verbose form

    Input Parameters:
.    fp    - pointer to FILE structure (stdout is used if fp is null)
.    nd    - number of dimensions of the array
.    sz    - array of decomposition information (typedef BCArrayPart)

    Notes:
    In a parallel environment, this routine should be used synchronously;
    that is, the user should ensure that only one processor at a time is
    calling it.  For example, in Fortran, the code could be
$        np = PInumtids()    
$        do 10 i=0, np
$            if (PIgtoken(0,i) .ne. 0) then
$                print *, 'The array on processor ', i, ' is:'
$                call BCPrintArrayPartVerbose( 0, nd, sz )
$            endif
$    10  continue
$
     
@*/
void BCPrintArrayPartVerbose( fp, nd, sz )
FILE        *fp;
int         nd;
BCArrayPart sz[];
{
int  d, k, last, nparallel, coord;
char *tail;

if (!fp) fp = stdout;
last = nd - 1;

/* Global extent of the array */
fputs( "The global array is declared as:\n\tarray(", fp );
for (d=0; d<nd; d++) {
    tail = (d == last) ? " )" : ",";
    fprintf( fp, " 0:%d%s", sz[d].mdim-1, tail );
    }

/* Local extent, with ghost points */
fputs( "\nThe local part of the array should be declared as:\n\tarray(", fp );
for (d=0; d<nd; d++) {
    tail = (d == last) ? " )" : ",";
    fprintf( fp, " %d:%d%s", sz[d].start-sz[d].sg, sz[d].end+sz[d].eg, tail );
    }

/* Local extent, without ghost points */
fputs(
  "\nThe local part of the array is (not including ghost points):\n\tarray(",
  fp );
for (d=0; d<nd; d++) {
    tail = (d == last) ? " )" : ",";
    fprintf( fp, " %d:%d%s", sz[d].start, sz[d].end, tail );
    }

/* Shape of the processor array and this processor's place in it */
fputs( "\nThe processors are arranged in an array with dimensions:\n   ", fp );
for (d=0; d<nd; d++) {
    fprintf( fp, "%c %d", (d == 0) ? '(' : ',', sz[d].ndim );
    }
fputs( " )\n", fp );
fputs( "\nIn this array, this processor is:\n", fp );
for (d=0; d<nd; d++) {
    fprintf( fp, "%c %d", (d == 0) ? '(' : ',', sz[d].loc );
    }
fputs( " )\n", fp );

/* Neighbors along each parallel dimension; the coordinate wraps around
   at either end of the processor array */
nparallel = 0;
for (d=0; d<nd; d++) {
    if (!sz[d].is_parallel) continue;

    if (nparallel++ == 0)
        fputs( "\nThe neighboring processors are, :\n", fp );

    fprintf( fp, "In dimension %d, to the \"left\"\n(", d );
    for (k=0; k<nd; k++) {
        coord = (k == d) ? (sz[k].loc - 1 + sz[k].ndim) % sz[k].ndim
                         : sz[k].loc;
        fprintf( fp, " %d%s", coord, (k == last) ? " )\n" : "," );
        }

    fprintf( fp, "In dimension %d, to the \"right\"\n(", d );
    for (k=0; k<nd; k++) {
        coord = (k == d) ? (sz[k].loc + 1) % sz[k].ndim
                         : sz[k].loc;
        fprintf( fp, " %d%s", coord, (k == last) ? " )\n" : "," );
        }
    }
if (nparallel == 0)
    fputs( "\nThis is an entirely sequential array\n", fp );
fflush( fp );
}

/*@
    BCBuildArrayPGM - Build the program for a global array.

    Input Parameters:
.    nd    - number of dimensions of the array
.    sz    - array of decomposition information (typedef BCArrayPart)
            BCGlobalToLocalArray (or similar) should have been used
            to set the values in sz.
.    nproc - total number of partitions to use
.    myid  - number of partition.  Must be in [0,nproc-1].  The usual value
            of these parameters is PInumtids and PImytid, but others
            may be used.
.    nbyte - number of bytes in an element.  Should be sizeof(double) for
            now.

   Note: 
   If you look at the code for this routine, be assured that it
   really is simple.  The need to handle fairly general cases causes
   the code to be longer than for any particular instance, but this single
   code will handle any regular decomposition.

   The new program is returned.  If BCalloc hands back a null pointer,
   a null pointer is returned and nothing else is done.
@*/
BCPGM *BCBuildArrayPGM( nd, sz, nproc, myid, nbyte )
int         nd;
BCArrayPart sz[];
int         nproc, myid, nbyte;
{
BCPGM *pgm;
int   line;

/* The maximum size of the program is nd * 6; */
pgm = BCalloc( nd * 6 );
/* Do not touch the program if the allocation did not produce one */
if (!pgm) return 0;
BCSetDefault( pgm );

/* Generate the programs.  This is the algorithm:
   First ndp does:
   send (s1:e1+gs,...,s(n-1):e(n-1)+gs,sn:sn+gs-1,...) to
        (s1:e1+gs,...,s(n-1):e(n-1)+gs,en+1:en+gs,...)

   Second ndp does:
   send (s1-gs:e1+gs,...,s(n-1)-gs:e(n-1)+gs,en-gs+1:en, s(n+1):e(n+1)+gs,...)
   to   (s1-gs:e1+gs,.......................,sn-gs:sn-1, s(n+1):e(n+1)+gs,...)

   To make it easier to compute these points in the form that BC wants, 
   we use:
   nb[i]   - the number of points in dimension i within the domain
   n[i]    - as nb[i], but the number of points to be transferred; may
             include some ghost points
   incr[i] - increment (number of values in the underlying array) between 
             adjacent values in the i'th dimension.  Thus, the index of
             p(i,j,k,l) is i*incr[0]+j*incr[1]+k*incr[2]+l*incr[3].
             This makes computing the starting indices for the blocks to
             be transferred much easier
   Finally, we need a way to specify the "id"s of the segments.  Since 
   we know how things are laid out, we actually generate the process id's
   for each piece.  This is much like the incr, but for the number of
   processors in each dimension.  This is in :

   incrp[i] - increment for processors.

   Finally, a word on the stencil.  Right now, only the ghost-widths
   from sz are used; these should be considered as the stencil that 
   sticks out from the subdomain.  In other words, if a boundary is
   NOT periodic, and an edge is on the boundary, the ghost-width for
   that side will be zero.  The routine BCSetGhostWidths can be called
   to handle that.

   Also, ghost-widths must match up.  If a domain has a right ghost-width
   of 2, the domain to its right must have a LEFT ghost-width of 2.
   A more general interface would have send and receive stencils and
   ghost-widths; feel free to write it.  If a ghost-width is 0, 
   then no data exchange takes place.

   Finally, the id must be that of the RECEIVER (this is a bug in the
   "findowner" code which should really match up pairs of id's).
 */
/* All of the work is done by BCAddToArrayPGM, starting at program line 0
   with no extra offset */
line = 0;
BCAddToArrayPGM( pgm, &line, nd, sz, nproc, myid, nbyte, 0 );
return pgm;
}

/*@
   BCSetGhostWidths - Set the ghost point widths, taking care of
                      periodic boundaries.

   Input Parameters:
.   nd,sz    - array part description
.   iper     - iper[i] is 1 if dimension i is periodic, 0 otherwise

   Notes:
   In a non-periodic dimension, the start ghost-width (sg) is set to zero
   when the local part begins at index 0, and the end ghost-width (eg) is
   set to zero when the local part ends at index mdim-1.  Periodic
   dimensions are left unchanged.
@*/
void BCSetGhostWidths( nd, sz, iper )
int         nd;
BCArrayPart sz[];
int         iper[];
{
BCArrayPart *p;
int         d;

for (d=0; d<nd; d++) {
    /* Periodic dimensions keep their ghost points on both sides */
    if (iper[d]) continue;

    /* We ignore is_parallel since this may apply to uni-processor data */
    p = sz + d;
    if (p->start == 0)           p->sg = 0;
    if (p->end == p->mdim - 1)   p->eg = 0;
    }
}

/*ARGSUSED*/
/*@
   BCFindGhostFromStencil - Compute the ghost points given a stencil 
                            specification

   Input parameters:
.   nd           - number of dimensions in array
.   sz           - Partition structure of array (BCArrayPart)
.   nds          - number of stencil points (currently unused)
.   stencil      - array of size [nds,nd] that describes stencil offsets
                  (currently unused)
.   stencil_type - if > 0, ignore stencil and use to describe basic 
                  stencils:
$                  1-5   - plus stencil with half-width given
$                  11-15 - box  stencil with half-width stencil_type-10

   Notes:
   For a supported stencil_type, the fields sg, eg, nsg, and neg of every
   dimension are set to the half-width.  Any other value produces a message
   on stderr and leaves sz unchanged.
@*/
void BCFindGhostFromStencil( nd, sz, nds, stencil, stencil_type )
int         nd, nds, *stencil, stencil_type;
BCArrayPart sz[];
{
BCArrayPart *p;
int         d, width;

/* A general stencil (stencil_type <= 0) is not handled yet */
if (stencil_type <= 0) {
    fprintf( stderr, "Stencil specification not yet supported\n" );
    return;
    }

/* Plus and box stencils need the same ghost width; reduce both to the
   half-width */
width = (stencil_type > 10) ? stencil_type - 10 : stencil_type;
if (!(width >= 1 && width <= 5)) {
    fprintf( stderr, "Stencil type not supported\n" );
    return;
    }

for (d=0; d<nd; d++) {
    p = sz + d;
    p->sg = p->eg = p->nsg = p->neg = width;
    }
}

/*@
     BCArrayPartSize - Given an array decomposition, return the number
     of elements in it, including any ghost points 

     Input parameters:
.    nd - number of dimensions
.    sz - Array partition

     Notes:
     The value is the product, over the nd dimensions, of the local extent
     (end - start + 1) plus the two ghost widths sg and eg.
@*/
int BCArrayPartSize( nd, sz )
int         nd;
BCArrayPart sz[];
{
BCArrayPart *p;
int         d, total;

total = 1;
for (d=0; d<nd; d++) {
    p      = sz + d;
    total *= (p->end - p->start + p->sg + p->eg + 1);
    }
return total;
}

/*@
   BCArrayCompile - Compile a program built with the Array routines

   Input Parameters:
.  pgm - BlockComm program (from BCBuildArrayPGM)
.  options - Special options for compiling the program.  Use zero if no special
             options are desired.

   Notes:
   See BCcompile for the available options.  The options BCOPTION_BUFFER
   and BCOPTION_TRIMMOVE are always applied.  After compiling, 
   BCDefaultAddress2 is installed as both the source and the destination 
   address routine.
@*/
void BCArrayCompile( pgm, options )
BCPGM *pgm;
int   options;
{
/* Local declaration of the address routine that is installed below */
double *BCDefaultAddress2( );

/* Disabled debugging aid: print the program before compiling */
/* BCprint_pgms( pgm, stdout ); */
/* The caller's options are OR-ed with the two that are always applied */
BCcompile( pgm, (BCOPTION_BUFFER | BCOPTION_TRIMMOVE) | options );
/* Disabled debugging aid: print the program after compiling */
/* BCprint_pgms( pgm, stdout ); */

/* The address calculation functions are different from the default */
BCset_option( pgm, BCOPTION_SRCADDRESS,  BCDefaultAddress2 );
BCset_option( pgm, BCOPTION_DESTADDRESS, BCDefaultAddress2 );
}

/*
   These next two functions manage matching up adjacent pieces
   They are separated out of the code to both keep the routines
   cleaner and because these are the "internal connection" routines
   for more complicated global structures (such as c-grids or
   octahedral grids).

   They must handle both parallel and serial copies, since a serial
   dimension may have periodic boundaries.
 */

/* 
    BCiForwardMatch - Handle sending the "low" coordinate.  Note the use
    of neg; this is the size of the NEIGHBOR's ghost points, and may not
    be equal to the local sg value.

    All sends and receives here are made relative to the origin of the
    non-ghost-point domain (offset).  The values of n have been set to
    include the LOCAL end-ghost-points for the dimensions already 
    processed (see the code where BCiForwardMatch is called).

    The current code assumes that sg == neg and eg == nsg, except for the
    case of no neighbor (that is, physical boundaries).  Changing this
    assumption is possible but a low priority.

    pgm    - program being built
    line   - index of the next program line to fill; advanced by one for
             each entry that is added (at most two)
    phase  - phase given to the entries added here
    offset - offset of the origin of the non-ghost-point domain
    i      - dimension being matched
    sz     - array partition description
    myid   - number of this partition
    n      - number of points to transfer in each of the 5 dimensions;
             n[i] is changed while the entries are built and restored
             before returning
    nb     - number of points within the domain in each dimension
    incr   - array increment for each of the 5 dimensions
    incrp  - processor increment for each dimension
 */
void BCiForwardMatch( pgm, line, 
		      phase, offset, i, sz, myid, n, nb, incr, incrp )
BCPGM       *pgm;
int         *line, phase, offset;
BCArrayPart *sz;
int         i, myid, *n, *nb, *incr, *incrp;
{
BCArrayPart *dim;
int         saved_count, target;

dim         = sz + i;
saved_count = n[i];

/* Send the low edge; its width is the neighbor's end ghost-width */
n[i] = dim->neg;
if (dim->sg > 0) {
    /* The receiver is the "left" neighbor, wrapping around to the far end
       of the processor array; in a sequential dimension it is this
       processor itself */
    target = myid;
    if (dim->is_parallel) {
        if (dim->loc > 0) target -= incrp[i];
        else              target += (dim->ndim - 1) * incrp[i];
        }
    BCset5d( pgm, *line, target, phase, BLOCK_COMM_SRC , 
             (void *)offset,
             n[0], n[1], n[2], n[3], n[4], 
             incr[0], incr[1], incr[2], incr[3], incr[4] );
    (*line)++;
    }

/* Receive into the end ghost points, which start just past the nb[i]
   points of the domain */
if (dim->eg > 0) {
    n[i] = dim->eg;
    BCset5d( pgm, *line, myid, phase, BLOCK_COMM_DEST, 
             (void *)(offset + incr[i]*nb[i]), 
             n[0], n[1], n[2], n[3], n[4], 
             incr[0], incr[1], incr[2], incr[3], incr[4] );
    (*line)++;
    }

n[i] = saved_count;
}

/*
   BCiBackMatch - Handle sending the "high" coordinates.  
   
   Note the use of nsg instead of eg for determining the number of 
   ghost-point values to send.

   soffset is the starting offset to use in sending, 
   eoffset is the ending offset to use in receiving.
   Both are updated here: soffset is always moved back by sg*incr[i],
   eoffset only when sg > 0 (the amount is zero otherwise).

   pgm    - program being built
   line   - index of the next program line to fill; advanced by one for
            each entry that is added (at most two)
   phase  - phase given to the entries added here
   i      - dimension being matched
   sz     - array partition description
   myid   - number of this partition
   n      - number of points to transfer in each of the 5 dimensions;
            n[i] is changed while the entries are built and restored
            before returning
   nb     - number of points within the domain in each dimension
   incr   - array increment for each of the 5 dimensions
   incrp  - processor increment for each dimension
 */
void BCiBackMatch( pgm, line, phase, 
	      soffset, eoffset, i, sz, myid, n, nb, incr, incrp )
BCPGM       *pgm;
int         *line, phase, *soffset, *eoffset;
BCArrayPart *sz;
int         i, myid, *n, *nb, *incr, *incrp;
{
BCArrayPart *dim;
int         saved_count, target;

dim         = sz + i;
saved_count = n[i];

/* Send the high edge; its width is the neighbor's start ghost-width */
n[i] = dim->nsg;
if (dim->eg > 0) {
    /* The receiver is the "right" neighbor, wrapping around to the start
       of the processor array; in a sequential dimension it is this
       processor itself */
    target = myid;
    if (dim->is_parallel) {
        if (dim->loc + 1 < dim->ndim) target += incrp[i];
        else                          target -= (dim->ndim - 1) * incrp[i];
        }
    BCset5d( pgm, *line, target, phase, BLOCK_COMM_SRC , 
             (void *)(*soffset + (nb[i] - sz[i].nsg) * incr[i]),
             n[0], n[1], n[2], n[3], n[4], 
             incr[0], incr[1], incr[2], incr[3], incr[4] );
    (*line)++;
    }

/* Move backwards EVEN IF no-one to send to.  This is required because we
   are expecting to send part of our ghost-points to other processors */
*soffset -= dim->sg * incr[i];

/* Receive into the start ghost points */
if (dim->sg > 0) {
    /* We don't need to move this outside since if sg is 0, we don't change
       anything */
    *eoffset -= dim->sg * incr[i];
    n[i]      = dim->sg;
    BCset5d( pgm, *line, myid, phase, BLOCK_COMM_DEST, 
             (void *)(*eoffset), 
             n[0], n[1], n[2], n[3], n[4], 
             incr[0], incr[1], incr[2], incr[3], incr[4] );
    (*line)++;
    }

n[i] = saved_count;
}

/*ARGSUSED*/
/*
   BCAddToArrayPGM - Add the ghost-point exchange for an array partition
   to an allocated program.

   pgm      - program to add to
   Line     - on input, the first program line to use; on output, the line
              after the last one that was added
   nd       - number of dimensions of the array; at most 5
   sz       - array partition description
   nproc    - not used
   myid     - number of this partition
   nbyte    - not used
   exoffset - starting value for the offset of the array origin; the
              start ghost-widths are added to it

   At most four program lines are added for each dimension (two by
   BCiForwardMatch and two by BCiBackMatch).  Dimension i uses phase i for
   the forward exchange and phase nd+i for the backward exchange.

   If nd is larger than 5, a message is written to stderr and nothing is
   added (*Line is left unchanged); the work arrays and the program
   entries hold only 5 dimensions.
 */
void BCAddToArrayPGM( pgm, Line, nd, sz, nproc, myid, nbyte, exoffset )
BCPGM       *pgm;
int         *Line, nd;
BCArrayPart sz[];
int         nproc, myid, nbyte, exoffset;
{
enum { MAX_ARRAY_DIM = 5 };
int   line, ntot, i, offset;
int   n[MAX_ARRAY_DIM], nb[MAX_ARRAY_DIM], incr[MAX_ARRAY_DIM];
int   soffset, eoffset;
int   incrp[MAX_ARRAY_DIM], ntotp;

/* The work arrays below are indexed by dimension; refuse anything that
   would run off their end */
if (nd > MAX_ARRAY_DIM) {
    fprintf( stderr, "Arrays with more than %d dimensions are not supported\n",
             MAX_ARRAY_DIM );
    return;
    }

line = *Line;
/* set the sizes of the array; unused dimensions have a single point */
for (i=0; i<MAX_ARRAY_DIM; i++) {
    n[i]     = 1;
    nb[i]    = 1;
    incr[i]  = 1;
    incrp[i] = 1;
    }
ntot   = 1;
ntotp  = 1;
offset = exoffset;
for (i=0; i<nd; i++) {
    nb[i]   = sz[i].end - sz[i].start + 1;
    n[i]    = nb[i];
    incr[i] = ntot;
    incrp[i]= ntotp;
    /* Skip over the start ghost points to reach the domain origin */
    offset += sz[i].sg * incr[i];
    ntot    *= (n[i] + sz[i].sg + sz[i].eg);
    ntotp   *= sz[i].ndim;
    }

/* For the first phase, the SEND offset is the "lower-left" corner */
for (i=0; i<nd; i++) {
    BCiForwardMatch( pgm, &line, i, offset, i, 
                     sz, myid, n, nb, incr, incrp );
    /* Later dimensions also move the end ghost points just received */
    n[i] += sz[i].eg;
    }

soffset = offset;
eoffset = offset;
for (i=0; i<nd; i++) {
    BCiBackMatch( pgm, &line, nd + i, &soffset, &eoffset, 
                  i, sz, myid, n, nb, incr, incrp );
    /* Later dimensions also move the start ghost points just received */
    n[i] += sz[i].sg;
    }
*Line = line;
}
