#ifndef lint
static char SCCSid[] = "@(#) ./comm/io/pio.c 07/23/93";
#endif

#include <stdio.h>
#include <ctype.h>
#include "tools.h"
#include "comm/comm.h"
#include "comm/io/pio.h"
#ifndef SEEK_SET
#include <unistd.h>
#endif

#define READ  (*fp->read)
#define WRITE (*fp->write)
#define LSEEK (*fp->lseek)

/* For now, this just implements a single writer.  Later, these routines
   will be split off for the different access modes

   Parallel modes:
   AsSequential - the same file is written as if it were a sequential file.
                  For parallel data, the "correct" routines must be
		  called (such as PIWriteDistributedArray)

   Buffering options:
   Multibuffering
   Double-buffering
   No buffering

   We also need routines for random access (scratch files with parallelism)

   When writing large blocks, we'll need to divide the output into 
   individual chuncks
*/

/* Forward references */
extern void PIiRwriter(), PIiRwriterWait(), PIiRreader(), PIiRreaderWait();


/* #if !defined(rs6000) */
extern int lseek();
/* #endif */

extern int read(), write();

#if defined(intelnx)
extern int cread(), cwrite();
#endif

/* Default readers/writers */
static void (*defwriter)()       = PIiRwriter;
static void (*defwriterwait)()   = PIiRwriterWait;
static void (*defwriterflush)()  = 0;
static void *(*defwritercreate)() = 0;

static int filemtype = 10101010;
/*@
     PIFopen - Open a parallel file

     Input Parameters:
.    name    - name of file
.    procset - procest of processors that will use this file
.    mode    - mode of file.  Any valid mode for the Unix open call.  The ones
               that are meaningfule are
$    O_RDONLY - Open for reading only
$    O_WRONLY - Open for writing only
$    O_RDWR   - Open for reading and writing
$    O_CREATE - Create if the file does not exist 
$
.    pmode   - parallel mode.  Currently only PIO_AS_SEQUENTIAL supported.
$    PIO_AS_SEQUENTIAL - write as a seqential file, independent of the 
     number of processors
$    PIO_AS_PARALLEL - write in parallel.
$    PIO_ANY_NUMBER  - (modifier) allow a PIO_AS_PARALLEL file to be read
     by an arbitrary number of processors.

     Notes:
     To open a file for reading, use the mode O_RDONLY.  To open a file for
     writing, use (in C) O_WRONLY | O_CREATE.
@*/
PIFILE *PIFopen( name, procset, mode, pmode )
char    *name;
ProcSet *procset;
int     mode, pmode;
{
int    myid, fmode;
PIFILE *fp;

myid = procset ? procset->lidx : MYPROCID;
fp   = NEW(PIFILE);   CHKPTRN(fp);
/* Single writer for now */
if (myid == 0) {
    fmode = 0666;
    fp->fd = open( name, mode, fmode );
    /* Must let all processors know if the file was successfully opened, 
       and how to identify a message for the file */
    if (!fp->fd) {
	fprintf( stderr, "Could not open file %s\n", name );
	}
    fp->mtype = filemtype++;
    }
GSCATTERSRC( &fp->fd, sizeof(int), PSROOT(procset), procset, MSG_INT );
if (!fp->fd) {
    FREE( fp );
    return 0;
    }
GSCATTERSRC( &fp->mtype, sizeof(long), PSROOT(procset), procset, MSG_INT );
fp->fpos    = 0;
fp->sbuf    = (long *)MALLOC( PIO_BUF_SIZE * sizeof(long) ); CHKPTRN(fp->sbuf);
fp->bsize   = PIO_BUF_SIZE;
fp->procset = procset;
fp->rcnt    = 0;
fp->diswriterSetup = 0;
fp->disreaderSetup = 0;
if (defwritercreate)
    fp->disWctx = (*defwritercreate)( 0 );
else
    fp->disWctx        = 0;
fp->disRctx        = 0;
fp->diswriter      = defwriter;
fp->diswriterwait  = defwriterwait;
fp->disflush       = defwriterflush;
fp->disreader      = PIiRreader;
fp->disreaderwait  = PIiRreaderWait;
#if defined(intelnx)
fp->lseek          = lseek;
/* This should use PIiFileNameIsCFS() */
if (1) {
    fp->write          = cwrite;
    fp->read           = cread;
    }
else {
    fp->write          = write;
    fp->read           = read;
    }
#else
fp->lseek          = lseek;
fp->write          = write;
fp->read           = read;
#endif

return fp;
}

/* Everyone writes the same thing to the file.  All must call with the
   same arguments */
/*@
    PIWriteCommon - write data to a parallel file.  All processors must call

    Input Parameters:
.   fp - parallel file pointer
.   fmat - format to use.  If null, use unformated output.
.   flen - length of a single item of formatted output.  
           If fmat is null, this should be
           the size of the data item (e.g., sizeof(double) if datatype is
	   MSG_DBL).
.   v    - pointer to data to write
.   n    - number of elements in v
.   datatype - type of data (MSG_INT etc)

    Note:
    All processors must call; only one will actually write data out.

    Example:
    If you want to write 10 integers with format %5d, use
$   PIWriteCommon( fp, "%5d", 5, v, 10, MSG_INT )
@*/
void PIWriteCommon( fp, fmat, flen, v, n, datatype )
PIFILE *fp;
char   *fmat;
void   *v;
int    flen, n, datatype;
{
int myid, nbytes, err;

myid = fp->procset ? fp->procset->lidx : MYPROCID;
if (myid == 0) {
    nbytes = PIiFormatData( fmat, flen, fp->sbuf, v, n, datatype );
    err    = WRITE( fp->fd, fp->sbuf, nbytes );
    if (err != nbytes) {
	SETERRC(1,"Write failed" );
	}
    }
else {
    nbytes = flen * n;
    }
fp->fpos += nbytes;
}

/*@
    PIReadCommon - read data from a parallel file.  All processors must call

    Input Parameters:
.   fp - parallel file pointer
.   fmat - format for each element.  If null, use unformatted input.
.   flen - length of a single item of formatted input.
           If fmat is null, this should be
           the size of the data item (e.g., sizeof(double) if datatype is
	   MSG_DBL).
.   v    - pointer to data to read
.   n    - number of elements in v
.   datatype - type of data (MSG_INT etc)

    Note:
    All processors must call; only one will actually read data from the file
    (but ALL will receive the data).
@*/
void PIReadCommon( fp, fmat, flen, v, n, datatype )
PIFILE *fp;
char   *fmat;
void   *v;
int    flen, n, datatype;
{
int myid, nbytes, err, size;

myid = fp->procset ? fp->procset->lidx : MYPROCID;
nbytes = flen*n;
if (myid == 0) {
    err    = READ( fp->fd, fp->sbuf, nbytes );
    PIDecodeString( fmat, flen, fp->sbuf, v, n, datatype );
    if (err != nbytes) {
        SETERRC(1,"Read failed" );
        }
    }
size = PIiMSGtoLen( datatype );
GSCATTERSRC(v,size,PSROOT(fp->procset),fp->procset,datatype);
fp->fpos += nbytes;
}

/*@
    PIWriteDistributedArray - Write a distributed array to a parallel file

    Input Parameters:
.   fp - file pointer
.   fmat - format for each element.  If null, use unformatted output
.   flen - length of a single item of formatted output.  
           If fmat is null, this should be
           the size of the data item (e.g., sizeof(double) if datatype is
	   MSG_DBL).
.   sz   - structure describing array section.  See below.
.   nd   - number of dimensions in array
.   v    - pointer to array
.   datatype - type of v (MSG_INT etc)

    Note:
    sz is an array that contains a description of the array section to 
    be output.  The fields are
.   mdim - size of global array in this dimension
.   ndim - size of local array in this dimension 
.   start - starting index for local piece
.   end - ending index for local piece 
.   gstart,gend - GLOBAL indices for start and end

    For example, if the global array is defined as 
$   A(0:100)
and the array B(0:51) corresponds to A(49:100), then the sz values are
$   mdim = 100 - 0 + 1
$   ndim = 51 - 0 + 1
$   start = 0
$   end   = 51
$   gstart = 49
$   gend   = 100
If B contains "ghost points", so that only the "owned" part of the array
B is to be written out, say B(1:50), then sz becomes
$   mdim = 100 - 0 + 1
$   ndim = 51 - 0 + 1
$   start = 1
$   end   = 50
$   gstart = 50
$   gend   = 99
Any number of dimensions may be specified.

This routine is designed so that if the sz arrays are correctly defined,
the EXACT SAME FILE will be generated regardless of the number of processors
employed.
@*/
void PIWriteDistributedArray( fp, fmat, flen, sz, nd, v, datatype )
PIFILE      *fp;
char        *fmat;
void        *v;
PIArrayPart *sz;
int         flen, nd, datatype;
{
int  nbytes, finalloc;
void PIiNdimFormat();
int  PIiDistSize(), ln;

fp->rcnt = PSNUMNODES(fp->procset) - 1;
ln       = PIiMSGtoLen( datatype );
PIiNdimFormat( fp, sz, nd, fmat, flen, fp->fpos, v, 
	       fp->diswriter, datatype, ln );
finalloc = fp->fpos += PIiDistSize( sz, nd, flen );
(*fp->diswriterwait)( fp, datatype, finalloc );

fp->fpos = finalloc;
}

/*@
    PIReadDistributedArray - Read a distributed array from a parallel file.
    The EXACT same data will be read in independent of the number of 
    processors.

    Input Parameters:
.   fp - file pointer
.   fmat - format for each element.  If null, use unformatted input
.   flen - length of a single item of formatted input.
           If fmat is null, this should be
           the size of the data item (e.g., sizeof(double) if datatype is
	   MSG_DBL).
.   sz   - structure describing array section.  See below.
.   nd   - number of dimensions in array
.   v    - pointer to array
.   datatype - type of v (MSG_INT etc)

    Note:
    sz is an array that contains a description of the array section to 
    be input.  The fields are
.   mdim - size of global array in this dimension
.   ndim - size of local array in this dimension 
.   start - starting index for local piece
.   end - ending index for local piece 
.   gstart,gend - GLOBAL indices for start and end

    Further Details and Examples:

.  fmat - If NULL then data is copied straight from the file to memory
(unformatted).  Otherwise the format is similar to the the C 'printf'
command.  Use "%d" for integers, "%l" for long integers, "%f" for type
float, and "%lf" for double precision numbers.  One may more than one
type in a line ("%d %d %f"), but they must be seperated by a space or
newline for this to work correctly.  Do not add 'field specifiers' as
one did for writing data, e.g. do not use "%f.7.2", very important.

.  sz - This array specifies how the global array and the local
processor's array is configured.  Each dimension in the array has five
fields.  Here on, Local start or end refers to the first element that is
actually used in computation, while ghost start or end refers to the
very first and last index as how the array is defined.  Example: An
array could defined as [0,100] so ghost_start = 0, ghost_end=100.  It
may have two ghost points on each end, so live_start = 2, live_end =
98 (if no ghost points then local_start = 0 too)

. sz[ ].mdim - the global size of this dimension not counting ghost
points, just as it would if it was not parallel.  Used for computing
file size.

. sz[ ].ndim - the local length in this dimension of processor's
array, including ghost points.  Typically, ghost_end - ghost_start +
1.  Use for

. sz[ ].start - The offset from the ghost_start that will recieve
data.  One normally does not want data to be placed in ghost elements,
so if one has no ghost points before the local_start index, then start
would be equal to zero, if one has n ghost points before, start = n.

. sz[ ].end - The offset from the ghost_start index of the last
element to recieve data.  If one has no ghost points, end would be
local_end - local_start.  If there are ghost points, then end =
local_end - local_start + number of ghost points before local_start =
local_start - ghost_start.

. sz[ ].gstart - This is normally equal to ghost_start.  However, if
one is passing a Fortran array, subtract 1 (this is due to the fact
that Fortran arrays start at 1, while C arrays start at 0).  This is
used for positioning in the file.

. sz[ ].gend - Normally ghost_end, but with the same Fortran exception
as above.  Used for positioning in the file.


.  flen - is the size in bytes of output generate by fmat,
above.  Example if one used unformatted data then flen would
"sizeof(datatype)", e.x. sizeof(int).  If one is reading in ascii data
that was written with a fmat of "%f7.2 ", a floating point number with
seven characters plus one space at the end, would have a flen of 8.  .

.  mdim - size of global array in this dimension .  ndim - size of
local array in this dimension .  start - starting index for local
piece .  end - ending index for local piece .  gstart,gend - GLOBAL
indices for start and end

. nd  - Equal to the number of the dimensions of the array.  If one is
using complex data, add 1 to your normal dimension ( a 3-d complex
array would have a nd value of 4)

. datatype - use MSG_INT for "int" data, use MSG_FLT for "float" data,
use MSG_DBL for "double" data, use MSG_OTHER for character or any
other data you do not want formatted (typically non-numeric data).


    For example, if the global array is defined as 
$   A(0:100)
and the array B(0:51) corresponds to A(49:100), then the sz values are
$   mdim = 100 - 0 + 1
$   ndim = 51 - 0 + 1
$   start = 0
$   end   = 51
$   gstart = 49
$   gend   = 100

$ If B contains "ghost points", so that only the "owned" part of the array
B is to be read in, say B(1:50), then sz becomes
$   mdim = 100 - 0 + 1
$   ndim = 51 - 0 + 1
$   start = 1
$   end   = 50
$   gstart = 50
$   gend   = 99
Any number of dimensions may be specified.

This routine is designed so that if the sz arrays are correctly defined,
the EXACT SAME DATA will be read regardless of the number of processors
employed.
@*/
void PIReadDistributedArray( fp, fmat, flen, sz, nd, v, datatype )
PIFILE      *fp;
char        *fmat;
void        *v;
PIArrayPart *sz;
int         flen, nd, datatype;
{
int  nbytes;
void PIReadFormat();
int  PIiDistSize(), ln;
char fmatcopy[128], ftemp[128], *strchr();

fp->rcnt = PSnumtids(fp->procset) - 1;
ln       = PIiMSGtoLen( datatype );
if (fmat) {
    /* Check for correct format string; this means:
       adding "l"s as appropriate
       removing precision specifications.
       For example, if the format is %12.4e and the datatype is MSG_DBL, 
       then we need to change this to %12e
     */
    char *p, *p1;
    strcpy( fmatcopy, fmat );
    fmat = fmatcopy;
    p = strchr( fmatcopy, '%' );
    if (p) {
	p++;
	while (*p && isdigit(*p)) p++;
	if (*p == '.') {
	    /* Skip over the length field */
	    p1 = p + 1;
	    while (*p1 && isdigit(*p1)) p1++;
	    strcpy( p, p1 );
	    p1 = p;
	    switch (datatype) {
		case MSG_DBL: 
	        if (*p1 == 'e' || *p1 == 'f') {
		    char f[3];
		    f[0] = 'l';
		    f[1] = *p1;
		    f[2] = 0;
		    *p1 = 0;
		    strcpy( ftemp, fmat );
		    strcat( ftemp, f );
		    strcat( ftemp, p + 1 );
		    strcpy( fmatcopy, ftemp );
		    }
		break;
		}
	    }
	}
    }
PIReadFormat( fp, sz, nd, fmat, flen, fp->fpos, v,
               fp->disreader, datatype, ln );
(*fp->disreaderwait)( fp, datatype );

fp->fpos += PIiDistSize( sz, nd, flen );
}

/*@
    PIFclose - close a parallel file

    Input Parameter: 
.   fp - parallel file pointer    
@*/
void PIFclose( fp )
PIFILE *fp;
{
int myid;

PIFflush( fp );
myid   = PSMYPROCID(fp->procset);
if (myid == 0)
    close( fp->fd );
if (fp->sbuf)
    FREE( fp->sbuf );

FREE( fp );
}

/*@
   PIFflush - Flush the output to a parallel file

   Input Parameter:
.  fp - parallel file pointer
@*/
void PIFflush( fp )
PIFILE *fp;
{
if (fp->disflush)
    (*fp->disflush)( fp );
}

/* Internal routine to format data.  Note, if v is null, do not use formatted
   operations */
int PIiFormatData( fmat, flen, sbuf, v, n, datatype )
char *fmat, *sbuf;
int  flen, n, datatype;
char *v;
{
int size, i;
long   *lv;
double *dv;
float  *fv;

size = PIiMSGtoLen( datatype );
if (!v) {
    for (i=0; i<n; i++) {
	strcpy( sbuf, fmat );
	sbuf += flen;
	}
    }
else if (fmat) {
    for (i=0; i<n; i++) {
	switch (datatype) {
	    case MSG_LNG:
	    case MSG_INT: lv = (long *)v;
	                  sprintf( sbuf, fmat, *lv ); break;
	    case MSG_FLT: fv = (float *)v;
	                  sprintf( sbuf, fmat, *fv ); break;
	    case MSG_DBL: dv = (double *)v;
	                  sprintf( sbuf, fmat, *dv ); break;
	    default:
	                  sprintf( sbuf, fmat, *v ); break;
	    }
	sbuf += flen;
	v    += size;
	}
    }
else {
    MEMCPY( sbuf, v, n*size );
    }
return n * flen;
}

/**************************************************************/
/* PIDecodeString -- Parallel Interface, Decode String        */
/*   Converts a string back into original form.               */
/**************************************************************/
int PIDecodeString(fmat,flen, sbuf, v, n, datatype )
char *fmat, *sbuf, *v;
int  flen, n, datatype;
{
int size, i;
size = PIiMSGtoLen( datatype );
/*if (MYPROCID == 1) printf("%d>>%s\n",MYPROCID,sbuf); */
if (fmat) {
    for (i=0; i<n; i++) {
        switch (datatype) {
            case MSG_LNG: sscanf(sbuf, fmat, (long*) v);   break;
            case MSG_INT: sscanf(sbuf, fmat, (int*) v);    break;
            case MSG_FLT: sscanf(sbuf, fmat, (float*) v);  break;
            case MSG_DBL: sscanf(sbuf, fmat, (double*) v); break; 
            default: break;
            }
        sbuf += flen;  
        v += size;
        }
    }
else {
    MEMCPY( v, sbuf, n*size );
    }
return n*flen;
}  


/* 
   We need a routine that takes a size description and
   calls the format routine (or copy routine) for each element, 
   regardless of the number of dimensions.  At each non-contiguous 
   segment, issue a new call...

   Everytime we increase a dimension, we output the lower-dimensioned
   piece.  So the basic routine is an vector write; the n-d version becomes:
   
   format( sz, nd, k, v )
   {
   for (j=sz[nd-1].start; j<=sz[nd-1].end; j++) {
       format( sz, nd-1, k + j*..., v+j*... )
       }
   }

   Then the termination criteria should be more adaptive-say, send a single
   empty message (we could try to optimize for the special case)
 */

void PIiNdimFormat( fp, sz, nd, fmat, flen, offset, v, writer, datatype, ln )
PIFILE      *fp;
PIArrayPart *sz;
char        *fmat;
int         nd, flen, offset, datatype, ln;
char        *v;
void        (*writer)();
{
int nbytes, n, k, j, nn, ln2;

if (nd == 1) {
    n =  sz[0].end - sz[0].start + 1;
    v += sz[0].start * ln;
    fp->sbuf[0] = offset + sz[0].gstart*flen;
    /* is this the correct offset? */
    nbytes = PIiFormatData( fmat, flen, fp->sbuf + 1, v, n, datatype );
    (*writer)( fp, fp->sbuf, nbytes, offset );
    }
else {
    nn = flen;
    nd--;
    /* Here is the rule.  We need to compute two quantities:
       the offsets for a(gs0,gs1,gs2,...) (global offset) and
                       b(s0,s1,s2,...)    (local  offset).
       These are just 
            gs0 + mdim0 * (gs1 + mdim1 * (gs2 + mdim2 * ( ... ) ) )
       (and similarly for s0... ).
       But since we are doing the computation recursively, we just compute
       the offset for our index; the calls at the lower levels will add
       the offsets in for those positions.

       We also need to know the offsets for each hyperplane (since
       we do this recursively), we need the product mdim0*mdim1*...*mdim{nd-2}
     */
    ln2 = ln;
    for (j=0; j<nd; j++) {
	nn  *= sz[j].mdim;    /* size of hyperplane in file */
	ln2 *= sz[j].ndim;    /* size of hyperplane in array */
	}
    offset += sz[nd].gstart * nn;
    k = 0;
    for (j=sz[nd].start; j<=sz[nd].end; j++) {
	/* printf( "j = %d\n", j ); */
	PIiNdimFormat( fp, sz, nd, fmat, flen, offset + k*nn, 
		                  v + j*ln2, writer, datatype, ln );
	k++;
	}
    }
return;
}

void PIReadFormat(fp, sz, nd, fmat, flen, offset, v, reader, datatype, ln )
PIFILE      *fp;
PIArrayPart *sz;
char        *fmat;
int         nd, flen, offset, datatype, ln;
char        *v;
void        (*reader)();
{
int nbytes, n, k, j, nn, ln2;

if (nd == 1) {
    n =  sz[0].end - sz[0].start + 1;
    v += sz[0].start * ln;
    fp->sbuf[0] = offset + sz[0].gstart*flen;
    nbytes = flen * n ;
    (*reader)( fp, fp->sbuf, nbytes, fp->sbuf[0] );
    PIDecodeString( fmat, flen, fp->sbuf + 1, v, n, datatype );
    }
else {
    nn = flen;
    nd--;
    ln2 = ln;
    for (j=0; j<nd; j++) {
        nn  *= sz[j].mdim;    /* size of hyperplane in file */
        ln2 *= sz[j].ndim;    /* size of hyperplane in array */
        }

    offset += sz[nd].gstart * nn;

    k = 0;
    for (j=sz[nd].start; j<=sz[nd].end; j++) {
        /* printf( "j = %d\n", j ); */
        PIReadFormat( fp, sz, nd, fmat, flen, offset + k*nn,
                                  v + j*ln2, reader, datatype, ln );
        k++;
        }
    }
return;
}

int PIiDistSize( sz, nd, flen )
PIArrayPart *sz;
int         nd, flen;
{
int tot, i;

tot = 1;
for (i=0; i<nd; i++) tot *= sz[i].mdim;
return tot * flen;
}

/*
    Comments on Intel/NX CFS version

    CFS uses 4k blocks.  Use iread/iwrite to get individual blocks.
    Use lsize to force the file to be allocated before doing writing.
    Use IO mode 0
    IO nodes read 32K at a time.

    On the delta, the name of the CFS is /usr2/<username>/<filename>
    Larger block sizes (8k and larger) give better performance.  Note
    that the IO nodes also manage an IO cache.

    Here is an alternate scheme that has the advantage of 
    (a) using larger blocks for writing
    (b) using simpler termination logic
    
    The disadvantages are
    (a) more total data-traffic
    (b) more complicated logic on each node.

    This is the algorithm:
    The output/input is broken up into bufsize pieces.

    (for output)
    for (i=0; i<ceil(totalsize/bufsize); i++) {
        clear the buffer
        format the current buffer's worth of bytes
	(allow upto fmatlen's spill).

	When all elements that this node is contributing to this
	buffer are present, start a GIORhalf of it.
	The writer writes the data out.  The root node of the GIORhalf
	may be selected from a set of nodes so that multiple
	processors may be writing to the file.  A CFS implementation
	may choose to use 8k blocks and multiple nodes.
        }
    
    In this case, there isn't a WriterWait since the writes are done
    synchronously.
    
 */

int PIiMSGtoLen( datatype )
int datatype;
{
int size;
switch (datatype) {
    case MSG_INT: size = sizeof(int)    ; break;
    case MSG_LNG: size = sizeof(long)   ; break;
    case MSG_FLT: size = sizeof(float)  ; break;
    case MSG_DBL: size = sizeof(double) ; break;
    default: size = 1;
    }
return size;
}

/*
   This routine chnages the default parallel writers 
 */
void PIFSetWriters( writer, writerwait, writerflush, writercreate )
void (*writer)(), (*writerwait)(), (*writerflush)(), *(*writercreate)();
{
defwriter       = writer;
defwriterwait   = writerwait;
defwriterflush  = writerflush;
defwritercreate = writercreate;
}
