/* Source-identification string embedded in the object file (the "@(#)" prefix
   looks like the SCCS `what` marker); it is compiled out when `lint` is
   defined. */
#ifndef lint
static char SCCSid[] = "@(#) ./comm/csync.c 07/23/93";
#endif

/*
    This file contains routines to provide the raw material necessary
    for a posteriori adjustment of the clocks.

    A previous version of this generated events using alog or blog format.
    This version holds the information in internal buffers that may be
    accessed by anyone wanting to compute a global clock value from a
    local one (and not just at the time that the local value is 
    accessed; it may in fact be more accurate to correct the local
    times at the end of the execution).

    This version also includes new code that
    (1) Takes multiple observations, saving the one with the 
        shortest interval
    (2) Gives the "master" processor the offset time
 */
#include <stdio.h>
#include "tools.h"

/* Turn off all of the event logging */
#ifndef LOGCOMMDISABLE
#define LOGCOMMDISABLE
#endif

#include "comm/comm.h"

/* Maximum number of observations to make: PISync1 repeats its exchange this
   many times per call and keeps the observation with the shortest interval */
#define MAX_OBSERVATIONS 10

/* Forward references (the routines are defined later in this file) */
void PIInitOffsetEvents( );
void PIDumpOffsetEvents( );
int  PIComputeOffsets();

/* Role that a processor plays in a PISync1 exchange: the MASTER takes the
   a1/a2 time stamps and computes the offset, the SLAVE takes the b1 time
   stamp and sends it to the master. */
typedef enum { SLAVE, MASTER } PISyncNodeType;

/*
    PISync1 - routine to generate a pair-exchange used for computing
    clock offsets.  It returns, for the master, an estimate of the
    offset FROM the master of the SLAVE (we may want the negative of this).

    Note that by saving the values of adiff, we can estimate the
    accuracy of the synchronization.

    Both partners must call this routine, one as MASTER and one as SLAVE,
    with the same mtype.  The exchange is repeated MAX_OBSERVATIONS times.

    Input parameters:
    nbr          - the partner processor (destination of every send)
    ismaster     - MASTER or SLAVE: the role of the calling processor
    mtype        - message type used for every message of the exchange
    local_origin - local clock value that the times a1, a2 and b1 are
                   measured against (with SYuscDiff)

    Output parameter:
    Adiff        - if non-null, set to the smallest a1-to-a2 interval that
                   was observed.  On the slave no interval is measured, so
                   the initial value 1.e6 is stored.

    Returns:
    On the master, 0.5*(a1 + a2) - b1 for the observation with the smallest
    interval; on the slave, 0.
 */
double PISync1( nbr, ismaster, mtype, local_origin, Adiff )
int            nbr, mtype;
PISyncNodeType ismaster;
SYusc_time_t   *local_origin;
double         *Adiff;
{
int  *msgs, *msgr;
int  i, size;
SYusc_time_t T1, T2, *t1 = &T1, *t2 = &T2;
double a1, a2, b1, adiff;
double d1, oldsep;

/* The timing messages carry a single int; nothing in this routine sets or
   reads its value */
size = sizeof(int);
MSGALLOCSEND( msgs, size, int );
MSGALLOCRECV( msgr, size, int );
/* Set the initial values */
oldsep = 1.e6; /* Huge */
d1     = 0;
for (i=0; i<MAX_OBSERVATIONS; i++) {
    if (ismaster == MASTER) {
	/* First round trip; no time is taken until it has completed */
	SENDSYNC( mtype, msgs, size, nbr, MSG_INT );
	RECVSYNC( mtype, msgr, size, MSG_INT );
	SYusc_clock( t1 );  /* a1 */
	/* Second round trip; this is the one bracketed by a1 and a2 */
	SENDSYNC( mtype, msgs, size, nbr, MSG_INT );
	RECVSYNC( mtype, msgr, size, MSG_INT );
	SYusc_clock( t2 );  /* a2 */
	/* The slave's time stamp arrives as a double, after the timed part */
	RECVSYNCNOMEM( mtype, &b1, sizeof(double), MSG_DBL );
	/* presumably SYuscDiff( x, y ) is the elapsed time from x to y --
	   TODO confirm against its definition */
	adiff = SYuscDiff( t1, t2 );
	/* Keep only the observation with the shortest a1-to-a2 interval */
	if (adiff < oldsep) {
	    a1     = SYuscDiff( local_origin, t1 );
	    a2     = SYuscDiff( local_origin, t2 );
	    oldsep = adiff;
	    /* Midpoint of a1 and a2 (master clock) minus b1 (slave clock) */
	    d1     = 0.5 * (a1 + a2) - b1;
	    }
	}
    else {    
	/* Mirror image of the master's sequence of sends and receives */
	RECVSYNC( mtype, msgr, size, MSG_INT );
	SENDSYNC( mtype, msgs, size, nbr, MSG_INT );
	RECVSYNC( mtype, msgr, size, MSG_INT );
	SYusc_clock( t1 ); /* b1 */
	SENDSYNC( mtype, msgs, size, nbr, MSG_INT );
	/* Compute the time b1 in double and send that to the master */
	b1 = SYuscDiff( local_origin, t1 );
	SENDSYNCNOMEM( mtype, &b1, sizeof(double), nbr, MSG_DBL );
	}
    }
MSGFREESEND( msgs );
MSGFREERECV( msgr );

/* Note that slave always returns zero */
if (Adiff) *Adiff = oldsep;
return d1;
}


/* Here are the routines to gather up the offset events.  These are organized
   as three parallel arrays with one entry per processor (NUMNODES entries,
   allocated and zeroed by PIInitOffsetEvents):

   offsetevents[i] - the pair of processors that took part in the exchange
   offsettimes[i]  - the value returned by PISync1 on the master
   offsetintval[i] - the interval that PISync1 stored through Adiff

   An entry with p_master == p_slave (the zeroed state) holds no event;
   PIComputeOffsets counts the remaining entries into noffsets.
 */
typedef struct {
    int          p_master, p_slave;   /* between these processors.
					 We WANT p_slave > p_master (this
					 simplifies the conversion of 
					 individual events to offsets) */
    } OffsetEventsP;
static double        *offsettimes   = 0;   /* offsets measured by the master */
static double        *offsetintval  = 0;   /* shortest a1-to-a2 intervals */
static OffsetEventsP *offsetevents  = 0;   /* who exchanged with whom */
static int           noffsets       = 0;   /* set by PIComputeOffsets */

/*
    PIAddOffsetPair0 - add clock offset events for the "Dunnigan"
    method (all processors exchange with processor 0)

    Processor 0 acts as MASTER towards each of the processors
    1 ... NUMNODES-1 in turn and records the result in the slot indexed by
    the slave.  Every other processor acts as SLAVE towards processor 0.
    The exchange with processor i uses message type mtype + i.

    Input parameters:
    mtype        - base message type
    local_origin - local clock origin that is handed on to PISync1

    Returns 0.  (The routine used to rely on an implicit int return type
    and ran off its end without returning a value.)
 */
int PIAddOffsetPair0( mtype, local_origin )
int mtype;
SYusc_time_t   *local_origin;
{
int i;

/* (Re)allocate and zero the event arrays */
PIInitOffsetEvents( );

if (MYPROCID == 0) {
    for (i=1; i<NUMNODES; i++) {
        offsettimes[i]           = PISync1( i, MASTER, mtype + i,
                                            local_origin, offsetintval + i );
        offsetevents[i].p_master = 0;
        offsetevents[i].p_slave  = i;
        }
    }
else {
    /* The slave neither computes an offset nor needs the interval */
    (void) PISync1( 0, SLAVE, mtype + MYPROCID, local_origin, (double *)0 );
    }
return 0;
}

/*
   PIAddOffsetPair1 - use a tree to add offset events

   Each processor first acts as SLAVE towards its tree parent and then as
   MASTER towards its left and its right child (as reported by
   PISetTreeNodes).  The result for a child is recorded in the slot indexed
   by that child.  An exchange uses message type mtype + (rank of master).

   Input parameters:
   mtype        - base message type
   local_origin - local clock origin that is handed on to PISync1

   Returns 0.

   Note:  This could execute in 4 steps by using the fact that
   alternate levels of the tree can be running simultaneously.  For
   modest numbers of processors, this 2log(p) method won't be much
   slower, and is vastly easier to code.

   NOTE(review): parent, l_child and r_child are tested for non-zero, i.e.
   0 is treated as "no such node".  Confirm that this matches the
   convention of PISetTreeNodes, in particular for the processors whose
   parent is processor 0.
 */
int PIAddOffsetPair1( mtype, local_origin )
int mtype;
SYusc_time_t   *local_origin;
{
int l_child, r_child, parent, am_left;

/* (Re)allocate and zero the event arrays */
PIInitOffsetEvents( );

PISetTreeNodes( MYPROCID, NUMNODES, &l_child, &r_child, &parent, &am_left );

if (parent) {
    /* PISync1 takes five arguments; a slave passes a null Adiff.  Leaving
       the argument out lets PISync1 store through a garbage pointer. */
    (void) PISync1( parent, SLAVE, mtype + parent, local_origin,
                    (double *)0 );
    }
if (l_child) {
    offsettimes[l_child]           = PISync1( l_child, MASTER,
                                              mtype + MYPROCID, local_origin,
                                              offsetintval + l_child );
    offsetevents[l_child].p_master = MYPROCID;
    offsetevents[l_child].p_slave  = l_child;
    }
if (r_child) {
    offsettimes[r_child]           = PISync1( r_child, MASTER,
                                              mtype + MYPROCID, local_origin,
                                              offsetintval + r_child );
    offsetevents[r_child].p_master = MYPROCID;
    /* The slave of this event is the right child (was l_child) */
    offsetevents[r_child].p_slave  = r_child;
    }
return 0;
}

/* 
  PIAddOffsetPair2 - add offset events between neighboring processors

  This uses the "use neighbor" version: processor p is the MASTER of the
  exchange with processor p+1.  To parallelize it, the odd nodes first
  serve as masters and then as slaves, and the even nodes use the
  opposite order:

      step 1 (message type mtype)   : odd  p is master, even p+1 is slave
      step 2 (message type mtype+1) : even p is master, odd  p+1 is slave

  As in the other PIAddOffsetPair routines the event is recorded by the
  master in the slot indexed by the slave, so that p_slave > p_master and
  slot 0 stays empty (which is what PIComputeTimeOffsets expects).

  Previously both partners of a pair selected the same role (with different
  message types), so the exchange could not complete, and the slave calls
  to PISync1 left out the Adiff argument.

  Input parameters:
  mtype        - base message type (mtype and mtype + 1 are used)
  local_origin - local clock origin that is handed on to PISync1

  Returns 0.
 */
int PIAddOffsetPair2( mtype, local_origin )
int mtype;
SYusc_time_t   *local_origin;
{
int lnbr, rnbr;

/* (Re)allocate and zero the event arrays */
PIInitOffsetEvents( );

lnbr = MYPROCID - 1;
rnbr = MYPROCID + 1;
if (MYPROCID & 0x1) {
    /* Odd node: master of the right neighbor, then slave of the left one */
    if (rnbr < NUMNODES) {
        offsettimes[rnbr]           = PISync1( rnbr, MASTER, mtype,
                                               local_origin,
                                               offsetintval + rnbr );
        offsetevents[rnbr].p_master = MYPROCID;
        offsetevents[rnbr].p_slave  = rnbr;
        }
    if (lnbr >= 0)
        (void) PISync1( lnbr, SLAVE, mtype + 1, local_origin, (double *)0 );
    }
else {
    /* Even node: slave of the left neighbor, then master of the right one */
    if (lnbr >= 0)
        (void) PISync1( lnbr, SLAVE, mtype, local_origin, (double *)0 );
    if (rnbr < NUMNODES) {
        offsettimes[rnbr]           = PISync1( rnbr, MASTER, mtype + 1,
                                               local_origin,
                                               offsetintval + rnbr );
        offsetevents[rnbr].p_master = MYPROCID;
        offsetevents[rnbr].p_slave  = rnbr;
        }
    }
return 0;
}

/*
    PIInitOffsetEvents - allocate the three event arrays (NUMNODES entries
    each) and clear every entry.

    The file-level pointers are overwritten without releasing what they
    pointed to before.  The routine stops at the first allocation that
    CHKPTR rejects.
 */
void PIInitOffsetEvents( )
{
int nproc, slot;

nproc = NUMNODES;

offsetevents = (OffsetEventsP *)MALLOC( nproc * sizeof(OffsetEventsP) );
CHKPTR(offsetevents);
offsettimes  = (double *)MALLOC( nproc * sizeof(double) );
CHKPTR(offsettimes);
offsetintval = (double *)MALLOC( nproc * sizeof(double) );
CHKPTR(offsetintval);

/* A cleared entry (master == slave == 0) stands for "no event" */
slot = 0;
while (slot < nproc) {
    offsettimes[slot]           = 0.0;
    offsetintval[slot]          = 0.0;
    offsetevents[slot].p_master = 0;
    offsetevents[slot].p_slave  = 0;
    slot++;
    }
}

/*
    PIComputeOffsets - combine the offset events that the individual
    processors recorded, count them, and (on processor 0) print them.

    The event pairs and the offset times are combined with GIOR over
    ALLPROCS (presumably a global integer OR -- confirm against its
    definition; it works here because an entry is non-zero on at most the
    one processor that recorded it).  offsetintval is not combined.

    Returns the number of entries with p_master != p_slave, which is also
    stored in noffsets; returns 0 if a work array cannot be allocated.
 */
int PIComputeOffsets( )
{
int *work, *iwork, np, i, cnt;

np     = NUMNODES;

/* Integer work space for GIOR.  It is sized for the event structures (at
   least two ints per entry) and is reused below for the np converted
   times.  (The times used to be combined with a double work array.) */
work   = (int *)MALLOC( np * sizeof(OffsetEventsP) );   CHKPTRV(work,0);

/* Combine the event structures */
GIOR( (int *)offsetevents, 
      (np * sizeof(OffsetEventsP)) / sizeof(int), work, ALLPROCS );

/* I need a way to collect non-zero doubles (something like a GDMERGE of
   offsettimes).  For now, convert to microseconds and use those.  Note
   that the times generated are differences from (roughly) when each
   process started; a 32-bit int holds about +/- 2147 seconds worth of
   microseconds, and the conversion drops fractions of a microsecond. */
iwork  = (int *)MALLOC( np * sizeof(int) );
if (!iwork) {
    /* Do not leak the GIOR work space on the error return below */
    FREE( work );
    }
CHKPTRV(iwork,0);
for (i=0; i<np; i++) 
    iwork[i] = (int)(1.0e6 * offsettimes[i]);
GIOR( iwork, np, work, ALLPROCS );
for (i=0; i<np; i++) 
    offsettimes[i] = 1.0e-6 * iwork[i];
FREE( iwork );
FREE( work );

/* Count the number of non-zero offset records */
cnt = 0;
for (i=0; i<np; i++) {
    if (offsetevents[i].p_master != offsetevents[i].p_slave)
        cnt++;
    }
noffsets = cnt;

if (MYPROCID == 0) {PIDumpOffsetEvents( ); fflush(stdout);}

return noffsets;
}

/*
    This routine takes offset events and solves for the offsets.  The
    approach is described in the paper "A Posteriori clock synchronization
    on distributed processors".

    Let the global time be given by (local_time - offset)*scale ,
    with a different offset and scale on each processor.  Each processor
    originates exactly one communication event (except processor 0),
    generating an a1 and a2 event.  A corresponding number of b1 events
    are generated, but note that one processor may have more than 1 b1
    event (if using Dunnigan's synchronization, there will be np-1 b1 events
    on processor 0, and none anywhere else).

    These events are:

   pi   a1 (send to nbr)                        (recv) a2
   pj                     (recv) b1 (send back)

    We base the analysis on the assumption that in the GLOBAL time
    representation, a2-a1 is twice the time to do a (send) and
    a (recv).  This is equivalent to assuming that global((a1+a2)/2) ==
    global(b1).  Then, with the unknowns the offsets (the scales
    are assumed known from the syncevent calculation), the matrix is

    1
    -s0 s1
       ....
       -sj ... si

    where si is the scale for the i'th processor (note s0 = 1).
    The right hand sides are (1/2)(a1(i)+a2(i)) *s(i) - b1(j)*s(j).
    Because of the triangular nature of the matrix, this reduces to

       o(i) = (a1(i)+a2(i))/2 - (s(j)/s(i)) * (b1(j)-o(j))

    Note that if s(i)==s(j) and b1 == (a1+a2)/2, this gives o(i)==o(j).

    This works with ANY triangular matrix; we can use a master-slave
    version (all exchange with processor 0), a log-tree version
    (everyone exchanges with binary tree parent), or a linear version
    (2p+1 exchanges with 2p).  Others are possible.    

    This returns the offset for the calling processor only (through goff);
    the skew argument is currently not referenced.
 */
/*
    PIComputeTimeOffsets - turn the combined offset events into the offset
    of the calling processor.

    Input parameters:
    np   - number of processors; exactly np - 1 events must have been
           counted by PIComputeOffsets (assumes np == NUMNODES, the length
           of the event arrays -- TODO confirm with the callers)
    skew - not referenced

    Input/output parameter:
    goff - on entry, the offset of processor 0; on exit, the offset of the
           calling processor, or 0.0 if the number of events is not np - 1

    NOTE(review): the offset of a slave is formed as (offset of master) +
    offsettimes[i], while the PISync1 comments say the negative of its
    result may be wanted -- confirm the sign convention.
 */
void PIComputeTimeOffsets( np, skew, goff )
int    np;
double *skew, *goff;
{
int    i, j, k;
double *globaloffset;

/* If there aren't enough events, return */
if (noffsets != np - 1) {
    printf( "[%d] found %d offsets\n", MYPROCID, noffsets );
    *goff = 0.0;
    return;
    }

globaloffset = (double *)MALLOC( np * sizeof(double) );   CHKPTR(globaloffset);
for (i=0; i<np; i++)
    globaloffset[i] = 0.0;
globaloffset[0] = *goff;
/* We assume that the matrix is lower triangular here. Also, the 0th entry is
   master to master and is assumed to have offsettimes[0] == 0.  */
for (i=1; i<np; i++) {
    j     = offsetevents[i].p_slave;
    k     = offsetevents[i].p_master;
    /* Require 0 <= k < j < np: the master must come before the slave, and
       both are used as indices into globaloffset */
    if (k < 0 || j >= np || j <= k) {
        SETERRC(1,"Error in clock synchronization events" );
        break;
        }
    globaloffset[j] = globaloffset[k] + offsettimes[i];
    }
*goff = globaloffset[MYPROCID];

printf( "[%d] final goff = %f\n", MYPROCID, *goff ); fflush(stdout);

FREE( globaloffset );
}

/* for debugging */
void PIDumpOffsetEvents( )
{
int i;
for (i=0; i<NUMNODES; i++) {
    if (offsetevents[i].p_master >= 0) 
	fprintf( stdout, "[%f] with [%d] from [%d] with error bound [%f]\n", 
		 offsettimes[i], 
		 offsetevents[i].p_master, offsetevents[i].p_slave,
		 offsetintval[i] );
    }
}

void PIOffsetFree( )
{
FREE( offsetevents );
FREE( offsettimes );
FREE( offsetintval );
}
