#include <stdio.h>
#include <math.h>

#include "easy.h"

/* Message tag used when dorest() broadcasts the {lsize, sqrt_nproc} pair */
#define msgtag_ARGS 1

/* Message tags for the sub-blocks of the A, B and C matrices respectively:
   A and B blocks are sent out under msgtag_A / msgtag_B, and the result
   blocks are received under msgtag_C. */
#define msgtag_A 100
#define msgtag_B 200
#define msgtag_C 300


/* Helpers defined elsewhere.  matmul() calls createndf() to obtain an open
   FILE and a file name, appends one entry with appendndf(), closes the file
   and hands the name to createproc(); the name is afterwards passed to
   remove() and free(). */
extern char *createndf(FILE **fp, char *file, int nx, int ny, int nz);
extern int appendndf(FILE *fp, int startproc, int endproc, char *machine,
		     char *a_out, char *all_args);

/* Multiply routine defined elsewhere; matmul() calls it directly (with
   update = 0) when no node processes are requested. */
extern void mm(double *C, double *A, double *B, int lsize, int update);


/*
 * dorest -- distribute sub-blocks of A and B to the nodes and collect the
 *           corresponding sub-blocks of C.
 *
 *   C, A, B     matrices stored with row length/leading dimension msize
 *               (msize is what is handed to send2d/recv2d as the last
 *               argument -- presumably the leading dimension; confirm
 *               against easy.h)
 *   msize       dimension of the full matrices
 *   nproc       number of node processes
 *   lsize       dimension of one square sub-block
 *   sqrt_nproc  number of sub-blocks along each matrix dimension
 *   newcase     non-zero when the nodes were just started; when zero the
 *               {lsize, sqrt_nproc} pair is broadcast to the already
 *               running nodes first
 *
 * The current data type and stride settings are saved on entry and
 * restored before returning.  Allocation or probe failures terminate the
 * program with exit(1).
 */
static void dorest(double *C, double *A, double *B, 
		   int msize, int nproc,
		   int lsize, int sqrt_nproc,
		   int newcase)
{
  int x,y;
  double **ap, **bp, **cp;
  int oldtype, oldstrd;
  int k;
  
  oldtype = setdatatype(INTEGER4);
  oldstrd = setstride(1);

  if (!newcase) {
    /* Nodes are already running: tell them the block geometry for this job */
    int args[2];
    
    args[0] = lsize;
    args[1] = sqrt_nproc;

    broadcast(msgtag_ARGS,args,2);
  }

  /* NOTE(review): calloc/free/exit need a prototype from <stdlib.h>; this
     file does not include it directly -- confirm that easy.h does. */
  ap = (double **)calloc(nproc , sizeof(double *));
  bp = (double **)calloc(nproc , sizeof(double *));
  cp = (double **)calloc(nproc , sizeof(double *));

  /* Without this check a failed allocation would be dereferenced below */
  if (!ap || !bp || !cp) {
    fprintf(stderr,
	    "dorest: Unable to allocate block pointer tables\n");
    exit(1);
  }
  
  /* Start indices for matrix sub-blocks to be transferred.
     Node k receives the block at block-row x, block-column y, where y runs
     from the last block-column down to the first and x runs upwards. */
  
  k = 0;
  for (y=sqrt_nproc-1; y>=0; y--)
    for (x=0; x<sqrt_nproc; x++) {
      int index = x*lsize*msize + y*lsize;
      ap[k] = &A[index];
      bp[k] = &B[index];
      cp[k] = &C[index];
      k++;
    }
  
  setdatatype(REAL8);

  /* Send appropriate 2D-blocks of A & B matrices to corresponding nodes */
  
  for (k=0; k<nproc; k++) 
    send2d(k,msgtag_A,ap[k], lsize, lsize, msize);
  
  for (k=0; k<nproc; k++)
    send2d(k,msgtag_B,bp[k], lsize, lsize, msize);
  
  /* Wait for reply: one C block from every node, in whatever order
     they arrive */

  k = 0;
  while (k < nproc) {
    int source = NOCARE;
    int msgtag = msgtag_C;
    int bufid;
    
    bufid = probe(&source, &msgtag); /* Snoop the sender first */
    
    if (bufid > 0) {
      if (msgtag == msgtag_C && (source >=0 && source < nproc)) {
	/* The sender's id selects where its block lands in C */
	recv2d(source,msgtag_C,cp[source],lsize, lsize, msize);
	if (k==0) 
	  printf("dorest: C(%d",source);
	else
	  printf(",%d",source);
	k++;
	if (k==nproc) printf(")\n");
      }
      else {
	/* NOTE(review): the unexpected message is only reported, not
	   received here; if probe() keeps returning it this loop will
	   not make progress -- confirm against probe() semantics. */
	printf("dorest: ??? bufid, source, msgtag = %d, %d, %d ???\n",
	       bufid, source, msgtag);
      }
    }
    else {
      pvm_perror("dorest@probe");
      exit(1);
    }
  } /* while (k < nproc) */
  
  free(ap);
  free(bp);
  free(cp);

#ifdef WAITALL
  {
    extern int waitall_verbose;
    waitall_verbose = 0; /* Set this to zero if no verbose is required */
    /* waitall(NOCARE); */
  }
#endif
  
  setdatatype(oldtype);
  setstride(oldstrd);
}



/*
 * matmul -- compute C = A * B, either on the host or on a set of node
 *           processes.
 *
 *   C, A, B    msize x msize matrices (C receives the product)
 *   msize      matrix dimension
 *   nproc      number of node processes to use; a value < 1 makes the
 *              host do the whole multiplication through mm()
 *   nodeprog   node program, passed to appendndf() as the executable
 *   machine    where to run the nodes: NULL is treated as "*"; a string
 *              starting with '$' names an environment variable whose
 *              value is used instead ("*" if it is not set); anything
 *              else is passed through unchanged
 *
 * The node count and cluster id of the last start-up are remembered in
 * static variables.  When nproc differs from the remembered count, the
 * old cluster (if any) is killed and new nodes are started; otherwise the
 * running nodes are reused.  The caller's cluster id is restored before
 * returning.  Failure to create the ndf file or to start the processes
 * terminates the program with exit(1).
 */
void matmul(double *C, double *A, double *B,
	    int msize, int nproc, char *nodeprog,
	    char *machine)
{
  static int cur_nproc = 0;
  static int cur_cluster = -1;
  int save_cluster = getcluster();
  int newcase = 1;

  if (nproc < 1) { /* Host does the job */
    int update = 0;

    mm(C,A,B,msize,update);

    return;

  }
  else if (cur_nproc != nproc) { /* New nodes are needed */
    int sqrt_nproc = sqrt(nproc);
    int lsize = msize / sqrt_nproc;
    char *ndffile;
    FILE *fp = NULL;
    char args[30]; /* holds two decimal ints and a blank */

    if (cur_nproc > 0 && cur_cluster >= 0) {
      /* Shut down old cluster */
      killcluster(cur_cluster);
    }

    ndffile = createndf(&fp,NULL,sqrt_nproc,sqrt_nproc,1);
    /* Both results are used unconditionally below (appendndf, fclose,
       createproc, remove), so a NULL from either must stop us here.
       NOTE(review): assumes createndf() signals failure through a NULL
       return value and/or a NULL *fp -- confirm. */
    if (!ndffile || !fp) {
      fprintf(stderr,
	      "matmul: Unable to create ndf file\n");
      exit(1);
    }

    sprintf(args,"%d %d",lsize,sqrt_nproc);
    {
      char *mach = machine ? machine : "*";
      if (mach[0] == '$') {
	/* Prototype matches ISO C, so it stays valid whether or not
	   <stdlib.h> is visible in this translation unit */
	extern char *getenv(const char *);
	mach = getenv(mach+1);
	if (!mach) mach = "*";
      }
      appendndf(fp,0,nproc-1,mach,nodeprog,args);
    }
    fclose(fp);
    
    if (createproc(ndffile) <= 0) {
      fprintf(stderr,
	      "matmul: Unable to start proper number of processes\n");
      exit(1);
    }

    remove(ndffile);
    free(ndffile);

    pvm_setopt(PvmRoute,PvmRouteDirect);

    cur_nproc = numnodes();
    cur_cluster = getcluster();
  }
  else { /* Nodes are still running */
    newcase = 0;
  }

  {
    /* Geometry is recomputed from the node count actually in use */
    int sqrt_nproc = sqrt(cur_nproc);
    int lsize = msize / sqrt_nproc;

    setcluster(cur_cluster);
    printf("matmul: Using cluster id = %d\n",getcluster());
    dorest(C,A,B,msize,cur_nproc,lsize,sqrt_nproc,newcase);
  }

  setcluster(save_cluster);

  /* 
    After this:
    - C contains solution of product A * B
    - A & B matrices remained unchanged
    - Node programs are still running and are waiting for new job
    - Data types, strides & cluster ids are restored

    If another call with exactly the same number of procs is made, 
    then node programs are NOT restarted, since they already wait
    for data to calculate.

   */

}




