#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <savant.h>
#include <savutil.h>
#include <savantio.h>

/* Helpers private to this file; all are defined further down. */
static int start_new_chunk(int);
static int finish_current_chunk(void);
static int write_global_tree(void);
static int accum_dvmags_from_wvtree_node_tfidf(WV_Tree *, DB_FLOAT *);

/* Output streams of the chunk currently being built.  They are opened
   by start_new_chunk() and closed by finish_current_chunk(); between
   chunks the pointers are stale and must not be used. */
FILE *WORDVEC_FILE, *WVOFF_FILE, *DVMAG_FILE, *DATE_FILE, *WMAP2_FILE;

/* Globals needed by docvec-to-wordvec conversion.  All of them are
   reset by start_new_chunk() at the beginning of every chunk. */
WV_Tree *Global_Tree;     /* root of the in-memory word-vector tree;
                             written out and freed by write_global_tree() */
DB_INT Global_Num_Words;  /* written as the header of WORDVEC_FILE */
DB_INT Global_Num_Dates;  /* written as the header of DATE_FILE */
DB_INT Global_Num_Docs;   /* I think Global_Num_Docs and Current_IDnum are */
int Current_IDnum;        /* the same thing for RA, maybe not for Yenta */
DB_INT Global_WVoffset;   /* set to sizeof(DB_INT) by write_global_tree(),
                             i.e. just past the word-count header */

/* Globals needed by auto-chunking */
#define MEM_LIMIT 16777216  /* 16 megs */
/* Compared against MEM_LIMIT in save_dv() and zeroed by init_write()
   and finish_current_chunk().  Nothing in this file increases it;
   presumably the tree-insertion code elsewhere does -- TODO confirm. */
unsigned int Memory_Usage;
/* Index of the chunk being built (or of the next one to be opened). */
int Current_Chunk_Num;
/* Non-zero while the chunk files are open.  Relies on the implicit
   zero-initialisation of file-scope objects for its initial value. */
int Current_Chunk_Is_Open;
/* Private copy of the target database directory, made by init_write(). */
char *Database_Path;

int 
init_write(char *db_path)
{
  /* init_write() is one of the original db-lib API interfaces.  It
     initiates building a database in the directory db_path. 
  */
  Database_Path = strdup(db_path); /* for posterity */
  Current_Chunk_Num = 0;
  Memory_Usage = 0;
  
  return(0);
}

int 
save_dv(DV_Tree *dvtree, int doc_id)
{
  /* save_dv() is one of the interfaces of the original db-lib API.
     It takes a DV_Tree (since binary tree format is
     vectorize_buffer()'s canonical output format), converts it to a
     DenseDocVec and hands that to wvtree_insert_ddv(), which folds it
     into the in-memory word-vector tree.  The document id is then
     appended to WMAP2_FILE.  A new "chunk" is initialized first if
     none is open, and the current chunk is finalized afterwards once
     Memory_Usage has reached MEM_LIMIT.

     Returns 0 if the chunk is still open, the (positive) number of
     chunks completed so far if this call closed one, or -1 if a new
     chunk could not be started.  */

  DenseDocVec *ddv = dvtree_to_ddv(dvtree);
  /* Write from an object that really has type DB_INT: the record is
     sizeof(DB_INT) bytes long, which need not equal sizeof(int). */
  DB_INT doc_id_out = (DB_INT)doc_id;
  
  if(! Current_Chunk_Is_Open) {
    if(start_new_chunk(Current_Chunk_Num) != 0) {
      /* No chunk files to write to; do not mark the chunk as open. */
      destroy_ddv(ddv);
      return(-1);
    }
    Current_Chunk_Is_Open = 1;
  }

  wvtree_insert_ddv(ddv);
  destroy_ddv(ddv);
  /* NOTE(review): every other fwrite_big() call in this file passes
     (ptr, element size, count, file); here size and count look
     swapped.  The byte count is the same, but if fwrite_big()
     byte-swaps per element the on-disk order differs -- confirm
     against fwrite_big() and the WMAP2 reader before changing. */
  fwrite_big(&doc_id_out, 1, sizeof(DB_INT), WMAP2_FILE);
  
  if(Memory_Usage >= MEM_LIMIT) {
    finish_current_chunk();
    Current_Chunk_Is_Open = 0;
    return(++Current_Chunk_Num);
  }
  else {
    return(0);
  }
}

int
finalize_write(void)
{
  /* finalize_write() is also from the original API; It closes the
     current chunk if one is open and then merges all the chunks.

     The chunk directories <Database_Path>/tempNNN are handed to
     merge_databases() argv-style (slot 0 unused, last slot is the
     target Database_Path); afterwards the per-chunk files and
     directories are removed.

     Returns 0 on success, or -1 if init_write() was never called, the
     chunk counter is inconsistent, or memory ran out.  If memory runs
     out only after the merge, the merged database is complete but the
     temporary chunk directories are left behind.
     */

  char **db_paths, *db_path0="first entry not used"; 
  char *temp_fname;
  char *db_fnames[] = {WORDVEC_FNAME, WVOFF_FNAME, DVMAG_FNAME, 
		       DATE_FNAME, WMAP2_FNAME}; 
  const int num_fnames = (int)(sizeof(db_fnames) / sizeof(db_fnames[0]));
  size_t dir_size, max_fname_len, fname_len;
  int argc, i, j;
  int status = 0;
  
  if(Database_Path == NULL) {
    return(-1);
  }

  if(Current_Chunk_Is_Open) {
    finish_current_chunk();
    /* The streams are closed now; do not leave the flag claiming
       otherwise for whoever calls into this module next. */
    Current_Chunk_Is_Open = 0;
  }
  else {
    Current_Chunk_Num--;
  }
  
  argc = Current_Chunk_Num + 3;  
  /* (Current_Chunk_Num+1) sources, one target, one unused */
  if(argc < 2) {
    /* would index outside db_paths below */
    return(-1);
  }
  
  /* Room for "<Database_Path>/temp<int>\0".  3*sizeof(int) bounds the
     number of decimal digits of any int (sign included), so the
     buffer stays large enough even when "%03d" prints more than
     three digits. */
  dir_size = strlen(Database_Path) + sizeof("/temp") + 3 * sizeof(int) + 1;

  db_paths = (char **)malloc(argc*sizeof(char *));
  if(db_paths == NULL) {
    return(-1);
  }
  db_paths[0] = db_path0;
  for(i=1; i < argc-1; i++) {
    db_paths[i] = (char *)malloc(dir_size);
    if(db_paths[i] == NULL) {
      while(--i >= 1) {
	free(db_paths[i]);
      }
      free(db_paths);
      return(-1);
    }
    sprintf(db_paths[i], "%s/temp%03d", Database_Path, i-1);
  }
  db_paths[argc-1] = Database_Path;
  
  merge_databases(argc, db_paths);
  
  /* Scratch buffer for "<chunk dir>/<file name>\0", sized for the
     longest of the per-chunk file names. */
  max_fname_len = 0;
  for(j=0; j<num_fnames; j++) {
    fname_len = strlen(db_fnames[j]);
    if(fname_len > max_fname_len) {
      max_fname_len = fname_len;
    }
  }
  temp_fname = (char *)malloc(dir_size + 1 + max_fname_len);
  if(temp_fname == NULL) {
    status = -1;   /* cannot clean up, but still release the paths */
  }

  for(i=1; i<argc-1; i++) {
    if(temp_fname != NULL) {
      for(j=0; j<num_fnames; j++) {
	strcpy(temp_fname, db_paths[i]);
	strcat(temp_fname, "/");
	strcat(temp_fname, db_fnames[j]);
	unlink(temp_fname);
      }
      rmdir(db_paths[i]);
    }
    free(db_paths[i]); 
  }
  
  /* db_paths[0] is a literal and db_paths[argc-1] is Database_Path;
     neither is owned by this function. */
  free(db_paths);
  free(temp_fname);
  
  return(status);
}

static int
start_new_chunk(int idnum)
{
  /* start_new_chunk() takes a number and creates a temp subdirectory
     of Database_Path (named "temp" followed by the zero-padded
     number) with the appropriate wordvec-related files.  It also
     (re)initializes the global variables Global_Tree,
     Global_Num_Words, Global_Num_Dates, Global_Num_Docs,
     Current_IDnum and Global_WVoffset.

     Returns 0 on success, -1 if the path buffer cannot be allocated
     (nothing is created or reset in that case).
     */

  char *temp_dbpath;
  size_t path_size;
  
  /* Room for "<Database_Path>/temp<int>\0".  3*sizeof(int) bounds the
     number of decimal digits of any int (sign included), so the
     buffer stays large enough even when "%03d" prints more than
     three digits. */
  path_size = strlen(Database_Path) + sizeof("/temp") + 3 * sizeof(int) + 1;
  temp_dbpath = (char *)malloc(path_size);
  if(temp_dbpath == NULL) {
    fprintf(stderr, "start_new_chunk: out of memory\n");
    return(-1);
  }
  sprintf(temp_dbpath, "%s/temp%03d", Database_Path, idnum);
  /* The result is deliberately not checked: the directory may already
     exist, and any other failure presumably surfaces when
     open_or_die() cannot create the files below. */
  mkdir(temp_dbpath, S_IRWXU);
  
  WORDVEC_FILE = open_or_die(temp_dbpath, WORDVEC_FNAME, "w");
  WVOFF_FILE = open_or_die(temp_dbpath, WVOFF_FNAME, "w");
  DVMAG_FILE = open_or_die(temp_dbpath, DVMAG_FNAME, "w");
  DATE_FILE = open_or_die(temp_dbpath, DATE_FNAME, "w");
  WMAP2_FILE = open_or_die(temp_dbpath, WMAP2_FNAME, "w");

  free(temp_dbpath);
  
  /* Every chunk starts from an empty tree and zeroed counters. */
  Global_Tree = NULL;
  Global_Num_Words = 0;
  Global_Num_Dates = 0;
  Global_Num_Docs = 0;
  Current_IDnum = 0;
  Global_WVoffset = 0;
  
  if(SavantDebug) {
    /* report the chunk that was actually opened */
    fprintf(stderr, "Opened chunk %d\n", idnum);
    fflush(stderr);
  }
  
  return(0);
}

static int
finish_current_chunk(void)
{
  /* finish_current_chunk() writes the current Global_Tree, resets
     Memory_Usage and closes the database files in the temporary subdir. 
     */

  write_global_tree(); 
  Memory_Usage = 0;
  
  fclose(WORDVEC_FILE);
  fclose(WVOFF_FILE);
  fclose(DVMAG_FILE);
  fclose(DATE_FILE);
  fclose(WMAP2_FILE);
  
  if(SavantDebug) {
    fprintf(stderr, "Closed chunk %d\n", Current_Chunk_Num);
    fflush(stderr);
  }
  
  return(0);
}


static int
write_global_tree(void)
{
  /* write_global_tree() is the procedure in charge of writing to
     disk.  It does an in-order traversal of the tree, employing
     write_wvtree_node() to do the grunt work of writing the data, and
     also accumulates TFiDF norm contributions using
     accum_dvmags_from_wvtree_node_tfidf().  Come to think of it,
     there is no reason to do that here, since the mergewv.c routines
     throw that info away and recompute what they need from scratch...
     write_global_tree() will also free up all the allocated memory
     and leaves Global_Tree set to NULL.

     Returns the number of words (Global_Num_Words) on success, or -1
     if the magnitude array cannot be allocated; in that case nothing
     is written and the tree is left untouched.
     */

  WV_Tree *treeptr = Global_Tree, *treetmp;
  WV_List *list_prev, *list;
  DB_FLOAT *magnitude_array;  /* NUM_FIELD_TYPES * NumDocs Long */
  int i;
  
  magnitude_array = (DB_FLOAT *)malloc(sizeof(DB_FLOAT) * 
				       NUM_FIELD_TYPES * Global_Num_Docs);
  /* malloc(0) may legitimately return NULL, so only treat NULL as a
     failure when at least one element was requested. */
  if((magnitude_array == NULL) && ((NUM_FIELD_TYPES * Global_Num_Docs) > 0)) {
    fprintf(stderr, "write_global_tree: out of memory\n");
    return(-1);
  }
  for (i=0; i < (NUM_FIELD_TYPES * Global_Num_Docs); i++) {
    magnitude_array[i] = (DB_FLOAT)0.0;
  }
  
  /* Each file starts with its element count. */
  fwrite_big(&Global_Num_Words, sizeof(DB_INT), 1, WORDVEC_FILE);
  fwrite_big(&Global_Num_Dates, sizeof(DB_INT), 1, DATE_FILE);
  Global_WVoffset = sizeof(DB_INT);  /* just past the count header */
  
  /* The plan: if the top node of the tree has no left branch, then it
     is also the "first" node of the tree, and so we write it out,
     kill it and move on.  If not, then rearrange the tree (tack the
     top node onto the rightmost leaf of its left branch) and try
     again.  This visits the nodes in order without recursion or an
     explicit stack. */
  
  while (treeptr != NULL) {
    if (treeptr->left == NULL) {
      write_wvtree_node(treeptr);
      /* %%% Don't need to do this here, only in merge */
      accum_dvmags_from_wvtree_node_tfidf(treeptr, magnitude_array);
      /* kill the wvlist for this node... */
      /* %%% I would like to eventually replace crap like this everywhere in the code
	 %%% with standardized constructors/selectors/mutators/destructors... */
      list = treeptr->wvlist;
      list_prev = NULL;
      while(list != NULL) {
	list_prev = list;
	list = list->next;   /* step first; list_prev is about to go away */
	free(list_prev);
      }
      /* then destroy the node itself and move along */
      treetmp = treeptr->right;
      free(treeptr);
      treeptr = treetmp;
    }
    else {         /* not found the end yet */
      /* make treetmp the rightmost leaf of treeptr's left branch: */
      treetmp = treeptr->left;
      while(treetmp->right != NULL) {
	treetmp = treetmp->right;
      }
      treetmp->right = treeptr;  /* stick treeptr onto this */
      treeptr = treeptr->left; /* the left branch is now the main tree */
      treetmp->right->left = NULL; /* erase this connection */
    }
  }

  /* Every node has been freed; do not leave the root dangling. */
  Global_Tree = NULL;
  
  /* write the DV_Mags too */
  /* %%% Don't need to do this here, only in merge */
  for (i=0; i < (Global_Num_Docs * NUM_FIELD_TYPES); i++) {
    /* the accumulated values are sums of squares; store the norms */
    magnitude_array[i] = (DB_FLOAT)sqrt(magnitude_array[i]);
  }
  fwrite_big(&Global_Num_Docs, sizeof(DB_INT), 1, DVMAG_FILE); 
  fwrite_big(magnitude_array, sizeof(DB_FLOAT), 
	     (NUM_FIELD_TYPES * Global_Num_Docs), DVMAG_FILE);
  free(magnitude_array);
  
  return((int)Global_Num_Words);
}

static int
accum_dvmags_from_wvtree_node_tfidf(WV_Tree *node, 
				    DB_FLOAT *mag_array)
{
  /* Adds this word's contribution to the per-document, per-field
     squared magnitudes held in mag_array.  mag_array is indexed as
     (document number * NUM_FIELD_TYPES) + field type; the field type
     comes from the first element of the node's wordcode.

     For every (document, frequency) entry on the node's wvlist:
       - DATE_FIELD / TIME_FIELD words add  tf * tf * log(N)
       - all other words add  tf * tf * log(N / n) * log(N / n)
     where tf is the raw term frequency, N is Global_Num_Docs and n is
     node->num_entries.

     NOTE(review): the date/time case multiplies by the log once while
     the general case squares it -- confirm that this is intended.

     Always returns 0.  */
  enum Field_Types field = word_type(node->wordcode[0]);
  /* dates and times get special treatment because a date is only
     assigned to one doc by definition */
  int one_doc_field = ((field == DATE_FIELD) || (field == TIME_FIELD));
  WV_List *entry;
  int doc, tf, slot;

  for(entry = node->wvlist; entry != NULL; entry = entry->next) {
    doc = WV_DVNUM(entry->docweight);
    tf = WV_DVFREQ(entry->docweight);   /* raw term frequency */
    slot = (doc * NUM_FIELD_TYPES) + (int)field;

    if(one_doc_field) {
      mag_array[slot] +=
	(DB_FLOAT)(tf * tf * log(Global_Num_Docs));
    }
    else {
      mag_array[slot] +=
	((DB_FLOAT)(tf * tf) * 
	 log((float)Global_Num_Docs / (float)node->num_entries) *
	 log((float)Global_Num_Docs / (float)node->num_entries));
    }
  }

  return(0);  /* temporary... make this return something useful */
}
