#include "savant.h"
#include "savutil.h"
#include "savantio.h"
#include "collect.h"
#include "yenta-savant.h"
#include "template.h"

#include <stdio.h>
#include <malloc.h>
#define min(a, b) (((a) < (b)) ? (a) : (b))

#define MAXCHUNKSIZE 4096

/* Allocates an empty collection with room for 16 documents and a private
 * copy of colname. Returns NULL, with nothing leaked, if any allocation
 * fails. */
static DDV_coll *alloc_named_coll(const char *colname)
{
  DDV_coll *result = (DDV_coll *)malloc(sizeof(DDV_coll));

  if (result == NULL)
    return NULL;

  result->items = (DenseDocVec **)malloc(sizeof(DenseDocVec *) * 16);
  result->colname = strdup(colname);
  if (result->items == NULL || result->colname == NULL)
    {
      free(result->colname);
      free(result->items);
      free(result);
      return NULL;
    }
  result->length = 0;
  result->mallength = 16;
  return result;
}

DDV_coll *load_coll(char *colname)
/* Opens the collection stored in the file named colname.
 * If the file cannot be opened for reading it is treated as a new
 * collection: an empty file is created and an empty collection is returned.
 * Otherwise documents are read with load_ddv() until it returns NULL, and
 * each one is appended with coll_add_doc().
 * Returns NULL if the file can be neither read nor created, or if memory
 * for the collection cannot be allocated. */
{
  FILE *colfile;
  DenseDocVec *doc;
  DDV_coll *result;

  colfile = fopen(colname, "r");
  if (!colfile) /* new collection */
    {
      colfile = fopen(colname, "w"); /* create the file */
      if (colfile == NULL)
        return NULL;
      fclose(colfile);
      return alloc_named_coll(colname);
    }

  result = alloc_named_coll(colname);
  if (result == NULL)
    {
      fclose(colfile); /* don't leak the stream on allocation failure */
      return NULL;
    }

  while ((doc = load_ddv(colfile)) != NULL)
    coll_add_doc(result, doc);

  fclose(colfile);
  return result;
}

/* Node of a singly linked list of name strings. */
typedef struct namelist namelist;

struct namelist
{
  char *name;     /* the name carried by this node */
  namelist *next; /* link to another node of the list */
};

static int coll_add_file(DDV_coll *coll, FILE *file, int docsize)
/* Reads docsize characters from file, in chunks of at most 3071 characters,
 * vectorizes each chunk as body text, and adds every chunk that yields a
 * vector to coll as its own document (also updating the tf-idf word counts
 * and Num_DocVecs).
 * docsize == -1 is a special case for the rest of the file. file should be
 * positioned at the start of the document before calling this.
 * Reading stops early if the file ends (or a read error occurs) before
 * docsize characters have been consumed. Any docsize below -1 is rejected.
 * Returns the number of documents added. */
{
  enum { CHUNK_MAX = 3071 };  /* most characters read per chunk */
  char buf[CHUNK_MAX + 1];    /* +1 for the terminating '\0' */
  DV_Tree *temp;
  DenseDocVec *ddv;
  int want, sz, num = 0, i;

  /* A negative size other than the -1 sentinel would be converted to a huge
   * unsigned count by fread() and overrun buf. */
  if (docsize < -1)
    return 0;

  while (docsize != 0)
    {
      want = ((docsize == -1) || (docsize > CHUNK_MAX)) ? CHUNK_MAX : docsize;
      sz = (int)fread(buf, 1, (size_t)want, file);
      buf[sz] = '\0';

      if (sz < want)
        docsize = 0;    /* short read: end of file or error, so this is the
                         * last chunk whatever docsize claimed */
      else if (docsize > 0)
        docsize -= sz;

      /* Do clever stuff to reject .sigs, etc. */

      temp = vectorize_buffer(buf, BODY_FIELD);

      if (temp)
        {
          ddv = dvtree_to_ddv(temp);
          coll_add_doc(coll, ddv);
          for (i = 0; i < ddv->num_entries; i++)
            tfidf_inc_word(ddv->wordcodes + i * WORD_ENCODE_WIDTH);
          Num_DocVecs++;
          num++;
        }
      /* NOTE(review): called even when temp is NULL, as before; this assumes
       * destroy_dvtree() tolerates NULL -- confirm. */
      destroy_dvtree(temp);
    }
  return num;
}

int coll_acquire_file(DDV_coll *coll, char *filename)
/* Reads the specified file, and vectorizes all the documents it finds, adding
 * them to the given collection. Should be relatively quick, unless you give
 * it a huge mail file, or something.
 * Weighting is: (Document + 5 * Subject) * (1 + 4 * (Author is user)) *
 *               (1 + (Date is recent))
 * Actually, that's just what I plan to make weighting. We'll see.
 *
 * The file is first passed to recognize():
 *   - no template (NULL): the whole file is treated as one body;
 *   - a template whose format is NULL: the file is rejected;
 *   - otherwise the template's delimiter/format patterns are used to split
 *     the file into documents.
 * As a side effect the global DocVec_Mags is freed and reset to NULL.
 * Return value is number of documents acquired (0 if the file cannot be
 * opened, is rejected, or working memory cannot be allocated). */
{
  int num = 0;
  FILE *file;
  Template *template;

  ssize_t body_end, posn;

  List_of_Docs *match, *next, *swap;

  if ((file = fopen(filename, "r")) == NULL)
    return 0; /* we didn't get any */

  if (DocVec_Mags)
    {
      free(DocVec_Mags);
      DocVec_Mags = NULL;
    }

  /* NOTE(review): template is never released in this function; presumably
   * recognize() keeps ownership -- confirm. */
  template = recognize(file);
  if (template == NULL) /* assume whole file is body */
    {
      rewind(file);
      num = coll_add_file(coll, file, -1);
      fclose(file);
      return num;
    }
  if (template->format == NULL) /* reject */
    {
      fclose(file);
      return 0;
    }
  rewind(file);

  match = (List_of_Docs *)malloc(sizeof(List_of_Docs));
  next = (List_of_Docs *)malloc(sizeof(List_of_Docs));
  if (match == NULL || next == NULL)
    {
      /* Out of memory: give up cleanly instead of dereferencing NULL. */
      free(next);
      free(match);
      fclose(file);
      return 0;
    }
  match->doc_start = match->body_start =
    match->subject[0] = match->subject[1] =
    match->location[0] = match->location[1] =
    match->date[0] = match->date[1] =
    match->source[0] = match->source[1] = -1;

  /* The problem here is that there is no good way of finding the end of the
   * body of a document; what you have to do is find the next match, and then
   * use its start as your end. This is why I match the next document, take
   * its start, and then add the previous document. */

  /* NOTE(review): the delimiter is matched twice before the first format
   * match, and is not NULL-checked here even though the loop below treats a
   * NULL delimiter as "no further documents" -- confirm match_pattern()
   * accepts a NULL pattern. */
  match_pattern(file, template->delimiter, -1, NULL, 1);
  match_pattern(file, template->delimiter, -1, NULL, 1);

  match_pattern(file, template->format, -1, match, 1);

  posn = ftell(file);

  while ((body_end = (template->delimiter ?
                      (match_pattern(file, template->delimiter, -1,
                                     NULL, 1)) :
                      -1)) != -1)
    {
      /* Go back to where the previous format match stopped and match the
       * next document's fields into `next`. */
      fseek(file, posn, SEEK_SET);
      match_pattern(file, template->format, -1, next, 1);

      posn = ftell(file);
      if (posn == body_end)
        posn++; /* if start = end */

      /* Add the previous document's body, which ends at body_end. */
      fseek(file, match->body_start, SEEK_SET);
      num += coll_add_file(coll, file, body_end - match->body_start);
      fseek(file, posn, SEEK_SET);

      /* `next` becomes the current document; reuse the old record. */
      swap = match;
      match = next;
      next = swap;
    }

  /* The last document runs to the end of the file. */
  fseek(file, match->body_start, SEEK_SET);
  num += coll_add_file(coll, file, -1);

  free(next);
  free(match);

  fclose(file);

  return num;

}

DDV_coll *temp_coll()
     /* Creates a collection that will not be saved when it gets closed.
      * It starts empty with room for 16 documents and has no name
      * (colname is NULL).
      * Returns NULL if memory cannot be allocated. */
{
  DDV_coll *result = (DDV_coll *)malloc(sizeof(DDV_coll));

  if (result == NULL)
    return NULL;

  result->items = (DenseDocVec **)malloc(sizeof(DenseDocVec *) * 16);
  if (result->items == NULL)
    {
      free(result);
      return NULL;
    }
  result->length = 0;
  result->mallength = 16;
  result->colname = NULL;
  return result;
}

int close_coll(DDV_coll *col)
     /* Saves the collection with checkpoint_coll(), then destroys every
      * document it holds and frees the collection itself.
      * Returns non-zero if unable to save; in this case the collection is not
      * closed (nothing is freed) */
{
  int posn;

  if (checkpoint_coll(col))
    return 1;

  for (posn = 0; posn < col->length; posn++)
    destroy_ddv(col->items[posn]);
  free(col->items);
  /* colname is a strdup() copy for collections made by load_coll() and NULL
   * for temp_coll() ones; free(NULL) is a no-op.
   * NOTE(review): assumes no other code builds a DDV_coll with a colname
   * that is not heap-allocated -- confirm. */
  free(col->colname);
  free(col);
  return 0;
}

int checkpoint_coll(DDV_coll *col)
     /* Writes every document of the collection to the file named by
      * col->colname, replacing the file's previous contents. Collections
      * without a name (temporaries) are not written at all.
      * Returns 0 on success (or if there was nothing to do) and 1 if the
      * file could not be opened, a document could not be saved, or the
      * file could not be closed. */
{
  FILE *colfile;
  int problem = 0;
  int posn;

  if (!col->colname) /* temps have no names */
    return 0;

  colfile = fopen(col->colname, "w");
  if (!colfile)
    return 1;

  /* Stop at the first document that fails to save. */
  for (posn = 0; posn < col->length && !problem; posn++)
    problem = save_ddv(colfile, col->items[posn]);

  /* Always close the stream, even after a failed save, so it is not leaked;
   * a failing fclose() also counts as a problem since buffered data may not
   * have reached the disk. */
  if (fclose(colfile))
    problem = 1;

  return problem ? 1 : 0; /* Device full, probably */
}

/* Doubles the capacity of col->items. Returns 0 on success. On failure
 * returns non-zero and leaves the array and mallength exactly as they
 * were, so the documents already stored remain reachable. */
static int coll_double_capacity(DDV_coll *col)
{
  DenseDocVec **grown =
    (DenseDocVec **)realloc(col->items,
                            sizeof(DenseDocVec *) * col->mallength * 2);

  if (grown == NULL)
    return 1;
  col->items = grown;
  col->mallength *= 2;
  return 0;
}

void coll_add_doc(DDV_coll *col, DenseDocVec *doc)
     /* Adds a document, resizing the collection if necessary.
      * The array is grown eagerly: as soon as it becomes full its capacity
      * is doubled, so there is normally a free slot on entry. If that
      * growth fails the collection is left intact and growth is retried on
      * the next call; if it still fails, the document is not stored (and
      * not freed) and a message is written to stderr. */
{
  if (col->length >= col->mallength && coll_double_capacity(col))
    {
      /* Full, and an earlier and the current attempt to grow both failed. */
      fprintf(stderr, "coll_add_doc: out of memory, document not added\n");
      return;
    }

  col->items[col->length++] = doc;

  if (col->length == col->mallength)
    (void)coll_double_capacity(col); /* failure is retried on the next add */
}

void coll_remove_doc(DDV_coll *col, int index)
     /* Removes the document at index from the collection and destroys it.
      * The last document is moved into the vacated slot, so only that
      * document's position changes. When fewer than a quarter of the
      * allocated slots remain in use (and more than 8 are allocated) the
      * array is shrunk to half its capacity.
      * An index outside 0 .. length-1 is ignored. */
{
  DenseDocVec *doc;

  if (index < 0 || index >= col->length)
    return;

  doc = col->items[index];
  col->items[index] = col->items[--col->length];

  if ((col->length < (col->mallength >> 2)) && (col->mallength > 8))
    {
      int halved = col->mallength / 2;
      DenseDocVec **shrunk =
        (DenseDocVec **)realloc(col->items, sizeof(DenseDocVec *) * halved);

      /* If the shrink fails the old, larger block is still valid; keep it
       * rather than replacing items with NULL. */
      if (shrunk != NULL)
        {
          col->items = shrunk;
          col->mallength = halved;
        }
    }
  destroy_ddv(doc);
}

int coll_size(DDV_coll *col)
     /* Reports the number of documents currently stored in the collection */
{
  int count = col->length;

  return count;
}


