#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <savant.h>
#include <savutil.h>
#include <savantio.h>

/* Forward declarations for the file-local helpers defined further down in
   this file.  Parameter roles are documented at each definition. */
static void init_search(void);
static int update_matches(float *, unsigned int *, WordVec *, float, 
			  int *, int, DocSim *, int, float, int, DB_INT *); 
static WordVec *fetch_wordvec(unsigned int *); 
static WordVec *fetch_datevec(unsigned int *, float); 
static long locate_hash(unsigned int *); 
static int locate_date_hash(unsigned int *, double, int);
static long locate_dates(unsigned int *, double, long *, long *);
static float date_distance_at_index(long, unsigned int *);
static int destroy_wordvec(WordVec *);

/* Number of document vectors in the database; read from DVMAG_FILE by
   init_read() and used by find_matches() to size its similarity array. */
DB_INT Num_DocVecs; /* needed by find_matches() */
/* Num_DocVecs * NUM_FIELD_TYPES magnitudes, loaded by init_read().  Not
   referenced by any code visible in this file; the original note says it
   is used for normalization -- presumably via sim_contrib(), TODO confirm. */
DB_FLOAT *DocVec_Mags; /* used by update_matches to normalize vectors */
/* One entry per document vector, loaded from WMAP2_FILE by init_read();
   update_matches() compares entries to detect vectors of the same document. */
DB_INT *Window_Map; /* " " to prevent duplicates -- maps windows to docnums */
/* Size in bytes of the WVOFF file, as reported by stat() in init_read(). */
size_t Max_WVoff;  /* for locate_hash */
/* Size in bytes of the DATE file, as reported by stat() in init_read(). */
size_t Max_DATEoff;  /* for locate_dates */

/* Return the size in bytes of the file `fname` inside directory `db_path`.
   A '/' separator is inserted unless db_path already ends in one (an empty
   db_path therefore yields "/fname").  If the combined path does not fit in
   the local buffer, or stat() fails, an error is printed and the process
   exits, in the same spirit as open_or_die(). */
static size_t
db_file_size(const char *db_path, const char *fname)
{
  char path[256];
  struct stat buf;
  size_t dirlen;
  const char *sep;
  int n;

  dirlen = strlen(db_path);
  sep = ((dirlen > 0) && (db_path[dirlen-1] == '/')) ? "" : "/";

  n = snprintf(path, sizeof(path), "%s%s%s", db_path, sep, fname);
  if ((n < 0) || ((size_t)n >= sizeof(path))) {
    fprintf(stderr, "init_read: path too long: %s%s%s\n", db_path, sep, fname);
    exit(1);
  }
  if (stat(path, &buf) != 0) {
    perror(path);
    exit(1);
  }
  return((size_t)buf.st_size);
}

/* init_read() opens the database files found under db_path for reading and
   loads the in-memory tables used by the search routines:

     - Max_DATEoff / Max_WVoff: byte sizes of the DATE and WVOFF files
       (needed by locate_date_hash() and locate_hash() to bound their
       binary searches);
     - Num_DocVecs: first DB_INT stored in DVMAG_FILE;
     - DocVec_Mags: Num_DocVecs * NUM_FIELD_TYPES values from DVMAG_FILE;
     - Window_Map: Num_DocVecs values from WMAP2_FILE.

   Returns Num_DocVecs.  On a file-size lookup failure or an allocation
   failure the process exits with an error message. */
DB_INT
init_read(char *db_path)
{
  /*open_database(db_dir, "r");*/
  WORDVEC_FILE = open_or_die(db_path, WORDVEC_FNAME, "r");
  WVOFF_FILE = open_or_die(db_path, WVOFF_FNAME, "r");
  DVMAG_FILE = open_or_die(db_path, DVMAG_FNAME, "r");
  DATE_FILE = open_or_die(db_path, DATE_FNAME, "r");
  WMAP2_FILE = open_or_die(db_path, WMAP2_FNAME, "r");

  /* determine Max_DATEoff, the size of the DATE file */
  Max_DATEoff = db_file_size(db_path, DATE_FNAME);

  /* determine Max_WVoff, the size of the WVOFF (word-offset) file */
  Max_WVoff = db_file_size(db_path, WVOFF_FNAME);

  fread_big(&Num_DocVecs, sizeof(DB_INT), 1, DVMAG_FILE);

  DocVec_Mags = (DB_FLOAT *)malloc(sizeof(DB_FLOAT) * Num_DocVecs * NUM_FIELD_TYPES);
  /* malloc(0) may legitimately return NULL, so only treat NULL as fatal
     when something was actually requested */
  if ((DocVec_Mags == NULL) && (Num_DocVecs != 0)) {
    fprintf(stderr, "init_read: out of memory allocating DocVec_Mags\n");
    exit(1);
  }
  fread_big(DocVec_Mags, sizeof(DB_FLOAT), Num_DocVecs * NUM_FIELD_TYPES, DVMAG_FILE);

  /* Window_Map holds one entry per document vector */
  Window_Map = (DB_INT *)malloc(sizeof(DB_INT)*Num_DocVecs);
  if ((Window_Map == NULL) && (Num_DocVecs != 0)) {
    fprintf(stderr, "init_read: out of memory allocating Window_Map\n");
    exit(1);
  }
  fread_big(Window_Map, sizeof(DB_INT), Num_DocVecs, WMAP2_FILE);

  return(Num_DocVecs);
}

/* Reset the incremental search state kept inside locate_hash() by handing
   it an all-zero wordcode, which locate_hash() treats as the reset request. */
static void 
init_search(void)
{
  unsigned int all_zero_code[WORD_ENCODE_WIDTH];
  int k = WORD_ENCODE_WIDTH;

  while (k-- > 0)
    all_zero_code[k] = 0;

  locate_hash(all_zero_code);
}
  
DocSim *
find_matches(DV_Tree *query,
	     int num_matches,
	     int *query_bias,
	     int use_weights,
	     DB_INT *all_dv_biases)
{
  /* find_matches() is one of the library functions provided for in the API.
     Given a query, it returns the top num_matches hits in a DocSim array.

     query          -- query tree; converted with dvtree_to_ddv()
     num_matches    -- number of slots in the returned array
     query_bias     -- per-field-type biases; indexed by field type here
                       when use_weights is non-zero, and forwarded to
                       update_matches()
     use_weights    -- non-zero: rescale the final similarities by the
                       bias-weighted query magnitude
     all_dv_biases  -- forwarded untouched to update_matches()

     Returns a malloc'd array of num_matches entries (presumably the
     caller's to free).  Unfilled slots have vecnum == -1 and sim == -1.0.
     Returns NULL if the working arrays cannot be allocated.
   */
  int i;
  DenseDocVec *ddv_query;
  DocSim *matches = NULL;
  WordVec *wordvec = NULL;
  float *all_sims;                            /* array of sim, rating for each doc */
  float term_weight;                          /* term weight according to tfidf */
  float query_normalized_tf;                  /* term frequency, normalized */
  float query_mags[NUM_FIELD_TYPES];          /* sum(tf) for all terms */
  float query_true_mags[NUM_FIELD_TYPES];     /* actual mag of final weights */
  float query_true_mags_total = 0.0;          /* total actual mag of final weights */
  float query_max_frequency[NUM_FIELD_TYPES]; /* max number of a single word */
  enum Field_Types type;
  float bias_total = (float)0.0;

  /* allocate all_sims and matches; bail out cleanly if either fails.
     (malloc(0) may return NULL, so NULL is only an error for a non-zero
     request.) */
  all_sims = (float *)malloc(Num_DocVecs*sizeof(float));
  matches = (DocSim *)malloc(sizeof(DocSim)*num_matches);
  if (((all_sims == NULL) && (Num_DocVecs > 0)) ||
      ((matches == NULL) && (num_matches > 0))) {
    free(all_sims);
    free(matches);
    return(NULL);
  }

  /* initialize all_sims, matches, query_mags, query_max_frequency 
     and query_true_mags */
  for(i=0; i<Num_DocVecs; i++) {
    all_sims[i] = 0.0;
  }
  for(i=0; i<num_matches; i++) {
    matches[i].vecnum = -1;
    matches[i].sim = -1.0;
  }
  for (type=BODY_FIELD; type < LAST_FIELD; type++) {
    query_mags[type] = 0.0;
    query_max_frequency[type] = 0.0;
    query_true_mags[type] = 0.0;
  }

  ddv_query = dvtree_to_ddv(query);

  /* Here we set up query_max_frequency and query_mags.  These are arrays
     parameterized by the various field-types.  The intent is to effectively
     treat a docvec (the query dv or one from the database) as several different
     vectors, one for each field-type.  The comparisons are done separately and
     the results are combined later.  */
  for(i=0;i<ddv_query->num_entries;i++) {  
    type = word_type(ddv_query->wordcodes[WORD_ENCODE_WIDTH*i]);
    /* query_max_frequency[T] ends up being the largest term frequency
       in the query of field-type T */
    if (query_max_frequency[type] < ddv_query->weights[i]) {
      query_max_frequency[type] = ddv_query->weights[i];
    }
    /* query_mags[T], after the sqrt's taken later, is the magnitude
       of the vector made of terms in the query of field-type T */
    query_mags[type] += (float)ddv_query->weights[i] * 
      (float)ddv_query->weights[i];
  }
  for (type=BODY_FIELD; type < LAST_FIELD; type++) {
    query_mags[type] = (float)sqrt((double)query_mags[type]);
  }

  init_search();     /* resets bounds of search */

  /* This loop does the bulk of the work, fetching wordvecs and calling
     update_matches().  */
  for(i=0; i<ddv_query->num_entries; i++) {
    /* Start every term with no wordvec.  Without this, a term whose
       field-type matches neither branch below would reuse the pointer
       already released by destroy_wordvec() on the previous iteration
       (use-after-free followed by a double free). */
    wordvec = NULL;

    type = word_type(ddv_query->wordcodes[WORD_ENCODE_WIDTH*i]);
    if ((type == BODY_FIELD) || 
	(type == LOCATION_FIELD) || 
	(type == SUBJECT_FIELD) || 
	(type == DAY_FIELD) ||
	(type == SOURCE_FIELD)) {
      /* This is the "discrete" field-type case; fetch_wordvec() returns a list 
	 of documents containing exactly this term */
      wordvec = fetch_wordvec(&(ddv_query->wordcodes[WORD_ENCODE_WIDTH*i]));
    }
    else if ((type == DATE_FIELD) || (type == TIME_FIELD)) {
      /* This is the "continuous" field-type case; fetch_datevec() returns a
	 list of documents containing terms of nearby value.   */
      wordvec = fetch_datevec(&(ddv_query->wordcodes[WORD_ENCODE_WIDTH*i]), 0.5);
    }
    /* any other field-type: wordvec stays NULL and the term is skipped */

    if(wordvec != NULL) {
      if(SavantDebug) {
	char decoded[60];
	decode_word(wordvec->wordcode, decoded);
	printf("%s; ", decoded);
      }
      /*  term_weight is the weight of this term in the query.  This uses an 
	  augmented normalized term frequency taken with respect to the field-type
	  sub-vector.  (nfx in Salton) */
      term_weight = (1.0 + (float)ddv_query->weights[i] / 
		     (float)query_max_frequency[type]) / 2.0 *
	log((float)Num_DocVecs / (float)wordvec->num_docs);
      if(SavantDebug) {
	printf("term weight = %0.4e; ", term_weight);
      }

      /*  query_normalized_tf is the vanilla term weight, frequency divided by
	  vector magnitude (again wrt to field-type).  */
      if (query_mags[type] > 0.0)
	query_normalized_tf = (float)ddv_query->weights[i] / 
	  (float)query_mags[type];
      else
	query_normalized_tf = (float)0.0;
      /*  %%% question on above: if this term has field-type 'type', how is 
	  query_mags[type] ever going to be not > 0.0?
	  */
      if(SavantDebug) {
	printf("norm t.f. = %0.4e\n", query_normalized_tf);
      }

      /* query_true_mags[T] ends up being the sum of the squared term_weights
	 for field-type T.  */
      query_true_mags[type] += (term_weight * term_weight);

      /* pass the buck... */
      update_matches(all_sims, &(ddv_query->wordcodes[WORD_ENCODE_WIDTH*i]),
		     wordvec, term_weight, query_bias,
		     ddv_query->num_entries,
		     matches, num_matches,
		     query_normalized_tf, 
		     use_weights, all_dv_biases);
      destroy_wordvec(wordvec);
    }
  }

  free(all_sims);
  destroy_ddv(ddv_query);

  if (use_weights) {
    /* matches now contains the top hits for this query.  We want to normalize 
       their similarity ratings according to the query biases.  */
    bias_total = (float)0.0;
    query_true_mags_total = (float)0.0;
    for (type=BODY_FIELD; type < LAST_FIELD; type++) {
      bias_total += (float)query_bias[type];    /* add up the biases */
    }
    if (bias_total > 0.0) {
      for (type=BODY_FIELD; type < LAST_FIELD; type++) {
	query_true_mags_total += sqrt(query_true_mags[type]) * 
	  query_bias[type] / bias_total;   /* add up the scaled-back query_true_mags */
      }
    }
    for(i=0; i<num_matches; i++) {
      if (query_true_mags_total > 0.0) 
	/* %%% it's not clear right off why this is a good idea, or a good way to do it. */
	matches[i].sim = (float)matches[i].sim / (float)query_true_mags_total;
      else {
	/* no usable magnitude: report every slot as empty */
	matches[i].vecnum = -1;
	matches[i].sim = (float)-1.0;
      }
    }
  }

  return(matches);
}

static int 
update_matches(float *all_sims,        /* array of similarity ratings for each doc */
	       unsigned int *thisword, /* this wordcode */
	       WordVec *fetched_wv,    /* array of documents containing this word */
	       float term_weight,      /* weight of this term in the query */
	       int *query_bias,        /* one for each field-type */
	       int query_length,       /* number of unique words in this query */
	       DocSim *matches,        /* pointer to partial matches array */
	       int num_matches,        /* number of matches in *matches */
	       float query_normalized_tf,  /* raw tf, normalized */
	       int use_weights,        /* if == 0, don't use weights in the query, just #hits */
	       DB_INT *all_dv_biases)
     /* %%% something kind of displeasing: thisword, term_weight, query_bias,
	query_length, and query_normalized_tf all just get passed on to
	sim_contrib. */
{
  /* update_matches is called once for each unique word in the query.  It goes
     through the supplied wordvec and adds appropriate similarity contributions
     to the appropriate slots in all_sims.

     After each addition the document's running total is offered to the
     matches array: it fills the first empty slot (vecnum == -1), replaces an
     entry that maps to the same Window_Map value if the new total is larger,
     or is inserted ahead of the first entry with a smaller sim, pushing the
     following entries down one slot.

     Always returns 0.  */
  int i, j, vecnum;
  float sim;
  DocSim ds_temp1, ds_temp2;

  for(i=0; i<fetched_wv->num_docs; i++) {
    /* each docweights entry encodes the vector number it applies to */
    vecnum = WV_DVNUM(fetched_wv->docweights[i]);
    all_sims[vecnum] += 
      sim_contrib(fetched_wv->docweights[i], fetched_wv->num_docs,
		  thisword, fetched_wv->wordcode,
		  term_weight, query_bias, all_dv_biases, 
		  query_normalized_tf, use_weights, query_length);
    /* sim is the running total for this vector, not just this term's share */
    sim = all_sims[vecnum];
    if(SavantDebug) {
      /* NOTE(review): the "+" in the format suggests an increment, but the
	 value printed is the accumulated total */
      printf("docvec %d: +%0.4e\n", vecnum, sim);
    }

    if(sim > 0.0) {
      for(j=0; j<num_matches; j++) {
	if(matches[j].vecnum == -1) { /* no match at j yet; stick this one there */
	  matches[j].vecnum = vecnum;
	  matches[j].sim = sim;
	  break;
	}
#if 0 /* gotta turn this off for the moment to save the abstraction barrier;
	 however, this problem needs to be solved in some way  */
	else if(Window_Map[2*matches[j].vecnum] == Window_Map[2*vecnum]) {
	  /* The even entries of Window_Map are the document id numbers for the
	     corresponding vectors; there are potentially many vecs per doc, due
	     to windowing.  E.g., Window_Map[2*i] is the number of the document
	     from which the ith vector was obtained.  A matching doc. num means
	     something from the same document is already here, and we don't want
	     to repeat. */
	  if(matches[j].sim < sim) { /* clobber the old one */
	    matches[j].vecnum = vecnum;
	    matches[j].sim = sim;
	    break;
	  }
	  else { /* forget this whole vector */
	    break;
	  }
	}
#endif /* endif broken win-map */
	/* here is the fixed version of it (if you call this fixed!) */
	else if(Window_Map[matches[j].vecnum] == Window_Map[vecnum]) {
	  /* Window_Map[i] is the document id number for the vector i; there are
	     potentially many vecs per doc, due to windowing.  A matching
	     doc. num means something from the same document is already here,
	     and we don't want to repeat.  (This also covers the case where
	     vecnum itself is already in the list.) */
	  if(matches[j].sim < sim) { /* clobber the old one */
	    matches[j].vecnum = vecnum;
	    matches[j].sim = sim;
	    break;
	  }
	  else { /* forget this whole vector */
	    break;
	  }
	}
	
	
#if 0 /* this heuristic broke when we started doing date-based queries.  It was
	 intended to reject duplicate document suggestions by culling hits with
	 exactly the same sim rating, a likely thing for identical documents but
	 unlikely for different ones.  Since we assert that date and time terms
	 appear in only one document, however, %%% (which they tend to do, but
	 why do we assert that?)  partial similarities would often look quite
	 alike early on, and documents would get unjustly shafted.  */
	else if((floor (sim * 10000)) == (floor (matches[j].sim * 10000)) &&
		(use_weights == 1)) {
	  break; /* This vector is probably a duplicate, so get rid of it */
	  /* (This heuristic is only valid when using weights.) */
	}
#endif /* endif broken heuristic */
	
	else if(matches[j].sim < sim) { /* insert our new one, shuffle others */
	  ds_temp1 = matches[j];  /* the entry being displaced */
	  matches[j].vecnum = vecnum;
	  matches[j].sim = sim;
	  j++;
	  /* note the new array bounds read error fix, below.  It used to be up a few lines */
	  /* the displaced entry is dropped when the array runs out, when it is
	     an empty placeholder, or when it maps to the same Window_Map value
	     as the vector just inserted */
	  while(j<num_matches 
		&& (ds_temp1.vecnum != -1) /* dummy placeholder, forget the shuffle */
		&& (Window_Map[ds_temp1.vecnum] != Window_Map[vecnum])) {
	    ds_temp2 = matches[j]; /* shuffle old matches down in the ranks */
	    matches[j] = ds_temp1; /* unless one is from the same document */
	    ds_temp1 = ds_temp2;   /* in which case clobber it */
	    j++;
	  }
	  break;
	}
	/* else continue for loop */
      }
    }
    /* continue with next docvec */
  }  
  
  return(0);
}


/* fetch_wordvec() builds a WordVec for the exact term `wordcode`.

   The term is looked up with locate_hash(), which yields a byte offset into
   WORDVEC_FILE.  At that offset the record is read as: WORD_ENCODE_WIDTH
   ints (skipped here), a DB_INT document count, then that many docweight
   entries.

   Returns a newly malloc'd WordVec (release with destroy_wordvec()) whose
   wordcode is a WORD_ENCODE_WIDTH-long copy of the argument, or NULL when
   the term is not found, the stored document count is not positive, or
   memory cannot be allocated. */
static WordVec *
fetch_wordvec(unsigned int *wordcode)
{
  int i;
  DB_INT offset;
  WordVec *out;

  offset = locate_hash(wordcode);
  if(offset == -1) 
    return(NULL);

  out = (WordVec *)malloc(sizeof(WordVec));
  if (out == NULL)
    return(NULL);

  /* skip over the wordcode stored at the head of the record */
  fseek(WORDVEC_FILE, offset+(WORD_ENCODE_WIDTH*sizeof(int)), SEEK_SET);
  fread_big(&(out->num_docs), sizeof(DB_INT), 1, WORDVEC_FILE);

  /* An entry with no documents carries no information, and a zero count
     would make the caller's log(N / num_docs) blow up; a negative count
     would turn into an enormous allocation.  Treat both as "not found". */
  if (out->num_docs <= 0) {
    free(out);
    return(NULL);
  }

  out->docweights = (DB_UINT *)malloc(out->num_docs * sizeof(DB_INT));
  out->wordcode = (DB_UINT *)malloc(WORD_ENCODE_WIDTH * sizeof(DB_UINT));
  if ((out->docweights == NULL) || (out->wordcode == NULL)) {
    free(out->docweights);
    free(out->wordcode);
    free(out);
    return(NULL);
  }

  fread_big(out->docweights, sizeof(DB_INT), out->num_docs, WORDVEC_FILE);

  for(i=0; i<WORD_ENCODE_WIDTH; i++) {
    out->wordcode[i] = wordcode[i];
  }

  return(out);
}

/* Compare the wordcode stored in entry number `entry` of WVOFF_FILE with
   `wordcode`, one element at a time, stopping at the first difference.
   Returns 1 if the stored code is greater, -1 if it is smaller, 0 if all
   WORD_ENCODE_WIDTH elements are equal.  On a 0 result the file position is
   left just past the code, i.e. at that entry's stored offset. */
static int
wvoff_entry_cmp(int entry, unsigned int *wordcode)
{
  DB_UINT disk_code;
  int k;

  fseek(WVOFF_FILE, (WORD_ENCODE_WIDTH*sizeof(int)+sizeof(long))*entry, SEEK_SET);
  for (k = 0; k < WORD_ENCODE_WIDTH; k++) {
    fread_big(&disk_code, sizeof(DB_INT), 1, WVOFF_FILE);
    if (disk_code > wordcode[k])
      return(1);
    if (disk_code < wordcode[k])
      return(-1);
  }
  return(0);
}

/* Read the DB_INT offset found at the current position of WVOFF_FILE. */
static long
wvoff_read_offset(void)
{
  DB_INT stored_offset;

  fread_big(&stored_offset, sizeof(DB_INT), 1, WVOFF_FILE);
  return(stored_offset);
}

/* locate_hash() looks `wordcode` up in WVOFF_FILE by binary search and
   returns the offset stored with it, or -1 if it is not present.

   The lower search bound is kept in a static variable between calls, so
   each search starts where the previous one left off (the original note
   says terms arrive in sorted order).  Passing an all-zero wordcode resets
   that bound to 0 and returns 0.

   NOTE(review): the entry stride is computed with sizeof(int) and
   sizeof(long) while the fields are read as DB_INT-sized -- confirm these
   agree on the target platform. */
static long 
locate_hash(unsigned int *wordcode)
{
  static int low_word;
  int high_word, mid_word;
  int order, k;
  int all_zero = 1;

  for (k = 0; k < WORD_ENCODE_WIDTH; k++) {
    if (wordcode[k] != 0) {
      all_zero = 0;
      break;
    }
  }
  if (all_zero) {   /* reset request */
    low_word = 0;
    return(0);
  }

  high_word = (Max_WVoff/(WORD_ENCODE_WIDTH*sizeof(int)+sizeof(long)))-1;

  /* target below the lower bound?  or exactly on it? */
  order = wvoff_entry_cmp(low_word, wordcode);
  if (order > 0)
    return(-1);
  if (order == 0)
    return(wvoff_read_offset());

  /* target above the upper bound?  or exactly on it? */
  order = wvoff_entry_cmp(high_word, wordcode);
  if (order < 0)
    return(-1);
  if (order == 0)
    return(wvoff_read_offset());

  /* bisect until the bounds are adjacent */
  mid_word = (low_word + high_word)/2;
  while (mid_word > low_word) {
    order = wvoff_entry_cmp(mid_word, wordcode);
    if (order == 0)
      return(wvoff_read_offset());
    if (order > 0)
      high_word = mid_word;   /* target lies in the lower half */
    else
      low_word = mid_word;    /* target lies in the upper half */
    mid_word = (low_word + high_word)/2;
  }

  /* bounds are adjacent: test each of them for an exact match */
  if (wvoff_entry_cmp(low_word, wordcode) == 0)
    return(wvoff_read_offset());
  if (wvoff_entry_cmp(high_word, wordcode) == 0)
    return(wvoff_read_offset());

  /* not found; remember where we ended up for the next call */
  low_word = high_word;
  return(-1);
}

/* fetch_datevec() creates a WordVec from all dates within max_distance of
   wordcode.

   locate_dates() supplies the record count and the index of the first
   record in range.  DATE_FILE is read starting at that record; each record
   is WORD_ENCODE_WIDTH code words followed by one docweight, and the file
   begins with one int-sized header that is skipped.

   Unlike fetch_wordvec(), the returned wordcode array holds one code per
   document (num_docs * WORD_ENCODE_WIDTH elements).

   Returns a newly malloc'd WordVec (release with destroy_wordvec()), or
   NULL when no date is in range or memory cannot be allocated. */
static WordVec *
fetch_datevec(unsigned int *wordcode, 
	      float max_distance)
{
  long offset_low, offset_high;
  int i, numdocs;
  WordVec *out;

  /* find offsets for low & high dates within max-distance of this date */
  numdocs = locate_dates(wordcode, (double)max_distance, &offset_low, &offset_high);
  if (numdocs <= 0)
    return(NULL);

  out = (WordVec *)malloc(sizeof(WordVec));
  if (out == NULL)
    return(NULL);

  out->num_docs = numdocs;
  out->docweights = (DB_UINT *)malloc(out->num_docs*sizeof(DB_UINT));
  out->wordcode = (DB_UINT *)malloc(out->num_docs*sizeof(DB_UINT)*WORD_ENCODE_WIDTH);
  if ((out->docweights == NULL) || (out->wordcode == NULL)) {
    free(out->docweights);
    free(out->wordcode);
    free(out);
    return(NULL);
  }

  /* position on record number offset_low, past the file header */
  fseek(DATE_FILE, ((offset_low * 
		     (sizeof(unsigned int) * WORD_ENCODE_WIDTH + sizeof(unsigned int)) +
		     sizeof(int))), SEEK_SET);

  for (i=0; i<numdocs; i++) {
    fread_big(&(out->wordcode[i*WORD_ENCODE_WIDTH]), sizeof(DB_INT), 
	  WORD_ENCODE_WIDTH, DATE_FILE);
    fread_big(&(out->docweights[i]), sizeof(DB_INT), 1, DATE_FILE);
  }
  return(out);
}

/* locate_dates() finds the range of DATE_FILE record indices lying within
   max_distance of wordcode.  The lower end is looked up first, then the
   upper end, both via locate_date_hash().

   On success *offset_low and *offset_high receive the two indices and the
   return value is (high - low + 1).  If either lookup yields -1 the outputs
   are left untouched and -1 is returned. */
static long 
locate_dates(unsigned int *wordcode, 
	     double max_distance, 
	     long *offset_low, 
	     long *offset_high)
{
  long first = locate_date_hash(wordcode, max_distance, 0);
  long last = locate_date_hash(wordcode, max_distance, 1);

  if ((first != -1) && (last != -1)) {
    *offset_low = first;
    *offset_high = last;
    return (last - first + 1);
  }

  return (-1);
}



/* Read the code stored in record number `index` of DATE_FILE (records
   follow a single unsigned-int header) and return wordcode_diff() of that
   stored code against `wordcode`.  Per the original note, a negative result
   means the stored code is below wordcode, and the value lies in
   [-1.0, 1.0]. */
static float 
date_distance_at_index(long index, 
		       unsigned int *wordcode) 
{
  const size_t record_bytes =
    WORD_ENCODE_WIDTH*sizeof(unsigned int) + sizeof(unsigned int);
  DB_UINT stored_code[WORD_ENCODE_WIDTH];
  double diff;

  fseek(DATE_FILE, sizeof(unsigned int) + record_bytes*index, SEEK_SET);
  fread_big(stored_code, sizeof(DB_INT), WORD_ENCODE_WIDTH, DATE_FILE);

  diff = wordcode_diff(stored_code, wordcode);
  return(diff);
}

/* Find the index of the DATE_FILE record that is closest to being
   offset_distance away from wordcode, without going over offset_distance
   away.  If high_bracket_p is true, the result is the highest record still
   within offset_distance of wordcode.  If high_bracket_p is false (0), the
   result is the lowest record still within offset_distance.

   Record distances come from date_distance_at_index(); the search assumes
   they increase with the record index (binary search).

   Returns the record index, or -1 when DATE_FILE holds no records or no
   record is within range. */

static int 
locate_date_hash(unsigned int *wordcode, 
		 double offset_distance,
		 int high_bracket_p) 
{
  const size_t header_bytes = sizeof(unsigned int);
  const size_t record_bytes =
    WORD_ENCODE_WIDTH*sizeof(unsigned int) + sizeof(unsigned int);
  long low_word, high_word, cur_word;
  double distance;

  /* With no complete record in the file there is nothing to search; the
     unsigned arithmetic below would otherwise wrap and we would end up
     probing data that was never read. */
  if (Max_DATEoff < header_bytes + record_bytes) {
    return(-1);
  }

  high_word = (long)((Max_DATEoff - header_bytes) / record_bytes) - 1;
  low_word = 0;

  /* check if our lower bound is too high, or is a hit */
  distance = date_distance_at_index(low_word, wordcode);
  if (distance > offset_distance) {
    return(-1);   /* Everything too late */
  }
  else if ((fabs(distance) < offset_distance) && !high_bracket_p) {
    return(low_word);
  }

  /* check if our upper bound is too low, or is a hit */
  distance = date_distance_at_index(high_word, wordcode);
  if ((-1 * distance) > offset_distance) {
    return(-1);   /* Everything too early */
  }
  else if ((fabs(distance) < offset_distance) && high_bracket_p) {
    return(high_word);
  }
  
  /* binary search through file: home in on the record where the distance
     crosses +offset_distance (high bracket) or -offset_distance (low
     bracket) */
  cur_word = (low_word + high_word)/2;
  while(cur_word > low_word) { /* this is true unless high_word==low_word+1 */
    distance = date_distance_at_index(cur_word, wordcode);
    if ((high_bracket_p  && (distance > offset_distance)) ||
	(!high_bracket_p && (distance > (-1 * offset_distance)))) {
      high_word = cur_word;
      cur_word = (low_word + high_word)/2;
    }
    else if ((high_bracket_p  && (distance < offset_distance)) ||
	(!high_bracket_p && (distance < (-1 * offset_distance)))) {
      low_word = cur_word;
      cur_word = (low_word + high_word)/2;
    }
    else {    /* Very rare, since we're talking == of doubles */
      return(cur_word);  /* Got a boundary point */
    }
  } /* end while (cur_word > low_word) */

  /* The bounds can no longer be bisected.  For the high bracket the answer
     is the lower bound, for the low bracket the upper bound -- provided
     that record really is within range. */
  if (high_bracket_p && 
      (fabs(date_distance_at_index(low_word, wordcode))) < offset_distance) {
    return(low_word);
  }
  else if (!high_bracket_p &&
	   (fabs(date_distance_at_index(high_word, wordcode))) < offset_distance) {
    return(high_word);
  }
  
  /* all has resulted in miserable failure. */
  return(-1);
}




/* destroy_wordvec() releases a WordVec produced by fetch_wordvec() or
   fetch_datevec(): its docweights array, its wordcode array, and the struct
   itself.  A NULL vec is accepted and ignored, mirroring free(NULL).
   Always returns 0. */
static int 
destroy_wordvec(WordVec *vec)
{
  if (vec == NULL)
    return(0);

  free(vec->docweights);
  free(vec->wordcode);
  free(vec);

  return(0);
}
