#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <savant.h>
#include <savutil.h>
#include <savantio.h>

/*
  sim_contrib takes a single doc-ID/word-frequency pair from a retrieved
  wordvec and, using the additional context passed in (collection
  statistics, query weight, field biases), decides how much that pair
  contributes to the document's similarity rating.

  Parameters:
    docweight            packed value; WV_DVNUM() and WV_DVFREQ() extract
                         the docvec number and the word frequency from it
    num_docs             number of documents this word appears in (the
                         "n" of log(N/n)); values <= 0 yield 0.0
    exact_wordcode       the wordcode that was searched for
    wordcode             the wordcode that was found; may differ from
                         exact_wordcode for continuous fields (date/time)
    query_weight         weight of the word in the query vector
    query_biases         per-field biases of the query, indexed by field type
    index_biases         per-field biases of the index, indexed by field type
    query_normalized_tf  upper bound used by the chopping heuristic
    use_weights          boolean: full normalization (non-zero) or plain
                         keyword matching (zero)
    query_length         only used when use_weights is zero; the result
                         is then 1/query_length

  Returns the contribution of this match to the document's similarity.
  Returns 0.0 when the document vector has no magnitude for this field,
  when the combined biases do not sum to a positive value, or when
  num_docs (or, in keyword mode, query_length) is not positive.
  */

float
sim_contrib(DB_UINT docweight,
	    DB_INT num_docs,
	    DB_UINT *exact_wordcode,  /* exact_wordcode is the wc searched for, wordcode is that which was  */
	    DB_UINT *wordcode,        /* found.  They can differ in the case of continuous fields (date) */
	    float query_weight,       /* weight of word in the query vector */
	    int *query_biases,
	    DB_INT *index_biases,
	    float query_normalized_tf,  /* needed for chopping heuristic */
	    int use_weights,   /* boolean: do all the fancy normalization or just keywords */
	    int query_length)  /* if not use_weights, this will give the similarity */
{
  int vecnum, docfreq, mag_index;
  float vecweight, combined_bias_sum, combined_biases[NUM_FIELD_TYPES];
  enum Field_Types type, temptype;

  if(! use_weights) {  /* %%% may be possible to move this up to readwv_ra if yenta doesn't do this */
    if (query_length <= 0) {
      /* An empty (or nonsensical) query cannot match anything; avoid
         returning inf or a negative similarity. */
      return(0.0);
    }
    return(1.0/query_length);
  }

  if (num_docs <= 0) {
    /* log(N/n) is undefined for n <= 0: n == 0 would give +inf (which
       the chopping step below would silently turn into a full match)
       and n < 0 would give NaN. */
    return(0.0);
  }

  vecnum = WV_DVNUM(docweight);
  docfreq = WV_DVFREQ(docweight);

  /* divide by magnitude, and multiply by inverse document frequency
     (log N/n, N=num documents in collection, n = vector->num_docs = the
     number of documents this word is in) This corresponds to "tfc" in
     Salton's system when combined with the vector magnitude (saved as
     sqrt(sum(tf * log (N/n)^2))) */

  type = word_type(wordcode[0]);

  /* DocVec_Mags is laid out as NUM_FIELD_TYPES magnitudes per docvec. */
  mag_index = (vecnum * NUM_FIELD_TYPES) + (int)type;

  if(SavantDebug) {
    printf("  docvec %d mag. = %0.4e\n", vecnum, DocVec_Mags[mag_index]);
  }
  if (DocVec_Mags[mag_index] > 0.0) {
    vecweight = (float)docfreq /
      DocVec_Mags[mag_index]
      * log((double)Num_DocVecs / (float)num_docs);
  }
  else {
    return(0.0);  /* All words in this doc are too common */
    /* %%% so should the dv be skipped during indexing? */
  }
  if(SavantDebug) {
    printf("  vecweight = %0.4e\n", vecweight);
  }

  /* Note the splefty chopping method to compute similarity.  Essentially,
     the non-query vector can't have a higher weight for this
     word than the query vector does.  If the weight is higher, it gets
     'chopped' back to the query's value.  This might avoid problems with
     one-word documents and stuff like 'spam spam spam spam eggs bacon
     spam...'  (rhodes, 12/1/96) */

  if (vecweight > query_normalized_tf) {
    vecweight = query_normalized_tf;
  }
  if(SavantDebug) {
    printf("  vecweight = %0.4e (after chopping)\n", vecweight);
  }

  /* We need to bias the importance lent to this particular match
     according to the query biases and the index biases.  The query
     and index vectors each have associated biases attached to each
     of their fields, like so:

     Query biases = q1, q2, ..., q{num_fields}
     Index biases = i1, i2, ..., i{num_fields}

     Non-normalized biases = q1*i1, q2*i2, ...

     Normalized biases = q1*i1/M, q2*i2/M, ...
     where M = combined_bias_sum = q1*i1 + q2*i2 + ...
     */

  combined_bias_sum = 0.0;
  for (temptype=BODY_FIELD; temptype < LAST_FIELD; temptype++) {
    /* Convert to double BEFORE multiplying: the product of two integer
       biases can overflow the integer type, which is undefined
       behaviour for signed operands. */
    combined_biases[temptype] = (float)((double)query_biases[temptype] *
					(double)index_biases[temptype]);
    combined_bias_sum += combined_biases[temptype];
  }

  if (combined_bias_sum > 0.0) {
    vecweight = vecweight * (combined_biases[type] / combined_bias_sum);
  }
  else {
    return(0.0);
  }
  if(SavantDebug) {
    printf("  vecweight = %0.4e (after biasing)\n", vecweight);
  }


  /* Now, if this is a continuous field (currently DATE or TIME),
     scale by a drop-off factor according to how close the match is:
     1 - |wordcode_diff(searched, found)|.
     NOTE(review): the factor applied here is linear in wordcode_diff's
     result; presumably wordcode_diff itself returns a log-scaled
     distance in [-1, 1] -- confirm, since a magnitude above 1 would
     make the contribution negative. */
  if ((type == DATE_FIELD) || (type == TIME_FIELD)) {
    vecweight *= (1.0 - fabs(wordcode_diff(exact_wordcode, wordcode)));
  }
  if(SavantDebug) {
    printf("  vecweight = %0.4e (after log scale)\n", vecweight);
  }

  /* %%% NOTE: if YENTA doesn't ever do dates, we can move this up to
     readwv_ra and save a couple of confusing parameters */

  /* and now scale the result by the importance of this word
     within the query vector: */
  return(query_weight * vecweight);
}
