#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <savant.h>
#include <savutil.h>
#include <stops.h>

/*
 * State of the common-word list, shared by core_comm_local(), core_comm()
 * and is_common():
 *
 *   comm      - array of num_words NUL-terminated words, kept sorted with
 *               by_alpha() by both loader functions.
 *   num_words - number of entries in comm.
 *   upper     - set by the loaders to the smallest value of the form
 *               2^k - 1 (k >= 1) that is >= num_words.
 *   power     - set by the loaders so that upper == 2^(power+1) - 1;
 *               is_common() uses it to size its first probe (2^power - 1)
 *               and the number of probes (power + 1).
 */
int upper,power,num_words;
char **comm;

/*
 * qsort()-style comparator for an array of C strings.
 *
 * in1 and in2 each point at an array element (i.e. at a char pointer);
 * the return value is that of strcmp() on the two strings they refer to.
 */
int by_alpha(const void *in1,
             const void *in2)
{
  const char *left = *(const char * const *)in1;
  const char *right = *(const char * const *)in2;

  return strcmp(left, right);
}

/*
 * Install the compiled-in common-word list (comm_local / num_words_local)
 * as the active list and prepare the search parameters used by is_common().
 *
 * Always returns 0.
 */
int core_comm_local(void)
{
  num_words = num_words_local;
  comm = comm_local;

  /* The built-in table is sorted here rather than trusted to be in order. */
  qsort(comm, num_words, sizeof(char *), by_alpha);

  /*
   * Grow upper through 1, 3, 7, 15, ... until it is >= num_words.
   * On exit upper == 2^(power+1) - 1.
   */
  for (upper = 1, power = 0; upper < num_words; power++) {
    upper = 2*upper + 1;
  }

  return 0;
}

/*
 * Load the common-word list from the text file comm_file (one word per
 * line) into the globals comm / num_words, lower-casing every word, then
 * sort the list and compute upper / power for is_common().
 *
 * If the file cannot be opened, or memory runs out while loading it, a
 * message is written to stderr and the built-in list is installed through
 * core_comm_local() instead; the return value is then that call's result.
 * Otherwise returns 0.
 *
 * Lines are read in chunks of at most 63 characters (the size of the line
 * buffer); a longer line therefore yields several entries.  A trailing
 * "\n" or "\r\n" is removed only when it is actually present, so a final
 * line without a newline keeps its last character.
 *
 * The list installed by a previous call is not freed here.
 */
int core_comm(char *comm_file)
{
  int count, loaded;
  size_t len, k;
  char temp[64];
  char **words = NULL;
  FILE *file;

  if(NULL == (file = fopen(comm_file,"r"))) {
    fprintf(stderr,"core_comm: Unable to open dictionary %s for reading\n",comm_file);
    fprintf(stderr,"Using the default common words list\n");
    fflush(stderr);
    return(core_comm_local());
  }

  /* Pass 1: count the entries so the pointer table can be sized. */
  count = 0;
  while(NULL != fgets(temp,sizeof(temp),file))
    count++;

  rewind(file);

  /* Pass 2: read the words.  At least one slot is always requested so the
     table pointer is valid for qsort() even when the file is empty. */
  loaded = 0;
  words = (char **)malloc(sizeof(char *) * (count > 0 ? count : 1));
  if(NULL == words)
    goto out_of_memory;

  /* Bounded by count so a file that grew between the passes cannot
     overrun the table. */
  while((loaded < count) && (NULL != fgets(temp,sizeof(temp),file))) {
    len = strlen(temp);
    while((len > 0) && ((temp[len-1] == '\n') || (temp[len-1] == '\r')))
      temp[--len] = 0;

    /* <ctype.h> functions need an unsigned char value */
    for(k=0; k<len; k++)
      temp[k] = (char)tolower((unsigned char)temp[k]);

    words[loaded] = (char *)malloc(len + 1);
    if(NULL == words[loaded])
      goto out_of_memory;
    memcpy(words[loaded], temp, len + 1);
    loaded++;
  }
  fclose(file);

  comm = words;
  num_words = loaded;

  qsort(comm, num_words, sizeof(char *), by_alpha);

  /* upper = smallest 2^k - 1 (k >= 1) that is >= num_words;
     power = k - 1 */
  upper = 1;
  power = 0;
  while(upper < num_words) {
    upper = upper*2 + 1;
    power++;
  }

  return(0);

 out_of_memory:
  while(loaded > 0)
    free(words[--loaded]);
  free(words);
  fclose(file);
  fprintf(stderr,"core_comm: Out of memory loading dictionary %s\n",comm_file);
  fprintf(stderr,"Using the default common words list\n");
  fflush(stderr);
  return(core_comm_local());
}

/*
 * Return 1 if word is present in the sorted common-word list comm,
 * 0 otherwise.  The comparison is an exact strcmp(), so the caller must
 * pass the word in the same case as the list entries.
 *
 * The lookup is a binary search with precomputed step sizes: the first
 * probe is at index 2^power - 1 and each later probe moves by half the
 * previous step, for power + 1 probes in total.  Probes that would fall
 * past the end of the list are clamped to the last entry.
 *
 * Relies on comm, num_words and power having been set up by core_comm()
 * or core_comm_local().
 */
int is_common(char *word)
{
  int disp,loc,i,itr,step;

  /* With no list loaded, or an empty one, the clamp below would turn the
     probe index into -1; nothing can match, so answer immediately. */
  if((NULL == comm) || (num_words <= 0))
    return(0);

  loc = 0;
  itr = power+1;
  step = (1<<power);
  disp = step - 1;
  while(itr--) {
    step = step>>1;
    loc += disp;
    if(loc>=num_words)
      loc = num_words - 1;
    i = strcmp(word,comm[loc]);
    if(i == 0)
      return(1);
    else if(i < 0)
      disp = -step;
    else
      disp = step;
  }

  return(0);
}


/**********        WORDCODE STUFF        **********/  

unsigned int encode_char(unsigned char c) {
/* Return a 6-bit packed representation of a char:
     a-z = 0x01-0x1A
     0-9 = 0x1B-0x24
     _   = 0x25
     -   = 0x26
     !   = 0x27
   Anything else is mapped to ascii(c) & 0x3F (its lower 6 bits), which may
   coincide with one of the codes above (e.g. 'A' yields 0x01, like 'a').

   Digits previously came out as 0x01-0x0A, i.e. the same codes as 'a'-'j';
   they now get their own range as documented in the table.
*/
  if ((c >= 'a') && (c <= 'z')) {
    return ((unsigned int)(c - 'a') + 0x01);
  }
  if ((c >= '0') && (c <= '9')) {
    return ((unsigned int)(c - '0') + 0x1B);
  }

  switch (c) {
  case '_':
    return (0x25);
  case '-':
    return (0x26);
  case '!':
    return (0x27);
  default:
    return ((unsigned int)c & 0x3F);
  }
}

/*
 * Map a packed 6-bit character code back to a printable character:
 *   0x01-0x1A -> 'a'-'z'
 *   0x1B-0x24 -> '0'-'9'
 *   0x25      -> '_'
 *   0x26      -> '-'
 *   0x27      -> '!'
 *   0x00      -> NUL
 * Any other code is returned as (code - 1).
 *
 * The digit range previously decoded to 'J'-'S' because it was offset
 * from 1 instead of from 0x1B.
 */
unsigned char decode_char (unsigned int code) {
  if ((code >= 0x01) && (code <= 0x1A)) {
    return ((unsigned char)(code - 0x01 + 'a'));
  }
  if ((code >= 0x1B) && (code <= 0x24)) {
    return ((unsigned char)(code - 0x1B + '0'));
  }

  switch (code) {
  case 0x25:
    return ('_');
  case 0x26:
    return ('-');
  case 0x27:
    return ('!');
  case 0:
    return ((unsigned char)0);   /* so the rest of the string is printable */
  default:
    return ((unsigned char)(code - 1));
  }
}

/*
 * Helper for encode_word(): put the 6-bit type tag into the top bits of
 * code[0], then add the time value into every word of the code that
 * passes the width test below.
 *
 * NOTE(review): the test compares a shift count in bits against
 * sizeof(time_t) in bytes, so in practice only the last word (shift 0)
 * receives the value.  decode_word() applies the same test.
 */
static void store_time_code(unsigned int *code, unsigned int type, time_t stamp)
{
  int w;

  code[0] = (type << (sizeof(unsigned int)*8 - 6));
  for (w = 0; w < WORD_ENCODE_WIDTH; w++) {
    if (((WORD_ENCODE_WIDTH - w - 1)*sizeof(unsigned int)*8) < sizeof(time_t)) {
      code[w] += (unsigned int)(stamp >> ((WORD_ENCODE_WIDTH - w - 1)*sizeof(unsigned int)*8));
    }
  }
}

/*
 * Pack the token s into code[0 .. WORD_ENCODE_WIDTH-1] according to
 * field_type.  Returns 0 on success and -1 when a date/time token yields
 * a timestamp of -1 or time_t is too wide for the code.
 *
 * Layout (extended, 3 unsigned ints): the most significant 6 bits of
 * code[0] hold the type; characters follow at CHARACTER_ENCODE_WIDTH bits
 * each (see encode_char), wrapping across int boundaries:
 *
 *   tttttt 111111 222222 333333 444444 55 = 32 bits
 *   5555 666666 777777 888888 999999 0000
 *   00 111111 222222 333333 444444 555555
 *
 *   = 15 characters, 6 bits of type
 *
 * The original 2-int system used 5 bits per character:
 *
 *   tt ccccc ccccc ccccc ccccc ccccc ccccc = 32 bits
 *   tt ccccc ccccc ccccc ccccc ccccc ccccc
 *
 *   = 12 characters, 4 bits of type
 *
 * Type codes (both systems):
 *   0x0 Body   0x1 Text Location   0x2 Subject   0x3 Source
 *   0x4 Date   0x5 Time            0x6 Day
 *
 * Text fields (body, location, subject, source): characters are packed
 * until the string ends or the code is full; extra characters are dropped.
 *
 * Date, time and day fields: s is handed to parsedate() and the resulting
 * unixtime is stored as a number instead of characters:
 *   date - the timestamp itself
 *   time - timestamp modulo 86400 (seconds into the day)
 *   day  - (timestamp modulo 604800) / 86400, i.e. whole days since the
 *          start of a 7-day period counted from timestamp 0
 *          (decode_word() prints 0 as "Thursday")
 * NOTE(review): the result of parsedate() is dereferenced without a NULL
 * check - confirm it can never return NULL.
 *
 * Any other field_type leaves the code all zero and returns 0.
 */
int encode_word(unsigned char *s,
                unsigned int * code,
                enum Field_Types field_type)
{
  const time_t SECONDS_IN_DAY = 86400;
  const time_t SECONDS_IN_WEEK = 604800;
  const int int_bits = (int)(sizeof(unsigned int)*8);
  struct parsedate *pd;
  time_t timestamp;
  unsigned int type = 0;
  unsigned int finished;
  int pos, word, shift;

  for (word = 0; word < WORD_ENCODE_WIDTH; word++) {
    code[word] = (unsigned int)0;
  }

  switch (field_type) {
  case BODY_FIELD:     type = 0; break;
  case LOCATION_FIELD: type = 1; break;
  case SUBJECT_FIELD:  type = 2; break;
  case SOURCE_FIELD:   type = 3; break;
  case DATE_FIELD:     type = 4; break;
  case TIME_FIELD:     type = 5; break;
  case DAY_FIELD:      type = 6; break;
  default:             break;
  }

  /* Date-like fields are stored as a number, not as packed characters. */
  if ((field_type == DATE_FIELD) || (field_type == TIME_FIELD) ||
      (field_type == DAY_FIELD)) {
    pd = parsedate(s);
    timestamp = (time_t)pd->unixtime;
    if ((timestamp == -1) ||
        (sizeof(time_t) >= WORD_ENCODE_WIDTH*sizeof(unsigned int))) {
      return(-1);
    }

    if (field_type == TIME_FIELD) {
      timestamp = timestamp % SECONDS_IN_DAY;
    }
    else if (field_type == DAY_FIELD) {
      timestamp = (long)((timestamp % SECONDS_IN_WEEK) / SECONDS_IN_DAY);
    }

    store_time_code(code, type, timestamp);
    return(0);
  }

  if ((field_type == BODY_FIELD) || (field_type == LOCATION_FIELD) ||
      (field_type == SUBJECT_FIELD) || (field_type == SOURCE_FIELD)) {

    code[0] = type << (int_bits - 6);

    /* bit position of the first character, just below the type bits */
    shift = int_bits - 6 - CHARACTER_ENCODE_WIDTH;
    pos = 0;
    finished = 0;

    for (word = 0; word < WORD_ENCODE_WIDTH; word++) {
      /* characters that still start inside this int */
      while ((shift >= 0) && !finished) {
        if (s[pos] == 0) {
          finished = 1;          /* end of string: add nothing more */
        }
        else {
          code[word] += (encode_char(s[pos]) << shift);
          shift -= CHARACTER_ENCODE_WIDTH;
          pos++;
        }
      }

      if (!finished) {
        /* The next character straddles the boundary: its high bits go
           here.  pos is not advanced, so the next int encodes the same
           character again and the left shift there keeps only the low
           bits. */
        code[word] += (encode_char(s[pos]) >> -shift);
        shift += int_bits;
      }
    }
  }

  return(0);
}

/*
 * Extract the field type from the first int of a word code.
 *
 * The type tag is the most significant 6 bits of code; tags 0..6 map to
 * BODY, LOCATION, SUBJECT, SOURCE, DATE, TIME and DAY respectively, and
 * every other tag yields ERROR_FIELD.
 */
enum Field_Types
word_type (unsigned int code)
{
  switch (code >> (sizeof(unsigned int)*8 - 6)) {
  case 0:  return(BODY_FIELD);
  case 1:  return(LOCATION_FIELD);
  case 2:  return(SUBJECT_FIELD);
  case 3:  return(SOURCE_FIELD);
  case 4:  return(DATE_FIELD);
  case 5:  return(TIME_FIELD);
  case 6:  return(DAY_FIELD);
  default: return(ERROR_FIELD);
  }
}

/*
 * Fill str with a printable representation of code and return 0.
 *
 * The type tag is read from the top 6 bits of code[0]:
 *   4, 5 (date, time) - the stored number is formatted with ctime() and
 *                       followed by ": DATE" or ": TIME"
 *   6 (day)           - the stored number 0..6 is printed as a weekday
 *                       name, 0 being Thursday, followed by ": DAY"
 *   anything else     - the packed characters are unpacked with
 *                       decode_char(); types 1, 2 and 3 then get the
 *                       suffix ": LOCATION", ": SUBJECT" or ": SOURCE"
 *
 * A day number outside 0..6, or a time that ctime() cannot format, is
 * printed as "Unknown" rather than leaving str unwritten before the
 * suffix is appended.
 *
 * str must be large enough for the result; no length is passed in, so
 * this function cannot check it.
 */
int decode_word(unsigned int * code,
                char *str)
{
  /* indexed by the stored day number: day 0 is printed as Thursday */
  static const char *day_names[7] = {
    "Thursday", "Friday", "Saturday", "Sunday",
    "Monday", "Tuesday", "Wednesday"
  };
  int i, j, offset;
  unsigned int d, carryoverd, type, mask;
  time_t timestamp;
  const char *text;
  size_t len;

  type = code[0] >> (sizeof(unsigned int)*8 - 6);
  timestamp = (time_t)0;

  if ((type == 4) || (type == 5) || (type == 6)) {
    /* Pick up the stored number using the same width test as
       encode_word(); the last matching int wins. */
    for (i = 0; i < WORD_ENCODE_WIDTH; i++) {
      if (((WORD_ENCODE_WIDTH - i - 1)*sizeof(unsigned int)*8) < sizeof(time_t)) {
        timestamp = code[i];
      }
    }
  }

  if ((type == 4) || (type == 5)) {         /* Dates & Times are strange */
    text = ctime(&timestamp);
    if (text == NULL) {
      strcpy(str, "Unknown:");
    }
    else {
      strcpy(str, text);
      len = strlen(str);
      if ((len > 0) && (str[len-1] == '\n')) {
        str[len-1] = ':';                   /* get rid of the \n */
      }
      else {
        strcat(str, ":");
      }
    }
    strcat(str, (type == 4) ? " DATE" : " TIME");
  }

  else if (type == 6) {   /* it's a day */
    if ((timestamp >= 0) && (timestamp < 7)) {
      strcpy(str, day_names[(int)timestamp]);
    }
    else {
      strcpy(str, "Unknown");
    }
    strcat(str, ": DAY");
  }

  else {   /* not a date or time */
    j = 0;
    carryoverd = 0;
    mask = 0xffffffff >> (8 * 4 - CHARACTER_ENCODE_WIDTH);  /* a mask for one packed character width */
    offset = sizeof(unsigned int)*8 - 6 - CHARACTER_ENCODE_WIDTH;
    for(i=0; i < WORD_ENCODE_WIDTH; i++) {
      while(offset >= 0) {
        /* carryoverd holds the high bits of a character that started in
           the previous int */
        d = ((code[i] >> offset) & mask) + carryoverd;
        carryoverd = 0;
        if (d) {
          str[j] = decode_char(d);
          j++;
        }
        offset -= CHARACTER_ENCODE_WIDTH;
      }
      /* remaining low bits of this int are the top of the next character */
      carryoverd = ((code[i] << -offset) & mask);
      offset = sizeof(unsigned int)*8 + offset;
    }
    str[j] = 0;

    if (type == 1) {
      strcat(str, ": LOCATION");
    }
    if (type == 2) {
      strcat(str, ": SUBJECT");
    }
    if (type == 3) {
      strcat(str, ": SOURCE");
    }
  } /* not a date */

  return(0);
}

/*
 * Three-way comparison of two word codes.
 *
 * The codes are compared int by int, as unsigned values, from index 0 up
 * to WORD_ENCODE_WIDTH-1; the first differing int decides.  Returns -1 if
 * code1 orders before code2, 1 if after, and 0 if all ints are equal.
 */
int wordcode_cmp(Wordcode code1,
                 Wordcode code2)
{
  const unsigned int *lhs = (unsigned int *)code1;
  const unsigned int *rhs = (unsigned int *)code2;
  int k;

  for (k = 0; k < WORD_ENCODE_WIDTH; k++) {
    if (lhs[k] != rhs[k]) {
      return (lhs[k] < rhs[k]) ? -1 : 1;
    }
  }
  return 0;
}




/*
 * Signed distance between two word codes.
 *
 *   - Codes of different field types: 1.0.
 *   - Body, location, subject and source codes: -1.0, 0.0 or 1.0 from an
 *     int-by-int unsigned comparison (same ordering as wordcode_cmp).
 *   - Date codes: 0.0 when the stored values are equal, otherwise
 *     +/-(1 - 2^(-gap / 0x152ac)), where gap is the absolute difference
 *     of the stored values; positive when code1 holds the larger value,
 *     negative otherwise.  The magnitude rises from 0 towards 1 and is
 *     0.5 when gap == 0x152ac.
 *   - Every other type (including time and day codes): -1.0.
 *
 * The date value is read from the last int of the code, which is where
 * encode_word() stores it.  The gap is computed as larger - smaller in
 * unsigned arithmetic; passing the unsigned difference to abs() gave a
 * wrong magnitude once the gap no longer fitted in an int.
 *
 * NOTE(review): 0x152ac is 86700 seconds (24 h 5 min), while the original
 * comment describes the half-way point as 24 hours (86400 = 0x15180).
 * The constant is kept unchanged so that scores are not altered.
 */
double wordcode_diff(Wordcode code1,
                     Wordcode code2) /* distance func */
{
  const double half_point = (double)0x152ac;
  const int last = WORD_ENCODE_WIDTH - 1;
  int j;
  double gap;
  enum Field_Types wt1, wt2;

  wt1 = word_type(code1[0]);
  wt2 = word_type(code2[0]);
  if (wt1 != wt2) {
    return(1.0);
  }

  switch (wt1)
    {
    case BODY_FIELD:
    case LOCATION_FIELD:
    case SUBJECT_FIELD:
    case SOURCE_FIELD:
      for(j=0; j<WORD_ENCODE_WIDTH; j++) {
        if (code1[j] < code2[j])
          return -1.0;
        else if (code1[j] > code2[j])
          return 1.0;
      }
      return 0.0;

    case DATE_FIELD:
      if (code1[last] == code2[last]) {
        return (0.0);
      }
      if (code1[last] > code2[last]) {
        gap = (double)(code1[last] - code2[last]);
        return (1.0 - pow(2, -gap / half_point));
      }
      gap = (double)(code2[last] - code1[last]);
      return (-(1.0 - pow(2, -gap / half_point)));

    default:
      return(-1.0);
    }
}




