#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <savant.h>
#include <template.h>

/* Forward declarations for the functions defined in this file. */

/* Collect the indexable files under a path, skipping excluded names. */
List_of_Files *find_files(char *, char **);
/* Non-zero when the start of the named file is not mostly printable text. */
int is_bin_file(char *);
/* Split one file into documents according to the template it matches. */
Processed_File *process_file(char *);
/* Pick the template whose "recognize" pattern matches earliest in the file. */
Template *recognize(FILE *);
/* Match a pattern against the file; returns the first hit offset or -1. */
ssize_t match_pattern(FILE *, Pattern *, ssize_t, List_of_Docs *, int);
/* Find a literal string in the file; returns its offset or -1. */
ssize_t string_offset(FILE *, char *, size_t, size_t, int, int, int);
/* Expand ^X and backslash escapes from the first string into the second. */
char *convert_escapes(char *, char *);
/* Non-zero when the string equals an entry of the NULL-terminated list. */
int string_present(char *, char **);

/* Allocate one list cell holding the processed form of `path`.
   Terminates the program if memory cannot be obtained. */
static List_of_Files *ff_new_cell(char *path)
{
  List_of_Files *cell = malloc(sizeof *cell);

  if(cell == NULL) {
    fprintf(stderr,"find_files: Out of memory\n");
    fflush(stderr);
    exit(1);
  }
  cell->file = process_file(path);
  cell->next = NULL;
  return(cell);
}

/* Print the verbose "not text" line for `shortname`, padded to 20 columns. */
static void ff_report_not_text(char *shortname)
{
  int i;

  printf("  %s:", shortname);
  for(i=strlen(shortname); i<20; i++) {
    fputc(' ', stdout);
  }
  printf("not text, ignoring.\n");
  fflush(stdout);
}

/* True for editor droppings: names starting with '#' or ending in '~'. */
static int ff_is_backup_name(const char *name)
{
  size_t len = strlen(name);

  return((name[0] == '#') || (len > 0 && name[len-1] == '~'));
}

/* Build the list of files to index starting at `sourcename`.
 *
 * sourcename: a file or directory path.  Directories are walked
 *             recursively.
 * excludees:  NULL-terminated array of paths; any path that compares
 *             equal to one of them is skipped.
 *
 * Returns the head of a newly allocated List_of_Files (one cell per
 * accepted file, each filled in by process_file()), or NULL when nothing
 * was accepted.  Files judged binary by is_bin_file(), names starting with
 * '#' or ending in '~', and unreadable files are skipped.  A dot-prefixed
 * final component of `sourcename` itself is skipped unless
 * Config.index_dotfiles is set.
 *
 * NOTE(review): regular dotfiles met while scanning a directory are NOT
 * filtered by Config.index_dotfiles (only dot-directories are, through the
 * recursive call).  This matches the previous behaviour; confirm whether
 * it is intended.
 */
List_of_Files *find_files(char *sourcename,
                          char **excludees)
{
  char filename[256], *shortname;
  struct stat buf;
  List_of_Files *head=NULL, *tail=NULL, *found;
  FILE *file;
  DIR *directory;
  savant_direct *entry;
  size_t srclen;
  int need_slash, is_dir, is_dot, n;

  if(string_present(sourcename, excludees)) {
    if(SavantVerbose) {
      printf("Excluding %s.\n", sourcename);
      fflush(stdout);
    }
    return(NULL);
  }

  /* Last path component; a path without '/' is its own short name
     (strrchr returns NULL there, so it must not be offset blindly). */
  shortname = strrchr(sourcename, '/');
  shortname = (shortname != NULL) ? shortname+1 : sourcename;

  if(ff_is_backup_name(shortname) || /* ignore *~ and #*, possibly .* */
     (shortname[0] == '.' && !Config.index_dotfiles)) {
    if(SavantVerbose) {
      printf("  Ignoring %s.\n", shortname);
      fflush(stdout);
    }
    return(NULL);
  }

  if(NULL == (directory = opendir(sourcename))) {
    /* not a directory: treat it as a single file */
    if((file = fopen(sourcename, "r")) == NULL) {
      fprintf(stderr,"find_files: Unable to open %s\n",sourcename);
      fflush(stderr);
      return(NULL);
    }
    fclose(file);
    if(is_bin_file(sourcename)) {
      if(SavantVerbose) {
        ff_report_not_text(shortname);
      }
      return(NULL);
    }
    return(ff_new_cell(sourcename));
  }

  /* sourcename is a directory, recurse */
  if(SavantVerbose) {
    printf("Searching %s:\n", sourcename);
    fflush(stdout);
  }

  srclen = strlen(sourcename);
  need_slash = (srclen == 0) || (sourcename[srclen-1] != '/');

  while((entry = readdir(directory)) != NULL) {
    shortname = entry->d_name;
    is_dot = (strcmp(shortname,".") == 0) || (strcmp(shortname,"..") == 0);

    /* snprintf always terminates and reports truncation, unlike the
       strncpy/strcat combination, which could overrun the buffer */
    n = snprintf(filename, sizeof(filename), "%s%s%s",
                 sourcename, need_slash ? "/" : "", shortname);
    if(n < 0 || (size_t)n >= sizeof(filename)) {
      if(!is_dot) {
        fprintf(stderr,"find_files: Path too long, skipping %s%s%s\n",
                sourcename, need_slash ? "/" : "", shortname);
        fflush(stderr);
      }
      continue;
    }

    if(string_present(filename, excludees)) {
      if(SavantVerbose) {
        printf("  Excluding %s.\n", shortname);
        fflush(stdout);
      }
      continue;
    }

    if(is_dot) {
      /* ignore . and .. */
      continue;
    }

    /* if stat fails the entry is handled as a plain file, so the
       fopen below reports the problem */
    is_dir = (stat(filename,&buf) == 0) &&
             ((buf.st_mode & S_IFMT) == S_IFDIR);

    if(is_dir) { /* recurse into subdir and splice its list onto ours */
      found = find_files(filename, excludees);
      if(head == NULL) {
        head = found;
      }
      else {
        tail->next = found;
      }
      if(found != NULL) {
        tail = found;
        while(tail->next != NULL) {
          tail = tail->next;       /* shuffle to end of list */
        }
      }
      continue;
    }

    if((file = fopen(filename, "r")) == NULL) {
      fprintf(stderr,"find_files: Unable to open %s\n",filename);
      fflush(stderr);
      continue;
    }
    fclose(file); /* normal file */

    if(is_bin_file(filename)) {
      if(SavantVerbose) {
        ff_report_not_text(shortname);
      }
      continue;
    }

    if(ff_is_backup_name(shortname)) { /* ignore *~ and #* */
      if(SavantVerbose) {
        printf("  Ignoring %s.\n", shortname);
        fflush(stdout);
      }
      continue;
    }

    found = ff_new_cell(filename);
    if(head == NULL) {
      head = found;
    }
    else {
      tail->next = found;
    }
    tail = found;
  }

  if(SavantVerbose) {
    printf("Finished %s.\n", sourcename);
    fflush(stdout);
  }
  closedir(directory);

  return(head);
}

/* Guess whether a file is binary.
 *
 * Reads up to 2048 bytes from the start of `filename` and counts the
 * bytes that are printable or whitespace.  Returns 0 (text) when the
 * integer percentage of such bytes is above 95, and 1 (binary)
 * otherwise, including for an empty file.  Terminates the program if
 * the file cannot be opened.
 */
int is_bin_file(char *filename)
{
  /* For now, just read in a few K and see if 95% is
     printable (there may be a better way, but this is fast) */
  unsigned char buf[2048];   /* unsigned: <ctype.h> needs 0..UCHAR_MAX */
  FILE *file;
  size_t i, max, printables=0;

  if((file = fopen(filename,"r")) == NULL) {
    fprintf(stderr,"is_bin_file: Cannot open %s\n",filename);
    fflush(stderr);
    exit(1);
  }

  max = fread(buf, sizeof(buf[0]), sizeof(buf), file);

  fclose(file);

  for(i=0; i<max; i++) {
    if(isprint(buf[i]) || isspace(buf[i])) {
      printables++;
    }
  }

  if((max > 0) && (100*printables/max > 95))
    return(0);
  else
    return(1);
}

Processed_File *process_file(char *filename)
{
  int i;
  size_t prev_filepos;
  char *short_name;
  /*char *delimiter;*/
  ssize_t parse_limit;
  FILE *file;
  Template *template;
  List_of_Docs *doc, *prev_doc=NULL;
  Processed_File *processed = malloc(sizeof(Processed_File));

  short_name = strrchr(filename, '/') + 1;
  processed->filename = strdup(filename);
  processed->docs = NULL;

  if((file = fopen(filename,"r")) == NULL) {
    fprintf(stderr,"process_file: Cannot open %s\n",filename);
    fflush(stderr);
    exit(-1);
  }

  if(SavantVerbose) {
    printf("  %s:", short_name);
    for(i=strlen(short_name); i<20; i++) {
      fputc(' ', stdout);
    }
    fflush(stdout);
  }

  template = recognize(file);

  if (template == NULL) {  /* if unrecognized */
    if (SavantVerbose) {
      printf("not recognized, using default: 1 document\n");
      fflush(stdout);
    }
    processed->num_docs = 1;
    /* default biases are 1 for all */
    for(i=0; i<NUM_FIELD_TYPES; i++) {
      processed->biases[i] = 1;
    }
    processed->docs = (List_of_Docs *)malloc(sizeof(List_of_Docs));
    processed->docs->doc_start = processed->docs->body_start = 0;
    processed->docs->subject[0] = processed->docs->subject[1] =
      processed->docs->date[0] = processed->docs->date[1] =
      processed->docs->location[0] = processed->docs->location[1] =
      processed->docs->source[0] = processed->docs->source[1] = -1;
    processed->docs->next = NULL;
/*
    fseek(file, 0, SEEK_END);
    processed->docs->body_length = ftell(file);
*/
    fclose(file);
    return(processed);
  }

  /* otherwise */
  if (SavantVerbose) {
    printf("%s, ",template->name);
    fflush(stdout);
  }
  
  /* propagate the bias values through to the processed file */
  for(i=0; i<NUM_FIELD_TYPES; i++) {
    processed->biases[i] = template->biases[i];
  }

  /* deal with "Reject" (NULL) format */
  if(template->format == NULL) {
    if (SavantVerbose) {
      printf("rejecting.\n");
      fflush(stdout);
    }
    processed->num_docs = 0;
    processed->docs = NULL;
    fclose(file);
    return(processed);
  }

  /* reset file */
  rewind(file);
  prev_filepos = 0;
  /* init first doc */
  processed->docs = doc = (List_of_Docs *)malloc(sizeof(List_of_Docs));
  doc->doc_start = doc->body_start = 
    doc->subject[0] = doc->subject[1] = 
    doc->location[0] = doc->location[1] =
    doc->date[0] = doc->date[1] = 
    doc->source[0] = doc->source[1] = -1;
  processed->num_docs = 0;
  /* this loop reads the format-specified variables for each doc in file
     at the end of the loop we'll have malloced one more doc than we need */
  parse_limit = -1;
  if (template->delimiter != NULL) {
    match_pattern(file, template->delimiter, -1, NULL, 1);  /* find and ignore the first one; */
    parse_limit = match_pattern(file, template->delimiter, -1, NULL, 1);  /* find and use the second one. */
  }
  /* reset file */
  rewind(file);
  prev_filepos = 0;
  while(((doc->doc_start = 
	  match_pattern(file, template->format, parse_limit, doc, 1)) != -1) ||
	(parse_limit != -1)) {
    if(doc->body_start == prev_filepos) {
      /* This happens if all template strings are optional and none are found.
	 It could be a real hit or a bogus one, and we should break out in any case */
      if(processed->num_docs == 0) { 
	/* then it's a genuine hit;  malloc a dummy doc for the clean-up to eat */
	processed->num_docs++;
	prev_doc = doc;
	doc->next = (List_of_Docs *)malloc(sizeof(List_of_Docs));
	doc = doc->next;
      }
      break; 
    } 
    if(doc->doc_start != -1) {
      /* doc_start == -1 is the case where match_pattern failed, but
	 parse_limit != -1, indicating the failure was a bad document
	 (parsing ran past parse_limit and there may still be more
	 good documents).  Only allocate the new document (as follows) 
	 in the case where doc_start != -1.  Note that this code only 
	 executes when delimiters are in use, since otherwise the loop
	 test would be failed. */
      processed->num_docs++;
      prev_doc = doc;
      doc->next = (List_of_Docs *)malloc(sizeof(List_of_Docs));
      doc = doc->next;
      doc->doc_start = doc->body_start = 
	doc->subject[0] = doc->subject[1] = 
	doc->location[0] = doc->location[1] =
	doc->date[0] = doc->date[1] = 
	doc->source[0] = doc->source[1] = -1;
    }
    if (template->delimiter != NULL) {
      prev_filepos = parse_limit;
      fseek(file, parse_limit+1, SEEK_SET);
      parse_limit = match_pattern(file, template->delimiter, -1, NULL, 1);  /* find and use the second one. */
      fseek(file, prev_filepos, SEEK_SET);
    }
    else {
      prev_filepos = ftell(file);
    }
  }
  
  if(prev_doc != NULL) {
    doc = prev_doc;  /* the last cell in the list is empty */
    free(doc->next);
    doc->next = NULL;
  } 
  else {
    free(doc);
    doc = NULL;
  }
/*
  fseek(file, 0, SEEK_END);
  if (prev_doc)
  prev_doc->body_length = ftell(file) - prev_doc->body_start;
*/
  if (SavantVerbose) {
    if(processed->num_docs == 1) 
      printf("1 document\n");
    else
      printf("%d documents\n", processed->num_docs);
    fflush(stdout);
  }
  
  fclose(file);
  return(processed);
}

Template *recognize(FILE *file)
{
  List_of_Templates *templates=All_Templates;
  Template *best_guess = NULL;
  ssize_t best_offset=-1, this_offset;

  while(templates != NULL) {
    this_offset = match_pattern(file, templates->template->recognize, -1, NULL, 1);
    if(this_offset == 0) {
      return(templates->template);
    }
    if ((this_offset != -1) &&      /* success */
	((best_offset == -1) || (best_offset > this_offset))) {
      best_offset = this_offset;
      best_guess = templates->template;
    }
    rewind(file);
    templates = templates->next;
  }
    
  return(best_guess);
}
    
/* Match the token list `format` against `file`, starting at the current
 * file position.
 *
 * file:        stream being parsed; its position is moved by the match.
 * format:      linked list of pattern tokens, walked through ->next and
 *              dispatched on token->ID.
 * parse_limit: passed to string_offset() as the `stop` offset for literal
 *              searches; -1 means no limit.
 * doc:         record that receives field offsets/lengths when the
 *              pattern contains SUBJECT/LOCATION/DATE/SOURCE/BODY tokens.
 *              NOTE(review): those cases dereference doc unconditionally,
 *              while some callers in this file pass NULL - presumably
 *              their patterns never contain field tokens; confirm.
 * new_match:   1 for a top-level call (resets the static matcher state),
 *              0 for the recursive calls made for nested PATTERN tokens.
 *
 * Returns first_hit (the lowest literal offset found, or the position
 * where the first field token was met), or -1 when the pattern fails or
 * nothing was found.
 *
 * The matcher keeps its state in function-static variables shared by the
 * recursive calls, so it is not reentrant.
 */
ssize_t match_pattern(FILE *file,
		      Pattern *format,
		      /* Pattern *delimiter,*/
		      ssize_t parse_limit,
		      List_of_Docs *doc,
		      int new_match)
{
  /* State shared across the recursion of one top-level match:
       ret_from_anyorder   - set when a call that saw ANYORDER/ANYOF ends
       ignore              - 1: the next literal may be found further on;
                             0: it must sit at the current position
       optionals_not_found - set when a call finishes without any hit
       first_hit           - value eventually returned
       anyorder_lasthit    - highest literal offset found so far
       var_start/var_length- start of the field currently being captured
                             and where to store its length */
  static int ret_from_anyorder, ignore, optionals_not_found;
  /* per-call modifier flags, set by the tokens seen in this call */
  int anyof=0, icase=0, startline=0, immediate;
  static ssize_t first_hit, anyorder_lasthit, var_start;
  static DB_INT *var_length;
  /* file positions remembered for rewinding (-1 = not set) */
  ssize_t anyorder_pos = -1, optional_pos = -1, literal_pos, prev_filepos;

  if(new_match == 1) { /* initialize statics */
    ret_from_anyorder = optionals_not_found = 0;
    var_start = -1;
    var_length = NULL;
    ignore = 1;
    first_hit = anyorder_lasthit = -1; 
  }

  while(format != NULL) {
    switch(format->token->ID) {
    case STARTLINE:
      /* handed to string_offset for the next literal; cleared once a
         literal matches */
      startline = 1;
      break;
    case IGNORE:
      ignore = 1;
      break;
    case ANYORDER:
      /* remember where the group starts; the file is moved back here
         after each item so the items may appear in any order */
      anyorder_pos = ftell(file);
      ignore = 1;
      break;
    case ANYOF:
      /* alternatives: the call returns as soon as one item matches */
      anyorder_pos = ftell(file);
      anyof = 1;
      break;
    case ICASE:
      /* stays set for the rest of this call */
      icase = 1;
      break;
    case OPTIONAL:
      /* position to rewind to if the following item is not found */
      optional_pos = ftell(file);
      break;
    case PATTERN:
      /* nested sub-pattern: recurse without resetting the statics */
      if((match_pattern(file, format->token->ptr.pat, parse_limit, doc, 0) == -1) && 
	 (optionals_not_found == 0)) { 
	if (optional_pos == -1) { 
	/* if the recursion fails, and it was not optional, also fail */
	  return(-1);
	}
	else {
	  fseek(file, optional_pos, SEEK_SET);
	}
      }
      if (anyof == 1) {    /* found one */
	return(first_hit);
      } 
      optional_pos = -1;
      if(ret_from_anyorder == 1) { /* if the recursion set this, */
	fseek(file, anyorder_lasthit, SEEK_SET);
	ret_from_anyorder = 0;          /* then ignore text until next item */
	ignore = 1;
      }
      if (anyorder_pos != -1) {
	fseek(file, anyorder_pos, SEEK_SET);
	ignore = 1;
      }      
      break;
    /* Field tokens.  The fall-through is intentional: the first case
       entered sets var_start (when no field is already open), which makes
       every later "var_start == -1" test false, so only that one field is
       recorded.  BODY opens no variable; it only records body_start. */
    case SUBJECT:  /* these five cases are handled together (no breaks)*/
      if(var_start == -1) {
	var_start = doc->subject[0] = ftell(file);
	var_length = &(doc->subject[1]);
      }
    case LOCATION:  
      if(var_start == -1) {
	var_start = doc->location[0] = ftell(file);
	var_length = &(doc->location[1]);
      }
    case DATE:
      if(var_start == -1) {
	var_start = doc->date[0] = ftell(file);
	var_length = &(doc->date[1]);
      }
    case SOURCE:
      if(var_start == -1) {
	var_start = doc->source[0] = ftell(file);
	var_length = &(doc->source[1]);
      }
    case BODY:
      if(var_start == -1) {
	doc->body_start = ftell(file);
      }

/* SUBJECT, LOCATION, DATE, SOURCE, and BODY all hit here */
      if(first_hit == -1) { /* this is the first thing found */
	first_hit = ftell(file);
      }
      if (anyof == 1) {    /* found one */
	return(first_hit);
      }
      /* the field's text is skipped: the next literal may be anywhere ahead */
      ignore = 1;
      break;
    case LITERAL:
      prev_filepos = ftell(file);
      /* only an optional literal that must sit exactly here is searched
         in "immediate" mode (string_offset gives up on the first
         mismatching character) */
      immediate = (ignore == 0) && (anyof == 0) && (optional_pos != -1);
      literal_pos = string_offset(file, format->token->ptr.str, 
				  prev_filepos, parse_limit, startline, icase,
				  immediate);
      if ((literal_pos == -1) || /* not found or not found immediately */
	  ((literal_pos != prev_filepos) && (ignore == 0))) { 
	if (optional_pos != -1) {
	  /* optional item missing: rewind and carry on */
	  fseek(file, optional_pos, SEEK_SET);
	  optional_pos = -1;
	} 
	else if (anyof == 1) {
	  /* this alternative failed: rewind and try the next one */
	  if (anyorder_pos != -1) {
	    fseek(file, anyorder_pos, SEEK_SET);
	  } else {
	    fseek(file, prev_filepos, SEEK_SET);
	  }
	} 
	else {
	  return(-1);
	}
      }
      else {
	startline = ignore = 0;
	optional_pos = -1;
	if((first_hit == -1) || (first_hit > literal_pos)) { 
	  /* first string found, or best so far */
	  first_hit = literal_pos;
	}
	if(literal_pos > anyorder_lasthit) {
	  anyorder_lasthit = literal_pos;
	}
	if(var_start != -1) {
	  /* this literal ends the open field: store its length */
	  (*var_length) = literal_pos - var_start;
	  var_length = NULL;
	  var_start = -1;
	}
	if (anyorder_pos != -1) {
	  fseek(file, anyorder_pos, SEEK_SET);
	}
	if (anyof == 1) {
	  return(first_hit);
	}
      }
      break;
    default:
      break;
    }
    format = format->next;
  }
  
  if(anyorder_pos != -1) { /* set ret_from_anyorder */
    ret_from_anyorder = 1;
  }
  if((first_hit == -1) && (anyof == 0)) {  /* at this point, only possible if */
    optionals_not_found = 1;        /* everything was optional and not found */
  }

  return(first_hit);
}


/* Expand escape notation from `src` into `dest` and return `dest`.
 *
 *   ^^        -> a literal '^'
 *   ^X        -> control character (X & 0x1f)
 *   \r \n \t \b \\  -> the usual C meanings
 *   \<other>  -> that character, unchanged
 *   anything else is copied as is.
 *
 * A '^' or '\' that is the last character of `src` is copied literally
 * instead of consuming the terminator.  The output is never longer than
 * the input, so `dest` must hold at least strlen(src)+1 bytes; the
 * function cannot check this.
 */
char *convert_escapes(char *src, char *dest)
{
  int sptr, dptr;

  for(sptr=dptr=0; src[sptr]!='\0'; sptr++, dptr++) {
    switch(src[sptr]) {
    case '^':
      if(src[sptr+1] == '\0') {
        /* dangling '^': do not step past the terminator */
        dest[dptr] = '^';
        break;
      }
      sptr++;
      if(src[sptr] == '^')
        dest[dptr] = '^';
      else
        dest[dptr] = 0x1f & src[sptr];
      break;
    case '\\':
      if(src[sptr+1] == '\0') {
        /* dangling backslash: do not step past the terminator */
        dest[dptr] = '\\';
        break;
      }
      sptr++;
      switch(src[sptr]) {
      case 'r':
        dest[dptr] = '\r';
        break;
      case 'n':
        dest[dptr] = '\n';
        break;
      case 't':
        dest[dptr] = '\t';
        break;
      case 'b':
        dest[dptr] = '\b';
        break;
      case '\\':
        dest[dptr] = '\\';
        break;
      default:
        /* unknown escape: keep the character so the output byte is
           always defined */
        dest[dptr] = src[sptr];
        break;
      }
      break;
    default:
      dest[dptr] = src[sptr];
      break;
    }
  }
  dest[dptr] = '\0';
  return(dest);
}
  
/* Search `file` for the literal `str`.
 *
 * file:      stream to read; scanning starts at its current position
 *            (or at start-1 when stline is set and start > 0).
 * str:       literal to find; escapes are expanded with convert_escapes().
 * start:     file offset the caller is positioned at; base of the result.
 * stop:      give up once the candidate offset reaches this value
 *            ((size_t)-1 means no limit).
 * stline:    1 to require the literal at the start of a line: a '\n' is
 *            prepended to the pattern, and offset 0 counts as following
 *            a newline.
 * icase:     1 for a case-insensitive comparison.
 * immediate: non-zero to fail on the first mismatching character instead
 *            of searching further.
 *
 * Returns the offset computed for the match, or -1 when the literal is
 * not found, the limit is reached, or the pattern is too long for the
 * internal buffer.
 *
 * NOTE(review): on a mismatch only the current character is retried
 * against the pattern start, so a pattern with a repeated prefix (e.g.
 * "aab" in "aaab") can be missed.  Left unchanged.
 */
ssize_t string_offset(FILE *file,
                      char *str,
                      size_t start,
                      size_t stop,
                      int stline,
                      int icase,
                      int immediate)
{
  int ptr;
  int data;            /* int, so every byte value is distinct from EOF */
  char real_str[512];
  size_t return_offset;

  /* room is needed for the optional leading '\n' and the terminator */
  if(strlen(str) + 2 > sizeof(real_str)) {
    return(-1);
  }

  return_offset = start;

  if(stline == 1) {
    real_str[0] = '\n';
    convert_escapes(str, &(real_str[1]));
    if(start == 0) {
      data = '\n';     /* beginning of file counts as a line start */
    }
    else {
      fseek(file, start-1, SEEK_SET);
      data = fgetc(file);
    }
  }
  else {
    convert_escapes(str, real_str);
    data = fgetc(file);
  }

  if(icase == 1) {
    for(ptr=0; real_str[ptr]!='\0'; ptr++)
      real_str[ptr] = tolower((unsigned char)real_str[ptr]);
  }

  ptr = 0;
  while(data != EOF) {
    if (icase == 1) {
      data = tolower(data);
    }
    /* fgetc yields 0..255, so compare against the pattern byte as
       unsigned char */
    if(data == (unsigned char)real_str[ptr]) {
      ptr++;
      if(real_str[ptr] == '\0')
        break;
    }
    else { /* failed character match */
      if(immediate) {
        return(-1);
      }
      return_offset += ptr + 1;
      ptr = 0;
      if(data == (unsigned char)real_str[ptr]) {
        return_offset--;
        ptr++;
      }
    }

    if(stop != (size_t)-1 &&
       return_offset != start) {
      /* second part of test (above) added to fix file-snarfing bug (1/98)
         In the case of a plain_email with none of the significant
         header lines, the search for "\n\n" begins at the "\nFrom" starting
         the document, causing the code in this block to signal a false
         negative. */
      if(return_offset >= stop) {
        return(-1);
      }
    }

    data = fgetc(file);
  }

  if(data == EOF) {
    return(-1);
  }
  else {
    return(return_offset);
  }
}

/* Report whether `string` equals any entry of `strings`.
 *
 * string:  the text to look for.
 * strings: array of C strings terminated by a NULL entry.
 *
 * Returns 1 on an exact (strcmp) match and 0 otherwise.  A NULL `string`
 * or a NULL `strings` list is treated as "not present".
 */
int string_present(char *string,
                   char **strings)
{
  char **candidate;

  if(string == NULL || strings == NULL) {
    return(0);
  }

  for(candidate = strings; *candidate != NULL; candidate++) {
    if(strcmp(string, *candidate) == 0) {
      return(1);
    }
  }

  return(0);
}
