/* WIDE AREA INFORMATION SERVER SOFTWARE
   No guarantees or restrictions.  See the readme file for the full standard
   disclaimer.    
   Brewster@think.com
*/

/* Looks up words in the inverted file index.
 * Please pardon my novice C code.
 *
 * -brewster
 */

/* Important functions:
 * run_search
 * search_for_words
 */

/* to do:
 *    handle the null request by answering something.
 *    answer questions that are just "help" and "?"
 *    Handle searches on multiple databases
 */
 
/* changes 5.2.90 HWM
	- changed calls to perror() to calls to panic()
	- made print_best_hits() only print hits w/ non-zero weight
	- made random arrays static instead of reading them in.  
	  removed getRandomArray.
	- removed unused variables
  Brewster 7/90 made look_up_word_in_dictionary safer.
  Brewster 7/90 eliminated trailing <lf> on filename and headline table accesses
  HWM 7.12.90 - replaced all calls to panic with error code returns and a log
                file  
	      - added the routine initSearchEngine() which should be called 
	        before any other search routine
	      - added beFriendly() to give other processes time under 
	        multifinder
  JG 5.31.91 - added relevance feedback for line fragments.
  JG 7.8.91  - added doc_id to search_for_words, removed scale_scores.
*/

#define _search_c

#include <ctype.h>

#include <string.h> 	/* for strlen() */
#ifdef THINK_C
#include <unix.h> 		/* for sleep() */
#endif /* think_c */

#include "cutil.h"
#include "irfiles.h"
#include "irlex.h"
#include "irext.h"
#include "irsearch.h"
#include "docid.h"
#include <math.h>

#define TEST_SEARCH 	false	/* set to TRUE to allow printing to console */

/*----------------------------------------------------------------------*/

static Boolean calcDocLength _AP((hit* theHit,long* lines,long* bytes));

/* Report the size, in lines and bytes, of the document a hit refers to.
 *
 *   theHit - the hit to measure; its number_of_lines, start_character,
 *            end_character and filename fields are read
 *   lines  - out: always set to theHit->number_of_lines
 *   bytes  - out: set to the document length in bytes
 *
 * Returns true when *bytes could be determined.  Returns false when the
 * file could not be opened, positioned at its end, or measured; in the
 * open/seek failure cases *bytes is left untouched, and when ftell()
 * fails *bytes holds -1.
 *
 * Original author's note: this is not needed by the serial search engine
 * (it stores these values in its dictionary); it is used by the dynamic
 * help facility.
 */
static Boolean
calcDocLength(theHit,lines,bytes)
hit* theHit;
long* lines;
long* bytes;
{
  FILE* file = NULL;
  Boolean measured = false;

  *lines = theHit->number_of_lines;

  if(theHit->end_character != 0)
    {
      /* document is only part of a file, so its extent is stored */
      *bytes = theHit->end_character - theHit->start_character;
      return(true);
    }

  /* document is the whole file: take the length from the file itself */
  file = s_fopen(theHit->filename, "r");
  if (file == NULL)
    return(false);		/* nothing was opened, so nothing to close;
				   whether s_fclose() tolerates NULL is not
				   visible from here, so do not rely on it */

  if ((s_fseek(file, 0L, SEEK_END) == 0) &&
      ((*bytes = ftell(file)) != -1))
    measured = true;		/* bytes is set */

  s_fclose(file);
  return(measured);
}




static long wordDelimiter _AP((long c));

/* Decide whether c separates words.
 *
 *   c - a character code; only its low 8 bits are examined
 *
 * Returns NOT_DELIMITER when the low byte of c is alphanumeric according
 * to isalnum(), IS_DELIMITER otherwise.
 */
static long wordDelimiter(c)
long c;
{ 
  /* The <ctype.h> functions require a value representable as unsigned
     char (or EOF).  Converting to plain char would hand them a negative
     number for bytes 0x80-0xFF wherever char is signed. */
  if (isalnum((unsigned char)(c & 0xFF)))
    return(NOT_DELIMITER);
  return(IS_DELIMITER);
}

/* Split `words` into tokens at every character wordDelimiter() reports as
 * a delimiter and run each token that has a dictionary entry through
 * search_word().
 *
 *   words  - the text to search for; NOTE that this buffer is written to
 *            while it is being tokenized and trimmed
 *   db     - the database to search
 *   doc_id - handed unchanged to search_word()
 *
 * Returns true if successful, false as soon as search_word() reports a
 * non-zero result.
 *
 * XXX could do something interesting to return feedback on which of the
 * seedwords was most/least important
 */
boolean search_for_words(words, db, doc_id)
     char* words;
     database *db;
     long doc_id;
{
  char* token = strtokf(words,wordDelimiter);

  while(token != NULL){
    long dict_entry;

    /* clip over-long tokens to MAX_WORD_LENGTH characters */
    if(strlen(token) > MAX_WORD_LENGTH)
      token[MAX_WORD_LENGTH] = '\0';

    /* only tokens with a positive dictionary value are searched for */
    dict_entry = look_up_word_in_dictionary(string_downcase(token), db);
    if(dict_entry > 0 &&
       search_word(token, 0L, 0L, 1L, doc_id, dict_entry, db) != 0)
      return(false);

    token = strtokf(NULL,NULL);
    beFriendly();
  }
  return(true);
}

/* Gets the next best hit from the search engine and fills in all the
 * slots of *the_best_hit.  If the document table entry for a hit cannot
 * be read, or its file fails probe_file(), the problem is logged and the
 * next hit is tried instead.
 *
 *   the_best_hit - out: receives document id, weight, character range,
 *                  length, line count, date, filename, type and headline
 *   db           - the database the hits came from
 *
 * Returns 0 when a hit was delivered, the non-zero result of best_hit()
 * when that call fails, and 1 when the best remaining weight is <= 0.
 */
long next_best_hit(the_best_hit, db)
     hit *the_best_hit;
     database *db;
{
  document_table_entry doc_entry;
  long ret_value;
  size_t headline_end;

  /* Position of the headline terminator: right after the copied text
     when the buffer has room for it, otherwise the buffer's last byte. */
  headline_end = sizeof(the_best_hit->headline) - 1;
  if (headline_end > (size_t)MAX_HEADLINE_LEN)
    headline_end = (size_t)MAX_HEADLINE_LEN;

  while(1){ /* keep going until we get a good document */
    if(0 != (ret_value = best_hit(&(the_best_hit->document_id), &(the_best_hit->weight))))
      return(ret_value);
    if(the_best_hit->weight <= 0)	/* if we are out of good stuff, return */
      return(1);
    /* fill in the rest of the hit */
    if (read_document_table_entry(&doc_entry,
				  the_best_hit->document_id,
				  db) 
	== true){
      the_best_hit->start_character = doc_entry.start_character;
      the_best_hit->end_character = doc_entry.end_character;
      the_best_hit->document_length = doc_entry.document_length;
      the_best_hit->number_of_lines = doc_entry.number_of_lines;
      /* NOTE(review): "%d" assumes doc_entry.date is an int; its
	 declaration is not visible here - confirm */
      sprintf(the_best_hit->date, "%d", doc_entry.date);
      read_filename_table_entry(doc_entry.filename_id, 
				the_best_hit->filename,
				the_best_hit->type,
				NULL,
				db);
      /* strncpy() writes no terminator when the source is at least
	 MAX_HEADLINE_LEN long, so terminate explicitly */
      strncpy(the_best_hit->headline, 
	      read_headline_table_entry(doc_entry.headline_id,db),
	      MAX_HEADLINE_LEN);
      the_best_hit->headline[headline_end] = '\0';
      if(probe_file(the_best_hit->filename))
	return(0);  /* we win */

      /* we lose: log it and move on to the next hit */
      waislog(WLOG_HIGH, WLOG_ERROR, 
	      "Dangling File %s in database %s.", 
	      the_best_hit->filename,
	      db->database_file);
    }
    else {
      waislog(WLOG_HIGH, WLOG_ERROR, 
	      "Error reading doc_table_entry for database %s, docid: %ld",
	      db->database_file,
	      the_best_hit->document_id);
    }
    beFriendly();
  }
}

/*----------------------------------------------------------------------*/

boolean run_search(aSearch, headers, diags, index_directory, 
		   seed_words_used, waisProtocolVersion, headerNum)
     SearchAPDU* aSearch;
     WAISDocumentHeader** headers; /* list of results */
     diagnosticRecord*** diags;  /* list of diagnostics */
     char *index_directory;
     char **seed_words_used;  /* called with enough space */
     long waisProtocolVersion;
     long *headerNum;
/* runs a search on the inverted file index and returns false if it errors 
   in such a way that it can not even make a diagnostic record 
   (should not happen).
   It changes headers with the replies or makes a diagnostic record
 */
{ 
  diagnosticRecord* diag = NULL;
  WAISSearch* wais_search = (WAISSearch*)aSearch->Query; /* for convenience */
  char* new_db_name = (aSearch->DatabaseNames == NULL) ?
    merge_pathnames(INFO_DATABASE_NAME, index_directory) : 
  merge_pathnames(aSearch->DatabaseNames[0], index_directory);
  char* dbName = new_db_name;
  database* db;
  long maxRawScore;
  long normalScore;
  char* originName = NULL;
  long i;
  query_parameter_type parameters;
  boolean search_result;

  db = openDatabase(new_db_name, false, true);
  if (db == NULL)
    { char msg[MAX_FILENAME_LEN * 2];
      strncpy(msg,"The following database is not available: ",
	      MAX_FILENAME_LEN);
      s_strncat(msg,new_db_name,MAX_FILENAME_LEN,MAX_FILENAME_LEN);
      diag = makeDiag(false,D_PermanentSystemError,msg);
      *diags = (diagnosticRecord **)s_realloc(*diags,(size_t)(sizeof(diagnosticRecord*) * 2));
      (*diags)[0] = diag;
      (*diags)[1] = NULL;
      return(false);
    }

  {
    DocObj** docs = NULL;

    /* read the query */
    docs = wais_search->Docs;
    if(docs != NULL) {
      if(docs[0] != NULL && docs[0]->Type != NULL) {
	long id = -1;
	if(strcmp(docs[0]->Type,"WAIS_NEXT") == 0)
	  id = next_docid(anyToString(GetLocalID(docIDFromAny(docs[0]->DocumentID))),
			  db);
	else if(strcmp(docs[0]->Type,"WAIS_PREV") == 0)
	  id = previous_docid(anyToString(GetLocalID(docIDFromAny(docs[0]->DocumentID))),
			      db);
	if (id > -1) {
	  document_table_entry doc_entry;
	  hit foo;
	  long lines,length;
	  DocID* theDocID = NULL;
	  char local_id[MAX_FILENAME_LEN + 60]; /* filename, start, end */
	  local_id[0] = '\0';

	  if (read_document_table_entry(&doc_entry, id, db) == true) {
	    foo.start_character = doc_entry.start_character;
	    foo.end_character = doc_entry.end_character;
	    foo.document_length = doc_entry.document_length;
	    foo.number_of_lines = doc_entry.number_of_lines;

	    read_filename_table_entry(doc_entry.filename_id, 
				      foo.filename,
				      foo.type,
				      NULL,
				      db),
	    strncpy(foo.headline, 
		    read_headline_table_entry(doc_entry.headline_id,db),
		    MAX_HEADLINE_LEN);
	    sprintf(foo.date, "%d", doc_entry.date);
	    sprintf(local_id, "%ld %ld %s", 
		    doc_entry.start_character,
		    doc_entry.end_character,
		    foo.filename);
		
	    if (calcDocLength(&(foo),&lines,&length))
	      {			/* this document is good, return it */
		char** type = NULL;
		
		if (waisProtocolVersion >= '2')
		  { type = (char**)s_malloc((size_t)(sizeof(char*) * 2));
		    type[0] = s_strdup(foo.type);
		    type[1] = NULL;
		  }
		else
		  type = NULL;

		theDocID = makeDocID();
		theDocID->originalDatabase = stringToAny(dbName);
		theDocID->originalLocalID = stringToAny(local_id);
		headers[(*headerNum)++] = 
		  makeWAISDocumentHeader(anyFromDocID(theDocID),
					 UNUSED,
					 -1L,
					 UNUSED,length,lines,
					 type,
					 s_strdup(dbName),
					 s_strdup(foo.date),
					 s_strdup(foo.headline),
					 NULL);
		headers[*headerNum] = NULL;
		freeDocID(theDocID);
		return(true);
	      }
	    else
	      { 
		waislog(WLOG_HIGH, WLOG_ERROR, 
			"document <%ld %ld %s> skipped.",
			doc_entry.start_character,
			doc_entry.end_character,
			foo.filename);
		return(true);
	      }
	  }
	      
	}
      }
    }	
  }
  /* until seed_words_used is supported */
  strcpy(*seed_words_used, wais_search->SeedWords);

  /* note that the serial search engine does not do relevance feedback.
     As such, fed back doc-id's are ignored.  In a real system, we might
     want to generate diagnostics if such an id was inappropriate for this
     database (of course the UI should intercept such requests in the first
     place - but...It has no way of knowing what a server can handle!)
     */

  parameters.max_hit_retrieved = wais_search->MaxDocumentsRetrieved;
  set_query_parameter(SET_MAX_RETRIEVED_MASK, &parameters);

  search_result = false;


#ifdef RELEVANCE_FEEDBACK
#define MAX_TEXT_SIZE 10000	/* Maximume size of relevant text */
  {
    WAISDocumentText *doctext, *getData(), *getDocumentText();
    DocObj** docs = NULL;
    DocObj* doc = NULL;

    /* read the query */
    docs = wais_search->Docs;
    if(docs != NULL) {
      /* assemble the elements and construct a response */
      for (i = 0, doc = docs[i]; doc != NULL; doc = docs[++i])
	{
	  if(doc->Type == NULL ||
	     strcmp(doc->Type,"TEXT") == 0 ||
	     doc->Type[0] == 0) {

	    long errorCode;
	    doctext = NULL;

	    if (doc->ChunkCode == CT_line)
	      doctext = getDocumentText(doc, dbName, &errorCode);
	    else if ((doc->ChunkCode == CT_byte) ||
		     (doc->ChunkCode == CT_document))
	      doctext = getData(doc, dbName, &errorCode);
	    if (doctext != NULL) {
	      if(doctext->DocumentText->size > MAX_TEXT_SIZE)
		doctext->DocumentText->bytes[MAX_TEXT_SIZE] = 0;
	      search_result |= 
		search_for_words(doctext->DocumentText->bytes, db, 1);
	      freeWAISDocumentText(doctext);
	    }
	  }
	}
    }
  }
#endif				/* RELEVANT_FEEDBACK */

  search_result |= search_for_words(wais_search->SeedWords, db, 0);

  if (search_result == true)
    {				/* the search went ok */
      hit best_hit;
      originName = dbName;

      finished_search_word(db);
      for (i = 0; i < wais_search->MaxDocumentsRetrieved; i++){ 
	if(0 != next_best_hit(&best_hit, db))
	  break;		/* out of hits */
	if(i == 0)
	  maxRawScore = best_hit.weight;
	if (best_hit.weight > 0){
	  long lines,length;
	  DocID* theDocID = NULL;
	  char local_id[MAX_FILENAME_LEN + 60]; /* filename, start, end */
	  local_id[0] = '\0';

	  if (calcDocLength(&(best_hit),&lines,&length))
	    {			/* this document is good, return it */
	      char** type = NULL;
	      normalScore = (long)floor(
					(((double)best_hit.weight) /
					 ((double)maxRawScore)) *	
					(MAX_NORMAL_SCORE + 1));
	      if (normalScore > MAX_NORMAL_SCORE)
		normalScore = MAX_NORMAL_SCORE;

	      sprintf(local_id, "%ld %ld %s", 
		      best_hit.start_character,
		      best_hit.end_character,
		      best_hit.filename);
         
	      if (waisProtocolVersion >= '2')
		{ type = (char**)s_malloc((size_t)(sizeof(char*) * 2));
		  type[0] = s_strdup(best_hit.type);
		  type[1] = NULL;
		}
	      else
		type = NULL;
	      /*
		printf("header %ld out of %ld\n", *headerNum, 
		wais_search->MaxDocumentsRetrieved); 
		*/
	      theDocID = makeDocID();
	      theDocID->originalDatabase = stringToAny(originName);
	      theDocID->originalLocalID = stringToAny(local_id);
	      headers[(*headerNum)++] = 
		makeWAISDocumentHeader(anyFromDocID(theDocID),
				       UNUSED,
				       (long)normalScore,
				       UNUSED,length,lines,
				       type,
				       s_strdup(originName),
				       s_strdup(best_hit.date),
				       s_strdup(best_hit.
						headline),
				       NULL);
	      headers[*headerNum] = NULL;
	      freeDocID(theDocID);
	    }
	  else
	    { 
	      waislog(WLOG_HIGH, WLOG_ERROR, 
		      "document <%ld %ld %s> skipped.",
		      best_hit.start_character,
		      best_hit.end_character,
		      best_hit.filename);
	      return(true);
	    }
	}
      }
    }
  else
    {				/* something went awry in the search */
      diag = makeDiag(true,D_PermanentSystemError,
		      "Serious error in server");
      *diags = (diagnosticRecord**)s_realloc(*diags,(size_t)(sizeof(diagnosticRecord*) * 2));
      (*diags)[0] = diag;
      (*diags)[1] = NULL;
    }
  finished_best_hit();
  /* free everything */
  closeDatabase(db);
  return(true);
}
