#!/bin/sh
#
# cache_check:
#
# Checkpoint the URLs saved in the Harvest cache,
# and optionally verify that cached is running, start it if necessary,
# and/or retrieve non-existent URLs from the network.
#
# This script takes the file $CACHE/log, and performs a set union with 
# the URLs contained in $URLFILE.  The reason for a set union is in
# case you run cached by hand; then you won't clobber your $URLFILE.
# (cached deletes the entire cache when it starts.)
#
# Options:
#    -c 	Check that cached is running, and start it if necessary.
#               You must have compiled cached with -DDAEMON.
#    -g         get, from the network, any URLs that are not present.
#    -v    	verbose
# 
# Options which may be passed through to $BATCH
#
#    -r 	reload all URLS in the cache.
#    -p port 	connect to cached at specified port
#
# The intent is to run this script from /etc/rc to start cached, 
# to run from cron frequently to checkpoint the cache state, 
# and to run the -g option nightly to synchronize the cache 
# after a system crash.
#
# Typical usage:
#    From /etc/rc:	# start the cached daemon after updating $URLFILE
#       if [ -x /usr/local/harvest/bin/cache_check ]; then
#           /usr/local/harvest/bin/cache_check -c
#       fi
#    From crontab every 1/2 hour: 	# update $URLFILE every 1/2 hour
#       0,30 * * * * exec /usr/local/harvest/bin/cache_check -v >>/usr/adm/cache_check.log
#    From crontab once per day at 3AM:	# update cache from network
#       15 3 * * * exec /usr/local/harvest/bin/cache_check -g
#
# Written by David A. Barrett <barrett@Asgard.cs.Colorado.EDU>
#
HARVEST=/usr/local/harvest
CACHE=/usr/local/spool/harvest/cache
LOGDIR=/usr/adm/harvest
#
# The cached binary, its base name (used below to find it in the ps
# listing), and any options to hand to it when it is started.
#
CACHED="${HARVEST}/bin/cached"
CBASE=`basename "${CACHED}"`
CACHED_OPTS=""
#
# client_batch may be obtained from <barrett@Asgard.cs.Colorado.EDU>
#
BATCH=client_batch
CLIENT="${HARVEST}/lib/cache/${BATCH}"
#
# Don't put $URLFILE in $CACHE; all files there are removed when cached starts!
#
URLFILE="${LOGDIR}/cached_urls"
# The scratch file name must contain $$ in case another instance is running.
TMP="${URLFILE}.$$"
#
# You have to edit the line below to fetch the pid of any running
# cached process.  Beware of path components, and the grep command itself!
# This is for SYSV machines, edit for BSD.
#
# The grep keeps ps lines that end in $CBASE; the sed reduces each such
# line to its leading number.
#
CACHED_PID=`ps -t'?' | grep '[0-9]  *[^ ]*'"${CBASE}"'$' | sed -e 's/^ *\([0-9]*\).*$/\1/'`
#
# Set the date argument below to show the date in the form: mm/dd/yy hh:mm:ss
#
TIMEARG="+%D %X"
#
# Set the above commands as appropriate for your system
#
#
# Parse our own options (-c, -g, -v).  Parsing stops at the first argument
# that is not one of them; that argument and everything after it are kept
# in $CLIENT_OPTS.
#
# The flags are cleared first so that a same-named variable inherited from
# the environment (e.g. an exported VERBOSE=1) cannot switch an option on.
#
CHECK=
GETNET=
VERBOSE=
while [ $# -ne 0 ]; do
   case "$1" in
      -c) CHECK=1 ;;
      -g) GETNET=1 ;;
      -v) VERBOSE=1 ;;
      *)  break ;;
   esac
   shift
done
ME=`basename "$0"`
#
# Join the remaining arguments with single spaces.  A plain assignment is
# used instead of `echo $*` because echo may swallow or reinterpret
# arguments such as -n, and would glob-expand them prematurely.
#
CLIENT_OPTS="$*"
#
# Must make sure to add any entries in the cache log to $URLFILE before running
# cached.  Cached will overwrite log with an empty file when it starts.
#
if [ ! -w "$URLFILE" ]; then
   touch "$URLFILE"
fi
# Create an empty cache log if there is none, so the union below has a
# file to read.
if [ ! -f "$CACHE/log" ]; then
   touch "$CACHE/log"
fi
#
# $CACHE/log contains lines of the form:
# FILE: /usr/local/spool/harvest/cache/41/641 URL: http://www.cern.ch/WebLinker/
# so the URL is the fourth space-separated field.
#
# Line counts are taken with "wc -l" reading stdin, and everything but the
# digits is stripped: wc commonly right-justifies its counts, so the first
# space-delimited field of "wc file" output would be empty.  A count that
# could not be obtained at all is treated as 0.
#
URLCOUNT=`wc -l < "$URLFILE" | tr -cd '0-9'`
[ -n "$URLCOUNT" ] || URLCOUNT=0
if cut -d' ' -f 4 < "$CACHE/log" | cat - "$URLFILE" | sort -u > "$TMP"; then
   if [ -s "$URLFILE" ]; then
      cp "$URLFILE" /tmp		# just in case something goes wrong
   fi
   if [ -s "$TMP" ]; then
      cp "$TMP" "$URLFILE"		# don't overwrite with zero length file!
   fi
   rm -f "$TMP"
   NURLCOUNT=`wc -l < "$URLFILE" | tr -cd '0-9'`
   [ -n "$NURLCOUNT" ] || NURLCOUNT=0
   NEWURLS=`expr "$NURLCOUNT" - "$URLCOUNT"`
   TIME=`date "$TIMEARG"`
   # Report only in verbose mode, and only when the line count changed.
   if [ -n "$VERBOSE" ] && [ "$NEWURLS" -ne 0 ]; then
      echo $ME: $TIME -- added $NEWURLS URLs to $URLFILE
   fi
else
   # if you could not update the $URLFILE file, DON'T run cached and destroy
   # our cache.  Thus, we exit immediately.
   echo $ME: could not create file $TMP
   rm -f "$TMP"
   exit 1
fi
#
# Now, try to start cached if requested, or if recharging the cache,
# and it's not already running.
#
if [ -n "$CHECK" ] || [ -n "$GETNET" ]; then
   # kill -0 delivers no signal; it only tests that the pid can be signalled.
   if [ -n "$CACHED_PID" ] && kill -0 "$CACHED_PID" 2>/dev/null; then
      if [ -n "$VERBOSE" ]; then
         echo $ME: $CBASE running as pid $CACHED_PID
      fi
   else
      # Presumes you compiled cached with -DDAEMON
      if [ -n "$VERBOSE" ]; then
         echo $CACHED $CACHED_OPTS
      fi
      # $CACHED_OPTS is deliberately unquoted so it splits into separate
      # arguments.  Exit only when the launch fails; on success fall through
      # so that a -g run can go on to reload the cache.
      if $CACHED $CACHED_OPTS; then
         :
      else
         echo $ME: command failed: $CACHED $CACHED_OPTS
         exit 1
      fi
   fi
fi
#
# With -g, hand $URLFILE to $CLIENT, provided $CLIENT is executable.
# Whatever command line arguments we did not consume ourselves
# ($CLIENT_OPTS) are passed along to it.  A failing $CLIENT ends the
# script with status 1.
#
if [ -n "$GETNET" ]; then
   if [ -x $CLIENT ]; then
      if [ -n "$VERBOSE" ]; then
         # Show the command line that is about to be run.
         echo $CLIENT $CLIENT_OPTS -n -l $LOGDIR/${BATCH}.log $URLFILE
      fi
      $CLIENT $CLIENT_OPTS -n -l $LOGDIR/${BATCH}.log $URLFILE || {
         echo $ME: command failed: $CLIENT $CLIENT_OPTS
         exit 1
      }
   fi
fi

