/* 
   Copyright (C) 1990 C van Reewijk, email: dutentb.uucp!reeuwijk

This file is part of GLASS.

GLASS is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 1, or (at your option)
any later version.

GLASS is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GLASS; see the file COPYING.  If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */

/* File: refex.c
 *
 * refex - File regular expression pattern matching
 *         and replacement
 *
 * By:  Ozan S. Yigit (oz)
 *      Dept. of Computer Science
 *      York University
 *
 * Munged by C. van Reeuwijk to do shell style pattern matching.
 * {} shell patterns are not handled.
 *
 * These routines are the PUBLIC DOMAIN equivalents
 * of refex routines as found in 4.nBSD UN*X, with minor
 * extensions.
 *
 * These routines are derived from various implementations
 * found in software tools books, and Conroy's grep. They
 * are NOT derived from licensed/restricted software.
 * For more interesting/academic/complicated implementations,
 * see Henry Spencer's refexp routines, or GNU Emacs pattern
 * matching module.
 *
 * Routines:
 *      ref_comp:        compile a regular expression into a DFA.
 *
 *			char *ref_comp(s)
 *			char *s;
 *
 *      ref_exec:       execute the DFA to match a pattern.
 *
 *			int ref_exec(s)
 *			char *s;
 *
 *      ref_subs:	substitute the matched portions in
 *              	a new string.
 *
 *			int ref_subs(src, dst)
 *			char *src;
 *			char *dst;
 *
 * Regular Expressions:
 *
 *      [1]     char    matches itself, unless it is a special
 *                      character (metachar): . \ [ ] * + ^ $
 *
 *      [2]     ?       matches any character.
 *
 *      [3]     \       matches the character following it, except
 *			when followed by a digit 1 to 9.
 *			(see [7], [8] and [9])
 *			It is used as an escape character for all
 *			other meta-characters, and itself. When used
 *			in a set ([4]), it is treated as an ordinary
 *			character.
 *
 *      [4]     [set]   matches one of the characters in the set.
 *                      If the first character in the set is "^",
 *                      it matches a character NOT in the set. A
 *                      shorthand S-E is used to specify a set of
 *                      characters S upto E, inclusive. The special
 *                      characters "]" and "-" have no special
 *                      meaning if they appear as the first chars
 *                      in the set.
 *                      examples:        match:
 *
 *                              [a-z]    any lowercase alpha
 *
 *                              [^]-]    any char except ] and -
 *
 *                              [^A-Z]   any char except uppercase
 *                                       alpha
 *
 *                              [a-zA-Z] any alpha
 *
 *      [5]     *       matches the longest possible span of zero or more
 *                      arbitrary characters.
 *
 *      [6]             a regular expression in the form [1] to [10], enclosed
 *                      as (form) matches what form matches. The enclosure
 *                      creates a set of tags, used for pattern substution.
 *                      The tagged forms are numbered starting from 1.
 *
 *      [7]             a composite regular expression xy where x and y
 *                      are in the form [1] to [7] matches x followed by
 *                      a match for y.
 *
 * Acknowledgements:
 *
 *	HCR's Hugh Redelmeier has been most helpful in various
 *	stages of development. He convinced me to include BOW
 *	and EOW constructs, originally invented by Rob Pike at
 *	the University of Toronto.
 *
 * References:
 *              Software tools			Kernighan & Plauger
 *              Software tools in Pascal        Kernighan & Plauger
 *              Grep [rsx-11 C dist]            David Conroy
 *		ed - text editor		Un*x Programmer's Manual
 *		Advanced editing on Un*x	B. W. Kernighan
 *		RegExp routines			Henry Spencer
 *
 * Notes:
 *
 *	This implementation uses a bit-set representation for character
 *	classes for speed and compactness. Each character is represented
 *	by one bit in a 128-bit block. Thus, CCL or NCL always takes a
 *	constant 16 bytes in the internal dfa, and ref_exec does a single
 *	bit comparison to locate the character in the set.
 *
 * Examples:
 *
 *	pattern:	fo*
 *	compile:	CHR f CHR o star END
 *	matches:	fo foo fooo foobar fobar foxx ...
 *
 *	pattern:	fo[ob]a[rz]
 *	compile:	CHR f CHR o CCL bitset CHR a CCL bitset END
 *	matches:	fobar fooar fobaz fooaz
 *
 *	pattern:	(foo)[1-3]
 *	compile:	BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset END
 *	matches:	foo1 foo2 foo3
 *
 *	pattern:	(fo*)-
 *	compile:	BOT 1 CHR f CHR o STAR EOT 1 CHR - END
 *	matches:	foo- fo- fob- foobar-
 *
 */

#include <stdio.h>
#include "refex.h"

#define MAXDFA  1024
#define MAXTAG  10

#define OKP     1
#define NOP     0

#define CHR     1
#define ANY     2
#define CCL     3
#define NCL     4
#define BOT     7
#define EOT     8
#define STAR   13

#define END     0

/*
 * The following defines are not meant
 * to be changeable. They are for readibility
 * only.
 *
 */
#define MAXCHR	128
#define CHRBIT	8
#define BITBLK	MAXCHR/CHRBIT
#define BLKIND	0170
#define BITIND	07

#define ASCIIB	0177

typedef /*unsigned*/ char CHAR;

static int  tagstk[MAXTAG];             /* subpat tag stack..*/
static CHAR dfa[MAXDFA];		/* automaton..       */
static int  sta = NOP;               	/* status of lastpat */

static CHAR bittab[BITBLK];		/* bit table for CCL */

static void chset( c )
 register CHAR c;
{
    bittab[((c)&BLKIND)>>3] |= 1<<((c)&BITIND);
}

#define badpat(x)	return(*dfa = END, x)
#define store(x)	*mp++ = x

char *ref_comp(pat)
char *pat;
{
    register char *p;               /* pattern pointer   */
    register CHAR *mp=dfa;          /* dfa pointer       */

    register int tagi = 0;          /* tag stack index   */
    register int tagc = 1;          /* actual tag count  */

    register int n;
    int c1, c2;

    if (!pat || !*pat)
	if (sta)
	    return( 0 );
	else
	    badpat("No previous regular expression");
    sta = NOP;

    for (p = pat; *p != '\0'; p++) {
	switch(*p){
	    case '?':               /* match any char..  */
		store(ANY);
		break;

	    case '[':               /* match char class..*/
		if (*++p == '^') {
		    store(NCL);
		    p++;
		}
		else
		    store(CCL);

		if (*p == '-')		/* real dash */
		    chset(*p++);
		if (*p == ']')		/* real brac */
		    chset(*p++);
		while (*p && *p != ']') {
		    if (*p == '-' && *(p+1) && *(p+1) != ']') {
			p++;
			c1 = *(p-2) + 1;
			c2 = *p++;
			while (c1 <= c2)
			    chset(c1++);
		    }
		    else
			chset(*p++);
		}
		if (!*p)
		    badpat("Missing ]");

		for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
		    store(bittab[n]);

		break;

	    case '*':               /* match 0 or more.. */
		store(STAR);
		break;

	    case '(':
		if( tagc>=MAXTAG ){
		    badpat( "Too many () pairs" );
		}
		tagstk[++tagi] = tagc;
		store(BOT);
		store(tagc++);
		break;
	    
	    case ')':
		if( tagi<=0 ){
		    badpat( "Unmatched )" );
		}
		store( EOT );
		store( tagstk[tagi--] );
		break;

	    case '\\':              /* tags, backrefs .. */
		p++;
		store(CHR);
		store(*p);
		break;

	    default :               /* an ordinary char  */
		store(CHR);
		store(*p);
		break;
	}
    }
    if (tagi > 0)
	    badpat("Unmatched \\(");
    store(END);
    sta = OKP;
    return( 0 );
}

static char *bopat[MAXTAG];
static char *eopat[MAXTAG];
static char *pmatch();

/*
 * ref_exec:
 * 	execute dfa to match the string.
 *
 *	If the pattern matches, bopat[0] and eopat[0] are set
 *	to the beginning and the end of the matched fragment,
 *	respectively.
 *
 * return 1 if the compiled pattern matches the given string,
 * or else 0.
 */

int ref_exec(lp)
 register char *lp;
{
    register char *ep = 0;
    register CHAR *ap = dfa;

    bopat[0] = 0;
    bopat[1] = 0;
    bopat[2] = 0;
    bopat[3] = 0;
    bopat[4] = 0;
    bopat[5] = 0;
    bopat[6] = 0;
    bopat[7] = 0;
    bopat[8] = 0;
    bopat[9] = 0;

    if( *ap == END ) return( 0 );	/* munged automaton. fail always */
    ep = pmatch( lp, ap );
    if (!ep)
	return( 0 );
    bopat[0] = lp;
    eopat[0] = ep;
    return(1);
}

/*
 * pmatch:
 *	internal routine for the hard part
 *
 * 	This code is mostly snarfed from an early
 * 	grep written by David Conroy. The backref and
 * 	tag stuff, and various other mods are by oZ.
 *
 *	At the end of a successful match, bopat[n] and eopat[n]
 *	are set to the beginning and end of subpatterns matched
 *	by tagged expressions (n = 1 to 9).
 *
 */

#define isinset(x,y) 	((x)[((y)&BLKIND)>>3] & (1<<((y)&BITIND)))

static char *pmatch( lp, ap )
register char *lp;
register CHAR *ap;
{
    register char *e;		/* extra pointer for CLO */
    register int op;
    register int c;
    char *are;			/* to save the line ptr. */

    for(;;){
	op = *ap++;
	if( op == END ) break;
	switch(op) {
	    case CHR:
		if (*lp++ != *ap++)
		    return( 0 );
		break;

	    case ANY:
		if (*lp++ == '\0' )
		    return( 0 );
		break;

	    case CCL:
		c = *lp++;
		if (!isinset(ap,c))
		    return( 0 );
		ap += BITBLK;
		break;

	    case NCL:
		c = *lp++;
		if (isinset(ap,c))
		    return( 0 );
		ap += BITBLK;
		break;

	    case BOT:
		bopat[*ap++] = lp;
		break;

	    case EOT:
		eopat[*ap++] = lp;
		break;

	    case STAR:
		are = lp;
		while (*lp) lp++;
		while( lp >= are ){
		    if( e = pmatch( lp, ap) ) return( e );
		    --lp;
		}
		return( 0 );

	    default:
		fprintf( stderr, "ref_exec: bad dfa opcode: %d.\n", op );
		exit( 2 );
	}
    }
    if( *lp != '\0' ) return( 0 );
    return( lp );
}

/* Substitute the matched portions of the 'src' in 'dst'.
 * The following special characters are recognized in 'src':
 *
 * &:
 *	Substitute the entire matched pattern.
 *
 * \digit:
 *	Substitute a subpattern, with the given tag number. Tags are
 *	numbered from 1 to 9. If the particular tagged subpattern
 * 	does not exist, the empty string is substituted.
 */
void ref_subs( src, dst )
 register char *src;
 register char *dst;
{
    register char c;
    register int pin;
    register char *bp;
    register char *ep;

    if( !*src || !bopat[0] )
	return;

    while( c = *src++ ){
	switch(c) {
	    case '&':
		pin = 0;
		break;

	    case '\\':
		c = *src++;
		if (c >= '0' && c <= '9') {
		    pin = c - '0';
		    break;
		}

	    default:
		*dst++ = c;
		continue;
	    }
	    bp = bopat[pin];
	    ep = eopat[pin];
	    if( bp && ep ){
		while( *bp != '\0' && bp < ep )
		    *dst++ = *bp++;
	    }
    }
    *dst = '\0';
}
