/* 
 * tclRegexp.c --
 *
 *	This file contains the public interfaces to the Tcl regular
 *	expression mechanism.
 *
 * Copyright (c) 1998 by Sun Microsystems, Inc.
 *
 * See the file "license.terms" for information on usage and redistribution
 * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
 *
 * SCCS: @(#) tclRegexp.c 1.10 98/01/22 14:46:25
 */

#include "tclInt.h"
#include "tclPort.h"
#include "tclRegexp.h"

/*
 *----------------------------------------------------------------------
 * The routines in this file use Henry Spencer's regular expression
 * package contained in the following additional source files:
 *
 *	chr.h		tclRegexp.h	lex.c
 *	guts.h		color.c		locale.c
 *	wchar.h		compile.c	nfa.c
 *	wctype.h	exec.c
 *
 *	Copyright (c) 1986 by University of Toronto.
 *	Written by Henry Spencer.  Not derived from licensed software.
 *
 *	Permission is granted to anyone to use this software for any
 *	purpose on any computer system, and to redistribute it freely,
 *	subject to the following restrictions:
 *
 *	1. The author is not responsible for the consequences of use of
 *		this software, no matter how awful, even if they arise
 *		from defects in it.
 *
 *	2. The origin of this software must not be misrepresented, either
 *		by explicit claim or by omission.
 *
 *	3. Altered versions must be plainly marked as such, and must not
 *		be misrepresented as being the original software.
 *
 * Beware that some of this code is subtly aware of the way operator
 * precedence is structured in regular expressions.  Serious changes in
 * regular-expression syntax might require a total rethink.
 *
 * *** NOTE: this code has been altered slightly for use in Tcl: ***
 * *** 1. Use ckalloc, ckfree, and ckrealloc instead of malloc,	 ***
 * ***    free, and realloc.					 ***
 * *** 2. Add extra argument to regexp to specify the real	 ***
 * ***    start of the string separately from the start of the	 ***
 * ***    current search. This is needed to search for multiple	 ***
 * ***    matches within a string.				 ***
 * *** 3. Names have been changed, e.g. from re_comp to		 ***
 * ***    TclRegComp, to avoid clashes with other 		 ***
 * ***    regexp implementations used by applications. 		 ***
 * *** 4. Various lint-like things, such as casting arguments	 ***
 * ***	  in procedure calls, removing debugging code and	 ***
 * ***	  unused vars and procs.	 	 	 	 ***
 * *** 5. Removed "backward-compatibility kludges" such as	 ***
 * ***	  REG_PEND and REG_STARTEND flags, the re_endp field in	 ***
 * ***	  the regex_t struct, and the fronts.c layer.		 ***
 * *** 6. Changed char to Tcl_UniChar.				 ***
 * *** 7. Removed -DPOSIX_MISTAKE compile-time flag.		 ***
 * ***	  This is now the default.				 ***
 */

/*
 * Declarations for functions used only in this file.
 */

static void		DupRegexpInternalRep _ANSI_ARGS_((Tcl_Obj *srcPtr,
			    Tcl_Obj *copyPtr));
static void		FreeRegexpInternalRep _ANSI_ARGS_((Tcl_Obj *regexpPtr));
static int		SetRegexpFromAny _ANSI_ARGS_((Tcl_Interp *interp,
			    Tcl_Obj *objPtr));
static int		SetRegexpFromAnyFlags _ANSI_ARGS_((Tcl_Interp *interp,
			    Tcl_Obj *objPtr, int flags));

/*
 * The regular expression Tcl object type.  This serves as a cache
 * of the compiled form of the regular expression.
 */

Tcl_ObjType tclRegexpType = {
    "regexp",				/* name */
    FreeRegexpInternalRep,		/* freeIntRepProc */
    DupRegexpInternalRep,		/* dupIntRepProc */
    NULL,				/* updateStringProc */
    SetRegexpFromAny			/* setFromAnyProc */
};


/*
 *----------------------------------------------------------------------
 *
 * Tcl_RegExpCompile --
 *
 *	Compile a regular expression into a form suitable for fast
 *	matching.  This procedure is DEPRECATED in favor of the
 *	object version of the command.
 *
 * Results:
 *	The return value is a pointer to the compiled form of string,
 *	suitable for passing to Tcl_RegExpExec.  This compiled form
 *	is only valid up until the next call to this procedure, so
 *	don't keep these around for a long time!  If an error occurred
 *	while compiling the pattern, then NULL is returned and an error
 *	message is left in the interp's result.
 *
 * Side effects:
 *	Updates the cache of compiled regexps.
 *
 *----------------------------------------------------------------------
 */

Tcl_RegExp
Tcl_RegExpCompile(interp, string)
    Tcl_Interp *interp;			/* For use in error reporting. */
    char *string;			/* String for which to produce
					 * compiled regular expression. */
{
    Interp *iPtr = (Interp *) interp;
    int i, length;
    TclRegexp *result;

    length = strlen(string);
    for (i = 0; i < NUM_REGEXPS; i++) {
	if ((length == iPtr->patLengths[i])
		&& (strcmp(string, iPtr->patterns[i]) == 0)) {
	    /*
	     * Move the matched pattern to the first slot in the
	     * cache and shift the other patterns down one position.
	     */

	    if (i != 0) {
		int j;
		char *cachedString;

		cachedString = iPtr->patterns[i];
		result = iPtr->regexps[i];
		for (j = i-1; j >= 0; j--) {
		    iPtr->patterns[j+1] = iPtr->patterns[j];
		    iPtr->patLengths[j+1] = iPtr->patLengths[j];
		    iPtr->regexps[j+1] = iPtr->regexps[j];
		}
		iPtr->patterns[0] = cachedString;
		iPtr->patLengths[0] = length;
		iPtr->regexps[0] = result;
	    }
	    return (Tcl_RegExp) iPtr->regexps[0];
	}
    }

    /*
     * No match in the cache.  Compile the string and add it to the
     * cache.
     */

    result = TclRegComp(interp, string);
    if (!result) {
	return NULL;
    }

    /*
     * We successfully compiled the expression, so add it to the cache.
     */

    if (iPtr->patterns[NUM_REGEXPS-1] != NULL) {
	ckfree(iPtr->patterns[NUM_REGEXPS-1]);
	regfree(&(iPtr->regexps[NUM_REGEXPS-1]->re));
	ckfree((char *) iPtr->regexps[NUM_REGEXPS-1]);
    }
    for (i = NUM_REGEXPS - 2; i >= 0; i--) {
	iPtr->patterns[i+1] = iPtr->patterns[i];
	iPtr->patLengths[i+1] = iPtr->patLengths[i];
	iPtr->regexps[i+1] = iPtr->regexps[i];
    }
    iPtr->patterns[0] = (char *) ckalloc((unsigned) (length+1));
    strcpy(iPtr->patterns[0], string);
    iPtr->patLengths[0] = length;
    iPtr->regexps[0] = result;
    return (Tcl_RegExp) result;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_RegExpExec --
 *
 *	Execute the regular expression matcher using a compiled form
 *	of a regular expression and save information about any match
 *	that is found.
 *
 * Results:
 *	If an error occurs during the matching operation then -1
 *	is returned and the interp's result contains an error message.
 *	Otherwise the return value is 1 if a matching range is
 *	found and 0 if there is no matching range.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

int
Tcl_RegExpExec(interp, re, string, start)
    Tcl_Interp *interp;		/* Interpreter to use for error reporting. */
    Tcl_RegExp re;		/* Compiled regular expression;  must have
				 * been returned by previous call to
				 * Tcl_RegExpCompile or TclRegCompObj. */
    CONST char *string;		/* String against which to match re. */
    CONST char *start;		/* If string is part of a larger string,
				 * this identifies beginning of larger
				 * string, so that "^" won't match. */
{
    int status, flags, result;
    size_t len;			/* Number of Unicode characters. */
    unsigned i;
    Tcl_DString stringBuf;
    Tcl_Encoding encoding;
    Tcl_UniChar *uniString;
#define NSUBEXP 20
    regmatch_t matches_small[NSUBEXP];
    regmatch_t *matches;
    TclRegexp *regexpPtr = (TclRegexp *) re;

    /*
     * If string offset is not 0, use not-beginning-of-line flag.
     */
    
    if (start == string) {
	flags = 0;
    } else {
	flags = REG_NOTBOL;
    }

    /*
     * Translate string from UTF to UniChar.
     */
    
    encoding = Tcl_GetEncoding(NULL, "unicode");
    uniString = (Tcl_UniChar *) Tcl_UtfToExternalDString(encoding,
	    string, -1, &stringBuf);
    Tcl_FreeEncoding(encoding);
    len = Tcl_DStringLength(&stringBuf) / sizeof(Tcl_UniChar);

    /*
     * Perform the regexp match.
     */

    if ((regexpPtr->re.re_nsub + 1) < NSUBEXP) {
	matches = matches_small;
    } else {
	matches = (regmatch_t *) ckalloc(sizeof(regmatch_t)
		* (regexpPtr->re.re_nsub + 1));
    }
    status = re_uexec(&regexpPtr->re, uniString, len,
	    regexpPtr->re.re_nsub + 1, matches, flags);
    Tcl_DStringFree(&stringBuf);

    /*
     * Check for errors.
     */

    if (status != REG_OKAY) {
	if (status == REG_NOMATCH) {
	    result = 0;
	    goto done;
	}
	if (interp) {
	    TclRegError(interp, "error while matching regular expression: ",
		    status);
	}
	result = -1;
	goto done;
    }

    /*
     * Compute the offsets in the UTF string based on the corresponding
     * locations in the Unicode string.
     */

    for (i = 0; i <= regexpPtr->re.re_nsub; i++) { 
	if (matches[i].rm_so == -1) { 
	    regexpPtr->matches[i].startp = NULL; 
	    regexpPtr->matches[i].endp = NULL; 
	} else { 
	    regexpPtr->matches[i].startp = Tcl_UtfAtIndex(string,
		    matches[i].rm_so); 
	    regexpPtr->matches[i].endp = Tcl_UtfAtIndex(string,
		    matches[i].rm_eo); 
	} 
    } 
    result = 1;

    done:
    if (matches != matches_small) {
	ckfree((char*)matches);
    }
    return result;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_RegExpRange --
 *
 *	Returns pointers describing the range of a regular expression match,
 *	or one of the subranges within the match.
 *
 * Results:
 *	The variables at *startPtr and *endPtr are modified to hold the
 *	addresses of the endpoints of the range given by index.  If the
 *	specified range doesn't exist then NULLs are returned.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

void
Tcl_RegExpRange(re, index, startPtr, endPtr)
    Tcl_RegExp re;		/* Compiled regular expression that has
				 * been passed to Tcl_RegExpExec. */
    int index;			/* 0 means give the range of the entire
				 * match, > 0 means give the range of
				 * a matching subrange. */
    char **startPtr;		/* Store address of first character in
				 * (sub-) range here. */
    char **endPtr;		/* Store address of character just after last
				 * in (sub-) range here. */
{
    TclRegexp *regexpPtr = (TclRegexp *) re;

    if ((size_t) index > regexpPtr->re.re_nsub) {
	*startPtr = *endPtr = NULL;
    } else {
	*startPtr = (char *) regexpPtr->matches[index].startp;
	*endPtr = (char *) regexpPtr->matches[index].endp;
    }
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_RegExpMatch --
 *
 *	See if a string matches a regular expression.
 *
 * Results:
 *	If an error occurs during the matching operation then -1
 *	is returned and the interp's result contains an error message.
 *	Otherwise the return value is 1 if "string" matches "pattern"
 *	and 0 otherwise.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

int
Tcl_RegExpMatch(interp, string, pattern)
    Tcl_Interp *interp;		/* Used for error reporting. */
    char *string;		/* String. */
    char *pattern;		/* Regular expression to match against
				 * string. */
{
    Tcl_RegExp re;

    re = Tcl_RegExpCompile(interp, pattern);
    if (re == NULL) {
	return -1;
    }
    return Tcl_RegExpExec(interp, re, string, string);
}

/*
 *----------------------------------------------------------------------
 *
 * TclRegExpMatchObj --
 *
 *	See if a string matches a regular expression pattern object.
 *
 * Results:
 *	If an error occurs during the matching operation then -1
 *	is returned and the interp's result contains an error message.
 *	Otherwise the return value is 1 if "string" matches "pattern"
 *	and 0 otherwise.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

int
TclRegExpMatchObj(interp, string, patObj)
    Tcl_Interp *interp;		/* Used for error reporting. */
    char *string;		/* String. */
    Tcl_Obj *patObj;		/* Regular expression to match against
				 * string. */
{
    Tcl_RegExp re;

    re = TclRegCompObj(interp, patObj, REG_ADVANCED);
    if (re == NULL) {
	return -1;
    }
    return Tcl_RegExpExec(interp, re, string, string);
}

/*
 *----------------------------------------------------------------------
 *
 * TclRegCompObj --
 *
 *	Compile a regular expression into a form suitable for fast
 *	matching.  This procedure caches the result in a Tcl_Obj.
 *
 * Results:
 *	The return value is a pointer to the compiled form of string,
 *	suitable for passing to Tcl_RegExpExec.  If an error occurred
 *	while compiling the pattern, then NULL is returned and an error
 *	message is left in the interp's result.
 *
 * Side effects:
 *	Updates the native rep of the Tcl_Obj.
 *
 *----------------------------------------------------------------------
 */

Tcl_RegExp
TclRegCompObj(interp, patObj, flags)
    Tcl_Interp *interp;			/* For use in error reporting. */
    Tcl_Obj *patObj;			/* String for which to produce
					 * compiled regular expression. */
    int flags;				/* Regular expression flags. */
{
    int status;
    TclRegexp *regexpRepPtr = (TclRegexp *) patObj->internalRep.otherValuePtr;

    if ((patObj->typePtr != &tclRegexpType) || (regexpRepPtr->flags != flags)) {
	status = SetRegexpFromAnyFlags(interp, patObj, flags);
	if (status != TCL_OK) {
	    return NULL;
	}
	regexpRepPtr = (TclRegexp *) patObj->internalRep.otherValuePtr;
    }
    return (Tcl_RegExp)regexpRepPtr;
}

/*
 *----------------------------------------------------------------------
 *
 * TclRegComp --
 *
 *	Internal interface to compile a regular expression string.
 *
 * Results:
 *	Returns a newly allocated regexp, or NULL on failure, leaving
 *	an error message in the interp, if possible.
 *
 * Side effects:
 *	This leaks the memory for the compiled regular expression -
 *	use only for things that live forever.
 *
 *----------------------------------------------------------------------
 */

TclRegexp *
TclRegComp(interp, string)
    Tcl_Interp *interp;		/* Interp for error reporting. */
    CONST char *string;		/* String to compile. */
{
    Tcl_Obj *stringObj = Tcl_NewStringObj(string, -1);
    return (TclRegexp *)TclRegCompObj(interp, stringObj, 0);
}

/*
 *----------------------------------------------------------------------
 *
 * TclRegError --
 *
 *	Generate an error message based on the regexp status code.
 *
 * Results:
 *	Places an error in the interpreter.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

void
TclRegError(interp, msg, status)
    Tcl_Interp *interp;		/* Interpreter for error reporting. */
    char *msg;			/* Message to prepend to error. */
    int status;			/* Status code to report. */
{
    char *errMsg;

    switch(status) {
	case REG_BADPAT:
	    errMsg = "invalid regular expression";
	    break;
	case REG_ECOLLATE:
	    errMsg = "invalid collating element";
	    break;
	case REG_ECTYPE:
	    errMsg = "invalid character class";
	    break;
	case REG_EESCAPE:
	    errMsg = "invalid escape sequence";
	    break;
	case REG_ESUBREG:
	    errMsg = "invalid backreference number";
	    break;
	case REG_EBRACK:
	    errMsg = "unmatched []";
	    break;
	case REG_EPAREN:
	    errMsg = "unmatched ()";
	    break;
	case REG_EBRACE:
	    errMsg = "unmatched {}";
	    break;
	case REG_BADBR:
	    errMsg = "invalid repetition count(s)";
	    break;
	case REG_ERANGE:
	    errMsg = "invalid character range";
	    break;
	case REG_ESPACE:
	    errMsg = "out of memory";
	    break;
	case REG_BADRPT:
	    errMsg = "?+* follows nothing";
	    break;
	case REG_ASSERT:
	    errMsg = "\"can't happen\" -- you found a bug";
	    break;
	case REG_INVARG:
	    errMsg = "invalid argument to regex routine";
	    break;
	case REG_MIXED:
	    errMsg = "char RE applied to wchar_t string (etc.)";
	    break;
	case REG_BADOPT:
	    errMsg = "invalid embedded option";
	    break;
	case REG_IMPOSS:
	    errMsg = "can never match";
	    break;
	default:
	    errMsg = "\"can't happen\" -- you found an undefined error code";
	    break;
    }
    Tcl_ResetResult(interp);
    Tcl_AppendResult(interp, msg, errMsg, NULL);
}


/*
 *----------------------------------------------------------------------
 *
 * FreeRegexpInternalRep --
 *
 *	Deallocate the storage associated with a regexp object's internal
 *	representation.
 *
 * Results:
 *	None.
 *
 * Side effects:
 *	Frees the compiled regular expression.
 *
 *----------------------------------------------------------------------
 */

static void
FreeRegexpInternalRep(listPtr)
    Tcl_Obj *listPtr;		/* List object with internal rep to free. */
{
    register TclRegexp *regexpRepPtr = 
	(TclRegexp *) listPtr->internalRep.otherValuePtr;
    regfree(&regexpRepPtr->re);
    if (regexpRepPtr->matches) {
	ckfree((char *) regexpRepPtr->matches);
    }
    ckfree((char *) regexpRepPtr);
}

/*
 *----------------------------------------------------------------------
 *
 * DupRegexpInternalRep --
 *
 *	It is way to hairy to copy a regular expression, so we punt
 *	and revert the object back to a vanilla string. 
 *
 * Results:
 *	None.
 *
 * Side effects:
 *	Changes the type back to string.
 *
 *----------------------------------------------------------------------
 */

static void
DupRegexpInternalRep(srcPtr, copyPtr)
    Tcl_Obj *srcPtr;		/* Object with internal rep to copy. */
    Tcl_Obj *copyPtr;		/* Object with internal rep to set. */
{
    copyPtr->internalRep.longValue = (long)copyPtr->length;
    copyPtr->typePtr = &tclStringType;
}

/*
 *----------------------------------------------------------------------
 *
 * SetRegexpFromAny --
 *
 *	Attempt to generate a compiled regular expression for the Tcl object
 *	"objPtr".
 *
 * Results:
 *	The return value is TCL_OK or TCL_ERROR. If an error occurs during
 *	conversion, an error message is left in the interpreter's result
 *	unless "interp" is NULL.
 *
 * Side effects:
 *	If no error occurs, a regular expression is stored as "objPtr"s internal
 *	representation.
 *
 *----------------------------------------------------------------------
 */

static int
SetRegexpFromAny(interp, objPtr)
    Tcl_Interp *interp;		/* Used for error reporting if not NULL. */
    Tcl_Obj *objPtr;		/* The object to convert. */
{
    return SetRegexpFromAnyFlags(interp, objPtr, 0);
}

/*
 *----------------------------------------------------------------------
 *
 * SetRegexpFromAny --
 *
 *	Attempt to generate a compiled regular expression for the Tcl object
 *	"objPtr".  This can take a REG_ICASE flag to ignore case in string.
 *
 * Results:
 *	The return value is TCL_OK or TCL_ERROR. If an error occurs during
 *	conversion, an error message is left in the interpreter's result
 *	unless "interp" is NULL.
 *
 * Side effects:
 *	If no error occurs, a regular expression is stored as "objPtr"s internal
 *	representation.
 *
 *----------------------------------------------------------------------
 */

static int
SetRegexpFromAnyFlags(interp, objPtr, flags)
    Tcl_Interp *interp;		/* Used for error reporting if not NULL. */
    Tcl_Obj *objPtr;		/* The object to convert. */
    int flags;			/* 0 or REG_ICASE. */
{
    Tcl_ObjType *oldTypePtr = objPtr->typePtr;
    char *string;
    TclRegexp *regexpRepPtr = (TclRegexp *) ckalloc(sizeof(TclRegexp));
    Tcl_Encoding encoding;
    Tcl_UniChar *uniString;
    int len;
    Tcl_DString stringBuf;
    int status;

    /*
     * Get the up-to-date string representation and map to unicode.
     */

    string = Tcl_GetString(objPtr);
    encoding = Tcl_GetEncoding(NULL, "unicode");
    uniString = (Tcl_UniChar *) Tcl_UtfToExternalDString(encoding, string,
	    -1, &stringBuf);
    len = Tcl_DStringLength(&stringBuf) / sizeof(Tcl_UniChar);

    /*
     * Compile the string and check for errors.
     */

    regexpRepPtr->flags = flags;
    status = re_ucomp(&regexpRepPtr->re, uniString, (size_t) len,
	    REG_ADVANCED|flags);
    Tcl_DStringFree(&stringBuf);
    if (status != REG_OKAY) {
	/*
	 * Warning, the following is a hack to allow empty regexp.
	 * The goal is to compile a non-empty regexp that will always
	 * find one empty match.  If you use "(?:)" (an empty pair of
	 * non-capturing parentheses) instead, that will avoid both the
	 * overhead and the subexpression report.
	 */
	
	if (status == REG_EMPTY) {
	    uniString =  (Tcl_UniChar *) Tcl_UtfToExternalDString(encoding,
		    "(?:)", -1, &stringBuf);
	    len = Tcl_DStringLength(&stringBuf) / sizeof(Tcl_UniChar);

	    status = re_ucomp(&regexpRepPtr->re, uniString, (size_t) len,
		    REG_ADVANCED);
	    Tcl_DStringFree(&stringBuf);
	}

	/*
	 * Clean up and report errors in the interpreter, if possible.
	 */

	if (status != REG_OKAY) {
	    Tcl_FreeEncoding(encoding);
	    regfree(&regexpRepPtr->re);
	    ckfree((char *)regexpRepPtr);
	    if (interp) {
		TclRegError(interp,
			"couldn't compile regular expression pattern: ",
			status);
	    }
	    return TCL_ERROR;
	}
    }
    Tcl_FreeEncoding(encoding);

    /*
     * Allocate enough space for all of the subexpressions, plus one
     * extra for the entire pattern.
     */

    regexpRepPtr->matches = (struct matches *) ckalloc(
	sizeof(struct matches) * (regexpRepPtr->re.re_nsub + 1));

    /*
     * Free the old representation and set our type.
     */

    if ((oldTypePtr != NULL) && (oldTypePtr->freeIntRepProc != NULL)) {
	oldTypePtr->freeIntRepProc(objPtr);
    }
    objPtr->internalRep.otherValuePtr = (VOID *) regexpRepPtr;
    objPtr->typePtr = &tclRegexpType;
    return TCL_OK;
}

