#charset "us-ascii"
/* 
 *  Copyright (c) 2005 by Kevin Forchione. All rights reserved.
 *   
 *  This file is part of the TADS 3 rextok library extension.
 *
 *  rextok.t
 *  version 2.0
 *
 *  This file defines the rexTokMatch(), rexTokSearch(),
 *  and rexTokGroup() functions. These functions do for
 *  a token list what rexMatch(), rexSearch(), and rexGroup()
 *  do for strings. 
 *
 *  For instance, for a token list consisting of the following:
 *  
 *      'the' 'red' 'ball' '.'
 *
 *  ret = rexTokSearch('(ball).', toks) returns [TokenBoundaryMatch, [3,2,['ball','.']]]
 *  
 *  rexTokGroup(1) returns [TokenBoundaryMatch, [3,1,['ball']]]
 *
 *--------------------------------------------------------------------
 *
 *  The rexTokXXXX functions come with a few caveats. 
 *
 *  The first is that the string built from the token list is 
 *  constructed from converted tokenizer token values, not 
 *  original values. 
 *
 *  The second caveat is that the default construction of the
 *  search string separates each token value by a single white
 *  space, except for tokPunct token values, which are appended
 *  to the previous token value. 
 *
 *  The third is that it is possible to have a partial match on 
 *  a token value at either end of the token list used to build
 *  the search string. For instance, a search string of 'in' 
 *  would only partially match the token value of 'into'. The 
 *  first element of each function's return value indicates 
 *  what kind of match has occurred. 
 */

#include "rextok.h"

/*
 *  Tests whether the tokens of toks, starting at index idx, match the
 *  given regular expression at their leading edge.  pattern can be a
 *  string using regular expression syntax or a RexPattern object; toks
 *  can be a list or a Vector of tokens.
 *
 *  This does not search for a match; it only determines whether the
 *  string built from the tokens toks[idx], toks[idx+1], ... matches the
 *  expression at its start.  It can be used to match a subset of toks
 *  against the pattern without creating a separate subset collection.
 *
 *  idx is optional and defaults to 1 (the first token in the
 *  collection).  Values below 1 are treated as 1.
 *
 *  If the pattern matches, the function returns a list consisting of
 *  the following:
 *
 *      [1] the degree of the token match:
 *          a)  TokenBoundaryMatch - the match ends exactly on the last
 *              character of a token value.
 *          b)  TokenBoundaryErrorEnd - the match ends part-way through
 *              the last token value (for instance the pattern 'in'
 *              against the token value 'into').
 *          The match is always taken at the first character of the
 *          string, so this function never reports a partial match at
 *          the start of the first token.
 *      [2] the number of tokens, counted from idx, covered by the match
 *
 *  If there is no match, the function returns nil.
 *
 *  NOTE(review): a regular expression can match zero characters, in
 *  which case rexMatch() yields 0 rather than nil.  That value is not
 *  converted to a list here; it presumably fails the "if (ret)" test
 *  and is returned unchanged - confirm against callers.
 *
 *  As a side effect, a successful match stores the covered tokens in
 *  rexTokGlobal.grpTokVec for use by rexTokGroup(); otherwise that
 *  property is set to nil.
 *
 *  Throws RexTokError when pattern, toks or idx has the wrong type.
 *
 *  [The underlying rexMatch string is built by Tokenizer.stringize():
 *  each token value is separated by a single whitespace, except for
 *  tokPunct tokens, which are appended to the preceding token.]
 */
rexTokMatch(pattern, toks, [idx])
{
    local ret, strRet, str, tokVec;

    if (dataType(pattern) != TypeSString
        && !(dataType(pattern) == TypeObject && pattern.ofKind(RexPattern)))
        throw new RexTokError('string value required');

    if (dataType(toks) != TypeList 
        && !(dataType(toks) == TypeObject && toks.ofKind(Vector)))
        throw new RexTokError('list value required');

    /* idx arrives as a varargs list: unwrap it, defaulting to 1 */
    if (idx.length())
        idx = idx.car();
    else idx = 1;

    if (dataType(idx) != TypeInt)
        throw new RexTokError('invalid type for rexTok function argument');
    else if (idx < 1)
        idx = 1;

    /* 
     *  Build the string for toks[idx...].  Each tokVec entry is
     *  [token, start offset in str, token value length]; entry k
     *  describes toks[idx + k - 1].
     */
    strRet  = Tokenizer.stringize(toks, idx);
    str     = strRet[1];
    tokVec  = strRet[2];

    /* ret is the number of characters matched at the start of str */
    ret = rexMatch(pattern, str, 1);

    if (ret)
    {
        local endVec;
        local endIdx, match;

        /* offset of the last character of every token value */
        endVec = new Vector(tokVec.length());
        for (local i = 1; i <= tokVec.length(); ++i)
            endVec.append(tokVec[i][2] + tokVec[i][3] - 1);
        
        /*
         *  The last token covered by the match is the last one whose
         *  first character lies within the matched characters.  The
         *  comparison is inclusive so that a token beginning exactly on
         *  the final matched character (such as a one-character token)
         *  is counted, and so that a one-character match still selects
         *  the first token.
         */
        for (local i = tokVec.length(); i > 0; --i)
        {
            if (tokVec[i][2] <= ret)
            {
                endIdx = i;
                break;
            }
        }

        /* an exact match ends on the last character of some token */
        if (endVec.indexOf(ret))
            match = TokenBoundaryMatch;
        else match = TokenBoundaryErrorEnd;

        /* 
         *  Save the covered tokens for rexTokGroup().  The vector is
         *  filled to its final size before the tokens are copied in.
         */
        rexTokGlobal.grpTokVec  = new Vector(toks.length());
        rexTokGlobal.grpTokVec.fillValue([], 1, endIdx);
        rexTokGlobal.grpTokVec.copyFrom(toks, idx, 1, endIdx);

        ret = [match, endIdx];
    }
    else
        rexTokGlobal.grpTokVec = nil;

    return ret;
}

/*
 *  Searches for the regular expression pattern in the collection toks,
 *  starting at the token position idx.  pattern can be a string using
 *  regular expression syntax or a RexPattern object; toks can be a
 *  list or a Vector of tokens.
 *
 *  idx is optional and gives the starting token position for the
 *  search.  The first token is at index 1, which is the default; values
 *  below 1 are treated as 1.  The search begins at the first character
 *  of token idx, so idx can be used to find repeated instances of the
 *  pattern by skipping matches that start before that token.  If idx is
 *  greater than the number of tokens there is nothing to search and the
 *  function returns nil.
 *
 *  If the function finds a match, it returns a list with the
 *  following elements:
 *
 *      [1] the degree of the token match:
 *          a)  TokenBoundaryMatch - the match starts on the first
 *              character of a token value and ends on the last
 *              character of a token value.
 *          b)  TokenBoundaryErrorStart - the first token value was only
 *              partially matched by the pattern.
 *          c)  TokenBoundaryErrorEnd - the last token value was only
 *              partially matched by the pattern.
 *          d)  TokenBoundaryErrorBoth - the first and last token values
 *              were only partially matched by the pattern.
 *      [2] a sublist consisting of 3 elements:
 *          a)  the index within toks of the first token of the
 *              matching subset (the first token in toks is at
 *              index 1);
 *          b)  the number of tokens in the matching subset
 *          c)  a list giving the matching subset
 *
 *  If there is no match, the function returns nil.
 *
 *  As a side effect, a successful search stores the tokens from the
 *  start of toks through the last matched token in
 *  rexTokGlobal.grpTokVec for use by rexTokGroup(); otherwise that
 *  property is set to nil.
 *
 *  Throws RexTokError when pattern, toks or idx has the wrong type.
 *
 *  [The underlying rexSearch string is built by Tokenizer.stringize():
 *  each token value is separated by a single whitespace, except for
 *  tokPunct tokens, which are appended to the preceding token.]
 */
rexTokSearch(pattern, toks, [idx])
{
    local ret, strRet, str, tokVec;
    
    if (dataType(pattern) != TypeSString
        && !(dataType(pattern) == TypeObject && pattern.ofKind(RexPattern)))
        throw new RexTokError('string value required');

    if (dataType(toks) != TypeList 
        && !(dataType(toks) == TypeObject && toks.ofKind(Vector)))
        throw new RexTokError('list value required');

    /* idx arrives as a varargs list: unwrap it, defaulting to 1 */
    if (idx.length())
        idx = idx.car();
    else idx = 1;
    
    if (dataType(idx) != TypeInt)
        throw new RexTokError('invalid type for rexTok function argument');
    else if (idx < 1)
        idx = 1;

    /*
     *  Always build the string from the first token, so that character
     *  offsets map directly onto indices within toks.  Each tokVec
     *  entry is [token, start offset in str, token value length].
     */
    strRet  = Tokenizer.stringize(toks, 1);
    str     = strRet[1];
    tokVec  = strRet[2];

    /* a starting position past the last token cannot match anything */
    if (idx > tokVec.length())
    {
        rexTokGlobal.grpTokVec = nil;
        return nil;
    }

    /* honor idx by starting the search at that token's first character */
    ret = rexSearch(pattern, str, tokVec[idx][2]);

    if (ret)
    {
        local startVec, endVec, matchVec;
        local cnt, startIdx, endIdx, match;
        local matchStart, matchEnd, startsOnToken, endsOnToken;

        /* first and last character offsets covered by the match */
        matchStart  = ret[1];
        matchEnd    = ret[1] + ret[2] - 1;

        /* first-character and last-character offsets of every token */
        startVec = new Vector(tokVec.length());
        endVec   = new Vector(tokVec.length());
        for (local i = 1; i <= tokVec.length(); ++i)
        {
            startVec.append(tokVec[i][2]);
            endVec.append(tokVec[i][2] + tokVec[i][3] - 1);
        }

        /*
         *  First matched token: the first one whose last character is
         *  at or after the start of the match.  The comparison is
         *  inclusive so a token whose final (or only) character is the
         *  first matched character is part of the subset.
         */
        startIdx = tokVec.length() + 1;
        for (local i = 1; i <= tokVec.length(); ++i)
        {
            if (tokVec[i][2] + tokVec[i][3] - 1 >= matchStart)
            {
                startIdx = i;
                break;
            }
        }

        /*
         *  Last matched token: the last one whose first character is at
         *  or before the end of the match.  Inclusive for the same
         *  reason, e.g. a trailing one-character punctuation token.
         */
        endIdx = tokVec.length();
        for (local i = tokVec.length(); i > 0; --i)
        {
            if (tokVec[i][2] <= matchEnd)
            {
                endIdx = i;
                break;
            }
        }

        /* collect the matching subset of tokens */
        matchVec    = new Vector(toks.length());
        cnt         = endIdx - startIdx + 1;
        if (cnt > 0)
        {
            matchVec.fillValue([], 1, cnt);
            matchVec.copyFrom(toks, startIdx, 1, cnt);     
        }

        /* classify how the match lines up with the token boundaries */
        startsOnToken = (startVec.indexOf(matchStart) != nil);
        endsOnToken   = (endVec.indexOf(matchEnd) != nil);

        if (startsOnToken && endsOnToken)
            match = TokenBoundaryMatch;
        else if (startsOnToken)
            match = TokenBoundaryErrorEnd;
        else if (endsOnToken)
            match = TokenBoundaryErrorStart;
        else
            match = TokenBoundaryErrorBoth;
            
        /*
         *  Save tokens 1..endIdx for rexTokGroup().  The saved vector
         *  must start at the first token because rexGroup() reports
         *  offsets relative to the whole searched string, and
         *  rexTokGroup() rebuilds that string from this vector.
         */
        rexTokGlobal.grpTokVec  = new Vector(toks.length());
        rexTokGlobal.grpTokVec.fillValue([], 1, endIdx);
        rexTokGlobal.grpTokVec.copyFrom(toks, 1, 1, endIdx);     
        
        ret = [match, [startIdx, matchVec.length(), matchVec.toList()]];
    }
    else
        rexTokGlobal.grpTokVec = nil;

    return ret;
}

/*
 *  Returns information on the group match for the last regular
 *  expression token search or token match.  num is the number of the
 *  parenthesized group for which to retrieve the information; the
 *  first parenthesized expression in the most recent search expression
 *  is number 1.
 *
 *  The function works on the token vector saved in
 *  rexTokGlobal.grpTokVec by the preceding rexTokMatch() or
 *  rexTokSearch() call.  If no vector was saved (the previous call
 *  found no match), or rexGroup() has nothing for the group, the
 *  function returns nil.
 *
 *  If a group match is found, returns a list consisting of the
 *  following elements:
 *
 *      [1] the degree of the token match:
 *          a)  TokenBoundaryMatch - the group starts on the first
 *              character of a token value and ends on the last
 *              character of a token value.
 *          b)  TokenBoundaryErrorStart - the first token value was only
 *              partially matched by the group.
 *          c)  TokenBoundaryErrorEnd - the last token value was only
 *              partially matched by the group.
 *          d)  TokenBoundaryErrorBoth - the first and last token values
 *              were only partially matched by the group.
 *      [2] a sublist consisting of 3 elements:
 *          a)  the index, within the saved token vector, of the first
 *              token of the matching subset (the first saved token is
 *              at index 1);
 *          b)  the number of tokens in the matching subset
 *          c)  a list giving the matching subset
 *
 *  Throws RexTokError when num is not an integer or is less than 1.
 */
rexTokGroup(num)
{
    local ret = nil;
    
    if (dataType(num) != TypeInt)
        throw new RexTokError('invalid type for rexTok function argument');
    else if (num < 1)
        throw new RexTokError('invalid value for rexTok function argument');
        
    /* only consult rexGroup() when a token match/search succeeded */
    if (rexTokGlobal.grpTokVec)
        ret = rexGroup(num);

    if (ret)
    {
        local toks, strRet, startVec, endVec, tokVec, matchVec;
        local cnt, startIdx, endIdx, match;
        local grpStart, grpEnd, startsOnToken, endsOnToken;

        toks    = rexTokGlobal.grpTokVec;

        /* 
         *  Rebuild the token offsets for the saved tokens.  Each tokVec
         *  entry is [token, start offset in string, token value length].
         */
        strRet  = Tokenizer.stringize(toks, 1);
        tokVec  = strRet[2];

        /* first and last character offsets covered by the group */
        grpStart    = ret[1];
        grpEnd      = ret[1] + ret[2] - 1;

        /* first-character and last-character offsets of every token */
        startVec  = new Vector(tokVec.length());
        endVec    = new Vector(tokVec.length());
        for (local i = 1; i <= tokVec.length(); ++i)
        {
            startVec.append(tokVec[i][2]);
            endVec.append(tokVec[i][2] + tokVec[i][3] - 1);
        }

        /*
         *  First token of the group: the first one whose last character
         *  is at or after the start of the group.  The comparison is
         *  inclusive so a token whose final (or only) character is the
         *  first grouped character is part of the subset.
         */
        startIdx = tokVec.length() + 1;
        for (local i = 1; i <= tokVec.length(); ++i)
        {
            if (tokVec[i][2] + tokVec[i][3] - 1 >= grpStart)
            {
                startIdx = i;
                break;
            }
        }

        /*
         *  Last token of the group: the last one whose first character
         *  is at or before the end of the group (inclusive, so that a
         *  trailing one-character token is not dropped).
         */
        endIdx = tokVec.length();
        for (local i = tokVec.length(); i > 0; --i)
        {
            if (tokVec[i][2] <= grpEnd)
            {
                endIdx = i;
                break;
            }
        }

        /* collect the tokens covered by the group */
        matchVec    = new Vector(toks.length());
        cnt         = endIdx - startIdx + 1;
        if (cnt > 0)
        {
            matchVec.fillValue([], 1, cnt);
            matchVec.copyFrom(toks, startIdx, 1, cnt);  
        }

        /* classify how the group lines up with the token boundaries */
        startsOnToken = (startVec.indexOf(grpStart) != nil);
        endsOnToken   = (endVec.indexOf(grpEnd) != nil);

        if (startsOnToken && endsOnToken)
            match = TokenBoundaryMatch;
        else if (startsOnToken)
            match = TokenBoundaryErrorEnd;
        else if (endsOnToken)
            match = TokenBoundaryErrorStart;
        else
            match = TokenBoundaryErrorBoth;

        ret = [match, [startIdx, matchVec.length(), matchVec.toList()]];
    }

    return ret;
}

modify Tokenizer
{
    /*
     *  Converts a token list or vector to a string.
     *
     *  toks is a list or Vector of tokens; element [1] of each token is
     *  used as its value and element [2] as its type.  start is the
     *  index of the first token to include.  The optional len argument
     *  is the index of the LAST token to include (an inclusive end
     *  index, not a count); it defaults to toks.length().
     *
     *  Returns a list [str, tokLocVec]:
     *      str         the string built from the token values
     *      tokLocVec   a Vector with one entry per included token, each
     *                  a list [token, offset of the value's first
     *                  character in str, length of the value]
     *
     *  The default stringize method builds the string by separating 
     *  each token by a single whitespace value, except for tokPunct 
     *  tokens, which are appended to the previous token.  No separator
     *  is added after the last included token.
     *
     *  Throws RexTokError when an argument has the wrong type, when
     *  start is less than 1, or when len is less than start or greater
     *  than the number of tokens.
     */
    stringize(toks, start, [len])
    {
        local tokLocVec, str = '';
        
        if (dataType(toks) != TypeList 
            && !(dataType(toks) == TypeObject && toks.ofKind(Vector)))
            throw new RexTokError('list value required');
        
        if (dataType(start) != TypeInt)
            throw new RexTokError('invalid type for rexTok function argument');
        else if (start < 1)
            throw new RexTokError('invalid value for rexTok function argument');

        /* len arrives as a varargs list; nil means "through the end" */
        len = len.car();
        if (len == nil)
            len     = toks.length();

        if (dataType(len) != TypeInt)
            throw new RexTokError('invalid type for rexTok function argument');
        else if (len < start || len > toks.length())
            throw new RexTokError('invalid value for rexTok function argument');

        tokLocVec = new Vector(len);
        for (local i = start; i <= len; ++i)
        {
            local val = toks[i][1];

            /* record where this token's value lands in the string */
            tokLocVec.append([toks[i], str.length() + 1, val.length()]);

            str += val;

            /*
             *  Separate from the next INCLUDED token with one blank,
             *  unless that token is punctuation.  Testing against len
             *  rather than the collection size keeps a stray trailing
             *  blank out of the string when len stops short of the end.
             */
            if (i < len && toks[i+1][2] != tokPunct)
                str += ' ';
        }

        return [str, tokLocVec];
    }
}

rexTokGlobal: object
    /*
     *  Token vector saved by the most recent rexTokMatch() or
     *  rexTokSearch() call and read back by rexTokGroup().  It is
     *  nil until one of those calls succeeds, and is reset to nil
     *  when one of them finds no match.
     */
    grpTokVec = nil
;

/*
 *  Exception class for the rexTok functions.  Instances are thrown
 *  when an argument has the wrong type or an invalid value; msg is
 *  the text stored as the exception's message.
 */
class RexTokError: RuntimeError
    construct(msg)
    {
        /* store the caller's message, then run the base constructor */
        exceptionMessage = msg;
        inherited(0);
    }
;