/*
  tidy.c - HTML parser and pretty printer

  Copyright (c) 1998-2000 World Wide Web Consortium
  (Massachusetts Institute of Technology, Institut National de
  Recherche en Informatique et en Automatique, Keio University).
  All Rights Reserved.

  Contributing Author(s):

     Dave Raggett <dsr@w3.org>

  The contributing author(s) would like to thank all those who
  helped with testing, bug fixes, and patience.  This wouldn't
  have been possible without all of you.

  COPYRIGHT NOTICE:
 
  This software and documentation is provided "as is," and
  the copyright holders and contributing author(s) make no
  representations or warranties, express or implied, including
  but not limited to, warranties of merchantability or fitness
  for any particular purpose or that the use of the software or
  documentation will not infringe any third party patents,
  copyrights, trademarks or other rights. 

  The copyright holders and contributing author(s) will not be
  liable for any direct, indirect, special or consequential damages
  arising out of any use of the software or documentation, even if
  advised of the possibility of such damage.

  Permission is hereby granted to use, copy, modify, and distribute
  this source code, or portions hereof, documentation and executables,
  for any purpose, without fee, subject to the following restrictions:

  1. The origin of this source code must not be misrepresented.
  2. Altered versions must be plainly marked as such and must
     not be misrepresented as being the original source.
  3. This Copyright notice may not be removed or altered from any
     source or altered source distribution.
 
  The copyright holders and contributing author(s) specifically
  permit, without fee, and encourage the use of this source code
  as a component for supporting the Hypertext Markup Language in
  commercial products. If you use this source code in a product,
  acknowledgment is not required but would be appreciated.
*/
/*
  File Altered by Ajuba Solutions. 
  Modifications Copyright (c) 2000 Ajuba Solutions
*/


#include "platform.h"
#include "html.h"

namespace tidy {

/* set-up and tear-down entry points; both are defined further down */
void InitTidy(void);
void DeInitTidy(void);


/* debugging aids; none of the three is referenced again in this file */
Bool        debug_flag = no;
Node       *debug_element = null;
Lexer      *debug_lexer = null;
/* running totals: tidy_main adds each document's lexer counts to these */
uint       totalerrors = 0;
uint       totalwarnings = 0;
/* not used in this file -- presumably counts bad options; TODO confirm */
uint       optionerrors = 0;
/* defined in another file; tidy_main hands it to HelloMessage */
extern char *release_date;

/* stream currently being processed; FatalError closes it unless it is stdin */
FILE *input;


/*
  Report an unrecoverable error and terminate the process.

  msg is written to stderr after the prefix "Fatal error: ". Tidy's
  resources are then released with DeInitTidy(), the current input
  stream is closed unless it is stdin, and the process exits with
  status 2. This function does not return.

  msg is const-qualified because the callers in this file pass string
  literals, which ISO C++ does not allow to be bound to a plain char *.
*/
static void FatalError(const char *msg)
{
    fprintf(stderr, "Fatal error: %s\n", msg);
    DeInitTidy();

    if (input && input != stdin)
        fclose(input);

    /* 2 signifies a serious error */
    exit(2);
}

/*
  Allocate size bytes with malloc() and return the block.

  Never returns null: when the allocation fails FatalError() is called
  and the process exits. A request for zero bytes is rounded up to one
  byte, because malloc(0) is allowed to return a null pointer and that
  would otherwise be misreported as running out of memory.
*/
void *MemAlloc(uint size)
{
    void *p;

    p = malloc(size != 0 ? size : 1);

    if (!p)
        FatalError("Out of memory!");

    return p;
}

/*
  Resize the block at mem to newsize bytes and return the (possibly
  moved) block.

  A null mem is treated as a fresh allocation through MemAlloc().
  Never returns null: when realloc() fails FatalError() is called and
  the process exits. A new size of zero is rounded up to one byte,
  because realloc(mem, 0) may release the block and return null, which
  would otherwise be misreported as running out of memory.
*/
void *MemRealloc(void *mem, uint newsize)
{
    void *p;

    if (mem == (void *)null)
        return MemAlloc(newsize);

    p = realloc(mem, newsize != 0 ? newsize : 1);

    if (!p)
        FatalError("Out of memory!");

    return p;
}

/*
  Hand a block back to the C allocator with free().
  A null pointer is accepted and ignored.
*/
void MemFree(void *mem)
{
    if (mem == (void *)null)
        return;

    free(mem);
}

/*
  Set the first size bytes of the block at mem to zero.
*/
void ClearMemory(void *mem, uint size)
{
    (void) memset(mem, 0, size);
}

/*
  Wrap an already opened stdio stream in a newly allocated StreamIn.

  The FILE pointer becomes the stream's client data and the stdio
  based callbacks tidy_eof and tidy_getc are installed. The position
  starts at line 1, column 1 and the encoding is taken from the
  current CharEncoding setting. The structure comes from MemAlloc();
  tidy_main releases it with MemFree().
*/
static StreamIn *OpenInput(FILE *fp)
{
    StreamIn *stream = (StreamIn *)MemAlloc(sizeof(StreamIn));

    /* where the characters come from */
    stream->clientData = (void *) fp;
    stream->getcproc = tidy_getc;
    stream->eofproc = tidy_eof;

    /* decoder state */
    stream->encoding = CharEncoding;
    stream->state = FSM_ASCII;
    stream->pushed = no;
    stream->c = '\0';

    /* position tracking */
    stream->tabs = 0;
    stream->curline = 1;
    stream->curcol = 1;

    return stream;
}


/*
  Like strdup, but the copy is allocated with MemAlloc.
  Returns null when str is null.
*/
char *wstrdup(char *str)
{
    char *copy;
    int i, len;

    if (str == null)
        return null;

    len = 0;
    while (str[len] != '\0')
        ++len;

    copy = (char *)MemAlloc(sizeof(char)*(1+len));

    /* i == len copies the terminating NUL as well */
    for (i = 0; i <= len; ++i)
        copy[i] = str[i];

    return copy;
}

/*
  Like strndup, but the copy is allocated with MemAlloc.

  At most len characters of str are copied and the result is always
  NUL terminated. Returns null when str is null or len is negative.
*/
char *wstrndup(char *str, int len)
{
    char *copy;
    int n;

    if (str == null || len < 0)
        return null;

    copy = (char *)MemAlloc(sizeof(char)*(1+len));

    /* copy until len characters are done or a NUL has been copied */
    n = 0;
    while (n < len)
    {
        copy[n] = str[n];

        if (copy[n++] == '\0')
            break;
    }

    copy[n] = '\0';
    return copy;
}

/*
  Copy characters from s2 to s1; nothing is done if either is null.

  Note that this is not the same as strncpy. With a non-negative size
  exactly size characters are copied: copying does not stop at a NUL
  in s2, and s1 is neither NUL-padded nor NUL-terminated, so s2 must
  hold at least size characters. With a negative size everything up to
  and including the terminating NUL is copied, as wstrcpy does.
*/
void wstrncpy(char *s1, char *s2, int size)
{
    int i;

    if (s1 == null || s2 == null)
        return;

    if (size < 0)
    {
        while ((*s1++ = *s2++) != '\0')
            ;

        return;
    }

    for (i = 0; i < size; ++i)
        s1[i] = s2[i];
}

/*
  Copy s2, including its terminating NUL, into s1.
  s1 must be large enough; no bounds are checked.
*/
void wstrcpy(char *s1, char *s2)
{
    for (;;)
    {
        char ch = *s2++;

        *s1++ = ch;

        if (ch == '\0')
            break;
    }
}

/*
  Append s2, including its terminating NUL, to the end of s1.
  s1 must have room for the result; no bounds are checked.
*/
void wstrcat(char *s1, char *s2)
{
    char *end = s1;

    /* find the terminator of the existing text */
    while (*end != '\0')
        ++end;

    /* copy up to and including the NUL of s2 */
    do
    {
        *end = *s2++;
    }
    while (*end++ != '\0');
}

/*
  Compare two NUL terminated strings the way strcmp does.

  Returns 0 when the strings are equal, 1 when s1 sorts after s2 and
  -1 when it sorts before. Characters are compared as unsigned char,
  as strcmp does, so bytes above 127 sort after ASCII regardless of
  whether plain char is signed on the platform.
*/
int wstrcmp(char *s1, char *s2)
{
    unsigned char c1, c2;

    for (;;)
    {
        c1 = (unsigned char)*s1++;
        c2 = (unsigned char)*s2++;

        if (c1 != c2)
            return (c1 > c2 ? 1 : -1);

        /* both strings ended at the same place */
        if (c1 == '\0')
            return 0;
    }
}

/*
  Length of a NUL terminated string.
  The result counts bytes, not characters.
*/
int wstrlen(char *str)
{
    char *end = str;

    while (*end != '\0')
        ++end;

    return (int)(end - str);
}

/*
 MS C 4.2 doesn't include strcasecmp.
 Note that tolower and toupper won't
 work on chars > 127
*/
int wstrcasecmp(char *s1, char *s2)    
{
    uint c;

    while (c = (uint)(*s1), ToLower(c) == ToLower((uint)(*s2)))
    {
        if (c == '\0')
            return 0;

        ++s1;
        ++s2;
    }

    return (*s1 > *s2 ? 1 : -1);
}

/*
  Compare at most n characters of two strings, like strncmp.

  Returns 0 when the first n characters match or both strings end
  before that, otherwise 1 or -1 according to the first difference.
  Characters are compared as unsigned char, as strncmp does. Nothing
  beyond the first n characters of either string is read. An n of
  zero always yields 0; a negative n places no limit on the
  comparison.
*/
int wstrncmp(char *s1, char *s2, int n)    
{
    unsigned char c1, c2;

    while (n != 0)
    {
        c1 = (unsigned char)*s1;
        c2 = (unsigned char)*s2;

        if (c1 != c2)
            return (c1 > c2 ? 1 : -1);

        /* both strings ended before the limit was reached */
        if (c1 == '\0')
            return 0;

        ++s1;
        ++s2;
        --n;
    }

    return 0;
}

/*
  Compare at most n characters of two strings, ignoring case.

  Returns 0 when the first n characters match after folding with
  tolower or both strings end before that, otherwise 1 or -1 according
  to the first pair of folded characters that differ (so "B" sorts
  after "a"). Each character is converted to unsigned char before it
  is passed to tolower, which is undefined for negative values other
  than EOF. Nothing beyond the first n characters is read. An n of
  zero always yields 0; a negative n places no limit on the
  comparison.
*/
int wstrncasecmp(char *s1, char *s2, int n)    
{
    int c1, c2;

    while (n != 0)
    {
        c1 = tolower((unsigned char)*s1);
        c2 = tolower((unsigned char)*s2);

        if (c1 != c2)
            return (c1 > c2 ? 1 : -1);

        /* both strings ended before the limit was reached */
        if (c1 == '\0')
            return 0;

        ++s1;
        ++s2;
        --n;
    }

    return 0;
}

/*
  Case-insensitive substring test.

  Returns yes when s2 occurs somewhere in s1, comparing with
  wstrncasecmp, and no otherwise.
*/
Bool wsubstr(char *s1, char *s2)
{
    int len1 = wstrlen(s1);
    int len2 = wstrlen(s2);
    int last = len1 - len2;   /* final offset at which s2 still fits */
    int start;

    for (start = 0; start <= last; ++start)
    {
        if (wstrncasecmp(s1 + start, s2, len2) == 0)
            return yes;
    }

    return no;
}

/*
  End-of-file callback for stdio backed input streams: applies feof
  to the FILE stored in the stream's client data.
*/
int tidy_eof(StreamIn *in)
{
    FILE *fp = (FILE *)in->clientData;

    return feof(fp);
}

/*
  Read callback for stdio backed input streams: fetches the next byte
  with getc from the FILE stored in the stream's client data. The int
  returned by getc is converted to uint, so EOF comes back as a large
  unsigned value.
*/
uint tidy_getc(StreamIn *in)
{
    FILE *fp = (FILE *)in->clientData;

    return getc(fp);
}

/*
  Write callback for stdio backed output streams: sends c with putc
  to the FILE stored in the stream's client data. The result of putc
  is not checked.
*/
void tidy_putc(uint c, Out *out)
{
    putc(c, (FILE *) out->clientData);
}

/*
  Message callback for stdio backed error streams: formats msg with
  vfprintf onto the FILE stored in the stream's client data and
  returns whatever vfprintf returned.
*/
int tidy_vfprintf(ErrOut *out, const char *msg, va_list arg)
{
    FILE *fp = (FILE *) out->clientData;

    return vfprintf(fp, msg, arg);
}



/*
  Write one character to out in the stream's output encoding, using
  the stream's putcproc callback for every byte.

  UTF8: c is written as a one to six byte sequence in the original
  (RFC 2279) scheme. Values above 0x3FFFFFF take six bytes; they used
  to be squeezed into five bytes, which corrupted the lead byte.
  Values above 0x7FFFFFFF cannot be represented and are not checked
  for.

  ISO2022: the escape sequences are tracked with a small state machine
  kept in out->state, and while the stream is in the FSM_NONASCII state
  the top bit of each character is cleared before it is written.

  Any other encoding: c is passed through unchanged.

  For mac users, should we map Unicode back to MacRoman?
*/
void outc(uint c, Out *out)
{
    uint ch;
    
    if (out->encoding == UTF8)
    {
        if (c < 128)
            out->putcproc(c, out);
        else if (c <= 0x7FF)
        {
            ch = (0xC0 | (c >> 6)); out->putcproc(ch, out);
            ch = (0x80 | (c & 0x3F)); out->putcproc(ch, out);
        }
        else if (c <= 0xFFFF)
        {
            ch = (0xE0 | (c >> 12)); out->putcproc(ch, out);
            ch = (0x80 | ((c >> 6) & 0x3F)); out->putcproc(ch, out);
            ch = (0x80 | (c & 0x3F)); out->putcproc(ch, out);
        }
        else if (c <= 0x1FFFFF)
        {
            ch = (0xF0 | (c >> 18)); out->putcproc(ch, out);
            ch = (0x80 | ((c >> 12) & 0x3F)); out->putcproc(ch, out);
            ch = (0x80 | ((c >> 6) & 0x3F)); out->putcproc(ch, out);
            ch = (0x80 | (c & 0x3F)); out->putcproc(ch, out);
        }
        else if (c <= 0x3FFFFFF)
        {
            ch = (0xF8 | (c >> 24)); out->putcproc(ch, out);
            ch = (0x80 | ((c >> 18) & 0x3F)); out->putcproc(ch, out);
            ch = (0x80 | ((c >> 12) & 0x3F)); out->putcproc(ch, out);
            ch = (0x80 | ((c >> 6) & 0x3F)); out->putcproc(ch, out);
            ch = (0x80 | (c & 0x3F)); out->putcproc(ch, out);
        }
        else
        {
            /* six byte form: only the top bit of c lands in the lead byte */
            ch = (0xFC | (c >> 30)); out->putcproc(ch, out);
            ch = (0x80 | ((c >> 24) & 0x3F)); out->putcproc(ch, out);
            ch = (0x80 | ((c >> 18) & 0x3F)); out->putcproc(ch, out);
            ch = (0x80 | ((c >> 12) & 0x3F)); out->putcproc(ch, out);
            ch = (0x80 | ((c >> 6) & 0x3F)); out->putcproc(ch, out);
            ch = (0x80 | (c & 0x3F)); out->putcproc(ch, out);
        }
    }
    else if (out->encoding == ISO2022)
    {
        if (c == 0x1b)  /* ESC */
            out->state = FSM_ESC;
        else
        {
            switch (out->state)
            {
            case FSM_ESC:
                if (c == '$')
                    out->state = FSM_ESCD;
                else if (c == '(')
                    out->state = FSM_ESCP;
                else
                    out->state = FSM_ASCII;
                break;

            case FSM_ESCD:
                if (c == '(')
                    out->state = FSM_ESCDP;
                else
                    out->state = FSM_NONASCII;
                break;

            case FSM_ESCDP:
                out->state = FSM_NONASCII;
                break;

            case FSM_ESCP:
                out->state = FSM_ASCII;
                break;

            case FSM_NONASCII:
                c &= 0x7F;
                break;

            default:
                /* any other state: the character passes through as is */
                break;
            }
        }

        out->putcproc(c, out);
    }
    else
        out->putcproc(c, out);
}

/*
  One-time initialization; it should run before the command line is
  read. Calls the Init* routine of each subsystem, zeroes the error
  and warning totals, switches off the listed boolean options, clears
  the input and errfile pointers and, when CONFIG_FILE is defined at
  build time, parses that configuration file with messages sent to
  stderr.
*/
void InitTidy(void)
{
    /* subsystem set-up, in the original order */
    InitMap();
    InitAttrs();
    InitTags();
    InitEntities();
    InitConfig();

    totalwarnings = 0;
    totalerrors = 0;

    OnlyErrors = no;
    writeback = no;
    MakeClean = no;
    UpperCaseTags = no;
    HideEndTags = no;
    XmlOut = no;
    XmlTags = no;

    input = null;
    errfile = null;

#ifdef CONFIG_FILE
    {
        ErrOut errout;

        errout.clientData = stderr;
        errout.vprintfproc = tidy_vfprintf;
        ParseConfigFile(CONFIG_FILE, &errout);
    }
#endif
}

/*
  call this when you have finished with tidy
  to free the hash tables and other resources

  It simply calls the Free* routine of each subsystem in turn; it is
  also called by FatalError() and by tidy_main() before they leave.
*/
void DeInitTidy(void)
{
    /* NOTE(review): the order is kept as found; whether these routines
       depend on one another cannot be seen from this file */
    FreeTags();
    FreeAttrTable();
    FreeEntities();
    FreeConfig();
    FreePrintBuf();
}

/*
  Parse the document arriving on input and run the clean-up passes
  selected by the current configuration.

  A new lexer is created for the stream and errout becomes its message
  sink. With XmlTags set the input is parsed as generic XML and left
  otherwise untouched; in all other cases it is parsed as HTML and
  then cleaned up and given a doctype.

  Returns 0 on success, with the lexer stored in *lexptr and the parse
  tree in *docptr. Returns 1, after printing a panic message to errout
  and without storing anything, when CheckNodeIntegrity rejects the
  tree.
*/
EXTERN int ReadInput (StreamIn *input, Lexer **lexptr, Node **docptr, ErrOut *errout)
{
    Lexer *lex = NewLexer(input);
    Node *doc;

    lex->errout = errout;

    /*
      the stream keeps a pointer back to its lexer so that
      character encoding errors can be reported
    */
    lex->in->lexer = lex;

    if (XmlTags)
    {
        /* Tidy doesn't alter the doctype for generic XML docs */
        doc = ParseXMLDocument(lex);

        *lexptr = lex;
        *docptr = doc;
        return 0;
    }

    lex->warnings = 0;

    doc = ParseDocument(lex);

    if (!CheckNodeIntegrity(doc))
    {
        tidy_out(errout, "\nPanic - tree has lost its integrity\n");
        return 1;
    }

    /* simplifies <b><b> ... </b> ...</b> etc. */
    NestedEmphasis(doc);

    /* cleans up <dir>indented text</dir> etc. */
    List2BQ(doc);
    BQ2Div(doc);

    /* replaces i by em and b by strong */
    if (LogicalEmphasis)
        EmFromI(doc);

    if (Word2000 && IsWord2000(doc))
    {
        /* prune Word2000's <![if ...]> ... <![endif]> */
        DropSections(lex, doc);

        /* drop style & class attributes and empty p, span elements */
        CleanWord2000(lex, doc);
    }

    /* replaces presentational markup by style rules */
    if (MakeClean || DropFontTags)
        CleanTree(lex, doc);

    /* the passes above rewrite the tree, so check it once more */
    if (!CheckNodeIntegrity(doc))
    {
        tidy_out(errout, "\nPanic - tree has lost its integrity\n");
        return 1;
    }

    if (doc->content)
    {
        if (xHTML)
            SetXHTMLDocType(lex, doc);
        else
            FixDocType(lex, doc);

        if (TidyMark)
            AddGenerator(lex, doc);
    }

    /* ensure presence of initial <?XML version="1.0"?> */
    if (XmlOut && XmlPi)
        FixXMLPI(lex, doc);

    *lexptr = lex;
    *docptr = doc;
    return 0;
}

/*
  Pretty print the parse tree rooted at document onto out.

  The XML printer is used when XmlTags is set and the HTML printer
  otherwise; the pending output line is flushed afterwards.
  Always returns 0.
*/
EXTERN int WriteOutput(Lexer *lexer, Node *document, Out *out)
{
    if (!XmlTags)
        PPrintTree(out, null, 0, lexer, document);
    else
        PPrintXMLTree(out, null, 0, lexer, document);

    PFlushLine(out, 0);

    return 0;
}


/*
  Command line driver: reads the options and tidies each named file in
  turn, or stdin when no file is named.

  Options and file names may be interleaved; the options seen so far
  apply to each file as it is reached. Messages go to stderr unless an
  error file was selected with -f/-file. Unless errors were found or
  -e/-errors was given, the tidied markup is written to stdout, back
  over the source file (-m/-modify) or out as slides (-slides).

  Returns 2 if any document had errors, 1 if there were only warnings
  and 0 if all is well. Printing the help text returns 1 and printing
  the version returns 0. Calls exit(1) if ReadInput reports that a
  parse tree has lost its integrity.
*/
EXTERN int tidy_main(int argc, char **argv)
{
    ErrOut errout, warnout;
    char *file, *prog;
    Node *document, *doctype;
    Lexer *lexer;
    char *s, c, *arg, *current_errorfile = "stderr";
    Out out;   /* normal output stream */

#if PRESERVEFILETIMES
    struct utimbuf filetimes;
    struct stat sbuf;
#endif
    Bool haveFileTimes;

    InitTidy();

    warnout.clientData = (void *)stdout;
    warnout.vprintfproc = tidy_vfprintf;
    errout.clientData = (void *)stderr;
    errout.vprintfproc = tidy_vfprintf;

    /* look for env var "HTML_TIDY" */
    /* then for ~/.tidyrc (on Unix) */

    if ((file = getenv("HTML_TIDY")))
        ParseConfigFile(file, &errout);
#ifdef SUPPORT_GETPWNAM
    else
        ParseConfigFile("~/.tidyrc", &errout);
#endif /* SUPPORT_GETPWNAM */

    /* read command line */

    prog = argv[0];

    while (argc > 0)
    {
        if (argc > 1 && argv[1][0] == '-')
        {
            /*
              arg is the option without its leading '-'. A second '-'
              is not stripped, so apart from the spellings tested
              explicitly below, "--name" falls through to the
              ParseConfig() branch.
            */
            arg = argv[1] + 1;

            /*
              NOTE(review): "-indent" switches on IndentContent only,
              whereas the single letter form "-i" below also switches
              on SmartIndent. A second, unreachable test for "indent"
              that set both flags used to follow "asxml"; it has been
              removed without changing what "-indent" does. Confirm
              which behaviour is wanted.
            */
            if (strcmp(arg, "indent") == 0)
                IndentContent = yes;
            else if (strcmp(arg, "xml") == 0)
                XmlTags = yes;
            else if (strcmp(arg, "asxml") == 0 || strcmp(arg, "asxhtml") == 0)
                xHTML = yes;
            else if (strcmp(arg, "omit") == 0)
                HideEndTags = yes;
            else if (strcmp(arg, "upper") == 0)
                UpperCaseTags = yes;
            else if (strcmp(arg, "clean") == 0)
                MakeClean = yes;
            else if (strcmp(arg, "raw") == 0)
                CharEncoding = RAW;
            else if (strcmp(arg, "ascii") == 0)
                CharEncoding = ASCII;
            else if (strcmp(arg, "latin1") == 0)
                CharEncoding = LATIN1;
            else if (strcmp(arg, "utf8") == 0)
                CharEncoding = UTF8;
            else if (strcmp(arg, "iso2022") == 0)
                CharEncoding = ISO2022;
            else if (strcmp(arg, "mac") == 0)
                CharEncoding = MACROMAN;
            else if (strcmp(arg, "numeric") == 0)
                NumEntities = yes;
            else if (strcmp(arg, "modify") == 0)
                writeback = yes;
            else if (strcmp(arg, "change") == 0)  /* obsolete */
                writeback = yes;
            else if (strcmp(arg, "update") == 0)  /* obsolete */
                writeback = yes;
            else if (strcmp(arg, "errors") == 0)
                OnlyErrors = yes;
            else if (strcmp(arg, "quiet") == 0)
                Quiet = yes;
            else if (strcmp(arg, "slides") == 0)
                BurstSlides = yes;
            else if (strcmp(arg, "help") == 0 ||
                     argv[1][1] == '?'|| argv[1][1] == 'h')
            {
                HelpText(&warnout, prog);
                /* free hash tables etc., as the version option does */
                DeInitTidy();
                return 1;
            }
            else if (strcmp(arg, "config") == 0)
            {
                if (argc >= 3)
                {
                    ParseConfigFile(argv[2], &errout);
                    --argc;
                    ++argv;
                }
            }
            else if (strcmp(argv[1], "-file") == 0 ||
                     strcmp(argv[1], "--file") == 0 ||
                        strcmp(argv[1], "-f") == 0)
            {
                if (argc >= 3)
                {
                    /* create copy that can be freed by FreeConfig() */
                    errfile = wstrdup(argv[2]);
                    --argc;
                    ++argv;
                }
            }
            else if (strcmp(argv[1], "-wrap") == 0 ||
                        strcmp(argv[1], "--wrap") == 0 ||
                        strcmp(argv[1], "-w") == 0)
            {
                if (argc >= 3)
                {
                    /* wraplen keeps its old value if this is not a number */
                    sscanf(argv[2], "%d", &wraplen);
                    --argc;
                    ++argv;
                }
            }
            else if (strcmp(argv[1], "-version") == 0 ||
                        strcmp(argv[1], "--version") == 0 ||
                        strcmp(argv[1], "-v") == 0)
            {
                ShowVersion(&errout);
                /* called to free hash tables etc. */
                DeInitTidy();
                return 0;

            }
            else if(strncmp(argv[1],"--",2)==0)
            {
                /* a true result means the following argument was used up */
                if (ParseConfig(argv[1]+2, argv[2], &errout))
                {
                    ++argv;
                    --argc;
                }
            }
            else
            {
                /* a cluster of single letter flags such as -imu */
                s = argv[1];

                while ((c = *++s))
                {
                    if (c == 'i')
                    {
                        IndentContent = yes;
                        SmartIndent = yes;
                    }
                    else if (c == 'o')
                        HideEndTags = yes;
                    else if (c == 'u')
                        UpperCaseTags = yes;
                    else if (c == 'c')
                        MakeClean = yes;
                    else if (c == 'n')
                        NumEntities = yes;
                    else if (c == 'm')
                        writeback = yes;
                    else if (c == 'e')
                        OnlyErrors = yes;
                    else if (c == 'q')
                        Quiet = yes;
                    else
                        UnknownOption(&errout, c);
                }
            }

            --argc;
            ++argv;
            continue;
        }

        /* ensure config is self-consistent */
        AdjustConfig();

        /* user specified error file */
        if (errfile)
        {
            FILE *fp;

            /* is it same as the currently opened file? */
            if (wstrcmp(errfile, current_errorfile) != 0)
            {
                /* no so close previous error file */

                if (errout.clientData != (void *) stderr)
                    fclose((FILE *)errout.clientData);

                /* and try to open the new error file */
                fp = fopen(errfile, "w");

                if (fp != null)
                {
                    errout.clientData = (void *)fp;
                    current_errorfile = errfile;
                }
                else /* can't be opened so fall back to stderr */
                {
                    errout.clientData = (void *)stderr;
                    current_errorfile = "stderr";
                }
            }
        }

        haveFileTimes = no;

        if (argc > 1)
        {
            file = argv[1];
            input = fopen(file, "r");

#if PRESERVEFILETIMES
            /* get last modified time */
            if (KeepFileTimes && input && fstat(fileno(input), &sbuf) != -1)
            {
                filetimes.actime = sbuf.st_atime;
                filetimes.modtime = sbuf.st_mtime;
                haveFileTimes = yes;
            }
#endif
        }
        else
        {
            input = stdin;
            file = "stdin";
        }

        if (input != null)
        {
            if (!Quiet)
                HelloMessage(&errout, release_date, file);

            if (ReadInput(OpenInput(input), &lexer, &document, &errout) == 1)
            {
                exit(1);
            }

            doctype = FindDocType(document);

            totalwarnings += lexer->warnings;
            totalerrors += lexer->errors;

            if (!Quiet && document->content)
            {
                ReportVersion(&errout, lexer, file, doctype);
                ReportNumWarnings(&errout, lexer);
            }

            if (input != stdin)
            {
                fclose(input);
                /*
                  cleared so that FatalError(), which closes the
                  global input, cannot close this stream a second time
                */
                input = null;
            }

            MemFree(lexer->in);

            if (lexer->errors > 0)
                NeedsAuthorIntervention(&errout);

            out.state = FSM_ASCII;
            out.encoding = CharEncoding;

            if (!OnlyErrors && lexer->errors == 0)
            {
                if (BurstSlides)
                {
                    Node *body;

                    /*
                       remove doctype to avoid potential clash with
                       markup introduced when bursting into slides
                    */
                    /* discard the document type */
                    doctype = FindDocType(document);

                    if (doctype)
                        DiscardElement(doctype);

                    /* slides use transitional features */
                    lexer->versions |= VERS_HTML40_LOOSE;

                    /* and patch up doctype to match */
                    if (xHTML)
                        SetXHTMLDocType(lexer, document);
                    else
                        FixDocType(lexer, document);


                    /* find the body element which may be implicit */
                    body = FindBody(document);

                    if (body)
                    {
                        ReportNumberOfSlides(&errout, CountSlides(body));
                        CreateSlides(lexer, document);
                    }
                    else
                        MissingBody(&errout);
                }
                /*
                  Write back only when the markup came from a real
                  file: input is still stdin here if it was read from
                  standard input, and "file" is then just the label
                  "stdin", not a path to overwrite. If the file cannot
                  be reopened the output goes to stdout instead.
                */
                else if (writeback && input != stdin &&
                         (input = fopen(file, "w")))
                {
                    out.clientData = (void *) input;
                    out.putcproc = tidy_putc;

                    WriteOutput(lexer, document, &out);
#if PRESERVEFILETIMES
                    /* set file last accessed/modified times to original values */
                    if (haveFileTimes)
                        futime(fileno(input), &filetimes);
#endif
                    fclose(input);
                    input = null;   /* see the note on the first fclose */
                }
                else
                {
                    out.clientData = (void *) stdout;
                    out.putcproc = tidy_putc;

                    WriteOutput(lexer, document, &out);

                    PFlushLine(&out, 0);
                }

            }

            ErrorSummary(lexer);
            FreeNode(document);
            FreeLexer(lexer);
        }
        else
            UnknownFile(&errout, prog, file);

        --argc;
        ++argv;

        if (argc <= 1)
            break;
    }

    if (totalerrors + totalwarnings > 0)
        GeneralInfo(&errout);

    if (errout.clientData != (void *)stderr)
        fclose((FILE *)errout.clientData);

    /* called to free hash tables etc. */
    DeInitTidy();

    /* return status can be used by scripts */

    if (totalerrors > 0)
        return 2;

    if (totalwarnings > 0)
        return 1;

    /* 0 signifies all is ok */
    return 0;
}

}
