#  This is version 1.1 of Crowds
# 
#  The authors of this software are Mike Reiter and Avi Rubin
#               Copyright (c) 1997 by AT&T.
#  Permission to use, copy, and modify this software without fee is
#  hereby granted, provided that this entire notice is included in all
#  copies of any software which is or includes a copy or modification
#  of this software and in all copies of the supporting documentation
#  for such software.
# 
#  SOME PARTS OF CROWDS MAY BE RESTRICTED UNDER UNITED STATES EXPORT
#  REGULATIONS.
# 
#  THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
#  IMPLIED WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR AT&T MAKE
#  ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
#  MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
#  PURPOSE.

# The Parse package is used to find embedded URLs in lines of HTML
# text. The parser keeps state so that embedded links that span more
# than one line are found. The main routine, find_urls is fed
# state - the current state of the parser (1-3)
# hostname - the name of the server host
# port - a port number for the web server, possibly null
# path - the base path for URLs
# line - an HTML line to parse
# A list of embedded URLs is returned.
#
# The routine good_url_list takes a list of URLs and cleans them up
# by adjusting relative paths and removing dots from paths.

package URLParse;

use strict;

sub new {
    my $state = 1;
    return bless \$state, 'URLParse';
}


sub find_urls {
    my ($state,$hostname,$port,$path,$line) = @_;
    my (@return_list,$good_attributes);

    #String to match when looking for attributes with urls
    $good_attributes = "(?:src|background|dynsrc)";

    @return_list = ();

    while (!($line =~ /^\s*$/)) {
        if ($$state == 1) {

           # if line starts with src or has a whitespace before src, then it
           # is an embedded image. This avoids cgi parameters such as
           # portfolio&.list_portf=1&port_create&.src=quote&.portfolio from
           # finding an image

           if (($line =~ m!^$good_attributes!is) ||
	       ($line =~ m!\s$good_attributes!is)) {
               $line =~ s!.*?$good_attributes!!is;
               $$state = 2;
           } else {
               last;
           }
    
        } elsif ($$state == 2) {

            if ($line =~ s!^\s*=!!is) {
               $$state = 3;
            } elsif ($line =~ m!\S!is) {    #if non-space on line and no = 
                $$state = 1;                 #then go back to state 1
            }
    
        } elsif ($$state == 3) {
           
	    $line =~ s/\\"/\t/g;  #convert \" to tabs (should we do this?)
            $line =~ s/"/\t/g;    #convert quotes to tabs
            if ($line =~ s!^\s*([^>\s]+)!!is) {
                # Do not push malformed URLs such as http:/something
                if (!($1 =~ m!http:/[^/]!)) {
                    push @return_list, $1;
                }
                $$state = 1;
            }

        }
    }

    return good_url_list($hostname,$port,$path,@return_list);

}


# The following takes a list of urls that are found as embedded
# in a page, and returns a cleaned up version of the urls.

sub good_url_list {
    my ($hostname,$port,$path,@url_list) = @_;
    my @done_urls = ();
    my $url;

    #Standardize on the port
    if ($port eq 80) {
        $port = "";
    } elsif ($port) {
        $port = ":" . $port;
    }

    #Isolate the path
    if (defined $path) {
	$path =~ s/(.*\/).*/$1/ ; #remove last path element
	if ($path) {
	    # strip off first and last '/' from path if exists
	    $path =~ s/(\/)?(.*)/$2/;
	    if ($path =~ /.*\/$/) {
		$path =~ s/(.*)\/$/$1/;
	    }
	}
    }

    foreach $url (@url_list) {

	$url =~ s/^\s*"([^"]*)"$/$1/;

	if (!($url =~ m!http://[^/:]+:?\d*/??[^"]*!o)) {
            # relative URL

	    if ($url =~ m!^/!) {
                #url is an absolute path
                $url = "http://" . $hostname . $port . $url;
            } else {
                # url is a relative path		
		if ($path) {
		    $url = "http://".$hostname.$port."/".$path."/".$url;
		} else {
		    $url = "http://".$hostname.$port."/".$url; 
		}
            }
	}

        #Don't want to process null URLs
        if ($url) {
            $url = fix_path($url);
            my $url2find = quotemeta($url);
            if (grep(m|$url2find|, @done_urls) == 0) {
                push(@done_urls, $url);
	    }
        }
    }

    return @done_urls;	
}


#--- removes any "." and ".." from URLs
sub fix_path {
    my ($url) = @_;
    my $path;
    my $host;
    my $port;
    my $left;
    my $right;
    my $new_left;

    #parse the host name and port number, if exists, out of url
    ($host,$port,$path) = ($url =~ m!http://([^/:]+)(?::(\d*))?(/?.*)! );

    if ($port) {
	$port = ":$port";
    } else {
	$port = "";
    }

    #while there is a .. in the path
    while (defined($path) && $path =~ /.*?\/\.\..*?/) {
	($left,$right) = ($path =~ m!(.*?)/\.\.(.*)!);
	($new_left) = ($left =~ m!(.*)/\w*!);
	$path = $new_left.$right;
    }

    #while there is a . in the path
    while (defined($path) && $path =~ /.*?\/\.\/.*?/) {
	($left,$right) = ($path =~ m!(.*?)/\./(.*)!);
	$path = $left."/".$right;
    }

    if (defined($path)) {
	$url = "http://$host$port$path";
    } else {
	$url = "http://$host$port";
    }

    return $url;
}

sub no_dup {
    my (@attrs) = @_;
    my @clean = ();
    my $attr;
    my $last;
   
    $last = '';
    foreach $attr (@attrs) {
        if ($last ne $attr) {
            push (@clean,$attr);
            $last = $attr;
        }
    }

    return @clean;
}

1;
