#!/usr/local/bin/perl

use 5.005;
use strict;
use Getopt::Long;
use LWP::UserAgent;
use HTML::Parser;
use Pod::Usage;
use Term::ReadKey;
use URI;

$main::VERSION = "1.04";


package HTML::Parser::Links;

use base qw(HTML::Parser);

sub new
{
    my($class, $base) = @_;

    my $parser = new HTML::Parser;
    $parser->{base }    = $base;
    $parser->{links}    = [];
    $parser->{fragment} = {};

    bless $parser, $class
}


sub start
{
    my($parser, $tag, $attr, $attrseq, $origtext) = @_;

    $tag eq 'base' and defined $attr->{href} and 
	$parser->{base} = $attr->{href};

    $tag eq 'a' and $attr->{href} and do
    {
	my $base = $parser->{base};
	my $href = $attr->{href};
	my $uri  = new_abs URI $href, $base;
	push @{$parser->{links}}, $uri;
    };

    $tag eq 'a' and $attr->{name} and do
    {
	my $name = $attr->{name};
	$parser->{fragment}{$name} = 1;
    };

    $tag eq 'img' and $attr->{src} and do
    {
	my $base = $parser->{base};
	my $src  = $attr->{src};
	my $uri  = new_abs URI $src, $base;
	push @{$parser->{links}}, $uri;
    };

    $tag eq 'frame' and $attr->{src} and do
    {
	my $base = $parser->{base};
	my $src  = $attr->{src};
	my $uri  = new_abs URI $src, $base;
	push @{$parser->{links}}, $uri;
    };
}


sub links
{
    my $parser = shift;
    $parser->{links}
}


sub check_fragment
{
    my($parser, $fragment) = @_;
    $parser->{fragment}{$fragment}
}


package HTTP::A11N;

# We hoist these into a base class,
# because we need them in both Page and Link

sub get_authorized
{
    my($self, $ua, $request, $response) = @_;

    my $challenge = $response->www_authenticate;
    my($scheme, $realm) = $self->parse_challenge($challenge);
    $scheme eq 'basic' or return $response;

    my $a11n = $self->{a11n};
    my $credentials = $a11n->credentials($request->uri, $realm);
    $credentials or return $response;

    $request->authorization_basic(@$credentials);
    $ua->request($request)
}


sub parse_challenge
{
    my($self, $challenge) = @_;

    my($scheme, $realm) =
	$challenge =~ m[       (\w  +)   # scheme
			\s+
			realm="([^"]+)"  # realm
                       ]ix;

    $scheme = lc $scheme;

    ($scheme, $realm)
}


package Page;

use base qw(HTTP::A11N);

sub new
{
    my($package, $uri, $a11n) = @_;

    $Page::Cache{$uri} and
	return $Page::Cache{$uri};

    my $page = { uri  => $uri,
	         base => $uri,
	         a11n => $a11n};

    bless $page, $package;

    $Page::Cache{$uri} = $page;
    $page
}


sub uri  { shift->{'uri' } }
sub base { shift->{'base'} }


sub get
{
    my $page = shift;

    defined $page->{content} and
	return $page->{content};

    my $uri      = $page->{uri};
    my $ua       = new LWP::UserAgent;
    my $request  = new HTTP::Request GET => $uri;
    my $response = $ua->request($request);

    $response->code == 401 and
	$response = $page->get_authorized($ua, $request, $response);

    $response->is_success or
	return undef;

    $page->{base}         = $response->request->uri;
    $page->{content}      = $response->content;
    $page->{content_type} = $response->content_type;

    $response->content
}


sub parse
{
    my $page = shift;

    $page->{parser} and
	return $page->{parser};

    my $content = $page->get;
    defined $content or 
	return undef;

    my $parser = new HTML::Parser::Links $page->base;
       $parser->parse($content);
       $parser->eof;

    $page->{parser} = $parser;
    $parser
}


sub links
{
    my $page   = shift;
    my $parser = $page->parse;
    defined $parser or 
	return undef;

    $parser->links
}

sub content_type { shift->{content_type} }


package Link;

use base qw(HTTP::A11N);

sub new
{
    my($package, $uri, $a11n, %options) = @_;
    
    $Link::Cache{$uri} and
	return $Link::Cache{$uri};

    my $base     = $uri ->clone;
    my $fragment = $base->fragment(undef);
    
    my $link = { uri      =>  $uri,
		 a11n     =>  $a11n,
		 options  => \%options,
	         base     =>  $base,
	         fragment =>  $fragment };

    bless $link, $package;

    $Link::Cache{$uri} = $link;
    $link
}


sub check
{
    my $link = shift;

    defined $link->{ok} and 
	return $link->{ok};

    my $fragment = $link->{fragment};
    my $no_nulls = not $link->{options}{'null-frags'};
    my $check    = (length  $fragment or 
                    defined $fragment and $no_nulls) ? 'check_fragment' :
		                                       'check_base';

    my $ok = $link->$check();
    $link->{ok} = $ok;

    $ok
}


sub check_fragment
{
    my $link     = shift;
    my $base     = $link->{base};
    my $fragment = $link->{fragment};

    my $page     = new Page $base;
    my $parser   = $page->parse;
    defined $parser or return '';

    $link->{content_type} = $page->content_type;

    $parser->check_fragment($fragment)
}


sub check_base
{
    my $link = shift;
    my $base = $link->{base};

    my $ua       = new LWP::UserAgent;
    my $request  = new HTTP::Request HEAD => $base;
    my $response = $ua->request($request);

    $response->code == 401 and
	$response = $link->get_authorized($ua, $request, $response);

    # Some servers don't like HEAD requests
    $response->is_success or do
    {
	$request  = new HTTP::Request GET => $base;
	$response = $ua->request($request);

	$response->code == 401 and
	    $response = $link->get_authorized($ua, $request, $response);
    };

    $link->{content_type} = $response->content_type;
    $response->is_success;
}

sub content_type { return shift->{content_type} }


sub below_or_equal 
{
    my($link, $page) = @_;
    my $checked = $link->{uri}->path;
    my $orig    = $page->{uri}->path;

    $checked    =~ s|/[^/]*$||;   # remove last component
    $orig       =~ s|/[^/]*$||;

    substr($checked, 0, length $orig) eq $orig
}


package A11N;  # A-uthorizatio-N

sub new
{
    my($package, $spaces) = @_;
    my $a11n = bless { }, $package;
    $a11n->spaces($spaces);
    $a11n
}


sub spaces
{
    my($a11n, $spaces) = @_;

    for my $space (@$spaces)
    {
	$space eq '-' and $a11n->{deferred} = 1            , next;
	$space eq '*' and $a11n->{global  } = $a11n->prompt, next;
                          $a11n->space($space);
    }	
}


sub space
{
    my($a11n, $space) = @_;

    my($scheme, $authority, $realm) = 
	$space =~ m[^
		    (?:  (\w  +):// )?  #scheme
		         ([^:]+)        #authority
		    (?: :(.   *)    )?  #realm
		    $
		    ]x;

    $authority or return;
    $scheme    or $scheme = 'http';

    $a11n->{credentials}{$scheme}{$authority}{$realm} = 
	$a11n->prompt($scheme, $authority, $realm);
}


sub credentials
{
    my($a11n, $url, $realm) = @_;

    my($scheme, $authority) = 
	$url  =~ m[^
		   (\w  +)://  #scheme
		   ([^/]+)     #authority
		   ]x;

    $a11n->{credentials}{$scheme}{$authority}{$realm} ||
    $a11n->{credentials}{$scheme}{$authority}{''    } ||
    $a11n->{global}                                   ||
    $a11n->deferred($scheme, $authority, $realm)
}


sub deferred
{
    my($a11n, $scheme, $authority, $realm) = @_;

    $a11n->{deferred} or return undef;

    my $credentials = $a11n->prompt($scheme, $authority, $realm);
    $a11n->{credentials}{$scheme}{$authority}{$realm} = $credentials;

    $credentials
}


sub prompt
{
    my($a11n, $scheme, $authority, $realm) = @_;

    print "Enter credentials ";
    print "for $scheme://$authority:$realm" if $authority;
    print "\n";
    print "user ID: ";
    my $userID = <STDIN>;
    chomp $userID;

    Term::ReadKey::ReadMode('noecho');
    print "password: ";
    my $password = Term::ReadKey::ReadLine(0);
    print "\n";
    Term::ReadKey::ReadMode('normal');

    [ $userID, $password ]
}


package Spinner;

use vars qw($N @Spin);

@Spin = ('|', '/', '-', '\\');

sub Spin
{
    print STDERR $Spin[$N++], "\r";
    $N==4 and $N=0;
}


package main;

my %Checked;
my($Scheme, $Authority, $Path);
my($Pages, $Links, $Broken) = (0, 0, 0);

my %Options = (parent => 1,
	       scheme => 1);

my $ok = GetOptions(\%Options, qw(Help 
				  Man 
				  authorization=s@
				  null-frags
				  offsite 
                                  parent! 
				  recurse 
				  scheme! 
				  twiddle=i
				  verbosity=i));
Help($ok);
my $A11N = new A11N $Options{authorization};
CheckPages(@ARGV);
Summary();


sub Help
{
    my $ok = shift;
    $ok            or  pod2usage();
    $Options{Help} and pod2usage(VERBOSE=>1);
    $Options{Man}  and pod2usage(VERBOSE=>2);
    @ARGV          or  pod2usage();
}


sub CheckPages
{
    my @pages = @_;
    my @URIs  = map { new URI $_ } @pages;

    for my $uri (@URIs)
    {
	$Scheme    = $uri->scheme;
	$Authority = $uri->authority;
	$Path      = $uri->path;
	$Path      =~ s(\w+\.html$)()i;
	CheckPage($uri);
    }
}


sub CheckPage
{
    my $uri = shift;

    $Checked{$uri} and return;
    $Checked{$uri} = 1;
    $Pages++;
    Twiddle();
    print "PAGE $uri\n" if $Options{verbosity} > 1;

    my $page  = new Page $uri, $A11N;
    my $links = $page->links;
    defined $links or 
	die "Can't get $uri\n";

    CheckLinks($page, $links);
}


sub CheckLinks
{
    my($page, $uris) = @_;
    my @uris;

    for my $uri (@$uris)
    {
	$uri->scheme eq 'http' or next;
	my $on_site = $uri->authority eq $Authority;
	$on_site or $Options{offsite} or next;

	$Links++;
	Twiddle();
	print "LINK $uri\n" if $Options{verbosity} > 2;
	
	my $link = new Link $uri, $A11N, %Options;
	$link->check or do
	{
	    Report($page, $uri);
	    next;
	};

	$on_site or next;
        $Options{parent} or $link->below_or_equal($page) or next;
        
	$link->{content_type} eq 'text/html' or next;
	$uri->fragment(undef);
	push @uris, $uri;
    }

    $Options{recurse} or return;

    for my $uri (@uris)
    {
	CheckPage($uri);
    }
}


sub Report
{
    my($page, $link) = @_;

    my $uri  = $page->uri->as_string;
       $link = $link     ->as_string;

    $Options{scheme} or do
    {
	$uri  =~ s($Scheme://$Authority)();
	$link =~ s($Scheme://$Authority)();
    };

    $Broken++;
    print "BROKEN $uri -> $link\n" if $Options{verbosity} > 0;
}


sub Twiddle
{
    $Options{twiddle}==1 and Spinner::Spin();
    $Options{twiddle}==2 and Progress();
}

sub Progress
{
    print STDERR "$Pages pages, $Links links, $Broken broken\r";
}

sub Summary
{
    print STDERR "Checked $Pages pages, $Links links          \n";
    print STDERR "Found $Broken broken links\n";
}

__END__

=head1 NAME

B<linkcheck> - check the links on an HTML page

=head1 SYNOPSIS

B<linkcheck> 
[B<--Help>]
[B<--Man>]
[B<--authorization> B<-> | B<*> | [I<scheme>]://I<authority>[:I<realm>] ]...
[B<--null-frags>]
[B<--offsite>] 
[B<-->[B<no>]B<parent>]
[B<--recurse>] 
[B<-->[B<no>]B<scheme>] 
[B<--twiddle> I<level>] 
[B<--verbosity> I<level>] 
I<URI> ...

=head1 DESCRIPTION

B<linkcheck> reads the web pages at I<URI> ...,
and checks the existence of any links that it finds there.

=head1 OPTIONS

=over 4

=item B<--Help>

Print command line options and exit.

=item B<--Man>

Print man page and exit.

=item B<--authorization> B<-> | B<*> | [I<scheme>://]I<authority>[:I<realm>]

Prompt for user ID and password.

Without B<--authorization>,
links to pages that require authorization are reported as broken.

If B<--authorization -> is specified,
then B<linkcheck> prompts for user ID and password after receiving
a 401 (Unauthorized) response from a web server.

If B<--authorization> [I<scheme>://]I<authority>[:I<realm>] is specified,
then B<linkcheck> prompts immediately for user ID and password.
If the I<scheme> part is omitted, C<http> is assumed.
If the I<realm> part is omitted,
the user ID and password will be used for all realms on that authority.

If B<--authorization *> is specified,
then B<linkcheck> prompts immediately 
for a single user ID and password that will be
used for all realms on all authorities.

Multiple B<--authorization> options may be specified;
B<linkcheck> prompts for a separate user ID and password for each.

=item B<--null-frags>

Allow empty fragments in URLs, e.g. C<http://foo.com/bar/baz#>

=item B<--offsite>

Check off-site links.

=item B<-->[B<no>]B<parent>

Follow links upward in the directory tree.
Enabled by default.
Without this option, 
recursion is restricted to a directory tree within a web site.

=item B<--recurse>

Recursively check pages that I<URI> links to.
Doesn't recurse to off-site pages.

=item B<-->[B<no>]B<scheme>

Include the scheme://authority part when reporting broken links.
Enabled by default.

=item B<--twiddle> I<level>

Indicate activity with a twiddle

=over 4

=item Z<>0

None (default)

=item Z<>1

Spinner

=item Z<>2

Running count of pages/links checked and broken links found

=back

=item B<--verbosity> I<level>

Verbosity level: 0, 1, 2, 3

=over 4

=item Z<>0

Print final count of pages/links checked and broken links (default)

=item Z<>1

Also list broken links

=item Z<>2

Also list checked pages

=item Z<>3

Also list checked links

=back

=back

=head1 NOTES

=head2 B<--authorization>

Arguments to the B<--authorization> option may need quotes to protect
them from the shell

    --authorization \*
    --authorization 'http://www.mozilla.com:System Administrator'

=head1 CHANGES

=head2 1.04

=over 4

=item *

Added B<--authorization> option

=back

=head2 1.03

=over 4

=item *

Handle BASE elements with no href attribute, e.g. 

    <base target="PerlDoc">

=back

=head2 1.02

=over 4

=item *

Added B<-->[B<no>]B<parent> option

=back

=head2 1.01

=over 4

=item *

Fixed the B<--null-frags> option

=back

=head2 1.00

=over 4

=item *

Changed from C<Getopt::Std> to C<Getopt::Long>

=item *

Added B<--null-frags> option

=item *

Checks embedded images

=item *

Checks frames

=back


=head1 SEE ALSO

Checking your links with C<linkcheck> at
http://world.std.com/~swmcd/steven/perl/pm/lc/linkcheck.html

=head1 ACKNOWLEDGMENTS

=over 4

=item *

Vlado Bahyl, <vlado@uni-c.dk>

=item *

Marcus Freeman, <MarcusF@ActiveState.com>

=item *

Edward J. Huff, <ejhuff@bellatlantic.net>

=item *

Philippe Queinnec, <Philippe.Queinnec@enseeiht.fr>

=item *

Geoffrey Young, <gyoung@laserlink.net>

=back

=head1 AUTHOR

Steven McDougall, <swmcd@world.std.com>

=head1 COPYRIGHT

Copyright 2000 by Steven McDougall. This program is free (libre)
software; you can redistribute it and/or modify it under the same
terms as Perl.

=head1 SCRIPT CATEGORIES

Web

=head1 PREREQUISITES

Getopt::Long
LWP::UserAgent
HTML::Parser
Pod::Usage
URI

=head1 README

Find broken links in a web site.

