#!/home/abhaile/swmcd/perl/bin/perl

use 5.005;
use strict;
use Getopt::Long;
use LWP::UserAgent;
use HTML::Parser;
use Pod::Usage;
use URI;

$main::VERSION = "1.02";


package HTML::Parser::Links;

use base qw(HTML::Parser);

sub new
{
    my($class, $base) = @_;

    my $parser = new HTML::Parser;
    $parser->{base }    = $base;
    $parser->{links}    = [];
    $parser->{fragment} = {};

    bless $parser, $class
}


sub start
{
    my($parser, $tag, $attr, $attrseq, $origtext) = @_;

    $tag eq 'base' and 
	$parser->{base} = $attr->{href};

    $tag eq 'a' and $attr->{href} and do
    {
	my $base = $parser->{base};
	my $href = $attr->{href};
	my $uri  = new_abs URI $href, $base;
	push @{$parser->{links}}, $uri;
    };

    $tag eq 'a' and $attr->{name} and do
    {
	my $name = $attr->{name};
	$parser->{fragment}{$name} = 1;
    };

    $tag eq 'img' and $attr->{src} and do
    {
	my $base = $parser->{base};
	my $src  = $attr->{src};
	my $uri  = new_abs URI $src, $base;
	push @{$parser->{links}}, $uri;
    };

    $tag eq 'frame' and $attr->{src} and do
    {
	my $base = $parser->{base};
	my $src  = $attr->{src};
	my $uri  = new_abs URI $src, $base;
	push @{$parser->{links}}, $uri;
    };
}


sub links
{
    my $parser = shift;
    $parser->{links}
}


sub check_fragment
{
    my($parser, $fragment) = @_;
    $parser->{fragment}{$fragment}
}


package Page;


sub new
{
    my($package, $uri) = @_;

    $Page::Cache{$uri} and
	return $Page::Cache{$uri};

    my $page = { uri  => $uri,
	         base => $uri };

    bless $page, $package;

    $Page::Cache{$uri} = $page;
    $page
}


sub uri  { shift->{'uri' } }
sub base { shift->{'base'} }


sub get
{
    my $page = shift;

    defined $page->{content} and
	return $page->{content};

    my $uri      = $page->{uri};
    my $ua       = new LWP::UserAgent;
    my $request  = new HTTP::Request GET => $uri;
    my $response = $ua->request($request);
       $response->is_success or
	   return undef;

    $page->{base}         = $response->request->uri;
    $page->{content}      = $response->content;
    $page->{content_type} = $response->content_type;

    $response->content
}


sub parse
{
    my $page = shift;

    $page->{parser} and
	return $page->{parser};

    my $content = $page->get;
    defined $content or 
	return undef;

    my $parser = new HTML::Parser::Links $page->base;
       $parser->parse($content);
       $parser->eof;

    $page->{parser} = $parser;
    $parser
}


sub links
{
    my $page   = shift;
    my $parser = $page->parse;
    defined $parser or 
	return undef;

    $parser->links
}

sub content_type { shift->{content_type} }


package Link;

sub new
{
    my($package, $uri, %options) = @_;
    
    $Link::Cache{$uri} and
	return $Link::Cache{$uri};

    my $base     = $uri ->clone;
    my $fragment = $base->fragment(undef);
    
    my $link = { uri      =>  $uri,
		 options  => \%options,
	         base     =>  $base,
	         fragment =>  $fragment };

    bless $link, $package;

    $Link::Cache{$uri} = $link;
    $link
}


sub check
{
    my $link = shift;

    defined $link->{ok} and 
	return $link->{ok};

    my $fragment = $link->{fragment};
    my $no_nulls = not $link->{options}{'null-frags'};
    my $check    = (length  $fragment or 
                    defined $fragment and $no_nulls) ? 'check_fragment' :
		                                       'check_base';

    my $ok = $link->$check();
    $link->{ok} = $ok;

    $ok
}


sub check_fragment
{
    my $link     = shift;
    my $base     = $link->{base};
    my $fragment = $link->{fragment};

    my $page     = new Page $base;
    my $parser   = $page->parse;
    defined $parser or return '';

    $link->{content_type} = $page->content_type;

    $parser->check_fragment($fragment)
}


sub check_base
{
    my $link = shift;
    my $base = $link->{base};

    my $ua       = new LWP::UserAgent;
    my $request  = new HTTP::Request HEAD => $base;
    my $response = $ua->request($request);

    # Some servers don't like HEAD requests
    $response->is_success or do
    {
	$request  = new HTTP::Request GET => $base;
	$response = $ua->request($request);
    };

    $link->{content_type} = $response->content_type;
    $response->is_success;
}

sub content_type { return shift->{content_type} }


sub below_or_equal 
{
    my($link, $page) = @_;
    my $checked = $link->{uri}->path;
    my $orig    = $page->{uri}->path;

    $checked    =~ s|/[^/]*$||;   # remove last component
    $orig       =~ s|/[^/]*$||;

    substr($checked, 0, length $orig) eq $orig
}


package Spinner;

use vars qw($N @Spin);

@Spin = ('|', '/', '-', '\\');

sub Spin
{
    print STDERR $Spin[$N++], "\r";
    $N==4 and $N=0;
}


package main;

my %Options;
my %Checked;
my($Scheme, $Authority, $Path);
my($Pages, $Links, $Broken) = (0, 0, 0);

$Options{parent} = 1;
$Options{scheme} = 1;
my $ok = GetOptions(\%Options, qw(Help 
				  Man 
				  null-frags
				  offsite 
                                  parent! 
				  recurse 
				  scheme! 
				  twiddle=i
				  verbosity=i));
Help($ok);
CheckPages(@ARGV);
Summary();


sub Help
{
    my $ok = shift;
    $ok            or  pod2usage();
    $Options{Help} and pod2usage(VERBOSE=>1);
    $Options{Man}  and pod2usage(VERBOSE=>2);
    @ARGV          or  pod2usage();
}


sub CheckPages
{
    my @pages = @_;
    my @URIs  = map { new URI $_ } @pages;

    for my $uri (@URIs)
    {
	$Scheme    = $uri->scheme;
	$Authority = $uri->authority;
	$Path      = $uri->path;
	$Path      =~ s(\w+\.html$)()i;
	CheckPage($uri);
    }
}


sub CheckPage
{
    my $uri = shift;

    $Checked{$uri} and return;
    $Checked{$uri} = 1;
    $Pages++;
    Twiddle();
    print "PAGE $uri\n" if $Options{verbosity} > 1;

    my $page  = new Page $uri;
    my $links = $page->links;
    defined $links or 
	die "Can't get $uri\n";

    CheckLinks($page, $links);
}


sub CheckLinks
{
    my($page, $uris) = @_;
    my @uris;

    for my $uri (@$uris)
    {
	$uri->scheme eq 'http' or next;
	my $on_site = $uri->authority eq $Authority;
	$on_site or $Options{offsite} or next;

	$Links++;
	Twiddle();
	print "LINK $uri\n" if $Options{verbosity} > 2;
	
	my $link = new Link $uri, %Options;
	$link->check or do
	{
	    Report($page, $uri);
	    next;
	};

	$on_site or next;
        $Options{parent} or $link->below_or_equal($page) or next;
        
	$link->{content_type} eq 'text/html' or next;
	$uri->fragment(undef);
	push @uris, $uri;
    }

    $Options{recurse} or return;

    for my $uri (@uris)
    {
	CheckPage($uri);
    }
}


sub Report
{
    my($page, $link) = @_;

    my $uri  = $page->uri->as_string;
       $link = $link     ->as_string;

    $Options{scheme} or do
    {
	$uri  =~ s($Scheme://$Authority)();
	$link =~ s($Scheme://$Authority)();
    };

    $Broken++;
    print "BROKEN $uri -> $link\n" if $Options{verbosity} > 0;
}


sub Twiddle
{
    $Options{twiddle}==1 and Spinner::Spin();
    $Options{twiddle}==2 and Progress();
}

sub Progress
{
    print STDERR "$Pages pages, $Links links, $Broken broken\r";
}

sub Summary
{
    print STDERR "Checked $Pages pages, $Links links          \n";
    print STDERR "Found $Broken broken links\n";
}

__END__

=head1 NAME

B<linkcheck> - check the links on an HTML page

=head1 SYNOPSIS

B<linkcheck> 
[B<--Help>]
[B<--Man>]
[B<--offsite>] 
[B<--recurse>] 
[B<-->[B<no>]B<parent>]
[B<--null-frags>]
[B<-->[B<no>]B<scheme>] 
[B<--twiddle> I<level>] 
[B<--verbosity> I<level>] 
I<URI> ...

=head1 DESCRIPTION

B<linkcheck> reads the web pages at I<URI> ...,
and checks the existence of any links that it finds there.

=head1 OPTIONS

=over 4

=item B<--Help>

Print command line options and exit.

=item B<--Man>

Print man page and exit.

=item B<--null-frags>

Allow empty fragments in URLs, e.g. C<http://foo.com/bar/baz#>

=item B<--offsite>

Check off-site links.

=item B<--recurse>

Recursively check pages that I<URI> links to.
Doesn't recurse to off-site pages.

=item B<-->[B<no>]B<parent>

Follow links upward in the directory tree.
Without this option, 
recursion is restricted to a directory tree within a web site.

=item B<-->[B<no>]B<scheme>

Omit the scheme://authority part when reporting broken links.

=item B<--twiddle> I<level>

Indicate activity with a twiddle

=over 4

=item Z<>0

None (default)

=item Z<>1

Spinner

=item Z<>2

Running count of pages/links checked and broken links found

=back

=item B<--verbosity> I<level>

Verbosity level: 0, 1, 2, 3

=over 4

=item Z<>0

Print final count of pages/links checked and broken links (default)

=item Z<>1

Also list broken links

=item Z<>2

Also list checked pages

=item Z<>3

Also list checked links

=back

=head1 SEE ALSO

Checking your links with C<linkcheck> at
http://world.std.com/~swmcd/steven/perl/pm/lc/linkcheck.html

=head1 ACKNOWLEDGEMENTS

=over 4

=item *

Geoffrey Young, <gyoung@laserlink.net>

=item *

Philippe Queinnec, <Philippe.Queinnec@enseeiht.fr>

=back

=head1 AUTHOR

Steven McDougall, <swmcd@world.std.com>

=head1 COPYRIGHT

Copyright 2000 by Steven McDougall. This program is free (libre)
software; you can redistribute it and/or modify it under the same
terms as Perl.

=head1 SCRIPT CATEGORIES

Web

=head1 PREREQUISITES

Getopt::Long
LWP::UserAgent
HTML::Parser
Pod::Usage
URI

=head1 README

Find broken links in a web site.

