#!/usr/local/bin/perl

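# The first argument is the path of a file holding the saved contents of
# 'http://www.cs.colorado.edu/faculty/faculty.html'.
#
# Standard input (or any further file arguments) is a list of URLs, one per
# line; a mini-SOIF record is printed for each URL that matches a known login.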


# Parse the faculty/staff listing named by the first command-line argument
# and fill the per-login lookup tables read later when records are emitted:
#	%Name	login => human name
#	%Title	login => job title
#	%SGroup	login => classification (the <h2> heading, or 'Staff')
#	%Group	login => group (the <h3> heading, or the staff sub-category)
#	%HPURL	login => home page URL ('' when the entry carries no anchor)

$DIR = shift;
die "usage: $0 faculty.html [url-list ...]\n"
	unless defined($DIR) && length($DIR);

# Three-argument open with a lexical handle: the file name is taken
# literally, so a leading '>' or a trailing '|' cannot change the open mode.
open(my $listing, '<', $DIR)	|| die "$DIR: $!\n";

my ($supergroup, $group) = ('', '');

while (my $line = <$listing>) {
	$line =~ s/^\s+//;	# remove leading whitespace
	$line =~ s/\s+$//;	# remove trailing whitespace

	# HTML tag names are case-insensitive, so every tag match uses /i.

	# An <h2> heading starts a new classification and clears the group.
	if ($line =~ /<h2>(.*)<\/h2>/i) {
		$supergroup = $1;
		$supergroup =~ s/^\s+//;
		$supergroup =~ s/\s+$//;
		$group = '';

		# The three staff headings are folded into one 'Staff'
		# classification, with the heading keyword kept as the group.
		# The first keyword that matches wins.
		foreach my $staff_kind (qw(Administrative Operations Research)) {
			next unless $supergroup =~ /$staff_kind/;
			($supergroup, $group) = ('Staff', $staff_kind);
			last;
		}
	}

	# An <h3> heading names the group inside the current classification.
	if ($line =~ /<h3>(.*)<\/h3>/i) {
		$group = $1;
		$group =~ s/^\s+//;
		$group =~ s/\s+$//;
	}

	# Everything below only applies to <dt> lines (one person each).
	next unless $line =~ /^<dt>/i;
	$line =~ s/^\s*<dt>\s*//i;	# remove <dt> tag

	# Locate the homepage URL in the anchor tag.  The URL is reset for
	# every entry so that a person without an anchor does not inherit the
	# URL of the previous person.
	my $url = '';
	$url = $1 if $line =~ /<a\s*href\s*=\s*"([^"]+)"\s*>/i;
	$line =~ s/\s*<a[^>]+>\s*//i;	# remove opening anchor tag
	$line =~ s/\s*<\/a>\s*//i;	# remove closing anchor tag

	# Now match name, title and login.  The line looks like:
	# Duane Wessels, (Research Staff), <i>wessels</I>
	if ($line =~ /([^,]+), \(([^\)]+)\), <i>([^<]+)<\/i>/i) {
		my $login = $3;
		$Name{$login}	= $1;
		$Title{$login}	= $2;
		$SGroup{$login}	= $supergroup;
		$Group{$login}	= $group;
		$HPURL{$login}	= $url;
	}
}
close($listing);

# Sorted so that the output order is stable when one URL matches more than
# one login (the order returned by keys is unspecified).
@Logins = sort keys %Name;

# Read URLs, one per line, from the files named in the remaining arguments
# or from stdin.  For every known login that occurs in the URL followed by
# a '/', print a mini-SOIF record built from the per-login tables.

while (my $url = <>) {
	chomp($url);	# the newline is re-added below, so a final line
			# without one still yields a well-formed header

	foreach my $login (@Logins) {
		# \Q...\E makes the login match literally even if it contains
		# regex metacharacters.  This is still a substring match.
		next unless $url =~ /\Q$login\E\//;

		# The '@' must be escaped: "@FILE" inside double quotes would
		# interpolate the (empty) array @FILE and drop the SOIF tag.
		print "\@FILE { $url\n";
		soif_print_av('human-name', $Name{$login});
		soif_print_av('job-title', $Title{$login});
		soif_print_av('classification', $SGroup{$login});
		soif_print_av('group', $Group{$login});
		soif_print_av('home-page-url', $HPURL{$login});
		print "}\n";
	}
}

exit 0;

# Print one SOIF attribute-value line on the currently selected output
# handle, in the form
#	name{size}:<TAB>value<NEWLINE>
# where size is length($v).
#
#	$k	attribute name
#	$v	attribute value; nothing is printed when it is undefined or empty
sub soif_print_av {
        # Lexical copies of the arguments: unlike local(), my() does not
        # touch the package globals $k and $v.
        my($k, $v) = @_;
        return unless defined($v) && length($v);

        # One print of one string keeps the whole line together whatever
        # the output separators ($, and $\) happen to be set to.
        print $k . "{" . length($v) . "}:\t" . $v . "\n";
}
