#!/usr/bin/perl

# train.pl -n negfile -p posfile [outfile]
#
# Writes out a weight vector to OUTFILE (or STDOUT) trained on negative
# examples from NEGFILE and positive examples from POSFILE.
# For the program to work, the input list needs to consist of binary 
# feature vectors, such as those that can be built with feature.pl.
#
# This program implements the same algorithm, and should do the same 
# thing, as does the GNU Octave/Matlab program "train.m" that is 
# included with Perform.
#
# Perform (Perceptron Classifier in Inform) v1.0
# Nick Montfort  http://nickm.com  2004-06-23

use strict;
use warnings;
use Getopt::Std;
use vars qw/ $opt_n $opt_p $opt_u /;

# Parse the command line.  The spec "n:p:u" is getopts() syntax: -n and -p
# each take a file name and -u is a plain flag.  (getopt() would treat every
# character of the spec, including ':' and 'u', as a switch that consumes
# the following argument.)  getopts() returns false when it meets a switch
# that is not in the spec.
my $parsed_ok = getopts("n:p:u");
my $have_neg  = defined($opt_n) && length($opt_n);
my $have_pos  = defined($opt_p) && length($opt_p);
if(!$parsed_ok or !$have_neg or !$have_pos or $opt_u)
  { die ("Usage: $0 -n negfile -p posfile [outfile]\n"); }

# Optional output file.  OUT stays a package filehandle because the code
# that writes the result at the end of the script refers to it by name.
# Three-argument open takes the file name literally, so characters such as
# '>' or '|' in the name cannot change the open mode.
if ($ARGV[0])
  {
  open(OUT, '>', $ARGV[0])
    or die("<!> Can't open $ARGV[0] for writing: $!");
  }

# Slurp the negative examples, one text line per example.
open(my $neg_fh, '<', $opt_n)
  or die("<!> Can't open $opt_n for reading: $!");
print "Loading negative data from $opt_n ... ";
my @neg = <$neg_fh>;
close $neg_fh;
print "got all ".scalar(@neg)." of them.\n";

# Slurp the positive examples the same way.
open(my $pos_fh, '<', $opt_p)
  or die("<!> Can't open $opt_p for reading: $!");
print "Loading positive data from $opt_p ... ";
my @pos = <$pos_fh>;
close $pos_fh;
print "got all ".scalar(@pos)." of them.\n";
print "Data loaded.\n";

# Turn the raw text lines into matrices: one row per example, one column
# per space-separated field.  @nmat holds the negative examples and @pmat
# the positive ones.
my @nmat; my @pmat;
my $dim = 0;
# File-scope scratch variables; the training loop further down in this
# file assigns to and reads them, so they must stay declared here.
my $i = 0; my $j = 0;
my @point; my $sum;

for my $line (@neg)
  {
  chomp($line);   # $line aliases the element, so @neg itself is chomped
  push(@nmat, [ split(/ /, $line) ]);
  }
for my $line (@pos)
  {
  chomp($line);
  push(@pmat, [ split(/ /, $line) ]);
  }

# The dimension is the number of features in the first example available
# (negative examples first, otherwise positive ones).  It used to be
# counted only while reading the *second* negative example, which left it
# at 0 whenever there were fewer than two negative examples.
my $first_row = @nmat ? $nmat[0] : $pmat[0];
$dim = scalar(@$first_row) if defined $first_row;

# The weight vector starts as $dim zeros and the bias starts at zero.
my @w = (0) x $dim;
my $bias = 0;
print "Dimension of data is $dim.\n";
my $sumw = 0;
# One flag per negative example, 1 meaning "currently misclassified".
# With all-zero weights no negative example is flagged ...
my @tneg = (0) x @neg;
my $sumtneg = 0;
# ... whereas every positive example is flagged, so the count of missed
# positives starts out equal to the number of positive examples.
my @tpos = (1) x @pos;
my $sumtpos = scalar(@tpos);

# Batch perceptron training.  Each pass
#   1. reports how many examples the current weights get wrong,
#   2. subtracts every misclassified negative example from the weights and
#      adds every misclassified positive one (the bias moves by 1 each time),
#   3. re-scores all examples with the updated weights and rebuilds the
#      misclassification flags and counts.
# The loop ends only when nothing is misclassified.
# NOTE(review): if the two example sets cannot be separated by a linear
# boundary that may never happen; there is no iteration limit.
$i = 1;
while (($sumtneg + $sumtpos) > 0)
  {
  print "Starting iteration $i with ",
        "$sumtneg (-) and $sumtpos (+) misclassified...\n";

  # Once ten or fewer points of a class are missed, list them (1-based).
  if ($sumtneg > 0 and $sumtneg < 11) {
    my @missed = grep { $tneg[$_ - 1] == 1 } 1 .. scalar(@tneg);
    print "(-) points missed: [ ", (map { "$_ " } @missed), "]\n";
  }
  if ($sumtpos > 0 and $sumtpos < 11) {
    my @missed = grep { $tpos[$_ - 1] == 1 } 1 .. scalar(@tpos);
    print "(+) points missed: [ ", (map { "$_ " } @missed), "]\n";
  }

  # Step 2: apply the corrections, using the flags from the previous pass.
  for my $row (grep { $tneg[$_] == 1 } 0 .. $#tneg)
    {
    for my $col (0 .. $dim - 1) { $w[$col] -= $nmat[$row][$col]; }
    $bias--;
    }
  for my $row (grep { $tpos[$_] == 1 } 0 .. $#tpos)
    {
    for my $col (0 .. $dim - 1) { $w[$col] += $pmat[$row][$col]; }
    $bias++;
    }

  # Step 3: re-score.  A negative example is wrong when its score is above
  # zero; a positive example is wrong when its score is zero or below.
  $sumtneg = 0;
  for my $row (0 .. $#neg)
    {
    my $score = $bias;
    for my $col (0 .. $dim - 1) { $score += $nmat[$row][$col] * $w[$col]; }
    $tneg[$row] = ($score > 0);
    $sumtneg += $tneg[$row];
    }
  $sumtpos = 0;
  for my $row (0 .. $#pos)
    {
    my $score = $bias;
    for my $col (0 .. $dim - 1) { $score += $pmat[$row][$col] * $w[$col]; }
    $tpos[$row] = ($score <= 0);
    $sumtpos += $tpos[$row];
    }

  $i++;
  }

# Report the final count, then emit the result: the weights separated by
# single spaces on one line and the bias on the next line.
print "Finished, with ". ($sumtneg+$sumtpos) ." misclassified.\n\n";
if ($ARGV[0])
  {
  # An output file was named on the command line: the result goes to OUT.
  print OUT " @w";
  print OUT "\n $bias";
  # NOTE(review): this newline goes to STDOUT, not to OUT, so the file
  # ends without a trailing newline -- confirm that is intended.
  print "\n";
  # Buffered write errors (full disk, etc.) only surface at close, so the
  # result of close is checked rather than ignored.
  close(OUT) or die("<!> Can't finish writing $ARGV[0]: $!");
  }
else
  {
  # No output file: the result follows the progress messages on STDOUT.
  print "@w ";
  print "\n $bias";
  }
  
__END__
