#!/usr/bin/perl

# feature.pl [infile] -
#
# Prints binary features of words in INFILE (or STDIN).
# For the program to work, the input list should have only a single 
# word on each line, with no whitespace before or after it.
#
# Normally checks each line in INFILE and prints an error to STDERR 
# if a line is blank or doesn't begin or end with a letter. The -d
# flag disables this check, which is unwise but speeds things up a
# bit.
#
# Perform (Perceptron Classifier in Inform) v1.0
# Nick Montfort  http://nickm.com  2004-06-25

use strict;
use warnings;
use Getopt::Std;
use vars qw/ $opt_d $opt_u /;

# ---- Command line ------------------------------------------------------
# Both switches must be declared here: Getopt::Std reports any undeclared
# letter as an unknown option and never sets its $opt_X variable, so a
# flag missing from this string silently has no effect.
#   -d  skip the per-line sanity checks
#   -u  print the usage summary and exit
my $usage = "Usage: $0 [-d] [infile]\n";
getopts("du") or die $usage;
die $usage if $opt_u;

my $err = "\n    Output will be malformed; correct file and run again.\n";

# ---- Input selection ---------------------------------------------------
# A missing, empty or "-" argument means STDIN.  The test is on
# definedness and length rather than truth so that a file literally named
# "0" can still be read.  "-" is handled explicitly because a three-argument
# open would otherwise look for a file with that name.
my $source      = $ARGV[0];
my $opened_file = 0;
my $in;
if (defined $source && length $source && $source ne '-') {
    # Lexical handle: the bareword DATA is reserved for the text that
    # follows __END__ in this very file.
    open $in, '<', $source
        or die("<!> Can't open $source for reading: $!");
    $opened_file = 1;
}
else {
    $in = \*STDIN;
}

# ---- Feature keys ------------------------------------------------------
# Built once instead of once per word: 26 letters and 676 (26 * 26) bigrams.
my @letters = ('a' .. 'z');
my @bigrams = ('aa' .. 'zz');

# ---- Main loop ---------------------------------------------------------
# Every input line produces one output row of 26 + 3 * 676 columns, each
# written by yea() or nay(), followed by a newline.  Lines are
# streamed rather than slurped, so memory use does not grow with the input.
my $line_no = 0;
while (my $word = <$in>) {
    $line_no++;
    $word =~ tr/A-Z/a-z/;    # features are case-insensitive

    # Sanity checks.  The line still carries its newline; that is fine
    # because "$" also matches just before a trailing newline.
    unless ($opt_d) {
        if ($word =~ /^$/) {
            warn "<?> Line $line_no is blank.$err";
        }
        elsif ($word !~ /^[a-z]/) {
            warn "<?> Line $line_no doesn't begin with a letter.$err";
        }
        elsif ($word !~ /[a-z]$/) {
            warn "<?> Line $line_no doesn't end with a letter.$err";
        }
    }

    # Group 1: a single letter occurring more than once.
    for my $letter (@letters) {
        my $occurrences = () = $word =~ /$letter/g;
        if ($occurrences > 1) { yea(); }
        else                  { nay(); }
    }

    # Group 2: initial bigram.
    for my $bigram (@bigrams) {
        if ($word =~ /^${bigram}/) { yea(); }
        else                       { nay(); }
    }

    # Group 3: final bigram.
    for my $bigram (@bigrams) {
        if ($word =~ /${bigram}$/) { yea(); }
        else                       { nay(); }
    }

    # Group 4: bigram occurring anywhere else, i.e. with a letter on both
    # sides.  The braces around the variable name keep Perl from having
    # to guess whether the following "[a-z]" is an array subscript.
    for my $bigram (@bigrams) {
        if ($word =~ /[a-z]${bigram}[a-z]/) { yea(); }
        else                                { nay(); }
    }

    print "\n";
}

close $in if $opened_file;

# Writes the "feature present" column: a 1 followed by a single space,
# on the currently selected output handle.
sub yea {
    return print "1 ";
}
# Writes the "feature absent" column: a 0 followed by a single space,
# on the currently selected output handle.
sub nay {
    return print "0 ";
}

__END__
