#!/usr/bin/perl -w

##   dt_example_involving_csv_cleanup.pl

##  This script illustrates the constructor options introduced in Version 3.2.4 in
##  November 2015.  These options are:
##
##       csv_cleanup_needed
##  and
##
##       first_field_in_first_record
##
##  The comma separated values in some large econometrics datasets include
##  double-quoted strings with commas.  To deal with such CSV files, Version 3.2.4
##  incorporates a new function named 'cleanup_csv()'.  This function is invoked when
##  you set the constructor option 'csv_cleanup_needed' to 1 as shown below.

##  You may also want to note that I have used the constructor option
##
##            number_of_histogram_bins
##
##  In general, the larger the training dataset, the smaller the smallest difference
##  between any two values for a numeric feature in relation to the overall range of
##  values for that feature. In such cases, the module may use too large a number of
##  bins for estimating the probabilities and that could slow down the calculation of
##  the decision tree.  You can get around this difficulty by explicitly giving a
##  value to the 'number_of_histogram_bins' parameter, as shown in the following
##  example.

use strict;
use Algorithm::DecisionTree;

#   Training data file.  Pass a path as the first command-line argument to
#   run this example on a different CSV file (for instance "try10.csv" or
#   "try100.csv"); with no argument the script uses "try20000.csv".
my $training_datafile = @ARGV ? $ARGV[0] : "try20000.csv";

#   Construct the decision-tree object.  The header field name and the
#   column indexes given below are tied to the layout of the CSV file
#   named above, so any replacement file must share that layout.
my $dt = Algorithm::DecisionTree->new(
                              training_datafile => $training_datafile,
                              # Have the module clean up records whose
                              # double-quoted strings contain commas:
                              csv_cleanup_needed => 1,
                              first_field_in_first_record => 'OBJECTID',
                              csv_class_column_index => 14,
                              csv_columns_for_features => [166,167,176,177,178],
                              entropy_threshold => 0.01,
                              max_depth_desired => 8,
                              symbolic_to_numeric_cardinality_threshold => 10,
                              # Set explicitly so that a large dataset does
                              # not lead to an excessive number of bins and
                              # a slow tree calculation:
                              number_of_histogram_bins => 100,
         );

#   Load the training data and run the two probability-related steps on
#   the tree object before the classifier is built.
#   NOTE(review): the calls are kept in their original order; confirm
#   against the module documentation before reordering them.
$dt->get_training_data();
$dt->calculate_first_order_probabilities();
$dt->calculate_class_priors();

#   UNCOMMENT THE NEXT STATEMENT if you would like to see the
#   training data that was read from the disk file:
#$dt->show_training_data();

#   Build the decision tree.  The returned root node is the handle used
#   by the display, classification and node-count calls in this script.
my $root_node = $dt->construct_decision_tree_classifier();

#   COMMENT OUT THE NEXT TWO STATEMENTS if you do not want the
#   decision tree displayed in your terminal window:
print "\n\nThe Decision Tree:\n\n";
$root_node->display_decision_tree("     ");           

#   The sample to classify, written as one 'feature=value' string per
#   feature:
my @test_sample = (
    'SALES90=1000000.0',
    'SALES91=500000.0',
    'SALES00=100000.0',
    'SALES01=20000.0',
    'SALES02=0.0',
);

#   classify() returns a reference to a hash.  Besides the class labels
#   (keys) and their probabilities (values), that hash also carries a
#   'solution_path' entry holding an array reference; it is pulled out
#   below so that only the class entries remain.
my $classification_ref = $dt->classify($root_node, \@test_sample);
my %probability_for    = %{$classification_ref};

#   delete() hands back the removed value, so the path is copied and the
#   key dropped in one step:
my @solution_path = @{ delete $probability_for{'solution_path'} };

#   Class labels ordered from the most probable to the least probable:
my @ranked_classes = sort { $probability_for{$b} <=> $probability_for{$a} }
                     keys %probability_for;

print "\nClassification:\n\n",
      "     class                         probability\n",
      "     ----------                    -----------\n";
for my $class_label (@ranked_classes) {
    #   Both columns are left-justified in 30-character fields.
    printf "     %-30s %-30s\n", $class_label, $probability_for{$class_label};
}

print "\nSolution path in the decision tree: @solution_path\n";

my $node_count = $root_node->how_many_nodes();
print "\nNumber of nodes created: $node_count\n";
