#!perl
use warnings;
use strict;
use Data::Dumper;
# 
# It runs  smotifab pre-requisites
#
#    Pre-requisites
#    1. Get FASTA sequence (from user). 
#    2. Run Psipred (- get .horiz and .ss2 files). 
#    3. Run HHblits  and get hhr file 
#    4. Run psi-blast and delta-blast 
#

#
# perl smotifab_prereq.pl --sequence_file=4wyq.fasta --step=1 --dir=./
#
use lib "/home/cmadrid/test/lib/perl5/site_perl/5.8.8/";
# use lib "/home/brinda/MyLib/share/perl5/";
# use lib "/home/cmadrid/test/share/perl5/";
use SmotifTF;

use SmotifTF::Psipred;
use SmotifTF::HHblits;
use SmotifTF::Psiblast;
 
use constant SMOTIFS_NUMBER_LOWER_LIMIT => 2;
use constant SMOTIFS_NUMBER_UPPER_LIMIT => 14;

use File::Spec::Functions qw(catfile catdir);
use Getopt::Long qw(GetOptions);
use Pod::Usage;
use Config::Simple;
use Carp;


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# The location of the SmotifTF ini file is taken from the environment.
my $config_file = $ENV{'SMOTIFTF_CONFIG_FILE'};
croak "Environmental variable SMOTIFTF_CONFIG_FILE should be set"
    unless $config_file;

# Config::Simple->new returns undef when the file cannot be read or parsed.
# Fail right here with the library's own error text instead of dying later
# with "Can't call method param on an undefined value".
my $cfg = Config::Simple->new($config_file)
    or croak "Cannot load configuration file $config_file: "
    . Config::Simple->error();
my $localrun      = $cfg->param( -block => 'localrun' );
my $MAX_PROCESSES = $localrun->{'max_proc'};

# ---------------------------------------------------------------------------
# Command line options (shared by all step subroutines below)
# ---------------------------------------------------------------------------
my $sequence_file;
#my $pdb;
my $chain;
my $step = 0;
my $dir;
my $verbose;

my $man  = 0;
my $help = 0;

# smotifab_prereq.pl -s 4wyq.fasta -d ./
my $result = GetOptions(
    "sequence_file=s" => \$sequence_file,  #  string
#    "chain=s"         => \$chain,          # string
    "dir=s"           => \$dir,            # string
    "step=s"          => \$step,           # string
    "verbose"         => \$verbose,
    'help|?'          => \$help,
    man               => \$man
);

die "Failed to parse command line options\n" unless $result;

pod2usage( -exitval => 0, -verbose => 2 ) if $man;
pod2usage(1) if $help;
pod2usage(1) unless $sequence_file;

# ---------------------------------------------------------------------------
# Dispatch: map the --step value to the subroutine implementing it.
# Any value that is not a key of this table (including the default 0)
# falls through to the DEFAULT handler.
# ---------------------------------------------------------------------------
my $dispatch_for = {
    1       => \&run_psipred,
    2       => \&run_hhblits,
    3       => \&run_hhsearch,
    4       => \&run_psiblast,
    5       => \&run_deltablast,
    6       => \&reformat_psiblast_deltablast_combined_output,
    7       => \&run_analyze_psipred,
    8       => \&generate_dynamic_database,
    all     => \&execute_all,
    DEFAULT => sub { print "Unknown step.\n"; }
};

my $func = $dispatch_for->{$step} || $dispatch_for->{DEFAULT};
$func->();

exit;

# Step 1: run Psipred on the query sequence.
#
# Uses the file-level $sequence_file and $dir. Any exception raised by
# SmotifTF::Psipred::run is caught and reported on STDOUT; it is not
# propagated to the caller.
sub run_psipred {

    print "Step 1: running Psipred ...\n\n";

    my @psipred_args = (
        sequence  => $sequence_file,
        directory => $dir,
    );

    eval { SmotifTF::Psipred::run(@psipred_args); };
    print "Error: at run_psipred $@" if $@;
}

# Step 2: run HHblits on the query sequence.
#
# The A3M alignment and HMM output names are derived from the sequence file
# name with ".fasta" removed (e.g. 4wyq.fasta -> 4wyq.a3m and 4wyq.hhm).
# Exceptions from SmotifTF::HHblits::run_hhblits are caught and printed.
sub run_hhblits {
    print "Step 2: Running hhblits ...\n";

    # check_step2( $pdb, $smotifs );
    eval {
        ( my $base_name = $sequence_file ) =~ s/\.fasta//;

        my @hhblits_args = (
            sequence_fasta => $sequence_file,
            directory      => $dir,
            database       => 'nr20_12Aug11',
            oa3m           => $base_name . ".a3m",    # 4wyq.a3m
            ohhm           => $base_name . ".hhm",    # 4wyq.hhm
        );

        # Expected messages from the tool:
        #   Writing HMM to 4wyq.hhm
        #   Writing A3M alignment to 4wyq.a3m
        SmotifTF::HHblits::run_hhblits(@hhblits_args);
    };
    print "Error: at run_hhblits $@" if $@;
}

# Step 3: run HHsearch using the HMM written for the query.
#
# Input (.hhm) and output (.hhr) names are derived from the sequence file
# name with ".fasta" removed. Exceptions from SmotifTF::HHblits::run_search
# are caught and printed.
sub run_hhsearch {
    print "Step 3: Running hhsearch ...\n";

    # check_step2( $pdb, $smotifs );
    eval {
        ( my $base_name = $sequence_file ) =~ s/\.fasta//;

        my @hhsearch_args = (
            sequence_hhm => $base_name . ".hhm",    # 4wyq.hhm
            directory    => $dir,
            database     => 'pdb70_06Sep14_hhm_db',
            ohhr         => $base_name . ".hhr",    # 4wyq.hhr
        );

        SmotifTF::HHblits::run_search(@hhsearch_args);
    };
    print "Error: at run_hhsearch $@" if $@;
}

# Step 4: run PSI-BLAST for the query against the 'pdbaa' database.
#
# Results are requested in 'outfile.txt' with an e-value cutoff of 100 and
# two iterations. Exceptions from SmotifTF::Psiblast::run_psiblast are
# caught and printed.
sub run_psiblast {
    print "Step 4: Running psiblast ...\n";

    # check_step2( $pdb, $smotifs );
    eval {
        my @psiblast_args = (
            query          => $sequence_file,
            directory      => $dir,
            database       => 'pdbaa',
            out            => 'outfile.txt',
            evalue         => 100,
            num_iterations => 2,
        );

        SmotifTF::Psiblast::run_psiblast(@psiblast_args);
    };
    print "Error: at run_psiblast $@" if $@;
}

# Step 5: run DELTA-BLAST for the query against the 'pdbaa' database.
#
# Results are requested in 'deltablast_outfile.txt' with an e-value cutoff
# of 100. Exceptions from SmotifTF::Psiblast::run_deltablast are caught and
# printed.
sub run_deltablast {
    print "Step 5: Running deltablast ...\n";

    # check_step2( $pdb, $smotifs );
    eval {
        my @deltablast_args = (
            query     => $sequence_file,
            directory => $dir,
            database  => 'pdbaa',
            out       => 'deltablast_outfile.txt',
            evalue    => 100,
        );

        SmotifTF::Psiblast::run_deltablast(@deltablast_args);
    };
    print "Error: at run_deltablast $@" if $@;
}

#
# hhr, psiblast and deltablast combined (reformatted) output
# columns
# 1 = pdb hit (sequence hit)
# 2 = chain
# 3 = evalue
# 4 = method 
#
# 2grk    A   2.0 DEL
# 2pmz    F   2.2 DEL
# 4ak4    B   4.4 HHS
# 4hqe    A   4.9 DEL
# 3sc0    A   5.9 HHS
# 1cq3    A   6.9 DEL
# 3som    A   8.4 HHS
# 2hwn    E   16  HHS
# 1fyh    A   16  HHS
# 1ytr    A   18  HHS
#
# Step 6: merge the HHsearch, PSI-BLAST and DELTA-BLAST hits into a single
# tab-separated file "<query>.seqhits.evalue" inside $dir.
#
# Each output line holds: pdb hit, chain, e-value, method tag
# (HHS / PSI / DEL). Lines are sorted by ascending numeric e-value.
#
# Uses the file-level $sequence_file and $dir. Any exception (including
# failure to write the output file) is caught and printed; it is not
# propagated to the caller.
sub reformat_psiblast_deltablast_combined_output {

    print "Step 6: Reformat combined output...\n";

    my $file_name = $sequence_file;
    $file_name =~ s/\.fasta//;

    # check_step2( $pdb, $smotifs );
    my $hhr_out        = $file_name . ".hhr";    # 4wyq.hhr
    my $psiblast_out   = 'outfile.txt';
    my $deltablast_out = 'deltablast_outfile.txt';

    eval {
        my @seqlist = ();
        my $numhits = 0;
        my $cc1     = "PSI";
        my $cc2     = "DEL";
        my $cc3     = "HHS";

        ( $numhits, @seqlist ) =
          SmotifTF::HHblits::format_hhr_file( $dir, $hhr_out, $file_name,
            $numhits, $cc3, @seqlist );

        # PSI-BLAST hits carry the "PSI" tag. They were previously tagged
        # with $cc2 ("DEL"), leaving $cc1 unused and making PSI-BLAST hits
        # indistinguishable from DELTA-BLAST hits in the output.
        ( $numhits, @seqlist ) =
          SmotifTF::Psiblast::format_blast_file( $dir, $psiblast_out,
            $file_name, $numhits, $cc1, @seqlist );
        ( $numhits, @seqlist ) =
          SmotifTF::Psiblast::format_blast_file( $dir, $deltablast_out,
            $file_name, $numhits, $cc2, @seqlist );

        # Best (lowest) e-value first.
        @seqlist = sort { $a->[2] <=> $b->[2] } @seqlist;

        # Write into $dir through an explicit path rather than chdir():
        # changing the working directory here would make every later step
        # of an "all" run resolve a relative $dir against the wrong place.
        my $out_dir  = ( defined $dir && length $dir ) ? $dir : '.';
        my $out_path = catfile( $out_dir, "$file_name.seqhits.evalue" );

        open( my $out_fh, '>', $out_path )
          or die "Cannot open $out_path for writing: $!\n";
        foreach my $hit (@seqlist) {
            print {$out_fh} join( "\t", @{$hit}[ 0 .. 3 ] ), "\n";
        }

        # Buffered write errors only surface at close time.
        close($out_fh) or die "Cannot close $out_path: $!\n";
    };
    if ($@) {
        print "Error: at reformat_psiblast_deltablast_combined_output $@";
    }
}

# Step 7: analyze the Psipred prediction and check the Smotif count.
#
# Calls SmotifTF::Psipred::analyze_psipred with the query base name
# (sequence file name without ".fasta") and $dir, then rejects queries
# whose Smotif count lies outside
# [SMOTIFS_NUMBER_LOWER_LIMIT, SMOTIFS_NUMBER_UPPER_LIMIT].
# All failures, including the limit checks, are caught and printed.
sub run_analyze_psipred {
    print "Step 7: Analyze psipred ...\n";

    ( my $query_name = $sequence_file ) =~ s/\.fasta//;

    # check_step2( $pdb, $smotifs );
    eval {
        my ( $seq, $motif_count ) = SmotifTF::Psipred::analyze_psipred(
            pdb       => $query_name,
            directory => $dir,
        );

        # Proteins having more than 14 smotifs can not be processed
        # (time consuming)
        croak "$sequence_file contains more than the maximum allowed number of Smotifs"
          if $motif_count > SMOTIFS_NUMBER_UPPER_LIMIT;

        croak "$sequence_file contains less than the minimum allowed number of Smotifs"
          if $motif_count < SMOTIFS_NUMBER_LOWER_LIMIT;
    };
    print "Error: at analyze_psipred  $@" if $@;
}

# my $outputlogfile = $pdb_code . "_" . $chain . ".extract_loops.log";

# ##Run Joe's script to get Smotif definitions

#=head2  get_smotif_definition
# getting the smotif definition for all sequence hits
#  ./4wyq.seqhits.evalue
#   [cmadrid@manaslu test_SmotifTF]$ cat ./4wyq.seqhits.evalue
# 2grk	A	2.0	DEL
# 2pmz	F	2.2	DEL
# 4ak4	B	4.4	HHS
# 4hqe	A	4.9	DEL
# 3sc0	A	5.9	HHS
# 1cq3	A	6.9	DEL
# 3som	A	8.4	HHS
# 2hwn	E	16	HHS
# 1fyh	A	16	HHS
#
#
# $pdb_code,         $chain,            $smotiffields[2],
# 172                 $smotiffields[3],  $smotiffields[13], $smotiffields[11],
# 173                 $smotiffields[12], $smotiffields[7],  $smotiffields[8],
# 174                 $smotiffields[9],  $geoms[0],         $geoms[1],
# 175                 $geoms[2],         $geoms[3]
#
#
#ps is an array of lines. Each line contains the following values.
#2pmz	pdb_code,
#F	chain,
#HH	smotif_type
#16	smotif residue start
#26	loop length
#7	ss1 length
#7	ss2 length
#VAKKLLTDVIRSGGSSNLLQRTYDYLNSVEKCDAESAQKV	seq 
#HHHHHHHCCCCCCCCCCCCCCCCCCCCCCCCCCHHHHHHH	ss
#aaaaaaaaaaaaexebaaaaaaaaaaaaaxabbaaaaaaa	ramachandran
#13.507326	 
#145.200039	
#160.665006	
#226.951454',
#
#
#
# Step 8: build the dynamic Smotif database file "dd_info_evalue.out" in $dir.
#
# Reads "<query>.seqhits.evalue" (written by step 6) from $dir, obtains the
# Smotif definitions of every listed hit through get_smotif_definition(),
# attaches the hit's e-value to each Smotif and writes one tab-separated
# line per Smotif:
#
#   id, pdb_code, chain, smotif_type, smotif_residue_start, loop_length,
#   ss1_length, ss2_length, sequence, secondary_structure, ramachandran,
#   evalue
#
# Ids start at 400000 and increase by one per Smotif. Hits whose pdb code
# equals the query base name contribute no e-value, so their Smotifs are
# written with an empty e-value column.
#
# Uses the file-level $sequence_file and $dir. Any exception is caught and
# printed; it is not propagated to the caller.
sub generate_dynamic_database {
    print "Step 8: Generate Dynamic databases ...\n";

    my $file_name = $sequence_file;
    $file_name =~ s/\.fasta//;

    # check_step2( $pdb, $smotifs );
    eval {
        # Fall back to the current directory when --dir was not given;
        # catfile() with an undefined directory would resolve to "/".
        my $work_dir = ( defined $dir && length $dir ) ? $dir : '.';

        #
        # Smotif definitions for the hhr, psiblast, deltablast combined
        # (reformatted) hits
        #
        my $dd_path = catfile( $work_dir, "$file_name.seqhits.evalue" );
        print "dd_file = $dd_path\n";
        my @smotif = get_smotif_definition($dd_path);
        print "Smotif\n";
        print Dumper( \@smotif );

        # e-value of every hit, keyed by pdb code immediately followed by
        # the chain id. The query itself is left out.
        my %evalue_of;
        open( my $hits_fh, '<', $dd_path )
          or die "Cannot open $dd_path $!";
        while ( my $line = <$hits_fh> ) {
            chomp $line;
            my ( $pdb, $chain, $evalue ) = split( /\s+/, $line );

            # Blank or truncated lines carry no usable hit.
            next unless defined $pdb && defined $chain;

            if ( $pdb ne $file_name ) {
                $evalue_of{ $pdb . $chain } = $evalue;
            }
        }
        close($hits_fh);

        my $out_path = catfile( $work_dir, 'dd_info_evalue.out' );
        open( my $out_fh, '>', $out_path )
          or die "Cannot open $out_path for writing: $!\n";

        # Attach the e-value of the originating hit to every Smotif.
        foreach my $hash (@smotif) {
            my $pdb_code = $hash->{'pdb_code'};
            my $chain    = $hash->{'chain'};
            $hash->{'evalue'} = $evalue_of{ $pdb_code . $chain };
        }

        print "Start smotlist";
        print Dumper( \@smotif );
        print "End smotlist";

        # Columns written after the numeric id, in output order.
        my @columns = qw(
          pdb_code chain smotif_type smotif_residue_start loop_length
          ss1_length ss2_length sequence secondary_structure ramachandran
          evalue
        );

        my $dd = 0;    # dynamic database entry counter
        foreach my $href (@smotif) {
            my $motifname = $dd + 400000;
            $dd++;

            # Undefined values (e.g. no e-value for this hit) are written
            # as empty columns.
            my @fields = map {
                my $value = $href->{$_};
                defined $value ? $value : ''
            } @columns;

            # dynamic database file
            print {$out_fh} join( "\t", $motifname, @fields ), "\n";
        }

        # Buffered write errors only surface at close time.
        close($out_fh) or die "Cannot close $out_path: $!\n";
    };
    if ($@) {
        print "Error: Generate Dynamic databases  $@";
    }

}

# Run every pre-requisite step (1 through 8) once, in order.
# Each step reports its own errors, so a failing step does not stop the
# ones that follow.
sub execute_all {
    my @pipeline = (
        \&run_psipred,
        \&run_hhblits,
        \&run_hhsearch,
        \&run_psiblast,
        \&run_deltablast,
        \&reformat_psiblast_deltablast_combined_output,
        \&run_analyze_psipred,
        \&generate_dynamic_database,
    );

    foreach my $stage (@pipeline) {
        $stage->();
    }
}

# Verify that step 1 output exists for every Smotif of a protein.
#
# Parameters:
#   $pdb     - pdb code; also the directory that is searched
#   $smotifs - number of Smotifs to check (1 .. $smotifs)
#
# For each Smotif number the directory tree under $pdb must contain at
# least one plain file whose name contains
# "dd_shiftcands<pdb>_<NN>" (NN = zero-padded Smotif number).
# Dies on the first Smotif for which no such file is found.
sub check_step2 {

    use File::Find::Rule;

    my ( $pdb, $smotifs ) = @_;

    for my $i ( 1 .. $smotifs ) {
        my $smot = sprintf( "%02d", $i );    #pad with zeros
        my $file = "dd_shiftcands" . $pdb . "_" . $smot;

        my $rule = File::Find::Rule->new;
        $rule->file;

        # \Q...\E: match the name literally, whatever characters $pdb holds.
        $rule->name(qr/\Q$file\E/);
        my @file_full_path = $rule->in($pdb);

        if ( scalar(@file_full_path) == 0 ) {
            die "Output from step 1 is required for $pdb Smotif $i. \n            Run steps 1 first\n";
        }
    }
}

# Verify that step 2 output exists for every Smotif of a protein.
#
# Parameters:
#   $pdb     - pdb code; also the directory that is searched
#   $smotifs - number of Smotifs to check (1 .. $smotifs)
#
# For each Smotif number the directory tree under $pdb must contain at
# least one plain file whose name contains "<pdb>_<NN>_motifs_best.csv"
# (NN = zero-padded Smotif number), e.g. 1aab_01_motifs_best.csv.
# Dies on the first Smotif for which no such file is found.
sub check_step3 {

    use File::Find::Rule;

    my ( $pdb, $smotifs ) = @_;

    for my $i ( 1 .. $smotifs ) {
        my $smot = sprintf( "%02d", $i );    #pad with zeros

        # 1aab_01_motifs_best.csv
        my $file = $pdb . "_" . $smot . "_motifs_best.csv";

        print "Looking for $pdb\/$file\n";

        my $rule = File::Find::Rule->new;
        $rule->file;

        # \Q...\E: without it the "." of ".csv" matches any character.
        $rule->name(qr/\Q$file\E/);
        my @file_full_path = $rule->in($pdb);

        if ( scalar(@file_full_path) == 0 ) {
            die "Output from step 2 is required for $pdb Smotif $i. \n            Run steps 1-2 first\n";
        }
    }
}

# Verify that the enumeration output of step 3 exists for a protein.
#
# Parameters:
#   $pdb - pdb code; also the directory that is searched
#
# The directory tree under $pdb must contain at least one plain file whose
# name contains "-all_enum_<pdb>.csv". Dies when no such file is found.
sub check_step4 {
    use File::Find::Rule;

    my ($pdb) = @_;

    my $file = "-all_enum_$pdb.csv";

    my $rule = File::Find::Rule->new;
    $rule->file;

    # \Q...\E: without it the "." of ".csv" matches any character.
    $rule->name(qr/\Q$file\E/);
    my @file_full_path = $rule->in($pdb);

    if ( scalar(@file_full_path) == 0 ) {
        die "Enumeration output from step 3 is required for step 4. \n        Run steps 1-3 first for $pdb\n";
    }
}

# Collect the Smotif definitions of every structure listed in a hits file.
#
# Parameter:
#   $full_path_name_file - path of a whitespace-separated hits file whose
#                          first two columns are pdb code and chain, e.g.
#                              1e9g      B       2.1     HHS
#
# Returns a list of hash references, one per Smotif, with the keys
#   pdb_code, chain, smotif_type, smotif_residue_start, loop_length,
#   ss1_length, ss2_length, sequence, secondary_structure, ramachandran,
#   num1, num2, num3, num4
# filled from the 14 whitespace-separated fields of each line returned by
# SmotifTF::GetSMotifsfromPDB::extract_loops. Lines with any other number
# of fields are ignored.
#
# Dies when the file name is missing, the file does not exist or it cannot
# be opened. A failure while processing a single hit is reported on STDOUT
# and processing continues with the next hit.
sub get_smotif_definition {
    use Try::Tiny;

    use SmotifTF::GetSMotifsfromPDB qw(missing_residues extract_loops);

    # Directory that holds the pdb files to be parsed
    # my $PDB_DIR       = "/usr/local/databases/pdb/uncompressed/";

    my $PDB_DIR       = "/usr/local/databases/remodeled_pdb/";
    my $PDB_OBSOLETES = "/usr/local/databases/remodeled_pdb/";
    $SmotifTF::GetSMotifsfromPDB::PDB_DIR       = $PDB_DIR;
    $SmotifTF::GetSMotifsfromPDB::PDB_OBSOLETES = $PDB_OBSOLETES;

    $SmotifTF::PDB::PDBfileParser::PDB_DIR       = $PDB_DIR;
    $SmotifTF::PDB::PDBfileParser::PDB_OBSOLETES = $PDB_OBSOLETES;

    use constant DSSP_PATH => "/usr/local/bin";
    use constant DSSP_EXEC => "dsspcmbi";

    my ($full_path_name_file) = @_;

    die "full_path_name_file is required"
      unless $full_path_name_file;

    die "full_path_name_file does not exists"
      unless -e $full_path_name_file;

    my @smotif_definition;

    open my $list, "<", $full_path_name_file
      or die "Cannot open $full_path_name_file: $!";

  STRUCT:
    while ( my $structline = <$list> ) {

        # structure line format
        # 1e9g      B       2.1     HHS
        chomp $structline;

        # A blank line names no structure; skip it instead of calling
        # extract_loops with undefined arguments.
        next STRUCT unless $structline =~ /\S/;

        try {
            print "\nstructline = $structline\n";
            my @slin  = split( /\s+/, $structline );
            my $pdb   = $slin[0];
            my $chain = $slin[1];

            # extracting smotif definition of pdb hits
            my @loops =
              SmotifTF::GetSMotifsfromPDB::extract_loops( $pdb, $chain );

            foreach my $line (@loops) {
                my @tmp = split( /\s+/, $line );

                # Only complete 14-field records describe a Smotif.
                next unless @tmp == 14;

                my %hash = (
                    pdb_code             => $tmp[0],
                    chain                => $tmp[1],
                    smotif_type          => $tmp[2],
                    smotif_residue_start => $tmp[3],
                    loop_length          => $tmp[4],
                    ss1_length           => $tmp[5],
                    ss2_length           => $tmp[6],
                    sequence             => $tmp[7],
                    secondary_structure  => $tmp[8],
                    ramachandran         => $tmp[9],
                    num1                 => $tmp[10],
                    num2                 => $tmp[11],
                    num3                 => $tmp[12],
                    num4                 => $tmp[13],
                );

                push @smotif_definition, \%hash;
            }
        }
        catch {
            # No loop control here: the catch block is an anonymous sub,
            # and it is already the last statement of the loop body, so
            # the loop continues with the next hit on its own.
            print "Error at get_smotif_definition: $_ processing next file";
        };
    }    #end STRUCT loop
    close $list;

    return @smotif_definition;
}

=head1 NAME

SmotifTF Template-free Modeling Method - The pre-requisites

=head1 SYNOPSIS

Please read this document completely for running the SmotifTF
software successfully on any local computer. 

Pre-requisites: 

The Smotif-based modeling algorithm requires the query protein sequence as input. 
Additionally, if the structure of the protein is known from any alternate
resource, then a PDB-formatted structure file is required. This pdb-file can be
present in a centralized local directory or a user-designated separate directory. 

Software / data:

1. Psipred (http://bioinf.cs.ucl.ac.uk/psipred/)

2. HHSuite (ftp://toolkit.genzentrum.lmu.de/pub/HH-suite/)

3. Psiblast and Delta-blast (http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download)

4. Modeller (version 9.14 https://salilab.org/modeller/)

5. Local PDB directory (central or user-designated) - updated (http://www.rcsb.org). 

Download and install the above mentioned software / data according to their instructions. 

Note: Psipred and Psiblast require legacy blast and Delta-blast is a part of the Blast+ package. 

Databases required: 

1. PDBAA blast database is required (ftp://ftp.ncbi.nlm.nih.gov/blast/db/). 
2. HHsuite databases NR20 and PDB70 are required (ftp://toolkit.genzentrum.lmu.de/pub/HH-suite/databases/hhsuite_dbs/)

SmotifTF Download and Installation: 

Download SmotifTF package from CPAN: 

http://search.cpan.org/dist/SmotifTF/

Installation of the software (also available in the README file):

 tar -zxvf SmotifTF-0.01.tar.gz

 cd SmotifTF-0.01/

 perl Makefile.PL PREFIX=/home/user/SmotifTF-0.01

 make

 make test

 make install

Set up the configuration file:

The configuration file, smotiftf_config.ini, has all the information
regarding the required library files and other pre-requisite software. 

Set all the paths and executables in this file correctly.

Set the environment variable in the .bashrc file:

export SMOTIFTF_CONFIG_FILE=/home/user/SmotifTF-0.01/smotiftf_config.ini


 Pre-requisites steps: 

      -------------------------------------------------------
     | Run Pre-requisites:                                   |
     | Psipred, HHblits+HHsearch, Psiblast, Delta-blast      |
     |                                                       |
     | Single-core job                                       |
     | Usage:                                                |
     |   perl smotiftf_prereq.pl --sequence_file=1zzz.fasta  |
     |         --dir=1zzz --step=all                         |
      -------------------------------------------------------


How to run the pre-requisites program:


1. Create a subdirectory with a dummy pdb file name (eg: 1abc or 1zzz). 

2. Put the query fasta file (1zzz.fasta) in this directory.

3. Optional: If structure is known, include a pdb format structure file
   in the same directory. 1abc/pdb1abc.ent or 1zzz/pdb1zzz.ent

4. Run the pre-requisites step first. This runs Psipred, HHblits+HHsearch,
   Psiblast and Delta-blast. It will then generate the dynamic database of
   Smotifs and the list of putative Smotifs in the query protein.
   Usage: perl smotiftf_prereq.pl --sequence_file=1zzz.fasta --dir=1zzz --step=all

5. Next, run smotiftf.pl according to the instructions given there.  

Reference: 

Vallat BK, Fiser A.
Modularity of protein folds as a tool for template-free modeling of sequences
Manuscript under review. 

Authors:

Brinda Vallat, Carlos Madrid and Andras Fiser. 

=head1 OPTIONS

=over 8

=item B<-help>

Print a brief help message and exits.

=item B<-man>

Prints the manual page and exits.

=item B<--step>

1,2,3,4,5,6,7,8 or all

=item B<--sequence_file>
 
Give the name of the fasta file. 

=item B<--dir>

Give 4-letter dummy pdb_code or any other directory where the fasta file is present. 

=back

=head1 DESCRIPTION

B<SmotifTF> will carry out template-free structure prediction of a 
protein from its sequence to model its complete structure using the 
Smotif library.

=cut
