FastML.v3.1/ 0000755 0176723 0002036 00000000000 12272424201 011537 5 ustar haim pupko FastML.v3.1/www/ 0000755 0176723 0002036 00000000000 12272424174 012374 5 ustar haim pupko FastML.v3.1/www/fastml/ 0000755 0176723 0002036 00000000000 12272466165 013670 5 ustar haim pupko FastML.v3.1/www/fastml/FastML_Wrapper.pl 0000644 0176723 0002036 00000253040 12272501150 017037 0 ustar haim pupko use strict;
use Getopt::Long;
use FindBin qw($Bin); # www/FastML_2012/
use lib "$Bin/../bioSequence_scripts_and_constants/";
#use lib "/bioseq/bioSequence_scripts_and_constants";
use GENERAL_CONSTANTS;
use BIOSEQUENCE_FUNCTIONS;
use POSIX;
use FindBin qw($Bin);
use File::Copy;
use File::Basename;
# Abort with the usage text when the script is started without any
# command-line argument.  The text below is printed verbatim.
die "USAGE:FastML_Wrapper.pl --MSA_File --seqType --outDir
Optional parameters:
--Tree
--TreeAlg - How to builed tree when tree not provided by user; default=NJ
--SubMatrix amino acid options, the default is JTT.
nucleotide options, the default is JC_Nuc.
codon options, the default is yang.
--OptimizeBL default: yes
--UseGamma default: yes
# --OptAlpha default: no (relevant only when UseGamma==yes)
--Alpha (relevant only when UseGamma==yes)
user alpha parameter of the gamma distribution [if alpha is not given, alpha and branches will be evaluated from the data]
--jointReconstruction default: yes
--indelReconstruction - which method is used for indel reconstruction
--indelCutOff deafult =0.5
" unless (@ARGV >= 1);
# Keep a copy of the raw command line for later printing.
my @ARGV_forPrint = (@ARGV);

# %VARS - program-internal settings.
my %VARS = (
    RunNumber => "NA",
    isServer  => "NO",
);

# %FORM - user inputs, pre-filled with their defaults.
my %FORM = (
    MSA_File                  => "",
    outDir                    => "",
    TreeAlg                   => "NA",
    Tree                      => "NA",
    OptimizeBL                => "YES",
    UseGamma                  => "YES",
#   OptAlpha                  => "NO",
    Alpha                     => "",
    JointReconstruction       => "YES",
    IndelReconstructionMethod => "BOTH",
    IndelsCutoff              => 0.5,
    DEBUG                     => "NO",
);
# Parse the command line.  User inputs are stored in %FORM and the
# server-only settings in %VARS.  In an option spec "=" makes the value
# mandatory and ":" makes it optional; "s" is a string, "i" an integer and
# "f" a real number.
my $getoptResult = GetOptions (
    "MSA_File=s"=>\$FORM{MSA_File},
    "outDir=s"=>\$FORM{outDir},
    "seqType=s"=>\$FORM{seqType},
    "Tree:s"=>\$FORM{Tree},
    "TreeAlg:s"=>\$FORM{TreeAlg}, # NJ | RaxML
    "SubMatrix:s"=>\$FORM{SubMatrix},
    "OptimizeBL:s"=>\$FORM{OptimizeBL},
    "UseGamma:s"=>\$FORM{UseGamma},
#   "OptAlpha:s"=>\$FORM{OptAlpha},
    # Alpha is the user-supplied alpha parameter of the gamma distribution,
    # i.e. a real number.  It has to be declared "f": with "i" a fractional
    # value such as 0.5 is not accepted as the option's value.
    "Alpha:f"=>\$FORM{Alpha},
    "jointReconstruction:s"=>\$FORM{JointReconstruction},
    "indelReconstruction:s"=>\$FORM{IndelReconstructionMethod}, # Parsimony|ML
    "RunNum:i"=>\$VARS{RunNumber}, # RELEVANT FOR SERVER ONLY
    "isServer:s"=>\$VARS{isServer}, # RELEVANT FOR SERVER ONLY
    "indelCutOff:f"=>\$FORM{IndelsCutoff},
    "DEBUG:s"=>\$FORM{DEBUG} # YES | NO
);

# All the keyword-like options are compared against upper-case literals
# further down, so normalise their case once here.
$FORM{JointReconstruction}=uc($FORM{JointReconstruction});
$FORM{UseGamma}=uc($FORM{UseGamma});
$FORM{OptimizeBL}=uc($FORM{OptimizeBL});
$FORM{TreeAlg}=uc($FORM{TreeAlg});
$FORM{DEBUG}=uc($FORM{DEBUG});
$FORM{IndelReconstructionMethod}=uc($FORM{IndelReconstructionMethod});
# isServer is tested with eq "YES" / eq "NO" as well; without this a value
# such as "yes" matched neither literal and produced a mix of server and
# stand-alone output names.
$VARS{isServer}=uc($VARS{isServer});
# ---- Validate the user inputs; every failure aborts with an "ERROR:" line ----

# Mandatory arguments
die "ERROR: No path for output\n" if ($FORM{outDir} eq "");
die "ERROR: MSA_File is requiered\n" if ($FORM{MSA_File} eq "");
$FORM{seqType}=lc ($FORM{seqType});
die "ERROR: seqType must be aa or nuc or codon - NOT $FORM{seqType}\n" if (($FORM{seqType} ne "aa") and ($FORM{seqType} ne "codon") and ($FORM{seqType} ne "nuc"));

# The output directory is always used with a trailing "/" from here on.
unless ($FORM{outDir} =~ m/\/$/) {
    $FORM{outDir} .= "/";
}
print "outDir: $FORM{outDir}\n";
unless (-e $FORM{outDir}) {
    # Fail right away when the directory cannot be created instead of
    # letting every later write into it fail with a less obvious message.
    mkdir ($FORM{outDir}) or die "ERROR: Can't create the output directory '$FORM{outDir}': $!\n";
}

# Default substitution matrix, chosen by the sequence type
if (!defined $FORM{SubMatrix}) # assign default
{
    if ($FORM{seqType} eq "aa") {$FORM{SubMatrix}="JTT"; print "SubMatrix=JTT (default)\n";}
    elsif ($FORM{seqType} eq "nuc") {$FORM{SubMatrix}="JC_Nuc"; print "SubMatrix=JC_Nuc (default)\n";}
    elsif ($FORM{seqType} eq "codon") {$FORM{SubMatrix}="yang"; print "SubMatrix=yang (default)\n";}
}

# A user tree and a tree-building algorithm are mutually exclusive
if (($FORM{Tree} ne "NA") and ($FORM{TreeAlg} ne "NA"))
{
    die "ERROR: Notice, only --Tree or --TreeAlg should be provided, not both...\n";
}
if (($FORM{Tree} ne "NA") and (!-e $FORM{Tree}))
{
    die "ERROR: The tree file '$FORM{Tree}' does not exists...\n";
}

# Indel reconstruction settings
if (($FORM{IndelsCutoff}<0) or ($FORM{IndelsCutoff}>1))
{
    die "ERROR: The --indelCutOff must be between 0 and 1...\n";
}
if (($FORM{IndelReconstructionMethod} ne "BOTH") and ($FORM{IndelReconstructionMethod} ne "PARSIMONY") and ($FORM{IndelReconstructionMethod} ne "ML"))
{
    die "ERROR: The --indelReconstruction must be ML or PARSIMONY or BOTH Only...\n";
}
# Remaining defaults
$VARS{Aln_format} = "FASTA";
if ($FORM{TreeAlg} eq "NA") { $FORM{TreeAlg} = "NJ"; }

###### Names of the result files
$VARS{tree_newick}       = "tree.newick.txt";       # tree in Newick format
$VARS{tree_ancestor}     = "tree.ancestor.txt";     # tree in ANCESTOR format
$VARS{seq_joint}         = "seq.joint.txt";         # joint sequences
$VARS{seq_marginal}      = "seq.marginal.txt";      # marginal sequences
$VARS{prob_joint}        = "prob.joint.txt";        # joint probabilities
$VARS{prob_marginal}     = "prob.marginal.txt";     # marginal probabilities
$VARS{prob_marginal_csv} = "prob.marginal.csv";
$VARS{log_likelihood_prob_marginal_csv} = "LogLikelihood_prob.margianl.csv";

###### Indel reconstruction outputs
# Likelihood based
$VARS{marginal_seq_chars_and_indel}  = "seq.marginal_IndelAndChars.txt";
$VARS{marginal_prob_chars_and_indel} = "Ancestral_MaxMarginalProb_Char_Indel.txt";
$VARS{marginal_indel_prob}           = "IndelsMarginalProb.txt";
# Parsimony based
$VARS{marginal_prob_chars_and_parsimony_indels} = "Ancestral_MaxProb_Marginal_Char_Parsimony_Indel.txt";
$VARS{marginal_seq_chars_and_parsimony_indels}  = "seq.marginal_Chars_ParsimonyIndels.txt";
$VARS{parsimony_indels}                         = "Indels.parsimony.txt";

###### JalView outputs (an .html page in server mode, a .jnlp launcher otherwise)
# Marginal reconstruction without indel reconstruction
$VARS{JalViewMarginalFeaturesFile} = "JalView_Features_Marginal_Prob";
$VARS{seq_marginal_JalView}        = "seq.marginal_NO_IndelReconstruction_JalView.".$VARS{Aln_format}.".aln";
$VARS{Tree_JalView}                = "tree.JalView.newick";
# This name is assigned only when isServer is exactly "YES" or exactly "NO".
if ($VARS{isServer} eq "YES")
{
    $VARS{JalView_Marginal_Reconstruction} = "JalViewMarginal_Seq_Reconstruction_NO_IndelReconstruction.html";
}
elsif ($VARS{isServer} eq "NO")
{
    $VARS{JalView_Marginal_Reconstruction} = "JalViewMarginal_Seq_Reconstruction_NO_IndelReconstruction.jnlp";
}

# Chars and indels - ML based
$VARS{JalViewMarginal_Chars_and_Indels_FeaturesFile} = "JalView_Features_CharsAndIndels_Marginal_Prob";
$VARS{seq_marginal_Chars_and_Indels_JalView}         = "seq.marginal_CharsAndIndels_JalView.".$VARS{Aln_format}.".aln";
$VARS{JalView_Marginal_Chars_and_Indel_Reconstruction} = ($VARS{isServer} eq "YES")
    ? "JalViewMarginal_CharsAndIndels_Reconstruction.html"
    : "JalViewMarginal_CharsAndIndels_Reconstruction.jnlp";

# ML chars with parsimony indels
$VARS{seq_marginal_chars_and_parsimony_indels_JalView}         = "seq.marginal_Chars_ParsimonyIndels_JalView.".$VARS{Aln_format}.".aln";
$VARS{JalViewMarginal_Chars_and_Parsimony_Indels_FeaturesFile} = "JalView_Features_Marginal_Prob_Chars_And_Parsimony_Indels";
$VARS{JalView_Marginal_Chars_and_Parsimony_Indel_Reconstruction} = ($VARS{isServer} eq "YES")
    ? "JalViewMarginal_Chars_And_Parsimony_Indels_Reconstruction.html"
    : "JalViewMarginal_Chars_And_Parsimony_Indels_Reconstruction.jnlp";

# Joint reconstruction
$VARS{JalViewJointAnnotationGraphFile} = "JalView_Annotation_Graph_Joint_Prob";
$VARS{seq_joint_JalView}               = "seq.joint_JalView.".$VARS{Aln_format}.".aln";
$VARS{JalView_Joint_Reconstruction} = ($VARS{isServer} eq "YES")
    ? "JalViewJoint_Reconstruction.html"
    : "JalViewJoint_Reconstruction.jnlp";

###### The html output file (where the links to all the files will be)
$VARS{OutHtmlFile} = ($VARS{isServer} eq "NO") ? "output.html" : "output.php";
#TO DO
# Convert sequence names to num to avoid problems with RAxML and LIB
# Stand-alone runs only: the MSA (and the tree, if such a file exists) are
# copied into the output directory and $FORM{MSA_File} / $FORM{Tree} are
# re-pointed at the copies.  The return value of copy() is not checked.
if ($VARS{isServer} eq "NO")
# Copy input files to the running dir and work on them from now on
{
copy ($FORM{MSA_File},$FORM{outDir});
my ($MSA_FileName,$MSA_dir)=fileparse($FORM{MSA_File});
$FORM{MSA_File}=$FORM{outDir}.$MSA_FileName;
print "Copy and analyse MSA: $FORM{MSA_File}\n";
# $FORM{Tree} still holds the "NA" placeholder when no tree was given, so
# this is an existence test on whatever string is stored there.
if (-e $FORM{Tree})
{
copy ($FORM{Tree},$FORM{outDir});
my ($Tree_FileName,$Tree_dir)=fileparse($FORM{Tree});
$FORM{Tree}=$FORM{outDir}.$Tree_FileName;
print "Copy and analyse tree: $FORM{Tree}\n";
}
}
# MSASeqNamesToCode and TreeNamesToCodes are defined outside this section.
# Judging by their names they replace the sequence names by codes in the MSA
# and in the tree - NOTE(review): confirm against their definitions.
# The two hash references returned here are copied into the hashes below.
my %SeqNamesToCode=();
my %CodeToSeqName=();
my ($SeqNamesToCode,$CodeToSeqName)=MSASeqNamesToCode($FORM{MSA_File},$FORM{outDir});
TreeNamesToCodes ($FORM{Tree},$SeqNamesToCode) if (-e $FORM{Tree});
%CodeToSeqName=%$CodeToSeqName;
%SeqNamesToCode=%$SeqNamesToCode;
################
# Record whether the tree came from the user; when it did not and RAxML was
# requested, set the name of the RAxML tree file.
if ($FORM{Tree} ne "NA")
{
$VARS{UserProvideTree}="YES";
}
else
{
$VARS{UserProvideTree}="NO";
if ($FORM{TreeAlg} eq "RAXML")
{
$VARS{RAxML_Tree}="RAxML_tree.newick";
}
}
if ($VARS{isServer} eq "YES")
{
$VARS{All_Outputs_Zip}="FASTML_run_".$VARS{RunNumber}.".zip"; # All Outputs ZIP
$VARS{logs_dir} = GENERAL_CONSTANTS::SERVERS_LOGS_DIR."fastml/" if ($VARS{isServer} eq "YES");
$VARS{OutLogFile} = $VARS{logs_dir}.$VARS{RunNumber}.".log";
###### WWWdir is where the web=page is.
$VARS{WWWdir} = GENERAL_CONSTANTS::FASTML_URL."results/" .$VARS{RunNumber}. "/"; #XMXMXMXMX
$VARS{run_url} = $VARS{WWWdir}.$VARS{OutHtmlFile};
###### here we set the reload interval (in seconds).
$VARS{reload_interval} = 30;
###### here we set the email of the server - for problems...
$VARS{DEVELOPER_MAIL} = GENERAL_CONSTANTS::ADMIN_EMAIL;
$VARS{UserMailFile}=$FORM{outDir}."user_email.txt";
$VARS{DevMail} = "\"mailto:$VARS{DEVELOPER_MAIL}?subject=Fastml%20Run%20No.:%20$VARS{RunNumber}\"";
$VARS{ContactDef} = "\nFor assistance please contact us and mention this number: $VARS{RunNumber}
\n";
###### this are the name of the program.
# $VARS{fastml} = "/bioseq/pupkoSVN/tags/fastml.v2.05/programs/fastml/fastml"; # TO DO
# $VARS{fastml} = "/groups/pupko/haim/pupkoSVN/trunk/programs/fastml/fastml"; # TO DO
$VARS{fastml} = "/bioseq/FastML/fastml";
$VARS{Indel_Reconstruction} = "/bioseq/FastML/IndelReconstruction/IndelReconstruct.pl"; # TO DO
$VARS{RAxML} = "/bioseq/FastML/BuildRaxMLTree.pl"; # TO DO
###### Send mail Global VARS
$VARS{send_email_dir} = GENERAL_CONSTANTS::SEND_EMAIL_DIR_IBIS;
$VARS{smtp_server} = GENERAL_CONSTANTS::SMTP_SERVER;
$VARS{userName} = GENERAL_CONSTANTS::ADMIN_USER_NAME;
$VARS{userPass} = GENERAL_CONSTANTS::ADMIN_PASSWORD;
my $estimated_run_time=estimate_run_time($FORM{MSA_File},$FORM{seqType},$VARS{UserProvideTree},$FORM{UseGamma});
# UPDATE STATE
# The parentheses are required: in "open OUTPUT, EXPR || handler()" the "||"
# binds to EXPR (a non-empty, always-true string), so the handler could never
# run when the open itself failed.
open (OUTPUT, "$FORM{outDir}$VARS{OutHtmlFile}") || exit_on_error("sys_error","Can't open output page: '$FORM{outDir}$VARS{OutHtmlFile}' $!");
my @OUTPUT=
");
$kMostProb_Section=0;
}
else
{
print OUTPUT $line;
}
}
close (OUTPUT);
}
#---------------------------------------------
# Writes two newline characters to the OUTPUT filehandle.
# Takes one argument, the message.
# NOTE(review): $msg is read but never used by the visible body - presumably
# the markup that embedded it in the printed string was lost; confirm against
# the original file.
sub print_message_to_output{
#---------------------------------------------
my $msg = shift;
print OUTPUT "\n\n";
}
FastML.v3.1/www/fastml/IndelReconstruction_Wrapper.pl 0000644 0176723 0002036 00000077207 12272416750 021732 0 ustar haim pupko use strict;
use Getopt::Long;
use FindBin qw($Bin); # www/FastML_2012/
use File::Copy;
# Abort with the usage text when the script is started without any
# command-line argument.  The text below is printed verbatim.
die "USAGE: --MSA_File --Tree_File --outDir --seqType
Optional parameters:
--indelCutOff deafult =0.5
--CharsMarginalProb (deafult=prob.marginal.txt) - prob of ancestral sequences - FASTML output
--ML_GapOut # (deafult: IndelsMarginalProb.txt - IndelsMarginalProb (IndelReconstructOutput)
--ML_Ancestral_MSA # (deafult: seq.marginal_IndelAndChars.txt) - output for Chars and Gap Ancestral Reconstruction - MSA;
--ML_Chars_ML_Gap # (deafult: AncestralMaxMarginalProb_Char_Indel.txt) - File with the max prob of each position on each node
--MP_GapOut # (deafult: Indels.parsimony) - Indel Satate for each MSA pos by parsimony
--ML_Char_MP_Gap # (deafult: AncestralMaxProbMarginal_Char_Parsimony_Indel.txt) - File with the max prob char of each position on each node and indel parsimony
--Ancestral_MSA_MP_GAP # (deafult: seq.marginal_Chars_ParsimonyIndels.txt) - MSA Output for Chars and Parsimonuis Gap Ancestral Reconstruction;
--Debug # (deafult: off) printouts debug info
" unless (@ARGV >= 1);
# Inputs that have a default value
my $MSA_File     = "";
my $OutDir       = "";
my $Tree_File    = "";
my $IndelsCutoff = 0.5;

# Inputs that stay undefined unless they are given on the command line
my ($SeqType, $MarginalProb_of_Chars, $GapProb_OutFile, $Ancestral_MSA, $Ancestral_Prob);
my ($GapParsimony_OutFile, $Ancestral_Prob_ParsimonyIndel, $Ancestral_MSA_Parsimony, $DEBUG_F);

# In an option spec "=" makes the value mandatory and ":" makes it optional;
# "s" is a string and "f" a real number.
my $getoptResult = GetOptions (
    "MSA_File=s"             => \$MSA_File,
    "outDir=s"               => \$OutDir,
    "Tree_File=s"            => \$Tree_File,
    "seqType=s"              => \$SeqType,                       # aa|nuc|codon
    "indelCutOff:f"          => \$IndelsCutoff,
    "CharsMarginalProb:s"    => \$MarginalProb_of_Chars,         # (prob.marginal.txt) - prob of ancestral sequences - FASTML output
    "ML_GapOut:s"            => \$GapProb_OutFile,               # (IndelsMarginalProb.txt) - IndelsMarginalProb (IndelReconstructOutput)
    "ML_Ancestral_MSA:s"     => \$Ancestral_MSA,                 # (seq.marginal_IndelAndChars.txt) - MSA output for chars and gap ancestral reconstruction
    "ML_Chars_ML_Gap:s"      => \$Ancestral_Prob,                # (Ancestral_MaxMarginalProb_Char_Indel.txt) - max prob of each position on each node
    "MP_GapOut:s"            => \$GapParsimony_OutFile,          # (Indels.parsimony.txt) - indel state for each MSA pos by parsimony
    "ML_Char_MP_Gap:s"       => \$Ancestral_Prob_ParsimonyIndel, # (Ancestral_MaxProb_Marginal_Char_Parsimony_Indel.txt) - max prob char of each position on each node and indel parsimony
    "Ancestral_MSA_MP_GAP:s" => \$Ancestral_MSA_Parsimony,       # (seq.marginal_Chars_ParsimonyIndels.txt) - MSA output for chars and parsimonious gap ancestral reconstruction
    "Debug"                  => \$DEBUG_F,
);
# The output directory is always used with a trailing "/".
$OutDir .= "/" unless ($OutDir =~ /\/$/);

# Every output file that was not given (or was given as an empty string) gets
# its default name inside the output directory.
foreach my $name_default (
    [\$GapProb_OutFile,               "IndelsMarginalProb.txt"],
    [\$MarginalProb_of_Chars,         "prob.marginal.txt"],
    [\$Ancestral_MSA,                 "seq.marginal_IndelAndChars.txt"],
    [\$Ancestral_Prob,                "Ancestral_MaxMarginalProb_Char_Indel.txt"],
    # PARSIMONY BASED OUTPUT
    [\$GapParsimony_OutFile,          "Indels.parsimony.txt"],                                # indel state for each MSA pos by parsimony
    [\$Ancestral_Prob_ParsimonyIndel, "Ancestral_MaxProb_Marginal_Char_Parsimony_Indel.txt"], # max prob char of each position on each node and indel parsimony
    [\$Ancestral_MSA_Parsimony,       "seq.marginal_Chars_ParsimonyIndels.txt"],              # MSA with parsimony indels
)
{
    my ($target_ref, $default_name) = @{$name_default};
    next if ((defined ${$target_ref}) and (${$target_ref} ne ""));
    ${$target_ref} = $OutDir.$default_name;
}

# The --Debug switch is kept as the string "YES" / "NO".
my $DEBUG = ($DEBUG_F) ? "YES" : "NO";
# Echo the effective value of every parameter (after the defaults were
# applied) to STDOUT.
print "
--MSA_File=$MSA_File
--outDir=$OutDir
--Tree_File=$Tree_File
--seqType=$SeqType
--indelCutOff=$IndelsCutoff
--CharsMarginalProb=$MarginalProb_of_Chars
--ML_GapOut=$GapProb_OutFile
--ML_Ancestral_MSA=$Ancestral_MSA
--ML_Chars_ML_Gap=$Ancestral_Prob
--MP_GapOut=$GapParsimony_OutFile
--ML_Char_MP_Gap=$Ancestral_Prob_ParsimonyIndel
--Ancestral_MSA_MP_GAP=$Ancestral_MSA_Parsimony
--Debug=$DEBUG\n";
#print "WAIT...\n";;
# Constants
my $ParsimonyCostMatrix=2;

# $MSA_Prefix_Name is the file name of the MSA without its directory and
# without a trailing .aln/.faa/.mfa/.txt extension; it prefixes the names of
# the intermediate files.
# The dots are escaped so that only a real extension is stripped: with a bare
# "." any character matched, and e.g. "myaln" was cut down to "m".
my $MSA_Prefix_Name="";
if ($MSA_File=~/([^\/]+?)(\.aln|\.faa|\.mfa|\.txt)?$/)
{
    $MSA_Prefix_Name=$1;
}
else
{
    # No file-name component could be matched (e.g. the path ends with "/").
    $MSA_Prefix_Name=$MSA_File;
}

# $DEBUG is compared against "YES" everywhere below.  uc() always returns a
# defined value, so no check for an undefined $DEBUG is needed after it.
$DEBUG=uc($DEBUG);
# Programs Path
# Both executables are located relative to the directory of this script ($Bin).
#my $IndelCoder="/bioseq/FastML/IndelReconstruction/indelCoder";
#my $IndelCoder="/bioseq/FastML/IndelReconstruction/indelCoder.V1.6";
#my $IndelCoder="/bioseq/FastML/IndelReconstruction/indelCoder.V1.71";
my $IndelCoder="$Bin/../../programs/indelCoder/indelCoder";
#my $IndelReconstruction="/bioseq/FastML/IndelReconstruction/gainLoss.V9.9822"; # by gainLoss
#my $IndelReconstruction="/bioseq/FastML/IndelReconstruction/gainLoss.V9.9863"; # by gainLoss
my $IndelReconstruction="$Bin/../../programs/gainLoss/gainLoss"; # by gainLoss
# Globals File Names
# All the intermediate files live in <outDir>/IndelsReconstruction/.
$OutDir=$OutDir."/" if ($OutDir!~/\/$/);
my $Indels_Reconstruction_results_Dir=$OutDir."IndelsReconstruction/";
# IndelCoder
# The directory variable already ends with "/", so the names below contain "//".
my $IndelCoderParamFile="IndelCoderParamFile";
my $indelOutputFastaFile="$Indels_Reconstruction_results_Dir/$MSA_Prefix_Name".".indelOutputFastaFile";
my $indelOutputInfoFile="$Indels_Reconstruction_results_Dir/$MSA_Prefix_Name".".indelOutputInfoFile";
my $nexusFileName="$Indels_Reconstruction_results_Dir/$MSA_Prefix_Name".".indel_nexusFile";
my $indelLogFile="$Indels_Reconstruction_results_Dir/$MSA_Prefix_Name"."IndelCoder.log";
# Indel Reconstruction
my $IndelReconstructionParamFile="IndelReconstructionParamFile";
#my $indelOutputFasta_NO_MISSING_DATA_File="$Indels_Reconstruction_results_Dir/$MSA_Prefix_Name"."_MISING_DATA_TO0.indelOutputFastaFile"; # For now gainLoss don't handle missing data so we replace '?' with 0
my $AncestralReconstructIndelPosterior="$Indels_Reconstruction_results_Dir/RESULTS/AncestralReconstructPosterior.txt"; # The file with ancestral prob of indel
# The parsimony result file name embeds the cost-matrix constant.
my $AncestralReconstructParsimony="$Indels_Reconstruction_results_Dir/RESULTS/gainLossMP.".$ParsimonyCostMatrix.".AncestralReconstructSankoff.txt";
# Joint character based Ancestral MSA with Indel Reconstruction
# The return value of mkdir is not checked.
mkdir ($Indels_Reconstruction_results_Dir);
# Collect the names of all the sequences of the MSA: every line that starts
# with ">" contributes the rest of the line as a key of %Species_On_MSA.
my %Species_On_MSA=(); # All species in the MSA - MAYBE TO REMOVE
# Fail with a clear message when the MSA cannot be read instead of silently
# continuing with an empty species list.
open (MSA,"<",$MSA_File) || die "IndelReconstruction_Wrapper: Can't open MSA file '$MSA_File' $!";
while (my $line=<MSA>)
{
    chomp ($line);
    if ($line=~/^>(.*)/)
    {
        $Species_On_MSA{$1}=1;
    }
}
close (MSA);
# Read MSA to Hash
# readMSA is defined outside this section; it returns a hash reference that is
# copied into %MSA_Hash (looked up by sequence name further down).
my $MSA_Hash_ref=readMSA($MSA_File);
my %MSA_Hash=%{$MSA_Hash_ref};
# Prepare indel Coder ParamFile
# The parameter file is written into the indel-reconstruction directory, one
# "_name value" pair per line.
open (INDEL_CODER_PARAMS,">$Indels_Reconstruction_results_Dir$IndelCoderParamFile") || die "IndelReconstruction_Wrapper: Can't open IndelCoderParamFile '$Indels_Reconstruction_results_Dir$IndelCoderParamFile' $!";
print INDEL_CODER_PARAMS "_seqFile $MSA_File\n";
print INDEL_CODER_PARAMS "_indelOutputInfoFile $indelOutputInfoFile\n";
print INDEL_CODER_PARAMS "_indelOutputFastaFile $indelOutputFastaFile\n";
print INDEL_CODER_PARAMS "_nexusFileName $nexusFileName\n";
print INDEL_CODER_PARAMS "_logFile $indelLogFile\n";
print INDEL_CODER_PARAMS "_logValue 9\n";
print INDEL_CODER_PARAMS "_codingType SIC\n";
print INDEL_CODER_PARAMS "_isOmitLeadingAndEndingGaps 0\n";
close (INDEL_CODER_PARAMS);
# Run indelCoder from inside the results directory (through the shell).  The
# exit status is not inspected; success is judged by the output file below.
system ("cd $Indels_Reconstruction_results_Dir; $IndelCoder $IndelCoderParamFile");
# Only the existence of the output is tested here (-e), not its size.
if (!-e $indelOutputFastaFile)
{
die "IndelReconstruction_Wrapper: $indelOutputFastaFile was not created or empty, please have a look on the indel coder log file at: $indelLogFile";
}
# Run indelReconstruction by gainLoss
# remove_InternalNodeName_or_BPvalues is defined outside this section; it is
# given the tree file and the name "<tree>.Orig".
# NOTE(review): the sequence below suggests it rewrites the tree file in place
# and keeps the original as "<tree>.Orig" - confirm against its definition.
my $removed_BP_InternalNodeName=remove_InternalNodeName_or_BPvalues($Tree_File,$Tree_File.".Orig");
# Keep the current tree file as "<tree>.ForIndelReconstruction" (this is the
# tree handed to gainLoss), then move "<tree>.Orig" back over the tree file
# if it exists.
copy ($Tree_File,"$Tree_File.ForIndelReconstruction");
move ("$Tree_File.Orig",$Tree_File) if (-e "$Tree_File.Orig");
open (INDEL_RECONSTRUCTION_PARAMS,">$Indels_Reconstruction_results_Dir$IndelReconstructionParamFile") || die "Can't open IndelReconstructionParamFile '$Indels_Reconstruction_results_Dir$IndelReconstructionParamFile' $!";
print INDEL_RECONSTRUCTION_PARAMS "_seqFile $indelOutputFastaFile\n";
print INDEL_RECONSTRUCTION_PARAMS "_treeFile $Tree_File.ForIndelReconstruction\n";
print INDEL_RECONSTRUCTION_PARAMS "_isRootFreqEQstationary 1\n";
print INDEL_RECONSTRUCTION_PARAMS "_calculateAncestralReconstruct 1\n";
# The literal 2 is written here directly; $ParsimonyCostMatrix (also 2) is
# what the expected parsimony result file name is built from.
print INDEL_RECONSTRUCTION_PARAMS "_costMatrixGainLossRatio 2\n";
print INDEL_RECONSTRUCTION_PARAMS "_minNumOfOnes 1\n";
close (INDEL_RECONSTRUCTION_PARAMS);
# Run gainLoss from inside the results directory; the exit status is not inspected.
system ("cd $Indels_Reconstruction_results_Dir; $IndelReconstruction $IndelReconstructionParamFile");
# Map every MSA position to the coded indel characters that cover it, and read
# the posterior probability of every indel character on every node.
my %MSA_Pos_Species_to_Indel=();
my %MSAtoIndel=();
my ($MSA_Pos_Species_to_Indel,$MSAtoIndel)=Read_MSA_to_Indels_Info($indelOutputInfoFile,\%MSA_Pos_Species_to_Indel,\%MSAtoIndel); # hash1 - key1:MSA_Pos,key2:species; value:IndelMSAPos; hash2 - key: MSA_Pos;value: IndelsMSA_Pos (array)
my %AncestralReconstructIndelPosterior_Hash=();
my $AncestralReconstructIndelPosterior_Reff=Read_Ancestral_Prob_For_Indel($AncestralReconstructIndelPosterior,\%AncestralReconstructIndelPosterior_Hash); # hash = key1:IndelMSA_Pos,key2:species; value Prob for indel
####### HADLE WITH PROB RECONSTRUCTION
%AncestralReconstructIndelPosterior_Hash=%$AncestralReconstructIndelPosterior_Reff;
my %MSA_Pos_Species_AncestorIndelProb=(); # Will hold for each MSA_Pos and Species the vector of IndelPos_ProbOfIndel
print "HADLE WITH PROB RECONSTRUCTION LOOP\n====================================================\n" if ($DEBUG eq "YES");
## MAKE UNIQ
# Replace the indel list of every MSA position by the list returned from
# uniq_array (defined outside this section; presumably it removes duplicate
# entries - confirm against its definition).
print "+++++++++++++++++DEBUG - PRINT INDEL POS TO INDEL NOT UNIQ ++++++++++++++++++++++\n" if ($DEBUG eq "YES");
foreach my $MSA_Pos (sort {$a<=>$b} keys %$MSAtoIndel)
{
print "MSA:$MSA_Pos\t",join(",",@{$MSAtoIndel->{$MSA_Pos}}),"\n" if ($DEBUG eq "YES");
my $tmp_array=uniq_array($MSAtoIndel->{$MSA_Pos});
$MSAtoIndel->{$MSA_Pos}=[@{$tmp_array}];
}
print "+++++++++++++++++DEBUG - PRINT INDEL POS TO INDEL UNIQ +++++++++++++++++++++++++\n" if ($DEBUG eq "YES");
foreach my $MSA_Pos (sort {$a<=>$b} keys %$MSAtoIndel)
{
print "MSA:$MSA_Pos\t",join(",",@{$MSAtoIndel->{$MSA_Pos}}),"\n" if ($DEBUG eq "YES");
}
print "+++++++++++++++++ END DEBUG ++++++++++++++++++++++++\n" if ($DEBUG eq "YES");
# For every MSA position and every node that is not one of the MSA sequences,
# collect the strings "<IndelPos>_<probability>" of all the indel characters
# that cover the position.
foreach my $MSA_Pos (sort {$a<=>$b} keys %$MSAtoIndel)
{
print "MSA:$MSA_Pos," if ($DEBUG eq "YES"); # DEBUG
foreach my $IndelPos (@{$MSAtoIndel->{$MSA_Pos}})
{
print "Indel:$IndelPos - $AncestralReconstructIndelPosterior_Hash{$IndelPos}" if ($DEBUG eq "YES"); # empty
foreach my $species (keys %{$AncestralReconstructIndelPosterior_Hash{$IndelPos}})
{
if (!exists $Species_On_MSA{$species}) # Ancestral Node # CONSIDER REMOVE
{
my $IndelPos_ProbOfIndel=$IndelPos."_".$AncestralReconstructIndelPosterior_Hash{$IndelPos}{$species};
if (!exists $MSA_Pos_Species_AncestorIndelProb{$MSA_Pos}{$species}){$MSA_Pos_Species_AncestorIndelProb{$MSA_Pos}{$species}=[$IndelPos_ProbOfIndel];}
else {push @{$MSA_Pos_Species_AncestorIndelProb{$MSA_Pos}{$species}},$IndelPos_ProbOfIndel;}
print "$MSA_Pos\t$IndelPos\t$species\t$AncestralReconstructIndelPosterior_Hash{$IndelPos}{$species}\n" if ($DEBUG eq "YES"); # DEBUG
}
}
}
}
# Write, for every MSA position and every ancestral node, the highest indel
# probability among the indel characters that cover the position.  The same
# value is kept in %MSA_Pos_Node_MaxProbOf_Gap.
open (GAP_PROB,">$GapProb_OutFile") || die "Can't open '$GapProb_OutFile' $!";
print GAP_PROB "Pos\tNode\tProb_Of_Indel\n";
my %MSA_Pos_Node_MaxProbOf_Gap=();
foreach my $MSA_Pos (sort {$a<=>$b} keys %MSA_Pos_Species_AncestorIndelProb)
{
    my $nodes_of_pos=$MSA_Pos_Species_AncestorIndelProb{$MSA_Pos};
    foreach my $species (sort keys %{$nodes_of_pos})
    {
        # Sequences that appear in the MSA itself are skipped - only
        # ancestral nodes are reported.
        next if (exists $Species_On_MSA{$species});

        print "$MSA_Pos\t$species" if ($DEBUG eq "YES");
        print GAP_PROB "$MSA_Pos\t$species";

        # Every entry has the form "<IndelPos>_<probability>".
        my @ProbsOfIndel;
        foreach my $Indel_IndelProb (@{uniq_array($nodes_of_pos->{$species})})
        {
            my ($Indel_Pos,$IndelProb)=split("_",$Indel_IndelProb);
            print "\t$Indel_Pos:$IndelProb" if ($DEBUG eq "YES");
            push (@ProbsOfIndel,$IndelProb);
        }

        # The first element of the descending numeric sort is the maximum.
        my ($maxProbOfIndel) = sort { $b <=> $a } @ProbsOfIndel;
        print "\tMAX:$maxProbOfIndel\n" if ($DEBUG eq "YES");
        print GAP_PROB "\t$maxProbOfIndel\n";
        $MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos}{$species}=$maxProbOfIndel;
    }
}
close (GAP_PROB);
# Decide, for every MSA position and every node, whether the reconstructed
# state is a character or a gap (likelihood-based indels).  The decision is
# stored as "<char>:<prob>" in %MSA_Pos_Node_Char_or_Gap and written as one
# line to the ANCESTRAL_PROB file.
my %MSA_Pos_Node_Char_or_Gap=();
# Read the Chars Marginal Prob
# Read_Char_Marginal_Prob is defined outside this section.  As used below, the
# first return value is a hash reference keyed position -> node -> char, the
# second an array reference of node names and the third the MSA length.
my ($MSA_Pos_Node_Char_Marginal_Prob_Reff,$Nodes_Name_Reff,$MSA_Length)=Read_Char_Marginal_Prob($MarginalProb_of_Chars);
print "MSA_Length:$MSA_Length\n" if ($DEBUG eq "YES");
my @Nodes=@$Nodes_Name_Reff;
open (ANCESTRAL_PROB,">$Ancestral_Prob")|| die "Can't open Ancestral Prob File: '$Ancestral_Prob' $!\n";
print ANCESTRAL_PROB "Pos_on_MSA\tNode\tChar\tCharProb\n";
foreach my $MSA_Pos (sort {$a<=>$b} keys %{$MSA_Pos_Node_Char_Marginal_Prob_Reff})
{
print "MSA:$MSA_Pos\n" if ($DEBUG eq "YES");
foreach my $Node (sort keys %{$MSA_Pos_Node_Char_Marginal_Prob_Reff->{$MSA_Pos}})
{
# Find the character with the highest marginal probability and count how many
# characters have a probability of exactly 1.
my $maxProbChar="NA";
my $maxProb=0;
my $Num_Of_1=0;
foreach my $Char (sort keys %{$MSA_Pos_Node_Char_Marginal_Prob_Reff->{$MSA_Pos}->{$Node}})
{
if (($MSA_Pos_Node_Char_Marginal_Prob_Reff->{$MSA_Pos}->{$Node}->{$Char}>$maxProb)&&(defined $MSA_Pos_Node_Char_Marginal_Prob_Reff->{$MSA_Pos}->{$Node}->{$Char}))
{
$maxProbChar=$Char;
$maxProb=$MSA_Pos_Node_Char_Marginal_Prob_Reff->{$MSA_Pos}->{$Node}->{$Char};
}
$Num_Of_1++ if ($MSA_Pos_Node_Char_Marginal_Prob_Reff->{$MSA_Pos}->{$Node}->{$Char}==1);
}
# Decide what is the most probable char on pos
# More than one character with probability 1 is treated as a gap.
if ($Num_Of_1>1) # GAP
{
if ($SeqType eq "codon")
{
$MSA_Pos_Node_Char_or_Gap{$MSA_Pos}{$Node}="---".":1";
}
else
{
$MSA_Pos_Node_Char_or_Gap{$MSA_Pos}{$Node}="-".":1";
}
$maxProbChar="NA";
$maxProb=0;
}
else
{
# "NA" marks a position/node without any indel probability.
if (!exists $MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos}{$Node}){$MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos}{$Node}="NA";}
print "NODE:$Node - $maxProbChar:$maxProb ? -:$MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos}{$Node}\n" if ($DEBUG eq "YES");#; # DEBUG
if (($SeqType eq "aa") or ($SeqType eq "nuc"))
{
if ($MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos}{$Node} eq "NA")
{
$MSA_Pos_Node_Char_or_Gap{$MSA_Pos}{$Node}=$maxProbChar.":".$maxProb;
}
# elsif ($maxProb>=$MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos}{$Node}) # MOST PROBALBE IS THE CHAR
#elsif ($MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos}{$Node}<(1-$MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos}{$Node})) # MOST PROBALBE IS THE CHAR
# The character is kept while the indel probability is below the cutoff;
# at or above the cutoff the position becomes a gap.
elsif ($MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos}{$Node}<$IndelsCutoff) # MOST PROBALBE IS THE CHAR
{
$MSA_Pos_Node_Char_or_Gap{$MSA_Pos}{$Node}=$maxProbChar.":".$maxProb;
}
else
{
$MSA_Pos_Node_Char_or_Gap{$MSA_Pos}{$Node}="-".":".$MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos}{$Node};
#$MSA_Pos_Node_Char_or_Gap{$MSA_Pos}{$Node}="---".":".$MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos}{$Node} if ($SeqType eq "codon");
}
}
elsif ($SeqType eq "codon")
{
# MSA Pos is according to the codon number (i.e ((MSA_Pos-1)/3)+1)
# The indel probabilities are looked up at the codon's first nucleotide position.
my $MSA_Pos_GAP=(($MSA_Pos-1)*3)+1; # The real char on the MSA
if (!exists $MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos_GAP}{$Node}){$MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos_GAP}{$Node}="NA";}
if ($MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos_GAP}{$Node} eq "NA")
{
$MSA_Pos_Node_Char_or_Gap{$MSA_Pos}{$Node}=$maxProbChar.":".$maxProb;
}
# elsif ($maxProb>=$MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos}{$Node}) # MOST PROBALBE IS THE CHAR
#elsif ($MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos_GAP}{$Node}<(1-$MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos_GAP}{$Node})) # MOST PROBALBE IS THE CHAR
elsif ($MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos_GAP}{$Node}<$IndelsCutoff) # MOST PROBALBE IS THE CHAR
{
$MSA_Pos_Node_Char_or_Gap{$MSA_Pos}{$Node}=$maxProbChar.":".$maxProb;
}
else
{
$MSA_Pos_Node_Char_or_Gap{$MSA_Pos}{$Node}="---".":".$MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos_GAP}{$Node};
}
}
}
# Write the decision; for codons the reported position is the nucleotide
# position of the codon's first base.
my ($CharForPrint,$ProbForPrint)=split(/:/,$MSA_Pos_Node_Char_or_Gap{$MSA_Pos}{$Node});
if ($SeqType eq "codon")
{
my $MSA_Pos_GAP=(($MSA_Pos-1)*3)+1; # The real char
print ANCESTRAL_PROB "$MSA_Pos_GAP\t$Node\t$CharForPrint\t$ProbForPrint\n";#$MSA_Pos_Node_Char_or_Gap{$MSA_Pos}{$Node}\n";
}
else
{
print ANCESTRAL_PROB "$MSA_Pos\t$Node\t$CharForPrint\t$ProbForPrint\n";#$MSA_Pos_Node_Char_or_Gap{$MSA_Pos}{$Node}\n";
}
}
}
### PRINT THE GAP and CHAR Ancestral MSA
# One FASTA record per node: sequences that appear in the input MSA are
# written as they were read, ancestral nodes are assembled position by
# position from %MSA_Pos_Node_Char_or_Gap (the part before the ":").
open (MSA_OUT,">$Ancestral_MSA") || die "Can't open Output MSA: '$Ancestral_MSA' $!\n";
foreach my $Node (@Nodes)
{
    if (exists $MSA_Hash{$Node}) # Original sequence
    {
        print MSA_OUT ">$Node\n";
        print MSA_OUT "$MSA_Hash{$Node}\n";
    }
    else # Ancestral seq
    {
        print MSA_OUT ">$Node\n";
        for (my $i=1;$i<=$MSA_Length;$i++)
        {
            my ($Char,$Prob)=split(":",$MSA_Pos_Node_Char_or_Gap{$i}{$Node});
            print MSA_OUT $Char;
        }
        print MSA_OUT "\n";
    }
}
# The handle was left open by the original code; close it here so that the
# file is complete on disk and a failed buffered write is reported.
close (MSA_OUT) || die "Can't close Output MSA: '$Ancestral_MSA' $!\n";
### TO HERE
# For Parsimony (COPY OF THE CODE ABOVE...) TO DO: CHANGE IT SOME DAY...
# Read the parsimony state of every indel character on every node, then
# collect for every MSA position and node the strings "<IndelPos>_<state>" of
# the indel characters that cover the position.
# Read_Ancestral_Parsimony_State is defined outside this section.
my %AncestralReconstructIndelParsimony_Hash=();
my $AncestralReconstructIndelParsimony_Reff=Read_Ancestral_Parsimony_State($AncestralReconstructParsimony,\%AncestralReconstructIndelParsimony_Hash); # hash = key1:IndelMSA_Pos,key2:species; value 1 for indel 0 for char
%AncestralReconstructIndelParsimony_Hash=%$AncestralReconstructIndelParsimony_Reff;
my %MSA_Pos_Species_AncestorIndelParsimony=(); # Will hold for each MSA_Pos and Species the vector of IndelPos_ProbOfIndel
foreach my $MSA_Pos (sort {$a<=>$b} keys %$MSAtoIndel)
{
# print "MSA:$MSA_Pos,";
foreach my $IndelPos (@{$MSAtoIndel->{$MSA_Pos}})
{
# print "Indel:$IndelPos - $AncestralReconstructIndelPosterior_Hash{$IndelPos}"; # empty
# Unlike the probability-based loop, no node is filtered out here.
foreach my $species (keys %{$AncestralReconstructIndelParsimony_Hash{$IndelPos}})
{
my $IndelPos_ProbOfIndel=$IndelPos."_".$AncestralReconstructIndelParsimony_Hash{$IndelPos}{$species};
if (!exists $MSA_Pos_Species_AncestorIndelParsimony{$MSA_Pos}{$species}){$MSA_Pos_Species_AncestorIndelParsimony{$MSA_Pos}{$species}=[$IndelPos_ProbOfIndel];}
else {push @{$MSA_Pos_Species_AncestorIndelParsimony{$MSA_Pos}{$species}},$IndelPos_ProbOfIndel;}
# print "$MSA_Pos\t$IndelPos\t$species\t$AncestralReconstructIndelPosterior_Hash{$IndelPos}{$species}\n";
}
}
}
# Write the parsimony indel state of every MSA position for every ancestral
# node (names of the form N<number>) and keep the state of every node in
# %MSA_Pos_Node_ParsimonyOf_Gap.
# The error message names the file that is actually being opened (it used to
# report the likelihood output file instead).
open (GAP_PARSIMONY,">$GapParsimony_OutFile") || die "Can't open '$GapParsimony_OutFile' $!";
print GAP_PARSIMONY "Pos\tNode\tGap\n";
my %MSA_Pos_Node_ParsimonyOf_Gap=();
foreach my $MSA_Pos (sort {$a<=>$b} keys %MSA_Pos_Species_AncestorIndelParsimony)
{
    foreach my $species (sort keys %{$MSA_Pos_Species_AncestorIndelParsimony{$MSA_Pos}})
    {
        print "$MSA_Pos\t$species" if ($DEBUG eq "YES");
        print GAP_PARSIMONY "$MSA_Pos\t$species" if ($species=~/^N\d+$/); # print only ancestral nodes
        my $Uniq_Indels_Reff=uniq_array($MSA_Pos_Species_AncestorIndelParsimony{$MSA_Pos}{$species});
        my @Uniq_Indels=@$Uniq_Indels_Reff;
        my $NumOfIndelCoverMSA_Pos=@Uniq_Indels;
        my @ParsimonyOfIndel;
        # Every entry has the form "<IndelPos>_<state>".
        for (my $i=0;$i<$NumOfIndelCoverMSA_Pos;$i++)
        {
            my $Indel_IndelParsimony=$Uniq_Indels[$i];
            my ($Indel_Pos,$IndelParsimony)=split("_",$Indel_IndelParsimony);
            print "\t$Indel_Pos:$IndelParsimony" if ($DEBUG eq "YES");
            push (@ParsimonyOfIndel,$IndelParsimony);
        }
#       my $minProbOfIndel = (sort { $a <=> $b } @ParsimonyOfIndel)[0]; # WE GAVE PRIORITY TO CHAR (used when we had old (<=1.71) indelCoder)
#       print "\tMAX:$minProbOfIndel\n" if ($DEBUG eq "YES");
#       print GAP_PARSIMONY "\t$minProbOfIndel\n";
#       $MSA_Pos_Node_ParsimonyOf_Gap{$MSA_Pos}{$species}=$minProbOfIndel;
        # The highest state wins: the position is a gap as soon as one of the
        # covering indel characters is in the gap state.
        my $maxProbOfIndel = (sort { $b <=> $a } @ParsimonyOfIndel)[0];
        print "\tMAX:$maxProbOfIndel\n" if ($DEBUG eq "YES");
        print GAP_PARSIMONY "\t$maxProbOfIndel\n" if ($species=~/^N\d+$/); # print only ancestral nodes;
        $MSA_Pos_Node_ParsimonyOf_Gap{$MSA_Pos}{$species}=$maxProbOfIndel;
    }
}
close (GAP_PARSIMONY);
# Decide, for every MSA position and every node, whether the reconstructed
# state is a character or a gap, using the parsimony indel states.  The
# decision is stored as "<char>:<prob>" in %MSA_Pos_Node_Char_or_Gap_Parsimony
# and written as one line to the ANCESTRAL_PROB_PARSIMONY_INDEL file.
my %MSA_Pos_Node_Char_or_Gap_Parsimony=();
open (ANCESTRAL_PROB_PARSIMONY_INDEL,">$Ancestral_Prob_ParsimonyIndel")|| die "IndelReconstruction_Wrapper::Can't open Ancestral Prob Parsimony Indel File: '$Ancestral_Prob_ParsimonyIndel' $!\n";
print ANCESTRAL_PROB_PARSIMONY_INDEL "Pos_on_MSA\tNode\tChar\tCharProb\n";
foreach my $MSA_Pos (sort {$a<=>$b} keys %{$MSA_Pos_Node_Char_Marginal_Prob_Reff})
{
print "MSA:$MSA_Pos\n" if ($DEBUG eq "YES");
foreach my $Node (sort keys %{$MSA_Pos_Node_Char_Marginal_Prob_Reff->{$MSA_Pos}})
{
# Find the character with the highest marginal probability and count how many
# characters have a probability of exactly 1.
my $maxProbChar="NA";
my $maxProb=0;
my $Num_Of_1=0;
foreach my $Char (sort keys %{$MSA_Pos_Node_Char_Marginal_Prob_Reff->{$MSA_Pos}->{$Node}})
{
if (($MSA_Pos_Node_Char_Marginal_Prob_Reff->{$MSA_Pos}->{$Node}->{$Char}>$maxProb)&&(defined $MSA_Pos_Node_Char_Marginal_Prob_Reff->{$MSA_Pos}->{$Node}->{$Char}))
{
$maxProbChar=$Char;
$maxProb=$MSA_Pos_Node_Char_Marginal_Prob_Reff->{$MSA_Pos}->{$Node}->{$Char};
}
$Num_Of_1++ if ($MSA_Pos_Node_Char_Marginal_Prob_Reff->{$MSA_Pos}->{$Node}->{$Char}==1);
}
# Decide what is the most probable char on pos
# More than one character with probability 1 is treated as a gap.
if ($Num_Of_1>1) # GAP ON ORIGINAL SEQ (NOT ANCESTRAL)
{
if ($SeqType eq "codon")
{
$MSA_Pos_Node_Char_or_Gap_Parsimony{$MSA_Pos}{$Node}="---".":1";
}
else
{
$MSA_Pos_Node_Char_or_Gap_Parsimony{$MSA_Pos}{$Node}="-".":1";
}
$maxProbChar="NA";
$maxProb=0;
}
else
{
if (($SeqType eq "aa") or ($SeqType eq "nuc"))
{
# print "NODE:$Node - $maxProbChar:$maxProb ? -:$MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos}{$Node}\n";#;
# No parsimony state, or state 0: keep the character.  State 1: gap.
# Any other state leaves the entry unassigned.
if (!exists $MSA_Pos_Node_ParsimonyOf_Gap{$MSA_Pos}{$Node})
{
$MSA_Pos_Node_Char_or_Gap_Parsimony{$MSA_Pos}{$Node}=$maxProbChar.":".$maxProb;
}
# elsif ($maxProb>=$MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos}{$Node}) # MOST PROBALBE IS THE CHAR
elsif ($MSA_Pos_Node_ParsimonyOf_Gap{$MSA_Pos}{$Node}==0) # NO GAP BY PARSIMONY - MOST PROBALBE IS THE CHAR
{
$MSA_Pos_Node_Char_or_Gap_Parsimony{$MSA_Pos}{$Node}=$maxProbChar.":".$maxProb;
}
elsif ($MSA_Pos_Node_ParsimonyOf_Gap{$MSA_Pos}{$Node}==1)
{
$MSA_Pos_Node_Char_or_Gap_Parsimony{$MSA_Pos}{$Node}="-".":"."1";
# This branch is entered only for aa/nuc, so the codon override below never applies.
$MSA_Pos_Node_Char_or_Gap_Parsimony{$MSA_Pos}{$Node}="---".":"."1" if ($SeqType eq "codon");
}
}
elsif ($SeqType eq "codon")
{
# MSA Pos is according to the codon number (i.e ((MSA_Pos-1)/3)+1)
# The parsimony states are looked up at the codon's first nucleotide position.
my $MSA_Pos_GAP=(($MSA_Pos-1)*3)+1; # The real char on the MSA
if (!exists $MSA_Pos_Node_ParsimonyOf_Gap{$MSA_Pos_GAP}{$Node}){$MSA_Pos_Node_ParsimonyOf_Gap{$MSA_Pos_GAP}{$Node}="NA";}
if ($MSA_Pos_Node_ParsimonyOf_Gap{$MSA_Pos_GAP}{$Node} eq "NA")
{
$MSA_Pos_Node_Char_or_Gap_Parsimony{$MSA_Pos}{$Node}=$maxProbChar.":".$maxProb;
}
# elsif ($maxProb>=$MSA_Pos_Node_MaxProbOf_Gap{$MSA_Pos}{$Node}) # MOST PROBALBE IS THE CHAR
#elsif ($MSA_Pos_Node_ParsimonyOf_Gap{$MSA_Pos_GAP}{$Node}<(1-$MSA_Pos_Node_ParsimonyOf_Gap{$MSA_Pos_GAP}{$Node})) # MOST PROBALBE IS THE CHAR
# Here the parsimony state is compared against the probability cutoff.
elsif ($MSA_Pos_Node_ParsimonyOf_Gap{$MSA_Pos_GAP}{$Node}<$IndelsCutoff) # MOST PROBALBE IS THE CHAR
{
$MSA_Pos_Node_Char_or_Gap_Parsimony{$MSA_Pos}{$Node}=$maxProbChar.":".$maxProb;
}
else
{
$MSA_Pos_Node_Char_or_Gap_Parsimony{$MSA_Pos}{$Node}="---".":".$MSA_Pos_Node_ParsimonyOf_Gap{$MSA_Pos_GAP}{$Node};
}
}
}
# Write the decision; for codons the reported position is the nucleotide
# position of the codon's first base.
my ($CharForPrint,$ProbForPrint)=split(/:/,$MSA_Pos_Node_Char_or_Gap_Parsimony{$MSA_Pos}{$Node});
if ($SeqType eq "codon")
{
my $MSA_Pos_GAP=(($MSA_Pos-1)*3)+1; # The real char on the MSA
print ANCESTRAL_PROB_PARSIMONY_INDEL "$MSA_Pos_GAP\t$Node\t$CharForPrint\t$ProbForPrint\n";#$MSA_Pos_Node_Char_or_Gap{$MSA_Pos}{$Node}\n";
}
else
{
print ANCESTRAL_PROB_PARSIMONY_INDEL "$MSA_Pos\t$Node\t$CharForPrint\t$ProbForPrint\n";#$MSA_Pos_Node_Char_or_Gap{$MSA_Pos}{$Node}\n";
}
}
}
### PRINT THE GAP and CHAR Ancestral MSA
# One FASTA record is written per node. Nodes present in %MSA_Hash (original
# sequences) are echoed verbatim; every other node is assembled position by
# position from the "CHAR:PROB" entries of %MSA_Pos_Node_Char_or_Gap_Parsimony,
# keeping only the CHAR part.
open (MSA_OUT_PARSIMONY,">$Ancestral_MSA_Parsimony") || die "Can't open Output MSA PARSIMONY : '$Ancestral_MSA_Parsimony' $!\n";
foreach my $Node (@Nodes)
{
    print MSA_OUT_PARSIMONY ">$Node\n";
    if (!exists $MSA_Hash{$Node})
    {
        # Reconstructed node: emit the chosen character (or gap) of each position
        foreach my $Pos (1..$MSA_Length)
        {
            my @Char_and_Prob=split(":",$MSA_Pos_Node_Char_or_Gap_Parsimony{$Pos}{$Node});
            print MSA_OUT_PARSIMONY $Char_and_Prob[0];
        }
        print MSA_OUT_PARSIMONY "\n";
        next;
    }
    # Original sequence
    print MSA_OUT_PARSIMONY "$MSA_Hash{$Node}\n";
}
close (MSA_OUT_PARSIMONY);
sub Read_MSA_to_Indels_Info
# Will create an hash that map each position on the MSA to the translated indel (or indels)
#
# Arguments:
#   $IndelInfo                     - path of the indel description file
#   $MSA_Pos_Species_to_Indel_Reff - ref to hash: {MSA pos}{species} => [indel numbers]
#   $MSAtoIndel_Reff               - ref to hash: {MSA pos} => [indel numbers]
# Returns:
#   (\%MSA_Pos_Species_to_Indel,\%MSAtoIndel) - top-level copies of the two input
#   hashes with the indels read from the file appended. The copies are shallow:
#   lists that already existed in the input are extended in place.
# Dies when the indel file cannot be opened.
{
    # Record layout read from the file (positions and indel numbers are 0-based
    # in the file and converted to 1-based here):
    #character number: 0
    #Start position relative to MSA: 0
    #End position relative to MSA: 1
    #Length: 1
    #Found in species: DQ373066.PTT Start position relative to genome: 0 Length: 1
    #ENDCHARACTER
    print "MAPPING MSA POS TO INDEL\n==============================================================\n" if ($DEBUG eq "YES");
    my $IndelInfo=shift;
    my $MSA_Pos_Species_to_Indel_Reff=shift;
    my $MSAtoIndel_Reff=shift;
    my %MSA_Pos_Species_to_Indel=%$MSA_Pos_Species_to_Indel_Reff;
    my %MSAtoIndel=%$MSAtoIndel_Reff;
    open (my $INDELS,"<",$IndelInfo) || die "Can't open IndelInfo File: '$IndelInfo' $!";
    my $IndelPos="";
    my $MSA_Pos="";
    while (my $line=<$INDELS>)
    {
        chomp ($line);
        if ($line=~/character number: ([0-9]+)/)
        {
            $IndelPos=$1+1; # Indel Pos start from 0
        }
        elsif ($line =~/Start position relative to MSA: ([0-9]+)/)
        {
            $MSA_Pos=$1+1; # MSA Pos start from 0
        }
        elsif ($line=~/Found in species: (.*?) Start position relative to genome: ([0-9]+) Length: ([0-9]+)/)
        {
            my $Species=$1;
            my $length=$3;
            # Register the current indel on every MSA column it spans in this species
            for (my $i=0;$i<$length;$i++)
            {
                my $tmpPosOnMSA=$MSA_Pos+$i;
                push (@{$MSA_Pos_Species_to_Indel{$tmpPosOnMSA}{$Species}},$IndelPos);
                push (@{$MSAtoIndel{$tmpPosOnMSA}},$IndelPos);
                print "$tmpPosOnMSA\t",$Species,"\t",join(",",@{$MSAtoIndel{$tmpPosOnMSA}}),"\n" if ($DEBUG eq "YES"); # QA
            }
        }
        print "===========================\n" if ($DEBUG eq "YES");
    }
    close ($INDELS);
    return (\%MSA_Pos_Species_to_Indel,\%MSAtoIndel);
}
sub Read_Ancestral_Parsimony_State
# Read the parsimony indel reconstruction into {POS}{Node} => 0 (char) | 1 (indel).
#
# Arguments:
#   $AncestralReconstructParsimony           - path of the tab-delimited parsimony file
#   $AncestralReconstructIndelParsimony_Reff - ref to a hash to start from
# Returns a reference to a top-level copy of the input hash extended with the
# states read from the file. Dies when the file cannot be opened.
{
    my $AncestralReconstructParsimony=shift;
    my $AncestralReconstructIndelParsimony_Reff=shift;
    my %AncestralReconstructIndelState=%$AncestralReconstructIndelParsimony_Reff;
    open (my $ANCESTRAL_INDEL_STATE,"<",$AncestralReconstructParsimony) || die "Can't open AncestralReconstructParsimony: '$AncestralReconstructParsimony' $!";
    # The file starts with six header lines that carry no data:
    # print with MP based on the cost matrix:
    # 0->0 =0
    # 0->1 =2
    # 1->0 =1
    # 1->1 =0
    #POS Node State
    for (1..6)
    {
        my $header=<$ANCESTRAL_INDEL_STATE>;
    }
    while (my $line=<$ANCESTRAL_INDEL_STATE>)
    {
        chomp ($line);
        my ($POS,$Node,$State)=split(/\t/,$line);
        next if (!defined $State); # blank or malformed line - nothing to record
        # State 0 is a character; anything else is recorded as an indel
        $AncestralReconstructIndelState{$POS}{$Node}=($State==0) ? 0 : 1;
    }
    close ($ANCESTRAL_INDEL_STATE);
    return \%AncestralReconstructIndelState;
}
sub Read_Ancestral_Prob_For_Indel
# Read the posterior probabilities of the indel reconstruction into {POS}{Node} => Prob.
#
# Arguments:
#   $AncestralReconstructPosterior           - path of the tab-delimited file
#                                              (columns: POS, Node, State, Prob)
#   $AncestralReconstructIndelPosterior_Reff - ref to a hash to start from
# Returns a reference to a top-level copy of the input hash extended with the
# probabilities read from the file. Dies when the file cannot be opened.
{
    my $AncestralReconstructPosterior=shift;
    my $AncestralReconstructIndelPosterior_Reff=shift;
    my %AncestralReconstructIndelPosterior=%$AncestralReconstructIndelPosterior_Reff;
    print "Read_Ancestral_Prob_For_Indel: $AncestralReconstructPosterior $AncestralReconstructIndelPosterior_Reff\n=========================================================================================\n" if ($DEBUG eq "YES"); # DEBUG
    open (my $ANCESTRAL_INDEL_PROB,"<",$AncestralReconstructPosterior) || die "IndelReconstruction_Wrapper.pl:Can't open AncestralReconstructPosterior: '$AncestralReconstructPosterior' $!";
    my $line=<$ANCESTRAL_INDEL_PROB>; # the first line is skipped (header)
    while ($line=<$ANCESTRAL_INDEL_PROB>)
    {
        chomp ($line);
        my ($POS,$Node,$State,$Prob)=split(/\t/,$line);
        $AncestralReconstructIndelPosterior{$POS}{$Node}=$Prob;
        print "AncestralReconstructIndelPosterior{$POS}{$Node}=$Prob\n" if ($DEBUG eq "YES"); # DEBUG
    }
    close ($ANCESTRAL_INDEL_PROB);
    return \%AncestralReconstructIndelPosterior;
}
sub remove_InternalNodeName_or_BPvalues {
    # Strip internal node names (N<number>) and bootstrap values from a Newick tree file.
    #
    # Arguments:
    #   $IN_treeFile  - tree file to clean; rewritten in place (as a single line) when changed
    #   $OLD_treeFile - where the untouched original is moved to before rewriting
    # Returns "yes" when the tree was modified (and the files were written), "no" otherwise.
    # Dies when the tree cannot be read, backed up or rewritten.
    my $IN_treeFile=shift;
    my $OLD_treeFile=shift;
    my $treeFileOneLine="";
    open(my $TREEFILE,"<",$IN_treeFile) || die "IndelReconstruction_Wrapper.pl:remove_InternalNodeName_or_BPvalues: Can't open TREEFILE for reading '$IN_treeFile' $!";
    while (my $line=<$TREEFILE>) {
        chomp($line);
        $treeFileOneLine .= $line;
    }
    close ($TREEFILE);
    my $changed = "no";
    # s///g returns the number of substitutions, so each test both edits and reports
    $changed = "yes" if ($treeFileOneLine =~ s/\)N[0-9]+:/\):/g); # remove internal nodes names in the BP place
    $changed = "yes" if ($treeFileOneLine =~ s/\)N[0-9]+;/\);/g); # remove last (root) internal node name, any number of digits
    $changed = "yes" if ($treeFileOneLine =~ s/\)\d*\.?\d+\:/\)\:/g); # replace bootstrap values which look like this: ((A:0.02,B:0.03)40:0.3);
    $changed = "yes" if ($treeFileOneLine =~ s/(\d*\.?\d+)\[\d*\.?\d+\]/$1/g); # replace bootstrap values which look like this:(A:0.4,(B:0.1,C:0.1):0.3[40]);
    if ($changed eq "yes") {
        # Keep the original before overwriting; never overwrite without a backup
        rename ($IN_treeFile, $OLD_treeFile) || die "IndelReconstruction_Wrapper.pl:remove_InternalNodeName_or_BPvalues: Can't rename '$IN_treeFile' to '$OLD_treeFile' $!";
        open (my $TREE_REMOVED,">",$IN_treeFile) || die "IndelReconstruction_Wrapper.pl:remove_InternalNodeName_or_BPvalues: Can't open '$IN_treeFile' for writing $!";
        print $TREE_REMOVED $treeFileOneLine."\n";
        close ($TREE_REMOVED) || die "IndelReconstruction_Wrapper.pl:remove_InternalNodeName_or_BPvalues: Can't close '$IN_treeFile' $!";
    }
    return $changed;
}
sub uniq_array
# Return a reference to a new array holding the distinct items of the array
# referenced by the argument, in string sort order.
{
    my $ReffToArray=shift;
    my %seen;
    @seen{@$ReffToArray}=(); # hash slice: one key per distinct item
    return [sort keys %seen];
}
sub Read_Char_Marginal_Prob
# Parse a marginal-probabilities file.
#
# Argument:
#   $Chars_MarginalProb_File - path of the file; it holds lines of the form
#       "marginal probabilities at position: <N>" followed by lines of the form
#       "of node: <name>: p(<CHAR>)=<prob> p(<CHAR>)=<prob> ..."
# Returns (\%Chars_MarginalProb,\@Nodes_Name,$MSA_Length) where
#   %Chars_MarginalProb - Key1: MSA_Pos, Key2: node, Key3: Char, Value: MarginalProb
#   @Nodes_Name         - node names in the order they appear for position 1
#   $MSA_Length         - number of position lines seen
# When the file cannot be opened a single error string is returned instead.
{
    my $Chars_MarginalProb_File=shift;
    my %Chars_MarginalProb=(); #Key1: MSA_Pos, Key2:Species, Key3:Char, Value:MarginalProb
    my @Nodes_Name=();
    my $MSA_Length=0;
    open (my $MARGINAL_PROB,"<",$Chars_MarginalProb_File) || return "Could Not Open the MarginalProb_File: '$Chars_MarginalProb_File' $!";
    my $MSA_Pos="";
    while (my $line=<$MARGINAL_PROB>)
    {
        if ($line=~/marginal probabilities at position: ([0-9]+)/)
        {
            $MSA_Pos=$1;
            $MSA_Length++;
        }
        elsif ($line=~/of node: (.*?): /)
        {
            my $node=$1;
            # Collect node names once, from the first position only
            push (@Nodes_Name,$node) if (($MSA_Pos ne "") and ($MSA_Pos==1));
            my @Chars_Prob=$line=~/p\([A-Z]+\)=[0-9\.\-]+/g;
            foreach my $Char_Prob (@Chars_Prob)
            {
                if ($Char_Prob=~/p\(([A-Z]+)\)=([0-9\.\-]+)/)
                {
                    my $char=$1;
                    my $prob=$2;
                    $Chars_MarginalProb{$MSA_Pos}{$node}{$char}=$prob;
                }
            }
        }
    }
    close ($MARGINAL_PROB);
    return (\%Chars_MarginalProb,\@Nodes_Name,$MSA_Length);
}
sub readMSA
# Load a FASTA alignment.
# Argument: path of the MSA file.
# Returns a reference to a hash: sequence name (header line without its first
# character) => sequence (all following lines up to the next ">" joined together).
# Dies when the file cannot be read.
{
    my $MSA=shift;
    my %MSA_Hash=();
    open (my $in, "<",$MSA) || die "IndelReconstruction_Wrapper:readMSA: Can't read the MSA '$MSA' $!";
    my $currentName;
    my $isFirstLine=1;
    while (defined (my $row=<$in>))
    {
        # The first line of the file is always taken as a header;
        # afterwards a header is any line that starts with ">"
        my $isHeader=($isFirstLine or (substr($row,0,1) eq ">"));
        chomp $row;
        if ($isHeader)
        {
            $isFirstLine=0;
            $currentName=substr($row,1);
            $MSA_Hash{$currentName}="";
            next;
        }
        $MSA_Hash{$currentName}.=$row;
    }
    # close file
    close ($in);
    return \%MSA_Hash;
}
FastML.v3.1/www/fastml/SampleSeqFromProb.pl 0000644 0176723 0002036 00000007262 12272415322 017562 0 ustar haim pupko use strict;
# Positional command-line arguments
my $FullProbFile=shift;     # CSV file: one header line, then NODE,SITE,PROBS BY AB
my $Node=shift;             # the node whose rows are used for sampling
my $NumOfSeqToSample=shift; # how many sequences to sample
my $SeqType=shift;          # aa | nuc | codon
my $OutFile=shift;          # FASTA file the sampled sequences are written to
my $isServer=shift;         # "YES" - also update the results page next to $OutFile
# The alphabet; its order is the order of the probability columns in the file
my @AB=();
my $AB_SIZE;
if ($SeqType eq "nuc")
{
    @AB=qw(A C G T);
    $AB_SIZE=4;
}
if ($SeqType eq "aa")
{
    @AB=qw(A C D E F G H I K L M N P Q R S T V W Y);
    $AB_SIZE=20;
}
if ($SeqType eq "codon")
{
    @AB=qw(AAA AAC AAG AAT ACA ACC ACG ACT AGA AGC AGG AGT ATA ATC ATG ATT CAA CAC CAG CAT CCA CCC CCG CCT CGA CGC CGG CGT CTA CTC CTG CTT GAA GAC GAG GAT GCA GCC GCG GCT GGA GGC GGG GGT GTA GTC GTG GTT TAC TAT TCA TCC TCG TCT TGC TGG TGT TTA TTC TTG TTT);
    $AB_SIZE=61;
}
my %ProbPerSite=(); # hash of array with prob for each pos
open (PROB_FILE,$FullProbFile) || die "Can't open The Full Prob File '$FullProbFile' $!";
my $SeqLength=0; # the highest site number seen for $Node
my $line=<PROB_FILE>; # header
while ($line=<PROB_FILE>)
{
    chomp ($line);
    my @line=split(",",$line); # NODE,SITE,PROBS BY AB
    my $CurrNode=shift(@line);
    my $CurrPos=shift(@line);
    if ($CurrNode eq $Node)
    {
        $ProbPerSite{$CurrPos}=[@line];
        $SeqLength=$CurrPos if ($CurrPos>$SeqLength);
    }
}
close (PROB_FILE);
open (OUT,">$OutFile") || die "Can't open Out: '$OutFile' $!";
# Sample $NumOfSeqToSample sequences, one symbol per position, and write them
# as FASTA records named 1..$NumOfSeqToSample.
# All sequence types share this loop; an earlier codon-specific variant
# (iterating up to $SeqLength/3) was commented out in the original script.
for (my $SeqNum=0;$SeqNum<$NumOfSeqToSample;$SeqNum++)
{
    my $RandomSeq="";
    for (my $pos=1;$pos<=$SeqLength;$pos++)
    {
        my $Rand=rand();
        my $i=0;
        my $Size=@{$ProbPerSite{$pos}};
        print "SIZE OF PROB VECTOR at POS $pos:$Size\n" if ($Size<$AB_SIZE);
        # Walk along the probability vector, subtracting each entry from the
        # random draw, until the remainder (with a 0.0001 tolerance) drops
        # below the current entry or the last symbol is reached.
        while(($Rand+0.0001 >= $ProbPerSite{$pos}[$i]) and ($i<$AB_SIZE-1))
        {
            $Rand=$Rand-$ProbPerSite{$pos}[$i];
            $i++;
        }
        print "UNDIFINED:$i for RAND $Rand and vector ",join (",",@{$ProbPerSite{$pos}}) if (!defined $AB[$i]);
        $RandomSeq=$RandomSeq.$AB[$i];
    }
    print OUT ">",$SeqNum+1,"\n$RandomSeq\n";
}
# Closing explicitly surfaces buffered write errors (e.g. a full disk)
close (OUT) || die "Can't close Out: '$OutFile' $!";
if ($isServer eq "YES")
{
    # Update the output page
    #######################################
    # The page lives next to $OutFile: output.php when it exists, otherwise output.html
    my $OutDir=getDir($OutFile);
    my $OutPage=$OutDir."output.html";
    if (-e $OutDir."output.php")
    {
        $OutPage=$OutDir."output.php";
    }
    # Read the whole page, then rewrite it line by line
    open (OUTPUT,"<",$OutPage) || die "Can't open '$OutPage' $!";
    my @out=<OUTPUT>;
    close (OUTPUT);
    open (OUTPUT,">",$OutPage) || die "Can't open '$OutPage' for writing $!";
    my $SampledSeq_Section=0;
    foreach my $line (@out)
    {
        if ($line=~/sequences from the posterior distribution for ancestral node/)
        {
            # Entering the sampling section of the page
            $SampledSeq_Section=1;
            print OUTPUT $line;
        }
        elsif (($line=~/form/) and ($SampledSeq_Section==1))
        {
            # First line mentioning "form" inside the section: add the message right after it
            print OUTPUT $line;
            # NOTE(review): $FileNoPath is computed but not used below; the message
            # presumably linked to this file and the markup was lost - TODO confirm
            my $FileNoPath=getFilename($OutFile);
            print_message_to_output("$NumOfSeqToSample sequences sampled from the posterior distribution for ancestral node $Node");
            $SampledSeq_Section=0;
        }
        else
        {
            print OUTPUT $line;
        }
    }
    close (OUTPUT);
}
#---------------------------------------------
sub print_message_to_output{
#---------------------------------------------
    # Write one message to the results page through the OUTPUT filehandle,
    # which the caller must already have opened for writing.
    # Argument: $msg - the text to show.
    # NOTE(review): the original markup around the message was lost from this
    # copy (only "\n\n" was printed and $msg was dropped); a single-item bullet
    # list is used here - confirm against the page layout.
    my $msg = shift;
    print OUTPUT "\n<ul><li>$msg</li></ul>\n";
}
# Returns the part of a path that follows its last "/" or "\";
# a path without any separator is returned unchanged.
sub getFilename{
    my $fullFile = pop @_;
    my ($nameOnly) = ($fullFile =~ m/.*[\\\/](.*)$/);
    return $fullFile unless (defined $nameOnly);
    return $nameOnly;
}
sub getDir{
    # Returns the directory part of a path, including the final "/" or "\";
    # an empty string when the path has no separator.
    my $fullFile = pop @_;
    my ($dirOnly) = ($fullFile =~ m/(.*[\\\/]).*$/);
    return (defined $dirOnly) ? $dirOnly : '';
}
FastML.v3.1/www/fastml/BuildRaxMLTree.pl 0000644 0176723 0002036 00000013122 12160632357 017001 0 ustar haim pupko use strict;
use FileHandle;
use Bio::SeqIO;
use Bio::AlignIO;
# Positional command-line arguments
my $MSA=shift;        # input alignment (FASTA)
my $OutTree=shift;    # name of the output tree; only its file-name part is used
my $WorkingDir=shift; # directory RAxML runs in and all files are written to
my $Model=shift; #Available AA substitution models: DAYHOFF, DCMUT, JTT, MTREV, WAG, RTREV, CPREV, VT, BLOSUM62, MTMAM, LG, MTART, MTZOA, PMB, HIVB, HIVW, JTTDCMUT, FLU, GTR
#NUC: GTRCAT
my $MSA_Name=$MSA; # IF WITHOUT PATH
if ($MSA=~/([^\/]+)$/){$MSA_Name=$1;} # NAME WITHOUT PATH
my $OutTree_Suffix=$OutTree; # IF WITHOUT PATH
if ($OutTree=~/([^\/]+)$/){$OutTree_Suffix=$1;} # NAME WITHOUT PATH
# Make sure the directory ends with a separator. The test must be anchored to
# the end: any path with a "/" somewhere (e.g. "/tmp/work") would otherwise be
# glued directly to the file names below.
$WorkingDir=$WorkingDir."/" if ($WorkingDir!~/\/$/);
my $Codes2NameIndex=$WorkingDir."$MSA_Name"."Codes2NamesIndex.txt";
my $CodedMSA=$WorkingDir."$MSA_Name".".coded.aln";
my $CodedMSAPhylip=$WorkingDir."$MSA_Name".".coded.Phylip";
# Convert Names to numbers
my $ans=name2codeFastaFrom1("$MSA",$Codes2NameIndex,$CodedMSA);
die "BuildRaxMLTree: name2codeFastaFrom1 failed: $ans\n" if ($ans ne "ok");
# Convert To Phylip
convertMsaFormat($CodedMSA,$CodedMSAPhylip,"fasta","phylip");
#my $convert_cmd="readseq -a -f12 $CodedMSA > $CodedMSAPhylip";
#system ($convert_cmd);
# Run RaxML
$Model="PROTCAT".$Model if ($Model ne "GTRCAT");
# "&&" - do not start RAxML when changing into the working directory failed
my $RaxML_cmd="cd $WorkingDir && raxmlHPC -s $CodedMSAPhylip -n $OutTree_Suffix"." -m $Model";
print "$RaxML_cmd\n";
system ($RaxML_cmd) == 0 or die "BuildRaxMLTree: RAxML command failed (status $?): $RaxML_cmd\n";
# Bring Back names to tree
my $RaxMLTree="RAxML_bestTree.$OutTree_Suffix";
code2nameTree($Codes2NameIndex,$WorkingDir.$RaxMLTree,$WorkingDir."$OutTree_Suffix");
sub name2codeFastaFrom1 {
    ####################################################################################################################
    # Convert the names in a fasta file to numbers, and creates a code file with the names and the codes (running number)
    #
    # Arguments:
    #   $in_fileName    - FASTA file to read
    #   $code_fileName  - file to write with one "name<TAB>code" line per sequence
    #   $out_fileName   - FASTA file to write, with the codes as sequence names
    #   $counter_offset - optional first code; defaults to 1 (0 is also treated as 1)
    # Returns "ok" on success, or an error string when an output file cannot be written.
    ###################################################################################################################
    my $in_fileName = shift;
    my $code_fileName = shift;
    my $out_fileName = shift;
    my $counter_offset=shift; # optional
    my $in_file = Bio::SeqIO->new(-file => $in_fileName , '-format' => 'Fasta');
    my $code_file = FileHandle->new($code_fileName,"w") or return ("Can't write to $code_fileName $!");
    my $out_file = FileHandle->new($out_fileName,"w") or return ("Can't write to $out_fileName");
    $counter_offset=1 if (!defined $counter_offset);
    $counter_offset=1 if ($counter_offset==0);
    my $counter = $counter_offset;
    while ( my $seqObj = $in_file->next_seq() ) {
        # The full name is the id plus the description, when there is one
        my $name = $seqObj->display_id();
        $name.= " ".$seqObj->desc() if ($seqObj->desc());
        print $code_file "$name\t$counter\n";
        my $seq = $seqObj->seq();
        print $out_file ">$counter\n";
        # NOTE(review): the body of this loop was truncated in this copy of the
        # file; the sequence is written here in lines of 60 characters -
        # confirm the width against the original script.
        for (my $i=0;$i<length($seq);$i+=60) {
            print $out_file substr($seq,$i,60)."\n";
        }
        $counter++;
    }
    $out_file->close();
    $in_file->close();
    $code_file->close();
    return "ok";
}
sub code2nameTree
{
    ###############################################################################################################
    # Works together (or rather after) the script names2codeFasta.pl. Takes a tree created based on
    # a fasta file with codes, and reverts the codes to the names. Required input is a code file which is created by
    # names2codeFasta.pl
    # ** very useful for working with all phyml and such, since these programs chop the name to 10 chars
    #
    # Arguments:
    #   $code2nameFile - file with one "name<TAB>code" line per sequence
    #   $treeFile      - Newick tree whose leaves are the numeric codes
    #   $newFile       - file to write the tree with the names restored
    #   $nameLength    - optional (default 30): a name containing "|" followed by at least
    #                    this many characters is cut that many characters after the "|"
    # Dies when one of the files cannot be opened. Prints "code name" for every code read.
    ###############################################################################################################
    my $code2nameFile = shift;
    my $treeFile = shift;
    my $newFile = shift;
    my $nameLength = shift;
    if (!defined $nameLength) {
        $nameLength = 30;
    }
    my %names2code; # key: code, value: name
    open (my $CODES, "<", $code2nameFile) || die "code2nameTree: Can't open the codes file '$code2nameFile' $!";
    while (my $line=<$CODES>){
        # Skip lines that are not "name<TAB>code"; otherwise stale captures would be reused
        next unless ($line =~ /(.+)\t(\d+)/);
        my $name = $1;
        my $code = $2;
        $name =~ s/[\[\]\,\:\;\(\)]/_/g; #remove characters that are newick format associated
        if ($name =~ m/(.*\|.{$nameLength})/) {
            $name = $1;
        }
        $names2code{$code}=$name;
        print "$code $name\n";
    }
    close ($CODES);
    open (my $TREE, "<", $treeFile) || die "code2nameTree: Can't open the tree file '$treeFile' $!";
    my $full_tree = "";
    while (my $line2 = <$TREE>){ # this assumes there are bootstrap values on the input tree
        chomp $line2;
        $full_tree.=$line2;
    }
    close ($TREE);
    open (my $NEWTREE, ">", $newFile) || die "code2nameTree: Can't open the new tree file '$newFile' for writing $!";
    # Every leaf is followed by ":", so after splitting on ":" a leaf code is
    # always at the end of a field
    my @fields = split(/:/, $full_tree);
    foreach my $field (@fields) {
        if ($field =~ /[\,\(](\d+)$/) { # a leaf comes either after a "(" or a ","
            # A code that is missing from the codes file is left as it is
            $field =~ s/(\d+)$/exists $names2code{$1} ? $names2code{$1} : $1/e;
        }
        if ($field !~/;$/) {print $NEWTREE "$field:";}
        else {print $NEWTREE "$field";} # Last One
    }
    print $NEWTREE "\n";
    close ($NEWTREE) || die "code2nameTree: Can't close the new tree file '$newFile' $!";
}
sub convertMsaFormat
{
    # Convert an alignment file from one format to another with Bio::AlignIO.
    # Arguments: input file, output file, input format, output format.
    # The four arguments are echoed to STDOUT before the conversion starts.
    my ($inFile,$outFile,$inFormat,$outFormat)=@_;
    #die "usage: convertMsaFormat.pl \n"
    print "inFile = '$inFile'\n";
    print "outFile = '$outFile'\n";
    print "inFormat = '$inFormat'\n";
    print "outFormat = '$outFormat'\n";
    my $reader = Bio::AlignIO->new( '-format' => $inFormat , -file => $inFile);
    my $writer = Bio::AlignIO->new( '-format' => $outFormat , -file => ">$outFile");
    while (my $alignment = $reader->next_aln()) {
        $alignment->verbose(1);
        # Otherwise, bioperl adds sequence start/stop values, causing problems
        # with clustal/bali_score
        $alignment->set_displayname_flat();
        $writer->write_aln($alignment);
    }
}
FastML.v3.1/www/bioSequence_scripts_and_constants/ 0000755 0176723 0002036 00000000000 12272452624 021324 5 ustar haim pupko FastML.v3.1/www/bioSequence_scripts_and_constants/GENERAL_CONSTANTS.pm 0000755 0176723 0002036 00000051130 12272424010 024362 0 ustar haim pupko #!/usr/bin/perl
package GENERAL_CONSTANTS; #don't forget: a package must end with a return value (1; in the end)!!!!!
# constants to use when sending e-mails using the server admin's email address.
# NOTE(review): in this copy the sender value ended with an escaped quote
# ("TAU BioSequence \"), which left the string unterminated and broke the
# compilation of the whole package. The address part that presumably followed
# the display name is missing - TODO: fill it in per deployment, together with
# the empty user name, password and SMTP server below.
use constant ADMIN_EMAIL => "TAU BioSequence";
use constant ADMIN_USER_NAME => "";
use constant ADMIN_PASSWORD => "";
#use constant SMTP_SERVER => "";
use constant SMTP_SERVER => "";
# the name of the list of all running processes
use constant QUEUING_JOBS => "/bioseq/bioSequence_scripts_and_constants/queuing_jobs.list";
use constant RUNNING_JOBS => "/bioseq/bioSequence_scripts_and_constants/running_jobs.list";
use constant SUBMITTED_JOBS => "/bioseq/bioSequence_scripts_and_constants/submitted_jobs.list";
use constant JOBS_ON_BIOSEQ_NODE => "/bioseq/bioSequence_scripts_and_constants/jobs_on_bioc.01_node.list";
use constant JOBS_WAITING_BIOSEQ_NODE => "/bioseq/bioSequence_scripts_and_constants/jobs_waiting_bioc.01_node.list";
use constant CONSURF_RUNNING_JOBS => "/bioseq/bioSequence_scripts_and_constants/consurf_running_jobs.list";
use constant SELECTON_RUNNING_JOBS => "/bioseq/bioSequence_scripts_and_constants/selecton_running_jobs.list";
use constant CONSEQ_RUNNING_JOBS => "/bioseq/bioSequence_scripts_and_constants/conseq_running_jobs.list";
use constant PEPITOPE_RUNNING_JOBS => "/bioseq/bioSequence_scripts_and_constants/pepitope_running_jobs.list";
# Databases urls
use constant PROTEOPEDIA => "http://proteopedia.org/wiki/index.php/";
use constant PDB_DB => "http://www.rcsb.org/pdb/explore/explore.do?structureId=";
use constant RCSB_WGET=> "wget ftp://ftp.wwpdb.org/pub/pdb/data/structures/all/pdb/";
use constant RCSB => "http://www.rcsb.org/";
use constant PISA_WGET => "wget http://www.ebi.ac.uk/msd-srv/pisa/cgi-bin/multimer.pdb?";
# CGIs paths
use constant CONSURF_CGI_DIR => "/var/www/cgi-bin/ConSurf";
#general paths
use constant SERVERS_RESULTS_DIR => "/bioseq/data/results/";
use constant SERVERS_LOGS_DIR => "/bioseq/data/logs/";
#use constant SEND_EMAIL_DIR => "/db1/Local/src/sendEmail"; # path on biocluster
use constant SEND_EMAIL_DIR => "/bioseq/bioSequence_scripts_and_constants/sendEmail";
use constant SEND_EMAIL_DIR_IBIS => "/bioseq/bioSequence_scripts_and_constants/sendEmail"; # path on ibis
use constant DAEMON_LOG_FILE => "/bioseq/bioSequence_scripts_and_constants/daemon.log";
use constant UPDATE_RUN_TIME_LOG_FILE => "/bioseq/bioSequence_scripts_and_constants/update_runTime.log";
use constant CONSURF_CGI => "/var/www/cgi-bin/ConSurf"; #on ibis
use constant BIOSEQ_TEMP => "/bioseq/temp/";
# servers urls:
use constant SELECTON_URL => "http://selecton.tau.ac.il";
use constant CONSEQ_URL => "http://conseq.tau.ac.il/";
use constant CONSURF_URL => "http://consurf.tau.ac.il/";
use constant NEW_CONSURF_URL => "http://consurf.tau.ac.il/"; #"http://consurftest.tau.ac.il/";
use constant EPITOPIA_URL => "http://epitopia.tau.ac.il/";
use constant PEPITOPE_URL => "http://pepitope.tau.ac.il/";
use constant QMF_URL => "http://quasimotifinder.tau.ac.il/";
use constant PATCHFINDER_URL => "http://patchfinder.tau.ac.il/";
#use constant FASTML_URL => "http://ibis.tau.ac.il/fastml/";
use constant FASTML_URL => "http://fastml.tau.ac.il/";
use constant RECONST_URL => "http://fastml.tau.ac.il/reconst/";
use constant GAIN_LOSS_URL => "http://gloome.tau.ac.il/";
use constant CONSURF_DB_URL => "http://consurfdb.tau.ac.il/";
#use constant GILAD_SERVER_URL => "http://consurftest.tau.ac.il/Gilad/";
use constant GILAD_SERVER_URL => "http://mud.tau.ac.il/";
use constant MCPep_URL => "http://bental.tau.ac.il/MCPep/";
use constant GUIDANCE_URL => "http://guidance.tau.ac.il/";
use constant GUIDANCE_INDELS_URL => "http://guidance.tau.ac.il/indels/";
use constant SPECBOOST_URL => "http://bental.tau.ac.il/specBoost/";
use constant PROMAYA_URL => "http://bental.tau.ac.il/ProMaya/";
use constant HOMOLOGY_SEARCH_URL => "http://fastml.tau.ac.il/HomologySearch/";
use constant COPAP_URL => "http://copap.tau.ac.il/";
#servers logs:
use constant CONSURF_LOG => "/bioseq/ConSurf_old/consurf.log";
use constant CONSURF_NEW_LOG => "/bioseq/ConSurf/consurf.log";
use constant SELECTON_LOG => "/bioseq/Selecton/selecton.log";
use constant EPITOPIA_LOG => "/bioseq/epitopia/epitopia.log";
use constant CONSEQ_LOG => "/bioseq/ConSeq/conseq.log";
use constant PEPITOPE_LOG => "/bioseq/pepitope/pepitope.log";
use constant RECONST_LOG => "/bioseq/ReConst_Server/reconst.log";
use constant MCPep_LOG => "/bioseq/MCPep/mcpep.log";
use constant Guidance_LOG => "/bioseq/Guidance/guidance.log";
use constant Guidance_Indels_LOG => "/bioseq/GuidanceIndels/guidance_Indels.log";
use constant MuD_LOG => "/bioseq/Gilad_Server/MuD.log";
use constant FASTML_LOG => "/bioseq/FastML/fastml.log";
use constant SPECBOOST_LOG => "/bioseq/specBoost/specBoost.log";
use constant GAIN_LOSS_LOG => "/bioseq/GainLoss/GainLoss.log";
use constant PROMAYA_LOG => "/bioseq/ProMaya/ProMaya.log";
use constant COPAP_LOG => "/bioseq/CoPAP/CoPAP.log";
#servers results urls:
# servers urls:
use constant SELECTON_RESULTS_URL => SELECTON_URL."/results/";
#external databases
#use constant PQS=> "/bioseq/data/results/PQS/";
use constant PQS=> "/biodb/PQS/";
use constant PDB_DIVIDED => "/biodb/PDB/data/structures/divided/pdb/";
use constant SWISSPROT_DB => "/biodb/BLAST/Proteins/swissprot";
use constant UNIPROT_DB => "/biodb/BLAST/Proteins/uniprot";
use constant CLEAN_UNIPROT_DB => "/biodb/BLAST/Proteins/clean_uniprot";
use constant UNIREF90_DB => "/biodb/BLAST/Proteins/uniref90";#"/groups/bioseq.home/HAIM/UNIREF90/uniref90";
use constant PDBAA_NCBI=> "/biodb/BLAST/Proteins/pdbaa";
use constant CULLED_PDB => "/groups/bioseq.home/HAIM/PDBAA/pdbaaent"; # TO CHANGE TO: /biodb/BLAST/dunbrack.fccc.edu/Guoli/culledpdb/pdbaaent_dun
use constant PDB_DUNBRACK => "/groups/bioseq.home/HAIM/PDBAA/pdbaa"; # TO CHANGE TO: /biodb/BLAST/dunbrack.fccc.edu/Guoli/culledpdb/pdbaa_dun
use constant NR_PROT_DB => "/biodb/BLAST/Proteins/nr";
use constant NR_NUC_DB => "/biodb/BLAST/Nucleotides/nt";
use constant UNIPROT_DAT_INDEX => "/bioseq/data/results/GB_CDS/uniprot.dat.bp_index";
use constant PDB_TO_UNIPROT => "/bioseq/data/results/PDB_to_UNIPROT/idmapping_PDB_UNIPROTKB.dat";#"/biodb/idmapping_PDB_UNIPROTKB.dat";
use constant PDB_TO_UNIPROT_test => "/biodb/idmapping_PDB_UNIPROTKB.dat";
#internal databases
use constant EPITOPIA_DATA => "/bioseq/epitopia/data";
#external programs
use constant BLASTALL => "/opt/bio/ncbi/bin/blastall"; #"/opt/Bio/ncbi/bin/blastall"; # on the lecs
use constant BLASTPGP => "blastpgp"; # "/opt/Bio/ncbi/bin/blastpgp"; # on the lecs
use constant CS_BLAST => "/share/apps/csblast-2.1.0-linux64/csblast_static"; # on the lecs
use constant MUSCLE_LECS => "/share/apps/bin/muscle"; # on the lecs
use constant MUSCLE => "/usr/local/bin/muscle"; # on the biocluster
use constant MUSCLE_3_6 => "/bioseq/Programs/muscle_3.6_from_BIOCLUSTER/muscle3.6/muscle"; # for servers who came from biocluster (Selecton?, old ConSurf, ConSeq)
use constant CLUSTALW_LECS => "/share/apps/bin/clustalw"; # on the lecs
use constant CLUSTALW => "/usr/local/bin/clustalw"; # on the biocluster
use constant CLUSTALW_1_82 => "/bioseq/Programs/ClustalW_1.82/clustalw1.82/clustalw"; # for servers who came from biocluster (Selecton?, old ConSurf, ConSeq)
use constant CLUSTALW_1_81 => "/bioseq/Programs/ClustalW_1.81/clustalw1.81/clustalw"; # for servers who came from biocluster (Selecton?, old ConSurf, ConSeq)
use constant CLUSTALW_2_0_10 => "/bioseq/Programs/ClustalW_2.0.10/clustalw-2.0.10-linux-i386-libcppstatic/clustalw2"; # for servers who came from biocluster (Selecton?, old ConSurf, ConSeq)
use constant MAFFT_LINSI => "/usr/local/bin/mafft-linsi"; # on the biocluster
use constant MAFFT => "/usr/local/bin/mafft"; # on the biocluster
#use constant MAFFT_GUIDANCE => "/groups/pupko/privmane/bin/mafft"; #v6.711b
#use constant MAFFT_LINSI_GUIDANCE => "/groups/pupko/privmane/bin/mafft --localpair --maxiterate 1000"; #v6.711b
#use constant MAFFT_GUIDANCE => "/bioseq/Programs/MAFFT_6.711b/mafft"; #v6.711b
use constant MAFFT_GUIDANCE => "/bioseq/Programs/MAFFT_6.833/bin/mafft"; #v6.833
#use constant MAFFT_GUIDANCE => "/bioseq/Programs/MAFFT_6.857/bin/mafft"; #v6.857 !!! make sure: 'setenv MAFFT_BINARIES /bioseq/Programs/MAFFT_6.857/mafft-6.857-with-extensions/binaries' BEFORE
#use constant MAFFT_LINSI_GUIDANCE => "/bioseq/Programs/MAFFT_6.711b/mafft --localpair --maxiterate 1000"; #v6.711b
use constant MAFFT_LINSI_GUIDANCE => "/bioseq/Programs/MAFFT_6.833/bin/mafft --localpair --maxiterate 1000"; #v6.833
#use constant MAFFT_LINSI_GUIDANCE => "/bioseq/Programs/MAFFT_6.857/bin/mafft --localpair --maxiterate 1000"; #v6.857 !!! make sure: 'setenv MAFFT_BINARIES /bioseq/Programs/MAFFT_6.857/mafft-6.857-with-extensions/binaries' BEFORE
use constant PRANK_LECS => "/share/apps/bin/prank"; # on the lecs
use constant PRANK => "/usr/local/bin/prank"; # on the biocluster
use constant T_COFFEE => "/share/apps/T-COFFEE-8.47/bin/binaries/linux/t_coffee"; # requiers setenv PATH /share/apps/T-COFFEE-8.47/bin/binaries/linux:$PATH
use constant PAGAN_LECS => "/share/apps/pagan-msa/bin/pagan"; # requires: "module load gcc/gcc461" before!!
use constant TREE_VIEWER_DIR => "/bioseq/ConSurf_old/treeViewer/";
use constant PACC_path => "/bioseq/ConSeq/external_scripts/PACC/";
use constant RATE4SITE_BIOC_VER => "/bioseq/rate4site/BioCluster_Nov_06_dev/rate4site.exe";
use constant RATE4SITE_SLOW_BIOC_VER => "/bioseq/rate4site/BioCluster_Nov_06_dev/rate4siteSlow.exe";
use constant RATE4SITE => "/db1/Local/src/Rate4SiteSource/r4s_Nov_06_dev/rate4site.exe";
use constant RATE4SITE_SLOW => "/db1/Local/src/Rate4SiteSource/r4s_Nov_06_dev/rate4siteSlow.exe";
use constant RATE4SITE_SLOW_LECS => "/share/apps/bin/rate4site_slow";
use constant RATE4SITE_LOCAL => "/bioseq/rate4site/rate4site";
use constant RATE4SITE_SLOW_LOCAL =>"/bioseq/rate4site/rate4site.doubleRep";
use constant RATE4SITE_WITH_LG => "/bioseq/rate4site/With_LG/rate4site";
use constant RATE4SITE_WITH_LG_SLOW => "/bioseq/rate4site/With_LG/rate4site.doubleRep";
use constant RUBY => "/share/apps/bin/ruby"; #"/usr/bin/ruby";
#use constant CD_HIT_DIR => "/db1/Local/src/cd-hit_redundency/";
use constant CD_HIT_DIR => "/bioseq/cd_hit/";
use constant PREDICT_PACC => "/bioseq/ConSeq/external_scripts/PACC/run.sh";
use constant MSA_to_HSSP => "/bioseq/ConSeq/external_scripts/PACC/MSA2hssp.pl";
#use constant SEMPHY => "/groups/pupko/privmane/alignment/run/semphy"; #on Biocluster
use constant SEMPHY => "/bioseq/Programs/Semphy/semphy.doubleRep";
#internal programs
use constant EPITOPIA_EXECUTABLES => "/bioseq/epitopia/executables";
# constant values
use constant BLAST_MAX_HOMOLOGUES_TO_DISPLAY => 500;
use constant BLAST_PDB_MAX_HOMOLOGUES_TO_DISPLAY => 25;
use constant CONSURF_PIPE_FORM => "/bioseq/ConSurf_old/consurf_pipe.form";
use constant SELECTON_MAX_NUCLEOTIDE => 15000;
use constant MAX_WALLTIME => "96:00:00";
# Queue Details
use constant BIOSEQ_NODE => "bioc01.tau.ac.il"; #Node on BioCluster dedicated to Bioseq runs (Not part of the queue)
#use constant MAX_QUEUE_RUNS => 60;
use constant MAX_QUEUE_RUNS => 999;
# external links
use constant RCSB_WEB => "http://www.rcsb.org/";
use constant PYMOL_WEB => "http://pymol.sourceforge.net/";
use constant CHIMERA_WEB => 'http://www.rbvi.ucsf.edu/chimera/';
use constant CHIMERA_SAVING_FIGURE => 'http://www.cgl.ucsf.edu/chimera/current/docs/UsersGuide/print.html';
use constant CHIMERA_DOWNLOAD => CHIMERA_WEB."download.html";
use constant MSA_CONVERT => 'http://www.ebi.ac.uk/cgi-bin/readseq.cgi';
use constant MSA_FORMATS => 'http://www.ebi.ac.uk/help/formats.html';
# redirect pages
use constant CONSURF_REDIRECT_PAGE => CONSURF_URL."too_many_runs.html";
use constant SELECTON_REDIRECT_PAGE => SELECTON_URL."/too_many_runs.html";
use constant CONSEQ_REDIRECT_PAGE => CONSEQ_URL."too_many_runs.html";
use constant PEPITOPE_REDIRECT_PAGE => PEPITOPE_URL."too_many_runs.html";
#faq pages
use constant CONSURF_TREE_FAQ => CONSURF_URL.'quick_help.html#note5';
#Files Name Conventions
use constant TEMPLATES_LIST_FILE=>"List_of_Templates";
use constant PISA_ERRORS_FILE=>"PISA_Errors";
#---------------------------------------------
sub print_to_output{
    # Reports a fatal system error for a run:
    #   1. appends a generic error notice to the run's output HTML page,
    #   2. e-mails the user via send_mail() unless $recipient is the string "NO",
    #   3. calls stop_reload() on the output page.
    # Arguments (positional): $OutHtmlFile, $server_name, $run_name, $recipient.
    my $OutHtmlFile = shift;
    my $server_name = shift;
    my $run_name = shift;
    my $recipient = shift;
    # Three-argument open on a lexical handle. A failed open is reported on
    # STDERR instead of being silently ignored; the e-mail and stop_reload()
    # steps below are still attempted, as before.
    if (open my $output_fh, '>>', $OutHtmlFile) {
        flock $output_fh, 2; # 2 = exclusive lock
        print {$output_fh} "\nERROR! $server_name session has been terminated: \n
A system error occured during the calculation. Please try to run $server_name again in a few minutes.\n
\n";
        print {$output_fh} "For assistance please contact us and mention this number: $run_name
\n";
        flock $output_fh, 8; # 8 = unlock
        close $output_fh;
    }
    else {
        warn "print_to_output: cannot open $OutHtmlFile for appending: $!\n";
    }
    &send_mail($server_name, $recipient, $run_name, "error","error") if ($recipient ne "NO");
    &stop_reload($OutHtmlFile);
}
#---------------------------------------------
# in case the desired mail report on error: the vars $email_subject and $email_message should be 'error'
sub send_mail { # to user
    # Sends an e-mail to the user by running sendEmail.pl through the shell.
    # Arguments (positional):
    #   $server_name   - selects the results URL used in the standard error message
    #   $recipient     - destination address
    #   $run_name      - run identifier, part of the results URL
    #   $email_subject - subject; the literal 'error' selects the standard error subject
    #   $email_message - body; the literal 'error' selects the standard error message
    #   $email_attach  - optional; appended verbatim after -a when non-empty
    #   $from_server   - optional; "ibis" switches to SEND_EMAIL_DIR_IBIS
    # Returns the standard output of the sendEmail.pl command.
    my $server_name = shift;
    my $recipient = shift;
    my $run_name = shift;
    my $email_subject= shift;
    my $email_message = shift;
    my $email_attach = shift;
    my $from_server = shift;
    # The two trailing arguments are optional; normalise them so the string
    # comparisons below never see undef.
    $email_attach = '' unless defined $email_attach;
    $from_server = '' unless defined $from_server;
    my $OutputURL = '';
    my $mail;
    if ($server_name eq "Selecton") {$OutputURL = SELECTON_URL."/results/$run_name"."/output.html";}
    elsif ($server_name eq "ConSeq") {$OutputURL = CONSEQ_URL."results/$run_name"."/output.html";}
    elsif ($server_name eq "Epitopia") {$OutputURL = EPITOPIA_URL."results/$run_name"."/output.html";}
    elsif ($server_name eq "pepitope") {$OutputURL = PEPITOPE_URL."results/$run_name"."/output.html";}
    elsif ($server_name eq "ConSurf") {$OutputURL = CONSURF_URL."results/$run_name"."/output.html";}
    elsif ($server_name eq "QuasiMotiFinder") {$OutputURL = QMF_URL."results/$run_name"."/output.html";}
    elsif ($server_name eq "fastml") {$OutputURL = FASTML_URL."results/$run_name"."/output.html";}
    $email_subject = "Error in $server_name running" if $email_subject eq "error";
    $email_message = "Hello!\n\nUnfortunately there was an error while running the $server_name server.\nPlease click on the following link to see more details\nWe apologize for the inconvenience\n\n$OutputURL\n" if $email_message eq "error";
    chdir SEND_EMAIL_DIR;
    chdir SEND_EMAIL_DIR_IBIS if ($from_server eq "ibis");
    # Quote a value for the shell: wrap it in single quotes and rewrite every
    # embedded single quote as '\'' so that an apostrophe in the address,
    # subject or message can neither break the command nor inject another one.
    my $sh_quote = sub {
        my $text = shift;
        $text = '' unless defined $text;
        $text =~ s/'/'\\''/g;
        return "'".$text."'";
    };
    $mail = 'perl sendEmail.pl -f '.$sh_quote->(ADMIN_EMAIL).' -t '.$sh_quote->($recipient).' -u '.$sh_quote->($email_subject).' -s '.SMTP_SERVER.' -m '.$sh_quote->($email_message);
    #$mail ='perl sendEmail.pl -f \''.ADMIN_EMAIL.'\' -t \''.$recipient.'\' -u \''.$email_subject.'\' -xu '.ADMIN_USER_NAME.' -xp '.ADMIN_PASSWORD.' -s '.SMTP_SERVER.' -m \''.$email_message."\'";
    # NOTE(review): the attachment argument is still appended unquoted, exactly
    # as before - it may carry several file names; confirm against callers.
    if ($email_attach ne '') {$mail.=" -a $email_attach";}
    return `$mail`;
}
#---------------------------------------------
# stop_reload($OutHtmlFile): sleeps 10 seconds, opens $OutHtmlFile for reading,
# then reopens the same path for writing and loops over the lines held in @output.
# According to the in-line comment on the loop, the intent is to drop the
# auto-refresh lines and the "Cancel Selecton Job" form elements from the page.
# NOTE(review): the text of this sub looks damaged and should be restored from
# the original file before relying on it:
#   - "my @output = ;" has no read operator (presumably a read from OUTPUT);
#   - from the "Your job status is" pattern onwards the code refers to $_status,
#     $_time, $_estimated_run_time, $line1, $line2, the HTML handle and
#     @html_lines, none of which are declared in the visible text. This looks
#     like the tail of a different routine fused onto this one, with the part
#     in between missing.
sub stop_reload{
my $OutHtmlFile = shift;
sleep 10;
open OUTPUT, "<$OutHtmlFile";
my @output = ;
close OUTPUT;
open OUTPUT, ">$OutHtmlFile";
foreach my $line (@output){ # we remove the refresh lines and the button which codes for Selecton cancelled job
unless ($line =~ /REFRESH/i or $line =~ /NO-CACHE/i or $line =~ /ACTION=\"\/cgi\/kill_process.cgi/ or
$line =~ /VALUE=\"Cancel Selecton Job\"/ or $line =~ /TYPE=hidden NAME=\"pid\"/ or
$line =~ /TYPE=hidden NAME=\"selecton_http\"/ or $line =~ /TYPE=hidden NAME=\"run_no\"/ or
$line =~ /<.+>Your job status is:<\/a> (.+)
/){
if ($_status ne ""){
s/$1/$_status/;
}
}
elsif(/The time that passed since submitting the query is: (.+)
/){
if($_time ne ""){
s/$1/$_time/;
}
}
elsif(/)/ and $_estimated_run_time ne "none"){
$line = $_;
$line1 = $1;
$line2 = $2;
if ($_estimated_run_time =~ m/\d+:\d+:\d+:\d+/) {
$_estimated_run_time .= " days";
}
elsif ($_estimated_run_time =~ m/\d+:\d+:\d+/) {
$_estimated_run_time .= " hours";
}
elsif($_estimated_run_time =~ m/\d+:\d+/){
$_estimated_run_time .= " minutes";
}
$_ = $line; # since we make another RE comparison, the original values of $_ and $1 are changing, therefore we must save them at the beginning and change them back here.
s/$line2/$_estimated_run_time
/; # the reason we first substitue the second part, is that the first part creates an expression --> which might be wrongly replaced with this value
s/$line1/$line1>/;
}
}
print HTML $_ foreach (@html_lines);
flock HTML, 8;
close HTML;
return "OK";
}
}
# in case the desired mail report on error: the vars $email_subject and $email_message should be 'error'
sub send_mail2 { # to user
    # Same contract as send_mail(), but the standard error output of the
    # sendEmail.pl command is discarded.
    # Arguments (positional):
    #   $server_name   - selects the results URL used in the standard error message
    #   $recipient     - destination address
    #   $run_name      - run identifier, part of the results URL
    #   $email_subject - subject; the literal 'error' selects the standard error subject
    #   $email_message - body; the literal 'error' selects the standard error message
    #   $email_attach  - optional; appended verbatim after -a when non-empty
    #   $from_server   - optional; "ibis" switches to SEND_EMAIL_DIR_IBIS
    # Returns the standard output of the sendEmail.pl command.
    my $server_name = shift;
    my $recipient = shift;
    my $run_name = shift;
    my $email_subject= shift;
    my $email_message = shift;
    my $email_attach = shift;
    my $from_server = shift;
    # The two trailing arguments are optional; normalise them so the string
    # comparisons below never see undef.
    $email_attach = '' unless defined $email_attach;
    $from_server = '' unless defined $from_server;
    my $OutputURL = '';
    my $mail;
    if ($server_name eq "Selecton") {$OutputURL = SELECTON_URL."/results/$run_name"."/output.html";}
    elsif ($server_name eq "ConSeq") {$OutputURL = CONSEQ_URL."results/$run_name"."/output.html";}
    elsif ($server_name eq "Epitopia") {$OutputURL = EPITOPIA_URL."results/$run_name"."/output.html";}
    elsif ($server_name eq "pepitope") {$OutputURL = PEPITOPE_URL."results/$run_name"."/output.html";}
    elsif ($server_name eq "ConSurf") {$OutputURL = CONSURF_URL."results/$run_name"."/output.html";}
    elsif ($server_name eq "QuasiMotiFinder") {$OutputURL = QMF_URL."results/$run_name"."/output.html";}
    elsif ($server_name eq "fastml") {$OutputURL = FASTML_URL."results/$run_name"."/output.html";}
    $email_subject = "Error in $server_name running" if $email_subject eq "error";
    $email_message = "Hello!\n\nUnfortunately there was an error while running the $server_name server.\nPlease click on the following link to see more details\nWe apologize for the inconvenience\n\n$OutputURL\n" if $email_message eq "error";
    chdir SEND_EMAIL_DIR;
    chdir SEND_EMAIL_DIR_IBIS if ($from_server eq "ibis");
    # Quote a value for the shell: wrap it in single quotes and rewrite every
    # embedded single quote as '\'' so that an apostrophe in the address,
    # subject or message can neither break the command nor inject another one.
    my $sh_quote = sub {
        my $text = shift;
        $text = '' unless defined $text;
        $text =~ s/'/'\\''/g;
        return "'".$text."'";
    };
    $mail = 'perl sendEmail.pl -f '.$sh_quote->(ADMIN_EMAIL).' -t '.$sh_quote->($recipient).' -u '.$sh_quote->($email_subject).' -s '.SMTP_SERVER.' -m '.$sh_quote->($email_message);
    #$mail ='perl sendEmail.pl -f \''.ADMIN_EMAIL.'\' -t \''.$recipient.'\' -u \''.$email_subject.'\' -xu '.ADMIN_USER_NAME.' -xp '.ADMIN_PASSWORD.' -s '.SMTP_SERVER.' -m \''.$email_message."\'";
    # NOTE(review): the attachment argument is still appended unquoted, exactly
    # as before - it may carry several file names; confirm against callers.
    if ($email_attach ne '') {$mail.=" -a $email_attach";}
    # The previous code wrapped the command as 'sh -c \' $mail 2>/dev/null\''
    # inside single quotes, so $mail was never interpolated and the shell ran
    # an empty command: no e-mail was ever sent. Backticks already run the
    # command through the shell, so appending the redirection is sufficient.
    $mail .= ' 2>/dev/null';
    return `$mail`;
}
1;
FastML.v3.1/www/bioSequence_scripts_and_constants/BIOSEQUENCE_FUNCTIONS.pm 0000755 0176723 0002036 00000061032 11364636070 025061 0 ustar haim pupko #!/usr/bin/perl
package BIOSEQUENCE_FUNCTIONS; #don't forget: a package must end with a return value (1; in the end)!!!!!
use strict;
use GENERAL_CONSTANTS;
#------------------------------------------------------------------------------------
sub subtract_time_from_now{
    # Receives the begin time as two arguments, "HH:MN:SS" and "DD-MO-YEAR"
    # (they are joined with a single space before parsing).
    # Returns the time elapsed between the begin time and now, as the
    # "hours:minutes:seconds" string produced by compare_time().
    # On failure the return value is a string containing "error": either the
    # message produced by compare_time() (for example when the two times are
    # more than one month apart) or a message saying the begin time could not
    # be parsed.
    my $begin_time = shift;
    $begin_time .= " ".shift;
    # Check the match explicitly: without this, a malformed begin time would
    # leave the capture variables undefined (or holding values from an earlier,
    # unrelated match) and a meaningless date would be compared.
    unless ($begin_time =~ m/(\d+):(\d+):(\d+) (\d+)-(\d+)-(\d+)/) {
        return "error: the begin time '$begin_time' is not in the format HH:MN:SS DD-MO-YEAR\n";
    }
    my %date1 = (Year => $6, Month => $5, Day => $4, Hour => $1, Minute => $2, Second => $3);
    my %date2 = (Year => "", Month => "", Day => "", Hour => "", Minute => "", Second => "");
    &convert_currentTime(\%date2); # fills %date2 with the current local time
    my @time_difference = &compare_time(\%date1, \%date2);
    # compare_time() returns a single "error: ..." string on failure and
    # ("yes", $difference) on success.
    if ($time_difference[0] =~ m/error/) {
        return $time_difference[0];
    }
    return $time_difference[1];
}
#------------------------------------------------------------------------------------
# Parses a "Begin/End" time line from Selecton's log files, i.e. text of the
# form "HH:MM:SS, <word> <MonthName> DD, YYYY", and stores the parts in the hash
# referenced by the second argument under the keys Year, Month, Day, Hour,
# Minute and Second. The month name is mapped to its two-digit number; the
# other parts except the year go through convertNum().
# When the input does not match, the hash is left untouched.
sub convertTime
{
    my ($time_string, $fields) = @_;
    my %month_number = (
        Jan => "01", Feb => "02", Mar => "03", Apr => "04", May => "05", Jun => "06",
        Jul => "07",Aug => "08", Sep => "09", Oct => "10", Nov => "11", Dec => "12");
    if ($time_string =~ m/(\d+):(\d+):(\d+),\s+\w+\s(\w+)\s(\d+),\s(\d+)/)
    {
        # Copy the captures first so the helper calls below cannot disturb them.
        my ($hour, $minute, $second, $month_name, $day, $year) = ($1, $2, $3, $4, $5, $6);
        $fields->{Year} = $year;
        $fields->{Month} = $month_number{$month_name};
        $fields->{Day} = &convertNum($day);
        $fields->{Hour} = &convertNum($hour);
        $fields->{Minute} = &convertNum($minute);
        $fields->{Second} = &convertNum($second);
    }
}#convertTime
#__________________________________________________________
# Formats a non-negative integer with at least two digits ("5" -> "05").
# Values that already have two or more digits are returned unchanged in value
# ("12" -> "12", "125" -> "125").
# The previous implementation prepended "0" to anything numerically below 10,
# so an already padded string such as "05" became "005".
sub convertNum
{
    my $input_num = shift;
    return sprintf("%02d", $input_num);
}
#__________________________________________________________
# Calculates the time that passed between two points in time.
# input:  references to 2 hashes with the keys Year, Month, Day, Hour, Minute
#         and Second (first the earlier time, then the later one). Month is
#         expected as a two-digit string ("01".."12").
# output: on success the list ("yes", "hours:minutes:seconds");
#         on failure a single string starting with "error:".
# The two times may be at most one calendar month apart. December followed by
# January of the next year counts as one month apart, and February has 29 days
# when the year of the first time is a leap year.
# NOTE(review): the leap-year and year-rollover handling assume a full
# (four-digit) year in the Year field - confirm against callers.
sub compare_time
{
    my ($time1, $time2) = @_;
    my %days_each_month = ('01' => '31', '02' => '28', '03' => '31', '04' => '30', '05' => '31', '06' => '30',
                           '07' => '31', '08' => '31', '09' => '30', '10' => '31', '11' => '30', '12' => '31');
    my $start_year = $time1->{Year};
    my $end_year = $time2->{Year};
    my $years_known = (defined $start_year and defined $end_year
                       and $start_year =~ /^\d+$/ and $end_year =~ /^\d+$/);
    if (defined $start_year and $start_year =~ /^\d+$/
        and $start_year % 4 == 0 and ($start_year % 100 != 0 or $start_year % 400 == 0)) {
        $days_each_month{'02'} = '29';
    }

    my $no_of_Days_passed;
    if ($time1->{Month} eq $time2->{Month}) {#same month
        if ($time1->{Day} eq $time2->{Day}) {#same day
            if ($time2->{Hour} < $time1->{Hour}) {
                return("error: H1 is: $time1->{Hour} H2 is: $time2->{Hour} it is the same day, therefor it is impossible that H1>H2. \n");
            }
            $no_of_Days_passed = 0;
        }
        else {# different day
            if ($time2->{Day} < $time1->{Day}) {
                return("error: D1 is: $time1->{Day} D2 is: $time2->{Day}, it is impossible in the same month that D1>D2.\n");
            }
            $no_of_Days_passed = ($time2->{Day}-$time1->{Day});
        }
    }
    else {#different month
        my $months_passed = $time2->{Month} - $time1->{Month};
        # December -> January of the following year is a one-month step
        if ($months_passed == -11 and $years_known and $end_year - $start_year == 1) {
            $months_passed = 1;
        }
        if ($months_passed > 1 or $months_passed < 0) {
            return("error: M1 is: $time1->{Month}, M2 is: $time2->{Month}. The program doesn't allow a difference bigger than 1 month.\n");
        }
        # days left in the first month plus the days already passed in the second
        $no_of_Days_passed = ($time2->{Day} + $days_each_month{$time1->{Month}} - $time1->{Day});
    }
    my $time_difference = &calculate_time_difference($time1->{Hour}, $time2->{Hour}, $time1->{Minute}, $time2->{Minute}, $time1->{Second}, $time2->{Second}, $no_of_Days_passed);
    return ("yes", $time_difference);
} # finish: compare_time()
#__________________________________________________________
# Computes the elapsed time between two clock readings.
# input (positional): hour1, hour2, minute1, minute2, second1, second2 and the
#        number of calendar days between the two readings (values of 0 or
#        less mean "same day").
# output: the string "HH:MM:SS"; the hours part grows beyond two digits when
#        more than 99 hours passed.
# The difference is computed on a single total-seconds value. The former
# field-by-field version lost the borrow whenever the minutes or hours
# difference was exactly 0 before borrowing (10:05:30 -> 11:05:10 was reported
# as 01:59:40 instead of 00:59:40).
sub calculate_time_difference
{
    my ($hour1, $hour2, $minute1, $minute2, $second1, $second2, $days_passed) = @_;
    $days_passed = 0 unless (defined $days_passed and $days_passed > 0);

    my $total_seconds = 86400*$days_passed
                      + 3600*($hour2 - $hour1)
                      + 60*($minute2 - $minute1)
                      + ($second2 - $second1);
    # A second reading that is earlier in the day than the first one wraps
    # around midnight, as the previous implementation did for the hours.
    $total_seconds %= 86400 if ($total_seconds < 0);

    my $hours_passed = int($total_seconds/3600);
    my $minutes_passed = int(($total_seconds % 3600)/60);
    my $seconds_passed = $total_seconds % 60;
    return sprintf("%02d:%02d:%02d", $hours_passed, $minutes_passed, $seconds_passed);
}
#------------------------------------------------------------------------------------
sub convert_currentTime {
    # Stores the current local time in the hash referenced by the argument,
    # under the keys Year, Month, Day, Hour, Minute and Second.
    # Year is the full year (1900 + the localtime offset); the other fields are
    # passed through convertNum(). Month is 1-based.
    my ($fields) = @_;
    # localtime() list: (sec, min, hour, mday, mon, year, ...)
    my @now = localtime();
    $fields->{Year} = 1900 + $now[5];
    $fields->{Month} = &convertNum($now[4] + 1);
    $fields->{Day} = &convertNum($now[3]);
    $fields->{Hour} = &convertNum($now[2]);
    $fields->{Minute} = &convertNum($now[1]);
    $fields->{Second} = &convertNum($now[0]);
}
#---------------------------------------------
sub check_if_user_is_allowed{
    # Limits the number of concurrent runs per user.
    # Arguments (positional): $server_name ("consurf", "selecton", "conseq" or
    # "pepitope"), $user_ip, $user_email.
    # Reads the server's running-jobs list, counts how many listed jobs belong
    # to each IP and to each e-mail address, and if the given IP or e-mail
    # already has 7 or more jobs prints a "Location:" redirect header to the
    # server's "too many runs" page and exits the process.
    # The process also exits (without output) when the list contains one of the
    # hard-coded blocked IP addresses.
    # For any other server name, or when the list is missing or empty, nothing
    # happens.
    my $server_name = shift;
    my $user_ip = shift;
    my $user_email = shift;
    my $file_to_open;
    my %ip_total = ();
    my ($ip, $_mail, $redirect_html);
    if ($server_name eq "consurf"){
        $redirect_html = GENERAL_CONSTANTS::CONSURF_REDIRECT_PAGE;
        $file_to_open = GENERAL_CONSTANTS::CONSURF_RUNNING_JOBS;
    }
    elsif ($server_name eq "selecton"){
        $redirect_html = GENERAL_CONSTANTS::SELECTON_REDIRECT_PAGE;
        $file_to_open = GENERAL_CONSTANTS::SELECTON_RUNNING_JOBS;
    }
    elsif ($server_name eq "conseq"){
        $redirect_html = GENERAL_CONSTANTS::CONSEQ_REDIRECT_PAGE;
        $file_to_open = GENERAL_CONSTANTS::CONSEQ_RUNNING_JOBS;
    }
    elsif ($server_name eq "pepitope"){
        $redirect_html = GENERAL_CONSTANTS::PEPITOPE_REDIRECT_PAGE;
        $file_to_open = GENERAL_CONSTANTS::PEPITOPE_RUNNING_JOBS;
    }
    return unless (defined $file_to_open and -e $file_to_open and !(-z $file_to_open));
    # An unreadable list is treated like an empty one.
    open my $run_list, '<', $file_to_open or return;
    flock $run_list, 2;
    while (my $entry = <$run_list>){
        chomp $entry;
        # list line format: "<run number> <ip> <e-mail>"
        if ($entry =~ /^(null_)?\d+ (.+) (.+)$/){
            $ip = $2;
            $_mail = $3;
            $ip_total{$ip}++;
            $ip_total{$_mail}++;
        }
        #redirects unwanted visitors to the site
        # NOTE(review): this tests the IPs found in the running-jobs list, not
        # $user_ip - confirm that this is the intention.
        # The second pattern used to read 83\.97\.\177\.107; \177 is an octal
        # escape (character 127), so that address could never match.
        if (defined $ip and ($ip =~ /66\.232\.100\.62/ or $ip =~ /83\.97\.177\.107/ or $ip =~ /91\.74\.160\.18/)){
            #print "Location: http://www.tau.ac.il/lifesci/\n\n";
            exit;
        }
    }
    close $run_list;
    if ((exists $ip_total{$user_ip} && $ip_total{$user_ip} >=7) or (exists $ip_total{$user_email} && $ip_total{$user_email} >= 7)){
        # output a message to the user that he cannot continue the run
        print "Location: $redirect_html\n\n";
        exit;
    }
}
#---------------------------------------------
# The values for these statistics were determined in a statistical test we ran in November 2007
# on successful Selecton runs, collected over 3 months on the bioinfo machine.
#sub selecton_estimated_run_time1{
# my $seq_times_length = shift;
# my $model = shift;
#
# my ($time_in_minutes, $time_in_hours, $time_in_days);
# # set the time according to each model's parameters
# $time_in_minutes = $seq_times_length*0.0251 + 20.345 if ($model eq "M8");
# $time_in_minutes = $seq_times_length*0.0256 + 17.391 if ($model eq "MEC");
# # to be on the safe side - we add 20% for the time
# $time_in_minutes = int($time_in_minutes*1.2);
# # calculate time in DD:HH:MM:SS format
# $time_in_minutes = int($time_in_minutes); # remove numbers after the "."
#
# return(&time_in_days_from_minutes($time_in_minutes));
#}
#---------------------------------------------
# Estimates the run time of a Selecton job.
# The coefficients were determined in a statistical test done in October 2009
# on successful Selecton runs collected over a few months on biocluster.
# The file can be found at: /bioseq/Selecton/total_models_statistics.csv
# Arguments (positional): sequence length, number of sequences, model name
# ("MEC" or "M8").
# The linear estimate (in minutes) is tripled to be on the safe side, truncated
# to a whole number and formatted by time_in_days_from_minutes().
sub selecton_estimated_run_time{
    my ($seq_length, $num_of_seq, $model) = @_;
    my $time_in_minutes;
    if ($model eq "MEC"){
        $time_in_minutes = $seq_length*$num_of_seq*0.0035 + 12.677 ;
    }
    elsif ($model eq "M8"){
        # [ exclusive upper bound on the number of sequences, slope, intercept ]
        my @m8_fits = (
            [11, 0.022,  3.5198],
            [21, 0.0025, 14.82],
            [31, 0.0021, 35.153],
            [41, 0.0026, 48.412],
            [51, 0.0024, 65.947],
        );
        # coefficients for 51 sequences and more
        my ($slope, $intercept) = (0.003, 91.341);
        foreach my $fit (@m8_fits){
            next unless ($num_of_seq < $fit->[0]);
            ($slope, $intercept) = ($fit->[1], $fit->[2]);
            last;
        }
        $time_in_minutes = $seq_length*$num_of_seq*$slope + $intercept;
    }
    # to be on the safe side - we triple the time, and drop the fraction
    $time_in_minutes = int($time_in_minutes*3);
    return(&time_in_days_from_minutes($time_in_minutes));
}
#---------------------------------------------
# input: a whole number of minutes
# output: "M:00" when the input is 59 or less (the minutes are not padded),
#         otherwise "HH:MM:00" with both parts formatted by new_num().
#         Hours are never folded into days; a DD:HH:MM:SS variant was
#         considered by the authors but is not produced.
sub time_in_days_from_minutes{
    my $minutes = shift;
    return $minutes.":00" if ($minutes <= 59);
    my $whole_hours = new_num(int($minutes/60));
    my $rest_minutes = new_num($minutes%60);
    return $whole_hours.":".$rest_minutes.":00";
}
#---------------------------------------------
# Returns the number with a "0" prepended when it is numerically below 10,
# otherwise returns it unchanged.
sub new_num{
    my ($value) = @_;
    return "0".$value if ($value < 10);
    return $value;
}
#---------------------------------------------
# Returns the current local time as "HH:MM:SS DD-MM-YYYY"; every field except
# the year is formatted by new_num().
sub printTime {
    # localtime() list: (sec, min, hour, mday, mon, year, ...)
    my @now = localtime();
    my $clock = join(":", &new_num($now[2]), &new_num($now[1]), &new_num($now[0]));
    my $day = &new_num($now[3]);
    my $month = &new_num($now[4] + 1); # localtime months are 0-based
    my $year = 1900 + $now[5];
    return "$clock $day-".$month."-$year";
}
#---------------------------------------------
sub printYear {
    # Returns the current year (local time) as a full number, e.g. 2014.
    # Element 5 of the localtime() list is the number of years since 1900.
    my $years_since_1900 = (localtime())[5];
    return 1900 + $years_since_1900;
}
#---------------------------------------------
sub printMonth {
    # Returns the three-letter English abbreviation of the current month
    # (local time).
    # Element 4 of the localtime() list is the month number minus 1, so it can
    # index the list of names directly: in December it is 11, which selects
    # "Dec".
    my $month_index = (localtime())[4];
    return (qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec))[$month_index];
}
#---------------------------------------------
# input: the server name and the run_name
# The routine removes every line that contains run_name from the server's list
# of running jobs. The list is rewritten in place under an exclusive lock.
# Please note: $server should be spelled "Selecton", "ConSurf", "ConSeq" or
# "pepitope"; for any other value, for an empty run_name, or when the list
# cannot be opened, nothing is changed.
sub remove_job_from_running_log{
    my $server = shift;
    my $run_name = shift;
    my $log;
    if($server eq "Selecton") {
        $log = GENERAL_CONSTANTS::SELECTON_RUNNING_JOBS;}
    elsif($server eq "ConSurf"){
        $log = GENERAL_CONSTANTS::CONSURF_RUNNING_JOBS;}
    elsif($server eq "ConSeq"){
        $log = GENERAL_CONSTANTS::CONSEQ_RUNNING_JOBS;}
    elsif($server eq "pepitope"){
        $log = GENERAL_CONSTANTS::PEPITOPE_RUNNING_JOBS;}
    return unless (defined $log);
    # An empty pattern would not mean "match nothing", so refuse it outright
    # rather than risk wiping the whole list.
    return unless (defined $run_name and length $run_name);
    # remove the job from the running jobs list
    open my $list_fh, '+>>', $log or return;
    flock $list_fh, 2;
    seek $list_fh, 0, 0; #rewind the pointer to the beginning
    my @all_lines_in_list = <$list_fh>; # read the contents into the array
    truncate $list_fh, 0; # remove all the information, The 0 represents the size of the file that we want
    foreach my $entry (@all_lines_in_list){
        chomp $entry;
        # \Q..\E: the run name is matched literally, not as a regular expression
        next if ($entry =~ /\Q$run_name\E/);
        print {$list_fh} $entry."\n";
    }
    flock $list_fh, 8;
    close $list_fh;
}
#---------------------------------------------
# Appends the line "<job_num> <server> <run_name> <current time>" to the
# queuing jobs list (GENERAL_CONSTANTS::QUEUING_JOBS).
# Returns "ok" on success, otherwise a message explaining why the list could
# not be opened.
sub enqueue_job{
    my ($job_num, $server, $run_name) = @_;
    if (open LIST, ">>".GENERAL_CONSTANTS::QUEUING_JOBS){
        # The list is locked so that no other process writes to it at the same
        # time; if another process holds the lock we wait until it is released.
        # "2" and "8" are the operation codes for "lock" and "unlock".
        flock LIST, 2;
        print LIST "$job_num $server $run_name ".printTime()."\n";
        flock LIST, 8;
        close LIST;
        return "ok";
    }
    return "Could not open file ".GENERAL_CONSTANTS::QUEUING_JOBS.". Reason: $!\nThe job was not listed in the queuing_jobs list.\n".printTime();
}
#------------------------------------------------------
# Appends the line "<job_num> <server> <run_name> <current time>" to the list
# of jobs running on the bioseq node (GENERAL_CONSTANTS::JOBS_ON_BIOSEQ_NODE).
# Returns "ok" on success, otherwise a message explaining why the list could
# not be opened.
sub enqueue_job_to_bioseq_node{
    my ($job_num, $server, $run_name) = @_;
    if (open LIST, ">>".GENERAL_CONSTANTS::JOBS_ON_BIOSEQ_NODE){
        # The list is locked so that no other process writes to it at the same
        # time; if another process holds the lock we wait until it is released.
        # "2" and "8" are the operation codes for "lock" and "unlock".
        flock LIST, 2;
        print LIST "$job_num $server $run_name ".printTime()."\n";
        flock LIST, 8;
        close LIST;
        return "ok";
    }
    return "Could not open file ".GENERAL_CONSTANTS::JOBS_ON_BIOSEQ_NODE.". Reason: $!\nThe job was not listed in the bioseq node running job list.\n".printTime();
}
#------------------------------------------------------
# Appends the line "<server> <run_name> <current time>" to the list of jobs
# waiting for the bioseq node (GENERAL_CONSTANTS::JOBS_WAITING_BIOSEQ_NODE).
# Returns "ok" on success, otherwise a message explaining why the list could
# not be opened.
sub waiting_jobs_for_bioseq_node{
    my ($server, $run_name) = @_;
    if (open LIST, ">>".GENERAL_CONSTANTS::JOBS_WAITING_BIOSEQ_NODE){
        # The list is locked so that no other process writes to it at the same
        # time; if another process holds the lock we wait until it is released.
        # "2" and "8" are the operation codes for "lock" and "unlock".
        flock LIST, 2;
        print LIST "$server $run_name ".printTime()."\n";
        flock LIST, 8;
        close LIST;
        return "ok";
    }
    return "Could not open file ".GENERAL_CONSTANTS::JOBS_WAITING_BIOSEQ_NODE.". Reason: $!\nThe job was not listed in the bioseq node waiting job list.\n".printTime();
}
#------------------------------------------------------
# Removes a job from the list of jobs waiting for the bioseq node
# (GENERAL_CONSTANTS::JOBS_WAITING_BIOSEQ_NODE).
# Every line that contains both the run name and the server name is dropped;
# of the remaining lines only those containing at least one letter or digit
# are written back, so blank lines are removed as well.
# Returns "ok" on success, otherwise a message explaining why the list could
# not be opened.
sub remove_job_from_bioseq_node_waiting_list{
    my $server = shift;
    my $run_name = shift;
    my $ret = "ok";
    my $list_fh;
    unless (open $list_fh, "+>>", GENERAL_CONSTANTS::JOBS_WAITING_BIOSEQ_NODE){
        $ret = "Could not open file ".GENERAL_CONSTANTS::JOBS_WAITING_BIOSEQ_NODE.". Reason: $!\nThe job was not listed in the bioseq node waiting job list.\n".printTime();
    }
    else{
        flock $list_fh, 2;
        seek $list_fh, 0, 0; #rewind the pointer to the beginning
        my @all_lines_in_list = <$list_fh>; # read the contents into the array
        truncate $list_fh, 0; # remove all the information, The 0 represents the size of the file that we want
        foreach my $line (@all_lines_in_list){
            # chomp the loop variable itself: a bare "chomp" works on $_, which
            # left the newline on $line and doubled it when written back
            chomp $line;
            # \Q..\E: both names are matched literally, not as regular expressions
            next if (($line =~ /\Q$run_name\E/) and ($line =~ /\Q$server\E/)); # this is the job to remove
            if ($line =~ /[A-Za-z0-9]/)
            {
                print {$list_fh} "$line\n";
            }
        }
        flock $list_fh, 8;
        close $list_fh;
    }
    return $ret;
}
#---------------------------------------------
# input: path to a pdb file
# output: reference to an array:
#   - when the file could not be opened: one element,
#     "--PDB_NOT_OPEN <path> <reason>"
#   - otherwise two elements:
#       [0] a string made of all the chain identifiers found in column 22 of
#           the ATOM lines (each once, in sorted order), or "--NO_CHAINS" when
#           no non-blank identifier was found
#       [1] "--SEQRES_yes" when the file has at least one SEQRES line,
#           otherwise "--SEQRES_no"
sub which_chain_in_pdb_and_seqres{
    my $input_pdb = shift;
    my %all_chains;
    my @ret;
    my $seqres_found = "--SEQRES_no";
    my $pdb_fh;
    unless (open $pdb_fh, '<', $input_pdb){
        @ret = ("--PDB_NOT_OPEN $input_pdb $!");
        return \@ret;}
    while (my $line = <$pdb_fh>){
        # the chain identifier is the 22nd character of an ATOM line; shorter
        # lines carry no identifier
        if ($line =~ /^ATOM/ and length($line) > 21){
            $all_chains{substr($line, 21, 1)} = 1;
        }
        if ($seqres_found eq "--SEQRES_no" && $line =~ /^SEQRES/){
            $seqres_found = "--SEQRES_yes";
        }
    }
    close $pdb_fh;
    # sorted, so that the same file always yields the same string
    my $chain_founded = join("", sort keys %all_chains);
    if($chain_founded !~ /\S/){
        @ret = ("--NO_CHAINS", $seqres_found);}
    else{
        @ret = ($chain_founded, $seqres_found);}
    return \@ret;
}
#---------------------------------------------
# input : 1. path to a pdb file in which column 22 of the ATOM lines and
#            column 12 of the SEQRES lines hold no chain identifier
#         2. one letter denoting the chain identifier to add
# output : the same file, in the same path, where the given letter replaces the
#          character in column 22 of every ATOM line and in column 12 of every
#          SEQRES line. All other lines are written back unchanged, and so are
#          ATOM/SEQRES lines too short to have that column.
# Nothing is changed when the file cannot be opened.
sub add_chain_to_pdb{
    my $input_pdb = shift;
    my $chain_id_to_add = shift;
    open my $pdb_fh, '+>>', $input_pdb or return;
    seek $pdb_fh, 0, 0;
    my @all_lines_in_pdb = <$pdb_fh>;
    truncate $pdb_fh, 0;
    foreach my $line (@all_lines_in_pdb){
        # length of the line without its line terminator, so that the column
        # being replaced is never the newline itself
        (my $content = $line) =~ s/[\r\n]+$//;
        if ($line =~ /^ATOM/ and length($content) > 21){
            substr($line, 21, 1) = $chain_id_to_add; # column 22
        }
        elsif ($line =~ /^SEQRES/ and length($content) > 11){
            substr($line, 11, 1) = $chain_id_to_add; # column 12
        }
        print {$pdb_fh} $line;
    }
    return close $pdb_fh;
}
#---------------------------------------------
sub convertNewline{
    # Runs dos2unix, the program that converts plain text files in DOS/MAC
    # format to UNIX format, on the file $inputFilePath inside $WorkingDir.
    # Arguments (positional): $inputFilePath (relative to the working
    # directory), $WorkingDir.
    # NOTE(review): both values are pasted into a shell command unquoted, as
    # before; confirm they can never come from user-supplied file names.
    my $inputFilePath = shift;
    my $WorkingDir = shift;
    my $dos2unix="cd $WorkingDir;dos2unix -q $inputFilePath";
    system "$dos2unix";
    # If the input file was in mac format, the simple dos2unix will not work.
    # Read the file - if it is only one line, it might mean that the new line
    # characters are not read well (for example: ^M). In that case run
    # dos2unix again, saying the format is mac.
    $WorkingDir.='/' unless $WorkingDir =~ /\/$/;
    if (open my $file_fh, '<', $WorkingDir.$inputFilePath){
        my $num_of_lines = 0;
        while (my $line = <$file_fh>){
            $num_of_lines++;
            last if ($num_of_lines > 1); # two lines are enough to decide
        }
        close $file_fh;
        if ($num_of_lines==1){
            $dos2unix="cd $WorkingDir;dos2unix -c mac $inputFilePath -q ";
            system "$dos2unix";
        }
    }
}
#---------------------------------------------
sub removeEndLineExtraChars{
    # Removes extra characters at the end of every line (^M, spaces, tabs) of
    # the file $inputFilePath inside $WorkingDir, and terminates every line
    # with a single "\n". The file is rewritten in place.
    # Arguments (positional): $inputFilePath (relative to the working
    # directory), $WorkingDir.
    # When the file cannot be read nothing is written: previously a failed
    # read was followed by a write of zero lines, which emptied (or created)
    # the file.
    my $inputFilePath = shift;
    my $WorkingDir = shift;
    $WorkingDir.='/' unless $WorkingDir =~ /\/$/;
    my $full_path = $WorkingDir.$inputFilePath;
    open my $in_fh, '<', $full_path or return;
    my @lines = <$in_fh>;
    close $in_fh;
    if (open my $out_fh, '>', $full_path){
        foreach my $line (@lines){
            $line =~ s/\s+$//; # trailing whitespace, including \r and \n
            print {$out_fh} "$line\n";
        }
        close $out_fh;
    }
}
#---------------------------------------------
sub check_file_type{
    # Classifies a file by a quick look at it.
    # Argument: $FileName - path of the file.
    # Returns a two-element list:
    #   ("OK", $Type)     where $Type is "PLAIN_TEXT", "Binary", "PDF" or "RTF"
    #   ("ERR", $message) when the file does not exist or cannot be opened
    my $FileName=shift;
    my $Type="PLAIN_TEXT";
    unless (-e "$FileName")
    {
        return ("ERR", "check_file_type : the file $FileName was not found");
    }
    #$Type="Executable" if (-x $FileName); #Executable
    $Type="Binary" if (-c $FileName); # character special file (device)
    $Type="Binary" if (-B $FileName); # Binary
    # Potentially a text file but maybe not: for -T/-B the first block or so of
    # the file is examined for odd characters such as strange control codes or
    # characters with the high bit set. If too many strange characters (>30%)
    # are found, it's a -B file; otherwise it's a -T file...
    # (The comparison used to be against "BINARY", which never equals the
    # "Binary" assigned above.)
    if (-T $FileName and $Type ne "Binary")
    {
        open my $file_fh, '<', $FileName
            or return ("ERR", "check_file_type : cannot open the file $FileName for reading $!");
        my $line = <$file_fh>; # only the first line is inspected
        close $file_fh;
        $line = "" unless defined $line; # empty file
        if ($line=~/%PDF-/){
            $Type="PDF";
        }
        elsif ($line=~/\\rtf/){
            $Type="RTF";
        }
    }
    return ("OK", $Type);
}
#---------------------------------------------
1;
FastML.v3.1/libs/ 0000755 0176723 0002036 00000000000 12272424174 012501 5 ustar haim pupko FastML.v3.1/libs/Makefile 0000644 0176723 0002036 00000001165 12272424010 014131 0 ustar haim pupko # $Id: Makefile 942 2006-10-18 12:28:12Z ninio $
# There might be a need for a split (as done in programs/Makefile) because of a bug in make 3.80.1 - see
# http://www.cygwin.com/ml/cygwin/2004-09/msg01659.html
# Sub-directories handled by this Makefile; each one has its own Makefile.
LIBS= phylogeny
# all has to be the FIRST task!
TASKS= all clean test depend debug All install doubleRep
.PHONY: $(TASKS) $(LIBS)
# For every task T this generates the rule "T: <lib>.T ..." (one prerequisite
# per library), so that "make T" runs task T in every library.
define TASKS_template
$(1): $$(addsuffix .$(1),$(LIBS))
endef
$(foreach task,$(TASKS),$(eval $(call TASKS_template,$(task))))
# For every library L this generates the pattern rule "L.%", which runs the
# task named by the stem inside the directory L.
# Recipe lines must start with a TAB. $(MAKE) is used instead of a literal
# "make" so that the sub-make is the same program as the parent and inherits
# its flags.
define LIB_template
$(1).%:
	+cd $(1) && $$(MAKE) $$(*)
endef
$(foreach lib,$(LIBS),$(eval $(call LIB_template,$(lib))))
# "make <lib>" builds the default target of that library.
$(LIBS):
	+cd $@ && $(MAKE)
FastML.v3.1/libs/phylogeny/ 0000755 0176723 0002036 00000000000 12272424200 014505 5 ustar haim pupko FastML.v3.1/libs/phylogeny/bestHKYparam.h 0000644 0176723 0002036 00000012657 11656124251 017233 0 ustar haim pupko // $Id: bestHKYparam.h 9992 2011-11-08 03:57:29Z rubi $
#ifndef ___BEST_HKY_PARAM
#define ___BEST_HKY_PARAM
#include "definitions.h"
#include "likelihoodComputation.h"
#include "sequenceContainer.h"
#include "stochasticProcess.h"
#include "gammaDistribution.h"
#include "tree.h"
#include "hky.h"
#include "multipleStochasticProcess.h"
// Declaration only: the constructor is defined elsewhere. Judging by the name
// and the accessors, constructing an object searches for the best HKY
// parameter while the tree is kept fixed (the tree is taken by const
// reference), and the result is then read through the accessors.
// NOTE(review): the default upper bound here is 0.5, while bestHkyParamAndBBL
// uses 5.0 - confirm that the difference is intended.
class bestHkyParamFixedTree {
public:
	explicit bestHkyParamFixedTree(const tree& et,
		const sequenceContainer& sc,
		stochasticProcess& sp,
		const Vdouble * weights=NULL,
		const MDOUBLE upperBoundOnHkyParam = 0.5,
		const MDOUBLE epsilonHkyParamOptimization = 0.01);
	// The accessors do not modify the object, so they are const and can be
	// called on const objects and through const references.
	MDOUBLE getBestHkyParam() const {return _bestHkyParam;}
	MDOUBLE getBestL() const {return _bestL;}
private:
	MDOUBLE _bestHkyParam;	// value returned by getBestHkyParam()
	MDOUBLE _bestL;		// value returned by getBestL()
};
// Declaration only: the constructor is defined elsewhere. Per the original
// in-line comment it finds the best HKY parameter and the best BBL; the tree
// is taken by non-const reference.
// NOTE(review): BBL presumably stands for branch-by-branch length
// optimization - confirm against the implementation.
class bestHkyParamAndBBL {
public:
	explicit bestHkyParamAndBBL(tree& et, //find Best HkyParam and best BBL
		const sequenceContainer& sc,
		stochasticProcess& sp,
		const Vdouble * weights=NULL,
		const MDOUBLE upperBoundOnHkyParam = 5.0,
		const MDOUBLE epsilonHkyParamOptimization= 0.01,
		const MDOUBLE epsilonLikelihoodImprovment= 0.05,
		const int maxBBLIterations=10,
		const int maxTotalIterations=5);
	// The accessors do not modify the object, so they are const and can be
	// called on const objects and through const references.
	MDOUBLE getBestHkyParam() const {return _bestHkyParam;}
	MDOUBLE getBestL() const {return _bestL;}
private:
	MDOUBLE _bestHkyParam;	// value returned by getBestHkyParam()
	MDOUBLE _bestL;		// value returned by getBestL()
};
class C_evalHkyParam{
public:
C_evalHkyParam( const tree& et,
const sequenceContainer& sc,
stochasticProcess& sp,
const Vdouble * weights = NULL)
: _et(et),_sc(sc),_weights(weights),_sp(sp){};
private:
const tree& _et;
const sequenceContainer& _sc;
const Vdouble * _weights;
stochasticProcess& _sp;
public:
MDOUBLE operator() (MDOUBLE HkyParam) {
(static_cast(_sp.getPijAccelerator()->getReplacementModel()))->changeTrTv(HkyParam);
MDOUBLE res = likelihoodComputation::getTreeLikelihoodAllPosAlphTheSame(_et,_sc,_sp,_weights);
//LOG(5,<<" with HkyParam = "<