alien_hunter-1.7/0000755000265600020320000000000011265301730013160 5ustar tilleaadminalien_hunter-1.7/README0000644000265600020320000000461011022573477014053 0ustar tilleaadminSUMMARY alien_hunter is a software tool for the prediction of Genomic Islands, implemented using Interpolated Variable Order Motifs (IVOMs) that capture sequence compositional biases at various levels (e.g. G+C, dinucleotide, codon bias). Optionally the predictions can be passed to a 2-state, 2nd-order Hidden Markov Model (HMM) to optimize the predicted boundaries using a change-point detection framework. Finally the predictions (embl format) can be automatically loaded into the Artemis genome viewer, freely available at: http://www.sanger.ac.uk/Software/Artemis/ RELEASE The current release is 1.7 INSTALLATION In order to run alien_hunter, PERL (http://www.perl.com/) and the Java Runtime Environment (http://java.sun.com/) must already be installed. The software has been tested on PERL v5.6.1 and JAVA SDK v1.4.2. After downloading alien_hunter.tar.gz, change directory to the directory you wish to install alien_hunter in (~/ in this example). Uncompress and untar the alien_hunter.tar.gz file. On UNIX the command is: gzip -d < alien_hunter.tar.gz | tar xf - This will create a directory called ~/alien_hunter which will contain all the files necessary for running alien_hunter. If JAVA and PERL are correctly set up on your system, type: ./alien_hunter on the command line to get the manpage of alien_hunter. To correctly set up your system for the artemis (art) executable visit: http://www.sanger.ac.uk/Software/Artemis/v8/manual/installation.html and follow the instructions. If Artemis is not set up correctly, the "-a" option of alien_hunter will not work. 
(If taken from CVS you should compile the two java files first) AUTHOR For questions or comments contact gsv(at)sanger.ac.uk LICENSE This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. alien_hunter-1.7/alien_hunter0000755000265600020320000000266611022245020015563 0ustar tilleaadmin
#!/bin/sh
#shell script that sets the CLASSPATH for the biojava classes
#and runs the alien_hunter.pl script with the user arguments
#author George Vernikos

#LICENSE
#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#as published by the Free Software Foundation; either version 2
#of the License, or (at your option) any later version.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.

#You should have received a copy of the GNU General Public License
#along with this program; if not, write to the Free Software
#Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

#Positional arguments:
#$1 sequence file
#$2 output
#$3 -a | -c | -i
#$4 -a | -c | -i
#$5 -a | -c | -i

#Directory this wrapper lives in; the jars and alien_hunter.pl are looked
#up relative to it. Quoted so that an install path containing spaces works.
USERPATH=`dirname "$0"`

#Append the bundled jars (and the install directory itself) to whatever
#CLASSPATH the caller already has.
CLASSPATH="${CLASSPATH}:$USERPATH/biojava/biojava-1.4_new.jar"
CLASSPATH="${CLASSPATH}:$USERPATH/biojava/commons-cli.jar"
CLASSPATH="${CLASSPATH}:$USERPATH/biojava/commons-collections-2.1.jar"
CLASSPATH="${CLASSPATH}:$USERPATH/biojava/commons-dbcp-1.1.jar"
CLASSPATH="${CLASSPATH}:$USERPATH/biojava/commons-pool-1.1.jar"
CLASSPATH="${CLASSPATH}:$USERPATH/biojava/bytecode-0.92.jar"
CLASSPATH="${CLASSPATH}:$USERPATH/"

export CLASSPATH

#$1 and $2 are always forwarded (an empty value makes alien_hunter.pl print
#its help text). The optional flags use ${N:+...} so that an absent flag is
#dropped entirely instead of being passed on as an empty argument.
perl "$USERPATH/alien_hunter.pl" -f "$1" -o "$2" ${3:+"$3"} ${4:+"$4"} ${5:+"$5"}
alien_hunter-1.7/alien_hunter.pl0000644000265600020320000001717211022245020016170 0ustar tilleaadmin#!/usr/local/bin/perl =head1 NAME alien_hunter.pl =head1 SYNOPSIS Prediction of Genomic Islands using Interpolated Variable Order Motifs (IVOMs) =head1 AUTHOR George Vernikos =head1 COPYRIGHT =head1 BUGS if you witness any bug please contact the author =head1 LICENSE This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
=cut

# Driver script: reads a FASTA genome, scores overlapping sliding windows
# against the whole-genome IVOM vector (relative entropy), thresholds and
# merges the windows, and writes the predictions as EMBL features
# ($Output), all window scores ($Output.sco) and an Artemis plot
# ($Output.plot). Optionally refines the boundaries (-c) and launches
# Artemis (-a).
#
# NOTE(review): the script deliberately keeps its package globals and does
# not enable "use strict": the PAI_scripts::* modules are not visible from
# here and may rely on variables in package main -- confirm before
# lexicalising.

use FindBin;
use lib $FindBin::Bin;
use POSIX qw(log10);
use Getopt::Long;
use PAI_scripts::help;
use PAI_scripts::MotifMaker;
use PAI_scripts::CutOff;
use PAI_scripts::overlap;
use PAI_scripts::rrna;
use changepointCaller;
use Time::Local;
use Time::HiRes qw (gettimeofday);

# EMBL feature-table line prefixes: the feature key starts in column 6 and
# the location / qualifiers start in column 22. Built with "x" so that the
# column layout does not depend on literal runs of spaces in this file.
my $FT_KEY  = 'FT' . (' ' x 3) . 'misc_feature' . (' ' x 4);
my $FT_QUAL = 'FT' . (' ' x 19);

$once=0;
$WinSize=5000;
$Overlap=2500;

# The return value is intentionally not checked: unknown switches are only
# reported by Getopt::Long and the run continues.
GetOptions("file_genome=s"=>\$SeqFile,"output=s"=>\$Output,"artemis"=>\$artemis,"help"=>\$help,"changePoint"=>\$changePoint);

# Logical (not bitwise) OR: show the manpage when asked or when a mandatory
# argument is missing.
if($help || !$SeqFile || !$Output){
    help();
    exit 0;
}

open(my $seq_fh, '<', $SeqFile)
    or die "cannot open sequence file '$SeqFile': $!\n";

print"\n reading sequence...\n";

# Concatenate every non-header line of the FASTA file into $GenSeq.
$GenSeq='';
while(my $line=<$seq_fh>){
    chomp($line);
    next if $line =~ m#>#;
    $GenSeq.=$line;
}
close $seq_fh;

#runs rrna method in rrna.pm to find rrna operons
rrna($GenSeq);

$GenLen=length($GenSeq);

if($GenLen<20000){
    print "\n$GenLen bp; too short sequence!\n\n";
    exit 0;
}

#builds the motif vectors
MMaker();

#scans genome sequence for all kth order motifs
print" scanning genome for motifs...\n";
($amb_base,$count8merswithN)=scan($GenSeq,0);

if($amb_base==1){
    print "\n\nambiguous bases in the sequence!!!\n\n";
    #check percentage of total 8mers in the sequence that contain ambiquous bases (if > 30% stop)
    $per8merswithN=$count8merswithN/($GenLen-8+1);
    if($per8merswithN > 0.3){
        print "\n\nmore than 30% of 8mers contain ambiguous bases -- cannot continue!!!\n\n";
        exit 0;
    }
}

#builds the IVOM vectors
print"\n building genome IVOM vectors...\n";
$GenIVOMref=IvomBuild();

# Copy the returned vector; values are stringified exactly as before so the
# numbers used in the entropy sum are unchanged.
foreach $key (keys %$GenIVOMref){
    $GenIVOM{$key}="$GenIVOMref->{$key}";
}

print"\n sliding window running - calculate relative entropy (KL)...\n";

#how many overlapping windows
$HowMany=($GenLen/$Overlap)-1;

$start=gettimeofday();

for ($i=0;$i<=$HowMany-1;$i++){
    # every window starts from a zero score
    $Score=0;

    $from=$i*$WinSize-($i*$Overlap);
    $to=$from+$WinSize;

    #so as for the last window if != winsize, to take different
    #start point to the end of the genome, but again with size=winsize
    # NOTE(review): when $GenLen is an exact multiple of $Overlap this test
    # is also true for the second-to-last window, so both are scored over
    # the same final $WinSize bases (same %AllScores key) -- confirm that
    # "<= 1" rather than "< 1" is intended.
    if(($HowMany-1)-$i<=1){
        $to=$GenLen;
        $from=$GenLen-$WinSize;
    }

    $Query=substr($GenSeq,$from,$to-$from);

    #scans each sliding window for all kth order motifs
    ($amb_base,$count8merswithN)=scan($Query,1);

    # the window counts towards progress whether or not it is scored
    $counter++;

    #if the current sliding window contains at least 1 ambiguous base skip it and go to the next one
    next if $amb_base==1;

    #builds the IVOM vectors for each sliding window
    $QueryIVOM=IvomBuild();

    # to convert base 0.. into 1..
    if($i==0){
        $from=1;
    }

    #calculates relative entropy (Kullback-Leibler)
    # NOTE(review): assumes every genome and window weight is > 0;
    # a zero would make the division or log10 fail -- TODO confirm IvomBuild
    # guarantees this.
    foreach $k (keys %GenIVOM){
        $w=${$QueryIVOM}{$k};
        $G=$GenIVOM{$k};
        $Score+=($w*log10($w/$G));
    }

    $AllScores{"$from..$to"}=$Score;

    #keeps track of process completed
    $PercentComplete=sprintf("%.1f",($counter/$HowMany)*100);

    if($PreviousValue ne $PercentComplete){
        print "completed ... $PercentComplete%\n";

        #calculating finish time (once, as soon as 1% is done)
        if($PercentComplete>=1 && $once==0){
            $once=1;
            $dif=gettimeofday()-$start;
            $remain=$HowMany/$counter;
            $dif=$dif*$remain;
            $estimated=$start+$dif;
            print "\n\t\testimated finish time:\n\t\t";
            print scalar(localtime($estimated));
            print "\n\n";
        }
    }
    $PreviousValue= $PercentComplete;
}#for how many

print"\n determining score threshold...\n";
($cutoff,$ScaledScores)= Cutoff(\%AllScores);
$cutoff=sprintf("%.3f",$cutoff);

########### to print all scores ##############
open(my $score_fh, '>', "$Output.sco")
    or die "cannot write '$Output.sco': $!\n";

foreach $key (keys %$ScaledScores){
    $temp_sc{$key}="$ScaledScores->{$key}";
}

foreach $k (keys %temp_sc){
    $v = $temp_sc{$k};
    print {$score_fh} $FT_KEY,  "$k\n",
                      $FT_QUAL, "/score=$v\n";
}
close $score_fh or die "error closing '$Output.sco': $!\n";
#############################################

$NumKeys= keys %temp_sc;
if($NumKeys<2){
    exit 0;
}

print"\n merging predictions...\n";
$joinedScoresRef=overlap($cutoff,$ScaledScores);

foreach $key (keys %$joinedScoresRef){
    $joinedScores{$key}="$joinedScoresRef->{$key}";
}

print"\n writing predictions in embl format...\n";

#find the max joined_window score
@keys = sort { $joinedScores{$a} <=> $joinedScores{$b} } keys %joinedScores;
$NumKeys = keys %joinedScores;
$max=$joinedScores{$keys[$NumKeys-1]};

open(my $pred_fh, '>', $Output)
    or die "cannot write '$Output': $!\n";

# the plot file is only produced for a positive threshold
my $plot_fh;
if($cutoff>0){
    open($plot_fh, '>', "$Output.plot")
        or die "cannot write '$Output.plot': $!\n";
}

$st=1;

#sorts the scores based on their keys (from..to) in order for the artemis
#plot to print the regions in the order they occur in the genome
@keys = sort { (split /\.\./, $a)[0] <=> (split /\.\./, $b)[0] } keys %joinedScores;

foreach $k (@keys) {
    $res=0;
    $v = $joinedScores{$k};

    #all scores above the cutoff are coloured scaled: red->white (max->min)
    if($v>=$cutoff){
        # when the best score equals the threshold there is no range to
        # scale over; treat the region as the maximum instead of dividing
        # by zero
        $x=($max>$cutoff) ? ($v-$cutoff)/($max-$cutoff) : 1;
        $green=255-($x*255);
        $blue=255-($x*255);
        $green=sprintf("%.0f",$green);
        $blue=sprintf("%.0f",$blue);
        $red=255;
        $color="$red $green $blue";
    }

    #calls rrna.pm to annotate regions overlapping rrnas
    ($rfrom,$rto)=split/\.\./,$k;
    $res=overlapRNA($rfrom,$rto);

    my $note_text=($res==1)
        ? "threshold: $cutoff; probably region overlapping rRNA operon"
        : "threshold: $cutoff";

    # one EMBL qualifier per line; changepointCaller parses this file
    # line by line
    print {$pred_fh} $FT_KEY,  "$k\n",
                     $FT_QUAL, "/colour=$color\n",
                     $FT_QUAL, "/algorithm=\"alien_hunter\"\n",
                     $FT_QUAL, "/note=\"$note_text\"\n",
                     $FT_QUAL, "/score=$v\n";

    #writes scores for artemis plot
    #check if cutoff>0 else it doen's write plot
    if($cutoff>0){
        ($from,$to)=split /\.\./, $k;

        #begin -> from
        print {$plot_fh} "0\n" x ($from-$st) if $from>$st;

        #from -> to
        print {$plot_fh} "$v\n" x ($to-$from+1) if $to>=$from;

        $st=$to+1;
    }
}
close $pred_fh or die "error closing '$Output': $!\n";

#to -> end
if($cutoff>0){
    print {$plot_fh} "0\n" x ($GenLen-$st+1) if $GenLen>=$st;
    close $plot_fh or die "error closing '$Output.plot': $!\n";

    #optimizing the boundaries
    if($changePoint){
        print"\n optimizing predicted boundaries...\n\n";
        changepointCaller($SeqFile,$Output);
        $Output.=".opt";
    }
}

#calculate time elapsed
$finish=gettimeofday();
$diff=$finish-$start;
$sec=$diff % 60;
$diff=($diff-$sec) /60;
$min=$diff % 60;
$diff=($diff-$min) /60;
$hours=$diff % 24;
$diff=($diff-$hours) /24;
print "\ntime elapsed: $hours:$min:$sec\n";

#it runs artemis; if also -c then it will load the optimized predictions
if($artemis){
    print"\n loading predictions into artemis ...\n\n\n";
    # list form: file names are handed to art verbatim, never to a shell
    exec('art', $SeqFile, '+', $Output)
        or die "cannot run artemis (art): $!\n";
}

#end of code
alien_hunter-1.7/changepointCaller.pm0000644000265600020320000000512711022245021017134 0ustar tilleaadmin
=head1 NAME

changepointCaller

=head1 SYNOPSIS

changepointCaller - it calls the PAI_scripts::changePoint

=head1 AUTHOR

George Vernikos

=head1 LICENSE

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

=cut

use FindBin;
use lib $FindBin::Bin;

package changepointCaller;

use strict;
use warnings;

use PAI_scripts::changePoint;
use Exporter;

our @ISA = ("Exporter");
our @EXPORT = qw (&changepointCaller);

# changepointCaller($SeqFile, $tabFile)
#
# Re-reads the EMBL predictions in $tabFile, asks changepoint() for refined
# boundaries of every "from..to" region and writes the refined features to
# "$tabFile.opt" and a matching per-base score plot to "$tabFile.opt.plot".
#
#   $SeqFile  genome sequence file, handed to readGenomeSeq()
#   $tabFile  prediction file written by alien_hunter.pl
#
# Returns nothing; dies when a file cannot be opened or closed.
sub changepointCaller{
    my ($SeqFile,$tabFile)=@_;

    # same EMBL column layout as the main script writes
    my $FT_KEY  = 'FT' . (' ' x 3) . 'misc_feature' . (' ' x 4);
    my $FT_QUAL = 'FT' . (' ' x 19);

    # readGenomeSeq's result is used below as the number of plot lines
    my $Genlen=readGenomeSeq($SeqFile);

    open(my $in_fh, '<', $tabFile)
        or die "cannot open predictions '$tabFile': $!\n";
    open(my $opt_fh, '>', "$tabFile.opt")
        or die "cannot write '$tabFile.opt': $!\n";
    open(my $plot_fh, '>', "$tabFile.opt.plot")
        or die "cannot write '$tabFile.opt.plot': $!\n";

    # The prediction file is small; slurp it so the features can be counted
    # without shelling out to grep.
    my @lines=<$in_fh>;
    close $in_fh;
    my $count=grep { /misc_feature/ } @lines;

    my $st=1;        # next plot position still to be written
    my $to_prev=0;   # right boundary of the previous refined region
    my $c=0;         # running feature number for the progress message
    my ($from,$to,$newLeft,$newRight);
    my ($colour,$note)=('','');

    foreach my $line (@lines){
        chomp($line);

        if ($line =~ m#(\d+)\.\.(\d+)#){
            my ($left,$right)=($1,$2);
            $c++;
            print "\n optimizing prediction $c out of $count\n";

            #it calls changepoint
            ($newLeft,$newRight)=changepoint($left,$right,$tabFile);
            print " opt boundaries: $newLeft..$newRight\n";

            $from=$newLeft;
            $to=$newRight;

            #if overlap after the HMM optimization of two consequtive regions
            #then re-update the plot file not to print extra bases
            if($to_prev>=$from){
                $from=$to_prev+1;
                $newLeft=$to_prev+1;
            }
            $to_prev=$to;
        }

        if ($line =~ m#(colour=\d+ \d+ \d+)#){
            $colour=$1;
        }

        if ($line =~ m#(note=.+)#){
            $note=$1;
        }

        # The score line closes a feature. Accept integers, decimals and
        # exponent notation so that e.g. "score=7" is not silently dropped.
        if ($line =~ m#score=(\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)#){
            my $score=$1;

            print {$opt_fh} $FT_KEY,  "$newLeft..$newRight\n",
                            $FT_QUAL, "/$colour\n",
                            $FT_QUAL, "/algorithm=\"alien_hunter\"\n",
                            $FT_QUAL, "/$note\n",
                            $FT_QUAL, "/score=$score\n";

            #begin -> from
            print {$plot_fh} "0\n" x ($from-$st) if $from>$st;

            #from -> to
            print {$plot_fh} "$score\n" x ($to-$from+1) if $to>=$from;

            $st=$to+1;
        }
    }

    #to -> end
    print {$plot_fh} "0\n" x ($Genlen-$st+1) if $Genlen>=$st;

    close $opt_fh  or die "error closing '$tabFile.opt': $!\n";
    close $plot_fh or die "error closing '$tabFile.opt.plot': $!\n";

    return;
}

1;
alien_hunter-1.7/demoSeq.dna0000644000265600020320000007465211022245021015246 0ustar tilleaadmin>selected bases gtcagcacggcgcgccggtcattggtttaacctggataatggattcaccgttggtggcgc attgcgactatgtggaaacgtacacgtttggcgacggtaaagatattgccggagagaaaa cgatgaaaggcctgctgagtgcggtcgaactgctccagcagacggaagggtatgcgcact acgacgattttcaggatggcgtcagcaaaatcaaccgtatcgtctggcgtgcttgcgagc aggtagcggagcgtgcgcaggcgttcgcgcaggaatataaagacgataaagtcatttaca ccgtcgccagcggcgcgggctatggcgcagcctacctacagagcatctgtatctttatgg aaatgcaatggatacattccgcctgtattcatagcggtgagtttttccacgggccctttg aaattaccgatgcgaatacgcctttcttcttccagttttccgagggcaatacgcgggcgg tggatgaacgcgcgttaaacttcctgaaaaaatatggccagcgatattatcctgaaactt tcacagaaggtgaatgcgttattggcgcgggatgatgttgacggtgtggtcattactcat ggcactgacacgctcgatgaaaccgcctactttcttaatttgaccgtgaaaagcgacaaa ccggtggtgtttaccgctgcaatgcggcccgcgtcggcaatcagcgccgatggcgcaagg acagccagggctggtgtctgactcgctcaacccggcgaaggcgcgggtgttactgatgat ggcattaactcagacgcgtaatccggaactgatccagagttatttcagtacgtattaatc atgaaggcctgtgcgcccaccctcgaagatgggcgcacaagatattaacgcatggtcaca aactcttccgatgcggtcgggtgaatagcgacggtattatcgaagtctttcttggtagcg cccatcttcagcgccactgcgaagccctgcaacatttcatccatgccgaagccgataccg tggatacccacaattttttcttccggcccgacgcagaccagcttcatacggcacggctgg cggtgcgtcgtcaccgcggtatacatagcggtaaacgatgatttatagactttcacctgc tcgtcgccgtactgctcgcgcgcctgcggctcgcttaacccgacagtgccgataggcgga tggctaaagaccacggtcgggatgttgctgtagtccagatgctcgtccggcttgttgtta aacaggcgctcggagagacggcgaccggcggcgacggcgactggcgttagttctatcgcg ccagtattgtcgccaacggcgtaaatgccttctacgttcgtgttctgccatttatcaacg 
atgatgtaacctttttcattggttttcacgcccgctgccgccagattaatattgtcggtc gacggttcgcgaccaatggcccagatcaggcaatcaacggtttcactgcggccatcttcc agttccagagtcagactgccgtcggcgtttttcaccaccgctttcggcacggcgtgagta tgcagttgcgggccttcggcgttcatcacctcaaccagcgtttcgctaatcatcgggtca aaactgcgcagcggcgcatgtttacggacaaacagatgcgttttcgcccccagcgcgttg ataacgccagccagttcaacggcgatataacctgcgcctaccaccgcgacgcgctccggc aaggcagatagcgcaaagaaaccgtcggaatcaatgccgtattccacgcctggaatgctc ggatggctcggacggccgccggtagcgatcaaaatatggtcagcggtgatggtttcgccg ttcacttcaatggttttggcatcgacaaagcgggcaaagcctttaatcacatctacgtta tttttgcccagcacgttgtcgtatgaagtatggatgcggtcaatatacgcggtacggctg gcaatcagcttgctccagtcgaactgattaatcgtggtgtcaaaaccgtagtccgggcca tacagatgaatcgcttcacgaatctgcgcggcatgccacatcactttcttcggcacgcag cccacgttaacacaggtgccgcccagctctttggcttcgatgagcgcgcatttctggccg tacatagcggcgcggttaatcgaggcgataccgccgctgccgccgccgatagcgatgtaa tcatagtgtttggtcatgaccgctcctttatgttcgttgtcggacggatagattctgcgc tacccgccaaaatattcgcgattgtatacctgaaggtgaaaggttccaccgatggctgtg attactccggcacgatccagcttaccgaagtgtgtccgtgaccgtttggcgccaacctgc tgtgcagccacggtagcacgttgttcatctgctgctccagtttccacggcgggttaacca cgatcataccggacgccgtcatcccgcgctgatcgctgtccgggcggatcgccagctcaa tttgcaggattttacggatgccggtggcctccagctcatgaatcatgcgcttaatttgct ggcggagtaccaccggataccatagcgcgtatgtaccggtggcgaaacgtttataaccct cgctgatgccgctgactaccgcctggtaatcggttttcatttcataaggcgggtcaatga gaatcagaccgcggcgggaaaccggcggcaatttggctttcagttgctgatagccgtcag cgcgttccacgcgggcgcggttgtctttttgaaactccgcgcgcaacagtgggaagtcgc tgggatgcaattccgtgagttgcagactgtcctgctcacgcagcaactggcgggcgatta acggggagcccggatagtagcgtaactgcccgctgcggttgaaatgttttacgacgctaa tatacggttccagttcggcgggcagatcgtcctgctgccagatacgggcgatgccttcca gatactctccggtacgttcagcatgttcgctgcccaactgataacgccccgcgcccgcgt gcgtgtccagatagagaaacggtttttctttctcttttagcgactcgatgatcaggctct gaacggtatgtttaaggacgtcggcgtggttgccagcgtgaaagctgtgacgataactga gcatggatgcagatattccgggaagtaaacaagttagccgatagtttaccgcagatccgt agagattaccgctcttgcgcttttatcgccgttggccaggaggaaaaacaagtaggctgt 
ggcgtcagagcgtaaataacgtcaacgaagtgatgcaggcgtctggattggcctgtttca cggtcttaaggaaggtggcaaaatttcgattgaatagggtgtagttatgaaggaagcgcg tctgttttctttgcccttccgctgttatccagtcgactgaccagactgaacgaccactga ctctcgttgttcccggtttcgcctcaataatgctggtgagtggtatcgcggtggtgatac cgtccgcgacctcatagaggtactgcatgtcaaagtagtaatccgcgttatagccgcgag aactgccaaattttcgcagcatcagcgaagaagatgaaattaatttacggccatcttgca gatgcttactccgacggaaaatttttcgcccagggaaaataaagaaaacaataaatatca gtacaataacctgtaataacgtggtcatatacaatccttgtattccagaaacgaaaaagt tgagaatattatacatgggcagtttgcgcaaaaatgacagcgattgatgtcacggccctg ttttcagaaaaacacccatgaaaagcctatctgaatttaacgctaatcatgccgcgacgc gtccaccgacttacccctgtcgggtcttattctgaaatcgtttttattatgatgatagtg ctggttttaccttgtttatttgattaaaacgatagcgctaataatagaaaagaaaggatt cattgatgttccattatggaaaatgtgtcgttcgcacgttgtttttattactggtttgct gttcgctgtcgggctgtctttcatttgggctgatacttgccaccgataaagatggcaatc gtcatcagtctacctggaaatcggataccgttaccgcgctttccttaggtaaagatagca acgggaaaacgggctgggtgtttgttggcgaacattttgactacctgttgacccagggcg gcgataatgtggtggcgctgttaaaagatgcgactattcgccgggataaaatgcgagtaa aggatggtgttaaattcctgattgatacagataagaaggagtttacgggtgaagtcaatg tgacttacgcctgggtggacgagaaagataaactggcggcggtaggctacgggtttattt gtgcagacggcgcggtaaattgcacgctatccgtcatggacctgaaagggactatccatc agaaaaataaagagcagagcgttacgcagcagctatcgttctatcatcctttcaccgttg agttctatcagtatcagcgcagtatgtccggggcaaaatttggcagagtgttgctgccgg taactttggcgctggatattgtcactatgccgctacagctactgatcttcagtcgttaaa attggattcagcagcaatgcctacgttcggatgctggcgttgctgcacgggtgaaatgag cgcccaccatcattgaaattcccccaactatcccccattctagatccatagcgaaagcgc gatagagcgcttcatttcttttttagaccaggactgcgcatatgaccaatccattactaa cgtctttttcactgccgcctttttctgcaattaaaccggagcatgtggtgcctgcggtca ccaaagcgttggccgattgccgggcggcggtagaaggcgttgtggcgcatggcgcgccgt atagctgggaaaacctctgccagccgttggcggaagccgacgatgttctggggcgtattt tctcgccaattagccacttaaactcggtgaaaaatagcccggagctgcgtgaagcctacg aacagacgttaccgctgctgtcggaatacagcacctgggttgggcaacatgaaggactgt acaacgcgtaccgcgacctgcgcgacggcgatcattacgccactctgaataccgcgcaga 
agaaagcggttgataacgcgctgcgtgattttgaactgtccggcatcggcctgccgaaag agaaacagcagcgttacggtgaaattgccacccgcctgtctgagctgggcaaccagtaca gcaataatgtgctcgatgccaccatgggctggacgaagctcatcaccgacgaagccgagc tggcgggaatgccggaaagcgcgctcgccgccgctaaagccctggcggaagccaaagagc aggaaggttacctgctgactctggatatcccgagctatctgccagtgatgacctactgcg acaaccaggcgttgcgtgaagagatgtatcgcgcctattccactcgtgcctccgatcagg ggccgaacgccggtaagtgggataacagcccggtgatggaagaaattcttgcgctgcgcc atgaactggcgcaactgctgggcttcgaaaattatgcccatgagtcactggccaccaaga tggcggaaaatccgcagcaggtgctcgatttcttaaccgatctggcgaaacgcgcccgtc cgcagggggaaaaagagctggcccagctgcgcgccttcgccaaagccgaatttggcgttg aggaactgcaaccatgggatatcgcgtactacagtgaaaagcaaaaacagcacctgtaca gcatcagcgatgagcagctacgcccgtacttcccggaaaacaaagccgtgaacggcctgt ttgaagtggtgaaacgtatttacggcatcaccgccaaagagcgtactgatgttgatgtct ggcacccggaggtgcgtttcttcgaactgtatgacgaaaataacgagctgcgcggcagct tctaccttgacctgtacgcgcgcgaacacaaacgcggcggggcgtggatggacgactgtg tcggccagatgcgtaaagcggatggcacattgcaaaagccggtcgcttatctgacctgta atttcaaccgtccggtgaacggtaaacccgctctgtttacccatgacgaagtgatcaccc tgttccacgagtttggtcatggcctgcatcatatgctgacccgcattgagaccgccgggg tctccggtatcagcggcgtgccgtgggatgcggtcgaactgccaagtcagtttatggaaa actggtgctgggagccagaagcgctggcgtttatctccggccactatgagaccggcgaac cgctgccgaaggaactgctggataaaatgctggcggcgaaaaactatcaggcggcgctgt ttattctgcgtcagctggagttcggtctgtttgatttccgtctgcatgcggaatttaacc cgcagcaaggagcgaaaattcttgatacgctctttgaaattaaaaaacaggtcgccgtgg tgccgtcaccgacatggggccgtttcccacatgcgttcagccatatctttgctggcggct atgcggcaggctactacagctatctgtgggccgacgtactggcggcggacgcttattctc gctttgaggaggaaggcattttcaaccgtgagaccggtcagtcgttccttgataacatcc tgactcgcggtggttctgaagagccgatggaactctttaaacgcttccgtggccgtgaac cacagctggacgcgatgctggagcattacgggattaaaggctgattttacgtgcaaatct gcttaatggatgaaacgggcgccacagacggcgccttatctgttctggctgcccgctggg gactggagcatgacgaagacaacccgatggcgctggtattgacgccgcaacatctggagc tgcgcaagcgcgacgaaccgaagctcggcggcatttttgtcgattttgtcggcggcgcga tggcgcaccggcgcaagttcggcggcgggcgcggcgaagcggtggcgaaagccgtcggta 
tcaaaggcgattacctgccggatgtggttgatgccacggccggactggggcgcgatgcgt ttgtgctggcgtccgtcggttgccgcgtgcggatgctggagcgtaatccggtggtcgccg cgttgctcgacgacggcctgacgcgcggctacgccgatgcggacattggcggctggcttc aggagcgtttgcagttgattcatgcttccagcctgacggcgttaaccgatatcacgccgc gcccgcaggtggtctacctcgatccgatgtttcctcaccggcagaaaagcgcgctggtga agaaagagatgcgggtgtttcagtcgctggtggggccggatctcgacgccgacggactgc tggaaccggcgcgtcagttggcgaccaaacgcgtggtggtcaaacgtcccgattatgcgc cgccgctggcggatgtcgcgacgcccaacgccatcgtcaccaaagagcatcggtttgata tttatgccggaacccctctgacggagtaacggattgcccgatggcgctacgcttatcggg cctacgattctcaattatttgtaggccggataaggcgtaagccgcatccggcactgagta ttactgttcggtaccttgcgtattaatcattcggtttaaccacggcaccataatggccat caccacggtgaccgccagcgttaccagaccaattttgctaaagacgccggtataaatggg cagcgtctgtaacggatcggtgatgttttccggtacggcagtgaaggtcgccacataacc gccgagcaggaaggcggcggcctgggtcaggaaccacattcccagaataaagcccatcag atgctgcggcaccagagcggcgaccattgccagtcccagcgcgctaatcagcaactcgcc cagactctggaacagatacactagcacgataaaccacggcgacgtcaatccttgcgcatc ggcaaaccacatcccggcggcggcggtcagaaaacccagcgcgcagaggaacatgccgag cgtaaacttcatcggcatagtcagatctttgcctttgctacccagtcgggtgtaaatcgc tgccagtaccggactggcgacgaccacccagaatgggttcagcgcctgaaaactcaccgg gttaatggcgaatccaagaatttcatgatgcacgttattaatcgcaaagaagttcagtga ggtaggcatctgcgcatacagaatgtaaaacagcacggcttcaatcatcagaataaacgc cacgaacattttattgcggccggttttatccagacgaaacgcttcgcgaaagaagaaaat agtgacgacgatagaaaggacgatgagcaccagattggcaatcttaacgttgtgcatcag ccaggcgcagaggaaaatcatgacgacggtgccgagtagtaccagcagcaaattgcgaaa acgtagcggtttatgatccggttcagaaccgatatttttcaccatgccacggcaggcgaa gtacaccagaagcgcaacaattaaaccagcgccgcacagattataggtcaccgcatagcc aaatttatcggcaatcaccggcgccagcgatagcgataacaaagaaccgatgttaatcga catataaaacagggtgaaagcgccatccagccggggatctttaggctgatagcatttaga gagcaggctggcgggattggctttaaataacccgttgcccacggcaatcgtacccagtgc gataaaaatcagatcgggatttaatagcgacataccagtcataaaatagccaatcgccag cacaatcgcgcccaggaccagggtgcgtttagtccctaacagatggtcgccaacatagcc gccgatggagatcaggccataaaccagcgccgcaaaagcgccaaaagtaataaaggcctg 
ttcctgagaaaaacccaattgtttaacgaaaaagaccgccaggatgccctggacgccgta atagctaaatcgttcccataattctacaaaaaagatcatgaagaatggacgaggttgctg cagcaagcccgtaggtgcagttgtattcatattccttcgccttttaatgccatcccgaaa cgtatgaacgcgtatatcaaaacgcgccgaaaagtcttaccctgatatagactgcggtta ttagcctgatcaataggttattatacaaatgatacgcaaactaattagtggatcacacta attatgtttggtgtgatggaatgatgtactgagtactgcgaagaaggggggataaacaaa acgcccacataagcgggcgttgttgttgatagccggagtttgcctgatggttcaggccaa cctgattgactatagaccagatgcggtcttagccgccagccggcacggtaagtattactc ttcttcgtcgcgcagcggaacaatcagcatgtcaacgtgaacggtgttgatcagctggcg cgcagaagacatcagtttgctccagaagtcctggtgatgaccgcaaacgaccagatccat atcgtatttcttgatggcgtcaaccagcacctggcccaaatcgccgctaccgctcagggt ttcagtgatagggtagccagcgttggtagacagctcggtcagcgcgtggtgggtttcttt ggagatacgtttctgcatatcgcccagattgacgtcaatcagaccggtgtacaggtctga atagttcacatcaacgtggatgagggagattttcgcgttgtaggggcgcgccatagagac cgctttttcaaccagaactttactttccggggagagatcaactgcgataagaatgtgttt ataagccatagtgttactccttccataagttgtcgatgaccattgagctactggcgtttc ttatgccgccgcgtcacccgcgtcctgcgaactctatgcgcggggctagctcagccgttt accataactatccttacattatagcgaccaggataatccgtcaatctgccttgcttgcca gttagttaaacaaaattttcagggaattgcattgatagtgattaaccttctggtaaaaaa attaactgatctcctacaatgtgtataagagccgttcgaatgcggagcgctaatatggac ggctttcgtagtactctttcactgattgtctggcagggtatcggggagcggtagtcccgg agaggaactccgtgggcgggtcgccggggaggagatatgataagcaccgtctcactattc tgggctttatgtgtcgtttgcattgttaatatggcgcgctatttctcatcgctacgcgca ctgttagtggtacttcgtggttgcgatcctttgctctatcaatatgtcgatggcggcggc ttctttaccacgcatggtcagcccaacaaacaggtgcgcctggtatggtatatctacgct cagcgctatcgcgatcatcatgatgaagaatttattcgccgttgtgaacgcgtgcgccgc cagtttctgctgaccagcgcgttatgcggtttggtggtggtgagcttgattgcgctcatg atttggcactgaataaaaaaagcgggccagtttcctgacccgctcgtattagataagcct ccggatgacgccgcttgtagcggcctgtcaggcctgtgtcgtttcatcctgacaggccgg attagcgacagcgctatccggcccatttggcttaaataatcttcagggaaagccagtaca gcacgccggaaagaataatcgccgccggtaacgtaaatacccaggccatcagaatgctgg tcacggttttacgctgtaacccgccgccatcaacgaccatggttcccgccactgaagagg 
agagtacgtgagtggtagaaactggcataccggtgtagctggcaagaccgatagagactg ccgccgtcatctgcgccgacattccctgcgcatacgtcatgccttttttaccaattttct cgccgatagtggtcgccacgcgacgccagccgatcatcgtaccaatacccagcgccaacg ccactgccataatgatccacaccggcgcatactcaatggtgcttaacatatcggtcttca gctttttcagcaggcgctggtcatcgctgcttacgccaggcaatttcacgactttgtcgg tcgtgtcggaaatgcacagcataatgcgacgcaactgactacgctgttcaacgcttaact tgtcgtagctttccacgtttgccagcatccctttcgcacggttaagcgcgttaatggtgt tagccggatggcagtggaattcagtcggttcagtcgcgcccggttccggagagggaatca gttggtcaacgccggtgacggctttcagcaggtcaggacgctgctcaaagtacgtttcga cattgttgatagcatcgcgcgtacgggtgatttcgtagctggaggcattcatattgacca cgaaacccgcaggcgcgacgccaatcagcaccagcatgaccaggccaatgcctttctgac catcgttcgcgccatgcgaaaacgccacgccgatagctgagaggatcagcgcgatacgcg tccagaatggcggctttttcttgccgtctttcttttcacgttcggctggcgtcaggtgga tacgggcgcgtttcttagtaccgctccagtagcggcgcagcaagaaaatcagaccgccgg cgaataccagaccgacaataggggagataatgagtgaaccgaaaatattgataactttcg ggatgttaagcgcatccaccactgacgtgccggtcatcatcgcattggttaaaccgatgc cgataatggcgccaatcagcgtgtgggaactggacgccggcaacccgaaataccaggtac cgaggttccagataatcgccgccagcaacatggaaaagaccatagcgaggccgtgcgcgg aacccatgttgagcaggagatccgtcggtagcatatgcacaatggcatacgccacgctga gaccccccagcaaaacgccaaaaaagttaaacaccgccgccatgacaaccgccagctgcg aacgcatcgcacgggtgtaaatcacggttgcgactgcattggctgtatcatggaaaccgt taatcgcttcgtagaacagcacaaaagccagagcaagcaataataaaagcccggtatgta aatccaggccagcaaacaaatgtagcataggacgttacgccattttgaggacatgaacgc ggcgcattatcagtgactttggcggcgcgggcaaagtgaaatatagactttttttgatgt acttcgtgctttgtttgatatcactaaagaattatcttatatatttcagataaataccat tttcaggcgcttttcgcgctggtgcgaccactgaggaaactttacaattctcgcccgaat ttgagagaggaaggcgacgtggaaaggtttgatgctgttattataggcgctggcgcagcg ggcatgttttgcgccgcacaggcaggacaagcgggtagccgcgtgctgctcatcgataat ggcaagaagccaggacgtaaaatcctcatgtccggcggtgggcgctgcaactttactaat ctttatgttgagcctgctgcgtatttgagccagaacccccatttttgcaaatcagcatta gcccgctatacccagtgggactttatcgatctggtcggcaggtatgggatagcctggcat gagaaaacgctgggacagcttttttgcgatgattccgcccaacgcattgtcgatatgctg 
gttgccgagtgcgacaaaggtggcgtaacgatgcgcctgcgtagcgaggtattgagcgtc gagcgtgatgagtcgggtttcgtactggcgttgaacggcgagacggtgactacgcaaaag ctggtgattgccagcggcggcctgtcgatgccggggcttggcgcatcgccgtttggctat aaaatcgccgaacagtttggtctcaaggtgttgccgacgcgcgccgggctggtgcccttt acgctacataagccgctgttagaacagctccagacgctgtctggcgtctctgtgccctgc gtgattaccgcccgcaatggcacggtatttcgggaaaacctactttttacccatcgtggg ctgtccggccccgccgttttacagatttccagctactggcaaccgggcgagttagtgagc attaacttattgccggatctctcgctggaagacgttctcaatgaacagcgtaacgcgcac ccgaaccagagtctgaagaacacgctggcgatgcatctgccgaaacggctggtggagtgt ttacaacagttggggctcatcccggatgtatcgctcaggcagttgaacgttcgtgaccag cagacgttggttgacacgctgacggcctggcaagtacagcctaacggtaccgaaggctat cggacagcggaagtgacgctgggcggcgtggatacaaacgaactatcatcgcggactatg gaggcgcgccgcgtgccgggtctctattttatcggcgaagtgatggacgtcaccggctgg ttgggcggctataacttccagtgggcctggtcgagcgcctgggcctgcgcgcaggatttg attgcgttaaaaattaactaattgatattatgtgtttttcggtctcatttgactgatcgc cgtatatcttggcctatagcatcttttagtcagaagtattctggataaggggcactcatt ttgcatggcatgaccaccgatttcgttttaacaatggggaatgcggacggtagttttcgc tataaggtcgtggcagtaaaaccagataaattggtttcagctcgggagcctgaataactt tagattgaacatctgttctggatgttactttgggttgatttcccgatttgcgttgactca aaacagaatcaagtttagagataaaaatgtgttagacgatatctataatcagaatgaaac atgtttaaccattttcattatgaatgcattaatctcagctgggttttgaagcagaccgca ttcaggtcggatactggccaacaacaccagatacagaataaggcccctgacagggccttc tggtagtatgtagtcgatatttacactgtttacaacggggagtcatactttcttattaat cctgtaaagcaaaccactgcacagtgcgccagcaaaaagaacaactgcaatagtggcaaa catcagcgtatatgcaaattctttatacgtatcaagaaggtatccgaagataagatggac ataagcttccgggctgaatccgatgcagattaatatccccattgcggtcccggtaaacac tttcggtatgcgaatttcatttacctgcgcatacattataccgcgcatgccaaacgtaag cgctcccagtaataaaaccagagtaataagcaggatcagaaatcgtgagtcatgaggaat aaatataaaagatgcagttatcaatgcaccgagaataaagaagctggccataactttcag ggaagaaccaatcttatctgcaagtaggccagcaaaaggcgaaataataaaagcgagaaa atatgccctaatcatcccgataacggcagtttggtcgggagtcattccataggcatcgga gaggtatggtaccagataagctccgccctggtaaacaaaaataacacagctgataatgaa 
tgctgccagccaaagttttacacttttgagtagtgaaataagatccgacatttttttagg tgccgcattttcaatagcactaacacctttcggaaataaataataagaggtgaatcccag aataatataaatggctgcgtcaaatcgtaaaataaaaccaatgctgacagcactggagcc atgcagaccaacaatagccacggcacataacgtcagaacagtgccggcaattccacgacc ggactccagaaatccaaaataccgcccctgctcattgtctgatgcaaggagtctggcaat ttttgtcatggaaaccaggaacagtccattaccgactaaggacataaatccgacgataat aaacatggcctgataggaaggccggaaggactgccagatggcaagggccgccattgtgag ataggcaatggtaagaatacttttaggatttttgaaacggtcaacgatgatgcctgaaac agcacctgatataatgcctacaaacccgagccagctcattaacatgcccgattgtgtatt tgtcagtgaaaagtattcattccacggatagtagaatacttcacgcaaatacatagcctt gaatattgttcccccaccgactgccagaagaataaaaaaatcaggttatgccttttaatc gatttgtcttctgcatgttccatatgtccgcctgtataactagacgttattgtagattga tccgttagtgtcttgtgttattttaatttttgtatgtatttataaacagtgggttcagat atgttaagttttttggctactttcgttactgcccctttaagcagaaatacgccttcatca tttagtaatgaaataatatggcttttattttcagcattcagtgactctggattaatattt tcaatattgattacattattaattttgagatcaaccatttcaggaatggtcagtccaata tgttcatcaagctgagtgccgttattattgtcagtctgcccgacaaaaatgcttaaaaaa tgactaagtttatcattcagggcaattaaatcttcgacgacggtattcagacatagtgca ccaacaatttctccctgcttattccggataaaaaaagtggcacacttaagccgtctgcca tcgcgtgtcttgctgagataattacactggtaatgtacctcactgtcattgttcttcatg gcacggactaccagatcggtgatgggatctccgacagagcgaccgctgatatgtccattt ctgattttatgtatagaagactcaagatcagaaagattgtgcagaacaacttccgtgttt tcgccaaaataatcggccaggaaatcgatcaaaaattcatagggttcaagacaagtgttt ttcatggttaatctggttgccacctcagtggcaaccacctttgttattgacgaaatgcca cagcttcaatctcaactctggcacccataggaagtgtttttacagcaaaacagctgcgtg ccggcggatttatcttaaaaaagctggcatagacatcgttgaacgcagcaaaatcagcca tatcactcaagaaacaggtcgttttcagtaccgtatcagtatcaccgcctgccatcttaa ctacagccagaagattaagcagtgactgacgggtctgttcctgaatatcatcagtttcaa ttttgccactgtcagcgttaatgggcaattgtccggaggtaaaaattaaattacccatcg cattgccctgactgtatgagccaattgctgccggtgctgtttctgtaaaaataatctctt tcacttctgttccttaaaaatacttactgaatgccttttccagacgactcagtgcttcat caatacgtgactccggaacaccacagttaattcgggcaaatccggtaccggcaacaccaa 
actgatctccccggttaaatccgagtcccgcatcctgcactaaaaattcctgaatgcgat cacctgaccagcccgtatcattaaagtcggcccatacaagataagtggcttccggcagat tggcttttaccggatatggcatgttattgagcccgttaaccagtttaatctgcattttac ggatatgattcaccacctcttcacgatagtcatggtaatatttatatccggcataggcgg cttcatgcgcgaacgcagtgggctgaaaatgtaattctttcaggacaactgccatctgtt ttttaaatacaggatcggccgtgaagacataggaactaaaaataccgccaatgttaaaca gcttccctccggataaaaaaacgataatgccatttaaattttccgcagctttcatgaccg gcgtatagcatttttcgtcatagacaaaatctgcatgaatttcgtcagaaatcaggattg ttcctgttttctgggcaattgcggcgactttcctgacttcctcttcagtccagcaacgac ctaccggattatgagggttacagaaaatcagaatcttattctccgggcgtgaacaggtat cttccagcttttcaaaatcaatagtgtaatgattgttttcatcacgcagcagatcacact cagttacagcacggtggttttcacggatggttctggcaaaagaatgatagaccggtgtca taaccacaacggattcattacactgtgtcattcccctgataaaagcgcccatcgtcatca gtaaaggcggtgtttcaataatatcagccgtatccggccgccagtcataatatcgatcat accaggatgccagcacatccttaaaatcatcatgataggccggatatccgaattctctgc ggttacagacttcaatcagtgcctgacgaacctccacgggagaaactaaatcggtatctg caacagacataggtattactggaaatccattaatgacactgtcatattcccatttttctg atgctattctctgatagatttcattaaaatcgtatttcattttattttccttaatttatt cagccgcagtccggaaagtatctgccggtttaccatccattttcttaaatgccgttttgg cattgcggagaagtaaaagtgaagtgaatacagcgagagcaaacatggcacccatgaaga taaacatgtaggtgtaaccctgtactttatatgcatcaagccagtacccgaataacgcat ggataaacatttccggagagaatcccacagtggccatgatccccataacggtaccggtca gacgaacaggaaccttcatttcctcaatttgcgcatacatgatcccgcgcagagcaaagt taatggagcctacaatcatcagggtgccgatcaggacccataccatagccggatcctgtg ggatcaccaggaacatggcgacaccaatcaccccggcaataaacagccaattcatcacct taatcgctgagccgactttgtctgcgagcagacccacaaccggtgcaatcaggatagcga ggacataagcacggatcataccgataattccggcatgttcggcggtcattccgtagacat cagacagataaggaaccagatatgaggacccctgatagaaggagatggttgcaaagatgg agaaagctgcaaaccagacttccttaaccttcatgacagccagaatatcgtgtacagata ttttttccttggccttttcttctccggaagcagaattatggttatcacggggcatggcga accacatcagaacacccagcgctaggtagagaaagccgtatgcatgtagcacactgccaa tcccttccacttcggttgcagacatggcgaaccatgctacggcaacggcagagatggctg 
acccagcgataccacgtccggactccaggaaaccaaacagcttcccctggtcactggcat taccaatcaggcgggttgcgcgcaccatcgatactaggaaaagaccatttgcgaggaaag atagtatgccgatgatgacatactgcgtagccagacccggtgctgttgactgccataatg tggttacgcccacagccagaaaggtaaaagaaataatcagtcgggtattgttcaccttat cgataatcaccccagcaattgaagatgaaacaatccccacaaaccccagccaactcatca gcaggcctgaatccgtgttattcactcccatgaattcattccaggggtaataaaatactt ccctgaggtacatcgatttaaaaatggtccctcctccgattgccagaagaatgaccatca accatttttgaaaattggatgttttcataaaccttcgcctgtccataaataagaaacaag tgttcatgtgagaatataattttttatacacatgaatattatgaagcacttcacagagtg ggcaaaaaagataaaaaattatctcttctaatttaatatcattaaaatcatgtagttata tgtataataaattatacatatgtgtggggcattaattatttacggttgtaattttataaa aattataaatttttatatttcaggacgtaactgtctgacaacatatatgctgaatagggc ggagaataggggcaggaaagtggataaaatattatcgtgttttacctccggcaggagcgt ttccttattagaagtacgaatatttgatgccgtgggctgtatccatgttttttagccggg cccggcgacttgagcaggggtaatcacgcatgctttcgttaccagccccttcggtaaata gattgacgggcacgattttttgtctgcaacatctgtgctacaaccgaatagacatattga atccaggtgttagtattcaatagacagcagttatgagtatgcgtgtttcatttccggcca ggaacgctgatgatgacaacattactacccggtttcacagcattttcgtcattcacttcc ttcgcggcgcgcgttgccttaacgctcgctcaatggccattgtttttcattccttcttat cccggatggaaacccccgctgagcgcacgcccggcggactttgtgacacgcatggttagg caacagggattacggcgggtcgattatggacaaaatgaagcgtcatctggtgtggtgggg agcggggattcttgtggcggtggccgcgatcgcctggtggatgctgcgccctgcgggaat accggaggggtttgccgccagcaatggcagaattgaagccaccgaagtggatattgccac taagattgccgggcgtattgataccatcctcgtttcggaagggcagttcgttcgtcaggg cgaggtgctggcgaaaatggatacccgcgtattgcaggagcagcggctggaggcgattgc gcagattaaagaggccgagagcgcagttgccgccgcccgtgccttgctggagcaacgcca gagtgaaatgcgcgccgcgcagtcggtcgtcaaacagcgggaagctgagctggactccgt ctctaaacgtcacgtgcgttctcgttcgctgtcgcagcgtggcgcggtatctgtacaaca gttagatgacgatcgcgcggcggcggaaagcgcgcgcgccgccctggagaccgccaaagc gcaggtatctgcggcaaaagcggcgattgaggcggcgcgtaccagtattatccaggcgca aacgcgcgtggaagcggcgcaggcgaccgagcggcgtattgtcgccgacatcgacgacag cgaattaaaagcgccgcgcgatggacgtgtgcagtaccgcgttgcggagccgggcgaagt 
gttatcggctggcggccgggtgttgaacatggtcgatctcagcgatgtttatatgacatt tttcctgccgaccgaacaggcggggttgttgaagatcggcggcgacgcgcgactggtgct ggacgccgcgccggatctgcgtattccggcgaccatcagttttgtcgccagcgtggcgca gttcacacccaaaaccgtggaaacccacgatgaacggctgaagctgatgttccgcgtcaa agcgcgtattccgccagagctgctgcgacagcatctggaatatgtcaaaaccggtttgcc gggaatggcctgggttcgtctggatgagcgcgtgccctggcctgacgatctgaacgtgag gttgtcgcaatgacgtcgctgacgctggtgcctgttcctcccgtggcgcagcttgagggg gtgagccagcattacggaaaaacggtcgcgctgaacaatatcacgctggacattcccgcc cgtagtatggtcgggctgattggtccggacggggtgggtaagtcgagcctgctgtcgctg atttctggcgcgcgggtgattgagcaagggaacgttatcgtcctgggcggcgatatgcgc gatgcgaaacatcgccgcgacgtctgtccgcgcatcgcctggatgccgcaggggctgggg aaaaacctttatcacacgctatcggtttatgagaacgtcgattttttcgctcgtctcttt ggtcatgacaaagctgaacgcgaagcgcgtattaccgagctgctgaacagcaccggcctg gcgccgtttcgtgatcgtcctgccgggaagctgtccggtgggatgaaacaaaagctgggg ctgtgctgtgcgctgattcatgacccggaattactgattcttgatgagcctaccaccggc gtggactcgctctcccgcgctcagttctgggacttaatcgacagtatccgccaacggcag accaatatgagcgtgttggtcgcaacggcctatatggaagaggcggagcgttttgactgg ctggtggcgatgaatgcgggcgagatactggcaaccggcagcgcgcagcaactgcgggca aaaacccatagcgcgacgctggagcaggcgtttatcgccctgttgccagaagcgcagcgc caggcgcataagccagtggtgatcccgccgtatcacgctgagcaggaagagattgccatt gaggcgaaagatctgaccatgcgcttcggtaagttcgtcgcggttgaccatgttaatttc cgcattccgcgcggcgagatttttggcttcctcggttcaaacggctgcggcaaatcgacc accatgaaaatgctgaccggtctgcttcccgccagcgaaggccaagcctggctatttggc cagccggtagacccgaacgacatcgatacccgtcgccgggtcgggtatatgtcgcaggct ttttcgctctataacgaactcaccgtgcggcagaatctggaacttcatgcccgcctgttt catattccgccagcggagatcccggcgcgcgttgcgcagatgatcgaacgctttatgcta acggaggtggaagatacgctccccgcttcgttgccgctcggtatccgccagcgtttatcg ttggcggtagcggtgatccatcgcccggaaatgctgattcttgatgagccaacgtccggc gttgatccggtcgccagggatatgttctggcagcttatggtcgatctttcgcgtcaggat aaagtgacgatttttatctcgacccattttatgaacgaagcggaacgttgcgaccgaatg tcattgatgcacgccggtaaagtgctcgccagcggtacgccgcaggaattggtgcaacag cggggcgcggccaatctggaagcggcgtttatttcctggctacaggaagctgcgggagcg 
gcgcctgaaaccccaataccgccatcccagacgcccgccgcgtcagataaaccgtcgcgg cagggattgagcttccggcgtttgttcagctacagtcgccgcgaagcgctggagctacgc cgcgatccggtacgctcgacgctggcgctgctgggaacggtaattctgatgctgattatg ggctatggcatcagtatggacgtggagaacctgcgttttgccgtactcgaccgcgatcaa accgtcagcagccaggcatggtcgctcaatctggcgggatcgcgctattttatcgaacag ccgccgctcgccagctatgacgagcttgaccggcggatgcgttcgggagagttggcggtt gccattgagatcccgcctaattttggccgcgatattgctcgcggtacgccagcgcagatt ggcgtctgggtggatggcgccatgcccagccgcgccgagacggtgaaggggtacgttcag gcgatgcaccaaagctggctgcaggaggcggcaagtcgtcagccgaacccggttaaacaa accgggctgctcaacattgaaacgcgctatcgctataacccggatgtgaaaagtctgccc gctatcgttccggcggtcattccgctgctgctcatgatgatcccgtcaatgcttagcgcc ctgagcgtggtgcgggagaaagagctggggccgatgattaatctgtacgtaacgccgacg acgcgcagtgaatttttgctgggtaaacaactgccgtatatcgcgctgggtatgctgaac ttcttgctactgtgcgcgctgtcggtctttgtgttcggcgtgccgctgaaaggcagcttt ctcacgcttaccctcgcggcgctgctttacgtcatcatcgccaccggtctggggctgctg atctccacgtttatgaagagccaaatcgccgctatttttgggacgtcgattattaccctc attccggcaacacaattttccggcatgatcgatccggtggcttcgctggaaggaccggga agatggatcggcgagatctacccgaccagccattttctgacgatcgcgcgcgggacgttt tccaaagcgttggatctgtccgatctctggtcgttatttatgccgttactgattgccgtc ccggtggtgatgggtctgagcattctgctgctgaaaaaacaggaggggtaatgcgcggat tacgtaatatttataacctcggcgtcaaagaactgcgcagtctgctgggcgacaaagcga tgctggcgctgatcgtcttcgcctttacgatttcggtttactcctccgcgaccgtcctgc ccggttcgttgcatcttgcccctatcgccatcgccgatatggatcagtcacagttgtcga accgtattgtgaatagtttttaccgcccgtggtttctgccgccggagatgattacggcaa cggaaatggatgccggactggatgccggacgctatacctttgcggtcaatattccgccga atttccagcgtgatgtgctggccggacgtcagcctgatatccaggttaacgtcgatgcga cgcgcatgagccaggcgtttaccggcaacagttatatccagaatattattagcggcgagg tcaacagctttgtggcgcgcgcggggagatagcgttcagccggtgtcgctggaaattcgg atgcgctttaacccgaatctggacccggcgtggtttggcggcgtgatggcgattatcaac aacattaccatgctggcgatagtgctgaccggatcggcgctgatccgcgagcgagaacat ggcacggtagagcatctgttagtgatgccggtcacgccctttgagatcatgatggcaaaa gtgtggtcgatggggctggtcgtgctggtggtttccggtttgtctttgatgctgatggtg 
aaaggcgtacttggcgtaccgattgaaggatcgatcccattgtttatgttgggcgtggcg ctgagcctgttcgccaccacttcaattgggatttttatgggcaccatcgcccgttcgatg ccgcagctggggttattaatgattctggtgctgttgccgctacaaatgctctccggcggt tccacgccgcgcgaaagtatgccgcaggcggtgcaggatattatgctgaccatgccaacg acgcattttgtcagccttgcccaggctattctttatcgcggcgcaggtttaagtattgtc tggccgcagttcctgaccctgctggcgataggcggcgtgttcttcctgatagcgctgtta cgcttcagaaaaacgattggcacgatggcgcaaacattattgccggatggcggctgcgcc tcatccggcctacaaacgacgacgtgcgaacgtaggccgggcaagcgaagcgcccccggc aaggcgctggggcgttagtcttctttcggcaaacattgcaagtggccgtggcgcacgccg cgctgggcgataacatcatcggcaaaatgctgtacgtcgcccatatcgcccttcagaacg gcgatctccagacagtcatcgtggttgatatggacgtgcagcgtggcgacggataaatca tgatgatgatgctgggtggagacgatgcggctggctaaatcccgtttctcatgttcataa acatatgaaagtaccgcgaaaccctgtgtaccgtgttcctgggtggcctcctgcgccaga gcgccgcgcagaatatcacggatcgcttcggagcgattgttatagccgcgtcgctggctc aggctgtccagcgtttccagtaaatcgtcatcgagggtgatggtgacacgttgcatctga cttaaaccttttggtatggcgcgtcggcgcatgggaagctccgcattttgtaacacacgt ccgccgccagagggaaaattaatgtgtcataacgcttttgttatcgtctgcggcgtcagc gtaaacggtgtgggggtacagactgccagactgagcgtatcgagctgacactggctgacc gacaaggcagagggcaacgtgctgtccacgctgacgatttgccaggcgctgccgccgcgt tgttttacgatggcttctttacgcgtccagatacgccagaaggccgccagttgtcgctcc ggacgttccgcttccatttcagcatgttcgccgaggctgaataccgcatttgccagcgaa cgccagttatcgcgcgggcggatcacttcaatatcgcagccgacttcaccttcgtcgcta agcaacagagcaatagtgtcgccgctatggctgaggttaaaccacagcggcgttccggcg gaaaaagccggtttaccctgttcgccatacaccatctccggtagcggcgagagcgcacgg gaaagcagcacgcgcccggccagccagcttgcccggcgaacgccctgcggcgcctgagcg attaacgcgtcgggcaattgacctgcgcttaatgttgacacttttcccagaacgatctgg tacatatcaggaccaacgttgcctatgaagaagaggagattgtagcatctgtcgcctcat cccggtaaacggcttttcggtcggcctggccccagtaagatcgccagtttgctgccgcct ttagtggtttccatccagattttgcagacgctggtcaaaggaacggacaagagcatcccg accgggcctaatagccatccccagaccagtaacgaaagaaataccaccagcgtggacatc cccagacgatggcccatcatgcgtggctccataatgttgccaatcaccatatgaaccacc aaaaagagcgcgccgaccagtacgcattcgtaaaagccgttaaataacaacgcctgaatc 
atggggggaacggcggaaatcaccgagccgatattaggaacgtaattgagcaaaaaggcc agtacgccccacattagcgcaaactgaatatccattaatgccagccccagccagataatc gcgcccgtccatagactgagcagcgttttcagcgccagataatgcgagacgcctttcagg gcgcggtgcaggccggcaatatggatttgtggattgttaagcgcgaaacgtaatttataa ggcacgtgacgcacctcaaacagcataaaaaccacggtcatcaccagcagcacaatgctt gccatcgcgcccgatacgccggtcattaatgttgtggtgaagagcatgattttatcggaa tccatcccacggagcatacgttccggcgacatatgtaaattcaggaagggcattaactcc tgaagatgtaaaaccttacgcgtcagctccttgctgtatttaggcagcatagcgataaat tcattaagcgatgcggcaagcacacccaccagcgcggtaagcacgatcagcatgacgacc accacgatagtaatcgccagcggtcgcttcacgccccggcgaataaaccaggtcactaat ggattaagaacgatggcgaaaaagagcgccagtaagagttgtacaataatatccgccgcg gcatgaatcccggcgagaataataaccagggaggccagtttcagcagaatatgcataccc gttttgtcgggttgagcggttgccatgaagttttccttgttgtgtttttttctaagtgta gcggctgccctgataccgataatctgaatctcactactgattttcaccgcgcgccatgat agaaaagaaaacatgttgtaaaaatgcggtgatactcatgcccgaacccgtcgctgaacc ggcgctaaacggattgcgcctcaatctgcgtattgtctccatcgtgatgtttaactttgc cagctacctgactatcggcctgccgctcgccgtcttgcccggctatgtgcatgatgcgat gggattcagcgcgttctgggcggggcttattatcagcctgcaatacttcgccactctgtt aagccgtccccatgccgggcggtatgcggatgtattagggccgaaaaaaatcgttgtctt tggcttatgcggctgttttttaagcggactcggctacctgctggcggatatcgccagcgc ctggccgatgatcaatttgttgctactggggctgggtcgcgtgattttggggattgggca aagttttgccggcaccggttcgacactgtggggcgtcggcgtcgtcgggtcgttgcatat tggtcgggttatctcctggaacggtatcgtcacctacggcgcaatggcgatgggcgcgcc gctgggcgtgctgtgttatgcctggggcgggttacagggactggcgctaacggtgatggg cgtggcgctgttggcggtactgttagcccttccacgtccgtcggtgaaggcgaacaaagg caagccgctgccgtttcgcgcggtgctggggcgtgtctggctgtatggtatggcgttggc gctggcctcggcagggtttggcgtcatcgcgacgtttattaccttattttatgatgctaa aggttgggatggcgccgcctttgcgctcacgttatttagcgtcgcgtttgtcggcacgcg tttgctgttccctaacggtatcaatcgcttaggcgggttgaatgtcgccatgatctgctt tggcgtggagattattggtctgttactggtggggacggcagccatgccgtggatggcaaa aatcggcgttttactcacggggatggggttttcgctggtctttccggcgctgggtgtggt ggccgtcaaagccgtgccgccgcagaaccagggcgcggcgctggcgacctataccgtctt 
tatggatatgtctttgggggtgaccgggccgctggcggggctggtgatgacctgggcggg cgtgccggtgatttatctggcggcggccgggctggtagcgatggcgctattactgacctg gcgcttaaaaaaacggcctccgtctgcactgccggaggccgcatcatcgtcgtgaaggtt acttaataaccaacgtattgatgatgttttcagctgtggtttgcgctttttgctgattgt ccgcaggcagcgtgacctgaatcgtcagcaattggttatccactttgcccagcacgatgg aagagtacgccgtctggcctttcgcggagataatgctgtccagctgttgcaacgtatggc ctttgagctcaatagacttattggtgacgacctgaagctgcgggtcgcggctacgctgtt gctcaagcaggcggttcgccagcaccggcaacgcttcatcggtattatcgccgacgataa caatcaccgctttctggcccgtcgggtcagaataaacgtgcatattattcgcctgggtgc ccagcttaccgctttgatcggtcatatccgccggcagtgagaaactgagtttgccatcca tcaggctgatgggctgaccggaggcctggctttctgcggcggcgctttctgccgacgctt ttgtatcgttgttatcacaggccgcaagccccataaccagcaggccaataccggcatatt taaccaaattgcgcattgacttcttcctttcgataaacggccataacggctcattcgctc atcttatcacaactcggaaaacgaacctttaacttgccggtaatggggaaatttcagatt tatctgccagctttttcagcaacatattcagtagaacgccgtacatcggcaggaagaaaa taatgctgatgagcaccttaaagcaataatccaccagcgcaatttccatccagtgcccgg ccataaaggcatcggggcttcgccagaacgcaatgaagaaaaaggcaagggtatcgctga tattgccaaacagcgttgaagcggtcggcgccagccaccagcgacgattttggcgcaggc ggttaaacacatggacatcaaggatctgtcccagcgcgtaggccataaagctggcggcgg cgatacgggcgacgaacagattaaagttcgccaacgcggcgaaaccttgccaggcaccca tatagaacagcgatgacaccacatatgacaccagcagcgcagggatcatgacggcgaaga taatgcggcgcgccagcggtgcgccaaaaatacgcacggtaagatcggtggcgagaaaaa taaaaggaaaactaaacgcgccccaggtcgtgtggaaaccaaaaatagtgatggggagct gcaccagatagttactggaaatgatcaccagtaaatggaatagcgatagccagaacaacg cttttatgcgctgtgattgtgtaaacggagtcatattgtgacctttttgttggatggggt gagggaacccaataaagaaccgccgcataatactgttttgcgctggcattgcaatggcta attttcacgcaatcgttaacctgcttggcttactgacgaagatgggcgtaaaatatcgcc gtttttctaccattgagacgaagatgagcgatctgttttcctcccctgaccatacgcttg acgccctggggctgcgttgccctgaaccggtaatgatggtgcgcaaaacggtgcgcaaca tgcaaactggcgaaacattgctgattatcgccgacgatccggcgacgacgcgtgatattc cggggttctgtacctttatggaacacgatttgctggcgcaagagacggagggtctgccgt accgctatttattgcgtaaagcgcattaaccgcacggcagatagcattctgattagaagc 
tttcccagttcgcctgttctgcggcgggctgtggcgttaagcctttgcttaacggcgtcg gcgacgcctcccgacgaggttgtttctggatgcggaacgcggcgaccgcctgacgaagct cgttagcctgatcttccagcgccgccgctgctgctgcggattcctctaccagcgaggcgt tttgctgcgtcacgctgtccatttccgataccgcctgcgccacctgctcaatgcctttgc tttgctcatcggaggcagaggcgatttcgcccataatatcggtcacgcgggtcacggcgt taacgatctctttcatggtttctcccgcttcgcgtacctgcgtagagccggtatcaatgc gcgacacggagttttcaatcagcactttgatctctttggcggcctgcgcgctacggctgg ccaacgtgcgtacctctcc alien_hunter-1.7/ChangepointLeft.java0000644000265600020320000002271010437776153017116 0ustar tilleaadmin/** * run a 2 state 2nd order HMM in a change-point detection framework * to optimize the predicted boundaries using BioJava libraries. * * @author George Vernikos * * For more information on the BioJava project visit: http://www.biojava.org/ */ /* LICENSE This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ import java.io.*; import org.biojava.bio.symbol.*; import org.biojava.bio.seq.*; import org.biojava.bio.seq.io.*; import org.biojava.bio.dp.*; import org.biojava.bio.*; import org.biojava.bio.seq.db.*; import org.biojava.bio.seq.impl.*; import org.biojava.bio.dist.*; import org.biojava.utils.*; import java.util.*; class ChangepointLeft{ public static SymbolList seqL; public static int order; public static int flatOrRandom; public static int trainOrUntrain; public static Distribution dist; public static int duration; public static ModelTrainer mt; public static int transition_point=0; public static int count=0; //make alphabets static FiniteAlphabet DnaAlphabet = DNATools.getDNA(); public static void main (String args[]) throws Exception{ if(args.length != 5) { throw new Exception("Use: sequence.fa order.int flatD.bin trainableTrans.bin duration.int"); } try{ File seqFile = new File(args[0]); order = Integer.parseInt(args[1]); flatOrRandom = Integer.parseInt(args[2]); trainOrUntrain = Integer.parseInt(args[3]); duration = Integer.parseInt(args[4]); if((flatOrRandom != 0) & (flatOrRandom != 1)) { throw new Exception("Use flatD.bin: only binary i.e. 0 or 1: . . 1/0 . ."); } if((trainOrUntrain != 0) & (trainOrUntrain != 1)) { throw new Exception("Use trainableTrans.bin: only binary i.e. 0 or 1: . . . 
1/0 ."); } SymbolTokenization rParser = DnaAlphabet.getTokenization("token"); SequenceBuilderFactory sbFact = new FastaDescriptionLineParser.Factory(SimpleSequenceBuilder.FACTORY); FastaFormat fFormat = new FastaFormat(); SequenceIterator seqI = new StreamReader(new FileInputStream(seqFile), fFormat, rParser, sbFact); seqI.hasNext(); Sequence seq2 = seqI.nextSequence(); SequenceDB seqs = new HashSequenceDB(); seqL = seq2; MarkovModel island = createModel(); DP dp=DPFactory.DEFAULT.createDP(island); Sequence seq = new SimpleSequence( SymbolListViews.orderNSymbolList(seq2, order), null, seq2.getName() + "-o" + order, Annotation.EMPTY_ANNOTATION ); seqs.addSequence(seq); TrainingAlgorithm ta = new BaumWelchTrainer(dp); ta.train( seqs, 0.01, new StoppingCriteria() { public boolean isTrainingComplete(TrainingAlgorithm ta) { try { // XmlMarkovModel.writeModel(ta.getDP().getModel(), System.out); //out2.write(ta.getCycle() + "\t" + ta.getCurrentScore() + "\n"); }catch (Exception ex) {ex.printStackTrace();} //System.out.println(ta.getCycle() + "\t" + ta.getCurrentScore()); //return (ta.getCycle() >=2); return Math.abs(ta.getLastScore() - ta.getCurrentScore()) < 0.001; } } ); //Viterbi SymbolList [] rl = {SymbolListViews.orderNSymbolList(seq2, order)}; StatePath statePath = dp.viterbi(rl, ScoreType.PROBABILITY); for(int i = 0; i <= statePath.length() / 60; i++) { for(int j = i*60; j < Math.min((i+1)*60, statePath.length()); j++) { //System.out.print(statePath.symbolAt(StatePath.STATES, j+1).getName().charAt(0)); char state=statePath.symbolAt(StatePath.STATES, j+1).getName().charAt(0); count++; //it prints the states in binary mode for art user_graph if(state == 'a'){ //out.write("0 1"); } else{ transition_point=count; //out.write("1 0"); } } } System.out.print(transition_point + " " + statePath.getScore()); }catch (Exception e) { e.printStackTrace(); } } //creates the model public static MarkovModel createModel() { List l = Collections.nCopies(order, DNATools.getDNA()); 
Alphabet alpha = AlphabetManager.getCrossProductAlphabet(l); int [] advance = { 1 }; Distribution typicalD; Distribution atypicalD; try{ //check if higher order; else normal dist if(order >1){ typicalD = OrderNDistributionFactory.DEFAULT.createDistribution(alpha); atypicalD = OrderNDistributionFactory.DEFAULT.createDistribution(alpha); } else{ typicalD = DistributionFactory.DEFAULT.createDistribution(alpha); atypicalD = DistributionFactory.DEFAULT.createDistribution(alpha); } }catch (Exception e){ throw new AssertionFailure("Can't create distributions", e); } EmissionState typicalS = new SimpleEmissionState("typical", Annotation.EMPTY_ANNOTATION, advance, typicalD); EmissionState atypicalS = new SimpleEmissionState("atypical", Annotation.EMPTY_ANNOTATION, advance, atypicalD); SimpleMarkovModel island = new SimpleMarkovModel(1, alpha, "Island"); try{ island.addState(typicalS); island.addState(atypicalS); }catch (Exception e){ throw new AssertionFailure("Can't add states to model", e); } //set up transitions between states try { island.createTransition(island.magicalState(),typicalS); island.createTransition(island.magicalState(),atypicalS); island.createTransition(typicalS,island.magicalState()); island.createTransition(atypicalS,island.magicalState()); island.createTransition(typicalS,atypicalS); island.createTransition(atypicalS,typicalS); island.createTransition(typicalS,typicalS); island.createTransition(atypicalS,atypicalS); }catch (Exception e){ throw new AssertionFailure("Can't create transitions", e); } //set up emission probabilities try { SymbolList highOrderSeq = SymbolListViews.orderNSymbolList (seqL, order); Hashtable symbol= new Hashtable(); for (Iterator i = highOrderSeq.iterator(); i.hasNext(); ) { Symbol sym = (Symbol) i.next(); if(!symbol.containsKey(sym)){ //uniform weights for atypical emmision probs atypicalD.setWeight(sym,0.25); typicalD.setWeight(sym, 0.25); symbol.put(sym, new Integer(1)); } } if(flatOrRandom == 0){ //it randomizes the 
atypical emission probs DistributionTools.randomizeDistribution(atypicalD); DistributionTools.randomizeDistribution(typicalD); } }catch (Exception e) { throw new AssertionFailure("Can't set emission probabilities", e); } //set up transition scores. try { { //if user option =1 then it trains ; if 0 then untrained if(trainOrUntrain ==0){ //it keeps the transition probs untrainable dist = new UntrainableDistribution (island.transitionsFrom(island.magicalState())); } else{ dist = island.getWeights(island.magicalState()); } dist.setWeight(typicalS, 1.0); //since it will always start at start at state typicalS dist.setWeight(atypicalS, 0.0); island.setWeights(island.magicalState(), dist); } { // always trainable dist = island.getWeights(typicalS); float T_A = (float)1/duration; float T_T = (float)1-T_A; //1/region = 1/7500 dist.setWeight(atypicalS, T_A); //1-1/7500 dist.setWeight(typicalS, T_T); //zero since it will always end at atypical dist.setWeight(island.magicalState(), 0.0); island.setWeights(typicalS, dist); } { //always untrainable dist = new UntrainableDistribution (island.transitionsFrom(atypicalS)); //when it changes it persists for ever. 
dist.setWeight(typicalS, 0.0000000000000000000000000000001); dist.setWeight(atypicalS, 0.9999); //it was 0.0001 but it throwed NaNs dist.setWeight(island.magicalState(), 0.0000999999999999999999999999999); island.setWeights(atypicalS, dist); } }catch (Exception e) { throw new AssertionFailure("Can't set transition probabilities", e); } return island; } } alien_hunter-1.7/ChangepointRight.java0000644000265600020320000002271210437775673017311 0ustar tilleaadmin/** * run a 2 state 2nd order HMM in a change-point detection framework * to optimize the predicted boundaries using BioJava libraries * * @author George Vernikos * * For more information on the BioJava project visit: http://www.biojava.org/ */ /* LICENSE This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/
import java.io.*;
import org.biojava.bio.symbol.*;
import org.biojava.bio.seq.*;
import org.biojava.bio.seq.io.*;
import org.biojava.bio.dp.*;
import org.biojava.bio.*;
import org.biojava.bio.seq.db.*;
import org.biojava.bio.seq.impl.*;
import org.biojava.bio.dist.*;
import org.biojava.utils.*;
import java.util.*;

/**
 * Command-line tool: reads the first FASTA record of a file, trains a
 * two-state ("typical" / "atypical") HMM on its order-N view with Baum-Welch,
 * decodes it with Viterbi and prints
 * "&lt;last position not labelled atypical&gt; &lt;Viterbi score&gt;" on stdout.
 *
 * Arguments: sequence.fa order.int flatD.bin trainableTrans.bin duration.int
 *
 * Written without generics or try-with-resources so it still builds on the
 * old JDKs the package README targets.
 */
class ChangepointRight {

    public static SymbolList seqL;
    public static int order;
    public static int flatOrRandom;
    public static int trainOrUntrain;
    public static Distribution dist;
    public static int duration;
    public static ModelTrainer mt;          // unused here; kept because it is public
    public static int transition_point = 0;
    public static int count = 0;

    // make alphabets
    static FiniteAlphabet DnaAlphabet = DNATools.getDNA();

    /**
     * Entry point.
     *
     * A wrong argument count throws immediately. Every later failure is
     * reported with a stack trace on stderr and nothing is written to stdout
     * (the original best-effort behaviour is preserved).
     *
     * @param args sequence.fa order.int flatD.bin trainableTrans.bin duration.int
     * @throws Exception if the number of arguments is not 5
     */
    public static void main(String args[]) throws Exception {
        if (args.length != 5) {
            throw new Exception("Use: sequence.fa order.int flatD.bin trainableTrans.bin duration.int");
        }

        try {
            File seqFile = new File(args[0]);
            order = Integer.parseInt(args[1]);
            flatOrRandom = Integer.parseInt(args[2]);
            trainOrUntrain = Integer.parseInt(args[3]);
            duration = Integer.parseInt(args[4]);

            if (flatOrRandom != 0 && flatOrRandom != 1) {
                throw new Exception("Use flatD.bin: only binary i.e. 0 or 1: . . 1/0 . .");
            }
            if (trainOrUntrain != 0 && trainOrUntrain != 1) {
                throw new Exception("Use trainableTrans.bin: only binary i.e. 0 or 1: . . . 1/0 .");
            }
            // createModel() uses 1/duration as a transition probability, so a
            // zero or negative duration would give an infinite/negative weight.
            if (duration < 1) {
                throw new Exception("Use duration.int: a positive integer");
            }

            Sequence seq2 = readFirstSequence(seqFile);
            SequenceDB seqs = new HashSequenceDB();
            seqL = seq2;

            MarkovModel island = createModel();
            DP dp = DPFactory.DEFAULT.createDP(island);

            Sequence seq = new SimpleSequence(
                SymbolListViews.orderNSymbolList(seq2, order),
                null,
                seq2.getName() + "-o" + order,
                Annotation.EMPTY_ANNOTATION
            );
            seqs.addSequence(seq);

            TrainingAlgorithm ta = new BaumWelchTrainer(dp);
            ta.train(
                seqs,
                0.01,
                new StoppingCriteria() {
                    // stop once the score changes by less than 0.001 between cycles
                    public boolean isTrainingComplete(TrainingAlgorithm ta) {
                        return Math.abs(ta.getLastScore() - ta.getCurrentScore()) < 0.001;
                    }
                }
            );

            // Viterbi
            SymbolList[] rl = { SymbolListViews.orderNSymbolList(seq2, order) };
            StatePath statePath = dp.viterbi(rl, ScoreType.PROBABILITY);

            // Remember the last (1-based) position whose state name does not
            // start with 'a', i.e. the last position not labelled "atypical".
            for (int pos = 1; pos <= statePath.length(); pos++) {
                char state = statePath.symbolAt(StatePath.STATES, pos).getName().charAt(0);
                count++;
                if (state != 'a') {
                    transition_point = count;
                }
            }

            System.out.print(transition_point + " " + statePath.getScore());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the first FASTA record of seqFile and always closes the file.
     *
     * @throws Exception if the file cannot be read or holds no sequence
     */
    private static Sequence readFirstSequence(File seqFile) throws Exception {
        SymbolTokenization rParser = DnaAlphabet.getTokenization("token");
        SequenceBuilderFactory sbFact =
            new FastaDescriptionLineParser.Factory(SimpleSequenceBuilder.FACTORY);
        FastaFormat fFormat = new FastaFormat();

        InputStream in = new FileInputStream(seqFile);
        try {
            SequenceIterator seqI = new StreamReader(in, fFormat, rParser, sbFact);
            if (!seqI.hasNext()) {
                throw new Exception("No sequence found in " + seqFile);
            }
            return seqI.nextSequence();
        } finally {
            in.close();
        }
    }

    /**
     * Creates the two-state model from the static settings (order,
     * flatOrRandom, trainOrUntrain, duration) and the sequence in seqL.
     *
     * @return the configured model
     * @throws AssertionFailure if any part of the model cannot be built
     */
    public static MarkovModel createModel() {
        List l = Collections.nCopies(order, DNATools.getDNA());
        Alphabet alpha = AlphabetManager.getCrossProductAlphabet(l);
        int[] advance = { 1 };

        Distribution typicalD;
        Distribution atypicalD;
        try {
            // check if higher order; else normal dist
            if (order > 1) {
                typicalD = OrderNDistributionFactory.DEFAULT.createDistribution(alpha);
                atypicalD = OrderNDistributionFactory.DEFAULT.createDistribution(alpha);
            } else {
                typicalD = DistributionFactory.DEFAULT.createDistribution(alpha);
                atypicalD = DistributionFactory.DEFAULT.createDistribution(alpha);
            }
        } catch (Exception e) {
            throw new AssertionFailure("Can't create distributions", e);
        }

        EmissionState typicalS = new SimpleEmissionState(
            "typical", Annotation.EMPTY_ANNOTATION, advance, typicalD);
        EmissionState atypicalS = new SimpleEmissionState(
            "atypical", Annotation.EMPTY_ANNOTATION, advance, atypicalD);

        SimpleMarkovModel island = new SimpleMarkovModel(1, alpha, "Island");
        try {
            island.addState(typicalS);
            island.addState(atypicalS);
        } catch (Exception e) {
            throw new AssertionFailure("Can't add states to model", e);
        }

        // set up transitions between states
        try {
            island.createTransition(island.magicalState(), typicalS);
            island.createTransition(island.magicalState(), atypicalS);
            island.createTransition(typicalS, island.magicalState());
            island.createTransition(atypicalS, island.magicalState());
            island.createTransition(typicalS, atypicalS);
            island.createTransition(atypicalS, typicalS);
            island.createTransition(typicalS, typicalS);
            island.createTransition(atypicalS, atypicalS);
        } catch (Exception e) {
            throw new AssertionFailure("Can't create transitions", e);
        }

        // set up emission probabilities
        try {
            SymbolList highOrderSeq = SymbolListViews.orderNSymbolList(seqL, order);
            Set seen = new HashSet();
            for (Iterator i = highOrderSeq.iterator(); i.hasNext(); ) {
                Symbol sym = (Symbol) i.next();
                // Set.add() is false for a symbol that was already weighted
                if (seen.add(sym)) {
                    // same starting weight for every observed symbol, both states
                    atypicalD.setWeight(sym, 0.25);
                    typicalD.setWeight(sym, 0.25);
                }
            }
            if (flatOrRandom == 0) {
                // randomize the emission probabilities of both states
                DistributionTools.randomizeDistribution(atypicalD);
                DistributionTools.randomizeDistribution(typicalD);
            }
        } catch (Exception e) {
            throw new AssertionFailure("Can't set emission probabilities", e);
        }

        // set up transition scores
        try {
            {
                // start transitions: trained if the user option is 1, fixed if 0
                if (trainOrUntrain == 0) {
                    // keep the transition probabilities untrainable
                    dist = new UntrainableDistribution(
                        island.transitionsFrom(island.magicalState()));
                } else {
                    dist = island.getWeights(island.magicalState());
                }
                // the path always starts in state typicalS
                dist.setWeight(typicalS, 1.0);
                dist.setWeight(atypicalS, 0.0);
                island.setWeights(island.magicalState(), dist);
            }
            {
                // always trainable
                dist = island.getWeights(typicalS);
                float T_A = (float) 1 / duration;
                float T_T = (float) 1 - T_A;
                // 1/region, e.g. 1/7500
                dist.setWeight(atypicalS, T_A);
                // 1 - 1/region
                dist.setWeight(typicalS, T_T);
                // zero since the path always ends in atypical
                dist.setWeight(island.magicalState(), 0.0);
                island.setWeights(typicalS, dist);
            }
            {
                // always untrainable
                dist = new UntrainableDistribution(island.transitionsFrom(atypicalS));
                // once the state changes it persists for ever
                dist.setWeight(typicalS, 0.0000000000000000000000000000001);
                dist.setWeight(atypicalS, 0.9999);
                // it was 0.0001 but it threw NaNs
                dist.setWeight(island.magicalState(), 0.0000999999999999999999999999999);
                island.setWeights(atypicalS, dist);
            }
        } catch (Exception e) {
            throw new AssertionFailure("Can't set transition probabilities", e);
        }

        return island;
    }
}
alien_hunter-1.7/PAI_scripts/0000755000265600020320000000000011265301730015340 5ustar tilleaadminalien_hunter-1.7/PAI_scripts/CutOff.pm0000644000265600020320000001301311022604371017060 0ustar tilleaadmin

=head1 NAME

PAI_scripts::CutOff

=head1 SYNOPSIS

determines dynamically a genome-specific score threshold
using k-means clustering (k=3)

=head1 AUTHOR

George Vernikos

=head1 LICENSE

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
=pod

=cut

package PAI_scripts::CutOff;

use strict;
use warnings;
use Exporter;

our @ISA    = ("Exporter");
our @EXPORT = qw(&Cutoff);

# Cutoff(\%scores)
#
# Derives a score threshold T from a hash of { id => score }.
#
#   * fewer than 2 scores : T = 0, scores returned unscaled
#   * 2 .. 299 scores     : T = mean + 0.5 * SD of the 0-100 scaled scores
#   * 300 or more scores  : T comes from a 3-centroid k-means run on the
#                           scaled, exponentially smoothed scores
#
# Returns the list ($T, \%scaled), where %scaled maps every id to its score
# rescaled to 0-100 (3 decimals).  The caller's hash is never modified and
# the returned hash is private to the call, so repeated calls cannot leak
# state into each other.  Progress messages are printed to STDOUT.
sub Cutoff {
    my ($ScoresRef) = @_;

    # work on a private copy of the caller's data
    my %ALLscores = $ScoresRef ? %$ScoresRef : ();
    my $NumKeys   = keys %ALLscores;

    if ($NumKeys < 2) {
        print "\n not enough data ($NumKeys) to determine threshold; T=0\n";
        return (0, \%ALLscores);
    }

    # ids sorted by their raw score (min->max)
    my @keys = sort { $ALLscores{$a} <=> $ALLscores{$b} } keys %ALLscores;

    print "\n scaling 0-100\n";

    # shift so that the minimum becomes zero
    my $min = $ALLscores{ $keys[0] };
    for my $item (@keys) {
        $ALLscores{$item} = $ALLscores{$item} - $min;
    }

    # scale to the maximum: Sx' = (Sx*100)/Smax.
    # Smax is 0 when all scores are equal; every scaled score is then 0
    # instead of dying on a division by zero.
    my $max = $ALLscores{ $keys[-1] };
    for my $item (@keys) {
        my $scaled = $max != 0 ? ($ALLscores{$item} * 100) / $max : 0;
        $ALLscores{$item} = sprintf("%.3f", $scaled);
    }

    # Exponential smoothing, applied from the highest score downwards
    print "\n Exponential Smoothing (Damping factor = 0.5)\n\n";
    my @smoothed = sort { $b <=> $a } values %ALLscores;
    for my $i (1 .. $#smoothed) {
        $smoothed[$i] = 0.5 * $smoothed[$i] + 0.5 * $smoothed[ $i - 1 ];
    }
    @smoothed = sort { $a <=> $b } @smoothed;

    my $cutoff_best =
        $NumKeys >= 300
        ? _kmeans_cutoff(\@smoothed)
        : _mean_sd_cutoff(\%ALLscores);

    return ($cutoff_best, \%ALLscores);
}

# Mean of $x->[$from .. $to]; 0 for an empty range.
sub _range_mean {
    my ($x, $from, $to) = @_;
    return 0 if $to < $from;
    my $sum = 0;
    $sum += $x->[$_] for $from .. $to;
    return $sum / ($to - $from + 1);
}

# Threshold for small samples: mean + 0.5 * sample SD of the scaled scores.
sub _mean_sd_cutoff {
    my ($scaled) = @_;
    my @values = values %$scaled;
    my $count  = @values;          # the caller guarantees at least 2

    my $sum = 0;
    $sum += $_ for @values;
    my $average = $sum / $count;

    my $sumSqr = 0;
    $sumSqr += ($_ - $average) ** 2 for @values;
    my $STANDEV = 0.5 * sqrt($sumSqr / ($count - 1));

    my $cutoff_best = sprintf("%.3f", $average + $STANDEV);
    print "\n too little data to determine dynamically T;\n\n T=$cutoff_best(=average+0.5SD)\n";
    return $cutoff_best;
}

# Threshold for large samples: 1-D k-means with three centroids (A<B<C at
# start) over the ascending scores in @$x, restarted from a grid of initial
# centroids.  The restart with the highest objective value wins; its cutoff
# is the first score after cluster A.
sub _kmeans_cutoff {
    my ($x) = @_;
    my $last     = $#{$x};
    my $MAX_ITER = 1000;            # safety net against a non-converging restart

    # Index of the last point closer to A than to B / to B than to C.  They
    # keep their value when no point qualifies, as in the original code.
    my ($trans_a, $trans_b) = (0, 0);
    my ($Func_prev, $Func_max, $cutoff_best) = (0, 0, 0);

    print " K-means Clustering:\n\nFunc_max\tCutoff\tCntrA\t\tCntrB\t\tCntrC\n";

    # grid of initial centroids: spacing 10..40, first centroid 0,10,20,...
    for (my $gap = 10; $gap <= 40; $gap += 10) {
        for (my $start = 0; $start <= 100 - $gap * 2; $start += 10) {
            my @centre = ($start, $start + $gap, $start + $gap * 2);
            my ($Func, $cutoff, @mean);

            for my $iter (1 .. $MAX_ITER) {
                # objective function sum_j(sum_i(|Xi-Cj|^2)) and transitions
                $Func = 0;
                for my $i (0 .. $last) {
                    my @d = map { ($x->[$i] - $_) * ($x->[$i] - $_) } @centre;
                    $Func += $d[0] + $d[1] + $d[2];
                    $trans_a = $i if $d[0] <= $d[1];
                    $trans_b = $i if $d[1] <= $d[2];
                }

                # score where cluster A ends; 0 when A holds every point
                # (the original read past the end of the list here)
                $cutoff = $trans_a < $last ? $x->[ $trans_a + 1 ] : 0;

                @mean = (
                    _range_mean($x, 0,            $trans_a),
                    _range_mean($x, $trans_a + 1, $trans_b),
                    _range_mean($x, $trans_b + 1, $last),
                );

                # converged once the objective moves by no more than 0.1
                last unless abs($Func - $Func_prev) > 0.1;

                # otherwise re-iterate with the cluster means as centroids
                $Func_prev = $Func;
                @centre    = @mean;
            }

            # keep the restart with the highest objective function
            if ($Func > $Func_max) {
                $Func_max    = $Func;
                $cutoff_best = $cutoff;
                printf "%.3f\t%.3f\t%.3f\t\t%.3f\t\t%.3f\n",
                    $Func_max, $cutoff_best, @mean;
            }
        }
    }

    return sprintf("%.3f", $cutoff_best);
}

1;

=pod

alien_hunter-1.7/PAI_scripts/MotifMaker.pm0000644000265600020320000002043511022245022017730 0ustar tilleaadmin

=head1 NAME

PAI_scripts::MotifMaker

=head1 SYNOPSIS

scans for motifs and builds the IVOM vectors

=head1 AUTHOR

George Vernikos

=head1 LICENSE

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
=cut

package PAI_scripts::MotifMaker;

use strict;
use warnings;

use Exporter;
our @ISA    = ("Exporter");
our @EXPORT = qw (&MMaker &scan &IvomBuild);

# Base order used when enumerating motifs.
our %pos_base = (1 => "A", 2 => "T", 3 => "C", 4 => "G");

# Complement of a single base (upper and lower case).
our %compl = (
    A => "T", T => "A", C => "G", G => "C",
    a => "t", t => "a", c => "g", g => "c",
);

# State shared between scan() and IvomBuild(): scan() records the length of
# the scanned sequence and the number of skipped (ambiguous) 8-mers, and
# IvomBuild() uses both to turn counts into frequencies.
our $amb             = 0;
our $len             = 0;
our $count8merswithN = 0;

# %nKmers: motif => count (both strands), for K = 1..8.
our (%n1mers, %n2mers, %n3mers, %n4mers, %n5mers, %n6mers, %n7mers, %n8mers);

# %ConK: motif => reverse complement, for K = 2..8 (%compl serves K = 1).
our (%Con2, %Con3, %Con4, %Con5, %Con6, %Con7, %Con8);

# IVOM vector produced by the most recent IvomBuild() call.
our %IVOM8;

# Index the per-order tables by k so the subs can loop instead of repeating
# the same statements eight times.
my @COUNT = (
    undef,    \%n1mers, \%n2mers, \%n3mers, \%n4mers,
    \%n5mers, \%n6mers, \%n7mers, \%n8mers,
);
my @REVCOMP = (
    undef,  \%compl, \%Con2, \%Con3, \%Con4,
    \%Con5, \%Con6,  \%Con7, \%Con8,
);

# MMaker()
#
# Creates every motif of order k = 1..8 with a zero count and, for k >= 2,
# records its reverse complement. Must run before scan() / IvomBuild().
# Returns the empty list.
sub MMaker {
    print "\n building motif vectors (order k <=7) ...\n\n";

    my @bases  = map { $pos_base{$_} } 1 .. 4;
    my @motifs = ('');

    for my $k (1 .. 8) {
        # Extend every (k-1)-mer by each base to obtain all k-mers.
        @motifs = map { my $prefix = $_; map { $prefix . $_ } @bases } @motifs;

        for my $motif (@motifs) {
            $COUNT[$k]{$motif} = 0;
            next if $k == 1;
            $REVCOMP[$k]{$motif} =
                join '', map { $compl{$_} } reverse split //, $motif;
        }
    }

    return ();
}

# Adds one occurrence of $motif (order $k) on both strands.
sub _count_motif {
    my ($k, $motif) = @_;
    $COUNT[$k]{$motif}++;
    $COUNT[$k]{ $REVCOMP[$k]{$motif} }++;
    return;
}

# scan($seq, $ch)
#
# Counts all motifs of order 1..8 in $seq on both strands.
#
#   $seq : nucleotide sequence (case-insensitive)
#   $ch  : 0 = genome scan: an 8-mer containing a non-ACGT character is
#              skipped and counted;
#          otherwise = sliding window: the first such 8-mer aborts the scan
#
# Returns ($amb, $count8merswithN): whether an ambiguous 8-mer was met, and
# how many were skipped.
sub scan {
    my ($seq, $ch) = @_;

    $len             = length($seq);
    $amb             = 0;
    $count8merswithN = 0;

    # Empty the hashes - ready for the next window.
    flushHash();

    my $window_mode = defined $ch && $ch != 0;

    for my $i (0 .. $len - 8) {
        my $motif = uc substr($seq, $i, 8);

        # Ambiguous bases check.
        if (!exists $n8mers{$motif}) {
            $amb = 1;
            return ($amb, $count8merswithN) if $window_mode;
            $count8merswithN++;
            next;
        }

        # Count the 8-mer and every shorter motif that ends at its last base.
        _count_motif($_, substr($motif, 8 - $_, $_)) for 1 .. 8;
    }

    # The loop above only sees motifs ending at position 8 or later, so the
    # shorter motifs lying entirely inside the first 7 bases are added here.
    my $head   = uc substr($seq, 0, 8);
    my $first7 = substr($head, 0, 7);
    return ($amb, $count8merswithN) unless exists $n7mers{$first7};

    for my $k (1 .. 7) {
        _count_motif($k, substr($head, $_, $k)) for 0 .. 7 - $k;
    }

    return ($amb, $count8merswithN);
}

# Resets every motif count to zero (the motif keys themselves are kept).
sub flushHash {
    for my $counts (@COUNT[ 1 .. 8 ]) {
        $_ = 0 for values %$counts;
    }
    return ();
}

# IvomBuild()
#
# Builds the IVOM vector from the counts left by the last scan(). For each
# 8-mer, and each order k of its suffix:
#   Wk    = count * 4^k, normalised by the sum over all k
#   Pk    = count / (2 * (len - (k-1) - count8merswithN))
#   IVOMk = Wk*Pk + (1 - Wk)*IVOM(k-1)
# The resulting IVOM8 values are scaled to sum to 1.
#
# Returns a reference to the package hash %IVOM8. The same hash is
# overwritten by every call, so copy it if an earlier result is still needed.
sub IvomBuild {
    my $sum = 0;

    for my $motif (keys %n8mers) {
        my (@weight, @freq);
        my $w_tot = 0;

        for my $k (1 .. 8) {
            my $count = $COUNT[$k]{ substr($motif, 8 - $k, $k) };
            my $denom = ($len - ($k - 1) - $count8merswithN) * 2;

            $weight[$k] = $count * 4**$k;
            $freq[$k]   = $denom > 0 ? $count / $denom : 0;
            $w_tot += $weight[$k];
        }

        # A motif whose last base never occurs has a total weight of 0; its
        # IVOM value is then 0, where this used to divide by zero.
        my $ivom = 0;
        for my $k (1 .. 8) {
            my $w = $w_tot ? $weight[$k] / $w_tot : 0;
            $ivom = $w * $freq[$k] + (1 - $w) * $ivom;
        }

        $IVOM8{$motif} = $ivom;
        $sum += $ivom;
    }

    # Scale from 0 to 1 (needed for the relative entropy).
    if ($sum) {
        $_ /= $sum for values %IVOM8;
    }

    return (\%IVOM8);
}

1;
alien_hunter-1.7/PAI_scripts/changePoint.pm0000644000265600020320000001021311022245022020122 0ustar tilleaadmin
=head1 NAME

PAI_scripts::changePoint

=head1 SYNOPSIS

runs a 2 state 2nd order HMM on the boundaries of each prediction

=head1 AUTHOR

George Vernikos

=head1 LICENSE

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

=cut

package PAI_scripts::changePoint;

use strict;
use warnings;

use Exporter;
our @ISA    = ("Exporter");
our @EXPORT = qw (&changepoint &readGenomeSeq);

# Genome sequence and its length; filled by readGenomeSeq() and read by
# changepoint().
our $GenSeq = '';
our $Genlen = 0;

# readGenomeSeq($SeqFile)
#
# Loads the sequence from a FASTA-style file into $GenSeq; every line
# containing '>' is treated as a header and skipped. Any previously loaded
# sequence is replaced. Dies if the file cannot be opened.
#
# Returns the length of the loaded sequence.
sub readGenomeSeq {
    my ($SeqFile) = @_;

    $GenSeq = '';

    open(my $fh, '<', $SeqFile) or die "dead";
    while (my $line = <$fh>) {
        chomp $line;
        next if $line =~ m#>#;
        $GenSeq .= $line;
    }
    close $fh;

    $Genlen = length($GenSeq);
    return ($Genlen);
}

# Settings for a prediction of the given length, as the list
# ($from, $to, $step, $h_s, $h_e):
#   $from, $to, $step : range of values passed as the last argument to the
#                       Java HMM programs
#   $h_s              : bases taken outside the prediction
#   $h_e              : bases taken inside the prediction
# Returns the empty list for a length with no defined settings.
sub _hmm_settings {
    my ($len) = @_;

    return (2000, 6000, 2000, 5000, 2000) if $len == 5000;
    return (2000, 6000, 2000, 5000, 3000) if $len == 7500;
    return (2000, 6000, 2000, 5000, 4000) if $len == 10000;
    return (2000, 8000, 2000, 5000, 5000) if $len == 12500;
    return (2000, 8000, 2000, 5000, 6000) if $len > 12500;
    return;
}

# Writes one sequence chunk as a single-record FASTA file.
sub _write_fasta {
    my ($path, $chunk) = @_;

    open(my $fh, '>', $path) or die "dead";
    print {$fh} ">seq\n$chunk";
    close $fh;
    return;
}

# Returns the last line the given process printed (undef if none) and closes
# the handle.
sub _last_line {
    my ($fh) = @_;

    my $last;
    while (defined(my $line = <$fh>)) {
        $last = $line;
    }
    close $fh;
    return $last;
}

# Splits an HMM output line "<transition point> <path score>" into
# ($location, abs($score)); returns the empty list if the line is missing or
# incomplete.
sub _parse_hmm_output {
    my ($line) = @_;

    return unless defined $line;
    my ($location, $score) = split / /, $line;
    return unless defined $location && defined $score;
    return ($location, abs($score));
}

# changepoint($left, $right, $tabfile)
#
# Refines the boundaries of one prediction by running the Java
# ChangepointLeft / ChangepointRight programs on a chunk of sequence around
# each boundary and keeping, per boundary, the transition point of the path
# with the lowest absolute score.
#
#   $left, $right : current boundaries of the prediction
#   $tabfile      : prefix for the temporary files <prefix>.hmmL / .hmmR
#
# Returns ($new_left, $new_right). The input boundaries are returned
# unchanged when the prediction is too close to a genome end, when its
# length has no defined settings, or when the HMM produced no usable output.
sub changepoint {
    my ($left, $right, $tabfile) = @_;

    my ($new_left, $new_right) = ($left, $right);
    my @settings = _hmm_settings($right - $left);

    if (@settings) {
        my ($from, $to, $step, $h_s, $h_e) = @settings;
        my $size = $h_s + $h_e;

        # The left chunk starts $h_s bases before $left and the right chunk
        # ends $h_s bases after $right; both must lie inside the genome.
        if ($left >= $h_s && $right + $h_s <= $Genlen) {
            _write_fasta("$tabfile.hmmR", substr($GenSeq, $right - $h_e, $size));
            _write_fasta("$tabfile.hmmL", substr($GenSeq, $left - $h_s,  $size));

            my ($best_L, $transPos_L, $best_R, $transPos_R);

            for (my $i = $from; $i <= $to; $i += $step) {
                print $i . "bp\n";

                # List-form pipe opens: arguments reach java directly and are
                # never interpreted by a shell. Both processes are started
                # before either is read, so they run side by side.
                open(my $left_fh, '-|', 'java', 'ChangepointLeft',
                    "$tabfile.hmmL", 3, 1, 0, $i)
                    or die "Couldn't fork: $!\n";
                print "running HMM on left boundary..\n";

                open(my $right_fh, '-|', 'java', 'ChangepointRight',
                    "$tabfile.hmmR", 3, 1, 0, $i)
                    or die "Couldn't fork: $!\n";
                print "running HMM on right boundary..\n";

                my $outLeft  = _last_line($left_fh);
                my $outRight = _last_line($right_fh);
                print "\n";

                # Keep the best path per boundary; scores are compared as
                # numbers (a string comparison misorders e.g. 1000 and 999).
                if (my ($location, $score) = _parse_hmm_output($outLeft)) {
                    if (!defined $best_L || $score < $best_L) {
                        ($best_L, $transPos_L) = ($score, $location);
                    }
                }
                if (my ($location, $score) = _parse_hmm_output($outRight)) {
                    if (!defined $best_R || $score < $best_R) {
                        ($best_R, $transPos_R) = ($score, $location);
                    }
                }
            }

            $new_right = $right - $h_e + $transPos_R if defined $transPos_R;
            $new_left  = $left - $h_s + $transPos_L  if defined $transPos_L;
        }
    }

    # Deletes the temp files.
    unlink "$tabfile.hmmL", "$tabfile.hmmR";

    return ($new_left, $new_right);
}

1;
alien_hunter-1.7/PAI_scripts/help.pm0000644000265600020320000000367011022573375016643 0ustar tilleaadmin
=head1 NAME

PAI_scripts::help

=head1 SYNOPSIS

provides help info to the user

=head1 AUTHOR

George Vernikos

=head1 LICENSE

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

=cut

package PAI_scripts::help;

use strict;
use warnings;

use Exporter;
our @ISA    = ("Exporter");
our @EXPORT = qw (&help);

# help()
#
# Prints the usage summary of alien_hunter to the currently selected output
# handle. Returns the result of print (true on success).
#
# NOTE(review): the "arguments: [] []" placeholders in the text are empty -
# presumably the argument names were lost at some point; confirm against the
# released script before changing the text.
sub help {
    my $usage = "\n----------------------------\n\talien_hunter [Release 1.7] \n\n\tINPUT: raw genomic sequence PREDICTION: HGT regions based on Interpolated Variable Order Motifs (IVOMs) arguments: [] [] (optional): [-a] to load the prediction file into Artemis [-c] optimize predicted boundaries with a change-point detection 2 state 2nd order HMM output: predictions (tab file) in embl format optimized (HMM) predictions (tab file) in embl format predictions in Artemis User Plot format to be loaded manually using Graph -> Add User Plot... optimized (HMM) predictions in Artemis User Plot format to be loaded manually using Graph -> Add User Plot... the scores over all the sliding windows - for score distribution check Note: Predictions that overlap with rRNA operon are mentioned in the note qualifier ------------------------\n";

    print $usage;
}

# A module must end with a true value; do not rely on the last assignment.
1;
alien_hunter-1.7/PAI_scripts/overlap.pm0000644000265600020320000000561411022310363017345 0ustar tilleaadmin
=head1 NAME

PAI_scripts::overlap

=head1 SYNOPSIS

merges predictions

=head1 AUTHOR

George Vernikos

=head1 LICENSE

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

=cut

package PAI_scripts::overlap;

use strict;
use warnings;

use Exporter;
our @ISA    = ("Exporter");
our @EXPORT = qw (&overlap);

# overlap($cutoff, \%scores)
#
# Merges consecutive overlapping windows that score at or above the cutoff.
#
#   $cutoff  : score threshold; 0 means "do not merge"
#   \%scores : hash ref, "from..to" => score
#
# Returns a reference to a fresh hash:
#   - cutoff == 0 : a copy of the input scores, unchanged
#   - otherwise   : "from..to" => "%.3f" average score, one entry per run of
#                   windows in which each window starts at or before the end
#                   of the previous one. "from" comes from the first window
#                   of the run and "to" from the last. The hash is empty if
#                   no window reaches the cutoff.
sub overlap {
    my ($cutoff, $scoresRef) = @_;

    $cutoff = 0 unless defined $cutoff;
    my %scores = map { $_ => "$scoresRef->{$_}" } keys %{ $scoresRef || {} };

    # If cutoff = 0 don't join at all - return them as they are.
    return (\%scores) if $cutoff == 0;

    # Windows at or above the cutoff, ordered by start coordinate.
    my %start_of = map { $_ => (split /\.\./, $_)[0] } keys %scores;
    my @aboveCut =
        grep { $scores{$_} >= $cutoff }
        sort { $start_of{$a} <=> $start_of{$b} } keys %scores;

    my %joinedScores;
    my ($run_from, $run_to, $run_sum, $run_count);

    # Stores the run collected so far, if there is one.
    my $store_run = sub {
        return unless $run_count;
        $joinedScores{"$run_from..$run_to"} =
            sprintf("%.3f", $run_sum / $run_count);
        return;
    };

    for my $window (@aboveCut) {
        my ($from, $to) = split /\.\./, $window;

        if ($run_count && $run_to >= $from) {
            # Overlaps the previous window: extend the current run.
            $run_to = $to;
            $run_sum += $scores{$window};
            $run_count++;
        }
        else {
            # Gap (or very first window): close the old run, open a new one.
            $store_run->();
            ($run_from, $run_to) = ($from, $to);
            $run_sum   = $scores{$window};
            $run_count = 1;
        }
    }
    $store_run->();

    return (\%joinedScores);
}

1;
alien_hunter-1.7/PAI_scripts/rrna.pm0000644000265600020320000000465711022245023016645 0ustar tilleaadmin
=head1 NAME

PAI_scripts::rrna

=head1 SYNOPSIS

Annotates alien_hunter predictions that overlap with a 16s 102bp rRNA motif;
This motif comes from a consensus of 16s rRNA alignment from:

 Ecoli 223771:225312 forward
 Mycobacterium 1341144:1342692 forward
 Neisseria 198339:199883 reverse
 Pseudomonas 722096:723631 forward
 Rhizobium 2750005:2751465 reverse
Ricketsia 772263:773769 reverse Salmonella 287479:289020 forward Staph 514251:515805 forward Streptococcus 17043:18549 forward Thermotoga 188968:190526 forward Vibrio 53823:55357 forward Bacillus 9809:11361 forward Chlamydia 856874:858423 forward Clostridium 8715:10223 reverse Corynebacterium 76643:78166 forward Campylobacter 39249:40761 forward Mycoplasma 170007:171525 forward =head1 AUTHOR George Vernikos =head1 LICENSE This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. =cut package PAI_scripts::rrna; use Exporter; @ISA = ("Exporter"); @EXPORT = qw (&rrna &overlapRNA); sub rrna{ $GenSeq=$_[0]; $GenSeq=uc($GenSeq); #forward strand: $_=$GenSeq; while(/(ACA..GG.ACTGAGA.AC.G.CC..ACTC.{0,1}TACGGGAGGC.GCAGT.G.GAAT.TT...CAATG...G.AA...TGA...AGC.A..CCG.GTG...GA.GA)/g){ $to=pos($_); $len=length($1); $from=$to-$len+1; $loc{$from}=$to; print " possible rRNA signature: $from..$to\n"; } #reverse strand: $_=$GenSeq; while(/(TC.TC...CAC.CGG..T.GCT...TCA...TT.C...CATTG...AA.ATTC.C.ACTGC.GCCTCCCGTA.{0,1}GAGT..GG.C.GT.TCTCAGT.CC..TGT)/g){ $to=pos($_); $len=length($1); $from=$to-$len+1; $loc{$from}=$to; print " possible rRNA signature: $from..$to\n"; } return(); } 1; sub overlapRNA{ $from=$_[0]; $to=$_[1]; $check=0; foreach $k (keys %loc){ if(($k>=$from) & ($k<$to)){ $check=1; } elsif(($loc{k}>$from) & ($loc{k}<=$to)){ $check=1; } } return($check); } 1;