pax_global_header00006660000000000000000000000064134273413300014512gustar00rootroot0000000000000052 comment=3d714fd8cb7e1a1295b4fc05475304a7240dbb40 bioperl-run-release-1-7-3/000077500000000000000000000000001342734133000153745ustar00rootroot00000000000000bioperl-run-release-1-7-3/.gitignore000066400000000000000000000002431342734133000173630ustar00rootroot00000000000000*~ .tmp *# .#* .*.swp *(Autosaved)blib* Build Build.bat _build* pm_to_blib* *.tar.gz .lwpcookies cover_db pod2htm*.tmp .emacs.* blib* *.bak MYMETA.yml MYMETA.json bioperl-run-release-1-7-3/.travis.yml000066400000000000000000000023511342734133000175060ustar00rootroot00000000000000language: perl perl: - "5.24" #- "5.20" - "5.18" #- "5.16" #- "5.14" sudo: false env: PERL_CPANM_OPT="--notest --force --skip-satisfied" addons: apt: packages: - ncbi-blast+ - ncbi-blast+-legacy - clustalw - bwa - hmmer - probcons #- emboss #- wise #- samtools #- muscle #- bedtools #- mafft install: #These are recommended or required Perl libraries - "cpanm Bio::Perl 2>&1 | tail -n 1" - "cpanm Bio::FeatureIO 2>&1 | tail -n 1" - "cpanm IPC::Run XML::Twig 2>&1 | tail -n 1" - "cpanm Config::Any 2>&1 | tail -n 1" - "cpanm File::Sort 2>&1 | tail -n 1" #- "SAMTOOLS=( dirname `which samtools` ) cpanm Bio::DB::Sam 2>&1 | tail -n 1" # Install non-apt dependencies before_install: - bash ./packages/install-samtools.sh - export PATH=$PATH:$HOME/local/bin/ script: - "perl ./Build.PL --accept" - "./Build test" #after_success: # - ./travis_scripts/trigger-dockerhub.sh #TODO - send emails to bioperl-guts-l notifications: email: recipients: #- bioperl-guts-l@lists.open-bio.org - cjfields1@gmail.com on_success: change on_failure: change # whitelist branches branches: only: - master bioperl-run-release-1-7-3/AUTHORS000066400000000000000000000022221342734133000164420ustar00rootroot00000000000000=head1 CONTRIBUTORS TO BIOPERL-RUN =over =item * Sendu Bala =item * Jer-Ming Chia =item * Rob Edwards =item * Mauricio Herrera Cuadra =item * Shawn Hoon =item * Donald Jackson =item * Keith James =item * Ratnapu Kiran Kumar =item * Balamurugan Kumarasamy =item * Catherine Letondal =item * Heikki Lehvaslaiho =item * Stephen Montgomery =item * Brian Osborne =item * Tania Oh =item * Peter Schattner =item * Martin Senger =item * Marc Sohrmann =item * Jason Stajich =item * Elia Stupka =item * David Vilanova =item * Albert Vilella =item * Tiequan Zhang =item * Juguang Xiao =back bioperl-run-release-1-7-3/Build.PL000077500000000000000000000116051342734133000166760ustar00rootroot00000000000000#!/usr/bin/perl # This is a Module::Build script for BioPerl-Run installation. # See http://search.cpan.org/~kwilliams/Module-Build/lib/Module/Build.pm use strict; use warnings; use Module::Build; my $build = Module::Build->subclass( code => q( # add dist version to META files sub get_metadata { my ($self, %args) = @_; my $metadata = $self->SUPER::get_metadata(%args); if (exists $metadata->{provides}) { my $ver = $self->dist_version; my $pkgs = $metadata->{provides}; for my $p (keys %{$pkgs}) { if (!exists($pkgs->{$p}->{'version'})) { $pkgs->{$p}->{'version'} = $ver; } else { $self->log_warn("Note: Module $p has a set version: ".$pkgs->{$p}->{'version'}."\n") if $pkgs->{$p}->{'version'} ne $ver; } } } return $metadata; } ) )->new( dist_name => 'BioPerl-Run', dist_version => '1.007003', module_name => 'Bio::Run', dist_author => 'BioPerl Team ', dist_abstract => 'BioPerl-Run - wrapper toolkit', license => 'perl', config_requires => { 'Module::Build' => 0, }, build_requires => { 'Bio::Root::Version' => '1.007000', 'Bio::Root::Test' => 0, }, requires => { 'perl' => '5.6.1', 'Bio::Root::Version' => '1.007000', 'Bio::Root::Root' => 0, }, recommends => { 'Bio::Cluster::SequenceFamily' => 0, # Bio::Tools::Run::TribeMCL 'Algorithm::Diff' => 0, # generating consensus protein family descriptions: Bio::Tools::Run::TribeMCL 'IPC::Run' => 0, # Glimmer and Genemark application wrappers: Bio::Tools::Run::Glimmer Bio::Tools::Run::Genemark 'IO::String' => 0, # generating Bio::Tree::Tree from strings: Bio::Tools::Run::Phylo::Phylip::Consense 'XML::Twig' => 0, # processing XML data: Bio::Tools::Run::EMBOSSacd 'File::Sort' => 0, # BEDTools 'Config::Any' => 0, # MCS, Match 'Bio::FeatureIO' => 0, # MCS, Match, Phastcons #'SOAP::Lite' => 0.716, # A bug that affects SoapEU-unit.t tests was fixed in this version (many levels deep object throws error) }, get_options => { accept => { }, network => { }, install_scripts => { } }, auto_features => { Network => { description => "Enable tests that need an internet connection", requires => { 'LWP::UserAgent' => 0 } } }, dynamic_config => 1, #create_makefile_pl => 'passthrough' ); my $accept = $build->args->{accept}; # Optionally have script files installed. if ($build->args('install_scripts') or $accept ? 0 : $build->y_n("Install scripts? y/n", 'n')) { my $files = $build->_find_file_by_type('PLS', 'scripts'); my $script_build = File::Spec->catdir($build->blib, 'script'); my @tobp; while (my ($file, $dest) = each %$files) { $dest = 'bp_'.File::Basename::basename($dest); $dest =~ s/PLS$/pl/; $dest = File::Spec->catfile($script_build, $dest); $build->copy_if_modified(from => $file, to => $dest); push @tobp, $dest; } $build->script_files(\@tobp); } # Do network tests? my $do_network_tests = 0; if ($build->args('network')) { $do_network_tests = $build->feature('Network'); } elsif ($build->feature('Network')) { $do_network_tests = $accept ? 0 : $build->y_n("Do you want to run tests that require connection to servers across the internet\n(likely to cause some failures)? y/n", 'n'); } if ($do_network_tests) { $build->notes(network => 1); $build->log_info(" - will run internet-requiring tests\n"); my $use_email = $build->y_n("Do you want to run tests requiring a valid email address? y/n",'n'); if ($use_email) { my $address = $build->prompt("Enter email address:"); $build->notes(email => $address); } } else { $build->notes(network => 0); $build->log_info(" - will not run internet-requiring tests\n"); } # Create the build script and exit $build->create_build_script; bioperl-run-release-1-7-3/Changes000066400000000000000000000174031342734133000166740ustar00rootroot00000000000000Summary of important user-visible changes for BioPerl-Run --------------------------------------------------------- 1.7.3 * The following modules have been moved to the BioPerl distribution so that new BioPerl Run tools can be developed without being dependent on the whole BioPerl-Run distribution: Bio::Tools::Run::Analysis Bio::Tools::Run::AnalysisFactory Bio::Tools::Run::Phylo::PhyloBase Bio::Tools::Run::WrapperBase Bio::Tools::Run::WrapperBase::CommandExts * The following modules have been removed from BioPerl-Run to be part of separate distributions and have independent development: Bio::Tools::Phylo::Gumby Bio::Tools::Run::AssemblerBase Bio::Tools::Run::BWA Bio::Tools::Run::BWA::Config Bio::Tools::Run::Bowtie Bio::Tools::Run::Bowtie::Config Bio::Tools::Run::Cap3 Bio::Tools::Run::Maq Bio::Tools::Run::Maq::Config Bio::Tools::Run::Meme Bio::Tools::Run::Minimo Bio::Tools::Run::Newbler Bio::Tools::Run::Phrap Bio::Tools::Run::Phylo::Gumby Bio::Tools::Run::TigrAssembler * New program previously part of the BioPerl distribution: bp_blast2tree * All Bio::Installer modules and the bioperl_application_installer script have been removed. These were unsafe and out of date. Use a package manager of your choice to install external programs. 1.7.2 * Minor release - PAML, Clustalw, and TCoffee related modules are all separate distributions, one for each set. These are Bio-Tools-Phylo-PAML, Bio-Tools-Run-Alignment-TCoffee and Bio-Tools-Run-Alignment-Clustalw [carandraug] * Fix EMBOSS and SABlastPlus test count, which was failing when EMBOSS wasn't installed [cjfields] * Fix various tests to skip if Bio::FeatureIO isn't installed [cjfields] * Various documentation fixes, including INSTALL updates [bosborne] * Add Bio::FeatureIO as a 'recommends' (needed for Gumby, MCS, Match, Phastcons) 1.7.001 * Minor release to deal with version indexing 1.7.000 * Bio::Tools::Run::WrapperBase moved from bioperl core to bioperl-run * Updaed Samtools wrapper, minimal support for samtools > v.1 added [cjfields] * Minor updates to sync with BioPerl v. 1.7.x release series 1.6.901 * added run support for MSAProbs [Jessen Bredeson] 1.6.900 * Bowtie and BWA wrappers for NGS [maj, Ben Bimber, Dan Kortschak] * ClustalW v2 support [cjfields] * tRNAscanSE support [Mark Johnson, cjfields] * Glimmer v2 updates [Mark Johnson, cjfields] * PAML codeml wrapper updated to work with PAML 4.4d [DaveMessina] * Phyml updates [hyphaltip] * Repeatmasker updates [cjfields] * Initial BLAST+ modules (Bio::Tools::Run::BlastPlus/StandAloneBlastPlus) [maj] * Improved Bio::Tools::Run::AssemblerBase module and update of the wrappers that use it [fangly, maj] * Support for running new de novo and comparative assemblers: 454 Newbler [fangly], Minimo [fangly], Maq [maj], Samtools [maj], Bowtie [maj] * [bug 2728] add support to Bio::Tools::Run::Alignment::ClustalW for ClustalW2 [cjfields] * [RT 50363] make a bit more Windows friendly with file paths * [bug 2713] - Bio::Tools::Run::Infernal now works with Infernal 1.0 (older versions deprecated) [cjfields] * Bio::Tools::Run::Alignment::Gmap added [hartzell] * [bug 2798] - patch to fix clustalw premature file unlinking error [Wei Zhou] 1.6.0 Release * All Pise and Pise-related modules and scripts have been moved to the new bioperl-pise repository. The Pise service is no longer available and has been replaced by Mobyle. They have been retained as one can still install a Pise server, and as these modules can possibly be used to create a new BioPerl API for Mobyle. 1.5.2 Release in sync with bioperl core * Several wrappers updated for newer versions of the programs. 1.5.1 Release in sync with bioperl core o First major release in a while, so lots of things in this release o PHYLIP wrappers are updated for PHYLIP 3.6, some programs will no longer work (DrawTree and DrawGram specifically) for 3.5 at ths point. It will depend on whether or not anyone really wants this if we'll add in the necessary stuf to support 3.5. It isn't hard, just requires some stuff in th PhylipConf.pm modules. o Bio::Tools::Run::Alignment::Muscle added o PAML wrapper for Yn00 and Codeml are more forgiving about the argument validation. o Several wrappers updated for newer versions of the programs. TribeMCL, Genewise, RepeatMasker 1.2.2 Release update in sync with bioperl core o Soaplab - API changes - binary input added o Pise - changes affecting most Bio::Tools::Run:PiseApplication modules - Numerous documentation fixes in almost all modules - Added code in the SYNOPSIS, as well as the FEEDBACK, COPYRIGHT and SEE ALSO parts. - the DESCRIPTION section now contains *only* the parameters that can be set by the client. - remote parameter to -location to conform to Bio::Tools::Run::AnalysisFactory interface - new programs sirna, tranalign, twofeat (from EMBOSS 2.6). o Bio::Tools::Run::Eponine - More standardized way of running o Bio::Tools::Run::FootPrinter - Write the files properly - Mark Wagner's enhancements bug #1399 o Bio::Tools::Run::Genewise - more options o Bio::Tools::Run::Genscan - doc fix o Bio::Tools::Run::Hmmpfam - Updated to set params properly and return a SearchIO object o Bio::Tools::Run::Mdust - new location - Modified to inherit Bio::Tools::Run::WrapperBase - use Bio::Root::IO to build up paths - Modified documentation to conform to bioperl format o Bio::Tools::Run::Signalp - uniform sequence truncation lenght o Bio::Tools::Run::Vista - new module - Support more options - More documentation - fix reverse sequence bug o Bio::Tools::Run::Phylo::Phylip::SeqBoot - Allow more than one alignment o Bio::Tools::Run::Phylo::Phylip::Neighbor - Check for multiple data sets and set parameter accordingly o Bio::Tools::Run::Alignment::Blat - moved from Bio::Tools::Run name space - some code cleanup to avoid warnings and insure filehandles are properly closed, etc o Bio::Tools::Run::Alignment::Lagan - program name included - small fixes and addition of options - added the right credits. - Bio::Tools::Run::Alignment::DBA and Bio::Tools::Run::Alignment::Sim4 - Quiet declaration warnings 1.2 Developer release o Analysis Factory framework- currently providing SOAP access to EMBOSS applications o Support for FootPrinter, Genewise, Hmmpfam, Primate, Prints, Profile, Promoterwise, Pseudowise, Seg, Signalp, Tmhmm,TribeMCL, Blat,DBA,Lagan,Sim4,Fasta,ProtML,Vista o Added support for PHYLIP apps: Consense, DrawGram, DrawTree, SeqBoot o Added INSTALL.PROGRAMS providing references to download the program binaries. o Bug Fixes that hopefully solves the 'too many open files' problem 0.01 Initial release o Package is broken off from bioperl-live to support just runnable wrapper modules. o Support for PAML codeml tested, aaml still waiting o Support for Molphy protml, nucml to come o Support for EMBOSS pkg - still need to move component from bioperl-live Bio::Factory::EMBOSS to this package and rename it Bio::Tools::Run::EMBOSSFactory or something equivalent. o Support for Clustalw, TCoffee, Local NCBI BLAST. o RepeatMasker, Genscan, Pseudowise, TribeMCL, Primate, Eponine. o Support for remote analysis through Pise and NCBI Web Blast queue. o Select PHYLIP apps: Neighbor, ProtDist, and ProtPars. bioperl-run-release-1-7-3/DEPENDENCIES000066400000000000000000000134651342734133000171560ustar00rootroot00000000000000BioPerl-run Dependencies NOTE : This file was auto-generated by the core helper script maintenance/dependencies.pl. Do not edit directly! The following packages are used by BioPerl. While not all are required for BioPerl to operate properly, some functionality will be missing without them. You can easily choose to install all of these during the normal installation process. Note that the PPM version of the BioPerl packages always tries to install all dependencies. NB: This list of packages is not authoritative. See the 'requires', 'build_requires' and 'recommends' sections of Build.PL instead. ============================================================================== | Distribution | Module used - Description | Min. ver. | |---------------------------+--------------------------------------+-----------| | IO-String | * IO::String - IO::File interface | None | | | for in-core strings | | |==============================================================================| | Used by: | |------------------------------------------------------------------------------| | * Bio::Tools::Run::Phylo::Phylip::Consense - IO::String | ============================================================================== ============================================================================== | Distribution | Module used - Description | Min. ver. | |---------------------------+--------------------------------------+-----------| | IPC-Run | * IPC::Run - Child procs w/ piping, | None | | | redir and psuedo-ttys | | |==============================================================================| | Used by: | |------------------------------------------------------------------------------| | * Bio::Tools::Run::Genemark - IPC::Run | | * Bio::Tools::Run::Glimmer - IPC::Run | | * Bio::Tools::Run::TigrAssembler - IPC::Run | | * Bio::Tools::Run::tRNAscanSE - IPC::Run | ============================================================================== ============================================================================== | Distribution | Module used - Description | Min. ver. | |---------------------------+--------------------------------------+-----------| | XML-Twig | * XML::Twig - A module for easy | None | | | processing of XML | | |==============================================================================| | Used by: | |------------------------------------------------------------------------------| | * Bio::Tools::Run::EMBOSSacd - XML::Twig | ============================================================================== ============================================================================== | Distribution | Module used - Description | Min. ver. | |---------------------------+--------------------------------------+-----------| | bioperl | * Bio::Seq - NA | None | |==============================================================================| | Used by: | |------------------------------------------------------------------------------| | * Bio::Tools::Run::Cap3 - Bio::Seq | | * Bio::Tools::Run::Genscan - Bio::Seq | | * Bio::Tools::Run::TribeMCL - Bio::Seq | | * Bio::Tools::Run::Vista - Bio::Seq | | * Bio::Tools::Run::Alignment::Amap - Bio::Seq | | * Bio::Tools::Run::Alignment::Kalign - Bio::Seq | | * Bio::Tools::Run::Alignment::Lagan - Bio::Seq | | * Bio::Tools::Run::Alignment::MAFFT - Bio::Seq | | * Bio::Tools::Run::Alignment::Muscle - Bio::Seq | | * Bio::Tools::Run::Alignment::Probalign - Bio::Seq | | * Bio::Tools::Run::Alignment::Probcons - Bio::Seq | | * Bio::Tools::Run::Alignment::Proda - Bio::Seq | | * Bio::Tools::Run::Alignment::StandAloneFasta - Bio::Seq | ============================================================================== ============================================================================== | Distribution | Module used - Description | Min. ver. | |---------------------------+--------------------------------------+-----------| | libwww-perl | * HTTP::Request::Common - Functions | None | | | that generate HTTP::Requests | | | | * LWP - Libwww-perl | | |==============================================================================| | Used by: | |------------------------------------------------------------------------------| | * Bio::Installer::Generic - HTTP::Request::Common | | * Bio::Installer::Generic - LWP | ============================================================================== bioperl-run-release-1-7-3/INSTALL000077500000000000000000000045401342734133000164330ustar00rootroot00000000000000bioperl-run INSTALLATION INSTALL THE RIGHT BIOPERL You need at least the corresponding version of Bioperl. Since this is BioPerl-run 1.007001, you should use BioPerl 1.007001. INSTALLATION Installation instructions at the following address apply here: http://bioperl.org/INSTALL.html The next 2 sections summarize the essential points from there. CPAN INSTALLATION To install using CPAN you will need a recent version (v1.8802 has been tested) of it and your prefer_installer conf set to 'MB': >cpan cpan>o conf prefer_installer MB cpan>o conf commit cpan>q Find the name of the bioperl-run version you want: >cpan cpan>d /bioperl-run/ Database was generated on Mon, 20 Nov 2006 05:24:36 GMT Distribution CJFIELDS/BioPerl-Run-1.007001.tar.gz Now install: cpan>install CJFIELDS/BioPerl-Run-1.007001.tar.gz If you've installed everything perfectly then you may pass all the tests run in the './Build test' phase. It's also possible that you may fail some tests. Possible explanations: problems with local Perl installation, previously undetected bug in Bioperl, flawed test script and so on. A few failed tests may not affect your usage of bioperl-run. If you decide that the failed tests will not affect how you intend to use bioperl-run and you'd like to install anyway do: cpan>force install CJFIELDS/BioPerl-Run-1.007001.tar.gz This is what most experienced Bioperl users would do. However, if you're concerned about a failed test and need assistance or advice then contact bioperl-l@bioperl.org. MANUAL INSTALLATION Download the bioperl-run archive, then extract its contents. Example: >gunzip bioperl-run-.tar.gz >tar xvf bioperl-run-.tar >cd bioperl-run where is the current release. Issue the following command from within bioperl-run/: >perl Build.PL You can run regression tests and install bioperl-run using the following commands: >./Build test >./Build install NOTE: many tests will be skipped without the necessary environment variables set to tell Bioperl where your programs are installed. INSTALLING bioperl-run ON WINDOWS The following page on the BioPerl website has up-to-date instructions on how to install bioperl-run on Windows: http://www.bioperl.org/wiki/Installing_Bioperl_on_Windows (the instructions are aimed at bioperl-core, but apply equally to bioperl-run) bioperl-run-release-1-7-3/INSTALL.PROGRAMS000066400000000000000000000077361342734133000176730ustar00rootroot00000000000000INSTALL.PROGRAMS: The Bioperl-run package has (Bio)perl wrappers written for the following applications: 1. Coils - Prediction of Coiled Coil Regions in Proteins Bio::Tools::Run::Coil http://www.ch.embnet.org/software/COILS_form.html 2. EMBOSS Applications - European Molecular Biology Open Software Suite Bio::Tools::Run::EMBOSSApplication http://www.hgmp.mrc.ac.uk/Software/EMBOSS/ 3. Eponine - Transcription Start Site finder Bio::Tools::Run::Eponine http://www.sanger.ac.uk/Software/analysis/eponine/ 4. FootPrinter - Program that performs phylogenetic footprinting. Bio::Tools::Run::FootPrinter http://abstract.cs.washington.edu/~blanchem/FootPrinterWeb/FootPrinterInput.pl 5. Genewise - Gene prediction program Bio::Tools::Run::Genewise http://www.sanger.ac.uk/software/wise2 6. Genscan - Identification of complete gene structures in genomic DNA Bio::Tools::Run::Genscan http://genes.mit.edu/GENSCAN.html 7. Hmmpfam - search a single sequence against an HMM database Bio::Tools::Run::Hmmpfam http://hmmer.wustl.edu/ 8. PISE - Web interfaces for Biological Programs Bio::Tools::Run::PiseApplication http://www-alt.pasteur.fr/~letondal/Pise/ 9. Primate - Near exact match finder for short sequence tags. Bio::Tools::Run:::Primate http://cvsweb.sanger.ac.uk/cgi-bin/cvsweb.cgi/ensembl-nci/?cvsroot=Ensembl 10. FingerPRINTScan - identify the closest matching PRINTS sequence motif fingerprints in a protein sequence Bio::Tools::Run::Prints http://www.bioinf.man.ac.uk/fingerPRINTScan/ 11. pfscan - scan a protein or DNA sequence with a profile library Bio::Tools::Run::Profile http://www.isrec.isb-sib.ch/software/software.html 12. Pseudowise - a pseudogene precdiction program, part of the wise2 package Bio::Tools::Run::Pseudowise http://www.sanger.ac.uk/software/wise2 13. RepeatMasker - screens DNA sequences in fasta format against a library of repetitive elements Bio::Tools::Run::RepeatMasker http://repeatmasker.genome.washington.edu 14. Seg - Identify low-complexity regions in protein sequences Bio::Tools::Run::Seg ftp://ftp.ncbi.nih.gov/pub/seg/ 15. Signalp - predicts the presence and location of signal peptide cleavage sites in amino acid sequences Bio::Tools::Run::Signalp http://www.cbs.dtu.dk/services/SignalP/ 16. Tmhmm - Prediction of transmembrane helices in proteins Bio::Tools::Run::Tmhmm http://www.cbs.dtu.dk/services/TMHMM/ 17. TribeMCL - Method for clustering proteins into related groups. Bio::Tools::Run::TribeMCL http://www.ebi.ac.uk/research/cgg/tribe/ 19. Molphy - MOLecular PHYlogenetics Package Bio::Tools::Run::Molphy http://www.ism.ac.jp/software/ismlib/softother.e.html 20. Phylip - Suite of Phylogenetics programs (Version 3.6) Bio::Tools::Run::Phylip http://evolution.genetics.washington.edu/phylip.html 22. DBA - DNA Block Aligner Bio::Tools::Run::Alignment::DBA http://www.sanger.ac.uk/software/wise2 23. Sim4 - Align CDNA to genomic sequences Bio::Tools::Run::Alignment::Sim4 http://globin.cse.psu.edu/ 25. BLAST - Basic Local Alignment Search Tool Bio::Tools::Run::StandAloneBlast (in bioperl-live CVS repository) ftp://ftp.ncbi.nih.gov/blast/executables 26. FASTA,SSEARCH - Pairwise sequence alignment Bio::Tools::Run::StandAloneFasta ftp://ftp.virginia.edu/pub/fasta 27. Promoterwise - Sequence alignment designed for promoter sequences http://www.sanger.ac.uk/software/wise2 28. Lagan - Lagan suite of tools including MLAGAN http://lagan.stanford.edu/ 29. Vista - Visualizing global DNA sequence alignments of arbitrary length http://www-gsd.lbl.gov/vista/VISTAdownload2.html 30. Exonerate - A package of alignment tools for protein and EST to genome/DNA alignments http://www.ebi.ac.uk/~guy/exonerate/ 31. AMAP- Protein multiple alignment based sequence annealing http://bio.math.berkeley.edu/amap/ 32. Blat - An alignment tool like BLAST, but structured differently http://genome.ucsc.edu/cgi-bin/hgBlat bioperl-run-release-1-7-3/INSTALL.SKIP000066400000000000000000000000211342734133000171630ustar00rootroot00000000000000ConfigData\.\S+$ bioperl-run-release-1-7-3/LICENSE000066400000000000000000001212701342734133000164040ustar00rootroot00000000000000BioPerl is licensed under the same terms as Perl itself, which means it is dually-licensed under either the Artistic or GPL licenses. Below are details of the Artistic License and, following it, the GPL. The "Artistic License" Preamble The intent of this document is to state the conditions under which a Package may be copied, such that the Copyright Holder maintains some semblance of artistic control over the development of the package, while giving the users of the package the right to use and distribute the Package in a more-or-less customary fashion, plus the right to make reasonable modifications. Definitions: "Package" refers to the collection of files distributed by the Copyright Holder, and derivatives of that collection of files created through textual modification. "Standard Version" refers to such a Package if it has not been modified, or has been modified in accordance with the wishes of the Copyright Holder as specified below. "Copyright Holder" is whoever is named in the copyright or copyrights for the package. "You" is you, if you're thinking about copying or distributing this Package. "Reasonable copying fee" is whatever you can justify on the basis of media cost, duplication charges, time of people involved, and so on. (You will not be required to justify it to the Copyright Holder, but only to the computing community at large as a market that must bear the fee.) "Freely Available" means that no fee is charged for the item itself, though there may be fees involved in handling the item. It also means that recipients of the item may redistribute it under the same conditions they received it. 1. You may make and give away verbatim copies of the source form of the Standard Version of this Package without restriction, provided that you duplicate all of the original copyright notices and associated disclaimers. 2. You may apply bug fixes, portability fixes and other modifications derived from the Public Domain or from the Copyright Holder. A Package modified in such a way shall still be considered the Standard Version. 3. You may otherwise modify your copy of this Package in any way, provided that you insert a prominent notice in each changed file stating how and when you changed that file, and provided that you do at least ONE of the following: a) place your modifications in the Public Domain or otherwise make them Freely Available, such as by posting said modifications to Usenet or an equivalent medium, or placing the modifications on a major archive site such as uunet.uu.net, or by allowing the Copyright Holder to include your modifications in the Standard Version of the Package. b) use the modified Package only within your corporation or organization. c) rename any non-standard executables so the names do not conflict with standard executables, which must also be provided, and provide a separate manual page for each non-standard executable that clearly documents how it differs from the Standard Version. d) make other distribution arrangements with the Copyright Holder. 4. You may distribute the programs of this Package in object code or executable form, provided that you do at least ONE of the following: a) distribute a Standard Version of the executables and library files, together with instructions (in the manual page or equivalent) on where to get the Standard Version. b) accompany the distribution with the machine-readable source of the Package with your modifications. c) give non-standard executables non-standard names, and clearly document the differences in manual pages (or equivalent), together with instructions on where to get the Standard Version. d) make other distribution arrangements with the Copyright Holder. 5. You may charge a reasonable copying fee for any distribution of this Package. You may charge any fee you choose for support of this Package. You may not charge a fee for this Package itself. However, you may distribute this Package in aggregate with other (possibly commercial) programs as part of a larger (possibly commercial) software distribution provided that you do not advertise this Package as a product of your own. You may embed this Package's interpreter within an executable of yours (by linking); this shall be construed as a mere form of aggregation, provided that the complete Standard Version of the interpreter is so embedded. 6. The scripts and library files supplied as input to or produced as output from the programs of this Package do not automatically fall under the copyright of this Package, but belong to whoever generated them, and may be sold commercially, and may be aggregated with this Package. If such scripts or library files are aggregated with this Package via the so-called "undump" or "unexec" methods of producing a binary executable image, then distribution of such an image shall neither be construed as a distribution of this Package nor shall it fall under the restrictions of Paragraphs 3 and 4, provided that you do not represent such an executable image as a Standard Version of this Package. 7. C subroutines (or comparably compiled subroutines in other languages) supplied by you and linked into this Package in order to emulate subroutines and variables of the language defined by this Package shall not be considered part of this Package, but are the equivalent of input as in Paragraph 6, provided these subroutines do not change the language in any way that would cause it to fail the regression tests for the language. 8. Aggregation of this Package with a commercial distribution is always permitted provided that the use of this Package is embedded; that is, when no overt attempt is made to make this Package's interfaces visible to the end user of the commercial distribution. Such use shall not be construed as a distribution of this Package. 9. The name of the Copyright Holder may not be used to endorse or promote products derived from this software without specific prior written permission. 10. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. The End GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. "Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: Copyright (C) This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . bioperl-run-release-1-7-3/README.md000066400000000000000000000010621342734133000166520ustar00rootroot00000000000000# Description This is the home for *bioperl-run*, which contain modules that provides a Perl interface to various bioinformatics applications. This allows various applications to be used with common BioPerl objects. See the *Changes* file for more information about what is contained in here. # Installation See the accompanying *INSTALL* file for details on installing bioperl-run. # Feedback Write down any problems or praise and send them to bioperl-l@bioperl.org. # Bugs Bug reports can be made using the GitHub Issues tracker for this distribution. bioperl-run-release-1-7-3/lib/000077500000000000000000000000001342734133000161425ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/000077500000000000000000000000001342734133000166535ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/DB/000077500000000000000000000000001342734133000171405ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/DB/ESoap.pm000077500000000000000000000244111342734133000205120ustar00rootroot00000000000000# # BioPerl module for Bio::DB::ESoap # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::ESoap - Client for the NCBI Entrez EUtilities SOAP server =head1 SYNOPSIS $fac = Bio::DB::ESoap->new( -util => 'esearch' ); $som = $fac->run( -db => 'prot', -term => 'HIV and gp120' ); $fac->set_parameters( -term => 'HIV2 and gp160' ); # accessors corresponding to valid parameters are also created: $fac->db('nuccore'); $som = $fac->run; # more later. =head1 DESCRIPTION C provides a basic SOAP interface to the NCBI Entrez Utilities Web Service (L). L handles the SOAP calls. Higher level access, pipelines, BioPerl object I/O and such are provided by L. C complies with L. It depends explicitly on NCBI web service description language files to inform the C method. WSDLs are parsed by a relative lightweight, Entrez-specific module L. The C method returns L (SOAP Message) objects. No fault checking or other parsing is performed in this module. =head1 SEE ALSO L, L, L =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::DB::ESoap; use strict; use warnings; use Bio::Root::Root; use Bio::DB::ESoap::WSDL; use SOAP::Lite; use base qw(Bio::Root::Root Bio::ParameterBaseI); =head2 new Title : new Usage : my $obj = new Bio::DB::ESoap(); Function: Builds a new Bio::DB::ESoap factory Returns : an instance of Bio::DB::ESoap Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($util, $fetch_db, $wsdl) = $self->_rearrange( [qw( UTIL FETCH_DB WSDL_FILE )], @args ); $self->throw("Argument -util must be specified") unless $util; my @wsdl_pms; if ($wsdl) { @wsdl_pms = ( '-wsdl' => $wsdl ); } else { $fetch_db ||= 'seq'; my $url = ($util =~ /fetch/ ? 'f_'.$fetch_db : 'eutils'); $url = $NCBI_BASEURL.$WSDL{$url}; @wsdl_pms = ( '-url' => $url ); } $self->_wsdl(Bio::DB::ESoap::WSDL->new(@wsdl_pms)); $self->_operation($util); $self->_init_parameters; $self->_client( SOAP::Lite->new( proxy => $self->_wsdl->service ) ); return $self; } =head2 _wsdl() Title : _wsdl Usage : $obj->_wsdl($newval) Function: Bio::DB::ESoap::WSDL object associated with this factory Example : Returns : value of _wsdl (object) Args : on set, new value (object or undef, optional) =cut sub _wsdl { my $self = shift; return $self->{'_wsdl'} = shift if @_; return $self->{'_wsdl'}; } =head2 _client() Title : _client Usage : $obj->_client($newval) Function: holds a SOAP::Lite object Example : Returns : value of _client (a SOAP::Lite object) Args : on set, new value (a SOAP::Lite object or undef, optional) =cut sub _client { my $self = shift; return $self->{'_client'} = shift if @_; return $self->{'_client'}; } =head2 _operation() Title : _operation Alias : util Usage : Function: check and convert the requested operation based on the wsdl Returns : Args : operation (scalar string) =cut sub _operation { my $self = shift; my $util = shift; return $self->{'_operation'} unless $util; $self->throw("WSDL not yet initialized") unless $self->_wsdl; my $opn = $self->_wsdl->operations; if ( grep /^$util$/, keys %$opn ) { return $self->{'_operation'} = $util; } elsif ( grep /^$util$/, values %$opn ) { my @a = grep { $$opn{$_} eq $util } keys %$opn; return $self->{'_operation'} = $a[0]; } else { $self->throw("Utility '$util' is not recognized"); } } sub util { shift->_operation(@_) } =head2 action() Title : action Usage : Function: return the soapAction associated with the factory's utility Returns : scalar string Args : none =cut sub action { my $self = shift; return $self->{_action} if $self->{_action}; return $self->{_action} = ${$self->_wsdl->operations}{$self->util}; } =head2 wsdl_file() Title : wsdl_file Usage : Function: get filename of the local WSDL XML copy Returns : filename (scalar string) Args : none =cut sub wsdl_file { my $self = shift; if (ref ($self->_wsdl->wsdl) eq 'File::Temp') { return $self->_wsdl->wsdl->filename; } return $self->_wsdl->wsdl; } =head2 run() Title : _run Usage : $som = $self->_run(@optional_setting_args) Function: Call the SOAP service with the factory-associated utility and parameters Returns : SOAP::SOM (SOAP Message) object Args : named parameters appropriate for the utility Note : no fault checking here =cut sub run { my $self = shift; my @args = @_; $self->throw("SOAP::Lite client not initialized") unless $self->_client; $self->throw("run requires named args") if @args % 2; $self->set_parameters(@args) if scalar @args; my %args = $self->get_parameters; my @soap_data; for my $k (keys %args) { ## kludges for NCBI inconsistencies: my $k_ncbi; for ($k) { /QueryKey/ && do { $k_ncbi = 'query_key'; last; }; /RetMax/ && do { $k_ncbi = 'retmax'; last; }; $k_ncbi = $k; } my $data = $args{$k}; next unless defined $data; for (ref $data) { /^$/ && do { push @soap_data, SOAP::Data->name($k_ncbi)->value($data); last; }; /ARRAY/ && do { push @soap_data, SOAP::Data->name($k_ncbi)->value(join(',',@$data)); last; }; /HASH/ && do { # for adding multiple data items with the same message # key (id lists for elink, e.g.) # see ...::SoapEUtilities, c. line 151 push @soap_data, map { SOAP::Data->name($k_ncbi)->value($_) } keys %$data; }; } } $self->_client->on_action( sub { $self->action } ); my $som = $self->_client->call( $self->util, @soap_data ); return $som; } sub _result_elt_name { my $s=shift; (keys %{$s->_wsdl->response_parameters($s->util)})[0] }; sub _response_elt_name { shift->_result_elt_name } sub _request_elt_name { my $s=shift; (keys %{$s->_wsdl->request_parameters($s->util)})[0] }; =head2 Bio::ParameterBaseI compliance =cut sub available_parameters { my $self = shift; my @args = @_; return @{$self->_init_parameters}; } sub set_parameters { my $self = shift; my @args = @_; $self->throw("set_parameters requires named args") if @args % 2; ($_%2 ? 1 : $args[$_] =~ s/^-//) for (0..$#args); my %args = @args; # special translations : if ( defined $args{'usehistory'} ) { $args{'usehistory'} = ($args{'usehistory'} ? 'y' : undef); } $self->_set_from_args(\%args, -methods=>$self->_init_parameters); return $self->parameters_changed(1); } sub get_parameters { my $self = shift; my @ret; foreach (@{$self->_init_parameters}) { next unless defined $self->$_(); push @ret, ($_, $self->$_()); } return @ret; } sub reset_parameters { my $self = shift; my @args = @_; $self->throw("reset_parameters requires named args") if @args % 2; ($_%2 ? 1 : $args[$_] =~ s/^-//) for (0..$#args); my %args = @args; my %reset; @reset{@{$self->_init_parameters}} = (undef) x @{$self->_init_parameters}; $reset{$_} = $args{$_} for keys %args; $self->_set_from_args( \%reset, -methods => $self->_init_parameters ); $self->parameters_changed(1); return 1; } =head2 parameters_changed() Title : parameters_changed Usage : $obj->parameters_changed($newval) Function: flag to indicate, well, you know Example : Returns : value of parameters_changed (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub parameters_changed { my $self = shift; return $self->{'parameters_changed'} = shift if @_; return $self->{'parameters_changed'}; } =head2 _init_parameters() Title : _init_parameters Usage : $fac->_init_parameters Function: identify the available input parameters using the wsdl object Returns : arrayref of parameter names (scalar strings) Args : none =cut sub _init_parameters { my $self = shift; return $self->{_params} if $self->{_params}; $self->throw("WSDL not yet initialized") unless $self->_wsdl; my $phash = {}; my $val = (values %{$self->_wsdl->request_parameters($self->util)})[0]; $$phash{$_} = undef for map { keys %$_ } @{$val}; my $params =$self->{_params} = [sort keys %$phash]; # create parm accessors $self->_set_from_args( $phash, -methods => $params, -create => 1, -code => 'my $self = shift; if (@_) { $self->parameters_changed(1); return $self->{\'_\'.$method} = shift; } $self->parameters_changed(0); return $self->{\'_\'.$method};' ); $self->parameters_changed(1); return $self->{_params}; } 1; bioperl-run-release-1-7-3/lib/Bio/DB/ESoap/000077500000000000000000000000001342734133000201475ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/DB/ESoap/WSDL.pm000077500000000000000000000471361342734133000212740ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::DB::ESoap::WSDL # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::ESoap::WSDL - WSDL parsing for Entrez SOAP EUtilities =head1 SYNOPSIS Used by L # url $wsdl = Bio::DB::ESoap::WSDL->new( -url => "http://www.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/eutils.wsdl" ); # local copy $wsdl = Bio::DB::ESoap::WSDL->new( -wsdl => "local/eutils.wsdl" ); %opns = %{ $wsdl->operations }; =head1 DESCRIPTION This module is a lightweight parser and container for WSDL XML files associated with the NCBI EUtilities SOAP server. XML facilities are provided by L. The following accessors provide names and structures useful for creating SOAP messages using L (e.g.): service() : the URL of the SOAP service operations() : hashref of the form {.., $operation_name => $soapAction, ...} request_parameters($operation) : request field names and namelists as an array of hashes result_parameters($operation) : result field names and namelists as an array of hashes The following accessors provide L objects pointing at key locations in the WSDL: root : the root of the WSDL docment _types_elt : the element _portType_elt : the element _binding_elt : the element _service_elt : the element _message_elts : an array of all top-level elements _operation_elts : an array of all elements contained in Parsing occurs lazily (on first read, not on construction); all information is cached. To clear the cache and force re-parsing, run $wsdl->clear_cache; The globals C<$NCBI_BASEURL>, C<$NCBI_ADAPTOR>, and C<%WSDL> are exported. $NCBI_ADAPTOR : the soap service cgi To construct a URL for a WSDL: $wsdl_eutils = $NCBI_BASEURL.$WSDL{'eutils'} $wsdl_efetch_omim = $NCBI_BASEURL.$WSDL{'f_omim'} # etc. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::DB::ESoap::WSDL; use strict; use Bio::Root::Root; use XML::Twig; use Bio::WebAgent; use File::Temp; use base qw(Bio::Root::Root Exporter); our @EXPORT = qw( $NCBI_BASEURL $NCBI_ADAPTOR %WSDL ); our $NCBI_BASEURL = "http://www.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/"; our $NCBI_ADAPTOR = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/soap_adapter_2_0.cgi"; our %WSDL = ( 'eutils' => 'eutils.wsdl', 'f_pubmed' => 'efetch_pubmed.wsdl', 'f_pmc' => 'efetch_pmc.wsdl', 'f_nlmc' => 'efetch_nlmc.wsdl', 'f_journals' => 'efetch_journals.wsdl', 'f_omim' => 'efetch_omim.wsdl', 'f_taxon' => 'efetch_taxon.wsdl', 'f_snp' => 'efetch_snp.wsdl', 'f_gene' => 'efetch_gene.wsdl', 'f_seq' => 'efetch_seq.wsdl' ); =head2 new Title : new Usage : my $obj = new Bio::DB::ESoap::WSDL(); Function: Builds a new Bio::DB::ESoap::WSDL object Returns : an instance of Bio::DB::ESoap::WSDL Args : named args: -URL => $url_of_desired_wsdl -OR- -WSDL => $filename_of_local_wsdl_copy ( -WSDL will take precedence if both specified ) =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($url, $wsdl) = $self->_rearrange( [qw( URL WSDL )], @args ); my (%sections, %cache); my $doc = 'wsdl:definitions'; $sections{'_message_elts'} = []; $sections{'_operation_elts'} = []; $self->_sections(\%sections); $self->_cache(\%cache); $self->_twig( XML::Twig->new( twig_handlers => { $doc => sub { $self->root($_) }, "$doc/binding" => sub { $self->_sections->{'_binding_elt'} = $_ }, "$doc/binding/operation" => sub { push @{$self->_sections->{'_operation_elts'}},$_ }, "$doc/message" => sub { push @{$self->_sections->{'_message_elts'}}, $_ }, "$doc/portType" => sub { $self->_sections->{'_portType_elt'} = $_ }, "$doc/service" => sub { $self->_sections->{'_service_elt'} = $_ }, "$doc/types" => sub { $self->_sections->{'_types_elt'} = $_ }, } ) ); if ($url || $wsdl ) { $self->url($url); $self->wsdl($wsdl); $self->_parse; } return $self; } =head1 Getters =head2 request_parameters() Title : request_parameters Usage : @params = $wsdl->request_parameters($operation_name) Function: get array of request (input) fields required by specified operation, according to the WSDL Returns : hash of arrays of hashes... Args : scalar string (operation or action name) =cut sub request_parameters { my $self = shift; my ($operation) = @_; my $is_action; $self->throw("Operation name must be specified") unless defined $operation; my $opn_hash = $self->operations; unless ( grep /^$operation$/, keys %$opn_hash ) { $is_action = grep /^$operation$/, values %$opn_hash; $self->throw("Operation name '$operation' is not recognized") unless ($is_action); } #check the cache here.... return $self->_cache("request_params_$operation") if $self->_cache("request_params_$operation"); # find the input message type in the portType elt if ($is_action) { my @a = grep {$$opn_hash{$_} eq $operation} keys %$opn_hash; # note this takes the first match $operation = $a[0]; $self->throw("Whaaa??") unless defined $operation; } #check the cache once more after translation.... return $self->_cache("request_params_$operation") if $self->_cache("request_params_$operation"); my $bookmarks = $self->_operation_bookmarks($operation); my $imsg_elt = $bookmarks->{'i_msg_elt'}; my $opn_schema = $bookmarks->{'schema'}; my $ret = { $imsg_elt->att('name') => [] }; # do a quick recursion: _get_types((values %$ret)[0], $imsg_elt, $opn_schema); return $self->_cache("request_params_$operation", $ret); 1; } =head2 result_parameters() Title : result_parameters Usage : $result_hash = $wsdl->result_parameters Function: retrieve a hash structure describing the result of running the specified operation according to the WSDL Returns : hash of arrays of hashes... Args : operation (scalar string) =cut sub result_parameters { my $self = shift; my ($operation) = @_; my $is_action; $self->throw("Operation name must be specified") unless defined $operation; my $opn_hash = $self->operations; unless ( grep /^$operation$/, keys %$opn_hash ) { $is_action = grep /^$operation$/, values %$opn_hash; $self->throw("Operation name '$operation' is not recognized") unless ($is_action); } #check the cache here.... return $self->_cache("result_params_$operation") if $self->_cache("result_params_$operation"); # find the input message type in the portType elt if ($is_action) { my @a = grep {$$opn_hash{$_} eq $operation} keys %$opn_hash; # note this takes the first match $operation = $a[0]; $self->throw("Whaaa??") unless defined $operation; } #check the cache once more after translation.... return $self->_cache("result_params_$operation") if $self->_cache("result_params_$operation"); # do work my $bookmarks = $self->_operation_bookmarks($operation); # eutilities results seem to be a mixture of xs:string element # and complex types which are just xs:seqs of xs:string elements # # cast these as a hash of hashes... my $omsg_elt = $bookmarks->{'o_msg_elt'}; my $opn_schema = $bookmarks->{'schema'}; my $ret = { $omsg_elt->att('name') => [] }; # do a quick recursion: _get_types((values %$ret)[0], $omsg_elt, $opn_schema); return $self->_cache("result_params_$operation", $ret); } sub response_parameters { shift->result_parameters( @_ ) } =head2 operations() Title : operations Usage : @opns = $wsdl->operations; Function: get a hashref with elts ( $operation_name => $soapAction ) for all operations defined by this WSDL Returns : array of scalar strings Args : none =cut sub operations { my $self = shift; return $self->_cache('operations') if $self->_cache('operations'); my %opns; foreach (@{$self->_parse->_operation_elts}) { $opns{$_->att('name')} = ($_->descendants('soap:operation'))[0]->att('soapAction'); } return $self->_cache('operations', \%opns); } =head2 service() Title : service Usage : $wsdl->service Function: gets the SOAP service url associated with this WSDL Returns : scalar string Args : none =cut sub service { my $self = shift; return $self->_cache('service') || $self->_cache('service', ($self->_parse->_service_elt->descendants('soap:address'))[0]->att('location')); } =head2 db() Title : db Usage : Function: If this is an efetch WSDL, returns the db name associated with it Returns : scalar string or undef Args : none =cut sub db { my $self = shift; $self->root->namespace('nsef') =~ /efetch_(.*?)$/; return $1; } =head1 Internals =head2 _operation_bookmarks() Title : _operation_bookmarks Usage : Function: find useful WSDL elements associated with the specified operation; return a hashref of the form { $key => $XML_Twig_Elt_obj, } Returns : hashref with keys: portType namespace schema i_msg_type i_msg_elt o_msg_type o_msg_elt Args : operation name (scalar string) Note : will import schema if necessary =cut sub _operation_bookmarks { my $self = shift; my $operation = shift; # check cache return $self->_cache("bookmarks_$operation") if $self->_cache("bookmarks_$operation"); # do work my %bookmarks; my $pT_opn = $self->_portType_elt->first_child( qq/ operation[\@name="$operation"] / ); my $imsg_type = $pT_opn->first_child('input')->att('message'); my $omsg_type = $pT_opn->first_child('output')->att('message'); # now lookup the schema element name from among the message elts my ($imsg_elt, $omsg_elt); foreach ( @{$self->_message_elts} ) { my $msg_name = $_->att('name'); if ( $imsg_type =~ qr/$msg_name/ ) { $imsg_elt = $_->first_child('part[@element=~/[Rr]equest/]')->att('element'); } if ( $omsg_type =~ qr/$msg_name/) { $omsg_elt = $_->first_child('part[@element=~/[Rr]esult/]')->att('element'); } last if ($imsg_elt && $omsg_elt); } $self->throw("Can't find request schema element corresponding to '$operation'") unless $imsg_elt; $self->throw("Can't find result schema element corresponding to '$operation'") unless $omsg_elt; # $imsg_elt has a namespace prefix, to lead us to the correct schema # as defined in the wsdl element. Get that schema $imsg_elt =~ /(.*?):/; my $opn_ns = $self->root->namespace($1); my $opn_schema = $self->_types_elt->first_child("xs:schema[\@targetNamespace='$opn_ns']"); $opn_schema ||= $self->_types_elt->first_child("xs:schema"); # only one $self->throw("Can't find types schema corresponding to '$operation'") unless defined $opn_schema; # need to import the schema? do it here. if ( my $import_elt = $opn_schema->first_child("xs:import") ) { my $import_url = $NCBI_BASEURL.$import_elt->att('schemaLocation'); my $imported = XML::Twig->new(); # better error checking here? eval { $imported->parse(Bio::WebAgent->new()->get($import_url)->content); }; $self->throw("Schema import failed (tried url '$import_url') : $@") if $@; my $imported_schema = $imported->root; # get included schemata my @included = $imported_schema->children("xs:include"); foreach (@included) { my $url = $NCBI_BASEURL.$_->att('schemaLocation'); my $incl = XML::Twig->new(); eval { $incl->parse( Bio::WebAgent->new()->get($url)->content ); }; $self->throw("Schema include failed (tried url '$url') : $@") if $@; # cut-n-paste my @incl = $incl->root->children; $_->cut; foreach my $child (@incl) { $child->cut; $child->paste( last_child => $_->former_parent ); } } # cut-n-paste $opn_schema->cut; $imported_schema->cut; $imported_schema->paste( first_child => $opn_schema->former_parent ); $opn_schema = $imported_schema; } # find the definition of $imsg_elt in $opn_schema $imsg_elt =~ s/.*?://; $imsg_elt = $opn_schema->first_child("xs:element[\@name='$imsg_elt']"); $self->throw("Can't find request element definition in schema corresponding to '$operation'") unless defined $imsg_elt; $omsg_elt =~ s/.*?://; $omsg_elt = $opn_schema->first_child("xs:element[\@name='$omsg_elt']"); $self->throw("Can't find result element definition in schema corresponding to '$operation'") unless defined $omsg_elt; @bookmarks{qw(portType i_msg_type o_msg_type namespace schema i_msg_elt o_msg_elt ) } = ($pT_opn, $imsg_type, $omsg_type, $opn_ns, $opn_schema, $imsg_elt, $omsg_elt); return $self->_cache("bookmarks_$operation", \%bookmarks); } =head2 _parse() Title : _parse Usage : $wsdl->_parse Function: parse the wsdl at url and create accessors for section twig elts Returns : self Args : =cut sub _parse { my $self = shift; my @args = @_; return $self if $self->_parsed; # already done $self->throw("Neither URL nor WSDL set in object") unless $self->url || $self->wsdl; eval { if ($self->wsdl) { $self->_twig->parsefile($self->wsdl); } else { eval { my $tfh = File::Temp->new(-UNLINK=>1); Bio::WebAgent->new()->get($self->url, ':content_file' => $tfh->filename); $tfh->close; $self->_twig->parsefile($tfh->filename); $self->wsdl($tfh->filename); }; $self->throw("URL parse failed : $@") if $@; } }; # $self->throw("Parser issue : $@") if $@; die $@ if $@; $self->_set_from_args( $self->_sections, -methods => [qw(_types_elt _message_elts _portType_elt _binding_elt _operation_elts _service_elt)], -create => 1 ); $self->_parsed(1); return $self; } =head2 root() Title : root Usage : $obj->root($newval) Function: holds the root Twig elt of the parsed WSDL Example : Returns : value of root (an XML::Twig::Elt) Args : on set, new value (an XML::Twig::Elt or undef, optional) =cut sub root { my $self = shift; return $self->{'root'} = shift if @_; return $self->{'root'}; } =head2 url() Title : url Usage : $obj->url($newval) Function: get/set the WSDL url Example : Returns : value of url (a scalar string) Args : on set, new value (a scalar or undef, optional) =cut sub url { my $self = shift; return $self->{'url'} = shift if @_; return $self->{'url'}; } =head2 wsdl() Title : wsdl Usage : $obj->wsdl($newval) Function: get/set wsdl XML filename Example : Returns : value of wsdl (a scalar string) Args : on set, new value (a scalar string or undef, optional) =cut sub wsdl { my $self = shift; my $file = shift; if (defined $file) { $self->throw("File not found") unless (-e $file) || (ref $file eq 'File::Temp'); return $self->{'wsdl'} = $file; } return $self->{'wsdl'}; } =head2 _twig() Title : _twig Usage : $obj->_twig($newval) Function: XML::Twig object for handling the wsdl Example : Returns : value of _twig (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub _twig { my $self = shift; return $self->{'_twig'} = shift if @_; return $self->{'_twig'}; } =head2 _sections() Title : _sections Usage : $obj->_sections($newval) Function: holds hashref of twigs corresponding to main wsdl elements; filled by _parse() Example : Returns : value of _sections (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub _sections { my $self = shift; return $self->{'_sections'} = shift if @_; return $self->{'_sections'}; } =head2 _cache() Title : _cache Usage : $wsdl->_cache($newval) Function: holds the wsdl info cache Example : Returns : value of _cache (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub _cache { my $self = shift; my ($name, $value) = @_; unless (@_) { return $self->{'_cache'} = {}; } if (defined $value) { return $self->{'_cache'}->{$name} = $value; } return $self->{'_cache'}->{$name}; } sub clear_cache { shift->_cache() } =head2 _parsed() Title : _parsed Usage : $obj->_parsed($newval) Function: flag to indicate wsdl already parsed Example : Returns : value of _parsed (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub _parsed { my $self = shift; return $self->{'_parsed'} = shift if @_; return $self->{'_parsed'}; } # =head2 _get_types() # Title : _get_types # Usage : very internal # Function: recursively parse through custom types # Returns : # Args : arrayref, XML::Twig::Elt, XML::Twig::Elt # (return array, type element, schema root) # =cut sub _get_types { my ($res, $elt, $sch, $visited) = @_; my $is_choice; $visited ||= []; # assuming max 1 xs:sequence or xs:choice per element my $seq = ($elt->descendants('xs:sequence'))[0]; $is_choice = ($seq ? '' : '|'); $seq ||= ($elt->descendants('xs:choice'))[0]; return 1 unless $seq; foreach ( $seq->descendants('xs:element') ) { for my $type ($_->att('type') || $_->att('ref')) { !defined($type) && do { Bio::Root::Root->throw("neither type nor ref attributes defined; cannot proceed"); last; }; $type eq 'xs:string' && do { push @$res, { $_->att('name').$is_choice => 1}; last; }; do { # custom type # find the type def in schema $type =~ s/.*?://; # strip tns if (grep /^$type$/, @$visited) { # check for circularity push @$res, { $_->att('name').$is_choice => "$type(reused)"}if $_->att('name'); last; } push @$visited, $type; my $new_elt = $sch->first_child("xs:complexType[\@name='$type']"); if (defined $new_elt) { my $new_res = []; push @$res, { $_->att('name').$is_choice => $new_res }; _get_types($new_res, $new_elt, $sch, $visited); } else { # a 'ref', make sure it's defined $new_elt = $sch->first_child("xs:element[\@name='$type']"); $DB::single=1 unless $new_elt; Bio::Root::Root->throw("type not defined in schema; cannot proceed") unless defined $new_elt; push @$res, { $new_elt->att('name').$is_choice => 1 }; } last; } } } return 1; } sub DESTROY { my $self = shift; if (ref($self->wsdl) eq 'File::Temp') { unlink $self->wsdl->filename; } } 1; bioperl-run-release-1-7-3/lib/Bio/DB/SoapEUtilities.pm000077500000000000000000000600771342734133000224160ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::DB::SoapEUtilities # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::SoapEUtilities - Interface to the NCBI Entrez web service *BETA* =head1 SYNOPSIS use Bio::DB::SoapEUtilities; # factory construction my $fac = Bio::DB::SoapEUtilities->new() # executing a utility call #get an iteratable adaptor my $links = $fac->elink( -dbfrom => 'protein', -db => 'taxonomy', -id => \@protein_ids )->run(-auto_adapt => 1); # get a Bio::DB::SoapEUtilities::Result object my $result = $fac->esearch( -db => 'gene', -term => 'sonic and human')->run; # get the raw XML message my $xml = $fac->efetch( -db => 'gene', -id => \@gids )->run( -raw_xml => 1 ); # change parameters my $new_result = $fac->efetch( -db => 'gene', -id => \@more_gids)->run; # reset parameters $fac->efetch->reset_parameters( -db => 'nucleotide', -id => $nucid ); $result = $fac->efetch->run; # parsing and iterating the results $count = $result->count; @ids = $result->ids; while ( my $linkset = $links->next_link ) { $submitted = $linkset->submitted_id; } ($taxid) = $links->id_map($submitted_prot_id); $species_io = $fac->efetch( -db => 'taxonomy', -id => $taxid )->run( -auto_adapt => 1); $species = $species_io->next_species; $linnaeus = $species->binomial; =head1 DESCRIPTION This module allows the user to query the NCBI Entrez database via its SOAP (Simple Object Access Protocol) web service (described at L). The basic tools (C) are available as methods off a C factory object. Parameters for each tool can be queried, set and reset for each method through the L standard calls (C). Returned data can be retrieved, accessed and parsed in several ways, according to user preference. Adaptors and object iterators are available for C, C, C, and C results. =head1 USAGE The C system has been designed to be as easy (few includes, available parameter facilities, reasonable defaults, intuitive aliases, built-in pipelines) or as complex (accessors for underlying low-level objects, all parameters accessible, custom hooks for builder objects, facilities for providing local copies of WSDLs) as the user requires or desires. (To the extent that it does not succeed in either direction, it is up to the user to report to the mailing list (L)!) =head2 Factory To begin, make a factory: my $fac = Bio::DB::SoapEUtilities->new(); From the factory, utilities are called, parameters are set, and results or adaptors are retrieved. If you have your own copy of the wsdl, use my $fac = Bio::Db::SoapEUtilities->new( -wsdl_file => $my_wsdl ); otherwise, the correct one will be obtained over the network (by L and friends). =head2 Utilities and parameters To run any of the standard NCBI EUtilities (C), call the desired utility from the factory. To use a utility, you must set its parameters and run it to get a result. TMTOWTDI: # verbose my $fetch = $fac->efetch(); $fetch->set_parameters( -db => 'gene', -id => [828392, 790]); my $result = $fetch->run; # compact my $result = $fac->efetch(-db =>'gene',-id => [828392,790])->run; # change ids $fac->efetch->set_parameters( -id => 470338 ); $result = $fac->run; # another util $result = $fac->esearch(-db => 'protein', -term => 'BRCA and human')->run; # the utilities are kept separate %search_params = $fac->esearch->get_parameters; %fetch_params = $fac->efetch->get_parameters; $search_param{db}; # is 'protein' $fetch_params{db}; # is 'gene' The factory is L compliant: that means you can find out what you can set with @available_search = $fac->esearch->available_parameters; @available_egquery = $fac->egquery->available_parameters; For more information on parameters, see L. =head2 Results The "intermediate" object for C query results is the L. This is a BioPerly parsing of the SOAP message sent by NCBI when a query is C. This can be very useful on it's own, but most users will likely want to proceed directly to L, which take a C and turn it into more intuitive/familiar BioPerl objects. Go there if the following details are too gory. Results can be highly- or lowly-parsed, depending on the parameters passed to the factory C method. To get the raw XML message with no parsing, do my $xml = $fac->$util->run(-raw_xml => 1); # $xml is a scalar string To retrieve a L object with limited parsing, but with accessors to the L message (provided by L), do my $result = $fac->$util->run(-no_parse => 1); my $som = $result->som; my $method_hash = $som->method; # etc... To retrieve a C object with message elements parsed into accessors, including C and C, run without arguments: my $result = $fac->esearch->run() my $count = $result->count; my @Count = $result->Count; # counts for each member of # the translation stack my @ids = $result->IdList_Id; # from automatic message parsing @ids = $result->ids; # a convenient alias See L for more, even gorier details. =head2 Adaptors Adaptors convert EUtility Cs into convenient objects, via a handle that usually provides an iterator, in the spirit of L. These are probably more useful than the C to the typical user, and so you can retrieve them automatically by setting the C parameter C<-auto_adapt => 1>. In general, retrieve an adaptor like so: $adp = $fac->$util->run( -auto_adapt => 1 ); # iterate... while ( my $obj = $adp->next_obj ) { # do stuff with $obj } The adaptor itself occasionally possesses useful methods besides the iterator. The method C always works, but a natural alias is also always available: $seqio = $fac->esearch->run( -auto_adapt => 1 ); while ( my $seq = $seqio->next_seq ) { # do stuff with $seq } In the above example, C<-auto_adapt => 1> also instructs the factory to perform an C based on the ids returned by the C (if any), so that the adaptor returned iterates over L objects. Here is a rundown of the different adaptor flavors: =over =item * C, Fetch Adaptors, and BioPerl object iterators The C creates bona fide BioPerl objects. Currently, there are FetchAdaptor subclasses for sequence data (both Genbank and FASTA rettypes) and taxonomy data. The choice of FetchAdaptor is based on information in the result message, and should be transparent to the user. $seqio = $fac->efetch( -db =>'nucleotide', -id => \@ids, -rettype => 'gb' )->run( -auto_adapt => 1 ); while (my $seq = $seqio->next_seq) { my $taxio = $fac->efetch( -db => 'taxonomy', -id => $seq->species->ncbi_taxid )->run(-auto_adapt => 1); my $tax = $taxio->next_species; unless ( $tax->TaxId == $seq->species->ncbi_taxid ) { print "more work for MAJ" } } See the pod for the FetchAdaptor subclasses (e.g., L) for more detail. =item * C, the Link adaptor, and the C iterator The C manages LinkSets. In C, an C call B preserves the correspondence between submitted and retrieved ids. The mapping between these can be accessed from the adaptor object directly as C my $links = $fac->elink( -db => 'protein', -dbfrom => 'nucleotide', -id => \@nucids )->run( -auto_adapt => 1 ); # maybe more than one associated id... my @prot_0 = $links->id_map( $nucids[0] ); Or iterate over the linksets: while ( my $ls = $links->next_linkset ) { @ids = $ls->ids; @submitted_ids = $ls->submitted_ids; # etc. } =item * C, the DocSum adaptor, and the C iterator The C manages docsums, the C return type. The objects returned by iterating with a C have accessors that let you obtain field information directly. Docsums contain lots of easy-to-forget fields; use C to remind yourself. my $docs = $fac->esummary( -db => 'taxonomy', -id => 527031 )->run(-auto_adapt=>1); # iterate over docsums while (my $d = $docs->next_docsum) { @available_items = $docsum->item_names; # any available item can be called as an accessor # from the docsum object...watch your case... $sci_name = $d->ScientificName; $taxid = $d->TaxId; } =item * C, the GQuery adaptor, and the C iterator The C manages global query items returned by calls to C, which identifies all NCBI databases containing hits for your query term. The databases actually containing hits can be retrieved directly from the adaptor with C: my $queries = $fac->egquery( -term => 'BRCA and human' )->run(-auto_adapt=>1); my @dbs = $queries->found_in_dbs; Retrieve the global query info returned for B database with C: my $prot_q = $queries->query_by_db('protein'); if ($prot_q->count) { #do something } Or iterate as usual: while ( my $q = $queries->next_query ) { if ($q->status eq 'Ok') { # do sth } } =back =head2 Web environments and query keys To make large or complex requests for data, or to share queries, it may be helpful to use the NCBI WebEnv system to manage your queries. Each EUtility accepts the following parameters: -usehistory -WebEnv -QueryKey for this purpose. These store the details of your queries serverside. C attempts to make using these relatively straightforward. Use C objects to obtain the correct parameters, and don't forget C<-usehistory>: my $result1 = $fac->esearch( -term => 'BRCA and human', -db => 'nucleotide', -usehistory => 1 )->run( -no_parse=>1 ); my $result = $fac->esearch( -term => 'AND early onset', -QueryKey => $result1->query_key, -WebEnv => $result1->webenv )->run( -no_parse => 1 ); my $result = $fac->esearch( -db => 'protein', -term => 'sonic', -usehistory => 1 )->run( -no_parse => 1 ); # later (but not more than 8 hours later) that day... $result = $fac->esearch( -WebEnv => $result->webenv, -QueryKey => $result->query_key, -RetMax => 800 # get 'em all )->run; # note we're parsing the result... @all_ids = $result->ids; =head2 Error checking Two kinds of errors can ensue on an Entrez SOAP run. One is a SOAP fault, and the other is an error sent in non-faulted SOAP message from the server. The distinction is probably systematic, and I would welcome an explanation of it. To check for result errors, try something like: unless ( $result = $fac->$util->run ) { die $fac->errstr; # this will catch a SOAP fault } # a valid result object was returned, but it may carry an error if ($result->count == 0) { warn "No hits returned"; if ($result->ERROR) { warn "Entrez error : ".$result->ERROR; } } Error handling will be improved in the package eventually. =head1 SEE ALSO L, L, L. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::DB::SoapEUtilities; use strict; use Bio::Root::Root; use Bio::DB::ESoap; use Bio::DB::SoapEUtilities::DocSumAdaptor; use Bio::DB::SoapEUtilities::FetchAdaptor; use Bio::DB::SoapEUtilities::GQueryAdaptor; use Bio::DB::SoapEUtilities::LinkAdaptor; use Bio::DB::SoapEUtilities::Result; use base qw(Bio::Root::Root Bio::ParameterBaseI ); our $AUTOLOAD; =head2 new Title : new Usage : my $eutil = new Bio::DB::SoapEUtilities(); Function: Builds a new Bio::DB::SoapEUtilities object Returns : an instance of Bio::DB::SoapEUtilities Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($db, $wsdl) = $self->_rearrange( [qw( DB WSDL_FILE )], @args ); $self->{db} = $db; $self->{'_wsdl_file'} = $wsdl; return $self; } =head2 run() Title : run Usage : $fac->$eutility->run(@args) Function: Execute the EUtility Returns : true on success, false on fault or error (reason in errstr(), for more detail check the SOAP message in last_result() ) Args : named params appropriate to utility -auto_adapt => boolean ( return an iterator over results as appropriate to util if true) -raw_xml => boolean ( return raw xml result; no processing ) Bio::DB::SoapEUtilities::Result constructor parms =cut sub run { my $self = shift; my @args = @_; $self->throw("run method requires named arguments") if @args % 2; $self->throw("call run method like '\$fac->\$eutility->run(\@args)") unless $self->_caller_util; my ($autofetch, $raw_xml) = $self->_rearrange( [qw( AUTO_ADAPT RAW_XML)], @args ); my ($adaptor); my %args = @args; # add tool argument for NCBI records $args{tool} = "BioPerl"; my %params = $self->get_parameters; $self->warn("No -email parameter set : be advised that NCBI requires a valid email to accompany all requests") unless $params{email}; my $util = $self->_caller_util; # pass util args to run only to a downstream utility (i.e., efetch # on autofetch.. # $self->set_parameters(%args) if %args; # kludge for elink : make sure to-ids and from-ids are associated if ( $util eq 'elink' ) { my $es = $self->_soap_facs($util); my $ids = $es->id; if (ref $ids eq 'ARRAY') { my %ids; @ids{@$ids} = (1) x scalar @$ids; $es->id(\%ids); } } $self->_soap_facs($util)->_client->outputxml($raw_xml); my $som = $self->{'_response_message'} = $self->_soap_facs($util)->run; # raw xml only... if ($raw_xml) { return $som; } # SOAP::SOM parsing... # check response status if ($som->fault) { $self->{'errstr'} = $som->faultstring; return 0; } # elsif non-fault error if (my $err = $som->valueof("//ErrorList")) { while ( my ($key, $val) = each %$err ) { $self->{'errstr'} .= join( " : ", $key, $val )."\n"; }; $self->{'errstr'} =~ s/\n$//; return 0; } # attach some key properties to the factory $self->{'_WebEnv'} = $som->valueof("//WebEnv"); # create convenient aliases off result for different utils my @alias_hash; for ($util) { /einfo/ && do { my %args = $self->get_parameters; if ($args{db}) { push @alias_hash, ( '-alias_hash' => { 'record_count' => 'DbInfo_Count', 'last_update' => 'DbInfo_LastUpdate', 'db' => 'DbInfo_DbName', 'description' => 'DbInfo_Description' } ); } else { push @alias_hash, ('-alias_hash' => {'dbs' => 'DbList_DbName'} ); } last; }; # put others here as nec } my $result = Bio::DB::SoapEUtilities::Result->new($self, @args, @alias_hash); # success, parse it out if ($autofetch) { for ($self->_caller_util) { $_ eq 'esearch' && do { # do an efetch with the same db and a returned list of ids... # reentering here! my $ids = $result->ids; if (!$result->count) { $self->warn("Can't fetch; no records returned"); return $result; } if (!$result->ids) { $self->warn("Can't fetch; no id list returned"); return $result; } if ( !$self->db ) { my %h = $self->get_parameters; $self->{db} = $h{db} || $h{DB}; } # pass run() args to the downstream utility here # (so can specify -rettype, basically) # note @args will contain -auto_adapt => 1 here. # keep the email arg my %parms = $self->get_parameters; $adaptor = $self->efetch( -db => $self->db, -id => $ids, -email => $parms{email}, -tool => $parms{tool}, @args )->run(-no_parse => 1, @args); last; }; $_ eq 'elink' && do { $adaptor = Bio::DB::SoapEUtilities::LinkAdaptor->new( -result => $result ); last; }; $_ eq 'esummary' && do { $adaptor = Bio::DB::SoapEUtilities::DocSumAdaptor->new( -result => $result ); last; }; $_ eq 'egquery' && do { $adaptor = Bio::DB::SoapEUtilities::GQueryAdaptor->new( -result => $result ); last; }; $_ eq 'efetch' && do { $adaptor = Bio::DB::SoapEUtilities::FetchAdaptor->new( -result => $result ); last; }; # else, ignore } return ($adaptor || $result); } else { return $result; 1; } } =head2 Useful Accessors =head2 response_message() Title : response_message Aliases : last_response, last_result Usage : $som = $fac->response_message Function: get the last response message Returns : a SOAP::SOM object Args : none =cut sub response_message { shift->{'_response_message'} } sub last_response { shift->{'_response_message'} } sub last_result { shift->{'_response_message'} } =head2 webenv() Title : webenv Usage : Function: contains WebEnv key referencing the session (set after run() ) Returns : scalar Args : none =cut sub webenv { shift->{'_WebEnv'} } =head2 errstr() Title : errstr Usage : $fac->errstr Function: get the last error, if any Example : Returns : value of errstr (a scalar) Args : none =cut sub errstr { shift->{'errstr'} } sub _wsdl_file { shift->{'_wsdl_file'} } =head2 Bio::ParameterBaseI compliance =head2 available_parameters() Title : available_parameters Usage : Function: get available request parameters for calling utility Returns : Args : -util => $desired_utility [optional, default is caller utility] =cut sub available_parameters { my $self = shift; my @args = @_; my %args = @args; my $util = $args{'-util'} || $args{'-UTIL'} || $self->_caller_util; return unless $self->_soap_facs($util); delete $args{'-util'}; delete $args{'-UTIL'}; $self->_soap_facs($util)->available_parameters(%args); } =head2 set_parameters() Title : set_parameters Usage : Function: Returns : none Args : -util => $desired_utility [optional, default is caller utility], named utility arguments =cut sub set_parameters { my $self = shift; my @args = @_; my %args = @args; my $util = $args{'-util'} || $args{'-UTIL'} || $self->_caller_util; return unless $self->_soap_facs($util); delete $args{'-util'}; delete $args{'-UTIL'}; $self->_soap_facs($util)->set_parameters(%args); } =head2 get_parameters() Title : get_parameters Usage : Function: Returns : array of named parameters Args : utility (scalar string) [optional] (default is caller utility) =cut sub get_parameters { my $self = shift; my @args = @_; my %args = @args; my $util = $args{'-util'} || $args{'-UTIL'} || $self->_caller_util; return unless $self->_soap_facs($util); return $self->_soap_facs($util)->get_parameters; } =head2 reset_parameters() Title : reset_parameters Usage : Function: Returns : none Args : -util => $desired_utility [optional, default is caller utility], named utility arguments =cut sub reset_parameters { my $self = shift; my @args = @_; my %args = @args; my $util = $args{'-util'} || $args{'-UTIL'} || $self->_caller_util; return unless $self->_soap_facs($util); delete $args{'-util'}; delete $args{'-UTIL'}; $self->_soap_facs($util)->reset_parameters(%args); } =head2 parameters_changed() Title : parameters_changed Usage : Function: Returns : boolean Args : utility (scalar string) [optional] (default is caller utility) =cut sub parameters_changed { my $self = shift; my @args = @_; my %args = @args; my $util = $args{'-util'} || $args{'-UTIL'} || $self->_caller_util; return unless $self->_soap_facs($util); return $self->_soap_facs($util)->parameters_changed; } # idea behind using autoload: attempt to buffer the module # against additions of new eutilities, and (of course) to # reduce work (laziness, not Laziness) sub AUTOLOAD { my $self = shift; my $util = $AUTOLOAD; my @args = @_; $util =~ s/.*:://; if ( $util =~ /^e/ ) { # this will bite me someday # create an ESoap factory for this utility my $fac = $self->_soap_facs($util); # check cache my @pms = ( -util => $util ); if ($self->_wsdl_file) { push @pms, ( -wsdl_file => $self->_wsdl_file ); } eval { $fac ||= Bio::DB::ESoap->new( @pms ); }; for ($@) { /^$/ && do { $self->_soap_facs($util,$fac); # put in cache last; }; /Utility .* not recognized/ && do { my $err = (ref $@ ? $@->text : $@); $self->throw($err); }; do { #else my $err = (ref $@ ? $@->text : $@); die $err; $self->throw("Problem creating ESoap client : $err"); }; } # arg setting $self->throw("Named arguments required") if @args % 2; $fac->set_parameters(@args) if @args; $self->_caller_util($util); return $self; # now, can do $obj->esearch()->run, etc, with methods in # this package, with an appropriate low-level factory # set up in the background. } elsif ($self->_caller_util) { # delegate to the appropriate soap factory my $method = $util; $util = $self->_caller_util; my $soapfac = $self->_soap_facs($util); if ( $soapfac && $soapfac->can($method) ) { return $soapfac->$method(@args); } } else { $self->throw("Can't locate method '$util' in module ". __PACKAGE__); } 1; } =head2 _soap_facs() Title : _soap_facs Usage : $self->_soap_facs($util, $fac) Function: caches Bio::DB::ESoap factories for the eutils in use by this instance Example : Returns : Bio::DB::ESoap object Args : $eutility, [optional on set] $esoap_factory_object =cut sub _soap_facs { my $self = shift; my ($util, $fac) = @_; $self->throw("Utility must be specified") unless $util; $self->{'_soap_facs'} ||= {}; if ($fac) { return $self->{'_soap_facs'}->{$util} = $fac; } return $self->{'_soap_facs'}->{$util}; } =head2 _caller_util() Title : _caller_util Usage : $self->_caller_util($newval) Function: the utility requested off the main SoapEUtilities object Example : Returns : value of _caller_util (a scalar string, a valid eutility) Args : on set, new value (a scalar string [optional]) =cut sub _caller_util { my $self = shift; return $self->{'_caller_util'} = shift if @_; return $self->{'_caller_util'}; } 1; bioperl-run-release-1-7-3/lib/Bio/DB/SoapEUtilities/000077500000000000000000000000001342734133000220435ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/DB/SoapEUtilities/DocSumAdaptor.pm000077500000000000000000000132411342734133000251120ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::DB::SoapEUtilities::DocSumAdaptor # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::SoapEUtilities::DocSumAdaptor - Handle for Entrez SOAP DocSums =head1 SYNOPSIS my $fac = Bio::DB::SoapEUtilities->new(); # run a query, returning a DocSumAdaptor my $docs = $fac->esummary( -db => 'taxonomy', -id => 527031 )->run(-auto_adapt=>1); # iterate over docsums while (my $d = $docs->next_docsum) { @available_items = $docsum->item_names; # any available item can be called as an accessor # from the docsum object...watch your case... $sci_name = $d->ScientificName; $taxid = $d->TaxId; } =head1 DESCRIPTION This adaptor provides an iterator (C) and other convenience functions for parsing NCBI Entrez EUtility C SOAP results. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::DB::SoapEUtilities::DocSumAdaptor; use strict; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use base qw(Bio::Root::Root ); =head2 new Title : new Usage : my $obj = new Bio::DB::SoapEUtilities::DocSumAdaptor(); Function: Builds a new Bio::DB::SoapEUtilities::DocSumAdaptor object Returns : an instance of Bio::DB::SoapEUtilities::DocSumAdaptor Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($result) = $self->_rearrange([qw(RESULT)], @args); $self->throw("DocSumAdaptor requires a SoapEUtilities::Result argument") unless $result; $self->throw("DocSumAdaptor only works with elink results") unless $result->util eq 'esummary'; $self->{'_result'} = $result; $self->{'_idx'} = 1; return $self; } sub result { shift->{'_result'} } =head2 next_docsum() Title : next_docsum Usage : Function: return the next DocSum from the attached Result Returns : Args : =cut sub next_docsum { my $self = shift; my $stem = "//Body/".$self->result->result_type."/[".$self->{'_idx'}."]"; my $som = $self->result->som; return unless $som->valueof($stem); my ($ret, %params); my $get = sub { $som->valueof("$stem/".shift) }; $params{'-id'} = $get->('Id'); my $names = []; for (my $i = 1; my $data = $som->dataof("$stem/[$i]"); $i++) { if ( $data->value and $data->value !~ /^\s*$/) { my $name = $data->attr->{'Name'}; next unless $name; my $content = $som->valueof("$stem/[$i]/ItemContent"); unless (defined $content) { next unless $som->dataof("$stem/[$i]/Item"); my $h = {}; _traverse_items("$stem/[$i]", $som, $h); $content = $h; } push @$names, $name; $params{$name} = $content; } } $params{'_item_names'} = $names; my $class = ref($self)."::docsum"; $ret = $class->new(%params); ($self->{'_idx'})++; return $ret; } sub next_obj { shift->next_docsum(@_) } sub rewind { shift->{'_idx'} = 1; }; sub _traverse_items { my ($stem, $som, $h) = @_; for (my $i = 1; my $data = $som->dataof($stem."/[$i]"); $i++) { my $name = $data->attr->{'Name'}; next unless $name; if ($name =~ /Type$/) { # clip out this node _traverse_items("$stem/[$i]", $som, $h); } else { my $content = $som->valueof("$stem/[$i]/ItemContent"); if ($content) { $$h{$name} = $content; } else { $$h{$name} = {}; _traverse_items("$stem/[$i]", $som, $$h{$name}); } } } return; } 1; #### package Bio::DB::SoapEUtilities::DocSumAdaptor::docsum; use strict; use warnings; use base qw(Bio::Root::Root); sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my %args = @args; $self->_set_from_args( \%args, -methods => [map { /^-?(.*)/ } keys %args], -create => 1, -code => 'my $self = shift; my $d = shift; my $k = \'_\'.$method; $self->{$k} = $d if $d; return (ref($self->{$k}) eq \'ARRAY\' ? @{$self->{$k}} : $self->{$k});' ); return $self; } =head2 item_names() Title : item_names Usage : @accs = $docsum->item_names Function: Return a list of items accessible from the object Returns : array of scalar strings Args : none =cut sub item_names { my $a = shift->{'__item_names'} ; return @$a if $a } 1; bioperl-run-release-1-7-3/lib/Bio/DB/SoapEUtilities/FetchAdaptor.pm000077500000000000000000000146241342734133000247570ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::DB::SoapEUtilities::FetchAdaptor # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::SoapEUtilities::FetchAdaptor - Conversion of Entrez SOAP messages to BioPerl objects =head1 SYNOPSIS $fac = Bio::DB::SoapEUtilities->new(); $soap_result = $fac->efetch( -db => 'protein', -id => 2597988 ); $adp = Bio::DB::SoapEUtilities::FetchAdaptor( -result => $soap_result, -type => 'seq' ); while ( $gb_seq = $adp->next_obj ) { # do stuff } =head1 DESCRIPTION C is the base class of a system, modeled after L, to parse SOAP responses from the NCBI Entrez C utility into germane BioPerl objects. The user will rarely need to instantiate a C with L object as in the L. It usually suffices to use the C<-auto_adapt> parameter in the factory C method: my $fac = Bio::DB::SoapEUtilities->new(); my $taxio = $fac->efetch(-db => 'taxonomy', -id => 1394)->run(-auto_adapt=>1); my $sp = $taxio->next_species; # Bio::Species objects my $seqio = $fac->efetch(-db => 'protein', -id => 730439)->run(-auto_adapt=>1); my $seq = $seqio->next_seq; # Bio::Seq::RichSeq objects =head1 SEE ALSO L, C subclasses =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::DB::SoapEUtilities::FetchAdaptor; use strict; use Bio::Root::Root; use base qw(Bio::Root::Root ); =head2 new Title : new Usage : my $obj = new Bio::DB::SoapEUtilities::FetchAdaptor(); Function: Builds a new Bio::DB::SoapEUtilities::FetchAdaptor object Returns : an instance of Bio::DB::SoapEUtilities::FetchAdaptor Args : named arguments -som => $soap_som_object (soap message) -type => $type ( optional, forces loading of $type adaptor ) =cut sub new { my ($class,@args) = @_; $class = ref($class) || $class; if ($class =~ /.*?::FetchAdaptor::(\S+)/) { my $self = $class->SUPER::new(@args); $self->_initialize(@args); return $self; } else { my %args = @args; my $result = $args{'-result'} || $args{'-RESULT'}; $class->throw("Bio::DB::SoapEUtilities::Result argument required") unless $result; $class->throw("RESULT argument must be a Bio::DB::SoapEUtilities::Result object") unless ref($result) eq 'Bio::DB::SoapEUtilities::Result'; # identify the correct adaptor module to load using Result info my $type ||= $result->fetch_type; $class->throw("Can't determine fetch type for this result") unless $type; # $type ultimately contains a FetchAdaptor subclass return unless( $class->_load_adaptor($type, $result) ); return "Bio::DB::SoapEUtilities::FetchAdaptor::$type"->new(@args); } } =head2 _initialize() Title : _initialize Usage : Function: Returns : Args : =cut sub _initialize { my $self = shift; my @args = @_; my ($result, $type) = $self->_rearrange([qw( RESULT TYPE )], @args); $self->throw("Bio::DB::SoapEUtilities::Result argument required") unless $result; $self->throw("RESULT argument must be a Bio::DB::SoapEUtilities::Result object") unless ref($result) eq 'Bio::DB::SoapEUtilities::Result'; $self->{'_type'} = $type || $result->fetch_type; $self->{'_result'} = $result; 1; } =head2 _load_adaptor() Title : _load_adaptor Usage : Function: loads a FetchAdaptor subclass Returns : Args : adaptor type (subclass name) =cut sub _load_adaptor { my ($class, $type, $result) = @_; return unless $type; # specials for ($result->fetch_type) { $_ eq 'seq' && do { $_[1] = $type = 'species' if $result->fetch_db and $result->fetch_db eq 'taxonomy'; last; }; # else, leave $type alone } my $module = "Bio::DB::SoapEUtilities::FetchAdaptor::".$type; my $ok; eval { $ok = $class->_load_module($module); }; for ($@) { /^$/ && do { return $ok; }; /Can't locate/ && do { $class->throw("Fetch adaptor for '$type' not found"); }; do { # else $class->throw("Error in fetch adaptor for '$type' : $@"); }; } } =head2 obj_class() Title : obj_class Usage : $adaptor->obj_class Function: Returns the fully qualified BioPerl classname of the objects returned by next_obj() Returns : scalar string (class name) Args : none =cut sub obj_class { shift->throw_not_implemented } =head2 next_obj() Title : next_obj Usage : $obj = $adaptor->next_obj Function: Returns the next parsed BioPerl object from the adaptor Returns : object of class obj_class() Args : none =cut sub next_obj { shift->throw_not_implemented } =head2 rewind() Title : rewind Usage : Function: Rewind the adaptor's iterator Returns : Args : none =cut sub rewind { shift->throw_not_implemented } =head2 result() Title : result Usage : Function: contains the SoapEUtilities::Result object Returns : Bio::DB::SoapEUtilities::Result object Args : none =cut sub result { shift->{'_result'} } =head2 type() Title : type Usage : Function: contains the fetch type of this adaptor Returns : Args : =cut sub type { shift->{'_type'} } 1; bioperl-run-release-1-7-3/lib/Bio/DB/SoapEUtilities/FetchAdaptor/000077500000000000000000000000001342734133000244075ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/DB/SoapEUtilities/FetchAdaptor/seq.pm000077500000000000000000000563601342734133000255520ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::DB::SoapEUtilities::FetchAdaptor::seq # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::SoapEUtilities::FetchAdaptor::seq - Fetch adaptor for 'seq' efetch SOAP messages =head1 SYNOPSIS Imported by L as required. =head1 DESCRIPTION Returns an iterator over L or L objects, depending on the the return type of the C. A standard C to a sequence database will return a GenBank SOAP result; this will be parsed into rich sequence objects: my $fac = Bio::DB::SoapEUtilities->new; my $seqio = $fac->efetch(-db => 'protein', -id => 730439)->run(-auto_adapt=>1); my $seq = $seqio->next_seq; $seq->species->binomial; # returns 'Bacillus caldolyticus' An C with C<-rettype => 'fasta'> will be parsed into L objects (VERY much faster): $seqio = $fac->efetch( -rettype => 'fasta' )->run(-auto_adapt=>1); $seq = $seqio->next_seq; $seq->species; # undef $seq->desc; # kitchen sink To find out the object type returned: $class = $seqio->obj_class; as for all L objects. =head1 SEE ALSO L, L =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 CONTRIBUTORS Much inspiration from L and family. =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::DB::SoapEUtilities::FetchAdaptor::seq; use strict; use Bio::Root::Root; use Bio::Annotation::Collection; use Bio::Annotation::Comment; use Bio::Annotation::DBLink; use Bio::Annotation::Reference; use Bio::Annotation::SimpleValue; use Bio::Factory::FTLocationFactory; use Bio::SeqFeature::Generic; use Bio::Seq::SeqBuilder; use Bio::Seq::SeqFactory; use Bio::Species; use base qw(Bio::DB::SoapEUtilities::FetchAdaptor Bio::Root::Root); our %VALID_ALPHABET = ( 'AA' => 'protein', 'DNA' => 'dna', 'RNA' => 'rna' ); our %TYPE_XLT = ( 'Bio::Seq' => ['TSeqSet','TSeq'], 'Bio::Seq::RichSeq' => ['GBSet', 'GBSeq'] ); sub _initialize { my ($self, @args) = @_; $self->SUPER::_initialize(@args); my ($builder, $seqfac ) = $self->_rearrange( [qw(SEQBUILDER SEQFACTORY)], @args ); # choose rich or simple seq based on result my ($t) = keys %{$self->result->som->method}; for ($t) { /^GB/ && do { $t = 'GB'; # genbank info $self->{'_obj_class'} = ($seqfac ? $seqfac->type : 'Bio::Seq::RichSeq'); last; }; /^T/ && do { $t = 'T'; # fasta info $self->{'_obj_class'} = ($seqfac ? $seqfac->type : 'Bio::Seq'); last; }; $self->throw("FetchAdaptor::seq : unrecognized result elt type '$t', can't parse"); } $self->{'_builder'} = $builder || Bio::Seq::SeqBuilder->new(); $self->{'_builder'}->sequence_factory( $seqfac || Bio::Seq::SeqFactory->new( -type => $self->{'_obj_class'} ) ); $self->{'_locfac'} = Bio::Factory::FTLocationFactory->new(); $self->{'_idx'} = 1; 1; } sub rewind { shift->{'_idx'} = 1 } sub obj_class { shift->{'_obj_class'} } sub builder { shift->{'_builder'} }; sub locfac { shift->{'_locfac'} }; sub next_obj { my $self = shift; my $t = $TYPE_XLT{$$self{_obj_class}}; my $stem = "//$$t[0]/[".$self->{'_idx'}."]"; my $som = $self->result->som; my $seqid; return unless defined $som->valueof("$stem"); my $get = sub { $som->valueof("$stem/$$t[1]_".shift) }; # speed up (?) by caching top-level data hash my $toplev = $som->valueof("$stem"); my $get_tl = sub { $toplev->{"$$t[1]_".shift} }; my %params = (-verbose => $self->verbose); if ($t->[0] =~ /^T/) { $params{'-display_id'} = $get_tl->('accver'); $params{'-primary_id'} = $get_tl->('gi'); $params{'-length'} = $get_tl->('length'); $params{'-desc'} = $get_tl->('defline'); $params{'-seq'} = $get_tl->('sequence'); $params{'-alphabet'} = $get_tl->('seqtype') || undef; $self->builder->add_slot_value(%params); ($self->{_idx})++; if ( !$self->builder->want_object ) { # skip $self->builder->make_object; goto &next_obj; } else { return $self->builder->make_object; } } elsif ($t->[0] =~ /^GB/) { # source, id, alphabet $params{'-display_id'} = $get_tl->('locus'); $params{'-length'} = $get_tl->('length'); $get_tl->('moltype') =~ /(AA|[DR]NA)/; $params{'-alphabet'} = $VALID_ALPHABET{$1} || ''; # molecule, division, dates $params{'-molecule'} = $get_tl->('moltype'); $params{'-is_circular'} = ($get_tl->('topology') eq 'circular'); $params{'-division'} = $get->('division'); $params{'-dates'} = [$get_tl->('create-date'), $get_tl->('update-date')]; $self->builder->add_slot_value(%params); %params = (); if ( !$self->builder->want_object ) { # skip this $self->builder->make_object; ($self->{_idx})++; goto &next_obj; } # accessions, version, pid, description $get_tl->('accession-version') =~ /.*\.([0-9]+)$/; $params{'-version'} = $params{'-seq_version'} = $1; my @secondary_ids; my @ids = $get->('other-seqids/GBSeqid'); foreach (@ids) { /^gi\|([0-9]+)/ && do { $seqid = $params{'-primary_id'} = $1; $params{'-accession_number'} = $_; # correct? next; }; do { # else push @secondary_ids, $_; next; }; } $params{'-secondary_accessions'} = \@secondary_ids; $params{'-desc'} = $get->('definition'); # sequence if ( $self->builder->want_slot('seq')) { $params{'-seq'} = $get->('sequence'); } # keywords if ($get->('keywords')) { my @kw; foreach my $kw ($som->valueof("$stem/GBSeq_keywords/*")) { push @kw, $kw; } $params{'-keywords'} = join(' ',@kw); } $self->builder->add_slot_value(%params); %params = (); my $ann; # annotations if ($self->builder->want_slot('annotation')) { $ann = Bio::Annotation::Collection->new(); # references if ($get->('references')) { $ann->add_Annotation('reference', $_) for _read_references($stem,$som); } # comment if ($get_tl->('comment')) { $ann->add_Annotation('comment', Bio::Annotation::Comment->new( -tagname => 'comment', -text => $get_tl->('comment') ) ); } # project if ( $get_tl->('project') ) { $ann->add_Annotation('project', Bio::Annotation::SimpleValue->new( -value => $get_tl->('project') ) ); } # contig if ($get_tl->('contig')) { $ann->add_Annotation('contig', Bio::Annotation::SimpleValue->new( -value => $get_tl->('contig') ) ); } # dblink if ($get_tl->('source-db')) { _read_db_source($ann, $get); } $self->builder->add_slot_value(-annotation => $ann); } # features my $feats; if ($self->builder->want_slot('features')) { $feats = _read_features($stem,$som,$self->locfac,$get); $self->builder->add_slot_value( -features => $feats ); } # organism data if ( $self->builder->want_slot('species') && $get_tl->('source') ) { my $sp = _read_species($get); if ($sp && !$sp->ncbi_taxid) { my ($src) = grep { $_->primary_tag eq 'source' } @$feats; if ($src) { foreach my $val ($src->get_tag_values('db_xref')) { $sp->ncbi_taxid(substr($val,6)) if index($val,"taxon:") == 0; } } } $self->builder->add_slot_value( -species => $sp ); } } else { $self->throw("FetchAdaptor::seq : unrecognized result elt type '$t', can't parse"); } ($self->{_idx})++; return $self->builder->make_object; } # mostly ripped from Bio::SeqIO::genbank... sub _read_species { my ($get) = @_; my @unkn_names = ('other', 'unknown organism', 'not specified', 'not shown', 'Unspecified', 'Unknown', 'None', 'unclassified', 'unidentified organism', 'not supplied'); # dictionary of synonyms for taxid 32644 my @unkn_genus = ('unknown','unclassified','uncultured','unidentified'); # all above can be part of valid species name my( $sub_species, $species, $genus, $sci_name, $common, $abbr_name, $organelle); $sci_name = $get->('organism') || return; # parse out organelle, common name, abbreviated name if present; # this should catch everything, but falls back to # entire GBSeq_taxonomy element just in case if ($get->('source') =~ m{^ (mitochondrion|chloroplast|plastid)? \s*(.*?) \s*(?: \( (.*?) \) )?\.? $}xms ) { ($organelle, $abbr_name, $common) = ($1, $2, $3); # optional } else { $abbr_name = $get->('source'); # nothing caught; this is a backup! } # Convert data in classification lines into classification array. my @class = split(/; /, $get->('taxonomy')); # do we have a genus? my $possible_genus = quotemeta($class[-1]) . ($class[-2] ? "|" . quotemeta($class[-2]) : ''); if ($sci_name =~ /^($possible_genus)/) { $genus = $1; ($species) = $sci_name =~ /^$genus\s+(.+)/; } else { $species = $sci_name; } # is this organism of rank species or is it lower? # (we don't catch everything lower than species, but it doesn't matter - # this is just so we abide by previous behaviour whilst not calling a # species a subspecies) if ($species && $species =~ /subsp\.|var\./) { ($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.|var\.).+)/; } # Don't make a species object if it's empty or "Unknown" or "None" # return unless $genus and $genus !~ /^(Unknown|None)$/oi; # Don't make a species object if it belongs to taxid 32644 my $src = $get->('source'); return unless ($species || $genus) and !grep { $_ eq $src } @unkn_names; # Bio::Species array needs array in Species -> Kingdom direction push(@class, $sci_name); @class = reverse @class; my $make = Bio::Species->new(); $make->scientific_name($sci_name); $make->classification(@class) if @class > 0; $make->common_name( $common ) if $common; $make->name('abbreviated', $abbr_name) if $abbr_name; $make->organelle($organelle) if $organelle; return $make; } sub next_seq { shift->next_obj } sub _read_references { my ($stem, $som) = @_; my @ret; for ( my $i = 1; $som->valueof($stem."/GBSeq_references/[$i]"); $i++ ) { my $get = sub { $som->valueof($stem."/GBSeq_references/[$i]/GBReference_".shift ) }; my %params; $params{'-title'} = $get->('title'); $params{'-pubmed'} = $get->('pubmed'); $params{'-medline'} = $get->('pubmed'); $params{'-journal'} = $get->('journal'); $params{'-comment'} = $get->('remark'); $params{'-consortium'} = $get->('consortium'); my $pos = $get->('position'); $pos and $pos =~ /^([0-9]+)[.]+([0-9]+)$/; $params{'-start'} = $1; $params{'-end'} = $2; $params{'-gb_reference'} = $get->('reference'); $params{'-authors'} = ''; foreach my $author ( $get->('authors/*') ) { $params{'-authors'} .= " $author"; } push @ret, Bio::Annotation::Reference->new( -tagname => 'reference', %params); } return @ret; } sub _read_features { my ($stem, $som, $locfac, $get_pri) = @_; my @ret; my $seqid = $get_pri->('primary-accession'); for ( my $i = 1; $get_pri->("feature-table/[$i]"); $i++ ) { my $get = sub { $som->valueof($stem."/GBSeq_feature-table/[$i]/GBFeature_".shift ) }; my $loc; my $sf = Bio::SeqFeature::Generic->direct_new(); if ($get->('location')) { # may have to parse GBIntervals instead here... $loc = $locfac->from_string( $get->('location') ); if ($seqid && !($loc->is_remote)) { $loc->seq_id($seqid); } } $sf->location($loc); $sf->seq_id($seqid); $sf->primary_tag($get->('key')); $sf->source_tag('EMBL/GenBank/SwissProt'); # fill other fields using $sf->add_tag_value... # qualifiers are name => value pairs. add as tags # to this feature if ($get->('quals')) { foreach ($get->('quals/*')) { $sf->add_tag_value( $_->{'GBQualifier_name'}, $_->{'GBQualifier_value'} ); } } if ($get->('partial5')) { $sf->add_tag_value( 'is_partial5', $get->('partial5') eq 'true' ? 1 : 0) } if ($get->('partial3')) { $sf->add_tag_value( 'is_partial3', $get->('partial3') eq 'true' ? 1 : 0) } push @ret, $sf; } return \@ret; } sub _read_db_source { my ($ann, $get) = @_; my $dbsource = $get->('source-db'); # ripped mainly from Bio::SeqIO::genbank... # deal with UniProKB dbsources if( $dbsource =~ s/(UniProt(?:KB)?|swissprot):\s+locus\s+(\S+)\,[^.]+\.\s*// ) { $ann->add_Annotation ('dblink', Bio::Annotation::DBLink->new (-primary_id => $2, -database => $1, -tagname => 'dblink')); if( $dbsource =~ s/created:\s+([^.]+)\.\s*// ) { $ann->add_Annotation ('swissprot_dates', Bio::Annotation::SimpleValue->new (-tagname => 'date_created', -value => $1)); } while( $dbsource =~ s/\s+(sequence|annotation)\s+ updated:\s+([^.]+)\.\s*//xg ) { $ann->add_Annotation ('swissprot_dates', Bio::Annotation::SimpleValue->new (-tagname => 'date_updated', -value => $2)); } $dbsource =~ s/\n/ /g; if ( $dbsource =~ s/xrefs:\s+ ((?:\S+,\s+)+\S+)\s+xrefs/xrefs/x ) { # will use $i to determine even or odd # for swissprot the accessions are paired my $i = 0; for my $dbsrc ( split(/,\s+/,$1) ) { if( $dbsrc =~ /(\S+)\.(\d+)/ || $dbsrc =~ /(\S+)/ ) { my ($id,$version) = ($1,$2); $version ='' unless defined $version; my $db; if( $id =~ /^\d\S{3}/) { $db = 'PDB'; } else { $db = ($i++ % 2 ) ? 'GenPept' : 'GenBank'; } $ann->add_Annotation ('dblink', Bio::Annotation::DBLink->new (-primary_id => $id, -version => $version, -database => $db, -tagname => 'dblink')); } } } elsif ( $dbsource =~ s/xrefs:\s+(.+)\s+xrefs/xrefs/i ) { # download screwed up and ncbi didn't put # acc in for gi numbers my $i = 0; for my $id ( split(/\,\s+/,$1) ) { my ($acc,$db); if( $id =~ /gi:\s+(\d+)/ ) { $acc= $1; $db = ($i++ % 2 ) ? 'GenPept' : 'GenBank'; } elsif( $id =~ /pdb\s+accession\s+(\S+)/ ) { $acc= $1; $db = 'PDB'; } else { $acc= $id; $db = ''; } $ann->add_Annotation ('dblink', Bio::Annotation::DBLink->new (-primary_id => $acc, -database => $db, -tagname => 'dblink')); } } else { warn "Cannot match $dbsource"; } if( $dbsource =~ s/xrefs\s+\(non\-sequence\s+databases\):\s+ ((?:\S+,\s+)+\S+)//x ) { for my $id ( split(/\,\s+/,$1) ) { my $db; # quote from Bio::SeqIO::genbank: # this is because GenBank dropped the spaces!!! # I'm sure we're not going to get this right $db = substr($id,0,index($id,':')); $id = substr($id,index($id,':')+1); $ann->add_Annotation ('dblink', Bio::Annotation::DBLink->new (-primary_id => $id, -database => $db, -tagname => 'dblink')); } } } else { if( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ ) { my ($db,$id,$version) = ($1,$2,$3); $ann->add_Annotation ('dblink', Bio::Annotation::DBLink->new (-primary_id => $id, -version => $version, -database => $db || 'GenBank', -tagname => 'dblink')); } elsif ( $dbsource =~ /(\S+)([\.:])(\d+)/ ) { my ($id, $db, $version); if ($2 eq ':') { ($db, $id) = ($1, $3); } else { ($db, $id, $version) = ('GenBank', $1, $3); } $ann->add_Annotation('dblink', Bio::Annotation::DBLink->new( -primary_id => $id, -version => $version, -database => $db, -tagname => 'dblink') ); } else { warn "Unrecognized DBSOURCE data: $dbsource"; } } return 1; } 1; __END__ here\'s an example: PROTEIN 0 HASH(0x439b8a8) 'GBSet' => HASH(0x439c010) 'GBSeq' => HASH(0x43a79c8) 'GBSeq_accession-version' => 'CAA53922.1' 'GBSeq_comment' => 'On Nov 8, 1997 this sequence version replaced gi:443947.' 'GBSeq_create-date' => '18-JAN-1994' 'GBSeq_definition' => 'sonic hedgehog [Mus musculus]' 'GBSeq_division' => 'ROD' 'GBSeq_feature-table' => HASH(0x43abf4c) 'GBFeature' => HASH(0x43b23b4) 'GBFeature_intervals' => HASH(0x43b800c) 'GBInterval' => HASH(0x43b83fc) 'GBInterval_accession' => 'CAA53922.1' 'GBInterval_from' => 1 'GBInterval_to' => 437 'GBFeature_key' => 'CDS' 'GBFeature_location' => '1..437' 'GBFeature_quals' => HASH(0x43b8378) 'GBQualifier' => HASH(0x43baeb0) 'GBQualifier_name' => 'db_xref' 'GBQualifier_value' => 'UniProtKB/Swiss-Prot:Q62226' 'GBSeq_length' => 437 'GBSeq_locus' => 'CAA53922' 'GBSeq_moltype' => 'AA' 'GBSeq_organism' => 'Mus musculus' 'GBSeq_other-seqids' => HASH(0x43ab028) 'GBSeqid' => 'gi|2597988' 'GBSeq_primary-accession' => 'CAA53922' 'GBSeq_references' => HASH(0x43abe80) 'GBReference' => HASH(0x43af1f8) 'GBReference_authors' => HASH(0x43af3e4) 'GBAuthor' => 'McMahon,A.P.' 'GBReference_journal' => 'Submitted (03-NOV-1997) A.P. McMahon, Harvard University, 16 Divinity Ave., Cambridge, MA 02138, USA' 'GBReference_position' => '1..437' 'GBReference_reference' => 3 'GBReference_title' => 'Direct Submission' 'GBSeq_sequence' => 'mllllarcflvilassllvcpglacgpgrgfgkrrhpkkltplaykqfipnvaektlgasgryegkitrnserfkeltpnynpdiifkdeentgadrlmtqrckdklnalaisvmnqwpgvklrvtegwdedghhseeslhyegravdittsdrdrskygmlarlaveagfdwvyyeskahihcsvkaensvaaksggcfpgsatvhleqggtklvkdlrpgdrvlaaddqgrllysdfltfldrdegakkvfyvietleprerllltaahllfvaphndsgptpgpsalfasrvrpgqrvyvvaerggdrrllpaavhsvtlreeeagayapltahgtilinrvlascyavieehswahrafapfrlahallaalapartdgggggsipaaqsateargaeptagihwysqllyhigtwlldsetmhplgmavkss' 'GBSeq_source' => 'Mus musculus (house mouse)' 'GBSeq_source-db' => 'embl accession X76290.1' 'GBSeq_taxonomy' => 'Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Glires; Rodentia; Sciurognathi; Muroidea; Muridae; Murinae; Mus' 'GBSeq_topology' => 'linear' 'GBSeq_update-date' => '04-NOV-1997' NUCLEOTIDE 0 HASH(0x42c1a44) 'GBSet' => HASH(0x42dd728) 'GBSeq' => HASH(0x44bc2c8) 'GBSeq_accession-version' => 'NR_029721.1' 'GBSeq_comment' => 'PROVISIONAL REFSEQ: This record is based on preliminary annotation provided by NCBI staff in collaboration with miRBase. The reference sequence was derived from AL645478.15.; ~Summary: microRNAs (miRNAs) are short (20-24 nt) non-coding RNAs that are involved in post-transcriptional regulation of gene expression in multicellular organisms by affecting both the stability and translation of mRNAs. miRNAs are transcribed by RNA polymerase II as part of capped and polyadenylated primary transcripts (pri-miRNAs) that can be either protein-coding or non-coding. The primary transcript is cleaved by the Drosha ribonuclease III enzyme to produce an approximately 70-nt stem-loop precursor miRNA (pre-miRNA), which is further cleaved by the cytoplasmic Dicer ribonuclease to generate the mature miRNA and antisense miRNA star (miRNA*) products. The mature miRNA is incorporated into a RNA-induced silencing complex (RISC), which recognizes target mRNAs through imperfect base pairing with the miRNA and most commonly results in translational inhibition or destabilization of the target mRNA. The RefSeq represents the predicted microRNA stem-loop. [provided by RefSeq]; ~Sequence Note: This record represents a predicted microRNA stem-loop as defined by miRBase. Some sequence at the 5\' and 3\' ends may not be included in the intermediate precursor miRNA produced by Drosha cleavage.' 'GBSeq_create-date' => '29-OCT-2009' 'GBSeq_definition' => 'Mus musculus microRNA 196a-1 (Mir196a-1), microRNA' 'GBSeq_division' => 'ROD' 'GBSeq_feature-table' => HASH(0x4579f0c) 'GBFeature' => HASH(0x457ab6c) 'GBFeature_intervals' => HASH(0x457fa20) 'GBInterval' => HASH(0x45813d0) 'GBInterval_accession' => 'NR_029721.1' 'GBInterval_from' => 24 'GBInterval_to' => 45 'GBFeature_key' => 'ncRNA' 'GBFeature_location' => '24..45' 'GBFeature_quals' => HASH(0x45813e8) 'GBQualifier' => HASH(0x4581a90) 'GBQualifier_name' => 'db_xref' 'GBQualifier_value' => 'MGI:2676860' 'GBSeq_length' => 102 'GBSeq_locus' => 'NR_029721' 'GBSeq_moltype' => 'ncRNA' 'GBSeq_organism' => 'Mus musculus' 'GBSeq_other-seqids' => HASH(0x456bea8) 'GBSeqid' => 'gi|262205520' 'GBSeq_primary' => 'REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP~1-102 AL645478.15 79764-79865 ' 'GBSeq_primary-accession' => 'NR_029721' 'GBSeq_references' => HASH(0x45744ac) 'GBReference' => HASH(0x457ac20) 'GBReference_authors' => HASH(0x457f36c) 'GBAuthor' => 'Tuschl,T.' 'GBReference_journal' => 'RNA 9 (2), 175-179 (2003)' 'GBReference_position' => '1..102' 'GBReference_pubmed' => 12554859 'GBReference_reference' => 9 'GBReference_title' => 'New microRNAs from mouse and human' 'GBSeq_sequence' => 'tgagccgggactgttgagtgaagtaggtagtttcatgttgttgggcctggctttctgaacacaacgacatcaaaccacctgattcatggcagttactgcttc' 'GBSeq_source' => 'Mus musculus (house mouse)' 'GBSeq_strandedness' => 'single' 'GBSeq_taxonomy' => 'Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Glires; Rodentia; Sciurognathi; Muroidea; Muridae; Murinae; Mus' 'GBSeq_topology' => 'linear' 'GBSeq_update-date' => '06-JAN-2010' bioperl-run-release-1-7-3/lib/Bio/DB/SoapEUtilities/FetchAdaptor/species.pm000077500000000000000000000141501342734133000264040ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::DB::SoapEUtilities::FetchAdaptor::species # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::SoapEUtilities::FetchAdaptor::species - Fetch adaptor for 'taxonomy' efetch SOAP messages =head1 SYNOPSIS Imported by L as required. =head1 DESCRIPTION Returns an iterator over L objects: my $fac = Bio::DB::SoapEUtilities->new; my $taxio = $fac->efetch(-db => 'taxonomy', -id => 1394)->run(-auto_adapt=>1); my $sp = $taxio->next_species; $sp->binomial; # returns 'Bacillus caldolyticus' To find out the object type returned: $class = $seqio->obj_class; # $class is 'Bio::Species' as for all L objects. =head1 SEE ALSO L, L =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::DB::SoapEUtilities::FetchAdaptor::species; use strict; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use Bio::Species; use base qw(Bio::DB::SoapEUtilities::FetchAdaptor Bio::Root::Root ); sub _initialize { my ($self, @args) = @_; $self->SUPER::_initialize(@args); # my ($builder, $seqfac ) = $self->_rearrange( [qw(SEQBUILDER # SEQFACTORY)], @args ); $self->{'_obj_class'} = 'Bio::Species' ; $self->{'_idx'} = 1; 1; } sub rewind { shift->{'_idx'} = 1 } sub obj_class { shift->{'_obj_class'} } sub next_species { shift->next_obj } sub next_obj { my $self = shift; my $stem = "//TaxaSet/[".$self->{'_idx'}."]"; # my $stem = "//Body/".$self->result->result_type."/[".$self->{'_idx'}."]"; my $som = $self->result->som; return unless defined $som->valueof($stem); my $get = sub { $som->valueof("$stem/".shift) }; my $toplev = $som->valueof("$stem"); my $get_tl = sub { $toplev->{ shift @_ } }; my $sp = _read_species($get_tl); $self->warn("FetchAdaptor::species - parse error, no Bio::Species returned") unless $sp; ($self->{_idx})++; return $sp; } 1; # mostly ripped from Bio::SeqIO::genbank... sub _read_species { my ($get) = @_; my @unkn_names = ('other', 'unknown organism', 'not specified', 'not shown', 'Unspecified', 'Unknown', 'None', 'unclassified', 'unidentified organism', 'not supplied'); # dictionary of synonyms for taxid 32644 my @unkn_genus = ('unknown','unclassified','uncultured','unidentified'); # all above can be part of valid species name my( $sub_species, $species, $genus, $sci_name, $common, $abbr_name, $organelle); $sci_name = $get->('ScientificName') || return; # no "source" elt like gb format./maj # parse out organelle, common name, abbreviated name if present; # this should catch everything, but falls back to # entire GBSeq_taxonomy element just in case # if ($get->('source') =~ m{^ # (mitochondrion|chloroplast|plastid)? # \s*(.*?) # \s*(?: \( (.*?) \) )?\.? # $}xms ) { # ($organelle, $abbr_name, $common) = ($1, $2, $3); # optional # } else { # $abbr_name = $get->('source'); # nothing caught; this is a backup! # } # # Convert data in classification lines into classification array. my @class = split(/; /, $get->('Lineage')); # do we have a genus? my $possible_genus = quotemeta($class[-1]) . ($class[-2] ? "|" . quotemeta($class[-2]) : ''); if ($sci_name =~ /^($possible_genus)/) { $genus = $1; ($species) = $sci_name =~ /^$genus\s+(.+)/; } else { $species = $sci_name; } # is this organism of rank species or is it lower? # (we don't catch everything lower than species, but it doesn't matter - # this is just so we abide by previous behaviour whilst not calling a # species a subspecies) if ($species && $species =~ /subsp\.|var\./) { ($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.|var\.).+)/; } # Don't make a species object if it's empty or "Unknown" or "None" # return unless $genus and $genus !~ /^(Unknown|None)$/oi; # Don't make a species object if it belongs to taxid 32644 my $src = $get->('ScientificName'); return unless ($species || $genus) and !grep { $_ eq $src } @unkn_names; # Bio::Species array needs array in Species -> Kingdom direction push(@class, $sci_name); @class = reverse @class; my $make = Bio::Species->new(); $make->scientific_name($sci_name); $make->classification(@class) if @class > 0; $make->common_name( $get->('CommonName')); $make->name('abbreviated', $abbr_name) if $abbr_name; $make->organelle($organelle) if $organelle; $make->ncbi_taxid( $get->('TaxId') ); $make->division( $get->('Division') ); return $make; } 1; bioperl-run-release-1-7-3/lib/Bio/DB/SoapEUtilities/GQueryAdaptor.pm000077500000000000000000000137551342734133000251460ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::DB::SoapEUtilities::GQueryAdaptor # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::SoapEUtilities::GQueryAdaptor - Handle for Entrez SOAP GlobalQuery items =head1 SYNOPSIS my $fac = Bio::DB::SoapEUtilities->new(); # run a query, returning a GQueryAdaptor my $queries = $fac->egquery( -term => 'BRCA and human' )->run(-auto_adapt=>1); # all databases with hits my @dbs = $queries->found_in_dbs; # queries by database my $prot_count = $queries->query_by_db('prot')->count; # iterate over gquery while ( my $q = $queries->next_query ) { my $db = $q->db; my $count = $q->count; my $status = $q->status; } =head1 DESCRIPTION This adaptor provides an iterator (C) and other convenience functions for parsing NCBI Entrez EUtility C SOAP results. =head1 SEE ALSO L =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::DB::SoapEUtilities::GQueryAdaptor; use strict; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use base qw(Bio::Root::Root ); =head2 new Title : new Usage : my $obj = new Bio::DB::SoapEUtilities::GQueryAdaptor(); Function: Builds a new Bio::DB::SoapEUtilities::GQueryAdaptor object Returns : an instance of Bio::DB::SoapEUtilities::GQueryAdaptor Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($result) = $self->_rearrange([qw(RESULT)], @args); $self->throw("GQueryAdaptor requires a SoapEUtilities::Result argument") unless $result; $self->throw("GQueryAdaptor only works with egquery results") unless $result->util eq 'egquery'; $self->{'_result'} = $result; $self->{'_query_by_db'} = {}; $self->{'_idx'} = 1; return $self; } sub result { shift->{'_result'} } =head2 next_query() Title : next_query Usage : Function: return the next global query from the attached Result Returns : Args : =cut sub next_query { my $self = shift; # my $stem = "//Body/".$self->result->result_type."/[".$self->{'_idx'}."]"; # not consistent, kludge as follows: my $stem = "//eGQueryResult/[".$self->{'_idx'}."]"; my $som = $self->result->som; return unless $som->valueof($stem); my ($ret, %params); my $get = sub { $som->valueof("$stem/".shift) }; my $toplev = $get->(''); my $get_tl = sub { $toplev->{ shift @_ } }; $params{'-term'} = $som->valueof("//Term"); my $names = []; $params{'-count'} = $get_tl->('Count'); $params{'-db'} = $get_tl->('DbName'); $params{'-status'} = $get_tl->('Status'); my $class = ref($self)."::gquery"; $ret = $class->new(%params); $self->{_query_by_db}->{$params{'-db'}} = $ret; ($self->{'_idx'})++; return $ret; } sub next_obj { shift->next_query(@_) } sub rewind { shift->{'_idx'} = 1; }; =head2 found_in_dbs() Title : found_in_dbs Usage : Function: Return list of db names containing hits for the query term Returns : array of scalar strings Args : none =cut sub found_in_dbs { my $self = shift; return @{$self->{'_found_in_dbs'}} if $self->{'_found_in_dbs'}; my $som = $self->result->som; $self->{'_found_in_dbs'} = []; foreach ($som->valueof("//eGQueryResult/*")) { push @{$self->{'_found_in_dbs'}}, $_->{'DbName'} if $_->{'Count'}; } return @{$self->{'_found_in_dbs'}}; } =head2 query_by_db() Title : query_by_db Usage : Function: get gquery object by db name Returns : Args : db name (scalar string) =cut sub query_by_db { my $self = shift; my $db = shift; $self->throw("db must be specified") unless $db; return $self->{_query_by_db}->{$db} if $self->{_query_by_db}->{$db}; my $som = $self->result->som; my $i; for ($i = 1; my $val = $som->valueof("//eGQueryResult/[$i]/DbName"); $i++) { last if $val eq $db; } my $curidx = $self->{_idx}; my $query; { local $self->{_idx} = $i; $query = $self->next_query; } return $query; } 1; #### package Bio::DB::SoapEUtilities::GQueryAdaptor::gquery; use strict; use warnings; use base qw(Bio::Root::Root); sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my %args = @args; $self->_set_from_args( \%args, -methods => [map { /^-?(.*)/ } keys %args], -create => 1, -code => 'my $self = shift; my $d = shift; my $k = \'_\'.$method; $self->{$k} = $d if $d; return (ref $self->{$k} eq \'ARRAY\') ? @{$self->{$k}} : $self->{$k};' ); return $self; } 1; bioperl-run-release-1-7-3/lib/Bio/DB/SoapEUtilities/LinkAdaptor.pm000077500000000000000000000134711342734133000246220ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::DB::SoapEUtilities::LinkAdaptor # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::SoapEUtilities::LinkAdaptor - Handle for Entrez SOAP LinkSets =head1 SYNOPSIS my $fac = Bio::DB::SoapEUtilities->new(); # run a query, returning a LinkAdaptor $fac->elink( -db => 'nucleotide', -dbfrom => 'protein', -id => [qw(828392 790 470338)]); my $links = $fac->elink->run( -auto_adapt => 1); # get the linked ids corresponding to the submitted ids # (may be arrays if multiple crossrefs, or undef if none) my @nucids = $links->id_map(828392); # iterate over linksets while ( my $ls = $links->next_linkset ) { my @from_ids = $ls->submitted_ids; my @to_ids = $ls->ids; my $from_db = $ls->db_from; my $to_db = $ls->db_to; } =head1 DESCRIPTION This adaptor provides an iterator (C) and other convenience functions for parsing NCBI Entrez EUtility C SOAP results. =head1 SEE ALSO L =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::DB::SoapEUtilities::LinkAdaptor; use strict; use warnings; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use base qw(Bio::Root::Root ); =head2 new Title : new Usage : my $obj = new Bio::DB::SoapEUtilities::LinkAdaptor(); Function: Builds a new Bio::DB::SoapEUtilities::LinkAdaptor object Returns : an instance of Bio::DB::SoapEUtilities::LinkAdaptor Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($result) = $self->_rearrange([qw(RESULT)], @args); $self->throw("LinkAdaptor requires a SoapEUtilities::Result argument") unless $result; $self->throw("LinkAdaptor only works with elink results") unless $result->util eq 'elink'; $self->{'_result'} = $result; $self->{'_idx'} = 1; return $self; } sub result { shift->{'_result'} } =head2 next_linkset() Title : next_linkset Usage : Function: return the next LinkSet from the attached Result Returns : Args : =cut sub next_linkset { my $self = shift; my $stem = "//Body/".$self->result->result_type."/[".$self->{'_idx'}."]"; return unless $self->result->som and $self->result->som->valueof($stem); my $som = $self->result->som; my ($ret, %params); my $get = sub { $som->valueof("$stem/".shift) }; $params{'-db_from'} = $get->('DbFrom'); $params{'-db_to'} = $get->('LinkSetDb/DbTo'); $params{'-link_name'} = $get->('LinkSetDb/LinkName'); $params{'-submitted_ids'} = [$get->('IdList/*')]; $params{'-ids'} = [$get->('LinkSetDb/Link/*')]; $params{'-webenv'} = $get->('WebEnv'); my $class = ref($self)."::linkset"; $ret = $class->new(%params); ($self->{'_idx'})++; return $ret; } sub next_obj { shift->next_linkset(@_) } sub rewind { shift->{'_idx'} = 1; }; =head2 id_map() Title : id_map Usage : $to_id = $adaptor->id_map($from_id) Function: Return 'to-database' ids corresponding to given specified 'from-database' or submitted ids Returns : array of scalars (to-database ids or arrayrefs of ids) Args : array of scalars (from-database ids) =cut sub id_map { my $self = shift; my @from_ids = @_; my $som = $self->result->som; my $stem = "//Body/".$self->result->result_type."/"; if (!defined $self->{'_id_map'}) { my $h = {}; for (my $i=1; $som->valueof($stem."[$i]"); $i++) { # note this assumes that in the elink query, # ids were provided individually (not as a comma-sep # list). This is the standard behavior for elink # in SoapEUtilities. my @to_ids = $som->valueof($stem."[$i]/LinkSetDb/Link/*"); $$h{$som->valueof($stem."[$i]/IdList/[1]")} = (@to_ids == 1 ? $to_ids[0] : \@to_ids); } $self->{'_id_map'} = $h; } return @{$self->{'_id_map'}}{@from_ids}; } package Bio::DB::SoapEUtilities::LinkAdaptor::linkset; use strict; use warnings; use base qw(Bio::Root::Root); sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my %args = @args; $self->_set_from_args( \%args, -methods => [map { /^-?(.*)/ } keys %args], -create => 1, -code => 'my $self = shift; my $d = shift; my $k = \'_\'.$method; $self->{$k} = $d if $d; return (ref $self->{$k} eq \'ARRAY\') ? @{$self->{$k}} : $self->{$k};' ); return $self; } 1; bioperl-run-release-1-7-3/lib/Bio/DB/SoapEUtilities/Result.pm000077500000000000000000000302661342734133000236710ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::DB::SoapEUtilities::Result # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::SoapEUtilities::Result - Accessor object for SoapEUtilities results =head1 SYNOPSIS $fac = Bio::DB::SoapEUtilities->new(); $result = $fac->esearch( -db => 'gene', -term => 'hedgehog')->run; $count = $result->count; # case important; $result->Count could be arrayref @ids = $result->ids; =head1 DESCRIPTION This module attempts to make Entrez Utilities SOAP responses as user-friendly and intuitive as possible. These responses can be complex structures with much useful data; but users will generally desire the values of some key fields. The L object provides access to all response values via systematically named accessor methods, and commonly used values as convenience methods. The 'raw' SOAP message (a L object as returned by L) is also provided. =over =item Convenience accessors If a list of record ids is returned by the call, C will return these as an array reference: @seq_ids = $result->ids; The total count of returned records is provided by C: $num_recs = $result->count; If C was specified in the SOAP call, the NCBI-assigned web environment (that can be used in future calls) is available in C, and the query key assigned to the result in C: $next_result = $fac->efetch( -WebEnv => $result->webenv, -QueryKey => $result->query_key ); =item Walking the response This module uses C to provide accessor methods for all response data. Here is an example of a SOAP response as returned by a C call off the L object: DB<5> x $result->som->method 0 HASH(0x2eac9a4) 'Count' => 148 'IdList' => HASH(0x4139578) 'Id' => 100136227 'QueryKey' => 1 'QueryTranslation' => 'sonic[All Fields] AND hedgehog[All Fields]' 'RetMax' => 20 'RetStart' => 0 'TranslationSet' => '' 'TranslationStack' => HASH(0x4237b4c) 'OP' => 'GROUP' 'TermSet' => HASH(0x42c43bc) 'Count' => 2157 'Explode' => 'Y' 'Field' => 'All Fields' 'Term' => 'hedgehog[All Fields]' 'WebEnv' => 'NCID_1_150423569_130.14.22.101_9001_1262703782' Some of the data values here (at the tips of the data structure) are actually arrays of values ( e.g., the tip C Id> ), other tips are simple scalars. With this in mind, C accessor methods work as follows: Data values (at the tips of the response structure) are acquired by calling a method with the structure keys separated by underscores (if necessary): $query_key = $result->QueryKey; # $query_key == 1 $ids = $result->IdList_Id; # @$ids is an array of record ids Data I below a particular node in the response structure can also be obtained with similarly constructed method names. These 'internal node accessors' return a hashref, containing all data leaves below the node, keyed by the accessor names: $data_hash = $result->TranslationStack DB<3> x $data_hash 0 HASH(0x43569d4) 'TranslationStack_OP' => ARRAY(0x42d9988) 0 'AND' 1 'GROUP' 'TranslationStack_TermSet_Count' => ARRAY(0x4369c64) 0 148 1 148 2 2157 'TranslationStack_TermSet_Explode' => ARRAY(0x4368998) 0 'Y' 1 'Y' 'TranslationStack_TermSet_Field' => ARRAY(0x4368260) 0 'All Fields' 1 'All Fields' 'TranslationStack_TermSet_Term' => ARRAY(0x436c97c) 0 'sonic[All Fields]' 1 'hedgehog[All Fields]' Similarly, the call C< $result->TranslationStack_TermSet > would return a similar hash containing the last 4 elements of the example hash above. Creating accessors is somewhat costly, especially for fetch responses which can be deep and complex (not unlike BioPerl developers). Portions of the response tree can be ignored by setting C<-prune_at_node> to a arrayref of nodes to skip. Nodes should be specified in L format, e.g. ...::Result->new( -prune_at_nodes => ['//GBSeq_references'] ); Accessor creation can be skipped altogether by passing C<-no_parse => 1> to the C constructor. This is recommended if a result is being passed to a L. The original SOAP message with all data is always available in C<$result->som>. =back =over Other methods =item accessors() An array of available data accessor names. This contains only the data "tips". The internal node accessors are autoloaded. =item ok() True if no SOAP fault. =item errstr() Returns the SOAP fault error string. =item som() The original C message. =item util() The EUtility associated with the result. =back =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::DB::SoapEUtilities::Result; use strict; use warnings; use Bio::Root::Root; use base qw(Bio::Root::Root ); our $AUTOLOAD; our %ID_LIST_ELT = ( esearch => 'IdList_Id', esummary => 'DocSum_Id', elink => 'LinkSet_IdList_Id' ); # an object of accessors sub new { my $class = shift; my @args = @_; my $self = $class->SUPER::new(@args); my $eutil_obj = shift @args; my ($alias_hash, $prune_at_nodes, $no_parse, $make_index) = $self->_rearrange( [qw( ALIAS_HASH PRUNE_AT_NODES NO_PARSE INDEX_ACCESSORS ) ], @args); $self->throw("Result constructor requires Bio::DB::SoapEUtilities ". "argument") unless ($eutil_obj and ref($eutil_obj) eq 'Bio::DB::SoapEUtilities'); $alias_hash ||= {}; $$alias_hash{ 'ids' } = ($ID_LIST_ELT{$eutil_obj->_caller_util} || 'IdList_Id'); if ($prune_at_nodes) { $prune_at_nodes = [$prune_at_nodes] unless ref $prune_at_nodes; } $self->{'_util'} = $eutil_obj->_caller_util; my $som = $self->{'_som'} = $eutil_obj->last_result; return unless ( $som and ref($som) eq 'SOAP::SOM' ); return $self unless $self->ok; # SOAP fault $self->{'_result_type'} = $eutil_obj->_soap_facs($self->util)->_result_elt_name; $self->{'_accessors'} = []; $self->{'_WebEnv'} = $som->valueof("//WebEnv"); $self->{'_QueryKey'} = $som->valueof("//QueryKey"); $self->{'_fetch_type'} = $eutil_obj->_soap_facs($self->util)->_wsdl->db; $self->{'_fetch_db'} = ($self->util eq 'efetch' ? $eutil_obj->_soap_facs($self->util)->db : undef); return ($no_parse ? $self : $self->parse_methods($alias_hash, $prune_at_nodes)); } =head2 parse_methods() Title : parse_methods Usage : Function: parse out the accessor methods Returns : self (Result object) Args : $alias_hash (hashref), $prune_at_nodes (scalar or arrayref) =cut sub parse_methods { my $self = shift; # parse message into accessors my ($alias_hash, $prune_at_nodes) = @_; my @methods = keys %{$self->som->method}; my %methods; foreach my $m (@methods) { _traverse_methods($m, '/', '', $self->som, \%methods, $self->{'_accessors'}, $prune_at_nodes); } # convenience aliases... if ($alias_hash && ref($alias_hash) eq 'HASH') { for (keys %$alias_hash) { if ($methods{ $$alias_hash{$_} }) { # avoid undef'd accessors $methods{$_} = $methods{ $$alias_hash{$_} }; push @{$self->{_accessors}}, $_; } } } # specials... if ($methods{Count}) { push @{$self->{'_accessors'}}, 'count'; for (ref $methods{Count}) { /^$/ && do { $methods{count} = $methods{Count}; last; }; /ARRAY/ && do { $methods{count} = $methods{Count}->[0]; last; }; } } else { #work harder my @toplev = keys %{$self->som->method}; my ($set) = grep /^.*?S(et|um)$/, @toplev; if ($set) { $methods{count} = 0; # kludge out NCBI inconsistencies my $stem = ($set =~ /(?:DocSum|LinkSet)/ ? "//Body/".$self->result_type."/*" : "//$set/*"); foreach ($self->som->valueof($stem)) { $methods{count}++; } } push @{$self->{'_accessors'}}, 'count'; } $self->_set_from_args( \%methods, -methods => $self->{'_accessors'}, -case_sensitive => 1, -create => 1 ); return $self; } =head2 util() Title : util Usage : Function: Name of the utility producing this result object. Returns : scalar string Args : =cut sub util { shift->{'_util'} } =head2 som() Title : som Usage : Function: get the original SOAP::SOM object Returns : a SOAP::SOM object Args : none =cut sub som { shift->{'_som'} } =head2 ok() Title : ok Usage : Function: Returns : true if no SOAP fault Args : =cut sub ok { !(shift->som->fault) } =head2 errstr() Title : errstr Usage : Function: Returns : fault string of SOAP object Args : none =cut sub errstr { shift->som->faultstring } =head2 accessors() Title : accessors Usage : Function: get the list of created accessors for this result Returns : array of scalar strings Args : none Note : does not include valid AUTOLOADed accessors; see DESCRIPTION =cut sub accessors { my $a = shift->{'_accessors'} ; @$a if $a } =head2 webenv() Title : webenv Usage : Function: contains WebEnv key referencing this result's session Returns : scalar Args : none =cut sub webenv { shift->{'_WebEnv'} } =head2 query_key()() Title : query_key() Usage : Function: contains the web query key assigned to this result Returns : scalar Args : =cut sub query_key { shift->{'_QueryKey'} } =head2 fetch_type() Title : fetch_type Usage : Function: Get the efetch database name according to WSDL Returns : scalar string (db name) or undef if N/A Args : none =cut sub fetch_type { shift->{'_fetch_type'} } sub fetch_db { shift->{'_fetch_db'} } sub result_type { shift->{'_result_type'} } sub _traverse_methods { my ($m, $skey, $key, $som, $hash, $acc, $prune) = @_; if ($prune) { foreach (@$prune) { return if "$skey\/$m" =~ /^$_/; } } my $val = $som->valueof("$skey\/$m"); for (ref $val) { /^$/ && do { my @a = $som->valueof("$skey\/$m"); my $M = $m; # camelcase it $M =~ s/([-_])([a-zA-Z0-9])/\u$2/g; my $k = ($key ? "$key\_" : "").$M; push @{$acc}, $k; if (@a == 1) { $$hash{$k} = $a[0]; } else { $$hash{$k} = \@a; } return; }; /HASH/ && do { foreach my $k (keys %$val) { my $M = $m; # camelcase it $M =~ s/([-_])([a-zA-Z0-9])/\u$2/g; _traverse_methods( $k, "$skey\/$m", ($key ? "$key\_" : "").$M, $som, $hash, $acc, $prune ); } return; }; do { #else, huh? Bio::Root::Root->throw("SOAP::SOM parse error : please contact the mailing list"); }; } } sub AUTOLOAD { my $self = shift; my $accessor = $AUTOLOAD; $accessor =~ s/.*:://; my @list = grep /^${accessor}_/, @{$self->{'_accessors'}}; unless (@list) { $self->debug("Accessor '$accessor' not present in this result"); return; } my %ret; foreach (@list) { $ret{$_} = $self->$_; } return \%ret; } 1; bioperl-run-release-1-7-3/lib/Bio/Factory/000077500000000000000000000000001342734133000202625ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/Factory/EMBOSS.pm000066400000000000000000000163071342734133000216170ustar00rootroot00000000000000# BioPerl module for Bio::Factory::EMBOSS # # Please direct questions and support issues to # # Cared for by Heikki Lehvaslaiho # # Copyright Heikki Lehvaslaiho # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Factory::EMBOSS - EMBOSS application factory class =head1 SYNOPSIS # Get an EMBOSS factory use Bio::Factory::EMBOSS; $f = Bio::Factory::EMBOSS -> new(); # Get an EMBOSS application object from the factory $water = $f->program('water') || die "Program not found!\n"; # Here is an example of running the application - water can # compare 1 sequence against 1 or more sequences using Smith-Waterman. # Pass a Sequence object and a reference to an array of objects. my $wateroutfile = 'out.water'; $water->run({-asequence => $seq_object, -bsequence => \@seq_objects, -gapopen => '10.0', -gapextend => '0.5', -outfile => $wateroutfile}); # Now you might want to get the alignment use Bio::AlignIO; my $alnin = Bio::AlignIO->new(-format => 'emboss', -file => $wateroutfile); while ( my $aln = $alnin->next_aln ) { # process the alignment -- these will be Bio::SimpleAlign objects } =head1 DESCRIPTION The EMBOSS factory class encapsulates access to EMBOSS programs. A factory object allows creation of only known applications. If you want to check command line options before sending them to the program set $prog-Everbose to positive integer. The value is passed on to programs objects and the ADC description of the available command line options is parsed and compared to input. See also L and L. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing lists Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Heikki Lehvaslaiho Email heikki-at-bioperl-dot-org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Factory::EMBOSS; use vars qw(@ISA $EMBOSSVERSION); use strict; use Bio::Root::Root; use Bio::Tools::Run::EMBOSSApplication; use Bio::Factory::ApplicationFactoryI; @ISA = qw(Bio::Root::Root Bio::Factory::ApplicationFactoryI ); $EMBOSSVERSION = "2.0.0"; sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); # set up defaults my($location) = $self->_rearrange([qw(LOCATION )], @args); $self->{ '_programs' } = {}; $self->{ '_programgroup' } = {}; $self->{ '_groups' } = {}; $self->location($location) if $location; $self->_program_list; # retrieve info about available programs return $self; } =head2 location Title : location Usage : $embossfactory->location Function: get/set the location of EMBOSS programs. Valid values are 'local' and 'novella'. Returns : string, defaults to 'local' Args : string =cut sub location { my ($self, $value) = @_; my %location = ('local' => '1', 'novella' => '1' ); if (defined $value) { $value = lc $value; if ($location{$value}) { $self->{'_location'} = $value; } else { $self->warn("Value [$value] not a valid value for ". "location(). Defaulting to [local]"); $self->{'_location'} = 'local'; } } $self->{'_location'} ||= 'local'; return $self->{'_location'}; } =head2 program Title : program Usage : $embossfactory->program('program_name') Function: Creates a representation of a single EMBOSS program and issues a warning if the program was not found. Returns : Bio::Tools::Run::EMBOSSApplication object or undef Args : string, program name =cut sub program { my ($self, $value) = @_; unless( $self->{'_programs'}->{$value} ) { $self->warn("Application [$value] is not available!"); return undef; } my $attr = {}; $attr->{name} = $value; $attr->{verbose} = $self->verbose; my $appl = Bio::Tools::Run::EMBOSSApplication->new($attr); return $appl; } =head2 version Title : $self->version Usage : $embossfactory->version() Function: gets the version of EMBOSS programs Throws : if EMBOSS suite is not accessible Returns : version value Args : None =cut sub version { my ($self) = @_; my $version = `embossversion -auto`; $self->throw("EMBOSS suite of programs is not available") if $?; chop $version; # compare versions $self->throw("EMBOSS has to be at least version $EMBOSSVERSION got $version\n") if $version lt $EMBOSSVERSION; return $version; } =head2 Programs These methods allow the programmer to query the EMBOSS suite and find out which program names can be used and what arguments can be used. =head2 program_info Title : program_info Usage : $embossfactory->program_info('emma') Function: Finds out if the program is available. Returns : definition string of the program, undef if program name not known Args : string, prgramname =cut sub program_info { my ($self, $value) = @_; return $self->{'_programs'}->{$value}; } =head2 Internal methods Do not call these methods directly =head2 _program_list Title : _program_list Usage : $embossfactory->_program_list() Function: Finds out what programs are available. Writes the names into an internal hash. Returns : true if successful Args : None =cut sub _program_list { my ($self) = @_; if( $^O =~ /Mac/i ) { return; } { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; local * SAVERR; open SAVERR, ">&STDERR"; open STDERR, ">$null"; open(WOSSOUT, "wossname -auto |") || return; open STDERR, ">&SAVERR"; } local $/ = "\n\n"; while( ) { my ($groupname) = (/^([A-Z][A-Z0-9 ]+)$/m); #print $groupname, "\n" if $groupname; $self->{'_groups'}->{$groupname} = [] if $groupname; while ( /^([a-z]\w+) +(.+)$/mg ) { #print "$1\t$2 \n" if $1; $self->{'_programs'}->{$1} = $2 if $1; $self->{'_programgroup'}->{$1} = $groupname if $1; push @{$self->{'_groups'}->{$groupname}}, $1 if $1; } } close(WOSSOUT); } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/000077500000000000000000000000001342734133000177535ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/Tools/Run/000077500000000000000000000000001342734133000205175ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Alignment/000077500000000000000000000000001342734133000224355ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Alignment/Amap.pm000066400000000000000000000344411342734133000236570ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Alignment::Amap # # Please direct questions and support issues to # # Cared for by Albert Vilella # # # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Amap - Object for the calculation of an iterative multiple sequence alignment from a set of unaligned sequences or alignments using the Amap (2.0) program =head1 SYNOPSIS # Build a muscle alignment factory $factory = Bio::Tools::Run::Alignment::Amap->new(@params); # Pass the factory a list of sequences to be aligned. $inputfilename = 't/cysprot.fa'; # $aln is a SimpleAlign object. $aln = $factory->align($inputfilename); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); # Or one can pass the factory a pair of (sub)alignments #to be aligned against each other, e.g.: #There are various additional options and input formats available. #See the DESCRIPTION section that follows for additional details. #To run amap with training, try something like: #First round to generate train.params $factory = Bio::Tools::Run::Alignment::Amap->new ( 'iterative-refinement' => '1000', 'consistency' => '5', 'pre-training' => '20', 'emissions' => '', 'verbose' => '', 'train' => "$dir/$subdir/$outdir/train.params", ); $factory->outfile_name("$dir/$subdir/$outdir/train.params"); #Second round to use train.params to get a high qual alignment $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); $aln = ''; $factory = ''; $factory = Bio::Tools::Run::Alignment::Amap->new ( 'iterative-refinement' => '1000', 'consistency' => '5', 'pre-training' => '20', 'verbose' => '', 'paramfile' => "$dir/$subdir/$outdir/train.params", ); $factory->outfile_name("$dir/$subdir/$outdir/outfile.afa"); $aln = $factory->align($seq_array_ref); =head1 DESCRIPTION Amap uses a Sequence Annealing algorithm which is an incremental method for building multiple alignments. You can get it and see information about it at this URL http://bio.math.berkeley.edu/amap/ =head2 Helping the module find your executable FIXME: Amap uses the same parameters as Probcons, plus a few others. I haven't had time to check all the changes from the Probcons.pm runnable. Feel free to do it. You will need to enable Amap to find the amap program. This can be done in (at least) three ways: 1. Make sure the amap executable is in your path (i.e. 'which amap' returns a valid program 2. define an environmental variable AMAPDIR which points to a directory containing the 'amap' app: In bash export AMAPDIR=/home/progs/amap or In csh/tcsh setenv AMAPDIR /home/progs/amap 3. include a definition of an environmental variable AMAPDIR in every script that will BEGIN {$ENV{AMAPDIR} = '/home/progs/amap'; } use Bio::Tools::Run::Alignment::Amap; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email foo@bar.com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Amap; use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM %DEFAULTS @AMAP_PARAMS @AMAP_SWITCHES @OTHER_SWITCHES %OK_FIELD ); use strict; use Bio::Seq; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); BEGIN { %DEFAULTS = ( 'AFORMAT' => 'fasta' ); @AMAP_PARAMS = qw (CONSISTENCY ITERATIVE-REFINEMENT PRE-TRAINING ANNOT TRAIN PARAMFILE MATRIXFILE CLUSTALW PAIRS VITERBI VERBOSE EMISSIONS EDGE-WEIGHT-THRESHOLD GAP-FACTOR); #FIXME: Last line are switches, dunno how to set them, #gave as params @AMAP_SWITCHES = qw(); @OTHER_SWITCHES = qw(PROGRESSIVE NOREORDER ALIGNMENT-ORDER MAXSTEP PRINT-POSTERIORS); # Authorize attribute fields foreach my $attr ( @AMAP_PARAMS, @OTHER_SWITCHES ) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'amap'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{AMAPDIR}) if $ENV{AMAPDIR}; } =head2 new Title : new Usage : my $amap = Bio::Tools::Run::Alignment::Amap->new(); Function: Constructor Returns : Bio::Tools::Run::Alignment::Amap Args : -outfile_name => $outname =cut sub new{ my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($on) = $self->SUPER::_rearrange([qw(OUTFILE_NAME)], @args); $self->outfile_name($on) if $on; my ($attr, $value); $self->aformat($DEFAULTS{'AFORMAT'}); while ( @args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/); # don't want named parameters $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; # aliasing $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe 2>&1` ; #AMAP version 1.09 - align multiple protein sequences and print to standard output $string =~ /AMAP\s+version.+(\d+\.\d+)/m; return $1 || undef; } =head2 run Title : run Usage : my $output = $application->run(\@seqs); Function: Generic run of an application Returns : Bio::SimpleAlign object Args : Arrayref of Bio::PrimarySeqI objects or a filename to run on =cut sub run { my $self = shift; return $self->align(shift); } =head2 align Title : align Usage : $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); or $seq_array_ref = \@seq_array; # @seq_array is array of Seq objs $aln = $factory->align($seq_array_ref); Function: Perform a multiple sequence alignment Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or a reference to an array of Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ($self,$input) = @_; # Create input file pointer $self->io->_io_cleanup(); my ($infilename) = $self->_setinput($input); if (! $infilename) { $self->throw("Bad input data or less than 2 sequences in $input !"); } my $param_string = $self->_setparams(); # run amap return &_run($self, $infilename, $param_string); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to amap program Example : Returns : nothing; amap output is written to a temporary file OR specified output file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to amap =cut sub _run { my ($self,$infilename,$params) = @_; my $commandstring = $self->executable." $infilename $params"; $self->debug( "amap command = $commandstring \n"); my $status = system($commandstring); my $outfile = $self->outfile_name(); if( !-e $outfile || -z $outfile ) { $self->warn( "Amap call crashed: $? [command $commandstring]\n"); return undef; } my $in = Bio::AlignIO->new('-file' => $outfile, '-format' => $self->aformat); my $aln = $in->next_aln(); return $aln; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for amap program Example : Returns : name of file containing amap data input AND Args : Arrayref of Seqs or input file name =cut sub _setinput { my ($self,$input) = @_; my ($infilename, $seq, $temp, $tfh); if (! ref $input) { # check that file exists or throw $infilename = $input; unless (-e $input) {return 0;} # let's peek and guess open(IN,$infilename) || $self->throw("Cannot open $infilename"); my $header; while( defined ($header = ) ) { last if $header !~ /^\s+$/; } close(IN); if ( $header !~ /^>\s*\S+/ ){ $self->throw("Need to provide a FASTA format file to amap!"); } return ($infilename); } elsif (ref($input) =~ /ARRAY/i ) { # $input may be an # array of BioSeq objects... # Open temporary file for both reading & writing of array ($tfh,$infilename) = $self->io->tempfile(); if( ! ref($input->[0]) ) { $self->warn("passed an array ref which did not contain objects to _setinput"); return undef; } elsif( $input->[0]->isa('Bio::PrimarySeqI') ) { $temp = Bio::SeqIO->new('-fh' => $tfh, '-format' => 'fasta'); my $ct = 1; foreach $seq (@$input) { return 0 unless ( ref($seq) && $seq->isa("Bio::PrimarySeqI") ); if( ! defined $seq->display_id || $seq->display_id =~ /^\s+$/) { $seq->display_id( "Seq".$ct++); } $temp->write_seq($seq); } $temp->close(); undef $temp; close($tfh); $tfh = undef; } else { $self->warn( "got an array ref with 1st entry ". $input->[0]. " and don't know what to do with it\n"); } return ($infilename); } else { $self->warn("Got $input and don't know what to do with it\n"); } return 0; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for amap program Example : Returns : parameter string to be passed to amap during align or profile_align Args : name of calling object =cut sub _setparams { my ($self) = @_; my ($attr, $value,$param_string); $param_string = ''; my $laststr; for $attr ( @AMAP_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = lc $attr; $attr_key = ' --'.$attr_key unless ($attr eq 'ANNOT'); $attr_key = ' -'.$attr_key if ($attr eq 'ANNOT'); $param_string .= $attr_key .' '.$value; } for $attr ( @AMAP_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; #put switches in format expected by Amap $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } # Set default output file if no explicit output file selected unless ($self->outfile_name ) { my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile_name($outfile); } #FIXME: This may be only for *nixes. Double check in other OSes $param_string .= " > ".$self->outfile_name; if ($self->verbose < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 2> $null"; } return $param_string; } =head2 aformat Title : aformat Usage : my $alignmentformat = $self->aformat(); Function: Get/Set alignment format Returns : string Args : string =cut sub aformat{ my $self = shift; $self->{'_aformat'} = shift if @_; return $self->{'_aformat'}; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $amap->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $amap->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Alignment/Blat.pm000077500000000000000000000367411342734133000236730ustar00rootroot00000000000000# # Copyright Balamurugan Kumarasamy # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Blat =head1 SYNOPSIS use Bio::Tools::Run::Alignment::Blat; my $factory = Bio::Tools::Run::Alignment::Blat->new(-db => $database); # $report is a Bio::SearchIO-compliant object my $report = $factory->run($seqobj); =head1 DESCRIPTION Wrapper module for Blat program. This newer version allows for all parameters to be set by passing them as an option to new(). Key bits not implemented yet (TODO): =over 3 =item * Implement all needed L methods Missing are a few, including version(). =item * Re-implement using L Would like to get this running under something less reliant on OS-dependent changes within code. =item * No .2bit or .nib conversions yet These require callouts to faToNib or faTwoTwoBit, which may or may not be installed on a user's machine. We can possibly add functionality to check for faToTwoBit/faToNib and other UCSC tools in the future. =back =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Chris Fields - cjfields at bioperl dot org Original author - Bala Email bala@tll.org.sg =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Blat; use strict; use warnings; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); use Bio::SeqIO; use Bio::Root::Root; use Bio::Factory::ApplicationFactoryI; use Bio::SearchIO; use Bio::Tools::Run::WrapperBase; our ($PROGRAM, $PROGRAMDIR, $PROGRAMNAME); our %BLAT_PARAMS = map {$_ => 1} qw(ooc t q tileSize stepSize oneOff minMatch minScore minIdentity maxGap makeOoc repmatch mask qMask repeats minRepeatsDivergence dots out maxIntron); our %BLAT_SWITCHES = map {$_ => 1} qw(prot noHead trimT noTrimA trimHardA fastMap fine extendThroughN); our %LOCAL_ATTRIBUTES = map {$_ => 1} qw(db DB qsegment hsegment searchio outfile_name quiet); our %searchio_map = ( 'psl' => 'psl', 'pslx' => 'psl', # I don't think there is support for this yet 'axt' => 'axt', 'blast' => 'blast', 'sim4' => 'sim4', 'wublast' => 'blast', 'blast8' => 'blasttable', 'blast9' => 'blasttable' ); =head2 new Title : new Usage : $blat->new( -db => '' ) Function: Create a new Blat factory Returns : A new Bio::Tools::Run::Alignment::Blat object Args : -db : Mandatory parameter. See db() -qsegment : see qsegment() -tsegment : see tsegment() Also, Blat parameters and flags are accepted: -t, -q, -minIdentity, -minScore, -out, -trimT, ... See Blat's manual for details. =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->io->_initialize_io(); $self->set_parameters(@args); return $self; } =head2 program_name Title : program_name Usage : $factory->program_name() Function: Get the program name Returns : string Args : None =cut sub program_name { return 'blat'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{BLATDIR}) if $ENV{BLATDIR}; } =head2 run Title : run() Usage : $obj->run($query) Function: Run Blat. Returns : A Bio::SearchIO object that holds the results Args : A Bio::PrimarySeqI object or a file of query sequences =cut sub run { my ($self, $query) = @_; if (ref($query) ) { # it is an object if (ref($query) =~ /GLOB/) { $self->throw("Cannot use filehandle as argument to run()"); } $query = $self->_writeSeqFile($query); } return $self->_run($query); } =head2 align Title : align Usage : $obj->align($query) Function: Alias to run() =cut sub align { return shift->run(@_); } =head2 db Title : db Usage : $obj->db() Function: Get or set the file of database sequences (.fa , .nib or .2bit) Returns : Database filename Args : Database filename =cut sub db { my $self = shift; return $self->{blat_db} = shift if @_; return $self->{blat_db}; } # this is a kludge for tests (so one might expect this to be used elsewhere). # None of the other parameters worked in the past, so not replacing them *DB = \&db; =head2 qsegment Title : qsegment Usage : $obj->qsegment('sequence_a:0-1000') Function : pass in a B string for the query sequence(s) Returns : string Args : string Note : Requires the sequence(s) in question be 2bit or nib format Reminder : UCSC segment/regions coordinates are 0-based half-open (sequence begins at 0, but start isn't counted with length), whereas BioPerl coordinates are 1-based closed (sequence begins with 1, both start and end are counted in the length of the segment). For example, a segment that is 'sequence_a:0-1000' will have BioPerl coordinates of 'sequence_a:1-1000', both with the same length (1000). =cut sub qsegment { my $self = shift; return $self->{blat_qsegment} = shift if @_; return $self->{blat_qsegment}; } =head2 tsegment Title : tsegment Usage : $obj->tsegment('sequence_a:0-1000') Function : pass in a B string for the target sequence(s) Returns : string Args : string Note : Requires the sequence(s) in question be 2bit or nib format Reminder : UCSC segment/regions coordinates are 0-based half-open (sequence begins at 0, but start isn't counted with length), whereas BioPerl coordinates are 1-based closed (sequence begins with 1, both start and end are counted in the length of the segment). For example, a segment that is 'sequence_a:0-1000' will have BioPerl coordinates of 'sequence_a:1-1000', both with the same length (1000). =cut sub tsegment { my $self = shift; return $self->{blat_tsegment} = shift if @_; return $self->{blat_tsegment}; } =head2 outfile_name Title : outfile_name Usage : $obj->outfile_name('out.blat') Function : Get or set the name for the BLAT output file Returns : string Args : string =cut # override this, otherwise one gets a default of 'mlc' sub outfile_name { my $self = shift; return $self->{blat_outfile} = shift if @_; return $self->{blat_outfile}; } =head2 searchio Title : searchio Usage : $obj->searchio{-writer => $writer} Function : Pass in additional parameters to the returned Bio::SearchIO parser Returns : Hash reference with Bio::SearchIO parameters Args : Hash reference Note : Currently, this implementation overrides any passed -format parameter based on whether the output is changed ('out'). This may change if requested, but we can't see the utility of doing so, as requesting mismatched output/parser combinations is just a recipe for disaster =cut sub searchio { my ($self, $params) = @_; if ($params && ref $params eq 'HASH') { delete $params->{-format}; $self->{blat_searchio} = $params; } return $self->{blat_searchio} || {}; } =head1 Bio::ParameterBaseI-specific methods These methods are part of the Bio::ParameterBaseI interface =head2 set_parameters Title : set_parameters Usage : $pobj->set_parameters(%params); Function: sets the parameters listed in the hash or array Returns : None Args : [optional] hash or array of parameter/values. These can optionally be hash or array references Note : This only sets parameters; to set methods use the method name =cut sub set_parameters { my $self = shift; # circumvent any issues arising from passing in refs my %args = (ref($_[0]) eq 'HASH') ? %{$_[0]} : (ref($_[0]) eq 'ARRAY') ? @{$_[0]} : @_; # set the parameters passed in, but only ones supported for the program %args = map { my $a = $_; $a =~ s{^-}{}; $a => $args{$_}; } sort keys %args; while (my ($key, $val) = each %args) { if (exists $BLAT_PARAMS{$key}) { $self->{parameters}->{$key} = $val; } elsif (exists $BLAT_SWITCHES{$key}) { $self->{parameters}->{$key} = $BLAT_SWITCHES{$key} ? 1 : 0; } elsif ($LOCAL_ATTRIBUTES{$key} && $self->can($key)) { $self->$key($val); } } } =head2 reset_parameters Title : reset_parameters Usage : resets values Function: resets parameters to either undef or value in passed hash Returns : none Args : [optional] hash of parameter-value pairs =cut sub reset_parameters { my $self = shift; delete $self->{parameters}; if (@_) { $self->set_parameters(@_); } } =head2 validate_parameters Title : validate_parameters Usage : $pobj->validate_parameters(1); Function: sets a flag indicating whether to validate parameters via set_parameters() or reset_parameters() Returns : Bool Args : [optional] value evaluating to True/False Note : NYI =cut sub validate_parameters { 0 } =head2 parameters_changed Title : parameters_changed Usage : if ($pobj->parameters_changed) {...} Function: Returns boolean true (1) if parameters have changed Returns : Boolean (0 or 1) Args : None Note : This module does not run state checks, so this always returns True =cut sub parameters_changed { 1 } =head2 available_parameters Title : available_parameters Usage : @params = $pobj->available_parameters() Function: Returns a list of the available parameters Returns : Array of parameters Args : [optional] name of executable being used; defaults to returning all available parameters =cut sub available_parameters { my ($self, $exec) = @_; my @params = (sort keys %BLAT_PARAMS, sort keys %BLAT_SWITCHES); return @params; } =head2 get_parameters Title : get_parameters Usage : %params = $pobj->get_parameters; Function: Returns list of set key-value pairs, parameter => value Returns : List of key-value pairs Args : none =cut sub get_parameters { my ($self, $option) = @_; $option ||= ''; # no option my %params; if (exists $self->{parameters}) { %params = map {$_ => $self->{parameters}->{$_}} sort keys %{$self->{parameters}}; } else { %params = (); } return %params; } =head1 to_* methods All to_* methods are implementation-specific =head2 to_exe_string Title : to_exe_string Usage : $string = $pobj->to_exe_string; Function: Returns string (command line string in this case) Returns : String Args : =cut sub to_exe_string { my ($self, @passed) = @_; my ($seq) = $self->_rearrange([qw(SEQ_FILE)], @passed); $self->throw("Must provide a seq_file") unless defined $seq; my %params = $self->get_parameters(); my ($exe, $db, $qseg, $tseg) = ($self->executable, $self->db, $self->qsegment, $self->tsegment); $self->throw("Executable not found") unless defined($exe); if ($tseg) { $db .= ":$tseg"; } if ($qseg) { $seq .= ":$qseg"; } my @params; for my $p (sort keys %BLAT_SWITCHES) { if (exists $params{$p}) { push @params, "-$p" } } for my $p (sort keys %BLAT_PARAMS) { if (exists $params{$p}) { push @params, "-$p=$params{$p}" } } # this only passes in the first seq file (no globs are allow AFAIK) push @params, ($db, $seq); # quiet! Unfortunately, it is NYI my $string = "$exe ".join(' ',@params); return $string; } #=head2 _input # # Title : _input # Usage : obj->_input($seqFile) # Function: Internal (not to be used directly) # Returns : # Args : # #=cut sub _input() { my ($self,$infile1) = @_; if (defined $infile1) { $self->{'input'} = $infile1; } return $self->{'input'}; } #=head2 _database # # Title : _database # Usage : obj->_database($seqFile) # Function: Internal (not to be used directly) # Returns : # Args : # #=cut sub _database() { my ($self,$infile1) = @_; $self->{'db'} = $infile1 if(defined $infile1); return $self->{'db'}; } #=head2 _run # # Title : _run # Usage : $obj->_run() # Function: Internal (not to be used directly) # Returns : A Bio::SearchIO object that contains the results # Args : File of sequences # #=cut sub _run { my ($self, $seq_file) = @_; my $str = $self->to_exe_string(-seq_file => $seq_file); my $out = $self->outfile_name || $self->_tempfile; $str .= " $out".$self->_quiet; $self->debug($str."\n") if( $self->verbose > 0 ); my %params = $self->get_parameters; my $status = system($str); $self->throw( "Blat call ($str) crashed: $? \n") unless $status==0; my $format = exists($params{out}) ? $searchio_map{$params{out}} : 'psl'; my @io = ref ($out) !~ /GLOB/ ? (-file => $out,) : (-fh => $out,); my $blat_obj = Bio::SearchIO->new(%{$self->searchio}, @io, -query_type => $params{prot} ? 'protein' : $params{q} || 'dna', -hit_type => $params{prot} ? 'protein' : $params{t} || 'dna', -format => $format); return $blat_obj; } #=head2 _writeSeqFile # # Title : _writeSeqFile # Usage : obj->_writeSeqFile($seq) # Function: Internal (not to be used directly) # Returns : # Args : # #=cut sub _writeSeqFile { my ($self,$seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$Bio::Root::IO::TEMPDIR); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'fasta'); $in->write_seq($seq); $in->close(); return $inputfile; } sub _tempfile { my $self = shift; my ($tfh,$outfile) = $self->io->tempfile(-dir=>$Bio::Root::IO::TEMPDIR); # this is because we only want a unique filename close($tfh); return $outfile; } sub _quiet { my $self = shift; my $q = ''; # BLAT output goes to a file, all other output is STDOUT if ($self->quiet) { $q = $^O =~ /Win/i ? ' 2>&1 NUL' : ' > /dev/null 2>&1'; } return $q; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Alignment/DBA.pm000066400000000000000000000433651342734133000233740ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Alignment::DBA # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::DBA - Object for the alignment of two sequences using the DNA Block Aligner program. =head1 SYNOPSIS use Bio::Tools::Run::Alignment::DBA; # Build a dba alignment factory my @params = ('matchA' => 0.75, 'matchB' => '0.55', 'dymem' =>'linear'); my $factory = Bio::Tools::Run::Alignment::DBA->new(@params); # Pass the factory a filename with 2 sequences to be aligned. $inputfilename = 't/data/dbaseq.fa'; # @hsps is an array of GenericHSP objects my @hsps = $factory->align($inputfilename); # or my @files = ('t/data/dbaseq1.fa','t/data/dbaseq2.fa'); my @hsps = $factory->align(\@files); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; my @hsps = $factory->align($seq_array_ref); =head1 DESCRIPTION DNA Block Aligner program (DBA) was developed by Ewan Birney. DBA is part of the Wise package available at L. You will need to enable dba to find the dba program. This can be done in a few different ways: 1. Define an environmental variable WISEDIR: export WISEDIR =/usr/local/share/wise2.2.0 2. Include a definition of an environmental variable WISEDIR in every script that will use DBA.pm: $ENV{WISEDIR} = '/usr/local/share/wise2.2.20'; 3. Make sure that the dba application is in your PATH. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::DBA; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @DBA_SWITCHES @DBA_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Search::HSP::GenericHSP; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { @DBA_PARAMS = qw(MATCHA MATCHB MATCHC MATCHD GAP BLOCKOPEN UMATCH SINGLE NOMATCHN PARAMS KBYTE DYMEM DYDEBUG ERRORLOG); @OTHER_SWITCHES = qw(OUTFILE); @DBA_SWITCHES = qw(HELP SILENT QUIET ERROROFFSTD ALIGN LABEL); # Authorize attribute fields foreach my $attr ( @DBA_PARAMS, @DBA_SWITCHES, @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'dba'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{WISEDIR},"/src/bin") if $ENV{WISEDIR}; } sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/'PROGRAM'/i ) { $self->executable($value); next; } $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe = $self->executable(); return undef unless defined $exe; my $string = `$exe -- ` ; $string =~ /\(([\d.]+)\)/; return $1 || undef; } =head2 align Title : align Usage : $inputfilename = 't/data/seq.fa'; @hsps = $factory->align($inputfilename); or #@seq_array is array of Seq objs $seq_array_ref = \@seq_array; @hsps = $factory->align($seq_array_ref); or my @files = ('t/data/seq1.fa','t/data/seq2.fa'); @hsps = $factory->align(\@files); Function: Perform a DBA alignment Returns : An array of Bio::Search::HSP::GenericHSP objects Args : Name of a file containing a set of 2 fasta sequences or else a reference to an array to 2 Bio::Seq objects. or else a reference to an array of 2 file names containing 1 fasta sequence each Throws an exception if argument is not either a string (eg a filename) or a reference to an array of 2 Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ($self,$input) = @_; my ($temp,$infile1, $infile2, $seq); my ($attr, $value, $switch); # Create input file pointer ($infile1,$infile2)= $self->_setinput($input); if (!($infile1 && $infile2)) {$self->throw("Bad input data (sequences need an id ) or less than 2 sequences in $input !");} # Create parameter string to pass to dba program my $param_string = $self->_setparams(); # run dba my @hsps = $self->_run($infile1,$infile2,$param_string); return @hsps; } ################################################# =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to dba program Example : Returns : nothing; dba output is written to a temp file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to dba =cut sub _run { my ($self,$infile1,$infile2,$param_string) = @_; my $instring; $self->debug( "Program ".$self->executable."\n"); unless( $self->outfile){ my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir); close($tfh); undef $tfh; $self->outfile($outfile); } my $outfile = $self->outfile(); my $commandstring = $self->executable." $param_string -pff $infile1 $infile2 > $outfile"; $self->debug( "dba command = $commandstring"); my $status = system($commandstring); $self->throw( "DBA call ($commandstring) crashed: $? \n") unless $status==0; #parse pff format and return a Bio::Search::HSP::GenericHSP array my $hsps = $self->_parse_results($outfile); return @{$hsps}; } =head2 _parse_results Title : __parse_results Usage : Internal function, not to be called directly Function: Parses dba output Example : Returns : an reference to an array of GenericHSPs Args : the name of the output file =cut sub _parse_results { my ($self,$outfile) = @_; $outfile||$self->throw("No outfile specified"); my ($start,$end,$name,$seqname,$seq,$seqchar,$tempname,%align); my $count = 0; my @hsps; open(OUT,$outfile); my (%query,%subject); while(my $entry = ){ if($entry =~ /^>(.+)/ ) { $tempname = $1; if( defined $name ) { if($count == 0){ my @parse = split("\t",$name); $query{seqname} = $parse[0]; $query{start} = $parse[3]; $query{end} = $parse[4]; $query{score} = $parse[5]; $query{strand} = ($parse[6] eq '+') ? 1 : -1; my @tags = split(";",$parse[8]); foreach my $tag(@tags){ $tag =~/(\S+)\s+(\S+)/; $query{$1} = $2; } $query{seq} = $seqchar; $count++; } elsif ($count == 1){ my @parse = split("\t",$name); $subject{seqname} = $parse[0]; $subject{start} = $parse[3]; $subject{end} = $parse[4]; $subject{score} = $parse[5]; $subject{strand} = ($parse[6] eq '+') ? 1:-1; my @tags = split(";",$parse[8]); foreach my $tag(@tags){ $tag =~/(\S+)\s+(\S+)/; $subject{$1} = $2; } $subject{seq} = $seqchar; #create homology string my $xor = $query{seq}^$subject{seq}; my $identical = $xor=~tr/\c@/*/; $xor=~tr/*/ /c; my $hsp= Bio::Search::HSP::GenericHSP->new(-algorithm =>'DBA', -score =>$query{score}, -hsp_length =>length($query{seq}), -query_gaps =>$query{gaps}, -hit_gaps =>$subject{gaps}, -query_name =>$query{seqname}, -query_start =>$query{start}, -query_end =>$query{end}, -hit_name =>$subject{seqname}, -hit_start =>$subject{start}, -hit_end =>$subject{end}, -hit_length =>length($self->_subject_seq->seq), -query_length =>length($self->_query_seq->seq), -query_seq =>$query{seq}, -hit_seq =>$subject{seq}, -conserved =>$identical, -identical =>$identical, -homology_seq =>$xor); push @hsps, $hsp; $count = 0; } } $name = $tempname; $seqchar = ""; next; } $entry =~ s/[^A-Za-z\.\-]//g; $seqchar .= $entry; } #do for the last entry if($count == 1){ my @parse = split("\t",$name); $subject{seqname} = $parse[1]; $subject{start} = $parse[3]; $subject{end} = $parse[4]; $subject{score} = $parse[5]; $subject{strand} = ($parse[6] eq '+') ? 1:-1; my @tags = split(";",$parse[8]); foreach my $tag(@tags){ $tag =~/(\S+)\s+(\S+)/; $subject{$1} = $2; } $subject{seq} = $seqchar; #create homology string my $xor = $query{seq}^$subject{seq}; my $identical = $xor=~tr/\c@/*/; $xor=~tr/*/ /c; my $hsp= Bio::Search::HSP::GenericHSP->new(-algorithm =>'DBA', -score =>$query{score}, -hsp_length =>length($query{seq}), -query_gaps =>$query{gaps}, -hit_gaps =>$subject{gaps}, -query_name =>$query{seqname}, -query_start =>$query{start}, -query_end =>$query{end}, -hit_name =>$subject{seqname}, -hit_start =>$subject{start}, -hit_end =>$subject{end}, -hit_length =>length($self->_subject_seq->seq), -query_length =>length($self->_query_seq->seq), -query_seq =>$query{seq}, -hit_seq =>$subject{seq}, -conserved =>$identical, -identical =>$identical, -homology_seq =>$xor); push @hsps, $hsp; } return \@hsps; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for dba program Example : Returns : name of file containing dba data input Args : Seq or Align object reference or input file name =cut sub _setinput { my ($self, $input, $suffix) = @_; my ($infilename, $seq, $temp, $tfh1,$tfh2,$outfile1,$outfile2); #there is gotta be some repetition here...need to clean up if (ref($input) ne "ARRAY"){ #a single file containg 2 seqeunces $infilename = $input; unless(-e $input){return 0;} my $in = Bio::SeqIO->new(-file => $infilename , '-format' => 'Fasta'); ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); ($tfh2,$outfile2) = $self->io->tempfile(-dir=>$self->tempdir); my $out1 = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'Fasta','-flush'=>1); my $out2 = Bio::SeqIO->new(-fh=> $tfh2 , '-format' => 'Fasta','-flush'=>1); my $seq1 = $in->next_seq() || return 0; my $seq2 = $in->next_seq() || return 0; $out1->write_seq($seq1); $out2->write_seq($seq2); $self->_query_seq($seq1); $self->_subject_seq($seq2); $out1->close(); $out2->close(); close($tfh1); close($tfh2); undef $tfh1; undef $tfh2; return $outfile1,$outfile2; } else { scalar(@{$input}) == 2 || $self->throw("dba alignment can only be run on 2 sequences not."); if(ref($input->[0]) eq ""){#passing in two file names my $in1 = Bio::SeqIO->new(-file => $input->[0], '-format' => 'fasta'); my $in2 = Bio::SeqIO->new(-file => $input->[1], '-format' => 'fasta'); ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); ($tfh2,$outfile2) = $self->io->tempfile(-dir=>$self->tempdir); my $out1 = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'fasta'); my $out2 = Bio::SeqIO->new(-fh=> $tfh2 , '-format' => 'fasta'); my $seq1 = $in1->next_seq() || return 0; my $seq2 = $in2->next_seq() || return 0; $out1->write_seq($seq1); $out2->write_seq($seq2); $self->_query_seq($seq1); $self->_subject_seq($seq2); close($tfh1); close($tfh2); undef $tfh1; undef $tfh2; return $outfile1,$outfile2; } elsif($input->[0]->isa("Bio::PrimarySeqI") && $input->[1]->isa("Bio::PrimarySeqI")) { ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); ($tfh2,$outfile2) = $self->io->tempfile(-dir=>$self->tempdir); my $out1 = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'fasta'); my $out2 = Bio::SeqIO->new(-fh=> $tfh2 , '-format' => 'fasta'); $out1->write_seq($input->[0]); $out2->write_seq($input->[1]); $self->_query_seq($input->[0]); $self->_subject_seq($input->[1]); close($tfh1); close($tfh2); undef $tfh1; undef $tfh2; return $outfile1,$outfile2; } else { return 0; } } return 0; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for dba program Example : Returns : parameter string to be passed to dba during align or profile_align Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); $self = shift; my $param_string = ""; for $attr ( @DBA_PARAMS ) { $value = $self->$attr(); next unless (defined $value); # next if $attr =~/outfile/i; my $attr_key = lc $attr; #put params in format expected by dba if($attr_key =~ /match([ABCDabcd])/i){ $attr_key = "match".uc($1); } $attr_key = ' -'.$attr_key; $param_string .= $attr_key.' '.$value; } for $attr ( @DBA_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; #put switches in format expected by dba $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } return $param_string; } =head2 _query_seq() Title : _query_seq Usage : Internal function, not to be called directly Function: get/set for the query sequence Example : Returns : Args : =cut sub _query_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_query_seq'} = $seq; } return $self->{'_query_seq'}; } =head2 _subject_seq() Title : _subject_seq Usage : Internal function, not to be called directly Function: get/set for the subject sequence Example : Returns : Args : =cut sub _subject_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_subject_seq'} = $seq; } return $self->{'_subject_seq'}; } 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Alignment/Exonerate.pm000077500000000000000000000174331342734133000247400ustar00rootroot00000000000000# # Please direct questions and support issues to # # Cared for by Shawn Hoon # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Exonerate =head1 SYNOPSIS use Bio::Tools::Run::Alignment::Exonerate; use Bio::SeqIO; my $qio = Bio::SeqIO->new(-file=>$ARGV[0],-format=>'fasta'); my $query = $qio->next_seq(); my $tio = Bio::SeqIO->new(-file=>$ARGV[1],-format=>'fasta'); my $target = $sio->next_seq(); #exonerate parameters can all be passed via arguments parameter. #parameters passed are not checked for validity my $run = Bio::Tools::Run::Alignment::Exonerate-> new(arguments=>'--model est2genome --bestn 10'); my $searchio_obj = $run->run($query,$target); while(my $result = $searchio->next_result){ while( my $hit = $result->next_hit ) { while( my $hsp = $hit->next_hsp ) { print $hsp->start."\t".$hsp->end."\n"; } } } =head1 DESCRIPTION Wrapper for Exonerate alignment program. You can get exonerate at http://www.ebi.ac.uk/~guy/exonerate/. This wrapper is written without parameter checking. All parameters are passed via the arugment parameter that is passed in the constructor. See SYNOPSIS. For exonerate parameters, run exonerate --help for more details. =head1 PROGRAM VERSIONS The tests have been shown to pass with exonorate versions 2.0 - 2.2. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh-at-stanford.edu =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Exonerate; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @EXONERATE_PARAMS %OK_FIELD); use strict; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Run::WrapperBase; use Bio::SearchIO; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'exonerate'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{EXONERATEDIR}) if $ENV{EXONERATEDIR}; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new Title : new Usage : my $factory= Bio::Tools::Run::Phrap->new(); Function: creates a new Phrap factory Returns: Bio::Tools::Run::Phrap Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/PROGRAM/i) { $self->executable($value); next; } $self->$attr($value); } return $self; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe -v` ; #exonerate from exonerate version 2.0.0\n... my ($version) = $string =~ /exonerate version ([\d+\.]+)/m; $version =~ s/\.(\d+)$/$1/; return $version || undef; } =head2 run Title : run() Usage : my $feats = $factory->run($seq) Function: Runs Phrap Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI =cut sub run { my ($self,$query,$target) = @_; my @feats; my ($file1) = $self->_writeInput($query); my ($file2) = $self->_writeInput($target); my $assembly = $self->_run($file1,$file2); return $assembly; } =head2 _input Title : _input Usage : $factory->_input($seqFile) Function: get/set for input file Returns : Args : =cut sub _input() { my ($self,$infile1) = @_; $self->{'input'} = $infile1 if(defined $infile1); return $self->{'input'}; } =head2 _run Title : _run Usage : $factory->_run() Function: Makes a system call and runs Phrap Returns : An array of Bio::SeqFeature::Generic objects Args : =cut sub _run { my ($self,$query,$target)= @_; my ($tfh,$outfile) = $self->io->tempfile(-dir=>$self->tempdir); my $param_str = $self->_setparams." ".$self->arguments; my $str = $self->executable." $param_str $query $target "." > $outfile"; $self->debug( "$str\n"); my $status = system($str); $self->throw( "Exonerate call ($str) crashed: $? \n") unless $status==0; my $filehandle; my $exonerate_obj = Bio::SearchIO->new(-file=>"$outfile",-format=>'exonerate'); close($tfh); undef $tfh; unlink $outfile; return $exonerate_obj; } =head2 _writeInput Title : _writeInput Usage : $factory->_writeInput($query,$target) Function: Creates a file from the given seq object Returns : A string(filename) Args : Bio::PrimarySeqI =cut sub _writeInput{ my ($self,$query) = @_; my ($fh,$infile1); if (ref($query) =~ /ARRAY/i) { my @infilearr; ($fh, $infile1) = $self->io->tempfile(); my $temp = Bio::SeqIO->new( -file => ">$infile1", -format => 'Fasta' ); foreach my $seq1 (@$query) { unless ($seq1->isa("Bio::PrimarySeqI")) { return 0; } $temp->write_seq($seq1); push @infilearr, $infile1; } } elsif($query->isa("Bio::PrimarySeqI")) { ($fh, $infile1) = $self->io->tempfile(); my $temp = Bio::SeqIO->new( -file => ">$infile1", -format => 'Fasta' ); $temp->write_seq($query); } else { $infile1 = $query; } return $infile1; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: creates a string of params to be used in the command string Example : Returns : string of params Args : =cut sub _setparams { my ($self) = @_; my $param_string = ''; foreach my $attr(@EXONERATE_PARAMS){ next if($attr=~/PROGRAM/); my $value = $self->$attr(); next unless (defined $value); my $attr_key = ' -'.(lc $attr); $param_string .= $attr_key.' '.$value; } return $param_string; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Alignment/Gmap.pm000066400000000000000000000176511342734133000236710ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Alignment::Gmap # # Cared for by George Hartzell # # Copyright George Hartzell # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Gmap - Wrapper for running gmap. =head1 SYNOPSIS use Bio::Tools::Run::Alignment::Gmap; use Bio::SeqIO; my $sio = Bio::SeqIO->new(-file=>$filename ,-format=>'fasta'); my @seq; while(my $seq = $sio->next_seq()){ push @seq,$seq; } my $mapper =Bio::Tools::Run::Gmap->new(); my $result = $mapper->run(\@seq); =head1 DESCRIPTION Bioperl-run wrapper around gmap. See L for information about gmap. It requires a reference to an array of bioperl SeqI objects and returns a reference to a filehandle from which the gmap output can be read. One can explicitly set the name of the genome database (defaults to NHGD_R36) using the 'genome_db()' method. One can also explicitly set the flags that are passed to gmap (defaults to '-f 9 -5 -e') using the 'flags()' method. The name of the gmap executable can be overridden using the program_name() method and the directory in which to find that executable can be overridden using the program_dir() method. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - George Hartzell Email hartzell@alerce.com Describe contact details here =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... # TODO handle stderr output from gmap. package Bio::Tools::Run::Alignment::Gmap; use strict; use warnings; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use Bio::SeqIO; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); =head2 new Title : new Usage : my $obj = new Bio::Tools::Run::Alignment::Gmap(); Function: Builds a new Bio::Tools::Run::Alignment::Gmap object Returns : an instance of Bio::Tools::Run::Alignment::Gmap Args : =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->{_program_name} = 'gmap'; return $self; } =head2 version Title : version Usage : print "gmap version: " . $mapper->version() . "\n"; Function: retrieves and returns the version of the gmap package. Example : Returns : scalar string containing the version number. Probably looks like YYYY-MM-DD. Args : none. =cut sub version { my ($self,@args) = @_; my $version; my $str = $self->executable; $str .= ' --version'; $self->debug("gmap version command = $str\n"); open(GMAPRUN, "$str |") || $self->throw($@); { local $/ = undef; my $result = ; ($version) = ($result =~ m|.*Part of GMAP package, version (.*).*|); } return($version); } =head2 program_name Title : program_name Usage : $mapper->program_name('gmap-dev'); my $pname = $mapper->program_name(); Function: sets/gets the name of the program to run. Returns : string representing the name of the executable. Args : [optional] string representing the name of the executable to set. =cut sub program_name { my $self = shift; $self->{_program_name} = shift if @_; return $self->{_program_name}; } =head2 program_dir Title : program_dir Usage : $mapper->program_dir('/usr/local/sandbox/gmap/bin'); my $pdir = $mapper->program_dir(); Function: sets/gets the directory path in which to find the gmap executable. Returns : string representing the path to the directory. Args : [optional] string representing the directory path to set. =cut sub program_dir { my $self = shift; $self->{_program_dir} = shift if @_; return $self->{_program_dir}; } =head2 input_file Title : input_file Usage : $mapper->input_file('/tmp/moose.fasta'); my $filename = $mapper->input_file(); Function: sets/gets the name of a file containing sequences to be mapped. Returns : string containing the name of the query sequence. Args : [optional] string representing the directory path to set. =cut sub input_file { my $self = shift; $self->{_input_file} = shift if @_; return $self->{_input_file}; } =head2 genome_db Title : genome_db Usage : $mapper->genome_db('NHGD_R36'); my $genome_db = $mapper->genome_db(); Function: sets/gets the name of the genome database, this will be passed to gmap using its '-d' flag. Returns : name of the genome database. Args : [optional] string representing the genome db to set. =cut sub genome_db { my $self = shift; $self->{_genome_db} = shift if @_; return $self->{_genome_db}; } =head2 flags Title : flags Usage : $mapper->flags('-A -e -5'); my $flags = $mapper->flags(); Function: sets/gets the flags that will be passed to gmap. Returns : the current value of the flags that will be passed to gmap. Args : [optional] the flags to set. =cut sub flags { my $self = shift; $self->{_flags} = shift if @_; return $self->{_flags}; } =head2 run Title : run Usage : $mapper->run() Function: runs gmap Example : Returns : a file handle, opened for reading, for gmap's output. Args : An array of references query sequences (as Bio::Seq objects) =cut sub run { my $self = shift; $self->input_file( $self->_build_fasta_input_file(@_) ) if(@_); my $result = $self->_run(); return $result; } =head2 _build_fasta_input_file Title : _build_fasta_input_file Usage : my $seq_file = $self->_build_fasta_input_file(@_); Function: Example : Returns : The name of the temporary file that contains the sequence. Args : A reference to an array of Bio::Seq objects. =cut use File::Temp; sub _build_fasta_input_file { my $self = shift; my $seqs = shift; my $seq_count = 0; # the object returned by File::Temp->new() is magic. Used normally # it behaves as a filehandle opened onto the temporary file. Used # as a string it behaves as a string that is the name of the # temporary file. # It is up to the user to remove the when finished with it. my $file_tmp = File::Temp->new( TEMPLATE => 'mvp-gmap-tempfile-XXXXXX', TMPDIR => 1, UNLINK => 0, ); my $seqio = Bio::SeqIO->new( -fh => $file_tmp, -format => 'Fasta' ); if (ref($seqs) =~ /ARRAY/i) { foreach my $seq (@$seqs) { throw Bio::Root::BadParameter(-text => "sequence args must be a Bio::SeqI subclass.", ) unless ($seq->isa("Bio::PrimarySeqI")); $seqio->write_seq($seq); $seq_count++; } } if ($seq_count == 0) { throw Bio::Root::BadParameter(-text => <executable; $str .= ' -d' . ($self->genome_db() || 'NHGD_R36'); $str .= ' ' . ($self->flags() || '-f 9 -5 -e'); $str .= ' ' . $self->input_file(); $str .= " 2> $null"; $str .= '; rm -f ' . $self->input_file(); $self->debug("gmap command = $str\n"); open(GMAPRUN, "$str |") || $self->throw("Can't open gmap (command = \"$str\"): $!"); return (\*GMAPRUN); } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Alignment/Kalign.pm000066400000000000000000000335121342734133000242040ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Alignment::Kalign # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Kalign - Object for the calculation of an iterative multiple sequence alignment from a set of unaligned sequences or alignments using the KALIGN program =head1 SYNOPSIS # Build a kalign alignment factory $factory = Bio::Tools::Run::Alignment::Kalign->new(@params); # Pass the factory a list of sequences to be aligned. $inputfilename = 't/cysprot.fa'; # $aln is a SimpleAlign object. $aln = $factory->align($inputfilename); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); # Or one can pass the factory a pair of (sub)alignments #to be aligned against each other, e.g.: #There are various additional options and input formats available. #See the DESCRIPTION section that follows for additional details. =head1 DESCRIPTION Please cite: Timo Lassmann and Erik L.L. Sonnhammer (2005) Kalign - an accurate and fast multiple sequence alignment algorithm. BMC Bioinformatics 6:298 http://msa.cgb.ki.se/downloads/kalign/current.tar.gz =head2 Helping the module find your executable You will need to enable Kalign to find the kalign program. This can be done in (at least) three ways: 1. Make sure the kalign executable is in your path (i.e. 'which kalign' returns a valid program 2. define an environmental variable KALIGNDIR which points to a directory containing the 'kalign' app: In bash export KALIGNDIR=/home/progs/kalign or In csh/tcsh setenv KALIGNDIR /home/progs/kalign 3. include a definition of an environmental variable KALIGNDIR in every script that will BEGIN {$ENV{KALIGNDIR} = '/home/progs/kalign'; } use Bio::Tools::Run::Alignment::Kalign; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email idontlikespam@hotmail.com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Kalign; use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM %DEFAULTS @KALIGN_PARAMS @KALIGN_SWITCHES %OK_FIELD ); use strict; use Bio::Seq; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); BEGIN { %DEFAULTS = ( 'AFORMAT' => 'fasta' ); @KALIGN_PARAMS = qw(IN OUT GAPOPEN GAPEXTENSION TERMINAL_GAP_EXTENSION_PENALTY MATRIX_BONUS SORT FEATURE DISTANCE TREE ZCUTOFF FORMAT MAXMB MAXHOURS MAXITERS); @KALIGN_SWITCHES = qw(QUIET); # Authorize attribute fields foreach my $attr ( @KALIGN_PARAMS, @KALIGN_SWITCHES ) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'kalign'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{KALIGNDIR}) if $ENV{KALIGNDIR}; } =head2 new Title : new Usage : my $kalign = Bio::Tools::Run::Alignment::Kalign->new(); Function: Constructor Returns : Bio::Tools::Run::Alignment::Kalign Args : -outfile_name => $outname =cut sub new { my ($class,@args) = @_; my( @kalign_args, @obj_args); while( my $arg = shift @args ) { if( $arg =~ /^-/ ) { push @obj_args, $arg, shift @args; } else { push @kalign_args,$arg, shift @args; } } my $self = $class->SUPER::new(@obj_args); my ($on) = $self->_rearrange([qw(OUTFILE_NAME)],@obj_args); $self->outfile_name($on || ''); my ($attr, $value); # FIXME: only tested with fasta output format right now... $self->aformat($DEFAULTS{'AFORMAT'}); while ( @kalign_args) { $attr = shift @kalign_args; $value = shift @kalign_args; next if( $attr =~ /^-/); # don't want named parameters $self->$attr($value); } if( defined $self->out ) { $self->outfile_name($self->out); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; # aliasing $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 2 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; # Kalign version 2.01, Copyright (C) 2004, 2005, 2006 Timo Lassmann return undef unless $exe = $self->executable; my $string = `$exe 2>&1` ; $string =~ /Kalign\s+version\s+(\d+\.\d+)/m; return $1 || undef; } =head2 run Title : run Usage : my $output = $application->run(\@seqs); Function: Generic run of an application Returns : Bio::SimpleAlign object Args : Arrayref of Bio::PrimarySeqI objects or a filename to run on =cut sub run { my $self = shift; return $self->align(shift); } =head2 align Title : align Usage : $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); or $seq_array_ref = \@seq_array; # @seq_array is array of Seq objs $aln = $factory->align($seq_array_ref); Function: Perform a multiple sequence alignment Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or a reference to an array of Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ($self,$input) = @_; # Create input file pointer $self->io->_io_cleanup(); my $infilename; if( defined $input ) { $infilename = $self->_setinput($input); } elsif( defined $self->in ) { $infilename = $self->_setinput($self->in); } else { $self->throw("No inputdata provided\n"); } if (! $infilename) { $self->throw("Bad input data or less than 2 sequences in $input !"); } my $param_string = $self->_setparams(); # run kalign return &_run($self, $infilename, $param_string); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to kalign program Example : Returns : nothing; kalign output is written to a temporary file OR specified output file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to kalign =cut sub _run { my ($self,$infilename,$params) = @_; my $commandstring = $self->executable." -in $infilename $params"; $self->debug( "kalign command = $commandstring \n"); my $status = system($commandstring); my $outfile = $self->outfile_name(); if( !-e $outfile || -z $outfile ) { $self->warn( "Kalign call crashed: $? [command $commandstring]\n"); return undef; } my $in = Bio::AlignIO->new('-file' => $outfile, '-format' => $self->aformat); my $aln = $in->next_aln(); return $aln; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for kalign program Example : Returns : name of file containing kalign data input AND Args : Arrayref of Seqs or input file name =cut sub _setinput { my ($self,$input) = @_; my ($infilename, $seq, $temp, $tfh); if (! ref $input) { # check that file exists or throw $infilename = $input; unless (-e $input) {return 0;} # let's peek and guess open(IN,$infilename) || $self->throw("Cannot open $infilename"); my $header; while( defined ($header = ) ) { last if $header !~ /^\s+$/; } close(IN); if ( $header !~ /^>\s*\S+/ ){ $self->throw("Need to provide a FASTA format file to kalign!"); } return ($infilename); } elsif (ref($input) =~ /ARRAY/i ) { # $input may be an # array of BioSeq objects... # Open temporary file for both reading & writing of array ($tfh,$infilename) = $self->io->tempfile(); if( ! ref($input->[0]) ) { $self->warn("passed an array ref which did not contain objects to _setinput"); return undef; } elsif( $input->[0]->isa('Bio::PrimarySeqI') ) { $temp = Bio::SeqIO->new('-fh' => $tfh, '-format' => 'fasta'); my $ct = 1; foreach $seq (@$input) { return 0 unless ( ref($seq) && $seq->isa("Bio::PrimarySeqI") ); if( ! defined $seq->display_id || $seq->display_id =~ /^\s+$/) { $seq->display_id( "Seq".$ct++); } $temp->write_seq($seq); } $temp->close(); undef $temp; close($tfh); $tfh = undef; } else { $self->warn( "got an array ref with 1st entry ". $input->[0]. " and don't know what to do with it\n"); } return ($infilename); } else { $self->warn("Got $input and don't know what to do with it\n"); } return 0; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for kalign program Example : Returns : parameter string to be passed to kalign during align or profile_align Args : name of calling object =cut sub _setparams { my ($self) = @_; my ($attr, $value,$param_string); $param_string = ''; my $laststr; for $attr ( @KALIGN_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = lc $attr; $attr_key = ' -'.$attr_key; $param_string .= $attr_key .' '.$value; } for $attr ( @KALIGN_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; #put switches in format expected by tcoffee $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } # Set default output file if no explicit output file selected unless ($self->outfile_name ) { my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile_name($outfile); } $param_string .= " -out ".$self->outfile_name; if ($self->quiet() || $self->verbose < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 2> $null"; } return $param_string; } =head2 aformat Title : aformat Usage : my $alignmentformat = $self->aformat(); Function: Get/Set alignment format Returns : string Args : string =cut sub aformat{ my $self = shift; $self->{'_aformat'} = shift if @_; return $self->{'_aformat'}; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $kalign->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $kalign->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Alignment/Lagan.pm000066400000000000000000000464561342734133000240340ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Alignment::Lagan # # Please direct questions and support issues to # # Cared for by Bioperl # # Copyright Bioperl, Stephen Montgomery # # Special thanks to Jason Stajich. # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Lagan - Object for the local execution of the LAGAN suite of tools (including MLAGAN for multiple sequence alignments) =head1 SYNOPSIS use Bio::Tools::Run::Alignment::Lagan; @params = ('chaos' => "The contents of this string will be passed as args to chaos", #Read you chaos README file for more info/This functionality #has not been tested and will be integrated in future versions. 'order' => "\"-gs -7 -gc -2 -mt 2 -ms -1\"", #Where gap start penalty of- 7, gap continue of -2, match of 2, #and mismatch of -1. 'recurse' => "\"(12,25),(7,25),(4,30)"\", #A list of (wordlength,score cutoff) pairs to be used in the #recursive anchoring 'tree' => "\"(sample1 (sample2 sample3))"\", #Used by mlagan / tree can also be passed when calling mlagan directly #SCORING PARAMETERS FOR MLAGAN: 'match' => 12, 'mismatch' => -8, 'gapstart' => -50, 'gapend' => -50, 'gapcont' => -2, ); =head1 DESCRIPTION To run mlagan/lagan, you must have an environment variable that points to the executable directory with files lagan.pl etc. "LAGAN_DIR=/opt/lagan_executables/" Simply having the executables in your path is not supported because the executables themselves only work with the environment variable set. All lagan and mlagan parameters listed in their Readmes can be set except for the mfa flag which has been turned on by default to prevent parsing of the alignment format. TO USE LAGAN: my $lagan = Bio::Tools::Run::Alignment::Lagan->new(@params); my $report_out = $lagan->lagan($seq1, $seq2); A SimpleAlign object is returned. TO USE MLAGAN: my $lagan = Bio::Tools::Run::Alignment::Lagan->new(); my $tree = "(($seqname1 $seqname2) $seqname3)"; my @sequence_objs; #an array of bioperl Seq objects ##If you use an unblessed seq array my $seq_ref = \@sequence_objs; bless $seq_ref, "ARRAY"; my $report_out = $lagan->mlagan($seq_ref, $tree); A SimpleAlign object is returned Only basic mlagan/lagan functionality has been implemented due to the iterative development of their project. Future maintenance upgrades will include enhanced features and scoring. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Stephen Montgomery Email smontgom@bcgsc.bc.ca Genome Sciences Centre in beautiful Vancouver, British Columbia CANADA =head1 CONTRIBUTORS MLagan/Lagan is the hard work of Michael Brudno et al. Sendu Bala bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Lagan; use strict; use Bio::Root::IO; use Bio::Seq; use Bio::SeqIO; use Bio::AlignIO; use Bio::SimpleAlign; use File::Spec; use Bio::Matrix::IO; use Cwd; use base qw(Bio::Tools::Run::WrapperBase); our @LAGAN_PARAMS = qw(chaos order recurse mfa out lazy maskedonly usebounds rc translate draft info fastreject); our @OTHER_PARAMS = qw(outfile); our @LAGAN_SWITCHES = qw(silent quiet); our @MLAGAN_PARAMS = qw(nested postir translate lazy verbose tree match mismatch gapstart gapend gapcont out version); #Not all of these parameters are useful in this context, care #should be used in setting only standard ones #The LAGAN_DIR environment variable must be set our $PROGRAM_DIR = $ENV{'LAGAN_DIR'} || ''; sub new { my($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => [@LAGAN_PARAMS, @OTHER_PARAMS, @LAGAN_SWITCHES, @MLAGAN_PARAMS], -create => 1); my ($tfh, $tempfile) = $self->io->tempfile(); my $outfile = $self->out || $self->outfile || $tempfile; $self->out($outfile); close($tfh); undef $tfh; return $self; } =head2 lagan Runs the Lagan pairwise alignment algorithm Inputs should be two PrimarySeq objects. Returns an SimpleAlign object / preloaded with the tmp file of the Lagan multifasta output. =cut sub lagan { my ($self, $input1, $input2) = @_; $self->io->_io_cleanup(); my $executable = 'lagan.pl'; #my (undef, $tempfile) = $self->io->tempfile(); #$self->out($tempfile); my ($infile1, $infile2) = $self->_setinput($executable, $input1, $input2); my $lagan_report = &_generic_lagan( $self, $executable, $infile1, $infile2 ); } =head2 mlagan Runs the Mlagan multiple sequence alignment algorithm. Inputs should be an Array of Primary Seq objects and a Phylogenetic Tree in String format or as a Bio::Tree::TreeI compliant object. Returns an SimpleAlign object / preloaded with the tmp file of the Mlagan multifasta output. =cut sub mlagan { my ($self, $input1, $tree) = @_; $self->io->_io_cleanup(); my $executable = 'mlagan'; if ($tree && ref($tree) && $tree->isa('Bio::Tree::TreeI')) { # fiddle tree so mlagan will like it my %orig_ids; foreach my $node ($tree->get_nodes) { my $seq_id = $node->name('supplied'); $seq_id = $seq_id ? shift @{$seq_id} : ($node->node_name ? $node->node_name : $node->id); $orig_ids{$seq_id} = $node->id; $node->id($seq_id); } # convert to string my $tree_obj = $tree; $tree = $tree->simplify_to_leaves_string; # more fiddling $tree =~ s/ /_/g; $tree =~ s/"//g; $tree =~ s/,/ /g; # unfiddle the tree object foreach my $node ($tree_obj->get_nodes) { $node->id($orig_ids{$node->id}); } } my $infiles; ($infiles, $tree) = $self->_setinput($executable, $input1, $tree); my $lagan_report = &_generic_lagan ( $self, $executable, $infiles, $tree ); } =head2 nuc_matrix Title : nuc_matrix Usage : my $matrix_obj = $obj->nuc_matrix(); -or- $obj->nuc_matrix($matrix_obj); -or- $obj->nuc_matrix($matrix_file); Function: Get/set the substitution matrix for use by mlagan. By default the file $LAGAN_DIR/nucmatrix.txt is used by mlagan. By default this method returns a corresponding Matrix. Returns : Bio::Matrix::Mlagan object Args : none to get, OR to set: Bio::Matrix::MLagan object OR filename of an mlagan substitution matrix file NB: due to a bug in mlagan 2.0, the -nucmatrixfile option does not work, so this Bioperl wrapper is unable to simply point mlagan to your desired matrix file (or to a temp file generated from your matrix object). Instead the $LAGAN_DIR/nucmatrix.txt file must actually be replaced. This wrapper will make a back-up copy of that file, write the new file in its place, then revert things back to the way they were after the alignment has been produced. For this reason, $LAGAN_DIR must be writable, as must $LAGAN_DIR/nucmatrix.txt. =cut sub nuc_matrix { my ($self, $thing, $gap_open, $gap_continue) = @_; if ($thing) { if (-e $thing) { my $min = Bio::Matrix::IO->new(-format => 'mlagan', -file => $thing); $self->{_nuc_matrix} = $min->next_matrix; } elsif (ref($thing) && $thing->isa('Bio::Matrix::Mlagan')) { $self->{_nuc_matrix} = $thing; } else { $self->throw("Unknown kind of thing supplied, '$thing'"); } $self->{_nuc_matrix_set} = 1; } unless (defined $self->{_nuc_matrix}) { # read the program default file my $min = Bio::Matrix::IO->new(-format => 'mlagan', -file => File::Spec->catfile($PROGRAM_DIR, 'nucmatrix.txt')); $self->{_nuc_matrix} = $min->next_matrix; } $self->{_nuc_matrix_set} = 1 if defined wantarray; return $self->{_nuc_matrix}; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file(s) for Lagan executables Returns : name of files containing Lagan data input / or array of files and phylo tree for Mlagan data input =cut sub _setinput { my ($self, $executable, $input1, $input2) = @_; my ($fh, $infile1, $infile2, $temp1, $temp2, $seq1, $seq2); $self->io->_io_cleanup(); SWITCH: { if (ref($input1) =~ /ARRAY/i) { ##INPUTS TO MLAGAN / WILL hAVE TO BE CHANGED IF LAGAN EVER ##SUPPORTS MULTI-INPUT my @infilearr; foreach $seq1 (@$input1) { ($fh, $infile1) = $self->io->tempfile(); my $temp = Bio::SeqIO->new( -fh => $fh, -format => 'Fasta' ); unless ($seq1->isa("Bio::PrimarySeqI")) { return 0; } $temp->write_seq($seq1); close $fh; undef $fh; push @infilearr, $infile1; } $infile1 = \@infilearr; last SWITCH; } elsif ($input1->isa("Bio::PrimarySeqI")) { ##INPUTS TO LAGAN ($fh, $infile1) = $self->io->tempfile(); #Want to make sure their are no white spaces in sequence. #Happens if input1 is taken from an alignment. my $sequence = $input1->seq(); $sequence =~ s/\W+//g; $input1->seq($sequence); $temp1 = Bio::SeqIO->new( -fh => $fh, -format => 'Fasta' ); $temp1->write_seq($input1); close $fh; undef $fh; last SWITCH; } } SWITCH2: { if (ref($input2)) { if ($input2->isa("Bio::PrimarySeqI")) { ($fh, $infile2) = $self->io->tempfile(); #Want to make sure their are no white spaces in #sequence. Happens if input2 is taken from an #alignment. my $sequence = $input2->seq(); $sequence =~ s/\W+//g; $input2->seq($sequence); $temp2 = Bio::SeqIO->new( -fh => $fh, -format => 'Fasta' ); $temp2->write_seq($input2); close $fh; undef $fh; last SWITCH2; } } else { $infile2 = $input2; ##A tree as a scalar has been passed, pass it through } } return ($infile1, $infile2); } =head2 _generic_lagan Title : _generic_lagan Usage : internal function not called directly Returns : SimpleAlign object =cut sub _generic_lagan { my ($self, $executable, $input1, $input2) = @_; my $param_string = $self->_setparams($executable); my $lagan_report = &_runlagan($self, $executable, $param_string, $input1, $input2); } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for (m)Lagan program Returns : parameter string to be passed to Lagan Args : Reference to calling object and name of (m)Lagan executable =cut sub _setparams { my ($self, $executable) = @_; my (@execparams, $nucmatrixfile); if ($executable eq 'lagan.pl') { @execparams = @LAGAN_PARAMS; } elsif ($executable eq 'mlagan') { @execparams = @MLAGAN_PARAMS; if ($self->{_nuc_matrix_set}) { # we create this file on every call because we have no way of # knowing if user altered the matrix object (my $handle, $nucmatrixfile) = $self->io->tempfile(); my $mout = Bio::Matrix::IO->new(-format => 'mlagan', -fh => $handle); $mout->write_matrix($self->nuc_matrix); $self->{_nucmatrixfile} = $nucmatrixfile; } } ##EXPAND OTHER LAGAN SUITE PROGRAMS HERE my $param_string = $self->SUPER::_setparams(-params => [@execparams], -dash => 1); $param_string .= " -nucmatrixfile $nucmatrixfile" if $nucmatrixfile; return $param_string . " -mfa "; } =head2 _runlagan Title : _runlagan Usage : Internal function, not to be called directly Function: makes actual system call to (m)Lagan program Example : Returns : Report object in the SimpleAlign object =cut sub _runlagan { my ($self, $executable, $param_string, $input1, $input2) = @_; my ($lagan_obj, $exe); if ( ! ($exe = $self->executable($executable))) { return; } my $version = $self->version; my $command_string; if ($executable eq 'lagan.pl') { $command_string = $exe . " " . $input1 . " " . $input2 . $param_string; } if ($executable eq 'mlagan') { $command_string = $exe; foreach my $tempfile (@$input1) { $command_string .= " " . $tempfile; } if (defined $input2) { $command_string .= " -tree " . "\"" . $input2 . "\""; } $command_string .= " " . $param_string; my $matrix_file = $self->{_nucmatrixfile}; if ($version <= 3 && $matrix_file) { # mlagan 2.0 bug-workaround my $orig = File::Spec->catfile($PROGRAM_DIR, 'nucmatrix.txt'); -e $orig || $self->throw("Strange, $orig doesn't seem to exist"); system("cp $orig $orig.bk") && $self->throw("Backup of $orig failed: $!"); system("cp $matrix_file $orig") && $self->throw("Copy of $matrix_file -> $orig failed: $!"); } } if (($self->silent || $self->quiet) && ($^O !~ /os2|dos|MSWin32|amigaos/)) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $command_string .= " > $null 2> $null"; } # will do brute-force clean up of junk files generated by lagan my $cwd = cwd(); opendir(my $cwd_dir, $cwd) || $self->throw("Could not open the current directory '$cwd'!"); my %ok_files; foreach my $thing (readdir($cwd_dir)) { if ($thing =~ /anch/) { $ok_files{$thing} = 1; } } closedir($cwd_dir); $self->debug("$command_string\n"); my $status = system(($version <= 3 ? '_POSIX2_VERSION=1 ' : '').$command_string); # temporary hack whilst lagan script 'rechaos.pl' uses obsolete sort syntax if ($version <= 1 && $self->{_nucmatrixfile}) { my $orig = File::Spec->catfile($PROGRAM_DIR, 'nucmatrix.txt'); system("mv $orig.bk $orig") && $self->warn("Restore of $orig from $orig.bk failed: $!"); } opendir($cwd_dir, $cwd) || $self->throw("Could not open the current directory '$cwd'!"); foreach my $thing (readdir($cwd_dir)) { if ($thing =~ /anch/) { unlink($thing) unless $ok_files{$thing}; } } closedir($cwd_dir); my $outfile = $self->out(); my $align = Bio::AlignIO->new( '-file' => $outfile, '-format' => 'fasta' ); my $aln = $align->next_aln(); return $aln; } =head2 executable Title : executable Usage : my $exe = $lagan->executable('mlagan'); Function: Finds the full path to the 'lagan' executable Returns : string representing the full path to the exe Args : [optional] name of executable to set path to [optional] boolean flag whether or not warn when exe is not found Thanks to Jason Stajich for providing the framework for this subroutine =cut sub executable { my ($self, $exename, $exe, $warn) = @_; $exename = 'lagan.pl' unless defined $exename; if ( defined $exe && -x $exe ) { $self->{'_pathtoexe'}->{$exename} = $exe; } unless ( defined $self->{'_pathtoexe'}->{$exename} ) { my $f = $self->program_path($exename); $exe = $self->{'_pathtoexe'}->{$exename} = $f if(-e $f && -x $f ); unless( $exe ) { if ( ($exe = $self->io->exists_exe($exename)) && -x $exe ) { $self->{'_pathtoexe'}->{$exename} = $exe; } else { $self->warn("Cannot find executable for $exename") if $warn; $self->{'_pathtoexe'}->{$exename} = undef; } } } # even if its executable, we still need the environment variable to have # been set if (! $PROGRAM_DIR) { $self->warn("Environment variable LAGAN_DIR must be set, even if the lagan executables are in your path"); $self->{'_pathtoexe'}->{$exename} = undef; } return $self->{'_pathtoexe'}->{$exename}; } =head2 program_path Title : program_path Usage : my $path = $lagan->program_path(); Function: Builds path for executable Returns : string representing the full path to the exe Thanks to Jason Stajich for providing the framework for this subroutine =cut sub program_path { my ($self,$program_name) = @_; my @path; push @path, $self->program_dir if $self->program_dir; push @path, $program_name .($^O =~ /mswin/i ?'':''); # Option for Windows variants / None so far return Bio::Root::IO->catfile(@path); } =head2 program_dir Title : program_dir Usage : my $dir = $lagan->program_dir(); Function: Abstract get method for dir of program. To be implemented by wrapper. Returns : string representing program directory Thanks to Jason Stajich for providing the framework for this subroutine =cut sub program_dir { $PROGRAM_DIR; } =head2 version Title : version Usage : my $version = $lagan->version; Function: returns the program version Returns : number Args : none =cut sub version { my $self = shift; my $exe = $self->executable('mlagan') || return; open(my $VER, "$exe -version 2>&1 |") || die "Could not open command '$exe -version'\n"; my $version; while (my $line = <$VER>) { ($version) = $line =~ /(\d+\S+)/; } close($VER) || die "Could not complete command '$exe -version'\n"; return $version; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Alignment/MAFFT.pm000066400000000000000000000423211342734133000236320ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Alignment::MAFFT # # Please direct questions and support issues to # # Cared for by Jason Stajich # # Copyright Jason Stajich # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::MAFFT - run the MAFFT alignment tools =head1 SYNOPSIS # Build a MAFFT alignment factory $factory = Bio::Tools::Run::Alignment::MAFFT->new(@params); # Pass the factory a list of sequences to be aligned. $inputfilename = 't/cysprot.fa'; # $aln is a SimpleAlign object. $aln = $factory->align($inputfilename); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); #There are various additional options available. =head1 DESCRIPTION You can get MAFFT from L. "fftnsi" is the default method for Mafft version 4 in this implementation. See Bio::Tools::Run::Alignment::Clustalw for a description on how to specify parameters to the underlying alignment program. See the MAFFT manual page for a description of the MAFFT parameters. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/MailList.html - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Jason Stajich Email jason-at-bioperl.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::MAFFT; use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM %DEFAULTS @MAFFT4_PARAMS @MAFFT4_SWITCHES @OTHER_SWITCHES %OK_FIELD @MAFFT_ALN_METHODS @MAFFT6_PARAMS @MAFFT6_SWITCHES %OK_FIELD6 ); use strict; use Bio::Seq; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); BEGIN { %DEFAULTS = ( 'OUTPUT' => 'fasta', 'METHOD' => 'fftnsi', 'CYCLES' => 2); @MAFFT4_PARAMS =qw( METHOD CYCLES ); @MAFFT4_SWITCHES = qw( NJ ALL_POSITIVE); # NB: Mafft6 options are case-sensitive (eg. --lop and --LOP is different) @MAFFT6_PARAMS = qw( weighti retree maxiterate partsize groupsize op ep lop lep lexp LOP LEXP bl jtt tm aamatrix fmodel seed ); @MAFFT6_SWITCHES = qw( auto 6merpair globalpair localpair genafpair fastapair fft nofft noscore memsave parttree dpparttree fastaparttree clustalout inputorder reorder treeout nuc amino ); @OTHER_SWITCHES = qw(QUIET ALIGN OUTPUT OUTFILE); @MAFFT_ALN_METHODS = qw(fftnsi fftns nwnsi nwns fftnsrough nwnsrough); #@MAFFT6_ALN_METHODS = qw(linsi ginsi einsi fftnsi fftns nwnsi nwns) # Authorize attribute fields foreach my $attr ( @MAFFT4_SWITCHES,@MAFFT4_PARAMS,@OTHER_SWITCHES ) { $OK_FIELD{$attr}++; } foreach my $attr ( @MAFFT6_PARAMS, @MAFFT6_SWITCHES ) { $OK_FIELD6{$attr}++ } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'mafft'; } =head2 executable Title : executable Usage : my $exe = $blastfactory->executable('blastall'); Function: Finds the full path to the 'codeml' executable Returns : string representing the full path to the exe Args : [optional] name of executable to set path to [optional] boolean flag whether or not warn when exe is not found =cut sub executable { my ($self, $exename, $exe,$warn) = @_; $exename = $self->program_name unless (defined $exename ); if( defined $exe && -x $exe ) { $self->{'_pathtoexe'}->{$exename} = $exe; } unless( defined $self->{'_pathtoexe'}->{$exename} ) { my $f = $self->program_path($exename); $exe = $self->{'_pathtoexe'}->{$exename} = $f if(-e $f && -x $f ); # This is how I meant to split up these conditionals --jason # if exe is null we will execute this (handle the case where # PROGRAMDIR pointed to something invalid) unless( $exe ) { # we didn't find it in that last conditional if( ($exe = $self->io->exists_exe($exename)) && -x $exe ) { $self->{'_pathtoexe'}->{$exename} = $exe; } else { $self->warn("Cannot find executable for $exename") if $warn; $self->{'_pathtoexe'}->{$exename} = undef; } } } return $self->{'_pathtoexe'}->{$exename}; } =head2 program_path Title : program_path Usage : my $path = $factory->program_path(); Function: Builds path for executable Returns : string representing the full path to the exe Args : none =cut sub program_path { my ($self,$program_name) = @_; my @path; push @path, $self->program_dir if $self->program_dir; push @path, $program_name .($^O =~ /mswin/i ?'.exe':''); return Bio::Root::IO->catfile(@path); } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return File::Spec->rel2abs($ENV{MAFFTDIR}) if $ENV{MAFFTDIR}; } sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/); # don't want named parameters $self->$attr($value); } $self->output($DEFAULTS{'OUTPUT'}) unless( $self->output ); if ( ! $self->_version6 ) { $self->method($DEFAULTS{'METHOD'}) unless( $self->method ); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; # NB: Mafft6 options are case-sensitive if ( $self->_version6 ) { if ( $OK_FIELD6{ $attr } ) { # Don't want the attrs to clash with bioperl attributes $self->{version6attrs}{$attr} = shift if @_; return $self->{version6attrs}{$attr}; } } $attr = uc $attr; # aliasing $attr = 'OUTFILE' if $attr eq 'OUTFILE_NAME'; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysis run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return unless $exe = $self->executable; # this is a bit of a hack, but MAFFT is just a gawk script # so we are actually grepping the scriptfile # UPDATE (Torsten Seemann) # it now seems to be a 'sh' script and the format has changed # slightly. i've tried to make the change compatible with both... # version="v5.860 (2006/06/12)"; export version if( open(my $NAME, "grep 'export version' $exe | ") ) { while(<$NAME>) { if( /version.*?([\d.a-z]+)\s+/ ) { return $1; } } $self->warn("No version found"); close($NAME); } else { $self->warn("$!"); } return; } =head2 run Title : run Usage : my $output = $application->run(\@seqs); Function: Generic run of an application Returns : Bio::SimpleAlign object Args : array ref of Bio::PrimarySeqI objects OR filename of sequences to run with =cut sub run { my ($self,$seqs) = @_; return $self->align($seqs); } =head2 align Title : align Usage : $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); or $seq_array_ref = \@seq_array; # @seq_array is an array of Seq objs $aln = $factory->align($seq_array_ref); Function: Perform a multiple sequence alignment Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or a reference to an array of Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ($self,$input) = @_; # Create input file pointer $self->io->_io_cleanup(); my ($infilename,$type) = $self->_setinput($input); if (! $infilename) { $self->throw("Bad input data or less than 2 sequences in $input !"); } my ($param_string,$outstr) = $self->_setparams(); # run mafft return $self->_run($infilename, $param_string,$outstr); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to tcoffee program Example : Returns : nothing; tcoffee output is written to a temporary file OR specified output file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to tcoffee =cut sub _run { my ($self,$infilename,$paramstr,$outstr) = @_; my $commandstring = $self->executable()." $paramstr $infilename $outstr"; $self->debug( "mafft command = $commandstring \n"); my $status = system($commandstring); my $outfile = $self->outfile(); if( !-e $outfile || -z $outfile ) { $self->warn( "MAFFT call crashed: $? [command $commandstring]\n"); return; } my $in = Bio::AlignIO->new('-file' => $outfile, '-format' => $self->output); my $aln = $in->next_aln(); return $aln; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for mafft programs Example : Returns : name of file containing mafft data input Args : Seq or Align object reference or input file name =cut sub _setinput { my ($self,$input) = @_; my ($infilename, $seq, $temp, $tfh); if (! ref $input) { # check that file exists or throw $infilename = $input; unless (-e $input) {return 0;} return ($infilename); } elsif (ref($input) =~ /ARRAY/i ) { # $input may be an # array of BioSeq objects... # Open temporary file for both reading & writing of array ($tfh,$infilename) = $self->io->tempfile(); if( ! ref($input->[0]) ) { $self->warn("passed an array ref which did not contain objects to _setinput"); return; } elsif ( $input->[0]->isa('Bio::PrimarySeqI') ) { $temp = Bio::SeqIO->new('-fh' => $tfh, '-format' => 'fasta'); my $ct = 1; foreach $seq (@$input) { return 0 unless ( ref($seq) && $seq->isa("Bio::PrimarySeqI") ); if( ! defined $seq->display_id || $seq->display_id =~ /^\s+$/ ) { $seq->display_id( "Seq".$ct++); } $temp->write_seq($seq); } $temp->close(); undef $temp; close($tfh); $tfh = undef; } else { $self->warn( "got an array ref with 1st entry ". $input->[0]. " and don't know what to do with it\n"); } return ($infilename); } else { $self->warn("Got $input and don't know what to do with it\n"); } return 0; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for mafft program Example : Returns : parameter string to be passed to mafft program Args : name of calling object =cut sub _setparams { my ($self) = @_; my ($outfile,$param_string) = ('',''); # Set default output file if no explicit output file selected unless (defined($outfile = $self->outfile) ) { my $tfh; ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile($outfile); } my ($attr,$value); if ( $self->_version6 ) { for $attr ( @MAFFT6_SWITCHES) { $value = $self->$attr(); next unless defined $value; my $attr_key = lc $attr; #put switches in format expected by mafft $attr_key = ' --'.$attr_key; $param_string .= $attr_key ; } for $attr ( @MAFFT6_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = lc $attr; $attr_key = ' --'.$attr_key; $param_string .= $attr_key .' '.$value; } if ( ! $self->no_param_checks ) { my @incompatible = qw/auto 6merpair globalpair localpair genafpair fastapair/; my @set = grep { $self->$_ } @incompatible; if ( @set > 1 ) { $self->throw("You can't specify more than one of @set"); } } } else { for $attr ( @MAFFT4_SWITCHES) { $value = $self->$attr(); next unless defined $value; my $attr_key = lc $attr; #put switches in format expected by mafft $attr_key = ' --'.$attr_key; $param_string .= $attr_key ; } # Method is a version 4 option my $method = $self->method; $self->throw("no method ") unless defined $method; if( $method !~ /(rough|nsi)$/ && defined $self->cycles) { $param_string .= " ".$self->cycles; } } my $outputstr = " 1>$outfile" ; if ($self->quiet() || $self->verbose < 0) { $param_string .= " --quiet"; my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $outputstr .= " 2> $null"; } return ($param_string, $outputstr); } =head2 methods Title : methods Usage : my @methods = $self->methods() Function: Get/Set Alignment methods - NOT VALIDATED Returns : array of strings Args : arrayref of strings =cut sub methods { my ($self) = shift; return @MAFFT_ALN_METHODS; } =head2 _version6 Title : _version6 Usage : Internal function, not to be called directly Function: Check if the version of MAFFT is 6 Example : Returns : Boolean Args : None =cut sub _version6 { my $self = shift; if ( ! defined $self->{_version6} ) { my $version = $self->version || ''; if ( $version =~ /^v6/ ) { $self->{_version6} = 1; } else { $self->{_version6} = ''; } } return $self->{_version6}; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $mafft->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $mafft->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Alignment/MSAProbs.pm000077500000000000000000000452211342734133000244300ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Alignment::MSAProbs # # Please direct questions and support issues to # # Cared for by Jessen Bredeson # # Copyright Jessen Bredeson # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::MSAProbs - Object for the calculation of a multiple sequence alignment (MSA) from a set of unaligned sequences using the MSAProbs program =head1 SYNOPSIS # Build a msaprobs alignment factory $factory = Bio::Tools::Run::Alignment::MSAProbs->new(@params); # Pass the factory a list of sequences to be aligned. $inputfilename = 't/cysprot.fa'; # $aln is a SimpleAlign object. $aln = $factory->align($inputfilename); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); #There are various additional options and input formats available. #See the DESCRIPTION section that follows for additional details. =head1 DESCRIPTION MSAProbs is Liu, Schmidt, and Maskell's (2010) alignment program using HMM and partition function posterior probabilities. For more a more in-depth description see the original publication: Liu, Y., Schmidt, B., and Maskell, D. L. (2010) MSAProbs: multiple sequence alignment based on pair hidden Markov models and partition function posterior probabilities. I 26(16): 1958-1964 doi:10.1093/bioinformatics/btq338 -OR- http://bioinformatics.oxfordjournals.org/content/26/16/1958.abstract You can download the source code from http://sourceforge.net/projects/msaprobs/ It is recommended you use at least version 0.9; behaviour with earlier versions is questionable. =head2 Helping the module find your executable You will need to help MSAProbs to find the 'msaprobs' executable. This can be done in (at least) three ways: 1. Make sure the msaprobs executable is in your path (i.e. 'which msaprobs' returns a valid program) 2. define an environmental variable MSAPROBSDIR which points to a directory containing the 'msaprobs' app: In bash export MSAPROBSDIR=/home/progs/msaprobs or In csh/tcsh setenv MSAPROBSDIR /home/progs/msaprobs 3. include a definition of an environmental variable MSAPROBSDIR in every script that will BEGIN {$ENV{MSAPROBSDIR} = '/home/progs/msaprobs'; } use Bio::Tools::Run::Alignment::MSAProbs; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://bugzilla.open-bio.org/ =head1 AUTHOR - Jessen Bredeson Email jessenbredeson@berkeley.edu =head1 CONTRIBUTIONS This MSAProbs module was adapted from the Bio::Tools::Run::Alignment::Muscle module, written by Jason Stajich and almost all of the credit should be given to him. Email jason-at-bioperl-dot-org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::MSAProbs; use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM %DEFAULTS %MSAPROBS_PARAMS %MSAPROBS_SWITCHES %OK_FIELD ); use strict; use Bio::Seq; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::GuessSeqFormat; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); BEGIN { %DEFAULTS = ( 'QUIET' => 1, '_AFORMAT' => 'fasta', '_CONSISTENCY' => 2, '_ITERATIONS' => 10, '_CLUSTALW' => 0, '_ALIGNMENT_ORDER' => 0 ); %MSAPROBS_PARAMS = ( 'NUM_THREADS' => 'NUM_THREADS', 'CONSISTENCY' => 'C', 'ITERATIONS' => 'IR', 'ANNOT_FILE' => 'ANNOT' ); %MSAPROBS_SWITCHES = ( 'CLUSTALW' => 'CLUSTALW', 'ALIGNMENT_ORDER' => 'A' ); # Authorize attribute fields %OK_FIELD = map{ uc($_) => 1 } qw(INFILE OUTFILE VERBOSE QUIET VERSION), keys %MSAPROBS_PARAMS, keys %MSAPROBS_SWITCHES; } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'msaprobs'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{MSAPROBSDIR}) if $ENV{MSAPROBSDIR}; } =head2 version Title : version Usage : exit if $prog->version() < 0.9.4 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my( $exe,$version ); return unless $exe = $self->executable; my $string = `$exe -version 2>&1` ; $string =~ /MSAPROBS\s+VERSION\s+([\d\.]+)/i; $version =~ s/\.(\d+)$/$1/ if ($version = $1); return $version || undef; } =head2 new Title : new Usage : my $msaprobs = Bio::Tools::Run::Alignment::MSAProbs->new(); Function: Constructor Returns : Bio::Tools::Run::Alignment::MSAProbs Args : -outfile => $outname =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my( @msap_args, @obj_args, $field ); while( my $arg = shift @args ) { $field = uc $arg; $field =~ s/^-//; $arg = '-'.$arg if $arg !~ /^-/; $self->throw("Invalid argument: $field") unless $OK_FIELD{$field}; push @msap_args, lc($arg),shift @args; } map{ $self->{lc($_)} = $DEFAULTS{$_} } keys %DEFAULTS; $self->_set_from_args(\@msap_args, -create => 1, -case_sensitive => 1, -methods => [map{lc($_);} keys %OK_FIELD]); return $self; } =head2 run Title : run Usage : my $output = $application->run(\@seqs); Function: Generic run of an application Returns : Bio::SimpleAlign object Args : Arrayref of Bio::PrimarySeqI objects or a filename to run on =cut sub run { my( $self,$input ) = @_; $input ||= $self->infile; return $self->align($input); } =head2 align Title : align Usage : $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); or $seq_array_ref = \@seq_array; # @seq_array is array of Seq objs $aln = $factory->align($seq_array_ref); Function: Perform a multiple sequence alignment Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or a reference to an array of Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ($self,$input) = @_; # Create input file pointer $self->io->_io_cleanup(); my $infilename; if( defined($input) ) { $infilename = $self->_setinput($input); } elsif( defined($self->infile) ) { $infilename = $self->_setinput($self->infile); } else { $self->throw("No inputdata provided\n"); } unless( $infilename ) { $self->throw("Bad input data or less than 2 sequences in $infilename !"); } my $param_string = $self->_setparams(); # run msaprobs return &_run($self, $infilename, $param_string); } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 infile Title : infile Usage : $prog->infile($filename) Function: get/set the fasta (and only a fasta) file to run on or the array reference containing the Bio::SeqI objects Returns : name of input sequence file or object array ref Args : name of input sequence file or object array ref =cut =head2 outfile Title : outfile Usage : $prog->outfile($filename) Function: get/set the file to save output to Returns : outfile name if set Args : newvalue (optional) =cut =head2 annot_file Title : annot_file Usage : $prog->annot_file($filename) Function: get/set the file name to write the MSA annotation to Returns : filename or undef Args : filename (optional) =cut =head2 num_threads Title : num_threads Usage : $prog->num_threads($cores) Function: get/set number of cores on your machine Returns : integer Args : integer (optional; executable auto-detects) =cut =head2 consistency Title : consistency Usage : $prog->consistency($passes) Function: get/set the number of consistency transformation passes Returns : integer Args : integer 0..5, [default 2] (optional) =cut =head2 iterations Title : iterations Usage : $prog->iterations($passes) Function: get/set the number of iterative-refinement passes Returns : integer Args : integer 0..1000, [default 10] (optional) =cut =head2 alignment_order Title : alignment_order Usage : $prog->alignment_order($bool) Function: specify whether or not to output aligned sequences in alignment order, not input order Returns : boolean Args : boolean [default: off] (optional) =cut =head2 clustalw Title : clustalw Usage : $prog->clustalw($bool) Function: write output in clustalw format; makes no sense unless outfile() is also specified Returns : boolean Args : boolean [default: off] (optional) =cut =head1 Bio::Tools::Run::WrapperBase methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $msaprobs->outfile_name(); Function: Get the name of the output file from a run (if you wanted to do something special) Returns : string Args : none =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $msaprobs->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut =head1 Private Methods =cut =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to msaprobs program Example : Returns : nothing; msaprobs output is written to a temporary file OR specified output file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to msaprobs =cut sub _run { my ($self,$infilename,$params) = @_; my $commandstring = $self->executable.' '.$infilename.$params; $self->debug( "msaprobs command = $commandstring \n"); my $status = system($commandstring); my $outfile = $self->outfile_name; if( !-s $outfile ) { $self->warn( "MSAProbs call crashed: $? [command $commandstring]\n"); return undef; } if( $self->clustalw ){ $outfile = $self->_clustalize($outfile); $self->aformat('clustalw'); } my $in = Bio::AlignIO->new( '-file' => $outfile, '-format' => $self->aformat, '-displayname_flat' => 1 ); my $aln = $in->next_aln(); undef $in; return $aln; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for msaprobs program Example : Returns : name of file containing msaprobs data input AND Args : Arrayref of Seqs or input file name =cut sub _setinput { my( $self,$input ) = @_; my( $infilename,$outtemp,$tfh,@sequences ); if (! ref $input) { # check that file exists or throw return unless (-s $input && -r $input); # let's peek and guess $infilename = $input; open(IN,$input) || $self->throw("Cannot open $input"); my $header; while( defined ($header = ) ) { last if $header !~ /^\s+$/; } close(IN); $header =~ /^>\s*\S+/ || $self->throw("Need to provide a FASTA-formatted file to msaprobs!"); my $inseqio = Bio::SeqIO->new( -file => $input, -format => 'fasta' ); while( my $seq = $inseqio->next_seq ){ push @sequences, $seq; } undef $inseqio; # have to check each seq for terminal '*', so # continue below and write clean output to temp file }elsif( ref($input) =~ /ARRAY/i ){ # $input may be an # array of BioSeq objects... # Open temporary file for both reading & writing of array if( ! ref($input->[0]) ) { $self->warn("passed an array ref which did not contain objects to _setinput"); return; }elsif( $input->[0]->isa('Bio::PrimarySeqI') ){ @sequences = @$input; }else{ $self->warn( "got an array ref with 1st entry ". $input->[0]. " and don't know what to do with it\n"); return; } }else{ $self->warn("Got $input and don't know what to do with it\n"); return; } ($tfh,$infilename) = $self->io->tempfile(); $outtemp = Bio::SeqIO->new('-fh' => $tfh, '-format' => 'fasta'); my( @out,$string ); my $ct = 1; while( my $seq = shift @sequences){ return unless ( ref($seq) && $seq->isa("Bio::PrimarySeqI") ); if( ! defined $seq->display_id || $seq->display_id =~ /^\s+$/){ $seq->display_id( "Seq".$ct++ ); } $string = $seq->seq; $string =~ s/\*$//; $seq->seq($string); if( $string =~ tr/~.-/~.-/ ){ $self->warn("These sequences may have already been aligned!"); } push @out, $seq; } $outtemp->write_seq(@out); $outtemp->close(); undef $outtemp; close($tfh); $tfh = undef; return $infilename; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for msaprobs program Example : Returns : parameter string to be passed to msaprobs during align Args : name of calling object =cut sub _setparams { my ($self) = @_; my ($attr,$method,$value,$param_string); $param_string = ''; unless( defined $self->outfile ){ $self->aformat($DEFAULTS{'AFORMAT'}); $self->clustalw(0); } #put switches/params in format expected by MSAProbs for $attr ( keys %MSAPROBS_PARAMS ){ $method = lc $attr; $value = $self->$method(); next unless (defined $value); my $attr_key = lc $MSAPROBS_PARAMS{$attr}; $attr_key = ' -'.$attr_key; $param_string .= $attr_key.' '.$value; } for $attr ( keys %MSAPROBS_SWITCHES ){ $method = lc $attr; $value = $self->$method(); next unless $value; my $attr_key = lc $MSAPROBS_SWITCHES{$attr}; $attr_key = ' -'.$attr_key; $param_string .= $attr_key; } # Set default output file if no explicit file specified # or if a clustalw-formatted file is desired... if( $self->clustalw || ! $self->outfile ) { my ($tfh, $outfile) = $self->io->tempfile(-dir => $self->tempdir); close($tfh); undef $tfh; $self->outfile_name($outfile); }else{ $self->outfile_name($self->outfile); } my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= ' -v' if $self->verbose > 0; $param_string .= ' >'.$self->outfile_name; $param_string .= " 2>$null" if $self->quiet && $self->verbose < 1; $self->arguments($param_string); return $param_string; } =head2 aformat Title : aformat Usage : my $alignmentformat = $self->aformat(); Function: Get/Set alignment format Returns : string Args : string =cut sub aformat{ my $self = shift; $self->{'_aformat'} = shift if @_; return $self->{'_aformat'}; } sub _clustalize { my $self = shift; my $infile = shift; my $outfile = $self->outfile; local $/ = "\n"; my( $in,$out,$firstline,$line ); $in = Bio::Root::IO->new(-file => $infile); $out = Bio::Root::IO->new(-file => '>'.$outfile); while( defined( $firstline = $in->_readline )) { last if $firstline !~ /^\s*$/; } $in->_pushback('CLUSTALW format, '.$firstline); while( defined( $line = $in->_readline )) { $out->_print( $line ); } $out->close(); $in->close(); undef $out; undef $in; $self->debug($outfile); return $outfile if -s $outfile; } 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Alignment/Muscle.pm000066400000000000000000000417471342734133000242400ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Alignment::Muscle # # Please direct questions and support issues to # # Copyright Jason Stajich # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Muscle - Object for the calculation of an iterative multiple sequence alignment from a set of unaligned sequences or alignments using the MUSCLE program =head1 SYNOPSIS # Build a muscle alignment factory $factory = Bio::Tools::Run::Alignment::Muscle->new(@params); # Pass the factory a list of sequences to be aligned. $inputfilename = 't/cysprot.fa'; # $aln is a SimpleAlign object. $aln = $factory->align($inputfilename); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); # Or one can pass the factory a pair of (sub)alignments #to be aligned against each other, e.g.: #There are various additional options and input formats available. #See the DESCRIPTION section that follows for additional details. =head1 DESCRIPTION Muscle is Robert Edgar's progressive alignment program. You can get it and see information about it at this URL http://www.drive5.com/muscle It is recommended you use at least version 3.6. Behaviour with earlier versions is questionable. =head2 Helping the module find your executable You will need to enable Muscle to find the muscle program. This can be done in (at least) three ways: 1. Make sure the muscle executable is in your path (i.e. 'which muscle' returns a valid program 2. define an environmental variable MUSCLEDIR which points to a directory containing the 'muscle' app: In bash export MUSCLEDIR=/home/progs/muscle or In csh/tcsh setenv MUSCLEDIR /home/progs/muscle 3. include a definition of an environmental variable MUSCLEDIR in every script that will BEGIN {$ENV{MUSCLEDIR} = '/home/progs/muscle'; } use Bio::Tools::Run::Alignment::Muscle; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Jason Stajich Email jason-at-bioperl-dot-org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Muscle; use strict; use Bio::Seq; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::IO; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); our %DEFAULTS = ( 'AFORMAT' => 'fasta' ); our @MUSCLE_PARAMS = qw(in out tree1 log loga scorefile gapopen seqtype maxmb maxhours maxiters kband in1 in2 usetree usetree_nowarn weight1 weight2 smoothwindow SUEFF smoothscoreceil root1 root2 refinewindow physout phyiout objscore minsmoothscore minbestcolscore hydrofactor hydro anchorspacing center cluster1 cluster2 clwout diagbreak diaglength diagmargin distance1 distance2); our @MUSCLE_SWITCHES = qw(quiet verbose diags refine stable group clw clwstrict msf brenner cluster dimer fasta html le anchors noanchors phyi phys profile refinew sp spscore spn sv); our $PROGRAM_NAME = 'muscle'; our $PROGRAM_DIR = Bio::Root::IO->catfile($ENV{MUSCLEDIR}) if $ENV{MUSCLEDIR}; =head2 new Title : new Usage : my $muscle = Bio::Tools::Run::Alignment::Muscle->new(); Function: Constructor Returns : Bio::Tools::Run::Alignment::Muscle Args : -outfile_name => $outname =cut sub new { my ( $class, @args ) = @_; my $self = $class->SUPER::new(@args); $self->aformat( $DEFAULTS{'AFORMAT'} ); $self->_set_from_args( \@args, -methods => [ @MUSCLE_PARAMS, @MUSCLE_SWITCHES ], -create => 1 ); my ($out) = $self->SUPER::_rearrange( [qw(OUTFILE_NAME)], @args ); $self->outfile_name( $out || '' ); $self->aformat('msf') if $self->msf; $self->aformat('clustalw') if $self->clw || $self->clwstrict; if ( defined $self->out ) { $self->outfile_name( $self->out ); } return $self; } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return $PROGRAM_DIR; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe 2>&1`; $string =~ /MUSCLE\s+v(\d+\.\d+)/m; return $1 || undef; } =head2 run Title : run Usage : my $output = $application->run(\@seqs); Function: Generic run of an application Returns : Bio::SimpleAlign object Args : Arrayref of Bio::PrimarySeqI objects or a filename to run on =cut sub run { my $self = shift; return $self->align(shift); } =head2 align Title : align Usage : $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); or $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); Function: Perform a multiple sequence alignment Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. Throws an exception if argument is not either a string (e.g. a filename) or a reference to an array of Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ( $self, $input ) = @_; # Create input file pointer $self->io->_io_cleanup(); my $infilename; if ( defined $input ) { $infilename = $self->_setinput($input); } elsif ( defined $self->in ) { $infilename = $self->_setinput( $self->in ); } else { $self->throw("No inputdata provided\n"); } if ( !$infilename ) { $self->throw("Bad input data or less than 2 sequences in $input !"); } my $param_string = $self->_setparams(); # run muscle return &_run( $self, $infilename, $param_string ); } =head2 run_profile Title : run_profile Usage : $alnfilename = /t/data/cysprot.msa'; $seqsfilename = 't/data/cysprot.fa'; $aln = $factory->run_profile($alnfilename,$seqsfilename); Function: Perform a profile alignment on a MSA to include more seqs Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing the fasta MSA and name of a file containing a set of unaligned fasta sequences Comments: This only works for muscle version 3.52. Some early versions of the 3.6 sources had a bug that caused a segfault with -profile. The attached should fix it, if not let Bob Edgar know. =cut sub run_profile { my ( $self, $alnfilename, $seqsfilename ) = @_; # Create input file pointer $self->io->_io_cleanup(); if ( $self->version ne '3.52' ) { $self->throw("profile does not work for this version of muscle\n"); } my $infilename; if ( defined $alnfilename ) { if ( !ref $alnfilename ) { # check that file exists or throw $infilename = $alnfilename; unless ( -e $infilename ) { return 0; } # let's peek and guess open( IN, $infilename ) || $self->throw("Cannot open $infilename"); my $header; while ( defined( $header = ) ) { last if $header !~ /^\s+$/; } close(IN); if ( $header !~ /^>\s*\S+/ ) { $self->throw( "Need to provide a FASTA format file to muscle profile!"); } } } else { $self->throw("No inputdata provided\n"); } if ( !$infilename ) { $self->throw( "Bad input data or less than 2 sequences in $infilename !"); } if ( defined $seqsfilename ) { if ( !ref $seqsfilename ) { # check that file exists or throw $infilename = $seqsfilename; unless ( -e $infilename ) { return 0; } # let's peek and guess open( IN, $infilename ) || $self->throw("Cannot open $infilename"); my $header; while ( defined( $header = ) ) { last if $header !~ /^\s+$/; } close(IN); if ( $header !~ /^>\s*\S+/ ) { $self->throw( "Need to provide a FASTA format file to muscle profile!"); } } } else { $self->throw("No inputdata provided\n"); } if ( !$infilename ) { $self->throw( "Bad input data or less than 2 sequences in $infilename !"); } my $param_string = $self->_setparams(); # run muscle $self->{_profile} = 1; return &_run( $self, "$alnfilename -in2 $seqsfilename", $param_string ); } =head2 aformat Title : aformat Usage : my $alignmentformat = $self->aformat(); Function: Get/Set alignment format Returns : string Args : string =cut sub aformat { my $self = shift; $self->{'_aformat'} = shift if @_; return $self->{'_aformat'}; } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to muscle program Example : Returns : nothing; muscle output is written to a temporary file OR specified output file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to muscle =cut sub _run { my ( $self, $infilename, $params ) = @_; my $commandstring; if ( $self->{_profile} ) { $commandstring = $self->executable . " -profile -in1 $infilename $params"; $self->{_profile} = 0; } else { $commandstring = $self->executable . " -in $infilename $params"; } $self->debug("muscle command = $commandstring \n"); my $status = system($commandstring); my $outfile = $self->outfile_name(); if ( !-e $outfile || -z $outfile ) { $self->warn("Muscle call crashed: $? [command $commandstring]\n"); return undef; } my $in = Bio::AlignIO->new( '-file' => $outfile, '-format' => $self->aformat ); my $aln = $in->next_aln(); return $aln; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for muscle program Example : Returns : name of file containing muscle data input AND Args : Arrayref of Seqs or input file name =cut sub _setinput { my ( $self, $input ) = @_; my ( $infilename, $seq, $temp, $tfh ); if ( !ref $input ) { # check that file exists or throw $infilename = $input; unless ( -e $input ) { return 0; } # let's peek and guess open( IN, $infilename ) || $self->throw("Cannot open $infilename"); my $header; while ( defined( $header = ) ) { last if $header !~ /^\s+$/; } close(IN); if ( $header !~ /^>\s*\S+/ ) { $self->throw("Need to provide a FASTA format file to muscle!"); } return ($infilename); } elsif ( ref($input) =~ /ARRAY/i ) { # $input may be an array of BioSeq objects... # Open temporary file for both reading & writing of array ( $tfh, $infilename ) = $self->io->tempfile(); if ( !ref( $input->[0] ) ) { $self->warn( "passed an array ref which did not contain objects to _setinput" ); return undef; } elsif ( $input->[0]->isa('Bio::PrimarySeqI') ) { $temp = Bio::SeqIO->new( '-fh' => $tfh, '-format' => 'fasta' ); my $ct = 1; foreach $seq (@$input) { return 0 unless ( ref($seq) && $seq->isa("Bio::PrimarySeqI") ); if ( !defined $seq->display_id || $seq->display_id =~ /^\s+$/ ) { $seq->display_id( "Seq" . $ct++ ); } $temp->write_seq($seq); } $temp->close(); undef $temp; close($tfh); $tfh = undef; } else { $self->warn( "got an array ref with 1st entry " . $input->[0] . " and don't know what to do with it\n" ); } return ($infilename); } else { $self->warn("Got $input and don't know what to do with it\n"); } return 0; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for muscle program Example : Returns : parameter string to be passed to muscle during align or profile_align Args : name of calling object =cut sub _setparams { my ($self) = @_; my ( $attr, $value, $param_string ); $param_string = ''; my $laststr; for $attr (@MUSCLE_PARAMS) { $value = $self->$attr(); next unless ( defined $value ); my $attr_key = lc $attr; $attr_key = ' -' . $attr_key; $param_string .= $attr_key . ' ' . $value; } for $attr (@MUSCLE_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; # put switches in format expected by tcoffee $attr_key = ' -' . $attr_key; $param_string .= $attr_key; } # Set default output file if no explicit output file selected unless ( $self->outfile_name ) { my ( $tfh, $outfile ) = $self->io->tempfile( -dir => $self->tempdir() ); close($tfh); undef $tfh; $self->outfile_name($outfile); } $param_string .= " -out " . $self->outfile_name; if ( $self->quiet() || $self->verbose < 0 ) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 2> $null"; } return $param_string; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $muscle->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $muscle->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy __END__ bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Alignment/Pal2Nal.pm000077500000000000000000000216341342734133000242350ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Alignment::Pal2Nal # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Pal2Nal - Wrapper for Pal2Nal =head1 SYNOPSIS use Bio::Tools::Run::Alignment::Pal2Nal; # Make a Pal2Nal factory $factory = Bio::Tools::Run::Alignment::Pal2Nal->new(); # Run Pal2Nal with a protein alignment file and a multi-fasta nucleotide # file my $aln = $factory->run($protein_alignfilename, $nucleotide_filename); # or with Bioperl objects $aln = $factory->run($protein_bio_simplalign, [$nucleotide_bio_seq1, $nucleotide_bio_seq2]); # combinations of files/ objects are possible # $aln isa Bio::SimpleAlign of the nucleotide sequences aligned according to # the protein alignment =head1 DESCRIPTION This is a wrapper for running the Pal2Nal perl script by Mikita Suyama. You can get details here: http://coot.embl.de/pal2nal/. Pal2Nal is used for aligning a set of nucleotide sequences based on an alignment of their translations. You can try supplying normal pal2nal command-line arguments to new(), eg. new() or calling arg-named methods (excluding the initial hyphen, eg. $factory->(1) to set the - arg). You will need to enable this Pal2Nal wrapper to find the pal2nal.pl script. This can be done in (at least) three ways: 1. Make sure the script is in your path. 2. Define an environmental variable PAL2NALDIR which is a directory which contains the script: In bash: export PAL2NALDIR=/home/username/pal2nal/ In csh/tcsh: setenv PAL2NALDIR /home/username/pal2nal 3. Include a definition of an environmental variable PAL2NALDIR in every script that will use this Pal2Nal wrapper module, e.g.: BEGIN { $ENV{PAL2NALDIR} = '/home/username/pal2nal/' } use Bio::Tools::Run::Alignment::Pal2Nal; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Pal2Nal; use strict; use Bio::AlignIO; use Bio::SeqIO; use base qw(Bio::Tools::Run::Phylo::PhyloBase); our $PROGRAM_NAME = 'pal2nal.pl'; our $PROGRAM_DIR = $ENV{'PAL2NALDIR'}; # methods for the pal2nal args we support our @PARAMS = qw(codontable); our @SWITCHES = qw(blockonly nogap nomismatch); # just to be explicit, args we don't support (yet) or we handle ourselves our @UNSUPPORTED = qw(output html h nostderr); =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 new Title : new Usage : $factory = Bio::Tools::Run::Alignment::Pal2Nal->new() Function: creates a new Pal2Nal factory. Returns : Bio::Tools::Run::Alignment::Pal2Nal Args : Most options understood by pal2nal.pl can be supplied as key => value pairs. These options can NOT be used with this wrapper: -output -html -h -nostderr =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => [@PARAMS, @SWITCHES, 'quiet'], -create => 1); return $self; } =head2 run Title : run Usage : $result = $factory->run($protein_align_file, $multi_fasta_nucleotide); -or- $result = $factory->run($prot_align_object, [$bioseq_object1, ...]); Function: Runs pal2nal on a protein alignment and set of nucleotide sequences. Returns : Bio::SimpleAlign; Args : The first argument represents a protein alignment, the second argument a set of nucleotide sequences. The alignment can be provided as an alignment file readable by Bio::AlignIO, or a Bio::Align::AlignI compliant object (eg. a Bio::SimpleAlign). The nucleotide sequences can be provided as a single filename of a fasta file containing multiple nucleotide sequences, or an array ref of filenames, each file containing one sequence. Alternatively, an array ref of Bio::PrimarySeqI compliant objects can be supplied. In all cases, the protein alignment sequence names must correspond to the names of the supplied nucleotide sequences. =cut sub run { my ($self, $aln, $nucs) = @_; ($aln && $nucs) || $self->throw("alignment and nucleotides must be supplied"); $aln = $self->_alignment($aln); # gaps must be -, not . my $fixed_aln = Bio::SimpleAlign->new(); foreach my $seq ($aln->each_seq) { my $str = $seq->seq; $str =~ s/\./-/g; $fixed_aln->add_seq(Bio::LocatableSeq->new(-id => $seq->id, -seq => $str)); } $self->_alignment($fixed_aln); my $nucs_file; if (-e $nucs) { $nucs_file = $nucs; } elsif (ref($nucs) eq 'ARRAY') { (my $tempfh, $nucs_file) = $self->io->tempfile('-dir' => $self->tempdir(), UNLINK => ($self->save_tempfiles ? 0 : 1)); close($tempfh); my $sout = Bio::SeqIO->new(-file => ">".$nucs_file, -format => 'fasta'); foreach my $nuc (@{$nucs}) { if (-e $nuc) { my $sin = Bio::SeqIO->new(-file => $nuc); while (my $nuc_seq = $sin->next_seq) { $sout->write_seq($nuc_seq); } } elsif (ref($nuc) && $nuc->isa('Bio::PrimarySeqI')) { $sout->write_seq($nuc); } else { $self->throw("Don't understand nucleotide argument '$nuc'"); } } } else { $self->throw("Don't understand nucleotide argument '$nucs'"); } return $self->_run($nucs_file); } sub _run { my ($self, $nucs_file) = @_; my $exe = $self->executable || return; my $aln_file = $self->_write_alignment; my ($rfh, $result_file) = $self->io->tempfile('-dir' => $self->tempdir(), UNLINK => ($self->save_tempfiles ? 0 : 1)); my ($efh, $error_file) = $self->io->tempfile('-dir' => $self->tempdir(), UNLINK => ($self->save_tempfiles ? 0 : 1)); close($rfh); undef $rfh; close($efh); undef $efh; my $command = $exe.$self->_setparams($aln_file, $nucs_file, $result_file, $error_file); $self->debug("pal2nal command = $command\n"); system($command) && $self->throw("pal2nal call ($command) failed: $! | $?"); open(my $errfh, '<', $error_file); my $errors; while (<$errfh>) { $errors .= $_; } close($errfh); $self->throw("pal2nal call ($command) had errors:\n$errors") if $errors; my $ain = Bio::AlignIO->new(-file => $result_file, -format => 'fasta'); my $aln = $ain->next_aln; $ain->close; return $aln; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : alignment and tree file names =cut sub _setparams { my ($self, $aln_file, $nucs_file, $result_file, $error_file) = @_; my $param_string = ' '.$aln_file; $param_string .= ' '.$nucs_file; $param_string .= $self->SUPER::_setparams(-params => \@PARAMS, -switches => \@SWITCHES, -dash => 1); $param_string .= ' -output fasta'; $param_string .= " > $result_file 2> $error_file"; return $param_string; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Alignment/Probalign.pm000066400000000000000000000334161342734133000247170ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Alignment::Probalign # # Please direct questions and support issues to # # Cared for by Albert Vilella # # # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Probalign - Object for the calculation of a multiple sequence alignment from a set of unaligned sequences or alignments using the Probalign program =head1 SYNOPSIS # Build a muscle alignment factory $factory = Bio::Tools::Run::Alignment::Probalign->new(@params); # Pass the factory a list of sequences to be aligned. $inputfilename = 't/cysprot.fa'; # $aln is a SimpleAlign object. $aln = $factory->align($inputfilename); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); # Or one can pass the factory a pair of (sub)alignments #to be aligned against each other, e.g.: #There are various additional options and input formats available. #See the DESCRIPTION section that follows for additional details. $factory = Bio::Tools::Run::Alignment::Probalign->new(); $factory->outfile_name("$dir/$subdir/$outdir/outfile.afa"); $aln = $factory->align($seq_array_ref); =head1 DESCRIPTION Probalign: multiple sequence alignment using partition function posterior probabilities. Probalign uses partition function posterior probability estimates to compute maximum expected accuracy multiple sequence alignments. You can get it and see information about it at this URL http://www.cs.njit.edu/usman/probalign =head2 Helping the module find your executable You will need to enable Probalign to find the probalign program. This can be done in (at least) three ways: 1. Make sure the probalign executable is in your path (i.e. 'which probalign' returns a valid program 2. define an environmental variable PROBALIGNDIR which points to a directory containing the 'probalign' app: In bash export PROBALIGNDIR=/home/progs/probalign or In csh/tcsh setenv PROBALIGNDIR /home/progs/probalign 3. include a definition of an environmental variable PROBALIGNDIR in every script that will BEGIN {$ENV{PROBALIGNDIR} = '/home/progs/probalign'; } use Bio::Tools::Run::Alignment::Probalign; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Probalign; use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM %DEFAULTS @PROBALIGN_PARAMS @PROBALIGN_SWITCHES @OTHER_SWITCHES %OK_FIELD ); use strict; use Bio::Seq; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); BEGIN { %DEFAULTS = ( 'AFORMAT' => 'fasta' ); @PROBALIGN_PARAMS = qw (TEMPERATURE SCORE_MATRIX GAP-OPEN GAP-EXTENSION); @PROBALIGN_SWITCHES = qw(CLUSTALW VERBOSE ALIGNMENT-ORDER NUC PROT); @OTHER_SWITCHES = qw(); # Authorize attribute fields foreach my $attr ( @PROBALIGN_PARAMS, @PROBALIGN_SWITCHES, @OTHER_SWITCHES ) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'probalign'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PROBALIGNDIR}) if $ENV{PROBALIGNDIR}; } =head2 new Title : new Usage : my $probalign = Bio::Tools::Run::Alignment::Probalign->new(); Function: Constructor Returns : Bio::Tools::Run::Alignment::Probalign Args : -outfile_name => $outname =cut sub new{ my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($on) = $self->SUPER::_rearrange([qw(OUTFILE_NAME)], @args); $self->outfile_name($on || ''); my ($attr, $value); $self->aformat($DEFAULTS{'AFORMAT'}); while ( @args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/); # don't want named parameters $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; # aliasing $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe 2>&1` ; #PROBALIGN version 1.09 - align multiple protein sequences and print to standard output $string =~ /PROBALIGN\s+Beta\s+Version\s+(\d+\.\d+)/m; return $1 || undef; } =head2 run Title : run Usage : my $output = $application->run(\@seqs); Function: Generic run of an application Returns : Bio::SimpleAlign object Args : Arrayref of Bio::PrimarySeqI objects or a filename to run on =cut sub run { my $self = shift; return $self->align(shift); } =head2 align Title : align Usage : $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); or $seq_array_ref = \@seq_array; # @seq_array is array of Seq objs $aln = $factory->align($seq_array_ref); Function: Perform a multiple sequence alignment Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or a reference to an array of Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ($self,$input) = @_; # Create input file pointer $self->io->_io_cleanup(); my ($infilename) = $self->_setinput($input); if (! $infilename) { $self->throw("Bad input data or less than 2 sequences in $input !"); } my $param_string = $self->_setparams(); # run probalign return &_run($self, $infilename, $param_string); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to probalign program Example : Returns : nothing; probalign output is written to a temporary file OR specified output file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to probalign =cut sub _run { my ($self,$infilename,$params) = @_; my $commandstring = $self->executable." $infilename $params"; $self->debug( "probalign command = $commandstring \n"); my $status = system($commandstring); my $outfile = $self->outfile_name(); if( !-e $outfile || -z $outfile ) { $self->warn( "Probalign call crashed: $? [command $commandstring]\n"); return undef; } my $in = Bio::AlignIO->new('-file' => $outfile, '-format' => $self->aformat); my $aln = $in->next_aln(); return $aln; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for probalign program Example : Returns : name of file containing probalign data input AND Args : Arrayref of Seqs or input file name =cut sub _setinput { my ($self,$input) = @_; my ($infilename, $seq, $temp, $tfh); if (! ref $input) { # check that file exists or throw $infilename = $input; unless (-e $input) {return 0;} # let's peek and guess open(IN,$infilename) || $self->throw("Cannot open $infilename"); my $header; while( defined ($header = ) ) { last if $header !~ /^\s+$/; } close(IN); if ( $header !~ /^>\s*\S+/ ){ $self->throw("Need to provide a FASTA format file to probalign!"); } return ($infilename); } elsif (ref($input) =~ /ARRAY/i ) { # $input may be an # array of BioSeq objects... # Open temporary file for both reading & writing of array ($tfh,$infilename) = $self->io->tempfile(); if( ! ref($input->[0]) ) { $self->warn("passed an array ref which did not contain objects to _setinput"); return undef; } elsif( $input->[0]->isa('Bio::PrimarySeqI') ) { $temp = Bio::SeqIO->new('-fh' => $tfh, '-format' => 'fasta'); my $ct = 1; foreach $seq (@$input) { return 0 unless ( ref($seq) && $seq->isa("Bio::PrimarySeqI") ); if( ! defined $seq->display_id || $seq->display_id =~ /^\s+$/) { $seq->display_id( "Seq".$ct++); } $temp->write_seq($seq); } $temp->close(); undef $temp; close($tfh); $tfh = undef; } else { $self->warn( "got an array ref with 1st entry ". $input->[0]. " and don't know what to do with it\n"); } return ($infilename); } else { $self->warn("Got $input and don't know what to do with it\n"); } return 0; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for probalign program Example : Returns : parameter string to be passed to probalign during align or profile_align Args : name of calling object =cut sub _setparams { my ($self) = @_; my ($attr, $value,$param_string); $param_string = ''; my $laststr; for $attr ( @PROBALIGN_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = lc $attr; $attr_key = ' --'.$attr_key unless ($attr eq 'ANNOT'); $attr_key = ' -'.$attr_key if ($attr eq 'ANNOT'); $param_string .= $attr_key .' '.$value; } for $attr ( @PROBALIGN_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; #put switches in format expected by tcoffee $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } # Set default output file if no explicit output file selected unless ($self->outfile_name ) { my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile_name($outfile); } #FIXME: This may be only for *nixes. Double check in other OSes $param_string .= " > ".$self->outfile_name; if ($self->verbose < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 2> $null"; } return $param_string; } =head2 aformat Title : aformat Usage : my $alignmentformat = $self->aformat(); Function: Get/Set alignment format Returns : string Args : string =cut sub aformat{ my $self = shift; $self->{'_aformat'} = shift if @_; return $self->{'_aformat'}; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $probalign->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $probalign->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Alignment/Probcons.pm000066400000000000000000000355541342734133000245740ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Alignment::Probcons # # Please direct questions and support issues to # # Cared for by Albert Vilella # # # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Probcons - Object for the calculation of an iterative multiple sequence alignment from a set of unaligned sequences or alignments using the Probcons program =head1 SYNOPSIS # Build a muscle alignment factory $factory = Bio::Tools::Run::Alignment::Probcons->new(@params); # Pass the factory a list of sequences to be aligned. $inputfilename = 't/cysprot.fa'; # $aln is a SimpleAlign object. $aln = $factory->align($inputfilename); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); # Or one can pass the factory a pair of (sub)alignments #to be aligned against each other, e.g.: #There are various additional options and input formats available. #See the DESCRIPTION section that follows for additional details. #To run probcons with training, try something like: #First round to generate train.params $factory = Bio::Tools::Run::Alignment::Probcons->new ( 'iterative-refinement' => '1000', 'consistency' => '5', 'pre-training' => '20', 'emissions' => '', 'verbose' => '', 'train' => "$dir/$subdir/$outdir/train.params", ); $factory->outfile_name("$dir/$subdir/$outdir/train.params"); #Second round to use train.params to get a high qual alignment $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); $aln = ''; $factory = ''; $factory = Bio::Tools::Run::Alignment::Probcons->new ( 'iterative-refinement' => '1000', 'consistency' => '5', 'pre-training' => '20', 'verbose' => '', 'paramfile' => "$dir/$subdir/$outdir/train.params", ); $factory->outfile_name("$dir/$subdir/$outdir/outfile.afa"); $aln = $factory->align($seq_array_ref); =head1 DESCRIPTION Probcons is a Probabilistic Consistency-based Multiple Alignment of Amino Acid Sequences. You can get it and see information about it at this URL http://probcons.stanford.edu/ =head2 Helping the module find your executable You will need to enable Probcons to find the probcons program. This can be done in (at least) three ways: 1. Make sure the probcons executable is in your path (i.e. 'which probcons' returns a valid program 2. define an environmental variable PROBCONSDIR which points to a directory containing the 'probcons' app: In bash export PROBCONSDIR=/home/progs/probcons or In csh/tcsh setenv PROBCONSDIR /home/progs/probcons 3. include a definition of an environmental variable PROBCONSDIR in every script that will BEGIN {$ENV{PROBCONSDIR} = '/home/progs/probcons'; } use Bio::Tools::Run::Alignment::Probcons; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Jason Stajich Email jason-at-bioperl-dot-org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Probcons; use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM %DEFAULTS @PROBCONS_PARAMS @PROBCONS_SWITCHES @OTHER_SWITCHES %OK_FIELD ); use strict; use Bio::Seq; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); BEGIN { %DEFAULTS = ( 'AFORMAT' => 'fasta' ); @PROBCONS_PARAMS = qw (CONSISTENCY ITERATIVE-REFINEMENT PRE-TRAINING ANNOT TRAIN PARAMFILE MATRIXFILE CLUSTALW PAIRS VITERBI VERBOSE EMISSIONS); #FIXME: Last line are switches, dunno how to set them, #gave as params @PROBCONS_SWITCHES = qw(); @OTHER_SWITCHES = qw(); # Authorize attribute fields foreach my $attr ( @PROBCONS_PARAMS, @OTHER_SWITCHES ) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'probcons'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PROBCONSDIR}) if $ENV{PROBCONSDIR}; } =head2 new Title : new Usage : my $probcons = Bio::Tools::Run::Alignment::Probcons->new(); Function: Constructor Returns : Bio::Tools::Run::Alignment::Probcons Args : -outfile_name => $outname =cut sub new{ my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($on) = $self->SUPER::_rearrange([qw(OUTFILE_NAME)], @args); $self->outfile_name($on || ''); my ($attr, $value); $self->aformat($DEFAULTS{'AFORMAT'}); while ( @args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/); # don't want named parameters $self->$attr($value); if ($attr =~ /verbose/i) { $self->{verbose_set} = 1; } } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; # aliasing $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe 2>&1` ; #PROBCONS version 1.09 - align multiple protein sequences and print to standard output $string =~ /PROBCONS\s+version\s+(\d+\.\d+)/m; return $1 || undef; } =head2 run Title : run Usage : my $output = $application->run(\@seqs); Function: Generic run of an application Returns : Bio::SimpleAlign object Args : Arrayref of Bio::PrimarySeqI objects or a filename to run on =cut sub run { my $self = shift; return $self->align(shift); } =head2 align Title : align Usage : $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); or $seq_array_ref = \@seq_array; # @seq_array is array of Seq objs $aln = $factory->align($seq_array_ref); Function: Perform a multiple sequence alignment Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or a reference to an array of Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ($self,$input) = @_; # Create input file pointer $self->io->_io_cleanup(); my ($infilename) = $self->_setinput($input); if (! $infilename) { $self->throw("Bad input data or less than 2 sequences in $input !"); } my $param_string = $self->_setparams($infilename); # run probcons return &_run($self, $param_string); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to probcons program Example : Returns : nothing; probcons output is written to a temporary file OR specified output file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to probcons =cut sub _run { my ($self, $params) = @_; my $commandstring = $self->executable." $params"; $self->debug( "probcons command = $commandstring \n"); my $status = system($commandstring); if ($status) { $self->warn( "Probcons call crashed: $? [command $commandstring]\n"); return; } my $outfile = $self->outfile_name(); if (-e $outfile || -z $outfile) { my $in = Bio::AlignIO->new('-file' => $outfile, '-format' => $self->aformat); my $aln = $in->next_aln(); return $aln; } return; # some modes of operation do not generate an output alignment } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for probcons program Example : Returns : name of file containing probcons data input AND Args : Arrayref of Seqs or input file name =cut sub _setinput { my ($self,$input) = @_; my ($infilename, $seq, $temp, $tfh); if (! ref $input) { # check that file exists or throw $infilename = $input; unless (-e $input) {return 0;} # let's peek and guess open(IN,$infilename) || $self->throw("Cannot open $infilename"); my $header; while( defined ($header = ) ) { last if $header !~ /^\s+$/; } close(IN); if ( $header !~ /^>\s*\S+/ ){ $self->throw("Need to provide a FASTA format file to probcons!"); } return ($infilename); } elsif (ref($input) =~ /ARRAY/i ) { # $input may be an # array of BioSeq objects... # Open temporary file for both reading & writing of array ($tfh,$infilename) = $self->io->tempfile(); if( ! ref($input->[0]) ) { $self->warn("passed an array ref which did not contain objects to _setinput"); return undef; } elsif( $input->[0]->isa('Bio::PrimarySeqI') ) { $temp = Bio::SeqIO->new('-fh' => $tfh, '-format' => 'fasta'); my $ct = 1; foreach $seq (@$input) { return 0 unless ( ref($seq) && $seq->isa("Bio::PrimarySeqI") ); if( ! defined $seq->display_id || $seq->display_id =~ /^\s+$/) { $seq->display_id( "Seq".$ct++); } $temp->write_seq($seq); } $temp->close(); undef $temp; close($tfh); $tfh = undef; } else { $self->warn( "got an array ref with 1st entry ". $input->[0]. " and don't know what to do with it\n"); } return ($infilename); } else { $self->warn("Got $input and don't know what to do with it\n"); } return 0; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for probcons program Example : Returns : parameter string to be passed to probcons during align or profile_align Args : name of calling object =cut sub _setparams { my ($self, $infilename) = @_; my ($attr, $value,$param_string); $param_string = ''; my $laststr; for $attr ( @PROBCONS_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = lc $attr; $attr_key = ' --'.$attr_key unless ($attr eq 'ANNOT'); $attr_key = ' -'.$attr_key if ($attr eq 'ANNOT'); $param_string .= $attr_key .' '.$value; } if ($self->{verbose_set}) { $param_string .= ' --verbose'; } for $attr ( @PROBCONS_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; #put switches in format expected by tcoffee $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } # Set default output file if no explicit output file selected unless ($self->outfile_name ) { my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile_name($outfile); } #FIXME: This may be only for *nixes. Double check in other OSes $param_string .= " $infilename > ".$self->outfile_name; if ($self->verbose < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 2> $null"; } return $param_string; } =head2 aformat Title : aformat Usage : my $alignmentformat = $self->aformat(); Function: Get/Set alignment format Returns : string Args : string =cut sub aformat{ my $self = shift; $self->{'_aformat'} = shift if @_; return $self->{'_aformat'}; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $probcons->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $probcons->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Alignment/Proda.pm000066400000000000000000000334261342734133000240500ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Alignment::Proda # # Please direct questions and support issues to # # Cared for by Albert Vilella # # # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Proda - Object for the calculation of sets of multiple sequence alignments from a set of unaligned sequences or alignments using the Proda program. =head1 SYNOPSIS # Build a Proda alignment factory $factory = Bio::Tools::Run::Alignment::Proda->new(@params); # Pass the factory a list of sequences to be aligned. $inputfilename = 't/cysprot.fa'; # @alns is an array of SimpleAlign objects. @alns = $factory->align($inputfilename); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; @alns = $factory->align($seq_array_ref); # Or one can pass the factory a pair of (sub)alignments #to be aligned against each other, e.g.: #There are various additional options and input formats available. #See the DESCRIPTION section that follows for additional details. $factory = Bio::Tools::Run::Alignment::Proda->new(); @alns = $factory->align($seq_array_ref); =head1 DESCRIPTION You can get it and see information about it at this URL http://proda.stanford.edu This program will return one or more local alignments for the different repeated or rearranged regions in the sequences. If a sequences contains more than one of those patterns, it will be present more than once in the alignment. The difference will be in that the id contain the start and end, like myseqid(123-456) and myseqid(567-890), instead of simply myseqid as in the original input file. This is true for all the output ids, even if they are present only once. =head2 Helping the module find your executable You will need to enable Proda to find the proda program. This can be done in (at least) three ways: 1. Make sure the proda executable is in your path (i.e. 'which proda' returns a valid program 2. define an environmental variable PRODADIR which points to a directory containing the 'proda' app: In bash export PRODADIR=/home/progs/proda or In csh/tcsh setenv PRODADIR /home/progs/proda 3. include a definition of an environmental variable PRODADIR in every script that will BEGIN {$ENV{PRODADIR} = '/home/progs/proda'; } use Bio::Tools::Run::Alignment::Proda; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Proda; use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM %DEFAULTS @PRODA_PARAMS @PRODA_SWITCHES @OTHER_SWITCHES %OK_FIELD ); use strict; use Bio::Seq; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); BEGIN { %DEFAULTS = ( 'AFORMAT' => 'proda' ); @PRODA_PARAMS = qw (L); @PRODA_SWITCHES = qw(POSTERIOR TRAN SILENT); @OTHER_SWITCHES = qw(); # Authorize attribute fields foreach my $attr ( @PRODA_PARAMS, @PRODA_SWITCHES, @OTHER_SWITCHES ) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'proda'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PRODADIR}) if $ENV{PRODADIR}; } =head2 new Title : new Usage : my $proda = Bio::Tools::Run::Alignment::Proda->new(); Function: Constructor Returns : Bio::Tools::Run::Alignment::Proda Args : -outfile_name => $outname =cut sub new{ my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($on) = $self->SUPER::_rearrange([qw(OUTFILE_NAME)], @args); $self->outfile_name($on || ''); my ($attr, $value); $self->aformat($DEFAULTS{'AFORMAT'}); while ( @args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/); # don't want named parameters $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; # aliasing $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe 2>&1` ; #PRODA version 1.09 - align multiple protein sequences and print to standard output $string =~ /ProDA\s+version\s+(\d+\.\d+)/m; return $1 || undef; } =head2 run Title : run Usage : my $output = $application->run(\@seqs); Function: Generic run of an application Returns : Bio::SimpleAlign object Args : Arrayref of Bio::PrimarySeqI objects or a filename to run on =cut sub run { my $self = shift; return $self->align(shift); } =head2 align Title : align Usage : $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); or $seq_array_ref = \@seq_array; # @seq_array is array of Seq objs $aln = $factory->align($seq_array_ref); Function: Perform a multiple sequence alignment Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or a reference to an array of Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ($self,$input) = @_; # Create input file pointer $self->io->_io_cleanup(); my ($infilename) = $self->_setinput($input); if (! $infilename) { $self->throw("Bad input data or less than 2 sequences in $input !"); } my $param_string = $self->_setparams(); # run proda return &_run($self, $infilename, $param_string); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to proda program Example : Returns : nothing; proda output is written to a temporary file OR specified output file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to proda =cut sub _run { my ($self,$infilename,$params) = @_; my $commandstring = $self->executable." $infilename $params"; $self->debug( "proda command = $commandstring \n"); my $status = system($commandstring); my $outfile = $self->outfile_name(); if( !-e $outfile || -z $outfile ) { $self->warn( "Proda call crashed: $? [command $commandstring]\n"); return undef; } my $in = Bio::AlignIO->new('-file' => $outfile, '-format' => $self->aformat, ); my @alns; while(my $aln = $in->next_aln) { push @alns, $aln; } return @alns; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for proda program Example : Returns : name of file containing proda data input AND Args : Arrayref of Seqs or input file name =cut sub _setinput { my ($self,$input) = @_; my ($infilename, $seq, $temp, $tfh); if (! ref $input) { # check that file exists or throw $infilename = $input; unless (-e $input) {return 0;} # let's peek and guess open(IN,$infilename) || $self->throw("Cannot open $infilename"); my $header; while( defined ($header = ) ) { last if $header !~ /^\s+$/; } close(IN); if ( $header !~ /^>\s*\S+/ ){ $self->throw("Need to provide a FASTA format file to proda!"); } return ($infilename); } elsif (ref($input) =~ /ARRAY/i ) { # $input may be an # array of BioSeq objects... # Open temporary file for both reading & writing of array ($tfh,$infilename) = $self->io->tempfile(); if( ! ref($input->[0]) ) { $self->warn("passed an array ref which did not contain objects to _setinput"); return undef; } elsif( $input->[0]->isa('Bio::PrimarySeqI') ) { $temp = Bio::SeqIO->new('-fh' => $tfh, '-format' => 'fasta'); my $ct = 1; foreach $seq (@$input) { return 0 unless ( ref($seq) && $seq->isa("Bio::PrimarySeqI") ); if( ! defined $seq->display_id || $seq->display_id =~ /^\s+$/) { $seq->display_id( "Seq".$ct++); } $temp->write_seq($seq); } $temp->close(); undef $temp; close($tfh); $tfh = undef; } else { $self->warn( "got an array ref with 1st entry ". $input->[0]. " and don't know what to do with it\n"); } return ($infilename); } else { $self->warn("Got $input and don't know what to do with it\n"); } return 0; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for proda program Example : Returns : parameter string to be passed to proda during align or profile_align Args : name of calling object =cut sub _setparams { my ($self) = @_; my ($attr, $value,$param_string); $param_string = ''; my $laststr; for $attr ( @PRODA_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = lc $attr; $attr_key = ' --'.$attr_key unless ($attr eq 'ANNOT'); $attr_key = ' -'.$attr_key if ($attr eq 'ANNOT'); $param_string .= $attr_key .' '.$value; } for $attr ( @PRODA_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; #put switches in format expected by tcoffee $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } # Set default output file if no explicit output file selected unless ($self->outfile_name ) { my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile_name($outfile); } #FIXME: This may be only for *nixes. Double check in other OSes $param_string .= " > ".$self->outfile_name; if ($self->verbose < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 2> $null"; } return $param_string; } =head2 aformat Title : aformat Usage : my $alignmentformat = $self->aformat(); Function: Get/Set alignment format Returns : string Args : string =cut sub aformat{ my $self = shift; $self->{'_aformat'} = shift if @_; return $self->{'_aformat'}; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $proda->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $proda->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Alignment/Sim4.pm000066400000000000000000000247741342734133000236250ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Alignment::Sim4 # # Please direct questions and support issues to # # Cared for by # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Sim4 - Wrapper for Sim4 program that allows for alignment of cdna to genomic sequences =head1 SYNOPSIS use Bio::Tools::Run::Alignment::Sim4; my @params = (W=>15,K=>17,D=>10,N=>10,cdna_seq=>"mouse_cdna.fa",genomic_seq=>"mouse_genomic.fa"); my $sim4 = Bio::Tools::Run::Alignment::Sim4->new(@params); my @exon_sets = $sim4->align; foreach my $set(@exon_sets){ foreach my $exon($set->sub_SeqFeature){ print $exon->start."\t".$exon->end."\t".$exon->strand."\n"; print "\tMatched ".$exon->est_hit->seq_id."\t".$exon->est_hit->start."\t".$exon->est_hit->end."\n"; } } One can also provide a est database $sio = Bio::SeqIO->new(-file=>"est.fa",-format=>"fasta"); @est_seq=(); while(my $seq = $sio->next_seq){ push @est_seq,$seq; } my @exon_sets = $factory->align(\@est_seq,$genomic); =head1 DESCRIPTION Sim4 program is developed by Florea et al. for aligning cdna/est sequence to genomic sequences Florea L, Hartzell G, Zhang Z, Rubin GM, Miller W. A computer program for aligning a cDNA sequence with a genomic DNA sequence. Genome Res 1998 Sep;8(9):967-74 The program is available for download here: http://globin.cse.psu.edu/ =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Sim4; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @SIM4_PARAMS @OTHER_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Sim4::Results; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); # You will need to enable Sim4 to find the Sim4 program. This # can be done in (at least) two ways: # # 1. define an environmental variable SIM4DIR # export SIM4DIR =/usr/local/share/sim4 # where the sim4 package is installed # # 2. include a definition of an environmental variable SIM4 in # every script that will use Sim4.pm # $ENV{SIMR4DIR} = '/usr/local/share/sim4'; BEGIN { @SIM4_PARAMS= qw(A W X K C R D H P N B); @OTHER_PARAMS= qw(CDNA_SEQ GENOMIC_SEQ OUTFILE); @OTHER_SWITCHES = qw(SILENT QUIET VERBOSE); # Authorize attribute fields foreach my $attr ( @SIM4_PARAMS, @OTHER_PARAMS, @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'sim4'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{SIM4DIR}) if $ENV{SIM4DIR}; } sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); # to facilitiate tempfile cleanup $self->io->_initialize_io(); $self->A(0); # default my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; if ($attr =~/est_first/i ) { #NEW $self->{est_first} = $value; #NEW next; #NEW } #NEW next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/'PROGRAM'/i ) { $self->executable($value); next; } $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 version Title : version Usage : not supported Function: Cannot determine from program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return undef; } =head2 align Title : align Usage : $cdna = 't/data/cdna.fa'; $genomic = 't/data/cdna.fa'; @exon_set = $factory->align($cdna,$genomic); or #@seq_array is array of Seq objs $cdna = \@seq_array; @exon_set = $factory->align($cdna,$genomic); of @exon_set = $factory->align($cdna->[0],$genomic) Function: Perform a Sim4 alignment Returns : An array of Bio::SeqFeature::Generic objects which has exons as sub seqfeatures. Args : Name of two files containing fasta sequences, or 2 Bio::SeqI objects or a combination of both first is assumed to be cdna second is assumed to be genomic More than one cdna may be provided. If an object, assume that its an array ref. =cut sub align { my ($self,$cdna,$genomic) = @_; $self->cdna_seq($cdna) if $cdna; $self->throw("Need to provide a cdna sequence") unless $self->cdna_seq; $self->genomic_seq($genomic) if $genomic; $self->throw("Need to provide a genomic sequence") unless $self->genomic_seq; my ($temp,$infile1, $infile2, $est_first,$seq); my ($attr, $value, $switch); # Create input file pointer ($est_first,$infile1,$infile2)= $self->_setinput($self->cdna_seq,$self->genomic_seq); if (!($infile1 && $infile2)) {$self->throw("Bad input data (sequences need an id ) or less than 2 sequences in align!");} # Create parameter string to pass to Sim4 program my $param_string = $self->_setparams(); # run Sim4 my @exon_sets = $self->_run($est_first,$infile1,$infile2,$param_string); return @exon_sets; } ################################################# #internal methods =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to Sim4 program Example : Returns : nothing; Sim4 output is written to a temp file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to Sim4 =cut sub _run { my ($self,$estfirst,$infile1,$infile2,$param_string) = @_; my $instring; $self->debug( "Program ".$self->executable."\n"); if(! $self->outfile){ my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir); close($tfh); undef $tfh; $self->outfile($outfile); } my $outfile = $self->outfile(); my $commandstring = $self->executable." $infile1 $infile2 $param_string > $outfile"; if($self->quiet || $self->silent || ($self->verbose < 0)){ my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $commandstring .= " 2>$null"; } $self->debug( "Sim4 command = $commandstring"); my $status = system($commandstring); $self->throw( "Sim4 call ($commandstring) crashed: $? \n") unless $status==0; #use Sim4 parser my $sim4_parser = Bio::Tools::Sim4::Results->new(-file=>$outfile,-estfirst=>$estfirst); my @out; while(my $exonset = $sim4_parser->next_exonset){ push @out, $exonset; } return @out; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for Sim4 program Example : Returns : name of file containing Sim4 data input Args : Seq or Align object reference or input file name =cut sub _setinput { my ($self, $cdna,$genomic) = @_; my ($infilename, $seq, $temp, $tfh1,$tfh2,$outfile1,$outfile2); #my $estfirst=1; my $estfirst= defined($self->{est_first}) ? $self->{_est_first} : 1; my ($cdna_file,$genomic_file); #a sequence obj if(ref($cdna)) { my @cdna = ref $cdna eq "ARRAY" ? @{$cdna} : ($cdna); ($tfh1,$cdna_file) = $self->io->tempfile(-dir=>$self->tempdir); my $seqio = Bio::SeqIO->new(-fh=>$tfh1,-format=>'fasta'); foreach my $c (@cdna){ $seqio->write_seq($c); } close $tfh1; undef $tfh1; #if we have a est database, then input will go second if($#cdna > 0){ $estfirst=0; } } else { my $sio = Bio::SeqIO->new(-file=>$cdna,-format=>"fasta"); my $count = 0; while(my $seq = $sio->next_seq){ $count++; } $estfirst = $count > 1 ? 0:1; $cdna_file = $cdna; } if( ref($genomic) ) { ($tfh1,$genomic_file) = $self->io->tempfile(-dir=>$self->tempdir); my $seqio = Bio::SeqIO->new(-fh=>$tfh1,-format=>'fasta'); $seqio->write_seq($genomic); close $tfh1; undef $tfh1; } else { $genomic_file = $genomic; } return ($estfirst,$cdna_file,$genomic_file) if $estfirst; return ($estfirst,$genomic_file,$cdna_file); } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for Sim4 program Example : Returns : parameter string to be passed to Sim4 during align or profile_align Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); $self = shift; my $param_string = ""; for $attr ( @SIM4_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = uc $attr; #put params in format expected by Sim4 $attr_key = ' '.$attr_key; $param_string .= $attr_key.'='.$value; } return $param_string; } 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Alignment/StandAloneFasta.pm000066400000000000000000000354011342734133000260050ustar00rootroot00000000000000#StandAloneFasta.pm v1.00 2002/11/01 # #Bioperl module for Bio::Tools::Run::Alignment::StandAloneFasta # # Written by Tiequan Zhang # Please direct questions and support issues to # # Cared for by Shawn Hoon # Copyright Tiequan Zhang # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::StandAloneFasta - Object for the local execution of the Fasta3 programs ((t)fasta3, (t)fastx3, (t)fasty3 ssearch3) =head1 SYNOPSIS #!/usr/bin/perl use Bio::Tools::Run::Alignment::StandAloneFasta; use Bio::SeqIO; use strict; my @arg=( 'b' =>'15', 'O' =>'resultfile', 'H'=>'', 'program'=>'fasta34' ); my $factory=Bio::Tools::Run::Alignment::StandAloneFasta->new(@arg); $factory->ktup(1); $factory->library('p'); #print result file name print $factory->O; my @fastreport=$factory->run($ARGV[0]); foreach (@fastreport) { print "Parsed fasta report:\n"; my $result = $_->next_result; while( my $hit = $result->next_hit()) { print "\thit name: ", $hit->name(), "\n"; while( my $hsp = $hit->next_hsp()) { print "E: ", $hsp->evalue(), "frac_identical: ", $hsp->frac_identical(), "\n"; } } } #pass in seq objects my $sio = Bio::SeqIO->new(-file=>$ARGV[0],-format=>"fasta"); my $seq = $sio->next_seq; my @fastreport=$factory->run($ARGV[0]); =head1 DESCRIPTION This wrapper works with version 3 of the FASTA program package (see W. R. Pearson and D. J. Lipman (1988), "Improved Tools for Biological Sequence Analysis", PNAS 85:2444-2448 (Pearson and Lipman, 1988); W. R. Pearson (1996) "Effective protein sequence comparison" Meth. Enzymol. 266:227-258 (Pearson, 1996); Pearson et. al. (1997) Genomics 46:24-36 (Zhang et al., 1997); Pearson, (1999) Meth. in Molecular Biology 132:185-219 (Pearson, 1999). Version 3 of the FASTA packages contains many programs for searching DNA and protein databases and one program (prss3) for evaluating statistical significance from randomly shuffled sequences. Fasta is available at ftp://ftp.virginia.edu/pub/fasta =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Tiequan Zhang Adapted for bioperl by Shawn Hoon Enhanced by Jason Stajich Email tqzhang1973@yahoo.com shawnh@fugu-sg.org jason-at-bioperl.org =head1 Appendix The rest of the documendation details each of the object methods. Internal methods are preceded with a underscore =cut package Bio::Tools::Run::Alignment::StandAloneFasta; use vars qw ($AUTOLOAD @ISA $library %parameters $ktup @FASTA_PARAMS %OK_FIELD @OTHER_PARAMS); use strict; use Bio::Root::Root; use Bio::Root::IO; use Bio::Seq; use Bio::SeqIO; use Bio::SearchIO; use Bio::Tools::Run::WrapperBase; BEGIN { @FASTA_PARAMS=qw(a A b c E d f g h H i j l L M m n O o p Q q r R s S w x y z); @OTHER_PARAMS =qw(program output database); foreach my $att (@FASTA_PARAMS, @OTHER_PARAMS) {$OK_FIELD{$att}++;} $ktup=2; %parameters=('H' => '', 'q' => '', 'm' =>'1', 'O' =>''); } @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); sub new { my ($caller,@args)=@_; #chained new my $self = $caller->SUPER::new(@args); while(@args){ my $attr = shift @args; my $value = shift @args; next if ($attr=~/^-/ || ! $attr); $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { my ($self) = shift; return $self->program(@_); } =head2 executable Title : executable Usage : my $exe = $blastfactory->executable('blastall'); Function: Finds the full path to the 'codeml' executable Returns : string representing the full path to the exe Args : [optional] name of executable to set path to [optional] boolean flag whether or not warn when exe is not found =cut sub executable { my ($self, $exename, $exe,$warn) = @_; $exename = 'fasta34' unless defined $exename; if( defined $exe && -x $exe ) { $self->{'_pathtoexe'}->{$exename} = $exe; } unless( defined $self->{'_pathtoexe'}->{$exename} ) { my $f = $self->program_path($exename); $exe = $self->{'_pathtoexe'}->{$exename} = $f if(-e $f && -x $f ); # This is how I meant to split up these conditionals --jason # if exe is null we will execute this (handle the case where # PROGRAMDIR pointed to something invalid) unless( $exe ) { # we didn't find it in that last conditional if( ($exe = $self->io->exists_exe($exename)) && -x $exe ) { $self->{'_pathtoexe'}->{$exename} = $exe; } else { $self->warn("Cannot find executable for $exename") if $warn; $self->{'_pathtoexe'}->{$exename} = undef; } } } return $self->{'_pathtoexe'}->{$exename}; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{FASTADIR}) if $ENV{FASTADIR}; } =head2 run Title : run Usage : my @fasta_object = $factory->($input,$onefile); where $factory is the name of executable FASTA program; $input is file name containing the sequences in the format of fasta or Bio::Seq object or array of Bio::Seq object; $onefile is 0 if you want to save the outputs to different files default: outputs are saved in one file Function: Attempts to run an executable FASTA program and return array of fasta objects containing the fasta report Returns : aray of fasta report object If the user specify the output file(s), the raw fasta report will be saved Args : sequence object OR array reference of sequence objects filename of file containing fasta formatted sequences =cut sub run { my ($self,$input,$onefile)=@_; local * FASTARUN; $self->io->_io_cleanup; my $program = $self->executable($self->program_name) || $self->throw("FASTA program not found or not executable.\n"); # You should specify a library file $self->throw("You didn't choose library.\n") unless ( $library); my @seqs = $self->_setinput($input); return 0 unless (@seqs); my @fastobj; my ($fhout, $tempoutfile)=$self->io->tempfile(-dir=>$self->tempdir); my $outfile=$self->O(); # The outputs from executable FASTA program will # be saved into different files if $onefile is 0, # else will be concatenated into one file my $onfile = (!defined $onefile || $onefile =~ /^0$/); unless( $onfile ) { my $count=0; # do some fancy stuff here to test if we are running fasta34 # with mlib so we just pass in a single file rather than # running fasta N times # (not implemented yet) foreach my $seq (@seqs){ $count++; # Decide if the output will be saved into a temporary file if( $outfile ) { $self->O(sprintf("%s_%d",$outfile,$count)); } my ($fhinput,$teminputfile)= $self->io->tempfile(-dir=>$self->tempdir); my $temp = Bio::SeqIO->new(-fh=>$fhinput, '-format'=>'Fasta'); $temp->write_seq($seq); $temp->close(); close $fhinput; undef $fhinput; my $para= $self->_setparams; $para .=" $teminputfile $library $ktup"; $para ="$program $para"; my $object; unless( $outfile ) { open(FASTARUN, "$para |") || $self->throw($@); $object = Bio::SearchIO->new(-fh=>\*FASTARUN, -format=>"fasta"); } else { if ( $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $para .= " >$null 2>$null"; } else { $self->debug("Going to execute: $para"); } my $status = system($para); $self->throw("$para crashed: $?\n" )unless ($status==0); $object = Bio::SearchIO->new(-file=>$self->O, -format=>"fasta"); } push @fastobj, $object; } } else { if ($outfile){ open (FILE, ">$outfile") or $self->throw("can't use $outfile:$!"); close(FILE); } foreach my $seq (@seqs){ my ($fhinput,$teminputfile)=$self->io->tempfile(-dir=>$self->tempdir); my $temp=Bio::SeqIO->new(-fh=>$fhinput, '-format'=>'fasta'); $temp->write_seq($seq); $temp->close(); close $fhinput; undef $fhinput; $self->O($tempoutfile) if( $outfile ); my $para= $self->_setparams; $para .= " $teminputfile $library $ktup"; $para ="$program $para"; my $object; unless( $outfile ) { open(FASTARUN, "$para |") || $self->throw($@); $object=Bio::SearchIO->new(-fh=>\*FASTARUN, -format=>"fasta"); } else { if ( $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $para .= " >$null 2>$null"; } else { $self->debug("Going to execute: $para"); } my $status = system($para); $self->throw("$para crashed: $?\n" )unless ($status==0); $object = Bio::SearchIO->new(-file=>$self->O, -format=>"fasta"); } push @fastobj, $object; # The output in the temporary file # will be saved at the end of $outfile if($outfile){ open (FHOUT, $tempoutfile) or die("can't open the $tempoutfile file"); open (FH, ">>$outfile") or die("can't use the $outfile file"); print FH (); close (FHOUT); close (FH); } } } return @fastobj; } =head2 library Title : library Usage : my $lb = $self->library Function: Fetch or set the name of the library to search against Returns : The name of the library Args : No argument if user wants to fetch the name of library file; A letter or a string of letter preceded by %; (e.g. P or %pn, the letter is the character in the third field of any line of fastlibs file ) or the name of library file (if environmental variable FASTLIBS is not set); if user wants to set the name of library file to search against =cut sub library { my($self,$lb)=@_; return $library if (!defined($lb)); if ( ($lb =~ /^%[a-zA-Z]+$/)||($lb=~ /^[a-zA-Z]$/)){ if(! defined $ENV{'FASTLIBS'} ){ $self->throw("abbrv. list request but FASTLIBS undefined, cannot use $lb"); } } else { unless ( -e $lb){ $self->throw("cannot open $lb library"); } } return $library=$lb; } *database = \&library; =head2 output Title : output Usage : $obj->output($newval) Function: The output directory if we want to use this Example : Returns : value of output (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub output{ my $self = shift; return $self->{'output'} = shift if @_; return $self->{'output'}; } =head2 ktup Title : ktup Usage : my $ktup = $self->ktup Function: Fetch or set the ktup value for executable FASTA programs Example : Returns : The value of ktup if defined, else undef is returned Args : No argument if user want to fetch ktup value; A integer value between 1-6 if user want to set the ktup value =cut sub ktup { my($self,$k)=@_; if(!defined($k)){return $ktup;} if ($k =~ /^[1-6]$/){ $ktup=$k; return $ktup } else { $self->warn("You should set the ktup value between 1-6. The FASTA program will decide your default ktup value."); return $ktup= undef; } } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file(s) for Blast executable Example : Returns : array of Bio::Seq object reference Args : Seq object reference or input file name =cut sub _setinput { my ($self, $input) = @_; if( ! defined $input ) { $self->throw("Calling fasta program with no input"); } my @seqs; if( ! ref $input ) { if( -e $input ) { my $seqio = Bio::SeqIO->new(-format => 'fasta', -file => $input); while( my $seq = $seqio->next_seq ) { push @seqs, $seq; } } else { $self->throw("Input $input was not a valid filename"); } } elsif( ref($input) =~ /ARRAY/i ) { foreach ( @$input ) { if( ref($_) && $_->isa('Bio::PrimarySeqI') ) { push @seqs, $_; } else { $self->warn("Trying to add a " . ref($_) ." but expected a Bio::PrimarySeqI"); } } if( ! @seqs) { $self->throw("Did not pass in valid input -- no sequence objects found"); } } elsif( $input->isa('Bio::PrimarySeqI') ) { push @seqs, $input; } return @seqs; } =head2 _exist Title : _exist Usage : Internal function, not to be called directly Function: Determine whether a executable FASTA program can be found Cf. the DESCRIPTION section of this POD for how to make sure for your FASTA installation to be found. This method checks for existence of the blastall executable in the path. Returns : 1 if FASTA program found at expected location, 0 otherwise. Args : none =cut sub _exist { my $exe = shift @_; return 0 unless($exe =~ /fast|ssearch/); $exe .='.exe' if ($^O =~ /mswin/i); my $f; return ($f=Bio::Root::IO->exists_exe($exe))&&(-x $f); } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for FASTA executable Returns : part of parameter string to be passed to FASTA program Args : none =cut sub _setparams { my ($self,$attr,$value); $self = shift; my $para = ""; foreach my $attr(@FASTA_PARAMS) { $value = $self->$attr(); next unless (defined $value); $para .=" -$attr $value"; } $para .= " -q "; return $para; } 1; __END__ bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Analysis/000077500000000000000000000000001342734133000223025ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Analysis/soap.pm000066400000000000000000000533021342734133000236050ustar00rootroot00000000000000# $Id$ # # BioPerl module Bio::Tools::Run::Analysis::soap.pm # # Please direct questions and support issues to # # Cared for by Martin Senger # For copyright and disclaimer see below. # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Analysis::soap - A SOAP-based access to the analysis tools =head1 SYNOPSIS Do not use this object directly, it is recommended to access it and use it through the C module: use Bio::Tools::Run::Analysis; my $tool = Bio::Tools::Run::Analysis->new(-access => 'soap', -name => 'seqret'); =head1 DESCRIPTION This object allows to execute and to control a remote analysis tool (an application, a program) using the SOAP middleware, All its public methods are documented in the interface module C and explained in tutorial available in the C script. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Martin Senger (martin.senger@gmail.com) =head1 COPYRIGHT Copyright (c) 2003, Martin Senger and EMBL-EBI. All Rights Reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 DISCLAIMER This software is provided "as is" without warranty of any kind. =head1 SEE ALSO =over 4 =item * http://www.ebi.ac.uk/soaplab/Perl_Client.html =back =head1 BUGS AND LIMITATIONS None known at the time of writing this. =head1 APPENDIX Here is the rest of the object methods. Internal methods are preceded with an underscore _. =cut # Let the code begin... package Bio::Tools::Run::Analysis::soap; use vars qw(@ISA $Revision $DEFAULT_LOCATION); use strict; use Bio::Tools::Run::Analysis; use SOAP::Lite on_fault => sub { my $soap = shift; my $res = shift; my $msg = ref $res ? "--- SOAP FAULT ---\n" . 'faultcode: ' . $res->faultcode . "\n" . 'faultstring: ' . Bio::Tools::Run::Analysis::soap::_clean_msg ($res->faultstring) : "--- TRANSPORT ERROR ---\n" . $soap->transport->status . "\n$res\n"; Bio::Tools::Run::Analysis::soap->throw ($msg); } ; @ISA = qw(Bio::Tools::Run::Analysis); BEGIN { $Revision = q[$Id$]; # where to go $DEFAULT_LOCATION = 'http://www.ebi.ac.uk/soaplab/services'; } # ----------------------------------------------------------------------------- =head2 _initialize Usage : my $tool = Bio::Tools::Run::Analysis->new(-access => 'soap', -name => 'seqret', ...); (_initialize is internally called from the 'new()' method) Returns : nothing interesting Args : This module recognises and uses following arguments: -location -name -httpproxy -timeout Additionally, the main module Bio::Tools::Run::Analysis recognises also: -access It populates calling object with the given arguments, and then - for some attributes and only if they are not yet populated - it assigns some default values. This is an actual new() method (except for the real object creation and its blessing which is done in the parent class Bio::Root::Root in method _create_object). Note that this method is called always as an I method (never as a I method) - and that the object who calls this method may already be partly initiated (from Bio::Tools::Run::Analysis::new method); so if you need to do some tricks with the 'class invocation' you need to change Bio::Analysis I method, not this one. =over 4 =item -location A URL (also called an I) defining where is located a Web Service representing this analysis tool. Default is C (services running at European Bioinformatics Institute on top of most of EMBOSS analyses, and few others). For example, if you run your own Web Service using Java(TM) Apache Axis toolkit, the location might be something like C. =item -name A name of a Web Service (also called a I or a I). There is no default value (which usually means that this parameter is mandatory unless your I<-location> parameter includes also a Web Service name). =item -destroy_on_exit =E '0' Default value is '1' which means that all Bio::Tools::Run::Analysis::Job objects - when being finalised - will send a request to the remote Web Service to forget the results of these jobs. If you change it to '0' make sure that you know the job identification - otherwise you will not be able to re-established connection with it (later, when you use your script again). This can be done by calling method C on the job object (such object is returned by any of these methods: C, C, C). =item -httpproxy In addition to the I parameter, you may need to specify also a location/URL of an HTTP proxy server (if your site requires one). The expected format is C. There is no default value. =item -timeout For long(er) running jobs the HTTP connection may be time-outed. In order to avoid it (or, vice-versa, to call timeout sooner) you may specify C with the number of seconds the connection will be kept alive. Zero means to keep it alive forever. The default value is two minutes. =back =cut sub _initialize { my ($self, @args) = @_; # make a hashtable from @args my %param = @args; @param { map { lc $_ } keys %param } = values %param; # lowercase keys # copy all @args into this object (overwriting what may already be # there) - changing '-key' into '_key' my $new_key; foreach my $key (keys %param) { ($new_key = $key) =~ s/^-/_/; $self->{ $new_key } = $param { $key }; } # finally add default values for those keys who have default value # and who are not yet in the object $self->{'_location'} = $DEFAULT_LOCATION unless $self->{'_location'}; # create a SOAP::Lite object, the main worker if (defined $self->{'_httpproxy'}) { $self->{'_soap'} = SOAP::Lite -> proxy ($self->{'_location'}, timeout => (defined $self->{'_timeout'} ? $self->{'_timeout'} : 120), proxy => ['http' => $self->{'_httpproxy'}]); } else { $self->{'_soap'} = SOAP::Lite -> proxy ($self->{'_location'}, timeout => (defined $self->{'_timeout'} ? $self->{'_timeout'} : 120), ); } $self->{'_soap'}->uri ($self->{'_name'}) if $self->{'_name'}; # forget cached things which should not be cloned into new # instances (because they may represent a completely different # analysis delete $self->{'_analysis_spec'}; delete $self->{'_input_spec'}; delete $self->{'_result_spec'}; } # # Create a hash with named inputs, all extracted # from the given data. # # The main job is done in the SUPER class - here we do # only the SOAP-specific stuff. # sub _prepare_inputs { my $self = shift; my $rh_inputs = $self->SUPER::_prepare_inputs (@_); foreach my $name (keys %{$rh_inputs}) { my $value = $$rh_inputs{$name}; # value of type ref ARRAY is send as byte[][] if (ref $value eq 'ARRAY') { my @bytes = map { SOAP::Data->new (type => 'base64', value => $_) } @$value; $$rh_inputs{$name} = \@bytes; next; } } return $rh_inputs; } # --------------------------------------------------------------------- # # Here are the methods implementing Bio::AnalysisI interface # (documentation is in Bio::AnalysisI) # # --------------------------------------------------------------------- sub analysis_name { my $self = shift; ${ $self->analysis_spec }{'name'}; } # Map getAnalysisType() sub analysis_spec { my ($self) = @_; return $self->{'_analysis_spec'} if $self->{'_analysis_spec'}; my $soap = $self->{'_soap'}; $self->{'_analysis_spec'} = $soap->getAnalysisType->result; } # String describe() sub describe { my ($self) = @_; my $soap = $self->{'_soap'}; $soap->describe->result; } # Map[] getInputSpec() sub input_spec { my ($self) = @_; return $self->{'_input_spec'} if $self->{'_input_spec'}; my $soap = $self->{'_soap'}; $self->{'_input_spec'} = $soap->getInputSpec->result; } # Map[] getResultSpec() sub result_spec { my ($self) = @_; return $self->{'_result_spec'} if $self->{'_result_spec'}; my $soap = $self->{'_soap'}; $self->{'_result_spec'} = $soap->getResultSpec->result; } # String createJob (Map inputs) # String createJob (String id) # String createJob () sub create_job { my ($self, $params) = @_; my $job_id; my $force_to_live; # if $params is a reference then it contains *all* input data # (see details in '_prepare_inputs' how they can be coded) - # send it to the server to get a unique job ID if (ref $params) { my $rh_inputs = $self->_prepare_inputs ($params); my $soap = $self->{'_soap'}; $job_id = $soap->createJob (SOAP::Data->type (map => $rh_inputs))->result; # if $params is a defined scalar it represents a job ID obtained in # some previous invocation - such job already exists on the server # side, just re-create it here using the same job ID # (in this case, such job will *not* be implicitly destroyed on exit) } elsif (defined $params) { $job_id = $params; $force_to_live = 1; # finally, if $params is undef, ask server to create an empty job # (and give me its unique job ID), the input data may be added # later using 'set_data' method(s) - see scripts/applmaker.pl } else { my $soap = $self->{'_soap'}; $job_id = $soap->createEmptyJob->result; # this method may not exist on server (TBD) } if ($force_to_live) { return new Bio::Tools::Run::Analysis::Job (-analysis => $self, -id => $job_id, -destroy_on_exit => 0, ); } elsif (defined $self->{'_destroy_on_exit'}) { return new Bio::Tools::Run::Analysis::Job (-analysis => $self, -id => $job_id, -destroy_on_exit => $self->{'_destroy_on_exit'}, ); } else { return new Bio::Tools::Run::Analysis::Job (-analysis => $self, -id => $job_id, ); } } # String createAndRun (Map inputs) sub run { my $self = shift; return $self->create_job (@_)->run; } # Map runAndWaitFor (Map inputs) sub wait_for { my $self = shift; return $self->run (@_)->wait_for; } # --------------------------------------------------------------------- # # Here are internal methods fo Bio::Tools::Run::Analysis::soap... # # --------------------------------------------------------------------- # Do something (or nothing) with $rh_resuls (coming from the server) # depending on rules defined in $rh_rules. # # $rh_results: keys are result names, values are results themselves # (either scalars or array references - if one result is split into # more parts). # # $rh_rules: keys are result names, values say what to do with # results: undef ... do nothing, return unchanged result # - ... send it to STDOUT, return nothing # @[template] ... put it into file (invent its name, # perhaps based on template), return filename # ?[template] ... ask server for result type, then decide: # put a binary result into file (invent its name) # and return the filename, for other result type # do nothing and return result unchanged # Special cases: if $rh_rules is scalar '@[template]', do with ALL results # as described above for @[template], or # if $rh_rules is scalar '?[template]', do with ALL results # as described above for ?[template]. sub _process_results { my ($self, $rh_results, $rh_rules) = @_; my $default_rule = $rh_rules if defined $rh_rules && $rh_rules =~ /^[\?@]/; foreach my $name (keys %$rh_results) { my $rule = $default_rule ? $default_rule : $$rh_rules{$name}; next unless $rule; next if $rule =~ /^\?/ && ! $self->is_binary ($name); my ($prefix, $template) = $rule =~ /^([\?@])(.*)/; $template = $ENV{'RESULT_FILENAME_TEMPLATE'} unless $template; my $filename = $rule unless $template || $prefix; my $stdout = ($rule eq '-'); if (ref $$rh_results{$name}) { # --- result value is an array reference my $seq = 1; foreach my $part (@{ $$rh_results{$name} }) { print STDOUT $part && next if $stdout; $part = $self->_save_result (-value => $part, -name => $name, -filename => $filename, -template => $template, -seq => $seq++); } } else { # --- result value is a scalar print STDOUT $$rh_results{$name} && next if $stdout; $$rh_results{$name} = $self->_save_result (-value => $$rh_results{$name}, -name => $name, -filename => $filename, -template => $template); } delete $$rh_results{$name} if $stdout; } $rh_results; } # --------------------------------------------------------------------- # # is the given result $name binary? # =head2 is_binary Usage : if ($service->is_binary ('graph_result')) { ... } Returns : 1 or 0 Args : $name is a result name we are interested in =cut sub is_binary { my ($self, $name) = @_; foreach my $result (@{ $self->result_spec }) { if ($result->{'name'} eq $name) { return ($result->{'type'} =~ /^byte\[/); } } return 0; } # --------------------------------------------------------------------- # # Here are internal subroutines (NOT methods) # for Bio::Tools::Run::Analysis::soap # # --------------------------------------------------------------------- sub _clean_msg { my ($msg) = @_; $msg =~ s/^org\.embl\.ebi\.SoaplabShare\.SoaplabException\:\s*//; $msg; } # --------------------------------------------------------------------- # # Here is the rest of Bio::Analysis::soap # # --------------------------------------------------------------------- =head2 VERSION and Revision Usage : print $Bio::Tools::Run::Analysis::soap::VERSION; print $Bio::Tools::Run::Analysis::soap::Revision; =cut =head2 Defaults Usage : print $Bio::Tools::Run::Analysis::soap::DEFAULT_LOCATION; =cut # --------------------------------------------------------------------- # # Bio::Tools::Run::Analysis::Job::soap # ------------------------------------ # A module representing a job (an invocation, an execution) # of an analysis (the analysis itself is represented by # a Bio::Tools::Run::Analysis::soap object) # # Documentation is in Bio::AnalysisI::JobI. # # --------------------------------------------------------------------- package Bio::Tools::Run::Analysis::Job::soap; use vars qw(@ISA); use strict; @ISA = qw(Bio::Tools::Run::Analysis::Job); sub _initialize { my ($self, @args) = @_; # make a hashtable from @args my %param = @args; @param { map { lc $_ } keys %param } = values %param; # lowercase keys # copy all @args into this object (overwriting what may already be # there) - changing '-key' into '_key' my $new_key; foreach my $key (keys %param) { ($new_key = $key) =~ s/^-/_/; $self->{ $new_key } = $param { $key }; } # finally add default values for those keys who have default value # and who are not yet in the object $self->{'_destroy_on_exit'} = 1 unless defined $self->{'_destroy_on_exit'}; } # --------------------------------------------------------------------- # # Here are the methods implementing Bio::AnalysisI::JobI interface # (documentation is in Bio::AnalysisI) # # --------------------------------------------------------------------- # void run (String jobID) sub run { my $self = shift; my $soap = $self->{'_analysis'}->{'_soap'}; $soap->run (SOAP::Data->type (string => $self->{'_id'})); return $self; } # void waitFor (String jobID) sub wait_for { my $self = shift; my $soap = $self->{'_analysis'}->{'_soap'}; $soap->waitFor (SOAP::Data->type (string => $self->{'_id'})); return $self; } # void terminate (String jobID) sub terminate { my $self = shift; my $soap = $self->{'_analysis'}->{'_soap'}; $soap->terminate (SOAP::Data->type (string => $self->{'_id'})); return $self; } # String getLastEvent (String jobID) sub last_event { my $self = shift; my $soap = $self->{'_analysis'}->{'_soap'}; $soap->getLastEvent (SOAP::Data->type (string => $self->{'_id'}))->result; } # String getStatus (String jobID) sub status { my $self = shift; my $soap = $self->{'_analysis'}->{'_soap'}; $soap->getStatus (SOAP::Data->type (string => $self->{'_id'}))->result; } # long getCreated (String jobID) sub created { my ($self, $formatted) = @_; my $soap = $self->{'_analysis'}->{'_soap'}; my $time = $soap->getCreated (SOAP::Data->type (string => $self->{'_id'}))->result; $formatted ? Bio::Tools::Run::Analysis::Utils::format_time ($time) : $time; } # long getStarted (String jobID) sub started { my ($self, $formatted) = @_; my $soap = $self->{'_analysis'}->{'_soap'}; my $time = $soap->getStarted (SOAP::Data->type (string => $self->{'_id'}))->result; $formatted ? Bio::Tools::Run::Analysis::Utils::format_time ($time) : $time; } # long getEnded (String jobID) sub ended { my ($self, $formatted) = @_; my $soap = $self->{'_analysis'}->{'_soap'}; my $time = $soap->getEnded (SOAP::Data->type (string => $self->{'_id'}))->result; $formatted ? Bio::Tools::Run::Analysis::Utils::format_time ($time) : $time; } # long getElapsed (String jobID) sub elapsed { my $self = shift; my $soap = $self->{'_analysis'}->{'_soap'}; $soap->getElapsed (SOAP::Data->type (string => $self->{'_id'}))->result; } # Map getCharacterictics (String jobID) sub times { my ($self, $formatted) = @_; my $soap = $self->{'_analysis'}->{'_soap'}; my $rh_times = $soap->getCharacteristics (SOAP::Data->type (string => $self->{'_id'}))->result; map { $_ = Bio::Tools::Run::Analysis::Utils::format_time ($_) } values %$rh_times if $formatted; return $rh_times; } # Map getResults (String jobID) # Map getResults (String jobID, String[] resultNames) # Retrieving NAMED results: # ------------------------- # results ('name1', ...) => return results as they are, no storing into files # # results ( { 'name1' => 'filename', ... } ) => store into 'filename', return 'filename' # results ( 'name1=filename', ...) => ditto # # results ( { 'name1' => '-', ... } ) => send result to the STDOUT, do not return anything # results ( 'name1=-', ...) => ditto # # results ( { 'name1' => '@', ... } ) => store into file whose name is invented by # this method, perhaps using RESULT_NAME_TEMPLATE env # results ( 'name1=@', ...) => ditto # # results ( { 'name1' => '?', ... } ) => find of what type is this result and then use # {'name1'=>'@' for binary files, and a regular # return for non-binary files # results ( 'name=?', ...) => ditto # # Retrieving ALL results: # ----------------------- # results() => return all results as they are, no storing into files # # results ('@') => return all results, as if each of them given # as {'name' => '@'} (see above) # # results ('?') => return all results, as if each of them given # as {'name' => '?'} (see above) # # Misc: # ----- # * results(...) equals to result(...) # * any result can be returned as a scalar value, or as an array reference # (the latter is used for results consisting of more parts, such images); # this applies regardless whether the returned result is the result itself # or a filename created for the result sub results { my $self = shift; my $rh_names = Bio::Tools::Run::Analysis::Utils::normalize_names (@_); my $soap = $self->{'_analysis'}->{'_soap'}; if (ref $rh_names) { # retrieve only named results return $self->{'_analysis'}->_process_results ($soap->getSomeResults (SOAP::Data->type (string => $self->{'_id'}), [ keys %$rh_names ])->result, $rh_names); } else { # no result names given: take all return $self->{'_analysis'}->_process_results ($soap->getResults (SOAP::Data->type (string => $self->{'_id'}))->result, $rh_names); } } sub result { my $self = shift; my $rh_results = $self->results (@_); (values %$rh_results)[0]; } sub remove { shift->{'_destroy_on_exit'} = 1; } # # job objects are being destroyed if they have attribute # '_destroy_on_exit' set to true - which is a default value # (void destroy (String jobID) # sub DESTROY { my $self = shift; my $soap = $self->{'_analysis'}->{'_soap'}; return unless $self->{'_destroy_on_exit'} && $self->{'_id'}; # ignore all errors here eval { $soap->destroy (SOAP::Data->type (string => $self->{'_id'})); } } 1; __END__ bioperl-run-release-1-7-3/lib/Bio/Tools/Run/AnalysisFactory/000077500000000000000000000000001342734133000236325ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/Tools/Run/AnalysisFactory/soap.pm000066400000000000000000000217371342734133000251440ustar00rootroot00000000000000# $Id$ # # BioPerl module Bio::Tools::Run::AnalysisFactory::soap.pm # # Please direct questions and support issues to # # Cared for by Martin Senger # For copyright and disclaimer see below. # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::AnalysisFactory::soap - A SOAP-based access to the list of analysis tools =head1 SYNOPSIS Do not use this object directly, it is recommended to access it and use it through the I module: use Bio::Tools::Run::AnalysisFactory; my $list = Bio::Tools::Run::AnalysisFactory->new(-access => 'soap') ->available_analyses; print join ("\n", @$list) . "\n"; =head1 DESCRIPTION All public methods are documented in the interface module C. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Martin Senger (martin.senger@gmail.com) =head1 COPYRIGHT Copyright (c) 2003, Martin Senger and EMBL-EBI. All Rights Reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 DISCLAIMER This software is provided "as is" without warranty of any kind. =head1 SEE ALSO =over =item * http://www.ebi.ac.uk/soaplab/Perl_Client.html =back =head1 BUGS AND LIMITATIONS None known at the time of writing this. =head1 APPENDIX The main documentation details are in C. =cut # Let the code begin... package Bio::Tools::Run::AnalysisFactory::soap; use vars qw(@ISA $Revision $DEFAULT_LOCATION @DEFAULT_DIR_SERVICE); use strict; use Bio::Tools::Run::AnalysisFactory; use Bio::Tools::Run::Analysis; use SOAP::Lite on_fault => sub { my $soap = shift; my $res = shift; my $msg = ref $res ? "--- SOAP FAULT ---\n" . 'faultcode: ' . $res->faultcode . "\n" . 'faultstring: ' . Bio::Tools::Run::AnalysisFactory::soap::_clean_msg ($res->faultstring) : "--- TRANSPORT ERROR ---\n" . $soap->transport->status . "\n$res\n"; Bio::Tools::Run::AnalysisFactory::soap->throw ($msg); } ; @ISA = qw(Bio::Tools::Run::AnalysisFactory); BEGIN { $Revision = q[$Id$]; # where to go... $DEFAULT_LOCATION = 'http://www.ebi.ac.uk/soaplab/services'; # ...and what to find there # (this is a list of service names available from the given location; # those that do not exist are ignored; if none exists then only # location - without any service name appended - is used) @DEFAULT_DIR_SERVICE = ('AnalysisFactory', 'GowlabFactory'); } # ----------------------------------------------------------------------------- =head2 _initialize Usage : my $factory = Bio::Tools::Run::AnalysisFactory->new(@args); (_initialize is internally called from the 'new()' method) Returns : nothing interesting Args : This module recognises and uses following arguments: -location -httpproxy -soap Additionally, the main module Bio::Tools::Run::AnalysisFactory recognises also: -access It populates calling object with the given arguments, and then - for some attributes and only if they are not yet populated - it assigns some default values. This is an actual new() method (except for the real object creation and its blessing which is done in the parent class Bio::Root::Root in method _create_object). Note that this method is called always as an I method (never as a I method) - and that the object who calls this method may already be partly initiated (from Bio::Tools::Run::AnalysisFactory::new method); so if you need to do some tricks with the 'class invocation' you need to change Bio::Tools::Run::AnalysisFactory I method, not this one. =over =item -location A URL (also called an I) defining where is located a Web Service functioning for this object. Default is C (a service running at European Bioinformatics Institute on top of most of the EMBOSS analyses, and on top of few others). For example, if you run your own Web Service using Java(TM) Apache Axis toolkit, the location might be something like C. =item -httpproxy In addition to the I parameter, you may need to specify also a location/URL of an HTTP proxy server (if your site requires one). The expected format is C. There is no default value. =item -soap Defines your own SOAP::Lite object. Useful if you need finer-grained access to many features and attributes of the wonderful Paul Kulchenko's module. =back =cut # ' sub _initialize { my ($self, @args) = @_; # make a hashtable from @args my %param = @args; @param { map { lc $_ } keys %param } = values %param; # lowercase keys # copy all @args into this object (overwriting what may already be # there) - changing '-key' into '_key' my $new_key; foreach my $key (keys %param) { ($new_key = $key) =~ s/^-/_/; $self->{ $new_key } = $param { $key }; } # finally add default values for those keys who have default value # and who are not yet in the object $self->{'_location'} = $DEFAULT_LOCATION unless $self->{'_location'}; # create a SOAP object which will do the main job # ('uri' (representing a service name) will be added before each call) unless ($self->{'_soap'}) { if (defined $self->{'_httpproxy'}) { $self->{'_soap'} = SOAP::Lite -> proxy ($self->{'_location'}, proxy => ['http' => $self->{'_httpproxy'}]); } else { $self->{'_soap'} = SOAP::Lite -> proxy ($self->{'_location'}); } } } sub _clean_msg { my ($msg) = @_; $msg =~ s/^org\.embl\.ebi\.SoaplabShare\.SoaplabException\:\s*//; $msg; } # String[] getAvailableCategories() sub available_categories { my ($self) = @_; my $soap = $self->{'_soap'}; my @result = (); my $okay = 0; foreach my $service_name (@DEFAULT_DIR_SERVICE) { $soap-> uri ($service_name); eval { push (@result, @{ $soap->getAvailableCategories->result }); }; $okay = 1 unless $@; } return $soap->getAvailableCategories->result unless $okay; \@result; } # String[] getAvailableAnalyses() # String[] getAvailableAnalysesInCategory (String categoryName) sub available_analyses { my ($self, $category) = @_; my $soap = $self->{'_soap'}; my @result = (); my $okay = 0; if (defined $category) { foreach my $service_name (@DEFAULT_DIR_SERVICE) { $soap-> uri ($service_name); eval { push (@result, @{ $soap->getAvailableAnalysesInCategory (SOAP::Data->type (string => $category))->result }); }; $okay = 1 unless $@; } return $soap->getAvailableAnalysesInCategory (SOAP::Data->type (string => $category)) ->result unless $okay; \@result; } else { foreach my $service_name (@DEFAULT_DIR_SERVICE) { $soap-> uri ($service_name); eval { push (@result, @{ $soap->getAvailableAnalyses->result }); }; $okay = 1 unless $@; } return $soap->getAvailableAnalyses->result unless $okay; \@result; } } # String getServiceLocation (String analysisName) sub create_analysis { my ($self, $name) = @_; # service name my @name = ('-name', $name) if $name; # ask for an endpoint my $soap = $self->{'_soap'}; my $location; foreach my $service_name (@DEFAULT_DIR_SERVICE) { $soap-> uri ($service_name); eval { $location = $soap->getServiceLocation (SOAP::Data->type (string => $name))->result; }; last if defined $location; } unless (defined $location) { $location = $soap->getServiceLocation (SOAP::Data->type (string => $name)) ->result; } my @location = ('-location', $location) if $location; # share some of my properties with the new Bio::Analysis object my @access = ('-access', $self->{'_access'}) if $self->{'_access'}; my @httpproxy = ('-httpproxy', $self->{'_httpproxy'}) if $self->{'_httpproxy'}; Bio::Tools::Run::Analysis->new(@name, @location, @httpproxy, @access); } =head2 VERSION and Revision Usage : print $Bio::Tools::Run::AnalysisFactory::soap::VERSION; print $Bio::Tools::Run::AnalysisFactory::soap::Revision; =cut 1; __END__ bioperl-run-release-1-7-3/lib/Bio/Tools/Run/BEDTools.pm000066400000000000000000000626361342734133000225050ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::BEDTools # # Please direct questions and support issues to # # Cared for by Dan Kortschak # # Copyright Dan Kortschak # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::BEDTools - Run wrapper for the BEDTools suite of programs *BETA* =head1 SYNOPSIS # use a BEDTools program $bedtools_fac = Bio::Tools::Run::BEDTools->new( -command => 'subtract' ); $result_file = $bedtools_fac->run( -bed1 => 'genes.bed', -bed2 => 'mask.bed' ); # if IO::Uncompress::Gunzip is available... $result_file = $bedtools_fac->run( -bed1 => 'genes.bed.gz', -bed2 => 'mask.bed.gz' ); # be more strict $bedtools_fac->set_parameters( -strandedness => 1 ); # and even more... $bedtools_fac->set_parameters( -minimum_overlap => 1e-6 ); # create a Bio::SeqFeature::Collection object $features = $bedtools_fac->result( -want => 'Bio::SeqFeature::Collection' ); =head1 DEPRECATION WARNING Most executables from BEDTools v>=2.10.1 can read GFF and VCF formats in addition to BED format. This requires the use of a new input file param, shown in the following documentation, '-bgv', in place of '-bed' for the executables that can do this. This behaviour breaks existing scripts. =head1 DESCRIPTION This module provides a wrapper interface for Aaron R. Quinlan and Ira M. Hall's utilities C that allow for (among other things): =over =item * Intersecting two BED files in search of overlapping features. =item * Merging overlapping features. =item * Screening for paired-end (PE) overlaps between PE sequences and existing genomic features. =item * Calculating the depth and breadth of sequence coverage across defined "windows" in a genome. =back (see L for manuals and downloads). =head1 OPTIONS C is a suite of 17 commandline executable. This module attempts to provide and options comprehensively. You can browse the choices like so: $bedtools_fac = Bio::Tools::Run::BEDTools->new; # all bowtie commands @all_commands = $bedtools_fac->available_parameters('commands'); @all_commands = $bedtools_fac->available_commands; # alias # just for default command ('bam_to_bed') @btb_params = $bedtools_fac->available_parameters('params'); @btb_switches = $bedtools_fac->available_parameters('switches'); @btb_all_options = $bedtools_fac->available_parameters(); Reasonably mnemonic names have been assigned to the single-letter command line options. These are the names returned by C, and can be used in the factory constructor like typical BioPerl named parameters. As a number of options are mutually exclusive, and the interpretation of intent is based on last-pass option reaching bowtie with potentially unpredicted results. This module will prevent inconsistent switches and parameters from being passed. See L for details of BEDTools options. =head1 FILES When a command requires filenames, these are provided to the C method, not the constructor (C). To see the set of files required by a command, use C or the alias C: $bedtools_fac = Bio::Tools::Run::BEDTools->new( -command => 'pair_to_bed' ); @filespec = $bedtools_fac->filespec; This example returns the following array: #bedpe #bam bed #out This indicates that the bed (C BED format) file MUST be specified, and that the out, bedpe (C BEDPE format) and bam (C binary format) file MAY be specified (Note that in this case you MUST provide ONE of bedpe OR bam, the module at this stage does not allow this information to be queried). Use these in the C call like so: $bedtools_fac->run( -bedpe => 'paired.bedpe', -bgv => 'genes.bed', -out => 'overlap' ); The object will store the programs STDERR output for you in the C attribute: handle_bed_warning($bedtools_fac) if ($bedtools_fac->stderr =~ /Usage:/); For the commands 'fasta_from_bed' and 'mask_fasta_from_bed' STDOUT will also be captured in the C attribute by default and all other commands can be forced to capture program output in STDOUT by setting the -out filespec parameter to '-'. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L Rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Dan Kortschak Email dan.kortschak adelaide.edu.au =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::BEDTools; use strict; our $HAVE_IO_UNCOMPRESS; BEGIN { eval 'require IO::Uncompress::Gunzip; $HAVE_IO_UNCOMPRESS = 1'; } use IPC::Run; # Object preamble - inherits from Bio::Root::Root use lib '../../..'; use Bio::Tools::Run::BEDTools::Config; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Run::WrapperBase::CommandExts; use Bio::Tools::GuessSeqFormat; use Bio::SeqFeature::Generic; use Bio::SeqFeature::Collection; use Bio::SeqIO; use File::Sort qw( sort_file ); use base qw( Bio::Tools::Run::WrapperBase ); ## BEDTools our $program_name = '*bedtools'; our $default_cmd = 'bam_to_bed'; # Note: Other globals imported from Bio::Tools::Run::BEDTools::Config our $qual_param = undef; our $use_dash = 'single'; our $join = ' '; our %strand_translate = ( '+' => 1, '-' => -1, '.' => 0 ); =head2 new() Title : new Usage : my $obj = new Bio::Tools::Run::BEDTools(); Function: Builds a new Bio::Tools::Run::BEDTools object Returns : an instance of Bio::Tools::Run::BEDTools Args : =cut sub new { my ($class,@args) = @_; unless (grep /command/, @args) { push @args, '-command', $default_cmd; } my $self = $class->SUPER::new(@args); foreach (keys %command_executables) { $self->executables($_, $self->_find_executable($command_executables{$_})); } $self->want($self->_rearrange([qw(WANT)],@args)); $self->parameters_changed(1); # set on instantiation, per Bio::ParameterBaseI return $self; } =head2 run() Title : run Usage : $result = $bedtools_fac->run(%params); Function: Run a BEDTools command. Returns : Command results (file, IO object or Bio object) Args : Dependent on filespec for command. See $bedtools_fac->filespec and BEDTools Manual. Also accepts -want => '(raw|format|)' - see want(). Note : gzipped inputs are allowed if IO::Uncompress::Gunzip is available =cut sub run { my $self = shift; my ($ann, $bed, $bg, $bgv, $bgv1, $bgv2, $bam, $bedpe, $bedpe1, $bedpe2, $seq, $genome, $out); if (!(@_ % 2)) { my %args = @_; if ((grep /^-\w+/, keys %args) == keys %args) { ($ann, $bed, $bg, $bgv, $bgv1, $bgv2, $bam, $bedpe, $bedpe1, $bedpe2, $seq, $genome, $out) = $self->_rearrange([qw( ANN BED BG BGV BGV1 BGV2 BAM BEDPE BEDPE1 BEDPE2 SEQ GENOME OUT )], @_); } else { $self->throw("Badly formed named args: ".join(' ',@_)); } } else { if (grep /^-\w+/, @_) { $self->throw("Badly formed named args: ".join(' ',@_)); } else { $self->throw("Require named args."); } } # Sanity checks $self->executable || $self->throw("No executable!"); my $cmd = $self->command if $self->can('command'); for ($cmd) { =pod Command annotate bgv ann(s) #out =cut m/^annotate$/ && do { $bgv = $self->_uncompress($bgv); $self->_validate_file_input(-bgv => $bgv) || $self->throw("File '$bgv' not BED/GFF/VCF format."); @$ann = map { my $a = $_; $a = $self->_uncompress($a); $self->_validate_file_input(-ann => $a) || $self->throw("File '$a' not BED/GFF/VCF format."); $a; } @$ann; last; }; =pod graph_union bg_files #out =cut m/^graph_union$/ && do { @$bg = map { my $g = $_; $g = $self->_uncompress($g); $self->_validate_file_input(-bg => $g) || $self->throw("File '$a' not BedGraph format."); $g; } @$bg; last; }; =pod fasta_from_bed seq bgv #out mask_fasta_from_bed seq bgv #out =cut m/fasta_from_bed$/ && do { ($out // 0) eq '-' && $self->throw("Cannot capture results in STDOUT with sequence commands."); $seq = $self->_uncompress($seq); $self->_validate_file_input(-seq => $seq) || $self->throw("File '$seq' not fasta format."); $bgv = $self->_uncompress($bgv); $self->_validate_file_input(-bgv => $bgv) || $self->throw("File '$bgv' not BED/GFF/VCF format."); last; }; =pod bam_to_bed bam #out =cut m/^bam_to_bed$/ && do { $bam = $self->_uncompress($bam); $self->_validate_file_input(-bam => $bam) || $self->throw("File '$bam' not BAM format."); last; }; =pod bed_to_IGV bgv #out merge bgv #out sort bgv #out links bgv #out =cut m/^(?:bed_to_IGV|merge|sort|links)$/ && do { $bgv = $self->_uncompress($bgv); $self->_validate_file_input(-bgv => $bgv) || $self->throw("File '$bgv' not BED/GFF/VCF format."); }; =pod b12_to_b6 bed #out overlap bed #out group_by bed #out =cut m/^(?:b12_to_b6|overlap|group_by)$/ && do { $bed = $self->_uncompress($bed); $self->_validate_file_input(-bed => $bed) || $self->throw("File '$bgv' not BED format."); if ($cmd eq 'group_by') { my $c =(my @c)= split(",",$self->columns); my $o =(my @o)= split(",",$self->operations); unless ($c > 0 && $o == $c) { $self->throw("The command 'group_by' requires "."paired "x($o == $c)."'-columns' and '-operations' parameters"); } } last; }; =pod bed_to_bam bgv #out shuffle bgv genome #out slop bgv genome #out complement bgv genome #out =cut m/^(?:bed_to_bam|shuffle|slop|complement)$/ && do { $bgv = $self->_uncompress($bgv); $self->_validate_file_input(-bgv => $bgv) || $self->throw("File '$bgv' not BED/GFF/VCF format."); $genome = $self->_uncompress($genome); $self->_validate_file_input(-genome => $genome) || $self->throw("File '$genome' not genome format."); if ($cmd eq 'slop') { my $l = defined $self->add_to_left; my $r = defined $self->add_to_right; my $b = defined $self->add_bidirectional; # I think I have a lisp unless (($l && $r) || ($b xor ($l || $r))) { $self->throw("The command 'slop' requires an unambiguous description of the slop you want"); } } last; }; =pod genome_coverage bed genome #out =cut m/^genome_coverage$/ && do { $bed = $self->_uncompress($bed); $self->_validate_file_input(-bed => $bed) || $self->throw("File '$bed' not BED format."); $genome = $self->_uncompress($genome); $self->_validate_file_input(-genome => $genome) || $self->throw("File '$genome' not genome format."); my ($th, $tf) = $self->io->tempfile( -dir => $self->tempdir(), -suffix => '.bed' ); $th->close; sort_file({k => 1, I => $bed, o => $tf}); $bed = $tf; last; }; =pod window bgv1 bgv2 #out closest bgv1 bgv2 #out coverage bgv1 bgv2 #out subtract bgv1 bgv2 #out =cut m/^(?:window|closest|coverage|subtract)$/ && do { $bgv1 = $self->_uncompress($bgv1); $self->_validate_file_input(-bgv1 => $bgv1) || $self->throw("File '$bgv1' not BED/GFF/VCF format."); $bgv2 = $self->_uncompress($bgv2); $self->_validate_file_input(-bgv2 => $bgv2) || $self->throw("File '$bgv2' not BED/GFF/VCF format."); }; =pod pair_to_pair bedpe1 bedpe2 #out =cut m/^pair_to_pair$/ && do { $bedpe1 = $self->_uncompress($bedpe1); $self->_validate_file_input(-bedpe1 => $bedpe1) || $self->throw("File '$bedpe1' not BEDPE format."); $bedpe2 = $self->_uncompress($bedpe2); $self->_validate_file_input(-bedpe2 => $bedpe2) || $self->throw("File '$bedpe2' not BEDPE format."); last; }; =pod intersect bgv1|bam bgv2 #out =cut m/^intersect$/ && do { $bgv1 = $self->_uncompress($bgv1); $bam = $self->_uncompress($bam); ($bam && $self->_validate_file_input(-bam => $bam)) || ($bgv1 && $self->_validate_file_input(-bgv1 => $bgv1)) || $self->throw("File in position 1. not correct format."); $bgv2 = $self->_uncompress($bgv2); $self->_validate_file_input(-bgv2 => $bgv2) || $self->throw("File '$bgv2' not BED/GFF/VCF format."); last; }; =pod pair_to_bed bedpe|bam bgv #out bgv* signifies any of BED, GFF or VCF. ann is a bgv. NOTE: Replace 'bgv' with 'bed' unless $use_bgv is set. =cut m/^pair_to_bed$/ && do { $bedpe = $self->_uncompress($bedpe); $bam = $self->_uncompress($bam); ($bam && $self->_validate_file_input(-bam => $bam)) || ($bedpe && $self->_validate_file_input(-bedpe => $bedpe)) || $self->throw("File in position 1. not correct format."); $bgv = $self->_uncompress($bgv); $self->_validate_file_input(-bgv => $bgv) || $self->throw("File '$bed' not BED/GFF/VCF format."); last; } } my %params = ( '-ann' => $ann, '-bam' => $bam, '-bed' => $bed, '-bgv' => $bgv, '-bg' => $bg, '-bgv1' => $bgv1, '-bgv2' => $bgv2, '-bedpe' => $bedpe, '-bedpe1' => $bedpe1, '-bedpe2' => $bedpe2, '-seq' => $seq, '-genome' => $genome ); map { delete $params{$_} unless defined $params{$_} } keys %params; my $format = $self->_determine_format(\%params); my $suffix = '.'.$format; if (!defined $out) { my ($outh, $outf) = $self->io->tempfile( -dir => $self->tempdir(), -suffix => $suffix ); $outh->close; $out = $outf; } elsif ($out ne '-') { $out .= $suffix; } else { undef $out; } $params{'-out'} = $out if defined $out; $self->_run(%params); $self->{'_result'}->{'file_name'} = $out // '-'; $self->{'_result'}->{'format'} = $format; $self->{'_result'}->{'file'} = defined $out ? Bio::Root::IO->new( -file => $out ) : undef; return $self->result; } sub _uncompress { my ($self, $file) = @_; return if !defined $file; return $file unless ($file =~ m/\.gz[^.]*$/); return $file unless (-e $file && -r _); # other people will deal with this unless ($HAVE_IO_UNCOMPRESS) { croak( "IO::Uncompress::Gunzip not available, can't expand '$file'" ); } my ($tfh, $tf) = $self->io->tempfile( -dir => $self->tempdir() ); my $z = IO::Uncompress::Gunzip->new($file); while (my $block = $z->getline) { print $tfh $block } close $tfh; return $tf } =head2 want() Title : want Usage : $bowtiefac->want( $class ) Function: make factory return $class, or 'raw' results in file or 'format' for result format All commands can return Bio::Root::IO commands returning: can return object: - BED or BEDPE - Bio::SeqFeature::Collection - sequence - Bio::SeqIO Returns : return wanted type Args : [optional] string indicating class or raw of wanted result =cut sub want { my $self = shift; return $self->{'_want'} = shift if @_; return $self->{'_want'}; } =head2 result() Title : result Usage : $bedtoolsfac->result( [-want => $type|$format] ) Function: return result in wanted format Returns : results Args : [optional] hashref of wanted type Note : -want arg does not persist between result() call when specified in result(), for persistence, use want() =cut sub result { my ($self, @args) = @_; my $want = $self->_rearrange([qw(WANT)],@args); $want ||= $self->want; my $cmd = $self->command if $self->can('command'); my $format = $self->{'_result'}->{'format'}; my $file_name = $self->{'_result'}->{'file_name'}; return $self->{'_result'}->{'format'} if (defined $want && $want eq 'format'); return $self->{'_result'}->{'file_name'} if (!$want || $want eq 'raw'); return $self->{'_result'}->{'file'} if ($want =~ m/^Bio::Root::IO/); # this will be undef if -out eq '-' for ($format) { # these are dissected more finely than seems resonable to allow easy extension m/bed/ && do { for ($want) { m/Bio::SeqFeature::Collection/ && do { unless (defined $self->{'_result'}->{'object'} && ref($self->{'_result'}->{'object'}) =~ m/^Bio::SeqFeature::Collection/) { $self->{'_result'}->{'object'} = $self->_read_bed; } return $self->{'_result'}->{'object'}; }; $self->warn("Cannot make '$_' for $format."); return; } last; }; m/bedpe/ && do { for ($want) { m/Bio::SeqFeature::Collection/ && do { unless (defined $self->{'_result'}->{'object'} && ref($self->{'_result'}->{'object'}) =~ m/^Bio::SeqFeature::Collection/) { $self->{'_result'}->{'object'} = $self->_read_bedpe; } return $self->{'_result'}->{'object'}; }; $self->warn("Cannot make '$_' for $format."); return; } last; }; m/bam/ && do { $self->warn("Cannot make '$_' for $format."); return; }; m/^(?:fasta|raw)$/ && do { for ($want) { m/Bio::SeqIO/ && do { $file_name eq '-' && $self->throw("Cannot make a SeqIO object from STDOUT."); unless (defined $self->{'_result'}->{'object'} && ref($self->{'_result'}->{'object'}) =~ m/^Bio::SeqIO/) { $self->{'_result'}->{'object'} = Bio::SeqIO->new(-file => $file_name, -format => $format); } return $self->{'_result'}->{'object'}; }; $self->warn("Cannot make '$_' for $format."); return; } last; }; m/tab/ && do { $self->warn("Cannot make '$_' for $format."); return; }; m/igv/ && do { $self->warn("Cannot make '$_' for $format."); return; }; m/html/ && do { $self->warn("Cannot make '$_' for $format."); return; }; do { $self->warn("Result format '$_' not recognised - have you called run() yet?"); } } } =head2 _determine_format() Title : _determine_format( $has_run ) Usage : $bedtools-fac->_determine_format Function: determine the format of output for current options Returns : format of bowtie output Args : [optional] boolean to indicate result exists =cut sub _determine_format { my ($self, $params) = @_; my $cmd = $self->command if $self->can('command'); my $format = $format_lookup{$cmd}; #special cases - dependent on switches and files for ($cmd) { m/^intersect$/ && do { return 'bed' if !defined $$params{'-bam'} || $self->write_bed; return 'bam'; }; m/^pair_to_bed$/ && do { return 'bedpe' if !defined $$params{'-bam'} || $self->write_bedpe; return 'bam'; }; m/^fasta_from_bed$/ && do { return $self->output_tab_format ? 'tab' : 'fasta'; } } return $format; } =head2 _read_bed() Title : _read_bed() Usage : $bedtools_fac->_read_bed Function: return a Bio::SeqFeature::Collection object from a BED file Returns : Bio::SeqFeature::Collection Args : =cut sub _read_bed { my ($self) = shift; my @features; if ($self->{'_result'}->{'file_name'} ne '-') { my $in = $self->{'_result'}->{'file'}; while (my $feature = $in->_readline) { chomp $feature; push @features, _read_bed_line($feature); } } else { for my $feature (split("\cJ", $self->stdout)) { push @features, _read_bed_line($feature); } } my $collection = Bio::SeqFeature::Collection->new; $collection->add_features(\@features); return $collection; } sub _read_bed_line { my $feature = shift; my ($chr, $start, $end, $name, $score, $strand, $thick_start, $thick_end, $item_RGB, $block_count, $block_size, $block_start) = split("\cI",$feature); $strand ||= '.'; # BED3 doesn't have strand data - for 'merge' and 'complement' return Bio::SeqFeature::Generic->new( -seq_id => $chr, -primary => $name, -start => $start, -end => $end, -strand => $strand_translate{$strand}, -score => $score, -tag => { thick_start => $thick_start, thick_end => $thick_end, item_RGB => $item_RGB, block_count => $block_count, block_size => $block_size, block_start => $block_size } ); } =head2 _read_bedpe() Title : _read_bedpe() Usage : $bedtools_fac->_read_bedpe Function: return a Bio::SeqFeature::Collection object from a BEDPE file Returns : Bio::SeqFeature::Collection Args : =cut sub _read_bedpe { my ($self) = shift; my @features; if ($self->{'_result'}->{'file_name'} ne '-') { my $in = $self->{'_result'}->{'file'}; while (my $feature = $in->_readline) { chomp $feature; push @features, _read_bedpe_line($feature); } } else { for my $feature (split("\cJ", $self->stdout)) { push @features, _read_bedpe_line($feature); } } my $collection = Bio::SeqFeature::Collection->new; $collection->add_features(\@features); return $collection; } sub _read_bedpe_line { my $feature = shift; my ($chr1, $start1, $end1, $chr2, $start2, $end2, $name, $score, $strand1, $strand2, @add) = split("\cI",$feature); $strand1 ||= '.'; $strand2 ||= '.'; return Bio::SeqFeature::FeaturePair->new( -primary => $name, -seq_id => $chr1, -start => $start1, -end => $end1, -strand => $strand_translate{$strand1}, -hprimary_tag => $name, -hseqname => $chr2, -hstart => $start2, -hend => $end2, -hstrand => $strand_translate{$strand2}, -score => $score ); } =head2 _validate_file_input() Title : _validate_file_input Usage : $bedtools_fac->_validate_file_input( -type => $file ) Function: validate file type for file spec Returns : file type if valid type for file spec Args : hash of filespec => file_name =cut sub _validate_file_input { my ($self, @args) = @_; my (%args); if (grep (/^-/, @args) && (@args > 1)) { # named parms $self->throw("Wrong number of args - requires one named arg") if (@args > 2); s/^-// for @args; %args = @args; } else { $self->throw("Must provide named filespec"); } for (keys %args) { m/bam/ && do { return 'bam'; }; do { return unless ( -e $args{$_} && -r _ ); my $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$args{$_}); return $guesser->guess if grep {$guesser->guess =~ m/$_/} @{$accepted_types{$_}}; } } return; } =head2 version() Title : version Usage : $version = $bedtools_fac->version() Function: Returns the program version (if available) Returns : string representing location and version of the program =cut sub version{ my ($self) = @_; my $cmd = $self->command if $self->can('command'); defined $cmd or $self->throw("No command defined - cannot determine program executable"); # new bahaviour for some BEDTools executables - breaks previous approach to getting version # $dummy can be any non-recognised parameter - '-version' works for most my $dummy = '-version'; $dummy = '-examples' if $cmd =~ /graph_union/; my ($in, $out, $err); my $dum; $in = \$dum; $out = \$self->{'stdout'}; $err = \$self->{'stderr'}; # Get program executable my $exe = $self->executable; my @ipc_args = ( $exe, $dummy ); eval { IPC::Run::run(\@ipc_args, $in, $out, $err) or die ("There was a problem running $exe : $!"); }; # We don't bother trying to catch this: version is returned as an illegal file seek my @details = split("\n",$self->stderr); (my $version) = grep /^Program: .*$/, @details; $version =~ s/^Program: //; return $version; } sub available_commands { shift->available_parameters('commands') }; sub filespec { shift->available_parameters('filespec') }; 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/BEDTools/000077500000000000000000000000001342734133000221325ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/Tools/Run/BEDTools/Config.pm000066400000000000000000000366231342734133000237070ustar00rootroot00000000000000# $Id: Config.pm kortsch $ # # BioPerl module for Bio::Tools::Run::BEDTools::Config # # Please direct questions and support issues to # # Cared for by Dan Kortschak # # Copyright Dan Kortschak # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::BEDTools::Config - Configuration data for bowtie commands =head1 SYNOPSIS Used internally by L. =head1 DESCRIPTION This package exports information describing BEDTools commands, parameters, switches, and input and output filetypes for individual BEDTools commands. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Dan Kortschak Email dan.kortschak adelaide.edu.au =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::BEDTools::Config; use strict; use warnings; no warnings qw(qw); use Bio::Root::Root; use Exporter; use base qw(Bio::Root::Root); our (@ISA, @EXPORT, @EXPORT_OK); push @ISA, 'Exporter'; @EXPORT = qw( @program_commands %command_executables %format_lookup %command_prefixes %composite_commands @program_params @program_switches %param_translation %command_files %accepted_types ); @EXPORT_OK = qw(); our @program_commands = qw( annotate fasta_from_bed overlap bam_to_bed genome_coverage pair_to_pair bed_to_bam graph_union pair_to_bed bed_to_IGV group_by shuffle b12_to_b6 intersect slop closest links sort complement mask_fasta_from_bed subtract coverage merge window ); our %command_executables = ( 'annotate' => 'annotateBed', 'bam_to_bed' => 'bamToBed', 'bed_to_bam' => 'bedToBam', 'bed_to_IGV' => 'bedToIgv', 'b12_to_b6' => 'bed12ToBed6', 'fasta_from_bed' => 'fastaFromBed', 'mask_fasta_from_bed' => 'maskFastaFromBed', 'shuffle' => 'shuffleBed', 'window' => 'windowBed', 'closest' => 'closestBed', 'genome_coverage' => 'genomeCoverageBed', 'merge' => 'mergeBed', 'slop' => 'slopBed', 'complement' => 'complementBed', 'intersect' => 'intersectBed', 'pair_to_bed' => 'pairToBed', 'sort' => 'sortBed', 'coverage' => 'coverageBed', 'links' => 'linksBed', 'pair_to_pair' => 'pairToPair', 'subtract' => 'subtractBed', 'overlap' => 'overlap', 'group_by' => 'groupBy', 'graph_union' => 'unionBedGraphs' ); our %format_lookup = ( 'annotate' => 'bed', 'bam_to_bed' => 'bed', 'bed_to_bam' => 'bam', 'bed_to_IGV' => 'igv', 'b12_to_b6' => 'bed', 'closest' => 'bedpe', 'complement' => 'bed', 'coverage' => 'bed', 'fasta_from_bed' => 'fasta', 'genome_coverage' => 'tab', 'graph_union' => 'bg', 'group_by' => 'bed', 'intersect' => 'bed|bam', 'links' => 'html', 'mask_fasta_from_bed' => 'fasta', 'merge' => 'bed', 'overlap' => 'bed', 'pair_to_bed' => 'bedpe|bam', 'pair_to_pair' => 'bedpe', 'slop' => 'bed', 'shuffle' => 'bed', 'sort' => 'bed', 'subtract' => 'bed', 'window' => 'bedpe' ); # composite commands: pseudo-commands that run a # sequence of commands # composite command prefix => list of prefixes of commands this # composite command runs # our %composite_commands = ( ); # prefixes only for commands that take params/switches... our %command_prefixes = ( 'annotate' => 'ann', 'bam_to_bed' => 'ate', 'bed_to_bam' => 'eta', 'bed_to_IGV' => 'eti', 'b12_to_b6' => '126', 'fasta_from_bed' => 'ffb', 'mask_fasta_from_bed' => 'mfb', 'shuffle' => 'shb', 'window' => 'wib', 'closest' => 'clb', 'genome_coverage' => 'gcb', 'merge' => 'meb', 'slop' => 'slb', 'complement' => 'cob', 'intersect' => 'inb', 'pair_to_bed' => 'ptb', 'sort' => 'sob', 'coverage' => 'cvb', 'links' => 'lib', 'pair_to_pair' => 'ptp', 'subtract' => 'sub', 'overlap' => 'ove', 'group_by' => 'grp', 'graph_union' => 'ubg' ); our @program_params = qw( command ate|tag ate|color eta|quality eti|path eti|session eti|sort eti|slop eti|image shb|exclude shb|seed wib|window_size wib|left_window_size wib|right_window_size clb|ties_policy gcb|max_depth gcb|strand meb|max_distance slb|add_bidirectional slb|add_to_left slb|add_to_right inb|minimum_overlap ptb|minimum_overlap ptb|type ptp|minimum_overlap ptp|type ptp|slop sub|minimum_overlap lib|basename lib|organism lib|genome_build ove|columns grp|group grp|columns grp|operations ubg|names ubg|filler ); our @program_switches = qw( ann|names ann|count ann|both ann|strandedness ate|write_bedpe ate|use_edit_distance ate|bam12 ate|split ate|use_edit_distance ate|cigar eta|uncompressed eta|bed12 eti|collapse eti|name ffb|use_bed_name ffb|output_tab_format ffb|strandedness gcb|bedgraph gcb|bedgraph_all gcb|split mfb|soft_mask shb|keep_chromosome wib|define_by_strand wib|same_strand wib|report_once_only wib|report_hits wib|invert clb|strandedness clb|report_distance gcb|report_pos_depth meb|strandedness meb|report_n_merged meb|report_names_merged slb|define_by_strand inb|write_bed inb|write_entry_1 inb|write_entry_2 inb|report_once_only inb|report_n_hits inb|invert_match inb|reciprocal inb|strandedness inb|write_overlap inb|write_overlap_all inb|split ptb|write_bedpe ptb|strandedness ptb|use_edit_distance ptb|write_uncompressed sob|size_asc sob|size_desc sob|chr_size_asc sob|chr_size_desc sob|chr_score_asc sob|chr_score_desc cvb|strandedness cvb|histogram cvb|depth cvb|split ptp|ignore_strand ptp|slop_strandedness ptp|no_self_hits sub|strandedness ubg|header ubg|empty ); our %param_translation = ( 'ann|names' => 'names', 'ann|counts' => 'counts', 'ann|both' => 'both', 'ann|strandedness' => 's', 'ate|write_bedpe' => 'bedpe', 'ate|use_edit_distance' => 'ed', 'ate|bam12' => 'bam12', 'ate|split' => 'split', 'ate|use_edit_distance' => 'ed', 'ate|tag' => 'tag', 'ate|color' => 'color', 'ate|cigar' => 'cigar', 'eta|quality' => 'maqp', 'eta|uncompressed' => 'ubam', 'eta|bed12' => 'bed12', 'eti|path' => 'path', 'eti|session' => 'sess', 'eti|sort' => 'sort', 'eti|collapse' => 'clps', 'eti|name' => 'name', 'eti|slop' => 'slop', 'eti|image' => 'img', 'ffb|use_bed_name' => 'names', 'ffb|output_tab_format' => 'tab', 'ffb|strandedness' => 's', 'mfb|soft_mask' => 'soft', 'shb|keep_chromosome' => 'chrom', 'shb|exclude' => 'excl', 'shb|seed' => 'seed', 'wib|define_by_strand' => 'sw', 'wib|same_strand' => 'sm', 'wib|report_once_only' => 'u', 'wib|report_hits' => 'c', 'wib|invert' => 'v', 'wib|window_size' => 'w', 'wib|left_window_size' => 'l', 'wib|right_window_size' => 'r', 'clb|strandedness' => 's', 'clb|report_distance' => 'd', 'clb|ties_policy' => 't', 'gcb|report_pos_depth' => 'd', 'gcb|max_depth' => 'max', 'gcb|bedgraph' => 'bg', 'gcb|bedgraph_all' => 'bga', 'gcb|split' => 'split', 'gcb|strand' => 'strand', 'meb|strandedness' => 's', 'meb|report_n_merged' => 'n', 'meb|report_names_merged' => 'nms', 'meb|max_distance' => 'd', 'slb|define_by_strand' => 's', 'slb|add_bidirectional' => 'b', 'slb|add_to_left' => 'l', 'slb|add_to_right' => 'r', 'inb|write_bed' => 'bed', 'inb|write_entry_1' => 'wa', 'inb|write_entry_2' => 'wb', 'inb|write_overlap' => 'wo', 'inb|write_overlap_all' => 'woa', 'inb|report_once_only' => 'u', 'inb|report_n_hits' => 'c', 'inb|invert_match' => 'v', 'inb|reciprocal' => 'r', 'inb|strandedness' => 's', 'inb|minimum_overlap' => 'f', 'inb|split' => 'split', 'ptb|write_bedpe' => 'bedpe', 'ptb|strandedness' => 's', 'ptb|minimum_overlap' => 'f', 'ptb|type' => 'type', 'ptb|use_edit_distance' => 'ed', 'ptb|write_uncompressed' => 'ubam', 'sob|size_asc' => 'sizeA', 'sob|size_desc' => 'sizeD', 'sob|chr_size_asc' => 'chrThenSizeA', 'sob|chr_size_desc' => 'chrThenSizeD', 'sob|chr_score_asc' => 'chrThenScoreA', 'sob|chr_score_desc' => 'chrThenScoreD', 'cvb|strandedness' => 's', 'cvb|histogram' => 'hist', 'cvb|depth' => 'd', 'cvb|split' => 'split', 'ptp|ignore_strand' => 'is', 'ptp|slop_strandedness' => 'ss', 'ptp|minimum_overlap' => 'f', 'ptp|type' => 'type', 'ptp|slop' => 'slop', 'ptp|no_self_hits' => 'rdn', 'sub|strandedness' => 's', 'sub|minimum_overlap' => 'f', 'lib|basename' => 'base', 'lib|organism' => 'org', 'lib|genome_build' => 'db', 'ove|columns' => 'cols', 'grp|group' => 'grp', 'grp|columns' => 'opCols', 'grp|operations' => 'ops', 'ubg|header' => 'header', 'ubg|names' => 'names', 'ubg|empty' => 'empty', 'ubg|filler' => 'filler' ); # # the order in the arrayrefs is the order required # on the command line # # the strings in the arrayrefs (less special chars) # become the keys for named parameters to run_bowtie # # special chars: # # '#' implies optional # '*' implies variable number of this type # <|> implies stdin/stdout redirect # our %command_files = ( 'annotate' => [qw( -i|bgv -files|*ann >#out )], 'bam_to_bed' => [qw( -i|bam >#out )], 'bed_to_bam' => [qw( -i|bgv -g|genome >#out )], 'bed_to_IGV' => [qw( -i|bgv >#out )], 'b12_to_b6' => [qw( -i|bed >#out )], 'fasta_from_bed' => [qw( -fi|seq -bed|bgv -fo|#out )], 'mask_fasta_from_bed' => [qw( -fi|seq -bed|bgv -fo|#out )], 'shuffle' => [qw( -i|bgv -g|genome >#out )], 'window' => [qw( -a|bgv1 -b|bgv2 >#out )], 'closest' => [qw( -a|bgv1 -b|bgv2 >#out )], 'genome_coverage' => [qw( -i|bed -g|genome >#out )], 'merge' => [qw( -i|bgv >#out )], 'slop' => [qw( -i|bgv -g|genome >#out )], 'complement' => [qw( -i|bgv -g|genome >#out )], 'intersect' => [qw( -a|#bgv1 -abam|#bam -b|bgv2 >#out )], # (bgv1|bam) required 'pair_to_bed' => [qw( -a|#bedpe -abam|#bam -b|bgv >#out )], # (bedpe|bam) required 'sort' => [qw( -i|bgv >#out )], 'coverage' => [qw( -a|bgv1 -b|bgv2 >#out )], 'links' => [qw( -i|bgv >#out )], 'pair_to_pair' => [qw( -a|bedpe1 -b|bedpe2 >#out )], 'subtract' => [qw( -a|bgv1 -b|bgv2 >#out )], 'group_by' => [qw( -i|bed >#out )], 'graph_union' => [qw( -i|*bg -g|#genome >#out )], 'overlap' => [qw( -i|bed >#out )] ); our %accepted_types = ( 'ann' => [qw( tab vcf gff )], # BEDTools now has multiple accepted input formats: bed/gff/vcf 'bam' => [qw()], # we need a test for this 'bed' => [qw( tab )], 'bgv' => [qw( tab vcf gff )], # BEDTools now has multiple accepted input formats: bed/gff/vcf 'bgv1' => [qw( tab vcf gff )], # BEDTools now has multiple accepted input formats: bed/gff/vcf 'bgv2' => [qw( tab vcf gff )], # BEDTools now has multiple accepted input formats: bed/gff/vcf 'bedpe' => [qw( tab )], 'bedpe1' => [qw( tab )], 'bedpe2' => [qw( tab )], 'seq' => [qw( fasta )], 'genome' => [qw( tab )], 'bg' => [qw( tab )] ); 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/BlastPlus.pm000077500000000000000000000075401342734133000227770ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::BlastPlus # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::BlastPlus - A wrapper for NCBI's blast+ suite =head1 SYNOPSIS Give standard usage here =head1 DESCRIPTION Blast+ is NCBI's successor to the C family of programs. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us Describe contact details here =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::BlastPlus; use strict; use warnings; use lib '../../..'; use Bio::Root::Root; use Bio::Tools::Run::BlastPlus::Config; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Run::WrapperBase::CommandExts; use base qw(Bio::Tools::Run::WrapperBase Bio::Root::Root); =head2 new Title : new Usage : my $obj = new Bio::Tools::Run::BlastPlus(); Function: Builds a new Bio::Tools::Run::BlastPlus object Returns : an instance of Bio::Tools::Run::BlastPlus Args : =cut sub new { my ($class,@args) = @_; $program_dir ||= $ENV{BLASTPLUSDIR}; my $self = $class->SUPER::new(@args); return $self; } =head2 program_version() Title : program_version Usage : $version = $bedtools_fac->program_version() Function: Returns the program version (if available) Returns : string representing location and version of the program Note : this works around the WrapperBase::version() method conflicting with the -version parameter for SABlast (good argument for not having getter/setters for these) =cut =head2 package_version() Title : package_version Usage : $version = $bedtools_fac->version() Function: Returns the BLAST+ package version (if available) Returns : string representing BLAST+ package version (may differ from version()) =cut sub program_version { my ($self) = @_; if (!defined $self->{program_version}) { $self->_version; } $self->{program_version} || ''; } sub package_version { my ($self) = @_; if (!defined $self->{package_version}) { $self->_version; } $self->{package_version} || ''; } sub _version { my $self = shift; my ($in, $out, $err); # Get program executable my $exe = $self->executable; my @ipc_args = ( $exe, '-version'); eval { IPC::Run::run(\@ipc_args, \$in, \$out, \$err) or die ("There was a problem running $exe : $!"); }; if ($out =~ /blastdbcmd\:\s+(\S+)\n\s*Package\:\s+([^,]+)/xms) { @{$self}{qw(program_version package_version)} = ($1, $2); } else { $self->throw("Unknown version output: $out"); } } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/BlastPlus/000077500000000000000000000000001342734133000224305ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/Tools/Run/BlastPlus/Config.pm000077500000000000000000000203201342734133000241730ustar00rootroot00000000000000#$Id$ package Bio::Tools::Run::BlastPlus::Config; use strict; use warnings; no warnings qw(qw); use Bio::Root::Root; use Exporter; use base qw(Bio::Root::Root); our (@ISA, @EXPORT, @EXPORT_OK); push @ISA, 'Exporter'; @EXPORT = qw( @program_commands %command_prefixes %composite_commands @program_params @program_switches %param_translation %command_files $program_name $program_dir $use_dash $join ); @EXPORT_OK = qw(); # getting the parms and switches from the usage string: #$ blastp -h | perl -ne '@a = m/\[(.*?)\]/g; for $a (@a) { @b = split(/\s+/,$a); $b[0]=~s/-//; $ptr = (@b==1 ? \@s : \@p ); push @$ptr, $b[0]; } END { print "p arms\n", join("\n",@p), "\n\n", "switches\n", join("\n",@s); }' # '*' indicates a 'pseudo'-program : i.e. each # command has its own executable... our $program_name = '*blast+'; our $use_dash = 'single'; our $join = ' '; our @program_commands = qw( run blastn blastx tblastx tblastn blastp psiblast rpsblast rpstblastn makeblastdb blastdb_aliastool blastdbcmd blastdbcheck convert2blastmask dustmasker segmasker windowmasker ); # full command => prefix our %command_prefixes = ( run => 'run', blastn => 'bln', blastx => 'blx', tblastx => 'tbx', tblastn => 'tbn', blastp => 'blp', psiblast => 'psi', rpsblast => 'rps', rpstblastn => 'rpst', makeblastdb => 'mak', blastdb_aliastool => 'dba', blastdbcmd => 'dbc', blastdbcheck => 'dbk', convert2blastmask => 'c2m', dustmasker => 'dms', segmasker => 'sms', windowmasker => 'wms' ); # each elt : pfx|wrapper_parm_name our @program_params = qw( command tbn|import_search_strategy tbn|export_search_strategy tbn|db tbn|dbsize tbn|gilist tbn|negative_gilist tbn|entrez_query tbn|subject tbn|subject_loc tbn|query tbn|out tbn|evalue tbn|word_size tbn|gapopen tbn|gapextend tbn|xdrop_ungap tbn|xdrop_gap tbn|xdrop_gap_final tbn|searchsp tbn|db_gencode tbn|frame_shift_penalty tbn|max_intron_length tbn|seg tbn|soft_masking tbn|matrix tbn|threshold tbn|culling_limit tbn|best_hit_overhang tbn|best_hit_score_edge tbn|window_size tbn|query_loc tbn|outfmt tbn|num_descriptions tbn|num_alignments tbn|max_target_seqs tbn|num_threads tbn|comp_based_stats tbn|in_pssm blx|import_search_strategy blx|export_search_strategy blx|db blx|dbsize blx|gilist blx|negative_gilist blx|entrez_query blx|db_soft_mask blx|subject blx|subject_loc blx|query blx|out blx|evalue blx|word_size blx|gapopen blx|gapextend blx|xdrop_ungap blx|xdrop_gap blx|xdrop_gap_final blx|searchsp blx|frame_shift_penalty blx|max_intron_length blx|seg blx|soft_masking blx|matrix blx|threshold blx|culling_limit blx|best_hit_overhang blx|best_hit_score_edge blx|window_size blx|query_loc blx|strand blx|query_gencode blx|outfmt blx|num_descriptions blx|num_alignments blx|max_target_seqs blx|num_threads bln|import_search_strategy bln|export_search_strategy bln|task bln|db bln|dbsize bln|gilist bln|negative_gilist bln|entrez_query bln|db_soft_mask bln|subject bln|subject_loc bln|query bln|out bln|evalue bln|word_size bln|gapopen bln|gapextend bln|perc_identity bln|xdrop_ungap bln|xdrop_gap bln|xdrop_gap_final bln|searchsp bln|penalty bln|reward bln|min_raw_gapped_score bln|template_type bln|template_length bln|dust bln|filtering_db bln|window_masker_taxid bln|window_masker_db bln|soft_masking bln|culling_limit bln|best_hit_overhang bln|best_hit_score_edge bln|window_size bln|use_index bln|index_name bln|query_loc bln|strand bln|outfmt bln|num_descriptions bln|num_alignments bln|max_target_seqs bln|num_threads blp|import_search_strategy blp|export_search_strategy blp|task blp|db blp|dbsize blp|gilist blp|negative_gilist blp|entrez_query blp|db_soft_mask blp|subject blp|subject_loc blp|query blp|out blp|evalue blp|word_size blp|gapopen blp|gapextend blp|xdrop_ungap blp|xdrop_gap blp|xdrop_gap_final blp|searchsp blp|seg blp|soft_masking blp|matrix blp|threshold blp|culling_limit blp|best_hit_overhang blp|best_hit_score_edge blp|window_size blp|query_loc blp|outfmt blp|num_descriptions blp|num_alignments blp|max_target_seqs blp|num_threads blp|comp_based_stats psi|import_search_strategy psi|export_search_strategy psi|db psi|dbsize psi|gilist psi|negative_gilist psi|entrez_query psi|subject psi|subject_loc psi|query psi|out psi|evalue psi|word_size psi|gapopen psi|gapextend psi|xdrop_ungap psi|xdrop_gap psi|xdrop_gap_final psi|searchsp psi|seg psi|soft_masking psi|matrix psi|threshold psi|culling_limit psi|best_hit_overhang psi|best_hit_score_edge psi|window_size psi|query_loc psi|outfmt psi|num_descriptions psi|num_alignments psi|max_target_seqs psi|num_threads psi|comp_based_stats psi|gap_trigger psi|num_iterations psi|out_pssm psi|out_ascii_pssm psi|in_msa psi|in_pssm psi|pseudocount psi|inclusion_ethresh psi|phi_pattern rpst|import_search_strategy rpst|export_search_strategy rpst|db rpst|dbsize rpst|gilist rpst|negative_gilist rpst|entrez_query rpst|query rpst|out rpst|evalue rpst|word_size rpst|xdrop_ungap rpst|xdrop_gap rpst|xdrop_gap_final rpst|searchsp rpst|query_gencode rpst|seg rpst|soft_masking rpst|window_size rpst|query_loc rpst|strand rpst|outfmt rpst|num_descriptions rpst|num_alignments rpst|max_target_seqs rpst|num_threads mak|in mak|dbtype mak|title mak|mask_data mak|out mak|max_file_sz mak|taxid mak|taxid_map mak|logfile dba|gi_file_in dba|gi_file_out dba|db dba|dbtype dba|title dba|gilist dba|out dba|dblist dba|num_volumes dba|logfile tbx|import_search_strategy tbx|export_search_strategy tbx|db tbx|dbsize tbx|gilist tbx|negative_gilist tbx|entrez_query tbx|subject tbx|subject_loc tbx|query tbx|out tbx|evalue tbx|word_size tbx|gapopen tbx|gapextend tbx|xdrop_ungap tbx|xdrop_gap tbx|xdrop_gap_final tbx|searchsp tbx|max_intron_length tbx|seg tbx|soft_masking tbx|matrix tbx|threshold tbx|culling_limit tbx|best_hit_overhang tbx|best_hit_score_edge tbx|window_size tbx|query_loc tbx|strand tbx|query_gencode tbx|db_gencode tbx|outfmt tbx|num_descriptions tbx|num_alignments tbx|max_target_seqs tbx|num_threads dbc|db dbc|dbtype dbc|entry dbc|entry_batch dbc|pig dbc|range dbc|strand dbc|mask_sequence_with dbc|out dbc|outfmt dbc|line_length c2m|in c2m|out c2m|outfmt dms|in dms|out dms|window dms|level dms|linker dms|outfmt sms|in sms|out sms|infmt sms|outfmt sms|window sms|locut sms|hicut wms|ustat wms|in wms|out wms|checkdup wms|window wms|t_extend wms|t_thres wms|t_high wms|t_low wms|set_t_high wms|set_t_low wms|infmt wms|outfmt wms|sformat wms|convert wms|fa_list wms|mem wms|smem wms|unit wms|genome_size wms|dust wms|dust_level wms|exclude_ids wms|ids wms|text_match wms|use_ba ); # each elt : pfx|wrapper_switch_name our @program_switches = qw( tbn|h tbn|help tbn|ungapped tbn|lcase_masking tbn|parse_deflines tbn|show_gis tbn|html tbn|remote tbn|use_sw_tback tbn|version blx|h blx|help blx|ungapped blx|lcase_masking blx|parse_deflines blx|show_gis blx|html blx|remote blx|version bln|h bln|help bln|no_greedy bln|ungapped bln|lcase_masking bln|parse_deflines bln|show_gis bln|html bln|remote bln|version blp|h blp|help blp|lcase_masking blp|parse_deflines blp|show_gis blp|html blp|ungapped blp|remote blp|use_sw_tback blp|version psi|h psi|help psi|lcase_masking psi|parse_deflines psi|show_gis psi|html psi|remote psi|use_sw_tback psi|version rpst|h rpst|help rpst|ungapped rpst|lcase_masking rpst|parse_deflines rpst|show_gis rpst|html rpst|remote rpst|version mak|h mak|help mak|parse_seqids mak|hash_index mak|version dba|h dba|help dba|version tbx|h tbx|help tbx|lcase_masking tbx|parse_deflines tbx|show_gis tbx|html tbx|remote tbx|version dbc|h dbc|help dbc|info dbc|target_only dbc|get_dups dbc|ctrl_a dbc|version c2m|h c2m|help c2m|parse_seqids c2m|version dms|h dms|help dms|xmlhelp dms|parse_seqids dms|version-full sms|h sms|help sms|xmlhelp sms|parse_seqids sms|version-full wms|h wms|help wms|xmlhelp wms|parse_seqids wms|version-full wms|mk_counts ); #each pair : pfx|wrapper_opt_name => command_line_name (without dashes) # for blast+, the options are all long and mnemonic, so a param translation # isn't required. In CommandExts, a parameter name should be passed through # as-is, if a translation is not found---. our %param_translation = ( ); our %composite_commands = ( ); our %command_files = ( ); 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Coil.pm000077500000000000000000000172631342734133000217570ustar00rootroot00000000000000# Wrapper module for Coil Bio::Tools::Run::Coil # # Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Coil # originally written by Marc Sohrmann (ms2@sanger.ac.uk) # Written in BioPipe by Balamurugan Kumarasamy # Please direct questions and support issues to # # Cared for by the Fugu Informatics team (fuguteam@fugu-sg.org) # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Coil - wrapper for ncoils program =head1 SYNOPSIS # Build a Coil factory my $factory = Bio::Tools::Run::Coil->new($params); # Pass the factory a Bio::Seq object # @feats is an array of Bio::SeqFeature::Generic objects my @feats = $factory->run($seq); =head1 DESCRIPTION This module is a wrapper for the B program available via L for predicting coiled coils in protein sequences. By default it looks for an executable called I and data/parameter files in the directory specified by the I environmental variable. =head1 REFERENCES Lupas, van Dyke & Stock, I, Science B<252>:1162-1164, 1991. Lupas, A., I, Meth. Enzymology B<266>:513-525, 1996. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHORS Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Coil originally written by Marc Sohrmann (ms2@sanger.ac.uk) Written in BioPipe by Balamurugan Kumarasamy # Please direct questions and support issues to # Cared for by the Fugu Informatics team (fuguteam@fugu-sg.org) =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Coil; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @COIL_PARAMS %OK_FIELD); use strict; use Bio::SeqIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Coil; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { @COIL_PARAMS=qw(PROGRAM VERBOSE QUIET SILENT); foreach my $attr ( @COIL_PARAMS) { $OK_FIELD{$attr}++; } } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'ncoils'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{COILSDIR}) if $ENV{COILSDIR}; } =head2 new Title : new Usage : $coil->new(@params) Function: creates a new Coil factory Returns: Bio::Tools::Run::Coil Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/PROGRAM/i) { $self->executable($value); next; } $self->$attr($value); } return $self; } =head2 predict_protein_features Title : predict_protein_features() Usage : DEPRECATED. Use $obj->run instead. Function: Runs Coil and creates an array of featrues Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI =cut sub predict_protein_features{ return shift->run(@_); } =head2 run Title : run Usage : $obj->run($seq) Function: Runs Coil and creates an array of featrues Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI, or a Fasta filename. =cut sub run{ my ($self,$seq) = @_; my @feats; if (ref($seq) ) { # it is an object if (ref($seq) =~ /GLOB/) { $self->throw("cannot use filehandle"); } my $infile1 = $self->_writeSeqFile($seq); $self->_input($infile1); @feats = $self->_run(); unlink $infile1; } else { #The argument is not a seq object but a sequence in a fasta file. #Perhaps should check here or before if this file is fasta format...if not die #Here the file does not need to be created or deleted. Its already written and may be used by other runnables. $self->_input($seq); @feats = $self->_run(); } return @feats; } =head2 _input Title : _input Usage : obj->_input($seqFile) Function: Internal(not to be used directly) Returns : Args : =cut sub _input() { my ($self,$infile1) = @_; if(defined $infile1){ $self->{'input'}=$infile1; } return $self->{'input'}; } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : An array of Bio::SeqFeature::Generic objects Args : =cut sub _run { my ($self)= @_; my ($tfh1,$outfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $str =$self->executable." -f < ".$self->{'input'}." > ".$outfile; if($self->quiet || $self->verbose <=0 || $self->silent){ my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $str.=" 2>$null"; } my $status = system($str); $self->throw( "Coil call ($str) crashed: $? \n") unless $status==0; my $coil_parser = Bio::Tools::Coil->new(); my $filehandle; if (ref ($outfile) !~ /GLOB/) { open (COIL, "<".$outfile) or $self->throw ("Couldn't open file ".$outfile.": $!\n"); $filehandle = \*COIL; } else { $filehandle = $outfile; } my @coil_feat; while(my $coil_feat = $coil_parser->next_result($filehandle)){ push @coil_feat, $coil_feat; } $self->cleanup(); close($tfh1); undef $tfh1; unlink $outfile; return @coil_feat; } =head2 _writeSeqFile Title : _writeSeqFile Usage : obj->_writeSeqFile($seq) Function: Internal(not to be used directly) Returns : Args : =cut sub _writeSeqFile{ my ($self,$seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'Fasta'); $in->write_seq($seq); return $inputfile; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/EMBOSSApplication.pm000066400000000000000000000222121342734133000242300ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::EMBOSSApplication # # Copyright Heikki Lehvaslaiho # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::EMBOSSApplication - class for EMBOSS Applications =head1 SYNOPSIS # Get an EMBOSS factory use Bio::Factory::EMBOSS; $f = Bio::Factory::EMBOSS -> new(); # Get an EMBOSS application object from the factory $water = $f->program('water') || die "Program not found!\n"; # Here is an example of running the application - water can # compare 1 sequence against 1 or more sequences using Smith-Waterman. # Pass a Sequence object and a reference to an array of objects. my $wateroutfile = 'out.water'; $water->run({-asequence => $seq_object, -bsequence => \@seq_objects, -gapopen => '10.0', -gapextend => '0.5', -outfile => $wateroutfile}); # Now you might want to get the alignment use Bio::AlignIO; my $alnin = Bio::AlignIO->new(-format => 'emboss', -file => $wateroutfile); while ( my $aln = $alnin->next_aln ) { # process the alignment -- these will be Bio::SimpleAlign objects } =head1 DESCRIPTION The EMBOSSApplication class can represent any EMBOSS program. It is created by a L object. If you want to check command line options before sending them to the program set $prog-Everbose to positive integer. The ADC description of the available command line options is then parsed in and compared to input. See also L and L. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing lists Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Heikki Lehvaslaiho Email heikki-at-bioperl-dot-org =head2 CONTRIBUTORS Email: jason-AT-bioperl_DOT_org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::EMBOSSApplication; use vars qw( $SEQIOLOADED $ALIGNIOLOADED ); use strict; use Data::Dumper; use Bio::Tools::Run::EMBOSSacd; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); sub new { my($class, $args) = @_; my $self = $class->SUPER::new(); $self->name($args->{'name'}); $self->verbose($args->{'verbose'}); $self->acd if $self->verbose > 0; return $self; } =head2 run Title : run Usage : $embossapplication->run($attribute_hash) Function: Runs the EMBOSS program. Returns : string or creates files for now; will return objects! Args : hash of input to the program =cut sub run { my ($self, $input) = @_; $self->io->_io_cleanup(); # test input $self->debug( Dumper($input) ); # parse ACD information $self->acd if $self->verbose > 0; # collect the options into a string my $option_string = ''; foreach my $attr (keys %{$input}) { my $attr_name = substr($attr, 1) if substr($attr, 0, 1) =~ /\W/; my $array = 0; if( defined $input->{$attr} && ref($input->{$attr}) ) { my (@pieces); if( $array = (ref($input->{$attr}) =~ /array/i) ) { foreach my $s ( @{$input->{$attr}} ) { @pieces = @{$input->{$attr}}; } } else { @pieces = ($input->{$attr}); } if( ! defined $pieces[0] ) { # we ignore for now $self->warn("specified a parameter $attr with no value"); $input->{$attr} = undef; return; } elsif( $pieces[0]->isa('Bio::PrimarySeqI') ) { unless( $SEQIOLOADED ) { require Bio::SeqIO; $SEQIOLOADED = 1; } my ($tfh,$tempfile) = $self->io->tempfile(-dir => $self->tempdir); my $out = Bio::SeqIO->new(-format => 'fasta', -fh => $tfh); foreach my $seq ( @pieces ) { $out->write_seq($seq); } $out->close(); $input->{$attr} = $tempfile; close($tfh); undef $tfh; } elsif( $pieces[0]->isa('Bio::Align::AlignI') ) { unless( $ALIGNIOLOADED ) { require Bio::AlignIO; $ALIGNIOLOADED = 1; } my ($tfh,$tempfile) = $self->io->tempfile(); my $out = Bio::AlignIO->new(-format => 'msf', -fh => $tfh); foreach my $p ( @pieces ) { $out->write_aln($p); } $input->{$attr} = $tempfile; close($tfh); undef $tfh; } } # check each argument against ACD if ($self->verbose > 0) { last unless defined $self->acd; # might not have the parser $self->throw("Attribute [$attr] not recognized!\n") unless $self->acd->qualifier($attr); } # print out debugging info $self->debug("Input attr: ". $attr_name. " => ". $input->{$attr}. "\n"); $option_string .= " " . $attr; $option_string .= " ". $input->{$attr} if defined $input->{$attr}; } # check mandatory attributes against given ones if ($self->verbose > 0) { last unless defined $self->acd; # might not have the parser # $self->acd->mandatory->print; # if ($self->name eq 'water') { # print Dumper($self->acd->mandatory); # } foreach my $attr (keys %{$self->acd->mandatory} ) { last unless defined $self->acd; # might not have the parser unless (defined $input->{$attr}) { print "-" x 38, "\n", "MISSING MANDATORY ATTRIBUTE: $attr\n", "-" x 38, "\n"; $self->acd->print($attr) and $self->throw("Program ". $self->name. " needs attribute [$attr]!\n") } } } my $runstring = join (' ', $self->name, $option_string, '-auto'); $self->debug( "Command line: ", $runstring, "\n"); return `$runstring`; } =head2 acd Title : acd Usage : $embossprogram->acd Function: finds out all the possible qualifiers for this EMBOSS application. They can be used to debug the options given. Throws : Returns : boolean Args : =cut sub acd { my ($self) = @_; unless ( $self->{'_acd'} ) { $self->{'_acd'} = Bio::Tools::Run::EMBOSSacd->new($self->name); } return $self->{'_acd'}; } =head2 name Title : name Usage : $embossprogram->name Function: sets/gets the name of the EMBOSS program Setting is done by the EMBOSSFactory object, you should only get it. Throws : Returns : name string Args : None =cut sub name { my ($self,$value) = @_; if (defined $value) { $self->{'_name'} = $value; } return $self->{'_name'}; } =head2 descr Title : descr Usage : $embossprogram->descr Function: sets/gets the descr of the EMBOSS program Setting is done by the EMBOSSFactory object, you should only get it. Throws : Returns : description string Args : None =cut sub descr { my ($self,$value) = @_; if (defined $value) { $self->{'_descr'} = $value; } return $self->{'_descr'}; } =head2 group Title : group Usage : $embossprogram->group Function: sets/gets the group of the EMBOSS program Setting is done by the EMBOSSFactory object, you should only get it. If the application is assigned into a subgroup use l to get it. Throws : Returns : string, group name Args : group string =cut sub group { my ($self,$value) = @_; if (defined $value) { my ($group, $subgroup) = split ':', $value; $self->{'_group'} = $group; $self->{'_subgroup'} = $subgroup; } return $self->{'_group'}; } =head2 subgroup Title : subgroup Usage : $embossprogram->subgroup Function: sets/gets the subgroup of the EMBOSS program Setting is done by the EMBOSSFactory object, you should only get it. Throws : Returns : string, subgroup name; undef if not defined Args : None =cut sub subgroup { my ($self) = @_; return $self->{'_subgroup'}; } =head2 program_dir Title : program_dir Usage : Function: Required by WrapperBase Throws : Returns : Name of directory with EMBOSS programs Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{EMBOSS_ACDROOT}); } =head2 program_path Title : program_path Usage : Function: Required by WrapperBase Throws : Returns : Full path of program Args : =cut sub program_path { my $self = shift; my $name = $self->{_name}; my $dir = Bio::Root::IO->catfile($ENV{EMBOSS_ACDROOT}); return "$dir/$name"; } =head2 executable Title : executable Usage : Function: Required by WrapperBase Throws : Returns : Name of program Args : =cut sub executable { my $self = shift; $self->{_name}; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/EMBOSSacd.pm000066400000000000000000000234521342734133000225230ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::EMBOSSacd # # # Please direct questions and support issues to # # Cared for by Heikki Lehvaslaiho # # Copyright Heikki Lehvaslaiho # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::EMBOSSacd - class for EMBOSS Application qualifiers =head1 SYNOPSIS # Get an EMBOSS factory use Bio::Factory::EMBOSS; $f = Bio::Factory::EMBOSS -> new(); # Get an EMBOSS application object from the factory $water = $f->program('water') || die "Program not found!\n"; # Here is an example of running the application - water can # compare 1 sequence against 1 or more sequences using Smith-Waterman. # Pass a Sequence object and a reference to an array of objects. my $wateroutfile = 'out.water'; $water->run({-asequence => $seq_object, -bsequence => \@seq_objects, -gapopen => '10.0', -gapextend => '0.5', -outfile => $wateroutfile}); # Now you might want to get the alignment use Bio::AlignIO; my $alnin = Bio::AlignIO->new(-format => 'emboss', -file => $wateroutfile); while ( my $aln = $alnin->next_aln ) { # process the alignment -- these will be Bio::SimpleAlign objects } =head1 DESCRIPTION The EMBOSSacd represents all the possible command line arguments that can be given to an EMBOSS application. Do not create this object directly. It will be created and attached to its corresponding Bio::Tools::Run::EMBOSSApplication if you set $application->verbose > 0 Call $application->acd to retrive the Bio::Tools::Run::EMBOSSApplication::EMBOSSacd object. See also L and L. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing lists Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Heikki Lehvaslaiho Email: heikki-at-bioperl-dot-org Address: EMBL Outstation, European Bioinformatics Institute Wellcome Trust Genome Campus, Hinxton Cambs. CB10 1SD, United Kingdom =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::EMBOSSacd; use vars qw(@ISA %QUALIFIER_CATEGORIES $QUAL %OPT); use strict; use Data::Dumper; use Bio::Root::Root; @ISA = qw(Bio::Root::Root); BEGIN { %QUALIFIER_CATEGORIES = ( 'Mandatory qualifiers' => 'mandatory', 'Standard (Mandatory) qualifiers' => 'mandatory', 'Optional qualifiers' => 'optional', 'Additional (Optional) qualifiers'=> 'optional', 'Advanced qualifiers' => 'advanced', 'Advanced (Unprompted) qualifiers'=> 'advanced', 'Associated qualifiers' => 'associated', 'General qualifiers' => 'general', ); $QUAL; # qualifier category } =head2 new Title : new Usage : $emboss_prog->acd($prog_name); Function: Constructor for the class. Calls EMBOSS program 'acdc', converts the HTML output into XML and uses XML::Twig XML parser to write out a hash of qualifiers which is then blessed. Throws : without program name Returns : new object Args : EMBOSS program name =cut sub new { my($class, $prog) = @_; eval {require XML::Twig;}; Bio::Root::Root->warn("You need XML::Twig for EMBOSS ACD parsing") and return undef if $@; Bio::Root::Root->throw("Need EMBOSSprogram name as an argument") unless $prog; # reset global hash %OPT = (); my $version = `embossversion -auto`; my $file; if ($version lt "2.8.0") { # reading from EMBOSS program acdc stdout (prior to version 2.8.0) $file = `acdc $prog -help -verbose -acdtable 2>&1`; } else { # reading from EMBOSS program acdtable stdout (version 2.8.0 or greater) $file = `acdtable $prog -help -verbose 2>&1`; } # converting HTML -> XHTML for XML parsing $file =~ s/border/border="1"/; $file =~ s/=(\d+)/="$1"/g; $file =~ s/
/
<\/br>/g; $file =~ s/ //g; my $t = XML::Twig->new( TwigHandlers => { '/table/tr' => \&_row } ); $t->safe_parse( $file); #Bio::Root::Root->throw("XML parsing error: $@"); my %acd = %OPT; # copy to a private hash $acd{'_name'} = $prog; bless \%acd, $class; } sub _row { my ($t, $row)= @_; return if $row->text eq "(none)"; # no qualifiers in this category my $name = $row->first_child; # qualifier name my $namet = $name->text; if ($namet =~ /qualifiers$/) { # set category $QUAL = $QUALIFIER_CATEGORIES{$namet}; if( ! defined $QUAL ) { warn("-- namet is $namet\n"); } return; } my $unnamed = 0; if ($namet =~ /\(Parameter (\d+)\)/) { # unnamed parameter $unnamed = $1; $namet =~ s/\(Parameter (\d+)\)//; $namet =~ s/[\[\]]//g ; # name is in brackets } my $desc = $name->next_sibling; my $values = $desc->next_sibling; my $default = $values->next_sibling; $OPT{$namet}{'unnamed'} = $unnamed; $OPT{$namet}{'category'} = $QUAL; $OPT{$namet}{'descr'} = $desc->text; $OPT{$namet}{'values'} = $values->text; $OPT{$namet}{'default'} = $default->text; $t->purge; # to reduce memory requirements } =head2 name Title : name Usage : $embossacd->name Function: sets/gets the name of the EMBOSS program Setting is done by the EMBOSSApplication object, you should only get it. Throws : Returns : name string Args : None =cut sub name { my ($self,$value) = @_; if (defined $value) { $self->{'_name'} = $value; } return $self->{'_name'}; } =head2 print Title : print Usage : $embossacd->print; $embossacd->print('-word'); Function: Print out the qualifiers. Uses Data::Dumper to print the qualifiers into STDOUT. A valid qualifier name given as an argment limits the output. Throws : Returns : print string Args : optional qualifier name =cut sub print { my ($self, $value) = @_; if ($value and $self->{$value}) { print Dumper $self->{$value}; } else { print Dumper $self; } } =head2 mandatory Title : mandatory Usage : $acd->mandatory Function: gets a mandatory subset of qualifiers Throws : Returns : Bio::Tools::Run::EMBOSSacd object Args : none =cut sub mandatory { my ($self) = @_; my %mand; foreach my $key (keys %{$self}) { next unless $key =~ /^-/; #ignore other attributes $mand{$key} = $self->{$key} if $self->{$key}{category} eq 'mandatory'; } bless \%mand; } =head2 Qualifier queries These methods can be used test qualifier names and read values. =cut =head2 qualifier Title : qualifier Usage : $acd->qualifier($string) Function: tests for the existence of the qualifier Throws : Returns : boolean Args : string, name of the qualifier =cut sub qualifier { my ($self, $value) = @_; $self->throw("Qualifier has to start with '-'") unless $value =~ /^-/; $self->{$value} ? 1 : 0 } =head2 category Title : category Usage : $acd->category($qual_name) Function: Return the category of the qualifier Throws : Returns : 'mandatory' or 'optional' or 'advanced' or 'associated' or 'general' Args : string, name of the qualifier =cut sub category { my ($self, $value) = @_; $self->throw("Not a valid qualifier name [$value]") unless $self->qualifier($value); $self->{$value}->{'category'}; } =head2 values Title : values Usage : $acd->values($qual_name) Function: Return the possible values for the qualifier Throws : Returns : string Args : string, name of the qualifier =cut sub values { my ($self, $value) = @_; $self->throw("Not a valid qualifier name [$value]") unless $self->qualifier($value); $self->{$value}->{'values'}; } =head2 descr Title : descr Usage : $acd->descr($qual_name) Function: Return the description of the qualifier Throws : Returns : boolean Args : string, name of the qualifier =cut sub descr { my ($self, $value) = @_; $self->throw("Not a valid qualifier name [$value]") unless $self->qualifier($value); $self->{$value}->{'descr'}; } =head2 unnamed Title : unnamed Usage : $acd->unnamed($qual_name) Function: Find if the qualifier can be left unnamed Throws : Returns : 0 if needs to be named, order number otherwise Args : string, name of the qualifier =cut sub unnamed { my ($self, $value) = @_; $self->throw("Not a valid qualifier name [$value]") unless $self->qualifier($value); $self->{$value}->{'unnamed'}; } =head2 default Title : default Usage : $acd->default($qual_name) Function: Return the default value for the qualifier Throws : Returns : scalar Args : string, name of the qualifier =cut sub default { my ($self, $value) = @_; $self->throw("Not a valid qualifier name [$value]") unless $self->qualifier($value); $self->{$value}->{'default'}; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/ERPIN.pm000066400000000000000000000252021342734133000217330ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::ERPIN # # Please direct questions and support issues to # # Cared for by Chris Fields # # Copyright Chris Fields # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::ERPIN - Wrapper for local execution of the ERPIN suite of programs. =head1 SYNOPSIS #run my @params = ( trset => 'BL.erpin', region => [1, 10], # Set up search strategy this way... strategy => [ 'umask' => [1, 2], 'umask' => [1, 2, 3, 4], 'umask' => [1, 2, 3, 4, 5, 6], 'nomask', 'cutoff' => [0, 10, 15, 20] ] # or use a simple string... #strategy => 'Ðumask 4 Ðadd 5 -nomask -cutoff 0 10 15', pcw => 100 ); my $factory = Bio::Tools::Run::ERPIN->new(-program =>'erpin', @params); # Pass the factory a Bio::Seq object or a file name # Returns a Bio::SearchIO object #my $search = $factory->run("B_sub.fas"); my $search = $factory->run($seq); my @feat; while (my $result = $searchio->next_result){ while(my $hit = $result->next_hit){ while (my $hsp = $hit->next_hsp){ print join("\t", ( $r->query_name, $hit->name, $hsp->hit->start, $hsp->hit->end, $hsp->meta, $hsp->score, )), "\n"; } } } =head1 DESCRIPTION =cut =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Chris Fields Email: cjfields-at-uiuc-dot-edu =head1 CONTRIBUTORS cjfields-at-uiuc-dot-edu =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::ERPIN; use strict; use Bio::SeqIO; use Bio::Root::Root; use Bio::SearchIO; use Bio::AlignIO; use Bio::Tools::Run::WrapperBase; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); # will move parameters to each program, use this for _set_params my %ERPIN_PROGS = ( cfgs => 1, erpin => 1, frandseq => 1, mstat => 1, sview => 1, tstrip => 1, epnstat => 1, ev => 1, mhistview => 1, pview => 1, tstat => 1, tview => 1, ); my %ERPIN_SWITCHES = map {$_ => 1} qw(dmp smp fwd rev fwd+rev long short mute warnings globstat locstat unifstat Eon Eoff hist chrono); # order is important here my @ERPIN_PARAMS=qw(program model file strategy dmp smp fwd rev fwd+rev long short mute warnings globstat locstat unifstat Eon Eoff hist seq1 nseq bgn len logzero tablen chrono pcw hpcw spcw sumf tset); =head2 new Title : new Usage : my $wrapper = Bio::Tools::Run::RNAMotif->new(@params) Function: creates a new RNAMotif factory Returns: Bio::Tools::Run::RNAMotif Args : list of parameters -tempfile => set tempfile flag (default 0) -outfile_name => set file to send output to (default none) =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($out, $tf) = $self->_rearrange([qw(OUTFILE_NAME TEMPFILE)], @args); $self->io->_initialize_io(); if ($tf && !$out) { my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile_name($outfile); } else { $out ||= ''; $self->outfile_name($out); } $tf && $self->tempfile($tf); $self->_set_from_args(\@args, -methods => [@ERPIN_PARAMS], -create => 1 ); return $self; } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { my ($self) = shift; return $self->program(@_); } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{ERPINDIR}) if $ENV{ERPINDIR}; } =head2 version Title : version Usage : $v = $prog->version(); Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return undef unless $self->executable; my $string = `erpin -h 2>&1`; my $v; if ($string =~ m{Version\s([\d.]+)}) { $v = $1; } return $self->{'_progversion'} = $v || $string; } =head2 run Title : run Usage : $obj->run($seqFile) Function: Runs ERPIN programs and returns Bio::SearchIO Returns : Args : Must pass Bio::PrimarySeqI's or file names =cut sub run { my ($self,@seq) = @_; $self->throw ("Must define 'db', pass a file name, or a list of Bio::PrimarySeqI objects") if (!@seq); if (ref $seq[0] && $seq[0]->isa("Bio::PrimarySeqI") ){# it is an object my $infile1 = $self->_writeSeqFile(@seq); return $self->_run($infile1); } else { return $self->_run(@seq); } } =head2 tempfile Title : tempfile Usage : $obj->tempfile(1) Function: Set tempfile flag. When set, writes output to a tempfile; this is overridden by outfile_name() if set Returns : Boolean setting (or undef if not set) Args : [OPTIONAL] Boolean =cut sub tempfile { my $self = shift; return $self->{'_tempfile'} = shift if @_; return $self->{'_tempfile'}; } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : Args : =cut sub _run { my ($self,$file,$prog) = @_; return unless $self->executable; $self->io->_io_cleanup(); my ($str, $progname, $outfile) = ($prog || $self->executable, $self->program_name, $self->outfile_name); my $param_str = $self->_setparams($file); $str .= " $param_str"; $self->debug("ERPIN command: $str\n"); # rnamotif => SearchIO object # rmfmt -a => AlignIO object # all others sent to outfile, tempfile, or STDERR (upon verbose = 1) my $obj = ($progname eq 'erpin') ? Bio::SearchIO->new(-verbose => $self->verbose, -format => "erpin", -version => $self->version, -database => $file ) : undef; my @args; # file-based if ($outfile) { local $SIG{CHLD} = 'DEFAULT'; my $status = system($str); if($status || !-e $outfile || -z $outfile ) { my $error = ($!) ? "$! Status: $status" : "Status: $status"; $self->throw( "ERPIN call crashed: $error \n[command $str]\n"); return undef; } if ($obj && ref($obj)) { $obj->file($outfile); @args = (-file => $outfile); } # fh-based } else { open(my $fh,"$str |") || $self->throw("ERPIN call ($str) crashed: $?\n"); if ($obj && ref($obj)) { $obj->fh($fh); @args = (-fh => $fh); } else { # dump to debugging my $io; while(<$fh>) {$io .= $_;} close($fh); $self->debug($io); return 1; } } # initialize SearchIO/AlignIO...um...IO # (since file/fh set post obj construction) $obj->_initialize_io(@args) if $obj && ref($obj); return $obj || 1; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: creates a string of params to be used in the command string Example : Returns : string of params Args : =cut sub _setparams { my ($self, $file) = @_; my $progname = $self->program_name; # small sanity check $self->throw("Unknown program: $progname") if (!exists $ERPIN_PROGS{$progname} ); my $param_string; my $outfile = ($self->outfile_name) ? ' > '.$self->outfile_name : ''; my ($tset, $st) = ($self->tset, $self->strategy); $param_string = join " ", ($tset, $file, $st); $self->debug("String : $param_string\n"); $self->throw("Must have both a training set and search strategy defined!") if (!defined($tset) || !defined ($st)); my @params; foreach my $attr (@ERPIN_PARAMS) { next if $attr eq 'program' || $attr eq 'tset' || $attr eq 'strategy'; my $value = $self->$attr(); next unless ($attr eq 'file' || defined $value); my $attr_key = '-'.$attr; if (exists $ERPIN_SWITCHES{$attr}) { push @params, $attr_key; } else { if ($attr eq 'file') { push @params, $file; } else { push @params, $attr_key.' '.$value; } } } $param_string .= ' '.join ' ', @params; $param_string .= $outfile if $outfile; return $param_string; } =head2 _writeSeqFile Title : _writeSeqFile Usage : obj->_writeSeqFile($seq) Function: Internal(not to be used directly) Returns : writes passed Seq objects to tempfile, to be used as input for program Args : =cut sub _writeSeqFile { my ($self,@seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'fasta'); foreach my $s(@seq){ $in->write_seq($s); } $in->close(); $in = undef; close($tfh); undef $tfh; return $inputfile; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Ensembl.pm000066400000000000000000000336551342734133000224560ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Ensembl # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Ensembl - A simplified front-end for setting up the registry for, and then using an Ensembl database with the Ensembl Perl API. =head1 SYNOPSIS use Bio::Tools::Run::Ensembl; # get a Bio::EnsEMBL::Gene for agene of interest my $gene = Bio::Tools::Run::Ensembl->get_gene_by_name(-species => 'human', -name => 'BRCA2'); =head1 DESCRIPTION This is a simple way of accessing the Ensembl database to retrieve gene information. Rather than learn the whole Ensembl Perl API, you only need to install it (that is, check it out from CVS: http://www.ensembl.org/info/docs/api/api_installation.html - ignore the information about BioPerl version) and then you can get information about a gene using get_gene_by_name(). For gene retrieval it is especially useful compared to direct Ensembl Perl API usage since it can use information from alternate data sources (orthologues, Swissprot, Entrez) to get your gene. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Ensembl; use strict; use Bio::WebAgent; use Bio::DB::EUtilities; use base qw(Bio::Root::Root); our $ENSEMBL_INSTALLED; our $NODB; our $LOADED_STR; our $TOTAL = 0; our $ORTHS = 0; our $SWISS = 0; our $NCBI = 0; our $BAD = 0; our $GOOD = 0; BEGIN { eval { require Bio::EnsEMBL::Registry; }; $ENSEMBL_INSTALLED = ! $@; $NODB = 0; $LOADED_STR = ''; } =head2 registry_setup Title : registry_setup Usage : Bio::Tools::Run::Ensembl->registry_setup(-host => $host, -user => $user); if (Bio::Tools::Run::Ensembl->registry_setup) {...} Function: Configure the ensembl registy to use a certain database. The database must be an Ensembl database compatible with the Ensembl Perl API, and you must have that API installed for this method to return true. Defaults to anonymous access to ensembldb.ensembl.org Or just ask if the registry is setup and the database ready to use. Returns : boolean (true if Registry loaded and ready to use) Args : -host => host name (defaults to 'ensembldb.ensembl.org') -user => username (defaults to 'anonymous') -pass => password (no default) -port => port (defaults to 3306) -db_version => version of ensembl database to use, if different from your installed Ensembl modules -verbose => boolean (1 to print messages during database connection) -no_database => boolean (1 to disable database access, causing this method to always return false) =cut sub registry_setup { return 0 unless $ENSEMBL_INSTALLED; my $class = shift; my ($host, $user, $pass, $port, $verbose, $no_db, $db_version) = $class->_rearrange([qw(HOST USER PASS PORT VERBOSE NO_DATABASE DB_VERSION)], @_); $host ||= 'ensembldb.ensembl.org'; $user ||= 'anonymous'; $NODB = $no_db if defined($no_db); return 0 if $NODB; my $load_str = $host.$user. (defined $pass ? $pass : '') . (defined $port ? $port : ''); unless ($LOADED_STR eq $load_str) { Bio::EnsEMBL::Registry->load_registry_from_db(-host => $host, -user => $user, defined $pass ? (-pass => $pass) : (), defined $port ? (-port => $port) : (), defined $db_version ? (-db_version => $db_version) : (), -verbose => $verbose); $LOADED_STR = $load_str; } return 1; } =head2 get_adaptor Title : get_adaptor Usage : my $adaptor = Bio::Tools::Run::Ensembl->get_adaptor($species, $type); Function: Get a species-specific 'core' database adaptor, optionally of a certain type. Returns : Bio::EnsEMBL::DBSQL::DBAdaptor, OR if a certain type requested, a Bio::EnsEMBL::DBSQL::${type}Adaptor Args : Bio::Species or string (species name) (REQUIRED), AND optionally string (the type of adaptor, eg. 'Gene' or 'Slice'). =cut sub get_adaptor { my ($class, $species, $type) = @_; return unless $class->registry_setup; return unless $species; if (ref($species)) { $species = $species->scientific_name; } return Bio::EnsEMBL::Registry->get_adaptor($species, 'core', $type) if $type; return Bio::EnsEMBL::Registry->get_DBAdaptor($species, 'core'); } =head2 get_gene_by_name Title : get_gene_by_name Usage : my $gene = Bio::Tools::Run::Ensembl->get_gene_by_name(); Function: Get a gene given species and a gene name. If multiple genes match this combination, tries to pick the 'best' match. Returns : Bio::EnsEMBL::Gene Args : -species => Bio::Species or string (species name), REQUIRED -name => string: gene name, REQUIRED If searching for the supplied gene name in the supplied species results in no genes, or more than one, you can choose what else is attempted in order to find just one gene: -use_orthologues => Bio::Species or string (species name), or array ref of such things: see if any of these supplied species have (unambiguously) a gene with the supplied gene name and if a (one-to-one) orthologue of that gene in that species is present in the main desired species supplied to -species, returns that orthologous gene. (default: none, do not use orthologues) -use_swiss_lookup => boolean: queries swissprot at expasy and if a suitable match is found, queries ensembl with the swissprot id. (default: 0, do not use swiss) -use_entrez_lookup => boolean: queries entrez at the NCBI server if (only) a single gene could not be found by any other method, then query ensembl with the entrez gene id. (default: 0, do not use NCBI) (Attempts proceed in this order and return as soon as one method is successful.) -strict => boolean: return undef with no warnings if more than one, or zero genes were found. (default: 0, warnings are issued and if many genes were found, one of them is returned) =cut sub get_gene_by_name { my $class = shift; return unless $class->registry_setup; my ($species, $gene_name, $use_swiss, $use_orth, $use_entrez, $strict) = $class->_rearrange([qw(SPECIES NAME USE_SWISS_LOOKUP USE_ORTHOLOGUES USE_ENTREZ_LOOKUP STRICT)], @_); $species || $class->throw("You must supply a -species"); $gene_name || $class->throw("You must supply a -name"); my $taxid; if (ref($species)) { $taxid = $species->id; $species = $species->scientific_name; } $TOTAL++; #print ". "; my $gene_adaptor = $class->get_adaptor($species, 'Gene') || return; # get the first gene that matches our query, warn if more than one did my @genes = @{$gene_adaptor->fetch_all_by_external_name($gene_name)}; my $gene = shift(@genes); # if not good enough, try again using orthologues if ($use_orth && (! $gene || @genes > 0)) { my @tests; if (ref($use_orth) && ref($use_orth) eq 'ARRAY') { @tests = @{$use_orth}; } else { @tests = ($use_orth); } my $alias_species = Bio::EnsEMBL::Registry->get_alias($species); foreach my $test_species (@tests) { $test_species = $test_species->scientific_name if ref($test_species); $test_species eq $species and next; my $test_gene = $class->get_gene_by_name(-species => $test_species, -name => $gene_name, -strict => 1) || next; my $homologue_results_ref = $test_gene->get_all_homologous_Genes(); # get the species and gene id of each homologue foreach my $result_ref (@{$homologue_results_ref}) { my ($homolog_gene, $homology, $homolog_species) = @{$result_ref}; # get_alias returns lower case, underscored version of what we get here $homolog_species = lc($homolog_species); $homolog_species =~ s/ /_/g; $homolog_species eq $alias_species or next; $homology->description eq 'UBRH' or next; $gene = $homolog_gene; $ORTHS++; last; } $gene and last; } } # if not good enough, try again using swissprot if ($use_swiss && (! $gene || @genes > 0)) { my $swiss_id; #*** swiss look up should be farmed out to some dedicated class my $swiss_name = lc($gene_name); my $swiss_species = lc($species); $swiss_species =~ s/\s/+/g; my $url = "http://www.expasy.org/cgi-bin/get-entries?db=sp&db=tr&DE=&GNc=AND&GN=$swiss_name&OC=$swiss_species&view=&num=100"; my $web_agent = Bio::WebAgent->new(); $web_agent->url($url); my $rq = HTTP::Request->new(GET=>$url); my $reply = $web_agent->request($rq); if ($reply->is_error) { $class->throw($reply->as_string()."\nError getting for url $url!\n"); } my $content = $reply->content; if ($content && $content !~ /No entries have been found/) { my @possibles = split(" 'y', -verbose => -1); my $esummary = Bio::DB::EUtilities->new(-eutil => 'esummary', -history => $esearch->next_History); eval {$esummary->parse_data;}; if (!$@) { my $ncbi_id; while (my $docsum = $esummary->next_DocSum) { my $item = $docsum->get_Item_by_name('Name'); if (lc($item->get_content) eq lc($gene_name)) { $ncbi_id = $docsum->get_id; last; } } if ($ncbi_id) { @genes = @{$gene_adaptor->fetch_all_by_external_name($ncbi_id)}; $gene = shift(@genes); $NCBI++ if ($gene && @genes == 0); } } } if (@genes > 0) { return if $strict; #$class->warn("Species '$species' had multiple matches to gene '$gene_name', using first gene '".$gene->display_id."'"); } unless ($gene) { return if $strict; $BAD++; #$class->warn("Species '$species' didn't have gene '$gene_name'"); return; } $GOOD++; return $gene; } sub _stats { print "$TOTAL | $ORTHS | $SWISS | $NCBI | good vs bad = $GOOD vs $BAD\n"; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Eponine.pm000066400000000000000000000236001342734133000224530ustar00rootroot00000000000000# # Please direct questions and support issues to # # Cared for by Tania Oh # # Copyright Tania Oh # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Eponine - Object for execution of the Eponine which is a mammalian TSS predictor =head1 SYNOPSIS use Bio::Tools::Run::Eponine; use strict; my $seq = "/data/seq.fa"; my $threshold = "0.999"; my @params = ( '-seq' => $seq, '-threshold' => $threshold, '-epojar' => '/usr/local/bin/eponine-scan.jar', '-java' => '/usr/local/bin/java'); my $factory = Bio::Tools::Run::Eponine->new(@params); # run eponine against fasta my $r = $factory->run($seq); my $parser = Bio::Tools::Eponine->new($r); while (my $feat = $parser->next_prediction){ #$feat contains array of SeqFeature foreach my $orf($feat){ print $orf->seqname. "\n"; } } # Various additional options and input formats are available. See # the DESCRIPTION section for details. =head1 DESCRIPTION wrapper for eponine, a mammalian TSS predictor. The environment variable EPONINEDIR must be set to point at either the directory which contains eponine-scan.jar or directly at the jar which eponine-scan classfiles. NOTE: EPONINEDIR must point at the real file not a symlink. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Email gisoht@nus.edu.sg =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Eponine; #tgot to take inmore parameters use vars qw($AUTOLOAD @ISA @EPONINE_PARAMS %EPONINE_PARAMS $EPOJAR $JAVA $PROGRAMDIR $PROGRAMNAME $PROGRAM $TMPDIR $TMPOUTFILE $DEFAULT_THRESHOLD %OK_FIELD); use strict; use Bio::Tools::Eponine; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { $DEFAULT_THRESHOLD = 50; $PROGRAMNAME = 'java'; $EPOJAR = 'eponine-scan.jar'; if( ! defined $PROGRAMDIR ) { $PROGRAMDIR = $ENV{'JAVA_HOME'} || $ENV{'JAVA_DIR'}; } if (defined $PROGRAMDIR) { foreach my $progname ( [qw(java)],[qw(bin java)] ) { my $f = Bio::Root::IO->catfile($PROGRAMDIR, @$progname); if( -e $f && -x $f ) { $PROGRAM = $f; last; } } } if( $ENV{'EPONINEDIR'} ) { if ( -d $ENV{'EPONINEDIR'} ) { $EPOJAR = Bio::Root::IO->catfile($ENV{'EPONINEDIR'}, $EPOJAR) } elsif(-e $ENV{'EPONINEDIR'}) { $EPOJAR = $ENV{'EPONINEDIR'}; } if ( ! -e $EPOJAR) { $EPOJAR =undef; } } %EPONINE_PARAMS = ('SEQ' => '/tmp/test.fa', 'THRESHOLD' => '0.999', 'EPOJAR' => '/usr/local/bin/eponine-scan.jar', 'JAVA' => '/usr/java/jre1.3.1_02/bin/java'); @EPONINE_PARAMS=qw(SEQ THRESHOLD JAVA EPOJAR); foreach my $attr ( @EPONINE_PARAMS) { $OK_FIELD{$attr}++; } } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $self->debug( "************ attr: $attr\n"); $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } sub new { my ($caller, @args) = @_; # chained new my $self = $caller->SUPER::new(@args); # so that tempfiles are cleaned up my $java; my $seq; my $threshold; my $epojar; my ($attr, $value); ($TMPDIR) = $self->tempdir(CLEANUP=>1); my $tfh; ($tfh,$TMPOUTFILE) = $self->io->tempfile(-dir => $TMPDIR); close($tfh); undef $tfh; while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/JAVA/i) { $java = $value; next; } if ($attr =~ /EPOJAR/i){ $epojar = $value; next; } if ($attr =~ /THRESHOLD/i){ $threshold = $value; next; } if ($attr =~ /SEQ/i){ $seq = $value; next; } $self->$attr($value); } $self->{'_java'} = undef; # location of java vm $self->{'_epojar'} = undef; # location of eponine-scan.jar executable JAR file. $self->{'_threshold'} = 0.999; # minimum posterior for filtering predictions $self->{'_filename'} = undef; #location of seq $seq = $EPONINE_PARAMS{'seq'} unless defined $seq; $threshold = $EPONINE_PARAMS{'threshold'} unless defined $threshold; if (! defined $epojar && defined $EPOJAR) { $epojar = $EPOJAR; } else { $epojar = $EPONINE_PARAMS{'epojar'} unless defined $epojar; } if (! defined $java && defined $PROGRAM) { $java = $PROGRAM; } else { $java = $EPONINE_PARAMS{'JAVA'} unless defined $java; } $self->filename($seq) if ($seq); if (-x $java) { # full path assumed $self->java($java); } $self->epojar($epojar) if (defined $epojar); if (defined $threshold && $threshold >=0 ){ $self->threshold($threshold); } else { $self->threshold($DEFAULT_THRESHOLD); } return $self; } =head2 java Title : java Usage : $obj->java('/usr/opt/java130/bin/java'); Function: Get/set method for the location of java VM Args : File path (optional) =cut sub executable { shift->java(@_); } sub java { my ($self, $exe,$warn) = @_; if( defined $exe ) { $self->{'_pathtojava'} = $exe; } unless( defined $self->{'_pathtojava'} ) { if( $PROGRAM && -e $PROGRAM && -x $PROGRAM ) { $self->{'_pathtojava'} = $PROGRAM; } else { my $exe; if( ( $exe = $self->io->exists_exe($PROGRAMNAME) ) && -x $exe ) { $self->{'_pathtojava'} = $exe; } else { $self->warn("Cannot find executable for $PROGRAMNAME") if $warn; $self->{'_pathtojava'} = undef; } } } $self->{'_pathtojava'}; } =head2 epojar Title : epojar Usage : $obj->epojar('/some/path/to/eponine-scan.jar'); Function: Get/set method for the location of the eponine-scan executable JAR Args : Path (optional) =cut sub epojar { my ($self, $location) = @_; if ($location) { unless( $location ) { $self->warn("eponine-scan.jar not found at $location: $!\n"); return; } $self->{'_epojar'} = $location ; } return $self->{'_epojar'}; } =head2 threshold Title : threshold Usage : my $threshold = $self->threshold Function: Get/Set the threshold for Eponine Returns : string Args : b/w 0.9 and 1.0 =cut sub threshold{ my ($self, $threshold) = @_; if (defined $threshold) { $self->{'_threshold'} = $threshold ; } return $self->{'_threshold'}; } =head2 run Title : run Usage : my @genes = $self->run($seq) Function: runs Eponine and creates an array of features Returns : An Array of SeqFeatures Args : A Bio::PrimarySeqI =cut sub run{ my ($self,$seq) = @_; my $infile = $self->_setinput($seq); my @tss = $self->_run_eponine($infile); return @tss; } =head2 predict_TSS Title : predict_TSS Usage : Alias for run() =cut sub predict_TSS { return shift->run(@_); } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: writes input sequence to file and return the file name Example : Returns : string Args : =cut sub _setinput { my ($self,$seq) = @_; #better be a file if(!ref $seq){ return $seq; } my ($tfh1,$inputfile) = $self->tempfile(-dir=>$TMPDIR); my $in = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'Fasta'); $in->write_seq($seq); close($tfh1); undef $tfh1; return ($inputfile); } =head2 _run_eponine Title : run_eponine Usage : $obj->_run_eponine() Function: execs the Java VM to run eponine Returns : none Args : none =cut sub _run_eponine { my ($self,$infile) = @_; my $result = $TMPOUTFILE; my @tss; #run eponine $self->debug( "Running eponine-scan\n"); my ($java,$epojar) = ( $self->java, $self->epojar); unless( defined $java && -e $java && -x $java ) { $self->warn("Cannot find java"); return; } if (! defined $epojar) { $self->warn("Don't know the name of the Eponine jar file"); return; } if (! -e $epojar) { $self->warn("Cannot find Eponine jar: $epojar - either you specified an incorrect path in\nEPONINEDIR or it was not in the current working directory"); return; } my $cmd = $self->java.' -jar '.$self->epojar.' -seq '.$infile.' -threshold '.$self->threshold." > ".$result; $self->throw("Error running eponine-scan on ".$self->filename. " \n Check your java version, it has to be version 1.2 or later. Eponine crashed ($cmd) crashed: $? \n") if (system ($cmd)); #parse results even though it's wierd.. thought parser and wrapper should be separate my $epoParser = Bio::Tools::Eponine->new(-file =>$result); while (my $tss = $epoParser->next_prediction()){ push (@tss, $tss); } return @tss; } 1; __END__ bioperl-run-release-1-7-3/lib/Bio/Tools/Run/FootPrinter.pm000066400000000000000000000375231342734133000233420ustar00rootroot00000000000000# BioPerl module for FootPrinter # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::FootPrinter - wrapper for the FootPrinter program =head1 SYNOPSIS use Bio::Tools::Run::FootPrinter; my @params = (size => 10, max_mutations_per_branch => 4, sequence_type => 'upstream', subregion_size => 30, position_change_cost => 5, triplet_filtering => 1, pair_filtering => 1, post_filtering => 1, inversion_cost => 1, max_mutations => 4, tree => "~/software/FootPrinter2.0/tree_of_life" ); my $fp = Bio::Tools::Run::FootPrinter->new(@params, -verbose => 1); my $sio = Bio::SeqIO->new(-file => "seq.fa", -format => "fasta"); while (my $seq = $sio->next_seq){ push @seq, $seq; } my @fp = $fp->run(@seq); foreach my $result(@fp){ print "***************\n".$result->seq_id."\n"; foreach my $feat($result->sub_SeqFeature){ print $feat->start."\t".$feat->end."\t".$feat->seq->seq."\n"; } } =head1 DESCRIPTION From the FootPrinter manual: FootPrinter is a program that performs phylogenetic footprinting. It takes as input a set of unaligned orthologous sequences from various species, together with a phylogenetic tree relating these species. It then searches for short regions of the sequences that are highly conserved, according to a parsimony criterion. The regions identified are good candidates for regulatory elements. By default, the program searches for regions that are well conserved across all of the input sequences, but this can be relaxed to find regions conserved in only a subset of the species =head2 About Footprinter Written by Mathieu Blanchette and Martin Tompa. Available here: http://www.mcb.mcgill.ca/~blanchem/FootPrinter2.1.tar.gz =head2 Running Footprinter To run FootPrinter, you will need to set the environment variable FOOTPRINTER_DIR to where the binary is located (even if the executable is in your path). For example: setenv FOOTPRINTER_DIR /usr/local/bin/FootPrinter2.0/ =head2 Available Parameters PARAM VALUES DESCRIPTION ------------------------------------------------------------------------ tree REQUIRED, Tree in Newick Format to evaluate parsimony score REQUIRED unless tree_of_life exists in FOOTPRINTER_DIR sequence_type upstream Default upstream downstream other size 4-16 Specifies the size of the motifs sought max_mutations 0-20 maximum parsimony score allowed for the motifs max_mutations_per_branch 0-20 Allows at most a fixed number of mutations per branch of the tree losses files give span constraints so that the motifs reported are statistically significant Example files universal([6-9]|1[0-2])(loose|tight)?.config come with FootPrinter2.0. Install these in FOOTPRINTER_DIR and use by setting "losses" to "somewhat significant", "significant", or "very significant". Do not set loss_cost. loss_cost 0-20 a cost associated with losing a motif along some branch of the tre subregion_size 1-infinity penalize motifs whose position in the sequences varies too much position_change_cost 0-20 Cost for changing subregion triplet_filtering 1/0 pre-filtering step that removes from consideration any substring that does not have a sufficiently good pair of matching substrings in some pair of the other input sequences pair_filtering 1/0 Same as triplet filtering, but looks only for one match per other sequence post_filtering 1/0 when used in conjunction with the triplet filtering option, this often significantly speeds up the program, while still garanteeing optimal results indel_cost 1-5 insertions and deletions will be allowed in the motifs sought, at the given cost inversion_cost 1-5 This option allows for motifs to undergo inversions, at the given cost. An inversion reverse-complements the motif. details 1/0 Shows some of the details about the progress of the computation html 1/0 produce html output (never deleted) ps 1/0 produce postscript output (never deleted) =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a "_". =cut package Bio::Tools::Run::FootPrinter; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @FP_SWITCHES @FP_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Cwd; use Bio::Root::Root; use Bio::Tools::Run::WrapperBase; use Bio::Tools::FootPrinter; use Bio::SeqIO; # Let the code begin... @ISA = qw(Bio::Tools::Run::WrapperBase); BEGIN { @FP_PARAMS = qw(SEQUENCE_TYPE SIZE MAX_MUTATIONS MAX_MUTATIONS_PER_BRANCH LOSSES LOSS_COST TREE PROGRAM SUBREGION_SIZE POSITION_CHANGE_COST INDEL_COST INVERSION_COST ); @FP_SWITCHES = qw(TRIPLET_FILTERING PAIR_FILTERING POST_FILTERING DETAILS); @OTHER_SWITCHES = qw(QUIET HTML PS); # Authorize attribute fields foreach my $attr ( @FP_PARAMS, @FP_SWITCHES, @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'FootPrinter'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{FOOTPRINTER_DIR}) if $ENV{FOOTPRINTER_DIR}; } =head2 executable Title : executable Usage : my $exe = $footprinter->executable('FootPrinter'); Function: Finds the full path to the 'FootPrinter' executable Returns : string representing the full path to the exe Args : [optional] name of executable to set path to [optional] boolean flag whether or not warn when exe is not found =cut sub executable { my $self = shift; my $exe = $self->SUPER::executable(@_) || return; # even if its executable, we still need the environment variable to have # been set if (! $ENV{FOOTPRINTER_DIR}) { $self->warn("Environment variable FOOTPRINTER_DIR must be set, even if the FootPrinter executable is in your path"); return; } return $exe; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new Title : new Usage : $rm->new($seq) Function: creates a new wrapper Returns: Bio::Tools::Run::FootPrinter Args : self =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters $self->$attr($value); } if(!$self->tree && -e $ENV{FOOTPRINTER_DIR}."/tree_of_life"){ $self->tree($ENV{FOOTPRINTER_DIR}."/tree_of_life"); } unless($self->tree){ $self->debug("Phylogenetic tree not provided. FootPrinter won't be able to run without it. use \$fp->tree to set the tree file"); } return $self; } =head2 run Title : run Usage : $fp->run(@seq) Function: carry out FootPrinter Example : Returns : An array of SeqFeatures Args : An array of Bio::PrimarySeqI compliant object At least 2 are needed. =cut sub run { my ($self,@seq) = @_; #need at least 2 for comparative genomics duh. $#seq > 0 || $self->throw("Need at least two sequences"); $self->tree || $self->throw("Need to specify a phylogenetic tree using -tree option"); my $infile = $self->_setinput(@seq); my $param_string = $self->_setparams(); my @footprint_feats = $self->_run($infile,$self->tree,$param_string); return @footprint_feats; } =head2 _run Title : _run Usage : $fp->_run ($filename,$param_string) Function: internal function that runs FootPrinter Example : Returns : an array of features Args : the filename to the input sequence, filename to phylo tree and the parameter string =cut sub _run { my ($self,$infile,$tree,$param_string) = @_; my $instring; my $exe = $self->executable || return; $self->debug( "Program ".$self->executable."\n"); my $outfile = $infile.".seq.txt"; my $cmd_str = $self->executable. " $infile $tree $param_string"; $self->debug("FootPrinter command = $cmd_str"); if ($self->verbose <=0){ my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $cmd_str.= " >&$null > $null"; } # will do brute-force clean up of junk files generated by FootPrinter my $cwd = cwd(); opendir(my $cwd_dir, $cwd) || $self->throw("Could not open the current directory '$cwd'!"); my %ok_files; foreach my $thing (readdir($cwd_dir)) { if ($thing =~ /^mlc\./) { $ok_files{$thing} = 1; } } closedir($cwd_dir); my $status = system($cmd_str); $self->throw("FootPrinter Call($cmd_str) crashed: $?\n") unless $status == 0 || $status==256; unless (open (FP, $outfile)) { $self->throw("Cannot open FootPrinter outfile for parsing"); } my $fp_parser = Bio::Tools::FootPrinter->new(-fh=>\*FP); my @fp_feat; while(my $fp_feat = $fp_parser->next_feature){ push @fp_feat, $fp_feat; } unless( $self->save_tempfiles ) { unlink $outfile; unlink $infile; # is this dangerous?? unlink "$infile.order.txt"; # is this dangerous?? opendir($cwd_dir, $cwd) || $self->throw("Could not open the current directory '$cwd'!"); foreach my $thing (readdir($cwd_dir)) { if ($thing =~ /^mlc\./) { unlink($thing) unless $ok_files{$thing}; } } closedir($cwd_dir); $self->cleanup(); } return @fp_feat; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for FootPrinter program Example : Returns : parameter string to be passed to FootPrinter Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); $self = shift; my $param_string = ""; for $attr ( @FP_PARAMS ) { $value = $self->$attr(); next if $attr=~/TREE/i; next unless (defined $value); my $attr_key = lc $attr; #put params in format expected by dba if ($attr_key eq 'losses' && $value =~ /^\s*(somewhat|very)?\s*significant\s*$/) { $value = "$ENV{FOOTPRINTER_DIR}/universal".$self->size(); if (defined $1) { if ($1 eq 'somewhat') { $value .= 'loose'; } else { # $1 eq 'very' $value .= 'tight'; } } $value .= '.config'; -f $value or $self->throw("universal losses file $value does not exist"); } $attr_key = ' -'.$attr_key; $param_string .= $attr_key.' '.$value; } for $attr ( @FP_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; #put switches in format expected by dba $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } $self->html() or $param_string .= " -no_html"; $self->ps() or $param_string .= " -no_ps"; return $param_string; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: writes input sequence to file and return the file name Example : Returns : string Args : a Bio::PrimarySeqI compliant object =cut sub _setinput { my ($self,@seq) = @_; my ($tfh1,$outfile1); $outfile1 = $self->outfile_name(); if (defined $outfile1) { open($tfh1,">$outfile1"); } else { ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); } my $out1 = Bio::SeqIO->new('-fh' => $tfh1, '-format' => 'Fasta'); foreach my $seq(@seq){ $seq->isa("Bio::PrimarySeqI") || $self->throw("Need a Bio::PrimarySeq compliant object for FootPrinter"); $out1->write_seq($seq); } $out1->close(); # close the SeqIO object close($tfh1); # close the fh explicitly (just in case) undef($tfh1); # really get rid of it (just in case) return ($outfile1); } =head1 Bio::Tools::Run::Wrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $codeml->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $codeml->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Genemark.pm000066400000000000000000000207501342734133000226120ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Genemark # # Please direct questions and support issues to # # Cared for by Bioperl # # Copyright Bioperl, Mark Johnson # # Special thanks to Chris Fields, Sendu Bala # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Genemark - Wrapper for local execution of the GeneMark family of programs. =head1 SYNOPSIS # GeneMark.hmm (prokaryotic) my $factory = Bio::Tools::Run::Genemark->new('-program' => 'gmhmmp', '-m' => 'model.icm'); # Pass the factory Bio::Seq objects # returns a Bio::Tools::Genemark object my $genemark = $factory->run($seq); =head1 DESCRIPTION Wrapper module for the GeneMark family of programs. Should work with all flavors of GeneMark.hmm at least, although only the prokaryotic version has been tested. General information about GeneMark is available at L. Contact information for licensing inquiries is available at: L Note that GeneMark.hmm (prokaryotic at least) will only process the first sequence in a fasta file (if you run() more than one sequence at a time, only the first will be processed). =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark Johnson Email: johnsonm-at-gmail-dot-com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Genemark; use strict; use warnings; use Bio::SeqIO; use Bio::Root::Root; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Genemark; use English; use IPC::Run; # Should be okay on WIN32 (See IPC::Run Docs) use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); our @params = (qw(program)); our @genemark_params = (qw(i m p)); our @genemark_switches = (qw(a n r)); =head2 program_name Title : program_name Usage : $factory>program_name() Function: gets/sets the program name Returns: string Args : string =cut sub program_name { my ($self, $val) = @_; $self->program($val) if $val; return $self->program(); } =head2 program_dir Title : program_dir Usage : $factory->program_dir() Function: gets/sets the program dir Returns: string Args : string =cut sub program_dir { my ($self, $val) = @_; $self->{'_program_dir'} = $val if $val; return $self->{'_program_dir'}; } =head2 new Title : new Usage : $genemark->new(@params) Function: creates a new Genemark factory Returns: Bio::Tools::Run::Genemark Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->io->_initialize_io(); $self->_set_from_args( \@args, -methods => [ @params, @genemark_params, @genemark_switches, ], -create => 1, ); unless (defined($self->program())) { $self->throw('Must specify program'); } unless (defined($self->m())) { $self->throw('Must specify model'); } return $self; } =head2 run Title : run Usage : $obj->run($seq_file) Function: Runs Genemark Returns : A Bio::Tools::Genemark object Args : An array of Bio::PrimarySeqI objects =cut sub run { my ($self, @seq) = @_; unless (@seq) { $self->throw("Must supply at least one Bio::PrimarySeqI"); } foreach my $seq (@seq) { unless ($seq->isa('Bio::PrimarySeqI')) { $self->throw("Object does not implement Bio::PrimarySeqI"); } } my $program_name = $self->program_name(); my $file_name = $self->_write_seq_file(@seq); # GeneMark.hmm (prokaryotic version, anyway) ignores sequences after the # first in a fasta file if ($program_name eq 'gmhmmp') { if (@seq > 1) { $self->warn("Program $program_name processes one sequence at a time"); } } return $self->_run($file_name, $seq[0]->display_id()); } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : An instance of Bio::Tools::Genemark Args : file name, sequence identifier (optional) =cut sub _run { my ($self, $seq_file_name, $seq_id) = @_; my ($temp_fh, $temp_file_name) = $self->io->tempfile(-dir=>$self->tempdir()); close($temp_fh); # IPC::Run wants an array where the first element is the executable my @cmd = ( $self->executable(), split(/\s+/, $self->_setparams()), '-o', $temp_file_name, $seq_file_name, ); my $cmd = join(' ', @cmd); $self->debug("GeneMark Command = $cmd"); # Run the program via IPC::Run so: # 1) The console doesn't get cluttered up with the program's STDERR/STDOUT # 2) We don't have to embed STDERR/STDOUT redirection in $cmd # 3) We don't have to deal with signal handling (IPC::Run should take care # of everything automagically. my ($program_stdout, $program_stderr); eval { IPC::Run::run( \@cmd, \undef, \$program_stdout, \$program_stderr, ) || die $CHILD_ERROR; }; if ($EVAL_ERROR) { $self->throw("GeneMark call crashed: $EVAL_ERROR"); } ## The prokaryotic version of GeneMark.HMM, at least, returns ## 0 (success) even when the license has expired. if ((-z $temp_file_name) && ($program_stderr =~ /license period has ended/i)) { $self->throw($program_stderr); } elsif ($program_stderr =~ /\d+ days remaining/i) { $self->warn($program_stderr); } $self->debug(join("\n", 'GeneMark STDOUT:', $program_stdout)) if $program_stdout; $self->debug(join("\n", 'GeneMark STDERR:', $program_stderr)) if $program_stderr; return Bio::Tools::Genemark->new(-file => $temp_file_name, -seqname => $seq_id); } sub _setparams { my ($self) = @_; my $param_string = $self->SUPER::_setparams( -params => [@genemark_params], -switches => [@genemark_switches], -dash => 1, ); # Kill leading and trailing whitespace $param_string =~ s/^\s+//g; $param_string =~ s/\s+$//g; return $param_string; } =head2 _write_seq_file Title : _write_seq_file Usage : obj->_write_seq_file($seq) or obj->_write_seq_file(@seq) Function: Internal(not to be used directly) Returns : Name of a temp file containing program output Args : One or more Bio::PrimarySeqI objects =cut sub _write_seq_file { my ($self, @seq) = @_; my ($fh, $file_name) = $self->io->tempfile(-dir=>$self->tempdir()); my $out = Bio::SeqIO->new(-fh => $fh , '-format' => 'Fasta'); foreach my $seq (@seq){ $out->write_seq($seq); } close($fh); $out->close(); return $file_name; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Genewise.pm000066400000000000000000000305511342734133000226270ustar00rootroot00000000000000# # Please direct questions and support issues to # # Cared for by # # Copyright to a FUGU Student Intern # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Genewise - Object for predicting genes in a given sequence given a protein =head1 SYNOPSIS # Build a Genewise alignment factory my $factory = Bio::Tools::Run::Genewise->new(); # Pass the factory 2 Bio:SeqI objects (in the order of query peptide # and target_genomic). # @genes is an array of Bio::SeqFeature::Gene::GeneStructure objects my @genes = $factory->run($protein_seq, $genomic_seq); # Alternatively pass the factory a profile HMM filename and a # Bio:SeqI object (in the order of query HMM and target_genomic). # Set hmmer switch first to tell genewise to expect an HMM $factory->hmmer(1); my @genes = $factory->run($hmmfile, $genomic_seq); =head1 DESCRIPTION Genewise is a gene prediction program developed by Ewan Birney http://www.sanger.ac.uk/software/wise2. =head2 Available Params: NB: These should be passed without the '-' or they will be ignored, except switches such as 'hmmer' (which have no corresponding value) which should be set on the factory object using the AUTOLOADed methods of the same name. Model [-codon,-gene,-cfreq,-splice,-subs,-indel,-intron,-null] Alg [-kbyte,-alg] HMM [-hmmer] Output [-gff,-gener,-alb,-pal,-block,-divide] Standard [-help,-version,-silent,-quiet,-errorlog] =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - FUGU Student Intern Email: fugui@worf.fugu-sg.org =head1 CONTRIBUTORS Jason Stajich jason-AT-bioperl_DOT_org Keith James kdj@sanger.ac.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Genewise; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @GENEWISE_SWITCHES @GENEWISE_PARAMS @OTHER_SWITCHES %OK_FIELD); use Bio::SeqIO; use Bio::SeqFeature::Generic; use Bio::SeqFeature::Gene::Exon; use Bio::Root::Root; use Bio::Tools::Run::WrapperBase; use Bio::SeqFeature::FeaturePair; use Bio::SeqFeature::Gene::Transcript; use Bio::SeqFeature::Gene::GeneStructure; use Bio::Tools::Genewise; use Bio::Tools::AnalysisResult; use strict; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase ); # Two ways to run the program ..... # 1. define an environmental variable WISEDIR # export WISEDIR =/usr/local/share/wise2.2.0 # where the wise2.2.20 package is installed # # 2. include a definition of an environmental variable WISEDIR in # every script that will use DBA.pm # $ENV{WISEDIR} = '/usr/local/share/wise2.2.20'; BEGIN { @GENEWISE_PARAMS = qw( DYMEM CODON GENE CFREQ SPLICE GENESTATS INIT SUBS INDEL INTRON NULL INSERT SPLICE_MAX_COLLAR SPLICE_MIN_COLLAR GW_EDGEQUERY GW_EDGETARGET GW_SPLICESPREAD KBYTE HNAME ALG BLOCK DIVIDE GENER U V S T G E M); @GENEWISE_SWITCHES = qw(HELP SILENT QUIET ERROROFFSTD TREV PSEUDO NOSPLICE_GTAG SPLICE_GTAG NOGWHSP GWHSP TFOR TABS BOTH HMMER ); # Authorize attribute fields foreach my $attr ( @GENEWISE_PARAMS, @GENEWISE_SWITCHES, @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'genewise'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{WISEDIR},"/src/bin/") if $ENV{WISEDIR}; } sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return undef unless $self->executable; my $prog = $self->executable; my $string = `$prog -version`; if( $string =~ /Version:\s+\$\s*Name:\s+(\S+)\s+\$/ ) { return $1; } elsif( $string =~ /(Version *)/i ) { return $1; } else { return undef; } } =head2 predict_genes Title : predict_genes Usage : DEPRECATED. Use $factory->run($seq1,$seq2) Function: Predict genes Returns : A Bio::Seqfeature::Gene:GeneStructure object Args : Name of a file containing a set of 2 fasta sequences in the order of peptide and genomic sequences or else 2 Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or 2 Bio::Seq objects. If arguments are strings, throws exception if file corresponding to string name can not be found. =cut sub predict_genes { return shift->run(@_); } =head2 run Title : run Usage : 2 sequence objects $genes = $factory->run($seq1, $seq2); Function: run Returns : A Bio::Seqfeature::Gene:GeneStructure object Args : Names of a files each containing a fasta sequence in the order of either (peptide sequence, genomic sequence) or (profile HMM, genomic sequence). Alternatively any of the fasta sequence filenames may be substituted with a Bio::Seq object. Throws an exception if argument is not either a string (eg a filename) or Bio::Seq objects. If arguments are strings, throws exception if file corresponding to string name can not be found. Also throws an exception if a profile HMM is expected (the -hmmer genewise switch has been set). =cut sub run{ my ($self, $seq1, $seq2) = @_; my ($attr, $value, $switch); $self->io->_io_cleanup(); # Create input file pointer my ($infile1,$infile2)= $self->_setinput($seq1, $seq2); if (!($infile1 && $infile2)) {$self->throw("Bad input data (sequences need an id ) ");} # run genewise my @genes = $self->_run($infile1,$infile2); return @genes; } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: Makes actual system call to a genewise program Example : Returns : L Args : Name of a files containing 2 sequences in the order of peptide and genomic =cut sub _run { my ($self,$infile1,$infile2) = @_; my $instring; $self->debug("Program ".$self->executable."\n"); unless ( $self->executable ) { $self->throw("Cannot run Genewise unless the executable is found. Check your environment variables or make sure genewise is in your path."); } my $paramstring = $self->_setparams; my $commandstring = $self->executable." $paramstring $infile1 $infile2"; # this is to capture STDERR messages which leak out when you run programs # with open(FH, "... |"); if (($self->silent && $self->quiet) && ($^O !~ /os2|dos|amigaos/)) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $commandstring .= " 2> $null"; } my ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); $self->debug("genewise command = $commandstring"); my $status = system("$commandstring > $outfile1"); $self->throw("Genewise call $commandstring crashed: $? \n") unless $status == 0; my $genewiseParser = Bio::Tools::Genewise->new(-file=> $outfile1); my @genes; while (my $gene = $genewiseParser->next_prediction()) { push @genes, $gene; } close ($tfh1); undef ($tfh1); return @genes; } sub get_strand { my ($self,$start,$end) = @_; $start || $self->throw("Need a start"); $end || $self->throw("Need an end"); my $strand; if ($start > $end) { my $tmp = $start; $start = $end; $end = $tmp; $strand = -1; } else { $strand = 1; } return ($start,$end,$strand); } sub _setinput { my ($self, $arg1, $seq2) = @_; my ($tfh1,$tfh2,$outfile1,$outfile2); $self->throw("calling with not enough arguments") unless $arg1 && $seq2; # Not going to set _query_pep/_subject_dna_seq if you pass in a # filename unless( ref($arg1) ) { unless( -e $arg1 ) { if ($self->hmmer) { $self->throw("Argument1 was not a HMMER profile HMM file\n") } else { $self->throw("Argument1 is not a Bio::PrimarySeqI object nor file\n"); } } $outfile1 = $arg1; } else { if ($self->hmmer) { $self->throw("Argument1 was not a HMMER profile HMM file\n") } else { ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); my $out1 = Bio::SeqIO->new('-fh' => $tfh1, '-format' => 'fasta'); $out1->write_seq($arg1); $self->_query_pep_seq($arg1); # Make sure you close things - this is what creates # Out of filehandle errors close($tfh1); undef $tfh1; } } unless( ref($seq2) ) { unless( -e $seq2 ) { $self->throw("Sequence2 is not a Bio::PrimarySeqI object nor file\n"); } $outfile2 = $seq2; } else { ($tfh2,$outfile2) = $self->io->tempfile(-dir=>$self->tempdir); my $out2 = Bio::SeqIO->new('-fh' => $tfh2, '-format' => 'fasta'); $out2->write_seq($seq2); $self->_subject_dna_seq($seq2); # Make sure you close things - this is what creates # Out of filehandle errors close($tfh2); undef $tfh2; } return ($outfile1,$outfile2); } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: creates a string of params to be used in the command string Example : Returns : string of params Args : =cut sub _setparams { my ($self) = @_; my $param_string; foreach my $attr(@GENEWISE_PARAMS){ my $value = $self->$attr(); next unless (defined $value); my $attr_key = ' -'.(lc $attr); $param_string .= $attr_key.' '.$value; } foreach my $attr(@GENEWISE_SWITCHES){ my $value = $self->$attr(); next unless (defined $value); my $attr_key = ' -'.(lc $attr); $param_string .=$attr_key; } $param_string = $param_string." -genesf"; #specify the output option return $param_string; } =head2 _query_pep_seq Title : _query_pep_seq Usage : Internal function, not to be called directly Function: get/set for the query sequence Example : Returns : Args : =cut sub _query_pep_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_query_pep_seq'} = $seq; } return $self->{'_query_pep_seq'}; } =head2 _subject_dna_seq Title : _subject_dna_seq Usage : Internal function, not to be called directly Function: get/set for the subject sequence Example : Returns : Args : =cut sub _subject_dna_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_subject_dna_seq'} = $seq; } return $self->{'_subject_dna_seq'}; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Genscan.pm000066400000000000000000000136771342734133000224510ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Genscan # # Please direct questions and support issues to # # Cared for by # # Copyright Balamurugan Kumarasamy # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Genscan - Object for identifying genes in a given sequence given a matrix(for appropriate organisms). =head1 SYNOPSIS # Build a Genscan factory my $param = ('MATRIX'=>HumanIso.smat); my $factory = Bio::Tools::Run::Genscan->new($param); # Pass the factory a Bio::Seq object #@genes is an array of Bio::Tools::Predictions::Gene objects my @genes = $factory->run($seq); =head1 DESCRIPTION Genscan is a gene identifying program developed by Christopher Burge http://genes.mit.edu/burgelab/ By default it looks for an executable called I and data/parameter files in the directory specified by the I environmental variable. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Bala Email savikalpa@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Genscan; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @GENSCAN_PARAMS %OK_FIELD); use strict; use Bio::Seq; use Bio::SeqIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Genscan; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { @GENSCAN_PARAMS=qw(MATRIX VERBOSE QUIET); foreach my $attr ( @GENSCAN_PARAMS) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'genscan'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{GENSCANDIR}); } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters $self->$attr($value); } return $self; } =head2 predict_genes() Title : predict_genes() Usage : DEPRECATED: use $obj->run($seq) instead Function: Runs genscan and creates an array of Genes Returns : An array of Bio::Tools::Prediction::Gene objects Args : A Bio::PrimarySeqI =cut sub predict_genes{ return shift->run(@_); } =head2 run Title : run Usage : $obj->run($seq) Function: Runs genscan and creates an array of Genes Returns : An array of Bio::Tools::Prediction::Gene objects Args : A Bio::PrimarySeqI =cut sub run { my ($self,$seq) = @_; my $infile1 = $self->_writeSeqFile($seq); $self->_set_input($infile1); my @feat = $self->_run(); return @feat; } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : An array of Bio::Tools::Prediction::Gene objects Args : =cut sub _run { my ($self) = @_; my @genes; my $gene; my $str = $self->executable.' '.$self->MATRIX.' '.$self->{'input'}; if($self->verbose){ $str.=" -v "; } if($self->quiet){ my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; open(STDERR,">$null"); } unless (open(GENSCAN, "$str |")){ $self->warn("Cannot run $str"); } close(STDERR); my $genScanParser = Bio::Tools::Genscan->new(-fh=> \*GENSCAN); while( $gene = $genScanParser->next_prediction()){ push(@genes, $gene); } $self->cleanup(); return @genes; } =head2 _set_input() Title : _set_input Usage : obj->_set_input($matrixFile,$seqFile) Function: Internal(not to be used directly) Returns : Args : =cut sub _set_input() { my ($self,$infile1) = @_; $self->{'input'}=$infile1; } =head2 _writeSeqFile() Title : _writeSeqFile Usage : obj->_writeSeqFile($seq) Function: Internal(not to be used directly) Returns : Args : =cut sub _writeSeqFile(){ my ($self,$seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'fasta'); $in->write_seq($seq); $in->close(); close($tfh); undef $tfh; return $inputfile; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Glimmer.pm000066400000000000000000000266451342734133000224660ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Glimmer # # Please direct questions and support issues to # # Cared for by Bioperl # # Copyright Bioperl, Mark Johnson # # Special thanks to Chris Fields, Sendu Bala # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Glimmer - Wrapper for local execution of Glimmer, GlimmerM and GlimmerHMM. =head1 SYNOPSIS # glimmer2 my $factory = Bio::Tools::Run::Glimmer->new('-program' => 'glimmer3', '-model' => 'model.icm'); # glimmer3 my $factory = Bio::Tools::Run::Glimmer->new('-program' => 'glimmer2', '-model' => 'model.icm'); # glimmerm my $factory = Bio::Tools::Run::Glimmer->new('-program' => 'glimmerm'); # glimmerHMM my $factory = Bio::Tools::Run::Glimmer->new('-program' => 'glimmerHMM'); # Pass the factory Bio::Seq objects # returns a Bio::Tools::Glimmer object my $glimmer = $factory->run($seq); or my $glimmer = $factor->run(@seq); =head1 DESCRIPTION Wrapper module for the Glimmer family of programs. Should work with all currently available flavors: Glimmer, GlimmerM and GlimmerHMM. However, only Glimmer 2.X and 3.X have been tested. Glimmer is open source and available at L. GlimmerM is open source and available at L. GlimmerHMM is open source and available at L. Note that Glimmer 2.X will only process the first sequence in a fasta file (if you run() more than one sequence at a time, only the first will be processed). Note that Glimmer 3.X produces two output files. This module only passes the .predict file to Bio::Tools::Glimmer. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark Johnson Email: johnsonm-at-gmail-dot-com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Glimmer; use strict; use warnings; use Bio::SeqIO; use Bio::Root::Root; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Glimmer; use English; use IPC::Run; # Should be okay on WIN32 (See IPC::Run Docs) use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); our @params = (qw(program model)); our @glimmer2_params = (qw(C L g i o p q s t w)); our @glimmer2_switches = (qw(M X f l r)); our @glimmer3_params = (qw(A C E L M P Z b g i t z)); our @glimmer3_switches = (qw(X f l o q r)); our @glimmerM_params = (qw(d g t)); our @glimmerM_switches = (qw(5 3 f r s)); our @glimmerHMM_params = (qw(d n p)); our @glimmerHMM_switches = (qw(f h v)); =head2 program_name Title : program_name Usage : $factory>program_name() Function: gets/sets the program name Returns: string Args : string =cut sub program_name { my ($self, $val) = @_; $self->program($val) if $val; return $self->program(); } =head2 program_dir Title : program_dir Usage : $factory->program_dir() Function: gets/sets the program dir Returns: string Args : string =cut sub program_dir { my ($self, $val) = @_; $self->{'_program_dir'} = $val if $val; return $self->{'_program_dir'}; } =head2 model Title : model Usage : $factory>model() Function: gets/sets the name of the model (icm) file Returns: string Args : string =cut sub model { my ($self, $val) = @_; $self->{'_model'} = $val if $val; return $self->{'_model'}; } =head2 new Title : new Usage : $glimmer->new(@params) Function: creates a new Glimmer factory Returns: Bio::Tools::Run::Glimmer Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->io->_initialize_io(); $self->_set_from_args( \@args, -methods => [ @params, @glimmer2_params, @glimmer2_switches, @glimmer3_params, @glimmer3_switches, @glimmerM_params, @glimmerM_switches, @glimmerHMM_params, @glimmerHMM_switches ], -create => 1, ); unless (defined($self->program())) { $self->throw('Must specify program'); } unless (defined($self->model())) { $self->throw('Must specify model'); } return $self; } =head2 run Title : run Usage : $obj->run($seq_file) Function: Runs Glimmer/GlimmerM/GlimmerHMM Returns : A Bio::Tools::Glimmer object Args : An array of Bio::PrimarySeqI objects =cut sub run{ my ($self, @seq) = @_; unless (@seq) { $self->throw("Must supply at least one Bio::PrimarySeqI"); } foreach my $seq (@seq) { unless ($seq->isa('Bio::PrimarySeqI')) { $self->throw("Object does not implement Bio::PrimarySeqI"); } } my $program_name = $self->program_name(); my $file_name = $self->_write_seq_file(@seq); my @run_args = ( $file_name ); # Glimmer 2.X ignores sequences after the first in a fasta file # Glimmer 3.X will process multiple sequences at once if ($program_name eq 'glimmer2') { if (@seq > 1) { $self->warn("Program $program_name processes one sequence at a time"); } push @run_args, $seq[0]->display_id(); push @run_args, $seq[0]->length(); } return $self->_run(@run_args); } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : An instance of Bio::Tools::Glimmer Args : file name, sequence identifier (optional) =cut sub _run { my ($self, $seq_file_name, $seq_id, $seq_length) = @_; my @cmd = ( $self->executable(), $seq_file_name, $self->model(), split(/\s+/, $self->_setparams()), ); my $cmd = join(' ', @cmd); $self->debug("Glimmer Command = $cmd"); my $program_name = $self->program_name(); my ($output_fh, $output_file_name, $detail_file_name); my ($program_stdout, $program_stderr); my @ipc_args = (\@cmd, \undef); # No STDOUT option for glimmer3, it takes a # 'tag' argument, and outputs tag.predict and # tag.detail. It seems that tag can be a path, # which is handy. if ($program_name eq 'glimmer3') { my $temp_dir = $self->tempdir(); my $glimmer3_tag = "$temp_dir/glimmer3"; push @cmd, $glimmer3_tag; $output_file_name = "$glimmer3_tag.predict"; $detail_file_name = "$glimmer3_tag.detail"; push @ipc_args, \$program_stdout, \$program_stderr; } else { ($output_fh, $output_file_name) = $self->io->tempfile(-dir=>$self->tempdir()); close($output_fh); push @ipc_args, '>', $output_file_name; push @ipc_args, '2>', \$program_stderr; } # Run the program via IPC::Run so: # 1) The console doesn't get cluttered up with the program's STDERR/STDOUT # 2) We don't have to embed STDERR/STDOUT redirection in $cmd # 3) We don't have to deal with signal handling (IPC::Run should take care # of everything automagically. eval { IPC::Run::run(@ipc_args) || die $CHILD_ERROR;; }; if ($EVAL_ERROR) { $self->throw("Glimmer call crashed: $EVAL_ERROR"); } $self->debug(join("\n", 'Glimmer STDOUT:', $program_stdout)) if $program_stdout; $self->debug(join("\n", 'Glimmer STDERR:', $program_stderr)) if $program_stderr; my %parser_args = (-file => $output_file_name); # Pass along $seq_id and $seq_length if they were provided # (only should be for glimmer2). if (defined($seq_id)) { $parser_args{-seqname } = $seq_id; } if (defined($seq_length)) { $parser_args{-seqlength} = $seq_length; } # Pass along the name of extra output file, with handy information about # sequence lengths (only produced by glimmer3) if (defined($detail_file_name)) { $parser_args{-detail} = $detail_file_name; } return Bio::Tools::Glimmer->new(%parser_args); } sub _setparams { my ($self) = @_; my $param_string = $self->SUPER::_setparams( -params => [ @glimmer2_params, @glimmer3_params, @glimmerM_params, @glimmerHMM_params, ], -switches => [ @glimmer2_switches, @glimmer2_switches, @glimmerM_switches, @glimmerHMM_switches, ], -dash => 1 ); # Kill leading and trailing whitespace $param_string =~ s/^\s+//g; $param_string =~ s/\s+$//g; return $param_string; } =head2 _write_seq_file Title : _write_seq_file Usage : obj->_write_seq_file($seq) or obj->_write_seq_file(@seq) Function: Internal(not to be used directly) Returns : Name of a temp file containing program output Args : One or more Bio::PrimarySeqI objects =cut sub _write_seq_file { my ($self, @seq) = @_; my ($fh, $file_name) = $self->io->tempfile(-dir=>$self->tempdir()); my $out = Bio::SeqIO->new(-fh => $fh , '-format' => 'Fasta'); foreach my $seq (@seq){ $out->write_seq($seq); } close($fh); $out->close(); return $file_name; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Hmmer.pm000077500000000000000000000427361342734133000221440ustar00rootroot00000000000000# You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Hmmer - Wrapper for local execution of hmmalign, hmmbuild, hmmcalibrate, hmmemit, hmmpfam, hmmsearch =head1 SYNOPSIS # run hmmsearch (similar for hmmpfam) my $factory = Bio::Tools::Run::Hmmer->new(-hmm => 'model.hmm'); # Pass the factory a Bio::Seq object or a file name, returns a Bio::SearchIO my $searchio = $factory->hmmsearch($seq); while (my $result = $searchio->next_result){ while(my $hit = $result->next_hit){ while (my $hsp = $hit->next_hsp){ print join("\t", ( $result->query_name, $hsp->query->start, $hsp->query->end, $hit->name, $hsp->hit->start, $hsp->hit->end, $hsp->score, $hsp->evalue, $hsp->seq_str, )), "\n"; } } } # build a hmm using hmmbuild my $aio = Bio::AlignIO->new(-file => "protein.msf", -format => 'msf'); my $aln = $aio->next_aln; my $factory = Bio::Tools::Run::Hmmer->new(-hmm => 'model.hmm'); $factory->hmmbuild($aln); # calibrate the hmm $factory->calibrate(); # emit a sequence stream from the hmm my $seqio = $factory->hmmemit(); # align sequences to the hmm my $alnio = $factory->hmmalign(@seqs); =head1 DESCRIPTION Wrapper module for Sean Eddy's HMMER suite of program to allow running of hmmalign, hmmbuild, hmmcalibrate, hmmemit, hmmpfam and hmmsearch. Binaries are available at http://hmmer.janelia.org/ You can pass most options understood by the command-line programs to new(), or set the options by calling methods with the same name as the argument. In both instances, case sensitivity matters. Additional methods are hmm() to specifiy the hmm file (needed for all HMMER programs) which you would normally set in the call to new(). The HMMER programs must either be in your path, or you must set the environment variable HMMERDIR to point to their location. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email: shawnh-at-gmx.net =head1 CONTRIBUTORS Shawn Hoon shawnh-at-gmx.net Jason Stajich jason -at- bioperl -dot- org Scott Markel scott -at- scitegic -dot com Sendu Bala bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Hmmer; use strict; use Bio::SeqIO; use Bio::SearchIO; use Bio::AlignIO; use base qw(Bio::Tools::Run::WrapperBase); our $DefaultFormat = 'msf'; our $DefaultReadMethod = 'hmmer'; our %ALL = (quiet => 'q', o => 'outfile'); our @ALIGN_PARAMS = qw(mapali outformat withali o); our @ALIGN_SWITCHES = qw(m oneline q); our @BUILD_PARAMS = qw(n archpri cfile gapmax idlevel null pam pamwgt pbswitch prior swentry swexit o); our @BUILD_SWITCHES = qw(f g s A F amino binary fast hand noeff nucleic wblosum wgsc wme wnone wpb wvoronoi); our @CALIBRATE_PARAMS = qw(fixed histfile mean num sd seed cpu); our @CALIBRATE_SWITCHES = qw(); our @EMIT_PARAMS = qw(n seed o); our @EMIT_SWITCHES = qw(c q); our @PFAM_PARAMS = qw(A E T Z domE domT informat cpu); our @PFAM_SWITCHES = qw(n acc cut_ga cut_gc cut_nc forward null2 xnu); our @SEARCH_PARAMS = @PFAM_PARAMS; our @SEARCH_SWITCHES = @PFAM_SWITCHES; our %OTHER = (_READMETHOD => '_readmethod', program_name => [qw(PROGRAM program)], hmm => [qw(HMM db DB)]); # just to be explicit our @UNSUPPORTED = qw(h verbose a compat pvm); =head2 new Title : new Usage : $HMMER->new(@params) Function: Creates a new HMMER factory Returns : Bio::Tools::Run::HMMER Args : -hmm => filename # the hmm, used by all program types; if not set # here, must be set with hmm() method prior to # running anything -_READMETHOD => 'hmmer' (default) || 'hmmer_pull' # the parsing # module to use for # hmmpfam/hmmsearch Any option supported by a Hmmer program, where switches are given a true value, eg. -q => 1, EXCEPT for the following which are handled internally/ incompatible: h verbose a compat pvm WARNING: the default sequence format passed to hmmpfam is msf. If you are using a different format, you need to pass it with informat. e.g. my $factory = Bio::Tools::Run::Hmmer->new(-hmm => 'model.hmm', -informat => 'fasta'); -q is synonymous with -quiet -o is synonymous with -outfile # may be specified here, allowing run() to be used, or # it can be omitted and the corresponding method (eg. # hmmalign()) used later. -program => hmmalign|hmmbuild|hmmcalibrate|hmmemit|hmmpfam|hmmsearch =cut sub new { my($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => {(map { $_ => $ALL{$_} } keys %ALL), (map { $_ => $OTHER{$_} } keys %OTHER), (map { $_ => $_ } (@ALIGN_PARAMS, @ALIGN_SWITCHES, @BUILD_PARAMS, @BUILD_SWITCHES, @CALIBRATE_PARAMS, @CALIBRATE_SWITCHES, @EMIT_PARAMS, @EMIT_SWITCHES, @PFAM_PARAMS, @PFAM_SWITCHES, @SEARCH_PARAMS, @SEARCH_SWITCHES))}, -create => 1, -case_sensitive => 1); $self->informat || $self->informat($DefaultFormat); $self->_READMETHOD || $self->_READMETHOD($DefaultReadMethod); return $self; } =head2 run Title : run Usage : $obj->run($seqFile) Function: Runs one of the Hmmer programs, according to the current setting of program() (as typically set during new(-program => 'name')). Returns : A Bio::SearchIO, Bio::AlignIO, Bio::SeqIO or boolean depending on the program being run (see method corresponding to program name for details). Args : A Bio::PrimarySeqI, Bio::Align::AlignI or filename =cut sub run { my $self = shift; my $program = lc($self->program_name || $self->throw("The program must already be specified")); $self->can($program) || $self->throw("'$program' wasn't a valid program"); return $self->$program(@_); } =head2 hmmalign Title : hmmalign Usage : $obj->hmmalign() Function: Runs hmmalign Returns : A Bio::AlignIO Args : list of Bio::SeqI OR Bio::Align::AlignI OR filename of file with sequences or an alignment =cut sub hmmalign { my $self = shift; $self->program_name('hmmalign'); my $input = $self->_setinput(@_); unless (defined $self->o()) { $self->q(1); } if (! $self->outformat) { $self->outformat($DefaultFormat); } return $self->_run($input); } =head2 hmmbuild Title : hmmbuild Usage : $obj->hmmbuild() Function: Runs hmmbuild, outputting an hmm to the file currently set by method hmm() or db(), or failing that, o() or outfile(), or failing that, to a temp location. Returns : true on success Args : Bio::Align::AlignI OR filename of file with an alignment =cut sub hmmbuild { my $self = shift; $self->program_name('hmmbuild'); my $input = $self->_setinput(@_); unless (defined $self->hmm()) { $self->hmm($self->o() || $self->io->tempfile(-dir => $self->tempdir)); } return $self->_run($input); } =head2 hmmcalibrate Title : hmmcalibrate Usage : $obj->hmmcalibrate() Function: Runs hmmcalibrate Returns : true on success Args : none (hmm() must be set, most likely by the -hmm option of new()), OR optionally supply an hmm filename to set hmm() and run =cut sub hmmcalibrate { my ($self, $hmm) = @_; $self->program_name('hmmcalibrate'); $self->hmm($hmm) if $hmm; $self->hmm || $self->throw("hmm() must be set first"); return $self->_run(); } =head2 hmmemit Title : hmmemit Usage : $obj->hmmemit() Function: Runs hmmemit Returns : A Bio::SeqIO Args : none (hmm() must be set, most likely by the -hmm option of new()), OR optionally supply an hmm filename to set hmm() and run =cut sub hmmemit { my ($self, $hmm) = @_; $self->program_name('hmmemit'); $self->hmm($hmm) if $hmm; $self->hmm || $self->throw("hmm() must be set first"); unless (defined $self->o()) { $self->q(1); } return $self->_run(); } =head2 hmmpfam Title : hmmpfam Usage : $obj->hmmpfam() Function: Runs hmmpfam Returns : A Bio::SearchIO Args : A Bio::PrimarySeqI, Bio::Align::AlignI or filename =cut sub hmmpfam { my $self = shift; $self->program_name('hmmpfam'); my $input = $self->_setinput(@_); return $self->_run($input); } =head2 hmmsearch Title : hmmsearch Usage : $obj->hmmsearch() Function: Runs hmmsearch Returns : A Bio::SearchIO Args : A Bio::PrimarySeqI, Bio::Align::AlignI or filename =cut sub hmmsearch { my $self = shift; $self->program_name('hmmsearch'); my $input = $self->_setinput(@_); return $self->_run($input); } =head2 _setinput Title : _setinput Usage : $obj->_setinput() Function: Internal(not to be used directly) Returns : filename Args : A Bio::PrimarySeqI, Bio::Align::AlignI or filename =cut sub _setinput { my ($self, @things) = @_; @things || $self->throw("At least one input is required"); my $infile; if (ref $things[0] && $things[0]->isa("Bio::PrimarySeqI") ){# it is an object $infile = $self->_writeSeqFile(@things); } elsif(ref $things[0] && $things[0]->isa("Bio::Align::AlignI")){ $infile = $self->_writeAlignFile(@things); } elsif (-e $things[0]) { $infile = $things[0]; } else { $self->throw("Unknown kind of input '@things'"); } return $infile; } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : Bio::SearchIO Args : file name =cut sub _run { my ($self, $file) = @_; # Use double quotes if file path have empty spaces if ($file =~ m/ /) { $file = "\"$file\""; } my $str = $self->executable; # Use double quotes if executable path have empty spaces if ($str =~ m/ /) { $str = "\"$str\""; } $str .= $self->_setparams; $str .= ' '.$file if $file; $self->debug("HMMER command = $str"); my $progname = $self->program_name; my @in; my @verbose = (-verbose => $self->verbose); if ($progname =~ /align|build|emit/) { my $outfile = $self->o; if ($outfile || $progname eq 'hmmbuild') { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $str .= " > $null" if $self->quiet; if ($progname eq 'hmmbuild') { my $status = system($str); return $status ? 0 : 1; } else { system($str) && $self->throw("HMMER call ($str) crashed: $?\n"); @in = (-file => $outfile); } } else { open(my $fh, "$str |") || $self->throw("HMMER call ($str) crashed: $?\n"); @in = (-fh => $fh); } } elsif ($progname =~ /pfam|search/i) { open(my $fh, "$str |") || $self->throw("HMMER call ($str) crashed: $?\n"); return Bio::SearchIO->new(-fh => $fh, @verbose, -format => $self->_READMETHOD); } if ($progname eq 'hmmalign') { return Bio::AlignIO->new(@in, @verbose, -format => $self->outformat); } elsif ($progname eq 'hmmemit') { return Bio::SeqIO->new(@in, @verbose, -format => 'fasta'); } elsif ($progname =~ /calibrate/) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $str .= " > $null 2> $null" if $self->quiet; my $status = system($str); return $status ? 0 : 1; } } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: creates a string of params to be used in the command string Returns : string of params Args : none =cut sub _setparams { my $self = shift; my @execparams; my @execswitches; SWITCH: for ($self->program_name) { /align/ && do { @execparams = @ALIGN_PARAMS; @execswitches = @ALIGN_SWITCHES; last SWITCH; }; /build/ && do { @execparams = @BUILD_PARAMS; @execswitches = @BUILD_SWITCHES; last SWITCH; }; /calibrate/ && do { @execparams = @CALIBRATE_PARAMS; @execswitches = @CALIBRATE_SWITCHES; last SWITCH; }; /emit/ && do { @execparams = @EMIT_PARAMS; @execswitches = @EMIT_SWITCHES; last SWITCH; }; /pfam/ && do { @execparams = @PFAM_PARAMS; @execswitches = @PFAM_SWITCHES; last SWITCH; }; /search/ && do { @execparams = @SEARCH_PARAMS; @execswitches = @SEARCH_SWITCHES; last SWITCH; }; } my $param_string = $self->SUPER::_setparams(-params => \@execparams, -switches => \@execswitches, -mixed_dash => 1); my $hmm = $self->hmm || $self->throw("Need to specify either HMM file or Database"); # Use double quotes if hmm path have empty spaces if ($hmm =~ m/ /) { $hmm = "\"$hmm\""; } $param_string .= ' '.$hmm; return $param_string; } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : none =cut sub program_name { my $self = shift; if (@_) { $self->{program_name} = shift; # hack so that when program_name changes, so does executable() delete $self->{'_pathtoexe'}; } return $self->{program_name} || ''; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : none =cut sub program_dir { return $ENV{HMMERDIR} if $ENV{HMMERDIR}; } =head2 _writeSeqFile Title : _writeSeqFile Usage : obj->_writeSeqFile($seq) Function: Internal(not to be used directly) Returns : filename Args : list of Bio::SeqI =cut sub _writeSeqFile { my ($self, @seq) = @_; my ($tfh, $inputfile) = $self->io->tempfile(-dir=>$self->tempdir); $self->informat('fasta'); my $out = Bio::SeqIO->new(-fh => $tfh , '-format' => 'fasta'); foreach my $s (@seq) { $out->write_seq($s); } $out->close(); $out = undef; close($tfh); undef $tfh; return $inputfile; } =head2 _writeAlignFile Title : _writeAlignFile Usage : obj->_writeAlignFile($seq) Function: Internal(not to be used directly) Returns : filename Args : list of Bio::Align::AlignI =cut sub _writeAlignFile{ my ($self, @align) = @_; my ($tfh, $inputfile) = $self->io->tempfile(-dir=>$self->tempdir); my $out = Bio::AlignIO->new('-fh' => $tfh, '-format' => $self->informat); foreach my $a (@align) { $out->write_aln($a); } $out->close(); $out = undef; close($tfh); undef $tfh; return $inputfile; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Infernal.pm000066400000000000000000001200111342734133000226060ustar00rootroot00000000000000# # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code # # _history # # March 2007 - first full implementation; needs some file IO tweaking between # runs but works for now # April 2008 - add 0.81 parameters (may be removed in the 1.0 release) # # July 2009 - updated for v1.0. No longer supporting pre-1.0 Infernal =head1 NAME Bio::Tools::Run::Infernal - Wrapper for local execution of cmalign, cmbuild, cmsearch, cmscore =head1 SYNOPSIS # parameters which are switches are set with any value that evals TRUE, # others are set to a specific value my $factory = Bio::Tools::Run::Infernal->new(@params); # run cmalign|cmbuild|cmsearch|cmscore|cmemit directly as a wrapper method # this resets the program flag if previously set $factory->cmsearch(@seqs); # searches Bio::PrimarySeqI's based on set cov. model # saves output to optional outfile_name, returns # Bio::SearchIO # only values which are allowed for a program are set, so one can use the same # wrapper for the following... $factory->cmalign(@seqs); # aligns Bio::PrimarySeqI's to a set cov. model, # --merge option allows two alignments generated # from the same CM to be merged. # output to outfile_name, returns Bio::AlignIO $factory->cmscore(); # scores set cov. model against Bio::PrimarySeqI, # output to outfile_name/STDOUT. $factory->cmbuild($aln); # builds covariance model based on alignment # CM to outfile_name or model_file (one is required # here), output to STDOUT. $factory->cmemit(); # emits sequence from specified cov. model; # set one if no file specified. output to # outfile_name, returns Bio::SeqIO or (if -a is set) # Bio::AlignIO $factory->cmcalibrate($file); # calibrates specified cov. model; output to # STDOUT $factory->cmstat($file); # summary stats for cov. model; set one if no file # specified; output to STDOUT # run based on the setting of the program parameter my $factory = Bio::Tools::Run::Infernal->new(-program => 'cmsearch', @params); my $search = $factory->run($seq); # using cmsearch returns a Bio::SearchIO object while (my $result = $searchio->next_result){ while(my $hit = $result->next_hit){ while (my $hsp = $hit->next_hsp){ print join("\t", ( $r->query_name, $hit->name, $hsp->hit->start, $hsp->hit->end, $hsp->meta, $hsp->score, )), "\n"; } } } =head1 DESCRIPTION Wrapper module for Sean Eddy's Infernal suite of programs. The current implementation runs cmsearch, cmcalibrate, cmalign, cmemit, cmbuild, cmscore, and cmstat. cmsearch will return a Bio::SearchIO, cmemit a Bio::SeqIO/AlignIO, and cmalign a Bio::AlignIO. All others send output to STDOUT. Optionally, any program's output can be redirected to outfile_name. We HIGHLY suggest upgrading to Infernal 1.0. In that spirit, this wrapper now supports parameters for Infernal 1.0 only; for wrapping older versions of Infernal we suggest using the version of Bio::Tools::Run::Infernal that came with previous versions of BioPerl-run. NOTE: Due to conflicts in the way Infernal parameters are now formatted vs. subroutine naming in Perl (specifically the inclusion of hyphens) and due to the very large number of parameters available, setting and resetting parameters via set_parameters() and reset_parameters() is required. All valid parameters can be set, but only ones valid for the executable set via program()/program_name() are used for calling the executables, the others are silently ignored. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Chris Fields Email: cjfields-at-uiuc-dot-edu =head1 CONTRIBUTORS cjfields-at-uiuc-dot-edu =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Infernal; use strict; use warnings; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::ParameterBaseI); use Bio::SeqIO; use Bio::SearchIO; use Bio::AlignIO; use Data::Dumper; # yes, these are the current parameters our %INFERNAL_PARAMS = ( 'A' => ['switch', '-', qw(cmbuild)], 'E' => ['param', '-', qw(cmsearch cmstat)], 'F' => ['switch', '-', qw(cmbuild)], 'Lmax' => ['param', '--', qw(cmscore)], 'Lmin' => ['param', '--', qw(cmscore)], 'T' => ['param', '-', qw(cmsearch cmstat)], 'Wbeta' => ['param', '--', qw(cmbuild)], 'Z' => ['param', '-', qw(cmsearch cmstat)], 'a' => ['switch', '-', qw(cmbuild cmemit cmscore)], 'afile' => ['param', '--', qw(cmstat)], 'ahmm' => ['param', '--', qw(cmemit)], 'all' => ['switch', '--', qw(cmstat)], 'aln-hbanded' => ['switch', '--', qw(cmsearch)], 'aln-optacc' => ['switch', '--', qw(cmsearch)], 'aln2bands' => ['switch', '--', qw(cmscore cmsearch)], 'banddump' => ['param', '--', qw(cmalign)], 'begin' => ['param', '--', qw(cmemit)], 'beta' => ['param', '--', qw(cmalign cmscore cmsearch cmstat)], 'betae' => ['param', '--', qw(cmscore)], 'betas' => ['param', '--', qw(cmscore)], 'bfile' => ['param', '--', qw(cmstat)], 'binary' => ['switch', '--', qw(cmbuild)], 'bits' => ['switch', '--', qw(cmstat)], 'bottomonly' => ['switch', '--', qw(cmsearch)], 'c' => ['switch', '-', qw(cmemit)], 'call' => ['switch', '--', qw(cmbuild)], 'cdump' => ['param', '--', qw(cmbuild)], 'cfile' => ['param', '--', qw(cmbuild)], 'checkfb' => ['switch', '--', qw(cmalign)], 'checkpost' => ['switch', '--', qw(cmalign)], 'cmL' => ['param', '--', qw(cmstat)], 'cmaxid' => ['param', '--', qw(cmbuild)], 'cmtbl' => ['param', '--', qw(cmbuild)], 'corig' => ['switch', '--', qw(cmbuild)], 'ctarget' => ['param', '--', qw(cmbuild)], 'cyk' => ['switch', '--', qw(cmalign cmbuild cmsearch)], 'devhelp' => ['switch', '--', qw(cmalign cmbuild cmcalibrate cmemit cmscore cmsearch)], 'dlev' => ['param', '--', qw(cmalign)], 'dna' => ['switch', '--', qw(cmalign cmemit cmsearch)], 'eX' => ['param', '--', qw(cmbuild)], 'eent' => ['switch', '--', qw(cmbuild)], 'efile' => ['param', '--', qw(cmstat)], 'ehmmre' => ['param', '--', qw(cmbuild)], 'elself' => ['param', '--', qw(cmbuild)], 'emap' => ['param', '--', qw(cmbuild)], 'emit' => ['switch', '--', qw(cmscore)], 'end' => ['param', '--', qw(cmemit)], 'enone' => ['switch', '--', qw(cmbuild)], 'ere' => ['param', '--', qw(cmbuild)], 'exp' => ['param', '--', qw(cmemit)], 'exp-T' => ['param', '--', qw(cmcalibrate)], 'exp-beta' => ['param', '--', qw(cmcalibrate)], 'exp-cmL-glc' => ['param', '--', qw(cmcalibrate)], 'exp-cmL-loc' => ['param', '--', qw(cmcalibrate)], 'exp-ffile' => ['param', '--', qw(cmcalibrate)], 'exp-fract' => ['param', '--', qw(cmcalibrate)], 'exp-gc' => ['param', '--', qw(cmcalibrate)], 'exp-hfile' => ['param', '--', qw(cmcalibrate)], 'exp-hmmLn-glc' => ['param', '--', qw(cmcalibrate)], 'exp-hmmLn-loc' => ['param', '--', qw(cmcalibrate)], 'exp-hmmLx' => ['param', '--', qw(cmcalibrate)], 'exp-no-qdb' => ['switch', '--', qw(cmcalibrate)], 'exp-pfile' => ['param', '--', qw(cmcalibrate)], 'exp-qqfile' => ['param', '--', qw(cmcalibrate)], 'exp-random' => ['switch', '--', qw(cmcalibrate)], 'exp-sfile' => ['param', '--', qw(cmcalibrate)], 'exp-tailn-cglc' => ['param', '--', qw(cmcalibrate)], 'exp-tailn-cloc' => ['param', '--', qw(cmcalibrate)], 'exp-tailn-hglc' => ['param', '--', qw(cmcalibrate)], 'exp-tailn-hloc' => ['param', '--', qw(cmcalibrate)], 'exp-tailp' => ['param', '--', qw(cmcalibrate)], 'exp-tailxn' => ['param', '--', qw(cmcalibrate)], 'fil-E-hmm' => ['param', '--', qw(cmsearch)], 'fil-E-qdb' => ['param', '--', qw(cmsearch)], 'fil-F' => ['param', '--', qw(cmcalibrate)], 'fil-N' => ['param', '--', qw(cmcalibrate)], 'fil-Smax-hmm' => ['param', '--', qw(cmsearch)], 'fil-T-hmm' => ['param', '--', qw(cmsearch)], 'fil-T-qdb' => ['param', '--', qw(cmsearch)], 'fil-aln2bands' => ['switch', '--', qw(cmcalibrate)], 'fil-beta' => ['param', '--', qw(cmsearch)], 'fil-dfile' => ['param', '--', qw(cmcalibrate)], 'fil-gemit' => ['switch', '--', qw(cmcalibrate)], 'fil-no-hmm' => ['switch', '--', qw(cmsearch)], 'fil-no-qdb' => ['switch', '--', qw(cmsearch)], 'fil-nonbanded' => ['switch', '--', qw(cmcalibrate)], 'fil-tau' => ['param', '--', qw(cmcalibrate)], 'fil-xhmm' => ['param', '--', qw(cmcalibrate)], 'fins' => ['switch', '--', qw(cmalign cmbuild)], 'forecast' => ['param', '--', qw(cmcalibrate cmsearch)], 'forward' => ['switch', '--', qw(cmscore cmsearch)], 'g' => ['switch', '-', qw(cmsearch cmstat)], 'ga' => ['switch', '--', qw(cmsearch cmstat)], 'gapthresh' => ['param', '--', qw(cmalign cmbuild)], 'gcfile' => ['param', '--', qw(cmsearch)], 'ge' => ['switch', '--', qw(cmstat)], 'gfc' => ['switch', '--', qw(cmstat)], 'gfi' => ['switch', '--', qw(cmstat)], 'gibbs' => ['switch', '--', qw(cmbuild)], 'gtbl' => ['param', '--', qw(cmbuild)], 'gtree' => ['param', '--', qw(cmbuild)], 'h' => ['switch', '-', qw(cmalign cmbuild cmcalibrate cmemit cmscore cmsearch cmstat)], 'hbanded' => ['switch', '--', qw(cmalign cmscore cmsearch)], 'hmm-W' => ['param', '--', qw(cmsearch)], 'hmm-cW' => ['param', '--', qw(cmsearch)], 'hmmL' => ['param', '--', qw(cmstat)], 'hsafe' => ['switch', '--', qw(cmalign cmscore)], 'ignorant' => ['switch', '--', qw(cmbuild)], 'iins' => ['switch', '--', qw(cmbuild)], 'infile' => ['param', '--', qw(cmscore)], 'informat' => ['param', '--', qw(cmalign cmbuild cmsearch)], 'inside' => ['switch', '--', qw(cmalign cmscore cmsearch)], 'l' => ['switch', '-', qw(cmalign cmbuild cmemit cmscore)], 'lambda' => ['param', '--', qw(cmsearch)], 'le' => ['switch', '--', qw(cmstat)], 'lfc' => ['switch', '--', qw(cmstat)], 'lfi' => ['switch', '--', qw(cmstat)], 'm' => ['switch', '-', qw(cmstat)], 'matchonly' => ['switch', '--', qw(cmalign)], 'merge' => ['switch', '--', qw(cmalign)], 'mpi' => ['switch', '--', qw(cmalign cmcalibrate cmscore cmsearch)], 'mxsize' => ['param', '--', qw(cmalign cmbuild cmcalibrate cmscore cmsearch)], 'n' => ['param', '-', qw(cmbuild cmemit cmscore)], 'nc' => ['switch', '--', qw(cmsearch cmstat)], 'no-null3' => ['switch', '--', qw(cmalign cmcalibrate cmscore cmsearch)], 'no-qdb' => ['switch', '--', qw(cmsearch)], 'noalign' => ['switch', '--', qw(cmsearch)], 'nobalance' => ['switch', '--', qw(cmbuild)], 'nodetach' => ['switch', '--', qw(cmbuild)], 'nonbanded' => ['switch', '--', qw(cmalign cmbuild cmscore)], 'null' => ['param', '--', qw(cmbuild)], 'null2' => ['switch', '--', qw(cmsearch)], 'o' => ['param', '-', qw(cmalign cmsearch)], 'old' => ['switch', '--', qw(cmscore)], 'onepost' => ['switch', '--', qw(cmalign)], 'optacc' => ['switch', '--', qw(cmalign)], 'outfile' => ['param', '--', qw(cmscore)], 'p' => ['switch', '-', qw(cmalign cmsearch)], 'pad' => ['switch', '--', qw(cmscore)], 'pbegin' => ['param', '--', qw(cmalign cmcalibrate cmemit cmscore cmsearch)], 'pbswitch' => ['param', '--', qw(cmbuild)], 'pebegin' => ['switch', '--', qw(cmalign cmcalibrate cmemit cmscore cmsearch)], 'pend' => ['param', '--', qw(cmalign cmcalibrate cmemit cmscore cmsearch)], 'pfend' => ['param', '--', qw(cmalign cmcalibrate cmemit cmscore cmsearch)], 'prior' => ['param', '--', qw(cmbuild)], 'q' => ['switch', '-', qw(cmalign)], 'qdb' => ['switch', '--', qw(cmalign cmscore)], 'qdbboth' => ['switch', '--', qw(cmscore)], 'qdbfile' => ['param', '--', qw(cmstat)], 'qdbsmall' => ['switch', '--', qw(cmscore)], 'random' => ['switch', '--', qw(cmscore)], 'rdump' => ['param', '--', qw(cmbuild)], 'refine' => ['param', '--', qw(cmbuild)], 'regress' => ['param', '--', qw(cmalign cmbuild cmscore)], 'resonly' => ['switch', '--', qw(cmalign)], 'rf' => ['switch', '--', qw(cmalign cmbuild)], 'rna' => ['switch', '--', qw(cmalign cmemit cmsearch)], 'rsearch' => ['param', '--', qw(cmbuild)], 'rtrans' => ['switch', '--', qw(cmsearch)], 's' => ['param', '-', qw(cmalign cmbuild cmcalibrate cmemit cmscore)], 'sample' => ['switch', '--', qw(cmalign)], 'scoreonly' => ['switch', '--', qw(cmscore)], 'search' => ['switch', '--', qw(cmscore cmstat)], 'seqfile' => ['param', '--', qw(cmstat)], 'sfile' => ['param', '--', qw(cmstat)], 'shmm' => ['param', '--', qw(cmemit)], 'small' => ['switch', '--', qw(cmalign)], 'stall' => ['switch', '--', qw(cmalign cmscore cmsearch)], 'sub' => ['switch', '--', qw(cmalign cmbuild cmscore)], 'sums' => ['switch', '--', qw(cmalign cmsearch)], 'tabfile' => ['param', '--', qw(cmsearch)], 'tau' => ['param', '--', qw(cmalign cmbuild cmscore cmsearch)], 'taue' => ['param', '--', qw(cmscore)], 'taus' => ['param', '--', qw(cmscore)], 'tc' => ['switch', '--', qw(cmsearch cmstat)], 'tfile' => ['param', '--', qw(cmalign cmbuild cmemit cmscore)], 'toponly' => ['switch', '--', qw(cmsearch cmstat)], 'u' => ['switch', '-', qw(cmemit)], 'v' => ['switch', '-', qw(cmbuild cmcalibrate)], 'viterbi' => ['switch', '--', qw(cmalign cmscore cmsearch)], 'wblosum' => ['switch', '--', qw(cmbuild)], 'wgiven' => ['switch', '--', qw(cmbuild)], 'wgsc' => ['switch', '--', qw(cmbuild)], 'wid' => ['param', '--', qw(cmbuild)], 'withali' => ['param', '--', qw(cmalign)], 'withpknots' => ['switch', '--', qw(cmalign)], 'wnone' => ['switch', '--', qw(cmbuild)], 'wpb' => ['switch', '--', qw(cmbuild)], 'x' => ['switch', '-', qw(cmsearch)], 'xfile' => ['param', '--', qw(cmstat)], ); our %INFERNAL_PROGRAM = ( 'cmalign' => "cmalign [-options] \n". 'cmalign [-options] --merge ', 'cmbuild' => 'cmbuild [-options] ', 'cmcalibrate' => 'cmcalibrate [-options] ', 'cmemit' => 'cmemit [-options] ', 'cmscore' => 'cmscore [-options] ', 'cmsearch' => 'cmsearch [-options] ', 'cmstat' => 'cmstat [-options] ', ); # this is a simple lookup for easy validation for passed methods our %LOCAL_PARAMS = map {$_ => 1} qw(program outfile tempfile model); =head2 new Title : new Usage : my $wrapper = Bio::Tools::Run::Infernal->new(@params) Function: creates a new Infernal factory Returns: Bio::Tools::Run::Infernal wrapper Args : list of parameters =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); # these are specific parameters we do not want passed on to set_parameters my ($program, $model, $validate, $q, $o1, $o2) = $self->_rearrange([qw(PROGRAM MODEL_FILE VALIDATE_PARAMETERS QUIET OUTFILE_NAME O)], @args); if ($o1 && $o2) { $self->warn("Only assign to either -outfile_name or -o, not both;"); } my $out = $o1 || $o2; $self->validate_parameters($validate); $q && $self->quiet($q); $program && $self->program($program); $model && $self->model_file($model); $out ||= ''; $self->outfile_name($out); $self->io->_initialize_io(); $self->set_parameters(@args); return $self; } =head2 program Title : program Usage : $obj->program() Function: Set the program called when run() is used. Synonym of program_name() Returns : String (program name) Args : String (program name) Status : Unstable (may delegate to program_name, which is the interface method) =cut sub program { my $self = shift; return $self->program_name(@_); } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { my ($self) = shift; if (@_) { my $p = shift; $self->throw("Program '$p' not supported") if !exists $INFERNAL_PROGRAM{lc $p}; $self->{'_program'} = lc $p; # set up cache of valid parameters while (my ($p, $data) = each %INFERNAL_PARAMS) { my %in_exe = map {$_ => 1} @$data[2..$#{$data}]; $self->{valid_params}->{$p} = 1 if exists $in_exe{$self->{'_program'}}; } } return $self->{'_program'}; } =head2 model_file Title : model_file Usage : $obj->model_file() Function: Set the model file used when run() is called. Returns : String (file location of covariance model) Args : String (file location of covariance model) =cut sub model_file { my $self = shift; return $self->{'_model_file'} = shift if @_; return $self->{'_model_file'}; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { my ($self, $dir) = @_; if ($dir) { $self->{_program_dir} = $dir; } return Bio::Root::IO->catfile($ENV{INFERNALDIR}) || ''; } =head2 version Title : version Usage : $v = $prog->version(); Function: Determine the version number of the program (uses cmsearch) Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return unless $self->executable; my $exe = $self->executable; my $string = `$exe -h 2>&1`; my $v; if ($string =~ m{Infernal\s([\d.]+)}) { $v = $1; $self->deprecated(-message => "Only Infernal 1.0 and above is supported.", -version => 1.006001) if $v < 1; } return $self->{'_progversion'} = $v || undef; } =head2 run Title : run Usage : $obj->run($seqFile) Function: Runs Infernal and returns Bio::SearchIO Returns : A Bio::SearchIO Args : A Bio::PrimarySeqI or file name =cut # TODO: update to accept multiple seqs, alignments sub run { my ($self,@seq) = @_; if (ref $seq[0] && $seq[0]->isa("Bio::PrimarySeqI") ){# it is an object my $infile1 = $self->_writeSeqFile(@seq); return $self->_run($infile1); } elsif (ref $seq[0] && $seq[0]->isa("Bio::Align::AlignI") ){ # it is an object my $infile1 = $self->_writeAlignFile(@seq); return $self->_run($infile1); } else { return $self->_run(@seq); } } =head1 Specific program interface methods =head2 cmsearch Title : cmsearch Usage : $obj->cmsearch($seqFile) Function: Runs Infernal cmsearch and returns Bio::SearchIO Returns : A Bio::SearchIO Args : Bio::PrimarySeqI or file name =cut sub cmsearch { my ($self,@seq) = @_; $self->program('cmsearch'); if (ref $seq[0] && $seq[0]->isa("Bio::PrimarySeqI") ){# it is an object my $infile1 = $self->_writeSeqFile(@seq); return $self->_run(-seq_files => [$infile1]); } else { return $self->_run(-seq_files => \@seq); } } =head2 cmalign Title : cmalign Usage : $obj->cmalign($seqFile) Function: Runs Infernal cmalign and returns Bio::AlignIO Returns : A Bio::AlignIO Args : Bio::PrimarySeqI or file name =cut sub cmalign { my ($self,@seq) = @_; $self->program('cmalign'); if (ref $seq[0]) { # it is an object if ($seq[0]->isa("Bio::PrimarySeqI") ){ my $infile1 = $self->_writeSeqFile(@seq); return $self->_run(-seq_files => [$infile1]); } elsif ( $seq[0]->isa("Bio::Align::AlignI") ) { if (scalar(@seq) != 2) { $self->throw("") } my $infile1 = $self->_writeAlignFile($seq[0]); my $infile2 = $self->_writeAlignFile($seq[1]); return $self->_run(-align_files => [$infile1, $infile2]); } } else { # we can maybe add a check for the file extension and try to DTRT my %params = $self->get_parameters('valid'); $params{merge} ? return $self->_run(-align_files => \@seq): return $self->_run(-seq_files => \@seq); return $self->_run(-seq_files => \@seq); } } =head2 cmemit Title : cmemit Usage : $obj->cmemit($modelfile) Function: Runs Infernal cmemit and returns Bio::AlignIO Returns : A Bio::AlignIO Args : None; set model_file() to use a specific model =cut sub cmemit { my ($self) = shift; $self->program('cmemit'); return $self->_run(@_); } =head2 cmbuild Title : cmbuild Usage : $obj->cmbuild($alignment) Function: Runs Infernal cmbuild and saves covariance model Returns : 1 on success (no object for covariance models) Args : Bio::AlignIO with structural information (such as from Stockholm format source) or alignment file name =cut sub cmbuild { my ($self,@seq) = @_; $self->program('cmbuild'); if (ref $seq[0] && $seq[0]->isa("Bio::Align::AlignI") ){# it is an object my $infile1 = $self->_writeAlignFile(@seq); return $self->_run(-align_files => [$infile1]); } else { return $self->_run(-align_files => \@seq); } } =head2 cmscore Title : cmscore Usage : $obj->cmscore($seq) Function: Runs Infernal cmscore and saves output Returns : None Args : None; set model_file() to use a specific model =cut sub cmscore { my ($self,@seq) = @_; $self->program('cmscore'); return $self->_run(); } =head2 cmcalibrate Title : cmcalibrate Usage : $obj->cmcalibrate('file') Function: Runs Infernal calibrate on specified CM Returns : None Args : None; set model_file() to use a specific model =cut sub cmcalibrate { my ($self,@seq) = @_; $self->program('cmcalibrate'); return $self->_run(); } =head2 cmstat Title : cmstat Usage : $obj->cmstat($seq) Function: Runs Infernal cmstat and saves output Returns : None Args : None; set model_file() to use a specific model =cut sub cmstat { my ($self,@seq) = @_; $self->program('cmstat'); return $self->_run(); } =head1 Bio::ParameterBaseI-specific methods These methods are part of the Bio::ParameterBaseI interface =cut =head2 set_parameters Title : set_parameters Usage : $pobj->set_parameters(%params); Function: sets the parameters listed in the hash or array Returns : None Args : [optional] hash or array of parameter/values. These can optionally be hash or array references Note : This only sets parameters; to set methods use the method name =cut sub set_parameters { my $self = shift; # circumvent any issues arising from passing in refs my %args = (ref($_[0]) eq 'HASH') ? %{$_[0]} : (ref($_[0]) eq 'ARRAY') ? @{$_[0]} : @_; # set the parameters passed in, but only ones supported for the program my ($prog, $validate) = ($self->program, $self->validate_parameters); # parameter cleanup %args = map { my $a = $_; $a =~ s{^-}{}; lc $a => $args{$_} } sort keys %args; while (my ($key, $val) = each %args) { if (exists $INFERNAL_PARAMS{$key}) { my ($type, $prefix) = @{$INFERNAL_PARAMS{$key}}[0..1]; @{$self->{parameters}->{$key}} = ($type, $prefix); unshift @{$self->{parameters}->{$key}}, $type eq 'param' ? $val : $type eq 'switch' && $val ? 1 : 0; if ($validate) { my %in_exe = map {$_ => 1} @{$INFERNAL_PARAMS{$key}}[2..$#{$INFERNAL_PARAMS{$key}}]; $self->warn("Parameter $key not used for $prog") if !exists $in_exe{$key}; } } else { $self->warn("Parameter $key does not exist") if ($validate); } } } =head2 reset_parameters Title : reset_parameters Usage : resets values Function: resets parameters to either undef or value in passed hash Returns : none Args : [optional] hash of parameter-value pairs =cut sub reset_parameters { my $self = shift; delete $self->{parameters}; if (@_) { $self->set_parameters(@_); } } =head2 validate_parameters Title : validate_parameters Usage : $pobj->validate_parameters(1); Function: sets a flag indicating whether to validate parameters via set_parameters() or reset_parameters() Returns : Bool Args : [optional] value evaluating to True/False Note : Optionally implemented method; up to the implementation on whether to automatically validate parameters or optionally do so =cut sub validate_parameters { my ($self) = shift; if (@_) { $self->{validate_params} = defined $_[0] ? 1 : 0; } return $self->{validate_params}; } =head2 parameters_changed Title : parameters_changed Usage : if ($pobj->parameters_changed) {...} Function: Returns boolean true (1) if parameters have changed Returns : Boolean (0 or 1) Args : None Note : This module does not run state checks, so this always returns True =cut sub parameters_changed { 1 } =head2 available_parameters Title : available_parameters Usage : @params = $pobj->available_parameters() Function: Returns a list of the available parameters Returns : Array of parameters Args : [optional] name of executable being used; defaults to returning all available parameters =cut sub available_parameters { my ($self, $exec) = @_; my @params; if ($exec) { $self->throw("$exec is not part of the Infernal package") if !exists($INFERNAL_PROGRAM{$exec}); for my $p (sort keys %INFERNAL_PARAMS) { if (grep { $exec eq $_ } @{$INFERNAL_PARAMS{$p}}[2..$#{$INFERNAL_PARAMS{$p}}]) { push @params, $p; } } } else { @params = (sort keys %INFERNAL_PARAMS, sort keys %LOCAL_PARAMS); } return @params; } =head2 get_parameters Title : get_parameters Usage : %params = $pobj->get_parameters; Function: Returns list of set key-value pairs, parameter => value Returns : List of key-value pairs Args : [optional] 'full' - this option returns everything associated with the parameter as an array ref value; that is, not just the value but also the value, type, and prefix. Default is value only. 'valid'- same a 'full', but only returns the grouping valid for the currently set executable =cut sub get_parameters { my ($self, $option) = @_; $option ||= ''; # no option my %params; if (exists $self->{parameters}) { %params = (ref $option eq 'ARRAY') ? ( map {$_ => $self->{parameters}{$_}[0]} grep { exists $self->{parameters}{$_} } @$option) : (lc $option eq 'full') ? (%{$self->{parameters}}) : (lc $option eq 'valid') ? (map {$_ => $self->{parameters}{$_}} grep { exists $self->{valid_params}->{$_} } keys %{$self->{parameters}}) : (map {$_ => $self->{parameters}{$_}[0]} keys %{$self->{parameters}}); } else { %params = (); } return %params; } =head1 to_* methods All to_* methods are implementation-specific =cut =head2 to_exe_string Title : to_exe_string Usage : $string = $pobj->to_exe_string; Function: Returns string (command line string in this case) Returns : String Args : =cut sub to_exe_string { my ($self, @passed) = @_; my ($seqs, $aligns) = $self->_rearrange([qw(SEQ_FILES ALIGN_FILES)], @passed); if ($seqs || $aligns) { $self->throw("Seqs or alignments must be an array reference") unless ($seqs && ref($seqs) eq 'ARRAY') || ($aligns && ref($aligns) eq 'ARRAY' ); } my %args = map {$_ => []} qw(switch param input redirect); my %params = $self->get_parameters('valid'); my ($exe, $prog, $model, $outfile) = ($self->executable, $self->program_name, $self->model_file, $self->outfile_name); $self->throw("Executable not found") unless defined($exe); delete $params{o} if exists $params{o}; if (!defined($model) && $prog ne 'cmbuild') { $self->throw("model_file() not defined") } $outfile ||= ''; for my $p (sort keys %params) { if ($params{$p}[0]) { my $val = $params{$p}[1] eq 'param' ? ' '.$params{$p}[0] : ''; push @{$args{$params{$p}[1]}}, $params{$p}[2].$p.$val; } } # TODO: not sure what happens when we pass in multiple seq or alignment # filenames, may need checking if ($prog eq 'cmscore' || $prog eq 'cmstat' || $prog eq 'cmcalibrate') { push @{$args{'redirect'}}, "> $outfile" if $outfile; push @{$args{'input'}}, $model; } elsif ($prog eq 'cmsearch') { if (!defined $seqs) { $self->throw('cmsearch requires a sequence file name'); } push @{$args{'param'}}, "-o $outfile" if $outfile; push @{$args{'input'}}, ($model, @$seqs); } elsif ($prog eq 'cmalign') { if ($params{'merge'}) { $self->throw('cmalign with --merge option requires two alignment files') if !defined($aligns) || @$aligns < 2; push @{$args{'input'}}, ($model, @$aligns); } else { $self->throw('cmalign requires a sequence file') if !defined $seqs; push @{$args{'input'}}, ($model, @$seqs); } push @{$args{'param'}}, "-o $outfile" if $outfile; } elsif ($prog eq 'cmbuild') { $self->throw('cmbuild requires one alignment file') if !defined($aligns); if ($model) { push @{$args{'input'}}, ($model, @$aligns); push @{$args{'redirect'}}, "> $outfile" if $outfile; } else { push @{$args{'input'}}, ($outfile, @$aligns); } } elsif ($prog eq 'cmemit') { if (!$outfile) { $self->throw('cmemit requires an outfile_name; tempfile support not implemented yet'); } else { push @{$args{'input'}}, ($model, ,$outfile); } } # quiet! if ($self->quiet && $prog ne 'cmsearch') { if ($prog eq 'cmalign') { push @{$args{switch}}, '-q' if !exists $params{q}; } else { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; push @{$args{redirect}}, "> $null"; } } my $string = "$exe ".join(' ',(@{$args{switch}}, @{$args{param}}, @{$args{input}}, @{$args{redirect}})); $string; } ############### PRIVATE ############### #=head2 _run # # Title : _run # Usage : $obj->_run() # Function: Internal(not to be used directly) # Returns : # Args : # #=cut { my %ALLOWED = map {$_ => 1} qw(run cmsearch cmalign cmemit cmbuild cmcalibrate cmstat cmscore); sub _run { my ($self)= shift; my ($prog, $model, $out, $version) = ($self->program, $self->model_file, $self->outfile_name, $self->version); if (my $caller = (caller(1))[3]) { $caller =~ s{.*::(\w+)$}{$1}; $self->throw("Calling _run() from disallowed method") unless exists $ALLOWED{$caller}; } else { $self->throw("Can't call _run directly"); } # a model and a file must be defined for all but cmemit; cmemit must have a # file or model defined (using $file if both are defined) # relevant files are passed on to the string builder my $str = $self->to_exe_string(@_); $self->debug("Infernal command: $str\n"); my %has = $self->get_parameters('valid'); my $obj = ($prog eq 'cmsearch') ? Bio::SearchIO->new(-format => 'infernal', -version => $version, -model => $model) : ($prog eq 'cmalign' ) ? Bio::AlignIO->new(-format => 'stockholm') : ($prog eq 'cmemit' && $has{a}) ? Bio::AlignIO->new(-format => 'stockholm') : ($prog eq 'cmemit') ? Bio::SeqIO->new(-format => 'fasta') : undef; my @args; # file output if ($out) { my $status = system($str); if($status || !-e $out || -z $out ) { my $error = ($!) ? "$! Status: $status" : "Status: $status"; $self->throw( "Infernal call crashed: $error \n[command $str]\n"); return undef; } if ($obj && ref($obj)) { $obj->file($out); @args = (-file => $out); } # fh-based (no outfile) } else { open(my $fh,"$str |") || $self->throw("Infernal call ($str) crashed: $?\n"); if ($obj && ref($obj)) { $obj->fh($fh); @args = (-fh => $fh); } else { # dump to debugging my $io; while(<$fh>) {$io .= $_;} close($fh); $self->debug($io) if $io; return 1; } } $obj->_initialize_io(@args) if $obj && ref($obj); return $obj || 1; } } =head2 _writeSeqFile Title : _writeSeqFile Usage : obj->_writeSeqFile($seq) Function: Internal(not to be used directly) Returns : Args : =cut sub _writeSeqFile { my ($self,@seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'Fasta'); foreach my $s(@seq){ $in->write_seq($s); } $in->close(); $in = undef; close($tfh); undef $tfh; return $inputfile; } =head2 _writeAlignFile Title : _writeAlignFile Usage : obj->_writeAlignFile($seq) Function: Internal(not to be used directly) Returns : Args : =cut sub _writeAlignFile{ my ($self,@align) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir); my $in = Bio::AlignIO->new('-fh' => $tfh , '-format' => 'stockholm'); foreach my $s(@align){ $in->write_aln($s); } $in->close(); $in = undef; close($tfh); undef $tfh; return $inputfile; } # this is a private sub used to regenerate the class data structures, # dumped to STDOUT # could probably add in a description field if needed... sub _dump_params { my %params; my %usage; for my $exec (qw(cmalign cmbuild cmcalibrate cmemit cmscore cmsearch cmstat)) { my $output = `$exec --devhelp`; if ($?) { $output = `$exec -h`; } my @lines = split("\n",$output); for my $line (@lines) { next if $line =~ /^#/; if ($line =~ /^\s*(-{1,2})(\S+)\s+(<\S+>)?/) { my %data; ($data{prefix}, my $p, $data{arg}) = ($1, $2, $3 ? 'param' : 'switch'); if (exists $params{$p}) { if ($data{prefix} ne $params{$p}{prefix}) { warn("$data{prefix} for $p in $exec doesn't match prefix for same parameter in ".$params{$p}{exec}[-1].":".$params{$p}{prefix}); } if ($data{arg} ne $params{$p}{arg}) { warn("$data{arg} for $p in $exec doesn't match arg for same parameter in ".$params{$p}{exec}[-1].":".$params{$p}{arg}); } } while (my ($key, $val) = each %data) { $params{$p}->{$key} = $val; } push @{$params{$p}->{exec}}, $exec; } elsif ($line =~ /Usage:\s*(.+)$/) { push @{$usage{$exec}}, $1; } else { #print "$line\n"; } } } # generate data structure print "our %INFERNAL_PARAMS = (\n"; for my $k (sort keys %params) { printf(" %-17s => [","'$k'"); for my $sub (qw(arg prefix exec)) { my $str = (ref($params{$k}{$sub}) eq 'ARRAY') ? "qw(".join(' ', @{$params{$k}{$sub}}).")" : "'".$params{$k}{$sub}."',"; printf("%-10s", $str); } print "],\n"; } print ");\n\n"; # generate usage data structure print "our %INFERNAL_PROGRAM = (\n"; for my $k (sort keys %usage) { printf(" %-17s => [\n","'$k'"); print ' '.join(",\n ", map {"'$_'"} @{$usage{$k}})."\n"; print " ],\n"; } print ");\n"; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/MCS.pm000077500000000000000000000230601342734133000215030ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::MCS # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::MCS - Wrapper for MCS =head1 SYNOPSIS use Bio::Tools::Run::MCS; # Make a MCS factory $factory = Bio::Tools::Run::MCS->new(); # Run MCS on an alignment my @results = $factory->run($alignfilename); # or with alignment object @results = $factory->run($bio_simplalign); # look at the results foreach my $feat (@results) { my $seq_id = $feat->seq_id; my $start = $feat->start; my $end = $feat->end; my $score = $feat->score; my ($pvalue) = $feat->get_tag_values('pvalue'); my ($kind) = $feat->get_tag_values('kind'); # 'all', 'exon' or 'nonexon' } =head1 DESCRIPTION This is a wrapper for running the MCS (binCons) scripts by Elliott H Margulies. You can get details here: http://zoo.nhgri.nih.gov/elliott/mcs_doc/. MCS is used for the prediciton of transcription factor binding sites and other regions of the genome conserved amongst different species. Note that this wrapper assumes you already have alignments, so only uses MCS for the latter stages (the stages involving align2binomial.pl, generate_phyloMAX_score.pl and generate_mcs_beta.pl). You can try supplying normal MCS command-line arguments to new(), eg. $factory->new(-percentile => 95) or calling arg-named methods (excluding the initial hyphens, eg. $factory->percentile(95) to set the --percentile arg). You will need to enable this MCS wrapper to find the MCS scripts. This can be done in (at least) three ways: 1. Make sure the MCS scripts are in your path. 2. Define an environmental variable MCSDIR which is a directory which contains the MCS scripts: In bash: export MCSDIR=/home/username/mcs/ In csh/tcsh: setenv MCSDIR /home/username/mcs 3. Include a definition of an environmental variable MCSDIR in every script that will use this MCS wrapper module, e.g.: BEGIN { $ENV{MCSDIR} = '/home/username/mcs/' } use Bio::Tools::Run::MCS; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::MCS; use strict; use Cwd; use File::Spec; use Bio::AlignIO; use Bio::FeatureIO; use Bio::Annotation::SimpleValue; use base qw(Bio::Tools::Run::Phylo::PhyloBase); our $PROGRAM_NAME = 'align2binomial.pl'; our $PROGRAM_DIR; # methods for the mcs args we support our @PARAMS = qw(neutral percentile mcs specificity sensitivity name); our @SWITCHES = qw(neg-score); # just to be explicit, args we don't support (yet) or we handle ourselves our @UNSUPPORTED = qw(ucsc gtf neutral-only fourd-align align-only ar); BEGIN { # lets add all the mcs scripts to the path so that when we call # align2binomial.pl it can find its siblings $PROGRAM_DIR = $ENV{'MCSDIR'}; $ENV{PATH} = "$PROGRAM_DIR:$ENV{PATH}" if $PROGRAM_DIR; } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 new Title : new Usage : $factory = Bio::Tools::Run::MCS->new() Function: creates a new MCS factory Returns : Bio::Tools::Run::MCS Args : Many options understood by MCS can be supplied as key => value pairs. These options can NOT be used with this wrapper: ucsc gtf neutral-only fourd-align align-only ar =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => [@PARAMS, @SWITCHES, 'quiet'], -create => 1); return $self; } =head2 run Title : run Usage : $result = $factory->run($align_file_or_object, Bio::Location::Atomic, [Bio::SeqFeatureI]); Function: Runs the MCS scripts on an alignment. Returns : list of Bio::SeqFeatureI feature objects (with coordinates corrected according to the supplied offset, if any) Args : The first argument represents an alignment, the optional second argument represents the chromosome, stand and end and the optional third argument represents annotation of the exons in the alignment. The alignment can be provided as a multi-fasta format alignment filename, or a Bio::Align::AlignI compliant object (eg. a Bio::SimpleAlign). The position in the genome can be provided as a Bio::Location::Atomic with start, end and seq_id set. The annnotation can be provided as an array of Bio::SeqFeatureI objects. =cut sub run { my ($self, $aln, $offset, $exon_feats) = @_; $self->_alignment($aln || $self->alignment || $self->throw("An alignment must be supplied")); return $self->_run($offset, $exon_feats); } sub _run { my ($self, $atomic, $exon_feats) = @_; my $exe = $self->executable || return; # cd to a temp dir my $temp_dir = $self->tempdir; my $cwd = Cwd->cwd(); chdir($temp_dir) || $self->throw("Couldn't change to temp dir '$temp_dir'"); my $offset = ''; my $start_adjust = 0; if ($atomic) { $start_adjust = $atomic->start; $offset = '--ucsc '.$atomic->seq_id.':'.$start_adjust.'-'.$atomic->end; $start_adjust--; } my $gtf_file = 'exons.gtf'; if ($exon_feats) { my $fout = Bio::FeatureIO->new(-file => ">$gtf_file", -format => 'gtf'); foreach my $feat (@{$exon_feats}) { $fout->write_feature($feat); } } my $gtf = $exon_feats ? "--gtf $gtf_file" : ''; # step '2' (http://zoo.nhgri.nih.gov/elliott/mcs_doc/node1.html) of MCS: # run align2binomial.pl to calculate individual species binomial scores my $aln_file = $self->_write_alignment; my $error_file = 'stderr'; my $binomial_file = 'align_name.binomial'; my $cmd = "align2binomial.pl $offset $gtf $aln_file > $binomial_file 2> $error_file"; #system("rm -fr $cwd/mcs_dir; cp -R $temp_dir $cwd/mcs_dir"); my $throw = system($cmd); open(my $efh, "<", $error_file) || $self->throw("Could not open error file '$error_file'"); my $error; while (<$efh>) { $error .= $_; $throw = 1 if /not divisible by 3/; } close($efh); $self->throw($error) if $throw; # step '3': run generate_phyloMAX_score.pl to combine the individual # binomial scores and generate the final Multi-species Conservation Score my $phylo_file = 'align_name.phylo'; system("generate_phyloMAX_score.pl $binomial_file > $phylo_file") && $self->throw("generate_phyloMAX_score.pl call failed: $?, $!"); # step '4': Generate MCSs from the conservation score using # generate_mcs_beta.pl my $mcs_file = 'mcs_result.stdout'; my $bed_file = 'align_name.bed'; # hardcoded in generate_mcs_beta.pl system("generate_mcs_beta.pl $offset $gtf $phylo_file > $mcs_file") && $self->throw("generate_mcs_beta.pl failed: $?, $!"); my @feats; my $fin = Bio::FeatureIO->new(-file => $bed_file, -format => 'bed'); my $source = Bio::Annotation::SimpleValue->new(-value => 'MCS'); while (my $feat = $fin->next_feature()) { # convert coords given offset if ($start_adjust) { $feat->start($feat->start + $start_adjust); $feat->end($feat->end + $start_adjust); } $feat->source($source); push(@feats, $feat); } # cd back again chdir($cwd) || $self->throw("Couldn't change back to working directory '$cwd'"); return @feats; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : none =cut sub _setparams { my $self = shift; my $param_string = $self->SUPER::_setparams(-params => \@PARAMS, -switches => \@SWITCHES, -dash => 1); my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 1>$null" if $self->quiet; return $param_string; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Match.pm000066400000000000000000000171751342734133000221240ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Match # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Match - Wrapper for Transfac's match(TM) =head1 SYNOPSIS use Bio::Tools::Run::Match; # Make a Match factory $factory = Bio::Tools::Run::Match->new(-mxlib => '/path/to/matrix.dat'); # Run Match on an sequence object my @results = $factory->run($bio_seq); # look at the results foreach my $feat (@results) { my $seq_id = $feat->seq_id; my $start = $feat->start; my $end = $feat->end; my $score = $feat->score; my ($pvalue) = $feat->get_tag_values('pvalue'); } =head1 DESCRIPTION This is a wrapper for running the match(TM) program supplied with Transfac Pro distributions. You can try supplying normal match command-line arguments to new(), eg. new(-b => 1) or calling arg-named methods (excluding the initial hyphens, eg. $factory->b(1) to set the -b option to true). Histogram output isn't supported. -p is supported by using -mxprf, see the docs of new() for details. You will need to enable this match wrapper to find the match executable. This can be done in (at least) three ways: 1. Make sure match is in your path. 2. Define an environmental variable MATCHDIR which is a directory which contains the match executable: In bash: export MATCHDIR=/home/username/match/ In csh/tcsh: setenv MATCHDIR /home/username/match 3. Include a definition of an environmental variable MATCHDIR in every script that will use this match wrapper module, e.g.: BEGIN { $ENV{MATCHDIR} = '/home/username/match/' } use Bio::Tools::Run::Match; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Match; use strict; use Cwd; use File::Spec; use Bio::SeqIO; use Bio::FeatureIO; use Bio::Annotation::SimpleValue; use Bio::Tools::Match; use base qw(Bio::Tools::Run::WrapperBase); our $PROGRAM_NAME = 'match'; our $PROGRAM_DIR = $ENV{'MATCHDIR'}; # methods for the match args we support our @PARAMS = qw(mxlib mxprf imcut); # these aren't actually match args, but # are methods we use internally our @SWITCHES = qw(b u); # just to be explicit, args we don't support (yet) or we handle ourselves our @UNSUPPORTED = qw(H HH pp ppg pn png pr jkn i p); =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 new Title : new Usage : $factory = Bio::Tools::Run::Match->new() Function: creates a new MCS factory Returns : Bio::Tools::Run::MCS Args : The following args can either be supplied here or set by calling arg-named methods (eg. $factory->imcut(2) ). -mxlib => path to the matrix.dat file containing Transfac matricies -mxprf => path to a profile file | [core_thresh, [matrix_thresh]] (defaults to a standard one based on the mxlib provided if file not supplied, using core_thresh and matrix_thresh values if those are supplied instead) -imcut => floating point number, the importance cutoff -b | -u => boolean, mutually exclusive =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => [@PARAMS, @SWITCHES, 'quiet'], -create => 1); return $self; } =head2 run Title : run Usage : $result = $factory->run($bio_seqi_object); Function: Runs match on a sequence. Returns : list of Bio::SeqFeatureI feature objects Args : Bio::SeqI compliant object NB: mxlib has to have been set prior to calling run(), either as an argument to new() or by calling mxlib(). =cut sub run { my ($self, $seq) = @_; $self->mxlib || $self->throw("mxlib has to have been set first"); return $self->_run($seq); } sub _run { my ($self, $seq) = @_; my $exe = $self->executable || return; my $mxlib = File::Spec->rel2abs($self->mxlib()); my $mxprf_file = $self->mxprf(); if ($mxprf_file && -e $mxprf_file) { $mxprf_file = File::Spec->rel2abs($mxprf_file); } # cd to a temp dir my $temp_dir = $self->tempdir; my $cwd = Cwd->cwd(); chdir($temp_dir) || $self->throw("Couldn't change to temp dir '$temp_dir'"); # make the profile file if necessary if (! $mxprf_file || ! -e $mxprf_file) { my @thresh; if ($mxprf_file && ref($mxprf_file) eq 'ARRAY') { @thresh = @{$mxprf_file}; } $mxprf_file = 'mxprf'; system("$exe $mxlib ignored ignored $mxprf_file -p @thresh") && $self->throw("Something went wrong whist creating profile: $! | $?"); } # output the sequence to a fasta file my $seq_file = 'sequence.fa'; my $so = Bio::SeqIO->new(-file => ">$seq_file", -format => 'fasta'); $so->write_seq($seq); $so->close(); # run match my $result_file = 'out'; my $param_str = $self->_setparams(); my $cmd_line = "$exe $mxlib $seq_file $result_file $mxprf_file".$param_str; system($cmd_line) && $self->throw("Something went wrong whist running '$cmd_line': $! | $?"); # parse the results my $parser = Bio::Tools::Match->new(-file => $result_file); # correct the coords my @feats; while (my $feat = $parser->next_result) { push(@feats, $feat); } # cd back again chdir($cwd) || $self->throw("Couldn't change back to working directory '$cwd'"); return @feats; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : none =cut sub _setparams { my $self = shift; my $param_string = $self->SUPER::_setparams(-switches => \@SWITCHES, -dash => 1); my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 1>$null" if $self->quiet; return $param_string; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Mdust.pm000066400000000000000000000242731342734133000221610ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Mdust # # Please direct questions and support issues to # # Cared for by Donald Jackson, donald.jackson@bms.com # # Copyright Donald Jackson # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Mdust - Perl extension for Mdust nucleotide filtering =head1 SYNOPSIS use Bio::Tools::Run::Mdust; my $mdust = Bio::Tools::Run::Mdust->new(); $mdust->run($bio_seq_object); =head1 DESCRIPTION Perl wrapper for the nucleic acid complexity filtering program B as available from TIGR via L. Takes a Bio::SeqI or Bio::PrimarySeqI object of type DNA as input. If a Bio::Seq::RichSeqI is passed then the low-complexity regions will be added to the feature table of the target object as Bio::SeqFeature::Generic items with primary tag = 'Excluded' . Otherwise a new target object will be returned with low-complexity regions masked (by N's or other character as specified by maskchar()). The mdust executable must be in a directory specified with either the PATH or MDUSTDIR environment variable. =head1 SEE ALSO L, L, L, L =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Donald Jackson (donald.jackson@bms.com) =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Mdust; require 5.005_62; use strict; use Bio::SeqIO; use Bio::SeqFeature::Generic; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Run::WrapperBase; use vars qw($AUTOLOAD $PROGRAMNAME @ARGNAMES @MASKCHARS $VERSION @ISA); @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); @ARGNAMES = qw(TARGET WSIZE CUTOFF MASKCHAR COORDS TMPDIR DEBUG); $PROGRAMNAME = 'mdust'; @MASKCHARS = qw(N X L); =head2 new Title : new Usage : my $mdust = Bio::Tools::Run::Mdust->new( -target => $target_bioseq) Purpose : Create a new mdust object Returns : A Bio::Seq object Args : target - Bio::Seq object for masking - alphabet MUST be DNA. wsize - word size for masking (default = 3) cutoff - cutoff score for masking (default = 28) maskchar - character for replacing masked regions (default = N) coords - boolean - indicate low-complexity regions as Bio::SeqFeature::Generic objects with primary tag 'Excluded', do not change sequence (default 0) tmpdir - directory for storing temporary files debug - boolean - toggle debugging output, do not remove temporary files Notes : All of the arguments can also be get/set with their own accessors, such as: my $wsize = $mdust->wsize(); When processing multiple sequences, call Bio::Tools::Run::Mdust->new() once then pass each sequence as an argument to the target() or run() methods. =cut sub new { my ($proto, @args) = @_; my $pkg = ref($proto) || $proto; my %args; my $self = { wsize => undef, cutoff => undef, maskchar => undef, coords => 0, }; bless ($self, $pkg); @args{@ARGNAMES} = $self->_rearrange(\@ARGNAMES, @args); # load target first since it requires special handling $self->target($args{'TARGET'}) if ($args{'TARGET'}); # package settings $self->{'coords'} = $args{'COORDS'} if (defined $args{'COORDS'}); $self->{'tmpdir'} = $args{'TMPDIR'} || $ENV{'TMPDIR'} || $ENV{'TMP'} || '.'; # mdust options $self->{'wsize'} = $args{'WSIZE'} if (defined $args{'WSIZE'}); $self->{'cutoff'} = $args{'CUTOFF'} if (defined $args{'CUTOFF'}); $self->{'maskchar'} = $args{'MASKCHAR'} if (defined $args{'CUTOFF'}); # set debugging $self->verbose($args{'DEBUG'}); return $self; } =head2 run Title : run Usage : $mdust->run(); Purpose : Run mdust on the target sequence Args : target (optional) - Bio::Seq object of alphabet DNA for masking Returns : Bio::Seq object with masked sequence or low-complexity regions added to feature table. =cut sub run { my ($self, $target) = @_; if ($target) { $self->target($target); } return $self->_run_mdust; } sub program_dir { return Bio::Root::IO->catfile($ENV{MDUSTDIR}) if $ENV{MDUSTDIR}; } sub program_name { return $PROGRAMNAME; } sub _run_mdust { # open a pipe to the mdust command. Pass in sequence(s?) as fasta # files on STDIN, recover filtered seqs on STDOUT my ($self) = @_; my $target = $self->target or warn "No target sequence specified\n" && return undef; # make sure program is available - doesn't seem to check #my $executable = $self->executable('mdust', 1); # add options my $mdust_cmd = $self->program_path; $mdust_cmd .= " -w " . $self->wsize if (defined $self->wsize); $mdust_cmd .= " -v " . $self->cutoff if (defined $self->cutoff); $mdust_cmd .= " -m " . $self->maskchar if (defined $self->maskchar); $mdust_cmd .= " -c" if ($self->coords); print STDERR "Running mdust: $mdust_cmd\n" if ($self->debug); my $maskedfile = $self->_maskedfile; eval { my $pid = open (MDUST, "| $mdust_cmd > $maskedfile"); # bind STDIN of mdust to filehandle local $| = 1; my $seqout = Bio::SeqIO->new(-fh => \*MDUST, -format => 'Fasta'); $seqout->write_seq($target); close MDUST; # need to do this to get output to flush! }; $self->throw($@) if ($@); my $rval; if ($self->coords) { $self->_parse_coords($maskedfile); $rval = $self->target; } else { # replace original seq w/ masked seq my $seqin = Bio::SeqIO->new(-file=>$maskedfile, -format => 'Fasta'); $rval = $seqin->next_seq } unlink $maskedfile unless $self->save_tempfiles; return $rval; } =head2 target Title : target Usage : $mdust->target($bio_seq) Purpose : Set/get the target (sequence to be filtered). Returns : Target Bio::Seq object Args : Bio::SeqI or Bio::PrimarySeqI object using the DNA alphabet (optional) Note : If coordinate parsing is selected ($mdust->coords = 1) then target MUST be a Bio::Seq::RichSeqI object. Passing a RichSeqI object automatically turns on coordinate parsing. =cut sub target { my ($self, $targobj) = @_; if ($targobj) { return $self->_set_target($targobj); } else { return $self->{'target'}; } } sub _set_target { my ($self, $targobj) = @_; unless ($targobj->isa('Bio::SeqI') or ($targobj->isa('Bio::PrimarySeqI'))) { $self->throw( -text => "Target must be passed as a Bio::SeqI or Bio::PrimarySeqI object", -class => 'Bio::Root::BadParameter', -value => $targobj ); } if ($self->coords) { unless ($targobj->isa('Bio::Seq::RichSeqI')) { $self->throw( -text => "Target must be passed as a Bio::Seq::RichSeqSeqI object when coords == 1", -class => 'Bio::Root::BadParameter', -value => $targobj ); } } elsif ($targobj->isa('Bio::Seq::RichSeqI')) { $self->coords(1); } unless ($targobj->alphabet eq 'dna') { $self->throw( -text => "Target must be a DNA sequence", -class => 'Bio::Root::BadParameter', -value => $targobj ); } $self->{'target'} = $targobj; return 1; } sub _maskedfile { my ($self, $file) = @_; my $tmpdir = $self->tempdir; if ($file) { $self->{'maskedfile'} = $file; # add some sanity chex for writability? } elsif (!$self->{'maskedfile'}) { ($self->{'maskedfh'},$self->{'maskedfile'}) = $self->io->tempfile(-dir=>$self->tempdir()); } return $self->{'maskedfile'}; } sub _parse_coords { my ($self, $file) = @_; my $target = $self->target; open(FILE, $file) or die "Unable to open $file: $!"; while () { chomp; s/\r//; my ($seq, $length, $mstart, $mstop) = split(/\t/); # add masked region as a SeqFeature in target my $masked = Bio::SeqFeature::Generic->new( -start => $mstart, -end => $mstop, ); $masked->primary_tag('Excluded'); $masked->source_tag('mdust'); $target->add_SeqFeature($masked); } return 1; } =head2 maskchar Title : maskchar Usage : $mdust->maskchar('N') Purpose : Set/get the character for masking low-complexity regions Returns : True on success Args : Either N (default), X or L (lower case) =cut sub maskchar { my ($self, $maskchar) = @_; return $self->{'maskchar'} unless (defined $maskchar); unless ( grep {$maskchar eq $_} @MASKCHARS ) { $self->throw( -text => "maskchar must be one of N, X or L", -class => 'Bio::Root::BadParameter', -value => $maskchar ); } $self->{'maskchar'} = $maskchar; 1; } sub DESTROY { my $self= shift; unless ( $self->save_tempfiles ) { $self->cleanup(); } $self->SUPER::DESTROY(); } sub AUTOLOAD { my ($self, $value) = @_; my $name = $AUTOLOAD; $name =~ s/.+:://; return if ($name eq 'DESTROY'); if (defined $value) { $self->{$name} = $value; } unless (exists $self->{$name}) { warn "Attribute $name not defined for ", ref($self), "\n" if ($self->debug); return undef; } return $self->{$name}; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/000077500000000000000000000000001342734133000216125ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/FastTree.pm000066400000000000000000000241171342734133000236720ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::FastTree # # Please direct questions and support issues to # # Copyright Brian Osborne # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::FastTree =head1 SYNOPSIS # Build a FastTree factory $factory = Bio::Tools::Run::Phylo::FastTree->new(-quiet => 1, -fastest => 1); # Get an alignment my $alignio = Bio::AlignIO->new( -format => 'fasta', -file => '219877.cdna.fasta'); my $alnobj = $alignio->next_aln; # Analyze the aligment and get a Tree my $tree = $factory->run($alnobj); =head1 DESCRIPTION Get a Bio::Tree object given a protein or DNA alignment. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I Do not contact the module maintainer directly. Many experienced experts at bioperl-l will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Brian Osborne Email briano@bioteam.net =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::FastTree; use strict; use Bio::Seq; use Bio::SeqIO; use Bio::TreeIO; use Bio::AlignIO; use Bio::Root::IO; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); our @FastTree_PARAMS = qw(log cat n intree intree1 constraints sprlength topm close refresh constraintWeight spr mlacc nni mlnni seed matrix gtrrates gtrfreq makematrix ); our @FastTree_SWITCHES = qw(quiet nopr nt fastest slow nosupport gtr wag quote noml nome gamma mllen slownni nocat notoo 2nd no2nd nj bionj top notop nomatrix rawdist ); our $PROGRAM_NAME = 'FastTree'; =head2 new Title : new Usage : my $treebuilder = Bio::Tools::Run::Phylo::FastTree->new(); Function: Constructor Returns : Bio::Tools::Run::Phylo::FastTree Args : -outfile_name => $outname =cut sub new { my ( $class, @args ) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args( \@args, -methods => [ @FastTree_PARAMS, @FastTree_SWITCHES ], -create => 1 ); my ($out) = $self->SUPER::_rearrange( [qw(OUTFILE_NAME)], @args ); $self->outfile_name( $out || '' ); $self; } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory Returns: string Args : =cut sub program_dir { undef; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string { my ( $self, $value ) = @_; $self->{'error_string'} = $value if ( defined $value ); $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe 2>&1`; $string =~ /FastTree\s+version\s+([\d\.]+)/; return $1 || undef; } =head2 run Title : run Usage : $factory->run($stockholm_file) OR $factory->run($align_object) Function: Runs FastTree to generate a tree Returns : Bio::Tree::Tree object Args : File name for your input alignment in stockholm format, OR Bio::Align::AlignI compliant object (eg. Bio::SimpleAlign). =cut sub run { my ($self, $in) = @_; if (ref $in && $in->isa("Bio::Align::AlignI")) { $in = $self->_write_alignfile($in); } elsif (! -e $in) { $self->throw("When not supplying a Bio::Align::AlignI object, you must supply a readable filename"); } $self->_run($in); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: Runs the application Returns : Tree object Args : Alignment file name =cut sub _run { my ( $self, $file ) = @_; # If -nt is not set check the alphabet of the input $self->_alphabet($file) if ( ! $self->nt ); my $exe = $self->executable || return; my $param_str = $self->arguments . " " . $self->_setparams($file); my $command = "$exe $param_str"; $self->debug("FastTree command = $command"); my $status = system($command); my $outfile = $self->outfile_name(); if ( !-e $outfile || -z $outfile ) { $self->warn("FastTree call had status of $status: $? [command $command]\n"); return undef; } my $treeio = Bio::TreeIO->new( -format => 'newick', -file => $outfile ); my $tree = $treeio->next_tree; # if bootstraps were enabled, the bootstraps are the ids; convert to # bootstrap and no id # if ($self->boot) { # my @nodes = $tree->get_nodes; # my %non_internal = map { $_ => 1 } ($tree->get_leaf_nodes, $tree->get_root_node); # foreach my $node (@nodes) { # next if exists $non_internal{$node}; # $node->bootstrap && next; # protect ourselves incase the parser improves # $node->bootstrap($node->id); # $node->id(''); # } # } $tree; } =head2 _write_alignfile Title : _write_alignfile Usage : Internal function, not to be called directly Function: Create an alignment file Returns : filename Args : Bio::Align::AlignI =cut sub _write_alignfile { my ( $self, $align ) = @_; my ( $tfh, $tempfile ) = $self->io->tempfile( -dir => $self->tempdir ); my $out = Bio::AlignIO->new( -file => ">$tempfile", -format => 'phylip' ); $out->write_aln($align); $out->close(); undef($out); close($tfh); undef($tfh); die "Alignment file $tempfile was not created" if ( ! -e $tempfile ); $tempfile; } =head2 _alphabet Title : _alphabet Usage : my $alphabet = $self->_alphabet; Function: Get the alphabet of the input alignment, defaults to 'dna' Returns : 'dna' or 'protein' Args : Alignment file =cut sub _alphabet { my ($self,$file) = @_; if ( $file ) { if ( -e $file ) { my $in = Bio::AlignIO->new(-file => $file); my $aln = $in->next_aln; # arbitrary, the first one my $seq = $aln->get_seq_by_pos(1); my $alphabet = $seq->alphabet; $self->{_alphabet} = $alphabet; $self->nt(1) if ( $alphabet eq 'dna' ); } else { die "File $file can not be found"; } } # default is 'dna' return $self->{'_alphabet'} || 'dna'; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for FastTree program Example : Returns : parameter string to be passed to FastTree Args : name of calling object =cut sub _setparams { my ($self,$infile) = @_; my ( $attr, $value, $param_string ); $param_string = ''; my $laststr; for $attr (@FastTree_PARAMS) { $value = $self->$attr(); next unless ( defined $value ); my $attr_key = lc $attr; $attr_key = ' -' . $attr_key; $param_string .= $attr_key . ' ' . $value; } for $attr (@FastTree_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; $attr_key = ' -' . $attr_key; $param_string .= $attr_key; } # Set default output file if no explicit output file has been given if ( ! $self->outfile_name ) { my ( $tfh, $outfile ) = $self->io->tempfile( -dir => $self->tempdir() ); close($tfh); undef $tfh; $self->outfile_name($outfile); } $param_string .= " $infile > " . $self->outfile_name; my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 2> $null" if ( $self->quiet() || $self->verbose < 0 ); $param_string; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $FastTree->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $FastTree->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy __END__ bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Gerp.pm000077500000000000000000000230021342734133000230450ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::Gerp # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Gerp - Wrapper for GERP =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Gerp; # Make a Gerp factory $factory = Bio::Tools::Run::Phylo::Gerp->new(); # Run Gerp with an alignment and tree file my $parser = $factory->run($alignfilename, $treefilename); # or with alignment object and tree object (which needs branch lengths) $parser = $factory->run($bio_simplalign, $bio_tree_tree); # (mixtures of the above are possible) # look at the results while (my $feat = $parser->next_result) { my $start = $feat->start; my $end = $feat->end; my $rs_score = $feat->score; my $p_value = ($feat->annotation->get_Annotations('p-value'))[0]->value; } =head1 DESCRIPTION This is a wrapper for running the GERP (v2) programs 'gerpcol' and 'gerpelem' by Eugene Davydov (originally Gregory M. Cooper et al.). You can get details here: http://mendel.stanford.edu/sidowlab/. GERP can be used for phylogenetic footprinting/ shadowing (it finds 'constrained elements in multiple alignments'). You can try supplying normal gerpcol/gerpelem command-line arguments to new(), eg. $factory-Enew(-e =E 0.05) or calling arg-named methods, eg. $factory-Ee(0.05). The filename-related args (t, f, x) are handled internally by the run() method. This wrapper currently only supports running GERP on a single alignment at a time (ie. F isn't used at all, nor are multiple fs possible). You will need to enable this GERP wrapper to find the GERP executables. This can be done in (at least) three ways: 1. Make sure gerpcol and gerpelem are in your path. 2. Define an environmental variable GERPDIR which is a directory which contains the GERP executables: In bash: export GERPDIR=/home/username/gerp/ In csh/tcsh: setenv GERPDIR /home/username/gerp 3. Include a definition of an environmental variable GERPDIR in every script that will use this GERP wrapper module, e.g.: BEGIN { $ENV{GERPDIR} = '/home/username/gerp/' } use Bio::Tools::Run::Phylo::Gerp; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::Gerp; use strict; use Cwd; use File::Spec; use File::Basename; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Phylo::Gerp; use base qw(Bio::Tools::Run::Phylo::PhyloBase); our $PROGRAM_NAME = 'gerpcol'; our $PROGRAM_DIR; # methods for the gerp args we support our @COLPARAMS = qw(r n s); our @ELEMPARAMS = qw(l L t d p b a c r e); our @SWITCHES = qw(v); # just to be explicit, args we don't support (yet) or we handle ourselves our @UNSUPPORTED = qw(h t f F x); BEGIN { # lets add all the gerp executables to the path $PROGRAM_DIR = $ENV{'GERPDIR'}; $ENV{PATH} = "$PROGRAM_DIR:$ENV{PATH}" if $PROGRAM_DIR; } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { my $self = shift; if (@_) { $self->{program_name} = shift } return $self->{program_name} || $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 new Title : new Usage : $factory = Bio::Tools::Run::Phylo::Gerp->new() Function: creates a new GERP factory Returns : Bio::Tools::Run::Phylo::Gerp Args : Most options understood by GERP can be supplied as key => value pairs. These options can NOT be used with this wrapper: h, t, f, F and x =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => [@COLPARAMS, @ELEMPARAMS, @SWITCHES, 'quiet'], -create => 1); return $self; } =head2 run Title : run Usage : $parser = $factory->run($align_file, $tree_file); -or- $parser = $factory->run($align_object, $tree_object); Function: Runs GERP on an alignment. Returns : Bio::Tools::Phylo::Gerp parser object, containing the results Args : The first argument represents an alignment, the second argument a phylogenetic tree with branch lengths. The alignment can be provided as a MAF format alignment filename, or a Bio::Align::AlignI compliant object (eg. a Bio::SimpleAlign). The species tree can be provided as a newick format tree filename or a Bio::Tree::TreeI compliant object. In all cases, the alignment sequence names must correspond to node ids in the tree. Multi-word species names should have the spaces replaced with underscores (eg. Homo_sapiens) =cut sub run { my ($self, $aln, $tree) = @_; $self->_alignment($aln || $self->throw("An alignment must be supplied")); $self->_tree($tree || $self->throw("A phylo tree must be supplied")); # check node and seq names match $self->_check_names; return $self->_run; } sub _run { my $self = shift; $self->executable || return; # cd to a temp dir my $temp_dir = $self->tempdir; my $cwd = Cwd->cwd(); chdir($temp_dir) || $self->throw("Couldn't change to temp dir '$temp_dir'"); foreach my $prog ('gerpcol', 'gerpelem') { delete $self->{'_pathtoexe'}; $self->program_name($prog); my $exe = $self->executable || $self->throw("'$prog' executable not found"); my $command = $exe.$self->_setparams($prog); $self->debug("gerp command = $command\n"); #eval { # local $SIG{ALRM} = sub { die "alarm\n" }; # alarm 60; # system($command) && $self->throw("gerp call ($command) failed: $! | $?"); # alarm 0; #}; #die if $@ && $@ ne "alarm\n"; #if ($@) { # die "Gerp timed out\n"; #} # # system("rm -fr $cwd/gerp_dir; cp -R $temp_dir $cwd/gerp_dir"); open(my $pipe, "$command |") || $self->throw("gerp call ($command) failed to start: $? | $!"); my $error = ''; my $warning = ''; while (<$pipe>) { if ($self->quiet) { $error .= $_; $warning .= $_ if /warning/i; } else { print; } } close($pipe) || ($error ? $self->throw("gerp call ($command) failed: $error") : $self->throw("gerp call ($command) crashed: $?")); # (throws most likely due to seg fault in gerpelem when ~25000 entries # in rates file, not much I can do about it!) $self->warn("GERP: ".$warning) if $warning; } #system("rm -fr $cwd/gerp_dir; cp -R $temp_dir $cwd/gerp_dir"); my $result_file = $self->{align_base}.'.rates.elems'; my $parser = Bio::Tools::Phylo::Gerp->new(-file => $result_file); # cd back again chdir($cwd) || $self->throw("Couldn't change back to working directory '$cwd'"); return $parser; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : none =cut sub _setparams { my ($self, $prog) = @_; my $param_string; if ($prog eq 'gerpcol') { my $align_file = $self->_write_alignment; $param_string .= ' -f '.$align_file; $self->{align_base} = basename($align_file); $param_string .= ' -t '.$self->_write_tree; $param_string .= $self->SUPER::_setparams(-params => \@COLPARAMS, -switches => \@SWITCHES, -dash => 1); } else { $param_string .= ' -f '.$self->{align_base}.'.rates'; $param_string .= $self->SUPER::_setparams(-params => \@ELEMPARAMS, -switches => \@SWITCHES, -dash => 1); } $param_string .= " 2>&1"; return $param_string; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Hyphy/000077500000000000000000000000001342734133000227135ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Hyphy/Base.pm000066400000000000000000000417431342734133000241340ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::Hyphy::Base # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Hyphy::Base - Hyphy wrapping base methods =head1 SYNOPSIS FIXME =head1 DESCRIPTION HyPhy ([Hy]pothesis Testing Using [Phy]logenies) package of Sergei Kosakowsky Pond, Spencer V. Muse, Simon D.W. Frost and Art Poon. See http://www.hyphy.org for more information. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::Hyphy::Base; use strict; use Bio::Root::Root; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::WrapperBase; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); =head2 Default Values Valid and default values are listed below. The default values are always the first one listed. These descriptions are essentially lifted from the python wrapper or provided by the author. =cut our $PROGRAMNAME = 'HYPHYMP'; our $PROGRAM; BEGIN { if( defined $ENV{'HYPHYDIR'} ) { $PROGRAM = Bio::Root::IO->catfile($ENV{'HYPHYDIR'},$PROGRAMNAME). ($^O =~ /mswin/i ?'.exe':'');; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAMNAME; } =head2 valid_values Title : valid_values Usage : $factory->valid_values() Function: returns the possible parameters Returns: an array holding all possible parameters (this needs to be specified per child class). Returns an empty array in the base class. Args : None =cut sub valid_values { return (); } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{HYPHYDIR}) if $ENV{HYPHYDIR}; } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Hyphy->new(); Function: Builds a new Bio::Tools::Run::Phylo::Hyphy object Returns : Bio::Tools::Run::Phylo::Hyphy Args : -alignment => the Bio::Align::AlignI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -tree => the Bio::Tree::TreeI object -params => a hashref of parameters (all passed to set_parameter) -executable => where the hyphy executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my $versionstring = $self->version(); return $self; } =head2 prepare Title : prepare Usage : my $rundir = $hyphy->prepare($aln); Function: prepare the analysis using the default or updated parameters the alignment parameter must have been set Returns : value of rundir Args : L object, L object [optional] =cut sub prepare { my ($self,$aln,$tree) = @_; $tree = $self->tree unless $tree; $aln = $self->alignment unless $aln; if( ! $aln ) { $self->warn("must have supplied a valid alignment file in order to run hyphy"); return 0; } my ($tempdir) = $self->tempdir(); my ($tempseqFH,$tempalnfile); if( ! ref($aln) && -e $aln ) { $tempalnfile = $aln; } else { ($tempseqFH,$tempalnfile) = $self->io->tempfile('-dir' => $tempdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); $aln->set_displayname_flat(1); my $alnout = Bio::AlignIO->new('-format' => 'fasta', '-fh' => $tempseqFH); $alnout->write_aln($aln); $alnout->close(); undef $alnout; close($tempseqFH); } $self->{'_params'}{'tempalnfile'} = $tempalnfile; # setting a new temp file to hold the run output for debugging $self->{'run_output'} = "$tempdir/run_output"; my $outfile = $self->outfile_name; if ($outfile eq "") { $outfile = "$tempdir/results.out"; $self->outfile_name($outfile); } my ($temptreeFH,$temptreefile); if( ! ref($tree) && -e $tree ) { $temptreefile = $tree; } else { ($temptreeFH,$temptreefile) = $self->io->tempfile('-dir' => $tempdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $treeout = Bio::TreeIO->new('-format' => 'newick', '-fh' => $temptreeFH); $treeout->write_tree($tree); $treeout->close(); close($temptreeFH); } $self->{'_params'}{'temptreefile'} = $temptreefile; $self->create_wrapper; $self->{_prepared} = 1; return $tempdir; } =head2 create_wrapper Title : create_wrapper Usage : $self->create_wrapper Function: It will create the wrapper file that interfaces with the analysis bf file Example : Returns : Args : =cut sub create_wrapper { my $redirect = "stdinRedirect"; my ($self,$batchfile) = @_; my $tempdir = $self->tempdir; $self->update_ordered_parameters; #check version of HYPHY: my $versionstring = $self->version(); $versionstring =~ /.*?(\d+\.\d+).*/; my $version = $1; my $wrapper = "$tempdir/wrapper.bf"; open(WRAPPER, ">", $wrapper) or $self->throw("cannot open $wrapper for writing"); print WRAPPER qq{$redirect = {};\n\n}; my $counter = sprintf("%02d", 0); foreach my $elem (@{ $self->{'_orderedparams'} }) { my ($param,$val) = each %$elem; if ($val eq "") { $val = "$tempdir/$param"; # any undefined parameters must be temporary output files. } print WRAPPER qq{$redirect ["$counter"] = "$val";\n}; $counter = sprintf("%02d",$counter+1); } # This next line is for BatchFile: if ((ref ($self)) =~ m/BatchFile/) { print WRAPPER "\nExecuteAFile ($batchfile, $redirect);\n"; } else { # Not exactly sure what version of HYPHY caused this change, # but Github source changes suggest that it was sometime # after version 0.9920060501 was required. $batchfile =~ s/"//g; # remove any extra quotes in the batchfile name. if ($version >= 0.9920060501) { print WRAPPER qq{\nExecuteAFile (HYPHY_LIB_DIRECTORY + "TemplateBatchFiles" + DIRECTORY_SEPARATOR + "$batchfile", stdinRedirect);\n}; } else { print WRAPPER qq{\nExecuteAFile (HYPHY_BASE_DIRECTORY + "TemplateBatchFiles" + DIRECTORY_SEPARATOR + "$batchfile", stdinRedirect);\n}; } } close(WRAPPER); $self->{'_wrapper'} = $wrapper; } =head2 run Title : run Usage : my ($rc,$results) = $BatchFile->run(); Function: run the Hyphy analysis using the specified batchfile and its ordered parameters Returns : Return code, Hash Args : none =cut sub run { my ($self) = @_; my $aln = $self->alignment; my $tree = $self->tree; unless (defined($self->{'_prepared'})) { $self->prepare($aln,$tree); } my $rc = 1; my $results = ""; my $commandstring; my $exe = $self->executable(); unless ($exe && -e $exe && -x _) { $self->throw("unable to find or run executable for 'HYPHY'"); } #runs the HYPHY command $commandstring = $exe . " BASEPATH=" . $self->program_dir . " " . $self->{'_wrapper'}; my $pid = open(RUN, "-|", "$commandstring") or $self->throw("Cannot open exe $exe"); my $waiting = waitpid $pid,0; # waitpid will leave a nonzero error in $? if the HYPHY command crashes, so we should bail gracefully. my $error = $? & 127; if ($error != 0) { $self->throw("Error: " . $self->program_name . " ($waiting) quit unexpectedly with signal $error"); } #otherwise, return the results and exit with 1 so that the parent knows we were successful. while (my $line = ) { $results .= "$line"; } close(RUN); # process the errors from $? and set the error values. $rc = $? >> 8; if (($results =~ m/error/i) || ($rc == 0)) { # either the child process had an error, or HYPHY put one in the output. $rc = 0; $self->warn($self->program_name . " reported error $rc - see error_string for the program output"); $results =~ m/(error.+)/is; $self->error_string($1); } # put these run results into the temp run output file: open (OUT, ">", $self->{'run_output'}); print OUT $results; close OUT; return ($rc,$results); } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string { my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 alignment Title : alignment Usage : $hyphy->alignment($aln); Function: Get/Set the L object Returns : L object Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub alignment { my ($self,$aln) = @_; if( defined $aln ) { if( -e $aln ) { $self->{'_alignment'} = $aln; } elsif( !ref($aln) || !$aln->isa('Bio::Align::AlignI') ) { $self->warn("Must specify a valid Bio::Align::AlignI object to alignment(): you specified a " . ref($aln)); return; } else { $self->{'_alignment'} = $aln; } } return $self->{'_alignment'}; } =head2 tree Title : tree Usage : $hyphy->tree($tree); Function: Get/Set the L object Returns : L Args : [optional] $tree => L, Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub tree { my ($self, $tree, %params) = @_; if( defined $tree ) { if( !ref($tree) || !$tree->isa('Bio::Tree::TreeI') ) { $self->warn("Must specify a valid Bio::Tree::TreeI object to tree(): you specified a " . ref($tree)); return; } else { $self->{'_tree'} = $tree; } } return $self->{'_tree'}; } =head2 get_parameters Title : get_parameters Usage : my %params = $self->get_parameters(); Function: returns the list of parameters as a hash Returns : associative array keyed on parameter names Args : none =cut sub get_parameters { my ($self) = @_; # we're returning a copy of this return %{ $self->{'_params'} }; } =head2 set_parameter Title : set_parameter Usage : $hyphy->set_parameter($param,$val); Function: Sets a hyphy parameter, will be validated against the valid values. The checks can be ignored if one turns off param checks like this: $hyphy->no_param_checks(1) Returns : boolean if set was success, if verbose is set to -1 then no warning will be reported Args : $param => name of the parameter $value => value to set the parameter to See also: L =cut sub set_parameter { my ($self,$param,$value) = @_; # FIXME - add validparams checking $self->{'_params'}{$param} = $value; return 1; } =head2 set_default_parameters Title : set_default_parameters Usage : $obj->set_default_parameters(); Function: (Re)set the default parameters from the defaults (the first value in each array in the valid_values() array) Returns : none Args : none =cut sub set_default_parameters { my ($self) = @_; my @validvals = $self->valid_values(); foreach my $elem (@validvals) { keys %$elem; #reset hash iterator my ($param,$val) = each %$elem; if (ref($val)=~/ARRAY/i ) { unless (ref($val->[0])=~/HASH/i) { push @{ $self->{'_orderedparams'} }, {$param, $val->[0]}; } else { $val = $val->[0]; } } if ( ref($val) =~ /HASH/i ) { my $prevparam; while (defined($val)) { last unless (ref($val) =~ /HASH/i); last unless (defined($param)); $prevparam = $param; ($param,$val) = each %{$val}; push @{ $self->{'_orderedparams'} }, {$prevparam, $param}; push @{ $self->{'_orderedparams'} }, {$param, $val} if (defined($val)); } } elsif (ref($val) !~ /HASH/i && ref($val) !~ /ARRAY/i) { push @{ $self->{'_orderedparams'} }, {$param, $val}; } } } =head2 update_ordered_parameters Title : update_ordered_parameters Usage : $hyphy->update_ordered_parameters(0); Function: (Re)set the default parameters from the defaults (the first value in each array in the %VALIDVALUES class variable) Returns : none Args : boolean: keep existing parameter values =cut sub update_ordered_parameters { my ($self) = @_; for (my $i=0; $i < scalar(@{$self->{'_orderedparams'}}); $i++) { my ($param,$val) = each %{$self->{'_orderedparams'}[$i]}; if (exists $self->{'_params'}{$param}) { $self->{'_orderedparams'}[$i] = {$param, $self->{'_params'}{$param}}; } else { $self->{'_orderedparams'}[$i] = {$param, $val}; } } } =head2 outfile_name Title : outfile_name Usage : my $outfile = $hyphy->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut sub outfile_name { my $self = shift; if( @_ ) { return $self->{'_params'}->{'outfile'} = shift @_; } return $self->{'_params'}->{'outfile'}; } =head2 version Title : version Usage : $obj->version() Function: Returns the version string from HYPHY Returns : string Args : none =cut sub version { my $self = shift; my $tempdir = $self->tempdir; if (defined $self->{'_version'}) { return $self->{'_version'}; } # if it's not already defined, write out a small batchfile to return the version string, then clean up. my $versionbf = "$tempdir/version.bf"; open(WRAPPER, ">", $versionbf) or $self->throw("cannot open $versionbf for writing"); print WRAPPER qq{GetString (versionString, HYPHY_VERSION, 2);\nfprintf (stdout, versionString);}; close(WRAPPER); my $exe = $self->executable(); unless ($exe && -e $exe && -x _) { $self->throw("unable to find or run executable for 'HYPHY'"); } my $commandstring = $exe . " BASEPATH=" . $self->program_dir . " " . $versionbf; open(RUN, "$commandstring |") or $self->throw("Cannot open exe $exe"); my $output = ; close(RUN); unlink $versionbf; $self->{'_version'} = $output; return $output; } =head2 hyphy_lib_dir Title : hyphy_lib_dir Usage : $obj->hyphy_lib_dir() Function: Returns the HYPHY_LIB_DIRECTORY from HYPHY Returns : string Args : none =cut sub hyphy_lib_dir { my $self = shift; if (defined $self->{'_hyphylibdir'}) { return $self->{'_hyphylibdir'}; } # if it's not already defined, write out a small batchfile to return the version string, then clean up. my $hyphylibdirbf = $self->io->catfile($self->tempdir,"hyphylibdir.bf"); open(WRAPPER, ">", $hyphylibdirbf) or $self->throw("cannot open $hyphylibdirbf for writing"); print WRAPPER qq{fprintf (stdout, HYPHY_LIB_DIRECTORY);}; close(WRAPPER); my $exe = $self->executable(); unless ($exe && -e $exe && -x _) { $self->throw("unable to find or run executable for 'HYPHY'"); } my $commandstring = $exe . " BASEPATH=" . $self->program_dir . " " . $hyphylibdirbf; open(RUN, "$commandstring |") or $self->throw("Cannot open exe $exe"); my $output = ; close(RUN); unlink $hyphylibdirbf; $self->{'_hyphylibdir'} = $output; return $output; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Hyphy/BatchFile.pm000066400000000000000000000232541342734133000251000ustar00rootroot00000000000000=head1 NAME Bio::Tools::Run::Phylo::Hyphy::BatchFile - Wrapper for custom execution of Hyphy batch files =head1 SYNOPSIS my $aln = Bio::Align::AlignI->new(); my $treeio = Bio::TreeIO->new(-format => "nexus", -file => "$tree_file"); my $tree = $treeio->next_tree(); my $bf_exec = Bio::Tools::Run::Phylo::Hyphy::BatchFile->new(-params => {'bf' => "hyphybatchfile.bf", 'order' => ["Universal", "Custom", $aln, "001001", $tree]}); $bf_exec->set_parameter('3', "012012"); my ($rc,$parser) = $bf_exec->run(); =head1 DESCRIPTION This module creates a generic interface to processing of HBL files in HyPhy ([Hy]pothesis Testing Using [Phy]logenies), a package by Sergei Kosakowsky Pond, Spencer V. Muse, Simon D.W. Frost and Art Poon. See http://www.hyphy.org for more information. Instances of this module require only a link to the batch file and an ordered list of parameters, as described in the HyPhy documentation "SelectionAnalyses.pdf." =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Daisie Huang Email daisieh@zoology.ubc.ca =head1 CONTRIBUTORS Additional contributors names and emails here =cut package Bio::Tools::Run::Phylo::Hyphy::BatchFile; use strict; use Bio::Root::Root; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::Phylo::Hyphy::Base; use Bio::Tools::Run::WrapperBase; use base qw(Bio::Root::Root Bio::Tools::Run::Phylo::Hyphy::Base); =head2 valid_values Title : valid_values Usage : $factory->valid_values() Function: returns the possible parameters Returns: an array holding all possible parameters. The default values are always the first one listed. These descriptions are essentially lifted from the python wrapper or provided by the author. Args : None =cut sub valid_values { return ( {'geneticCode' => [ "Universal","VertebratemtDNA","YeastmtDNA","Mold/ProtozoanmtDNA", "InvertebratemtDNA","CiliateNuclear","EchinodermmtDNA","EuplotidNuclear", "Alt.YeastNuclear","AscidianmtDNA","FlatwormmtDNA","BlepharismaNuclear"]}, {'tempalnfile' => undef }, # aln file goes here {'temptreefile' => undef }, # tree file goes here ); } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Hyphy::BatchFile->new(); Function: Builds a new Bio::Tools::Run::Phylo::Hyphy::BatchFile object Returns : Bio::Tools::Run::Phylo::Hyphy::BatchFile Args : -alignment => the Bio::Align::AlignI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -tree => the Bio::Tree::TreeI object -params => a hashref of parameters (all passed to set_parameter) this hashref should include 'bf' => custombatchfile.bf 'order' => [array of ordered parameters] -executable => where the hyphy executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($aln, $tree, $st, $params, $exe) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES PARAMS EXECUTABLE)], @args); defined $aln && $self->alignment($aln); defined $tree && $self->tree($tree); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); $self->set_default_parameters(); if( defined $params ) { if( ref($params) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { map { $self->set_parameter($_, $$params{$_}) } keys %$params; } } return $self; } =head2 update_ordered_parameters Title : update_ordered_parameters Usage : $BatchFile->update_ordered_parameters(); Function: updates all of the parameters needed for the ordered input redirect in HBL. Returns : nothing Args : none =cut sub update_ordered_parameters { my ($self) = @_; unless (defined ($self->{'_params'}{'order'})) { $self->throw("No ordered parameters for HYPHY were defined."); } for (my $i=0; $i< scalar @{$self->{'_params'}{'order'}}; $i++) { my $item = @{$self->{'_params'}{'order'}}[$i]; #FIXME: update_ordered_parameters should be more flexible. It should be able to tell what type of object $item is and, if necessary, create a temp file for it. if (ref ($item) =~ m/Bio::SimpleAlign/) { $item = $self->{'_params'}{'tempalnfile'}; } elsif (ref ($item) =~ m/Bio::Tree::Tree/) { $item = $self->{'_params'}{'temptreefile'}; } $self->{'_orderedparams'}[$i] = {$i, $item}; } $self->SUPER::update_ordered_parameters(); } =head2 run Title : run Usage : my ($rc,$results) = $BatchFile->run(); Function: run the Hyphy analysis using the specified batchfile and its ordered parameters Returns : Return code, Hash Args : none =cut sub run { my $self = shift; my ($rc, $results) = $self->SUPER::run(); my $outfile = $self->outfile_name(); open(OUTFILE, ">", $outfile) or $self->throw("cannot open $outfile for writing"); print OUTFILE $results; close(OUTFILE); return ($rc,$results); } =head2 create_wrapper Title : create_wrapper Usage : $self->create_wrapper Function: Creates the wrapper file for the batchfile specified in the hash, saves it to the hash as '_wrapper'. Returns : nothing Args : none =cut sub create_wrapper { my $self = shift; my $batchfile = $self->batchfile; unless (defined($batchfile)) { $self->throw("No batchfile specified, couldn't create wrapper."); } unless (-f $batchfile) { # check to see if maybe this batchfile is a template batchfile my $new_bf = $self->io->catfile($self->hyphy_lib_dir,"TemplateBatchFiles",$batchfile); $new_bf =~ s/\"//g; if (-f $new_bf) { $self->batchfile($new_bf); } else { $self->throw ("Specified batchfile $batchfile not found."); return; } } $self->SUPER::create_wrapper('"' . $self->batchfile . '"'); } =head2 set_parameter Title : set_parameter Usage : $hyphy->set_parameter($param,$val); Function: Sets the named parameter $param to $val if it is a non-numeric parameter If $param is a number, sets the corresponding value of the ordered redirect array (starts from 1). Returns : boolean if set was successful Args : $param => name of the parameter $value => value to set the parameter to =cut sub set_parameter { my ($self,$param,$value) = @_; if ($param =~ /\d+/) { $self->{'_params'}{'order'}[$param-1] = $value; } else { $self->{'_params'}{$param} = $value; } return 1; } =head2 batchfile Title : batchfile Usage : $hyphy->batchfile($bf_name); Function: Gets/sets the batchfile that is run by $hyphy. Returns : The batchfile path. Args : $bf_name => path of new batchfile =cut sub batchfile { my ($self,$bf) = @_; if (defined $bf) { $self->set_parameter('bf', $bf); } if ($self->{'_params'}{'bf'}) { return $self->{'_params'}{'bf'}; } else { $self->warn ("Batchfile was requested but no batchfile was found."); } return; } =head2 make_batchfile_with_contents Title : make_batchfile_with_contents Usage : $hyphy->make_batchfile_with_contents($bf_string); Function: Creates a temporary file with the specified string of contents for the batchfile. Returns : The batchfile path. Args : $bf_string => contents for the batchfile =cut sub make_batchfile_with_contents { my ($self,$bf_string) = @_; my $temp_bf = $self->io->catfile($self->tempdir,"temp.bf"); open (BF, ">", $temp_bf) or $self->throw("cannot open $temp_bf for writing"); print BF "$bf_string\n"; close BF; return $self->batchfile($temp_bf); } =head2 set_default_parameters Title : set_default_parameters Usage : $BatchFile->set_default_parameters(0); Function: (Re)set the default parameters from the defaults (the first value in each array in the valid_values) Returns : none Args : boolean: keep existing parameter values =cut sub set_default_parameters { my ($self,$keepold) = @_; unless (defined $keepold) { $keepold = 0; } my @validvals = $self->valid_values(); for (my $i=0; $i< scalar (@validvals); $i++) { my $elem = $validvals[$i]; keys %$elem; #reset hash iterator my ($param,$val) = each %$elem; # skip if we want to keep old values and it is already set if (ref($val)=~/ARRAY/i ) { $self->{'_orderedparams'}[$i] = {$param, $val->[0]}; } else { $self->{'_orderedparams'}[$i] = {$param, $val}; } #FIXME: for alignment and treefile, this should default to the ones in params. } } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Hyphy/FEL.pm000066400000000000000000000170731342734133000236670ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::Phylo::Hyphy::FEL # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Hyphy::FEL - Wrapper around the Hyphy FEL analysis =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Hyphy::FEL; use Bio::AlignIO; use Bio::TreeIO; my $alignio = Bio::AlignIO->new(-format => 'fasta', -file => 't/data/hyphy1.fasta'); my $aln = $alignio->next_aln; my $treeio = Bio::TreeIO->new( -format => 'newick', -file => 't/data/hyphy1.tree'); my $fel = Bio::Tools::Run::Phylo::Hyphy::FEL->new(); $fel->alignment($aln); $fel->tree($tree); my ($rc,$results) = $fel->run(); =head1 DESCRIPTION This is a wrapper around the FEL analysis of HyPhy ([Hy]pothesis Testing Using [Phy]logenies) package of Sergei Kosakowsky Pond, Spencer V. Muse, Simon D.W. Frost and Art Poon. See http://www.hyphy.org for more information. This module will generate the correct list of options for interfacing with TemplateBatchFiles/Ghostrides/Wrapper.bf. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::Hyphy::FEL; use strict; use Bio::Root::Root; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::Phylo::Hyphy::Base; use Bio::Tools::Run::WrapperBase; use base qw(Bio::Root::Root Bio::Tools::Run::Phylo::Hyphy::Base); =head2 Default Values Valid and default values for FEL are listed below. The default values are always the first one listed. These descriptions are essentially lifted from the python wrapper or provided by the author. INCOMPLETE DOCUMENTATION OF ALL METHODS =cut =head2 valid_values Title : valid_values Usage : $factory->valid_values() Function: returns the possible parameters Returns: an array holding all possible parameters. The default values are always the first one listed. These descriptions are essentially lifted from the python wrapper or provided by the author. Args : None =cut sub valid_values { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; return ( {'geneticCode' => [ "Universal","VertebratemtDNA","YeastmtDNA","Mold/ProtozoanmtDNA", "InvertebratemtDNA","CiliateNuclear","EchinodermmtDNA","EuplotidNuclear", "Alt.YeastNuclear","AscidianmtDNA","FlatwormmtDNA","BlepharismaNuclear"]}, {'New/Restore' => [ "New Analysis", "Restore"]}, {'tempalnfile' => undef }, # aln file goes here {'Model Options' => [ { "Custom" => '010010' }, { "Default" => undef } ] }, {'temptreefile' => undef }, # tree file goes here {'Model Fit Results' => [ $null] }, # Windows have NUL instead of /dev/null {'dN/dS bias parameter' => [ { "Estimate dN/dS only" => undef }, { "Neutral" => undef }, { "Estimate" => undef }, { "Estimate + CI" => undef }, { "User" => '3' } ] }, {'Ancestor Counting' => [ 'Two rate FEL','Single Ancestor Counting','Weighted Ancestor Counting', 'Sample Ancestal States','Process Sampled Ancestal States', 'One rate FEL','Rate Distribution', 'Full site-by-site LRT','Multirate FEL'] }, {'Significance level' => '0.05' }, {'Branch Options' => ['Internal Only','All','A Subtree only','Custom subset'] }, {'outfile' => undef }, # outfile goes here ); } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Hyphy::FEL->new(); Function: Builds a new Bio::Tools::Run::Phylo::Hyphy::FEL object Returns : Bio::Tools::Run::Phylo::Hyphy::FEL Args : -alignment => the Bio::Align::AlignI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -tree => the Bio::Tree::TreeI object -params => a hashref of parameters (all passed to set_parameter) -executable => where the hyphy executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($aln, $tree, $st, $params, $exe, $ubl) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES PARAMS EXECUTABLE)], @args); defined $aln && $self->alignment($aln); defined $tree && $self->tree($tree); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); $self->set_default_parameters(); if( defined $params ) { if( ref($params) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { map { $self->set_parameter($_, $$params{$_}) } keys %$params; } } return $self; } =head2 run Title : run Usage : my ($rc,$results) = $fel->run($aln); Function: run the fel analysis using the default or updated parameters the alignment parameter must have been set Returns : Return code, Hash Args : L object, L object [optional] =cut sub run { my $self = shift; my ($rc, $run_results) = $self->SUPER::run(); my $results ={}; my $outfile = $self->outfile_name(); open(OUTFILE, "$outfile") or $self->throw("cannot open $outfile for reading"); my $readed_header = 0; my @elems; while () { if ($readed_header) { # FEL results are csv my @values = split("\,",$_); for my $i (0 .. (scalar(@values)-1)) { $elems[$i] =~ s/\n//g; push @{$results->{$elems[$i]}}, $values[$i]; } } else { @elems = split("\,",$_); $readed_header = 1; } } return ($rc, $results); } =head2 create_wrapper Title : create_wrapper Usage : $self->create_wrapper Function: It will create the wrapper file that interfaces with the analysis bf file Example : Returns : Args : =cut sub create_wrapper { my $self = shift; my $batchfile = "QuickSelectionDetection.bf"; $self->SUPER::create_wrapper($batchfile); } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Hyphy/Modeltest.pm000066400000000000000000000157441342734133000252240ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::Phylo::Hyphy::Modeltest # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Hyphy::Modeltest - Wrapper around the Hyphy Modeltest analysis =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Hyphy::Modeltest; use Bio::AlignIO; use Bio::TreeIO; my $alignio = Bio::AlignIO->new(-format => 'fasta', -file => 't/data/hyphy1.fasta'); my $aln = $alignio->next_aln; my $treeio = Bio::TreeIO->new( -format => 'newick', -file => 't/data/hyphy1.tree'); my $modeltest = Bio::Tools::Run::Phylo::Hyphy::Modeltest->new(); $modeltest->alignment($aln); $modeltest->tree($tree); my ($rc,$results) = $modeltest->run(); =head1 DESCRIPTION This is a wrapper around the Modeltest analysis of HyPhy ([Hy]pothesis Testing Using [Phy]logenies) package of Sergei Kosakowsky Pond, Spencer V. Muse, Simon D.W. Frost and Art Poon. See http://www.hyphy.org for more information. This module will generate the correct list of options for interfacing with TemplateBatchFiles/Modeltest.bf. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::Hyphy::Modeltest; use vars qw(@ISA); use strict; use Bio::Root::Root; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::Phylo::Hyphy::Base; use Bio::Tools::Run::WrapperBase; use base qw(Bio::Root::Root Bio::Tools::Run::Phylo::Hyphy::Base); =head2 Default Values Valid and default values for Modeltest are listed below. The default values are always the first one listed. These descriptions are essentially lifted from the python wrapper or provided by the author. =cut =head2 valid_values Title : valid_values Usage : $factory->valid_values() Function: returns the possible parameters Returns: an array holding all possible parameters. The default values are always the first one listed. These descriptions are essentially lifted from the python wrapper or provided by the author. Args : None =cut sub valid_values { return ( {'tempalnfile' => undef }, # aln file goes here {'temptreefile' => undef }, # tree file goes here {'Number of Rate Classes' => [ '4' ] }, {'Model Selection Method' => [ 'Both', 'Hierarchical Test', 'AIC Test'] }, {'Model rejection level' => '0.05' }, {'hieoutfile' => undef }, {'aicoutfile' => undef } ); } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Hyphy::Modeltest->new(); Function: Builds a new Bio::Tools::Run::Phylo::Hyphy::Modeltest object Returns : Bio::Tools::Run::Phylo::Hyphy::Modeltest Args : -alignment => the Bio::Align::AlignI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -tree => the Bio::Tree::TreeI object -params => a hashref of parameters (all passed to set_parameter) -executable => where the hyphy executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($aln, $tree, $st, $params, $exe, $ubl) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES PARAMS EXECUTABLE)], @args); defined $aln && $self->alignment($aln); defined $tree && $self->tree($tree); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); $self->set_default_parameters(); if( defined $params ) { if( ref($params) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { map { $self->set_parameter($_, $$params{$_}) } keys %$params; } } return $self; } =head2 run Title : run Usage : my ($rc,$results) = $modeltest->run($aln); Function: run the modeltest analysis using the default or updated parameters the alignment parameter must have been set Returns : Return code, hash containing the "Hierarchical Testing" and "AIC" results, both as hashes. Args : L object, L object [optional] =cut sub run { my $self = shift; my ($rc, $run_results) = $self->SUPER::run(); my $results = {}; my @run_result_array = split (/\n/, $run_results); my $line = shift @run_result_array; my $current_model = "error"; # if this stays "error" when you're trying to add results for a model, something's wrong. while (defined $line) { if ($line =~ m/Hierarchical Testing based model \((.*)\)/) { $current_model = "Hierarchical Testing"; $results->{$current_model}{'model_name'} = $1; } elsif ($line =~ m/AIC based model \((.*)\)/) { $current_model = "AIC"; $results->{$current_model}{'model_name'} = $1; } elsif ($line =~ m/Model String:(\d+)/) { $results->{$current_model}{'model_string'} = $1; } elsif ($line =~ m/Model Options: (.+)/) { $results->{$current_model}{'model_options'} = $1; } elsif ($line =~ m/Equilibrium Frequencies Option: (.+)/) { $results->{$current_model}{'eq_freq_option'} = $1; } $line = shift @run_result_array; } return ($rc,$results); } =head2 create_wrapper Title : create_wrapper Usage : $self->create_wrapper Function: It will create the wrapper file that interfaces with the analysis bf file Example : Returns : Args : =cut sub create_wrapper { my $self = shift; my $batchfile = "ModelTest.bf"; $self->SUPER::create_wrapper($batchfile); } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Hyphy/REL.pm000066400000000000000000000145061342734133000237010ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::Phylo::Hyphy::REL # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Hyphy::REL - Wrapper around the Hyphy REL analysis =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Hyphy::REL; use Bio::AlignIO; use Bio::TreeIO; my $alignio = Bio::AlignIO->new(-format => 'fasta', -file => 't/data/hyphy1.fasta'); my $aln = $alignio->next_aln; my $treeio = Bio::TreeIO->new( -format => 'newick', -file => 't/data/hyphy1.tree'); my $rel = Bio::Tools::Run::Phylo::Hyphy::REL->new(); $rel->alignment($aln); $rel->tree($tree); my ($rc,$results) = $rel->run(); =head1 DESCRIPTION This is a wrapper around the REL analysis of HyPhy ([Hy]pothesis Testing Using [Phy]logenies) package of Sergei Kosakowsky Pond, Spencer V. Muse, Simon D.W. Frost and Art Poon. See http://www.hyphy.org for more information. This module will generate the correct list of options for interfacing with TemplateBatchFiles/Ghostrides/Wrapper.bf. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::Hyphy::REL; use strict; use Bio::Root::Root; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::Phylo::Hyphy::Base; use Bio::Tools::Run::WrapperBase; use base qw(Bio::Root::Root Bio::Tools::Run::Phylo::Hyphy::Base); =head2 valid_values Title : valid_values Usage : $factory->valid_values() Function: returns the possible parameters Returns: an array holding all possible parameters. The default values are always the first one listed. These descriptions are essentially lifted from the python wrapper or provided by the author. Args : None =cut sub valid_values { return ( {'geneticCode' => [ "Universal","VertebratemtDNA","YeastmtDNA","Mold/ProtozoanmtDNA", "InvertebratemtDNA","CiliateNuclear","EchinodermmtDNA","EuplotidNuclear", "Alt.YeastNuclear","AscidianmtDNA","FlatwormmtDNA","BlepharismaNuclear"]}, {'tempalnfile' => undef }, # aln file goes here {'temptreefile' => undef }, # tree file goes here {'Model' => [ "Null for Test 1", "Null for Test 2", "Alternative"]}, {'outfile' => undef } # site-by-site conditional probabilities go to this file ); } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Hyphy::REL->new(); Function: Builds a new Bio::Tools::Run::Phylo::Hyphy::REL object Returns : Bio::Tools::Run::Phylo::Hyphy::REL Args : -alignment => the Bio::Align::AlignI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -tree => the Bio::Tree::TreeI object -params => a hashref of parameters (all passed to set_parameter) -executable => where the hyphy executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($aln, $tree, $st, $params, $exe, $ubl) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES PARAMS EXECUTABLE)], @args); defined $aln && $self->alignment($aln); defined $tree && $self->tree($tree); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); $self->set_default_parameters(); if( defined $params ) { if( ref($params) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { map { $self->set_parameter($_, $$params{$_}) } keys %$params; } } return $self; } =head2 run Title : run Usage : my ($rc,$results) = $rel->run($aln); Function: run the rel analysis using the default or updated parameters the alignment parameter must have been set Returns : Return code, Hash Args : L object, L object [optional] =cut sub run { my $self = shift; my ($rc,$run_results) = $self->SUPER::run(); my $results = {}; my $outfile = $self->outfile_name(); open(OUTFILE, "$outfile") or $self->throw("cannot open $outfile for reading"); my $readed_header = 0; my @elems; while () { if ($readed_header) { # REL results are csv my @values = split("\,",$_); for my $i (0 .. (scalar(@values)-1)) { $elems[$i] =~ s/\n//g; push @{$results->{$elems[$i]}}, $values[$i]; } } else { @elems = split("\,",$_); $readed_header = 1; } } return ($rc,$results); } =head2 create_wrapper Title : create_wrapper Usage : $self->create_wrapper Function: It will create the wrapper file that interfaces with the analysis bf file Example : Returns : Args : =cut sub create_wrapper { my $self = shift; my $batchfile = "YangNielsenBranchSite2005.bf"; $self->SUPER::create_wrapper($batchfile); } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Hyphy/SLAC.pm000066400000000000000000000173261342734133000240040ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::Phylo::Hyphy::SLAC # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Hyphy::SLAC - Wrapper around the Hyphy SLAC analysis =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Hyphy::SLAC; use Bio::AlignIO; use Bio::TreeIO; my $alignio = Bio::AlignIO->new(-format => 'fasta', -file => 't/data/hyphy1.fasta'); my $aln = $alignio->next_aln; my $treeio = Bio::TreeIO->new( -format => 'newick', -file => 't/data/hyphy1.tree'); my $slac = Bio::Tools::Run::Phylo::Hyphy::SLAC->new(); $slac->alignment($aln); $slac->tree($tree); my ($rc,$results) = $slac->run(); =head1 DESCRIPTION This is a wrapper around the SLAC analysis of HyPhy ([Hy]pothesis Testing Using [Phy]logenies) package of Sergei Kosakowsky Pond, Spencer V. Muse, Simon D.W. Frost and Art Poon. See http://www.hyphy.org for more information. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::Hyphy::SLAC; use strict; use Bio::Root::Root; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::Phylo::Hyphy::Base; use Bio::Tools::Run::WrapperBase; use base qw(Bio::Root::Root Bio::Tools::Run::Phylo::Hyphy::Base); =head2 Default Values Valid and default values for SLAC are listed below. The default values are always the first one listed. These descriptions are essentially lifted from the python wrapper or provided by the author. INCOMPLETE DOCUMENTATION OF ALL METHODS =cut =head2 valid_values Title : valid_values Usage : $factory->valid_values() Function: returns the possible parameters Returns: an array holding all possible parameters. The default values are always the first one listed. These descriptions are essentially lifted from the python wrapper or provided by the author. Args : None =cut sub valid_values { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; return ( {'geneticCode' => [ "Universal","VertebratemtDNA","YeastmtDNA","Mold/ProtozoanmtDNA", "InvertebratemtDNA","CiliateNuclear","EchinodermmtDNA","EuplotidNuclear", "Alt.YeastNuclear","AscidianmtDNA","FlatwormmtDNA","BlepharismaNuclear"]}, {'New/Restore' => [ "New Analysis", "Restore"]}, {'tempalnfile' => undef }, # aln file goes here {'Model Options' => [ { "Custom" => '010010' }, { "Default" => undef } ] }, {'temptreefile' => undef }, # tree file goes here {'Model Fit Results' => [ $null] }, # Windows have NUL instead of /dev/null {'dN/dS bias parameter' => [ { "Estimate dN/dS only" => undef }, { "Neutral" => undef }, { "Estimate" => undef }, { "Estimate + CI" => undef }, { "User" => '3' } ] }, {'Ancestor Counting' => [ 'Single Ancestor Counting','Weighted Ancestor Counting', 'Sample Ancestal States','Process Sampled Ancestal States', 'One rate FEL','Two rate FEL','Rate Distribution', 'Full site-by-site LRT','Multirate FEL'] }, {'SLAC Options' => ['Full tree','Tips vs Internals'] }, {'Treatment of Ambiguities' => ['Resolved','Averaged'] }, {'Test Statistic' => ['Approximate','Simulated Null'] }, {'Significance level' => '0.05' }, {'Output options' => 'Export to File' }, #we force a tsv file here {'outfile' => undef }, # outfile goes here {'Rate class estimator' => [ 'Skip','Count'] }, ); } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Hyphy::SLAC->new(); Function: Builds a new Bio::Tools::Run::Phylo::Hyphy::SLAC object Returns : Bio::Tools::Run::Phylo::Hyphy::SLAC Args : -alignment => the Bio::Align::AlignI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -tree => the Bio::Tree::TreeI object -params => a hashref of parameters (all passed to set_parameter) -executable => where the hyphy executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($aln, $tree, $st, $params, $exe, $ubl) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES PARAMS EXECUTABLE)], @args); defined $aln && $self->alignment($aln); defined $tree && $self->tree($tree); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); $self->set_default_parameters(); if( defined $params ) { if( ref($params) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { map { $self->set_parameter($_, $$params{$_}) } keys %$params; } } return $self; } =head2 run Title : run Usage : my ($rc,$results) = $slac->run($aln); Function: run the slac analysis using the default or updated parameters the alignment parameter must have been set Returns : Return code, hash Args : L object, L object [optional] =cut sub run { my $self = shift; my $results = {}; my ($rc, $run_output) = $self->SUPER::run(); my $outfile = $self->outfile_name(); open(OUTFILE, "$outfile") or $self->throw("cannot open $outfile for reading"); my $readed_header = 0; my @elems; while (my $line = ) { if ($readed_header) { # SLAC results are tsv my @values = split("\t",$line); for my $i (0 .. (scalar(@values)-1)) { $elems[$i] =~ s/\n//g; push @{$results->{$elems[$i]}}, $values[$i]; } } else { @elems = split("\t",$line); $readed_header = 1; } } return ($rc, $results); } =head2 create_wrapper Title : create_wrapper Usage : $self->create_wrapper Function: It will create the wrapper file that interfaces with the analysis bf file Example : Returns : Args : =cut sub create_wrapper { my $self = shift; my $batchfile = "QuickSelectionDetection.bf"; $self->SUPER::create_wrapper($batchfile); } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/LVB.pm000066400000000000000000000305161342734133000226000ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::LVB # # Created by Daniel Barker, based on ProtPars.pm by Shawn Hoon # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::LVB - Object for using the LVB program to create an array of L objects from a nucleotide multiple alignment file or a nucleotide SimpleAlign object. Works with LVB version 2.1. =head1 SYNOPSIS use Bio::Tools::Run::Phylo::LVB; # Create a SimpleAlign object. # NOTE. Aligning nucleotide sequence directly, as below, makes # sense for non-coding nucleotide sequence (e.g., structural RNA # genes, introns, ITS). For protein-coding genes, to prevent # Clustal intronducing frameshifts one should instead align the # translations of the genes, then convert the multiple alignment # to nucleotide by referring to the corresponding transcript # sequences (e.g., using EMBOSS tranalign). use Bio::Tools::Run::Alignment::Clustalw; $aln_factory = Bio::Tools::Run::Alignment::Clustalw->new(quiet => 1); $inputfilename = "/Users/daniel/nuc.fa"; $aln = $aln_factory->align($inputfilename); # Create the tree or trees. $tree_factory = Bio::Tools::Run::Phylo::LVB->new(quiet => 1); @trees = $tree_factory->run($aln); # Or one can pass in a file name containing a nucleotide multiple # alignment in Phylip 3.6 format: $tree_factory = Bio::Tools::Run::Phylo::LVB->new(quiet => 1); $tree = $tree_factory->run("/Users/daniel/nuc.phy"); =head1 DESCRIPTION Wrapper for LVB, which uses a simulated annealing heuristic search to seek parsimonious trees from a nucleotide multiple alignment. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 PARAMETERS FOR LVB COMPUTATION =head2 FORMAT Title : FORMAT Description : (optional) When running LVB from a Phylip 3.6-format multiple alignment file, this specifies the layout of the file. It may be "interleaved" or "sequential". FORMAT is automatically set to "interleaved" if running from a SimpleAlign object. Defaults to "interleaved". =head2 GAPS Title : GAPS Description : (optional) LVB can treat gaps represented in the multiple alignment by "-" as either "fifthstate" or "unknown". "fifthstate" regards "-" as equivalent to "O", which is an unambiguous character state distinct from all nucleotides. "unknown" regards "-" as equivalent to "?", which is as an ambiguous site that may contain "A" or "C" or "G" or "T" or "O". Defaults to "unknown". =head2 SEED Title : SEED Description : (optional) This specifies the random number seed for LVB. SEED must be an integer in the range 0 to 900000000 inclusive. If no seed is specified, LVB takes a seed from the system clock. By default, no seed is specified. =head2 DURATION Title : DURATION Description : (optional) This specifies the duration of the analysis, which may be "fast" or "slow". "slow" causes LVB to perform a more thorough and more time-consuming search than "fast". Defaults to "slow". =head2 BOOTSTRAPS Title : BOOTSTRAPS Description : (optional) This specifies the number of bootstrap replicates to use, which must be a positive integer. Set bootstraps to 0 for no bootstrapping. Defaults to 0. =head1 AUTHOR Daniel Barker =head1 CONTRIBUTORS Email jason-AT-bioperl_DOT_org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut #' package Bio::Tools::Run::Phylo::LVB; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @LVB_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SimpleAlign; use Cwd; use Bio::AlignIO; use Bio::TreeIO; use Bio::Root::Root; use Bio::Tools::Run::WrapperBase; use Bio::Root::IO; use File::Copy; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); # You will need to enable the LVB program. # You can set the path to the program through doing: # my @params('executable'=>'/usr/local/bin/lvb'); # my $lvb_factory = Bio::Tools::Run::Phylo::LVB->new(@params); # BEGIN { # NOTE. The order of the members of @LVB_PARAMS is vital! @LVB_PARAMS = qw(FORMAT GAPS SEED DURATION BOOTSTRAPS); @OTHER_SWITCHES = qw(QUIET); foreach my $attr(@LVB_PARAMS, @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : ->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'lvb'; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns undef Args : =cut sub program_dir { return undef; } sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); # set defaults $self->FORMAT("interleaved"); $self->GAPS("unknown"); $self->SEED(""); $self->DURATION("slow"); $self->BOOTSTRAPS(0); # re-set with user's values where specified my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 run Title : run Usage : $inputfilename = '/Users/daniel/nuc.phy'; @trees = $factory->run($inputfilename); Function: Create one or more LVB trees from a SimpleAlign object or a file containing a Phylip 3.6-format nucleotide multiple alignment. Example : Returns : Array of L objects Args : Name of a file containing a nucleotide multiple alignment in Phylip 3.6 format, or a SimpleAlign object =cut sub run{ my ($self,$input) = @_; my ($infilename); # Create input file pointer $infilename = $self->_setinput($input); if (!$infilename) {$self->throw("Problems setting up for lvb. Probably bad input data in $input !");} # Create parameter string to pass to lvb program my $param_string = $self->_setparams(); # run lvb my @trees = $self->_run($infilename,$param_string); } =head2 create_tree Title : create_tree Usage : $inputfilename = '/Users/daniel/nuc.phy'; @trees = $factory->create_tree($inputfilename); Function: Create one or more LVB trees from a SimpleAlign object or a file containing a Phylip 3.6-format nucleotide multiple alignment. Example : Returns : Array of L objects Args : Name of a file containing a nucleotide multiple alignment in Phylip 3.6 format, or a SimpleAlign object =cut sub create_tree{ return shift->run(@_); } ################################################# =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to lvb program Example : Returns : Array of Bio::Tree objects Args : Name of a file containing a multiple alignment in Phylip 3.6 format and a parameter string to be passed to LVB =cut sub _run { my ($self,$infile,$param_string) = @_; return unless( $self->executable ); my $instring; my $curpath = cwd; unless( File::Spec->file_name_is_absolute($infile) ) { $infile = $self->io->catfile($curpath,$infile); } $instring = $param_string; $self->debug( "Program ".$self->executable || ''."\n"); # create LVB's working copy of the input file, which must be named "infile" # NOTE, we cut trailing spaces since they can cause trouble with LVB 2.1 my $lvb_infile = $self->tempdir . "/infile"; open(LVB_SUB_RUN_TMP_IN_FH, "$infile"); open(LVB_SUB_RUN_TMP_OUT_FH, ">$lvb_infile"); while () { s/ +$//; print LVB_SUB_RUN_TMP_OUT_FH or $self->throw("output error on $lvb_infile"); } chdir($self->tempdir); #open a pipe to run lvb to bypass interactive menus if ($self->quiet() || $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; open(LVB_PIPE,"|".$self->executable.">$null"); } else { open(LVB_PIPE,"|".$self->executable); } print LVB_PIPE $instring; close(LVB_PIPE); chdir($curpath); #get the results my $treefile = $self->tempdir . "/outtree"; $self->throw("LVB did not create treefile correctly") unless (-e $treefile); #create the trees my $in = Bio::TreeIO->new(-file => $treefile, '-format' => 'newick'); my @trees = (); while (my $tree = $in->next_tree()) { push @trees, $tree; } unless ( $self->save_tempfiles ) { # Clean up the temporary files created along the way... unlink $lvb_infile; unlink $treefile; } return @trees; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for lvb program Example : Returns : name of file containing a multiple alignment in Phylip 3.6 format Args : SimpleAlign object reference or input file name =cut sub _setinput { my ($self, $input, $suffix) = @_; my ($alnfilename,$infilename, $temp, $tfh,$input_tmp,$input_fh); # If $input is not a reference it better be the name of a # file with the sequence/ # a phy formatted alignment file unless (ref $input) { # check that file exists or throw $alnfilename= $input; unless (-e $input) {return 0;} return $alnfilename; } # $input may be a SimpleAlign Object if ($input->isa("Bio::Align::AlignI")) { # Open temporary file for both reading & writing of BioSeq array ($tfh,$alnfilename) = $self->io->tempfile(-dir=>$self->tempdir); my $alnIO = Bio::AlignIO->new(-fh => $tfh, -format=>'phylip',idlength=>$10); $alnIO->write_aln($input); $alnIO->close(); close($tfh); $tfh = undef; unless ($self->format() =~ /^interleaved$/i) { $self->warn("resetting LVB format to interleaved"); $self->format("interleaved"); } return $alnfilename; } return 0; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for lvb program Example : Returns : parameter string to be passed to LVB Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); $self = shift; my $param_string = ""; for $attr (@LVB_PARAMS) { $value = $self->$attr(); if ($attr =~/SEED/i) { $value = "" unless defined $value; $param_string .= "$value\n"; } elsif ($attr =~ /BOOTSTRAPS/i) { $value = 0 unless defined $value; $param_string .= "$value\n"; } else { # we want I for "interleaved" or S for "sequential", # U for "unknown" or F for "fifthstate", # F for "fast" or S for "slow" $param_string .= uc(substr $value, 0, 1) . "\n"; } } return $param_string; } 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Molphy/000077500000000000000000000000001342734133000230625ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Molphy/ProtML.pm000066400000000000000000000451371342734133000246070ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::Molphy::ProtML # # Please direct questions and support issues to # # Cared for by Jason Stajich # # Copyright Jason Stajich # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Molphy::ProtML - A wrapper for the Molphy pkg app ProtML =head1 SYNOPSIS use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::Phylo::Molphy::ProtML; my %args = ( 'models' => 'jtt', 'search' => 'quick', 'other' => [ '-information', '-w'] ); my $verbose = 0; # change to 1 if you want some debugging output my $protml = Bio::Tools::Run::Phylo::Molphy::ProtML->new(-verbose => $verbose, -flags => \%args); die("cannot find the protml executable") unless $protml->executable; # read in a previously built protein alignment my $in = Bio::AlignIO->new(-format => 'clustalw', -file => 't/data/cel-cbr-fam.aln'); my $aln = $in->next_aln; $protml->alignment($aln); my ($rc,$results) = $protml->run(); # This may be a bit of overkill, but it is possible we could # have a bunch of results and $results is a # Bio::Tools::Phylo::Molphy object my $r = $results->next_result; # $r is a Bio::Tools::Phylo::Molphy::Result object my @trees; while( my $t = $r->next_tree ) { push @trees, $t; } print "search space is ", $r->search_space, "\n"; "1st tree score is ", $tree[0]->score, "\n"; my $out = Bio::TreeIO->new(-file => ">saved_MLtrees.tre", -format => "newick"); $out->write_tree($tree[0]); $out = undef; =head1 DESCRIPTION This is a wrapper for the exe from the Molphy (MOLecular PHYlogenetics) package by Jun Adachi & Masami Hasegawa. The software can be downloaded from L. Note that PHYLIP (Joe Felsenstein) also provides a version of protml which this module is currently NOT prepared to handle. Use the package available directly from MOLPHY authors if you want to use the module in its present implementation (extensions are welcomed!). The main components are the protml and nucml executables which are used to build maximum likelihood (ML) phylogenetic trees based on either protein or nucleotide sequences. Here are the valid input parameters, we have added a longhand version of the parameters to help you understand what each one does. Either the longhand or the original Molphy parameter will work. Bioperl Molphy Description Longhand parameter Model (one of these): --------------- jtt j Jones, Taylor & Thornton (1992) jtt-f jf JTT w/ frequencies dayhoff d Dahoff et al. (1978) dayhoff-f d dayhoff w/ frequencies mtrev24 m mtREV24 Adachi & Hasegwa (1995) mtrev24-f mf mtREV24 w/ frequencies poisson p Poisson proportional pf Proportional rsr r Relative Substitution Rate rsr-f rf RSR w/ frequencies frequencies f data frequencies Search Strategy (one of these): ---------------- usertrees u User trees (must also supply a tree) rearrangement R Local rearrangement lbp RX Local boostrap prob exhaustive e Exhaustive search star s Star decomposition search (may not be ML) quick q Quick Add OTU search (may not be ML) distance D ML Distance matrix --> NJDIST (need to supply NJDIST tree) Others (can be some or all of these): --------------- norell-bp b No RELL-BP minimumevolution M Minimum evolution sequential S Sequence is in Sequential format _OR_ interleaved I Sequence is in Interleaved format verbose v Verbose messages directed to STDERR information i Output some information (tree vals) w More some extra information (transition matricies, etc) =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Jason Stajich Email jason-AT-bioperl_DOT_org =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::Molphy::ProtML; use vars qw(@ISA $PROGRAMNAME $PROGRAM $MINNAMELEN %VALIDVALUES %VALIDFLAGS); use strict; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Phylo::Molphy; use Bio::AlignIO; use Bio::TreeIO; use Bio::Root::Root; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase ); BEGIN { $MINNAMELEN = 25; %VALIDFLAGS = ( 'models' => { # models jtt => 'j', # Jones, Taylor & Thornton (1992) 'jtt-f' => 'jf', # jtt w/ frequencies dayhoff => 'd', # Dahoff et al. (1978) 'dayhoff-f' => 'df', # dayhoff w/ frequencies mtrev24 => 'm', # Adachi & Hasegwa (1995) 'mtrev24-f' => 'mf', # mtREV24 w/ frequencies poisson => 'p', # Poisson proportional => 'pf', # Proportional rsr => 'r', # Relative Substitution Rate 'rsr-f' => 'rf', # RSR w/ frequencies frequencies => 'f', # data frequencies }, 'search' => { # search strategy usertrees => 'u', # must also supply tree rearrangement => 'R', # local rearrangement lbp => 'RX', # local boostrap prob exhaustive => 'e', # exhaustive star => 's', # star decomposition search (may not be ML) quick => 'q', # quick add OTU search (may not be ML) distance => 'D', # ML Distance matrix --> NJDIST }, 'others' => { # others 'norell-bp' => 'b', sequential => 'S', # sequential format interleaved => 'I', # interleaved format minimumevolution => 'M', # minimum evolution verbose => 'v', # verbose to stderr information => 'i', # output some information w => 'w', # some extra information } ); # this will allow for each of the parameters to also accept the original # protML params my @toadd; foreach my $type ( keys %VALIDFLAGS ) { my @keys = keys %{ $VALIDFLAGS{$type} }; for my $k ( @keys ) { my $v = $VALIDFLAGS{$type}->{$k}; $VALIDFLAGS{$type}->{$v} = $v; } } %VALIDVALUES = (num_retained => sub { my $a = shift; if( $a =~ /^\d+$/) { return 'n'; }}, # should be a number percent_retained => sub { my $a = shift; if( $a =~ /^\d+$/ && $a >= 0 && $a <= 100) { return 'P'; }} ); } =head2 program_name Title : program_name Usage : >program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'protml'; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{MOLPHYDIR}) if $ENV{MOLPHYDIR}; } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Molphy::ProtML->new(); Function: Builds a new Bio::Tools::Run::Phylo::Molphy::ProtML object Returns : Bio::Tools::Run::Phylo::Molphy::ProtML Args : -alignment => the Bio::Align::AlignI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -tree => the Bio::Tree::TreeI object -params => a hashref of PAML parameters (all passed to set_parameter) -executable => where the protml executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->{'_protmlparams'} = {}; $self->{'_protmlflags'} = {}; my ($aln, $tree, $st, $flags, $params, $exe) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES FLAGS PARAMS EXECUTABLE)], @args); defined $aln && $self->alignment($aln); defined $tree && $self->tree($tree ); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); if( defined $flags ) { if( ref($flags) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { foreach my $type ( keys %$flags ) { if( $type =~ /other/i ) { foreach my $flag ( @{$flags->{$type}} ) { $self->set_flag('others', $flag) ; } } else { $self->set_flag($type, $flags->{$type}) ; } } } } if( defined $params ) { if( ref($flags) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { map { $self->set_parameter($_, $$params{$_}) } keys %$params; } } return $self; } =head2 run Title : run Usage : $protml->run(); Function: run the protml analysis using the default or updated parameters the alignment parameter must have been set Returns : Bio::Tools::Phylo::Molphy Args : =cut sub run { my ($self) = @_; unless ( $self->save_tempfiles ) { $self->cleanup(); } my $align = $self->alignment(); if( ! $align ) { $self->warn("must have provided a valid alignment object"); return -1; } if( $align->get_seq_by_pos(1)->alphabet ne 'protein' ) { $self->warn("Must have provided a valid protein alignment"); return -1; } my %params = $self->get_parameters; my %flags = $self->get_flags(); my $cmdstring = $self->executable; if( ! defined $flags{'search'} ) { $self->warn("Must have set a valid 'search' flag to run protml this is one of ".join(",", keys %{$VALIDFLAGS{'search'}})); return; } my $tree = $self->tree; for my $t ( keys %flags ) { if( $t eq 'others' ) { $cmdstring .= " " . join(" ", map { '-'.$_ } keys %{$flags{$t}}); } else { next if $flags{$t} eq 'u'; $cmdstring .= " -".$flags{$t}; } } while( my ($param,$val) = each %params ) { $cmdstring .= " \-$param $val"; } my ($tmpdir) = $self->tempdir(); my ($tempseqFH,$tempseqfile) = $self->io->tempfile ('DIR' => $tmpdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $alnout = Bio::AlignIO->new('-format' => 'phylip', '-fh' => $tempseqFH, '-interleaved' => 0, '-idlinebreak' => 1, '-idlength' => $MINNAMELEN > $align->maxdisplayname_length() ? $MINNAMELEN : $align->maxdisplayname_length() +1); $alnout->write_aln($align); $alnout->close(); $alnout = undef; close($tempseqFH); $tempseqFH = undef; $cmdstring .= " $tempseqfile"; if( $tree && defined $flags{'search'} eq 'u' ) { my ($temptreeFH,$temptreefile) = $self->io->tempfile ('DIR' => $tmpdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $treeout = Bio::TreeIO->new('-format' => 'newick', '-fh' => $temptreeFH); $treeout->write_tree($tree); $treeout->close(); close($temptreeFH); $cmdstring .= " $temptreefile"; } $self->debug( "cmdstring is $cmdstring\n"); unless( open(PROTML, "$cmdstring |") ) { $self->warn("Cannot run $cmdstring"); return undef; } my $parser= Bio::Tools::Phylo::Molphy->new(-fh => \*PROTML); return (1,$parser); } =head2 alignment Title : alignment Usage : $protml->align($aln); Function: Get/Set the Bio::Align::AlignI object Returns : Bio::Align::AlignI object Args : [optional] Bio::Align::AlignI Comment : We could potentially add support for running directly on a file but we shall keep it simple See also : L, L =cut sub alignment{ my ($self,$aln) = @_; if( defined $aln ) { if( !ref($aln) || ! $aln->isa('Bio::Align::AlignI') ) { $self->warn("Must specify a valid Bio::Align::AlignI object to the alignment function"); return undef; } $self->{'_alignment'} = $aln; } return $self->{'_alignment'}; } =head2 tree Title : tree Usage : $protml->tree($tree, %params); Function: Get/Set the Bio::Tree::TreeI object Returns : Bio::Tree::TreeI Args : [optional] $tree => Bio::Tree::TreeI, Comment : We could potentially add support for running directly on a file but we shall keep it simple See also : L =cut sub tree { my ($self, $tree, %params) = @_; if( defined $tree ) { if( ! ref($tree) || ! $tree->isa('Bio::Tree::TreeI') ) { $self->warn("Must specify a valid Bio::Tree::TreeI object to the alignment function"); } $self->{'_tree'} = $tree; } return $self->{'_tree'}; } =head2 get_flags Title : get_flags Usage : my @params = $protml->get_flags(); Function: returns the list of flags Returns : array of flag names coded in the way that Args : none =cut sub get_flags{ my ($self) = @_; # we're returning a copy of this return %{ $self->{'_protmlflags'} }; } =head2 set_flag Title : set_flag Usage : $protml->set_parameter($type,$val); Function: Sets a protml parameter, will be validated against the valid values as set in the %VALIDVALUES class variable. The checks can be ignored if one turns off param checks like this: $protml->no_param_checks(1) Returns : boolean if set was success, if verbose is set to -1 then no warning will be reported Args : $type => name of the parameter This can be one of 'search', 'model', 'other' $value => flag value See also: L =cut sub set_flag{ my ($self,$type,$param) = @_; $type = lc($type); while( substr($type,0,1) eq '-') { # handle multiple '-' substr($type,0,1,''); } if( ! defined $type || ! defined $param ) { $self->debug("Must supply a type and param when setting flag"); return 0; } if( ! $VALIDFLAGS{$type} ) { $self->warn("$type is an unrecognized type"); } $param = lc($param); while( substr($param,0,1) eq '-') { # handle multiple '-' substr($param,0,1,''); } if(! $self->no_param_checks && ! defined $VALIDFLAGS{$type}->{$param} ) { $self->warn("unknown flag ($type) $param will not be set unless you force by setting no_param_checks to true"); return 0; } if($type eq 'others' ) { $self->{'_protmlflags'}->{$type}->{$VALIDFLAGS{$type}->{$param} || $param} = 1; } else { $self->{'_protmlflags'}->{$type} = $VALIDFLAGS{$type}->{$param} || $param; } return 1; } =head2 get_parameters Title : get_parameters Usage : my %params = $protml->get_parameters(); Function: returns the list of parameters as a hash Returns : associative array keyed on parameter names Args : none =cut sub get_parameters{ my ($self) = @_; # we're returning a copy of this return %{ $self->{'_protmlparams'} }; } =head2 set_parameter Title : set_parameter Usage : $protml->set_parameter($param,$val); Function: Sets a protml parameter, will be validated against the valid values as set in the %VALIDVALUES class variable. The checks can be ignored if one turns off param checks like this: $protml->no_param_checks(1) Returns : boolean if set was success, if verbose is set to -1 then no warning will be reported Args : $param => name of the parameter $value => value to set the parameter to See also: L =cut sub set_parameter{ my ($self,$param,$value) = @_; $param = lc($param); $param =~ s/^\-//; if(! $self->no_param_checks && ! defined $VALIDVALUES{$param} ) { $self->warn("unknown parameter $param will not be set unless you force by setting no_param_checks to true"); return 0; } my $paramflag = $VALIDVALUES{$param}->($value); if( $paramflag ) { $self->{'_protmlparams'}->{$paramflag} = $value; } else { print "value $value was not valid for param $param\n"; return 0; } return 1; } =head1 Bio::Tools::Run::WrapperBase methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $protml->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $protml->cleanup(); Function: Will cleanup the tempdir directory after a PAML run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut sub DESTROY { my $self= shift; unless ( $self->save_tempfiles ) { $self->cleanup(); } $self->SUPER::DESTROY(); } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Njtree/000077500000000000000000000000001342734133000230415ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Njtree/Best.pm000066400000000000000000000354511342734133000243040ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::Njtree::Best # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Njtree::Best - Wrapper around the Njtree (Njtree/phyml) best program. =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Njtree::Best; use Bio::AlignIO; use Bio::TreeIO; my $alignio = Bio::AlignIO->new(-format => 'fasta', -file => 't/data/njtree_aln2.nucl.mfa'); my $aln = $alignio->next_aln; my $treeio = Bio::TreeIO->new( -format => 'nhx', -file => 't/data/species_tree_njtree.nh'); my $tree = $treeio->next_tree; my $njtree_best = Bio::Tools::Run::Phylo::Njtree::Best->new(); $njtree_best->alignment($aln); $njtree_best->tree($tree); my $nhx_tree = $njtree_best->run(); =head1 DESCRIPTION This is a wrapper around the best program of Njtree by Li Heng. See http://treesoft.sourceforge.net/njtree.shtml for more information. Wrapper for the calculation of a reconciled phylogenetic tree with inferred duplication tags from amultiple sequence alignment and a species tree using NJTREE. =head2 Helping the module find your executable You will need to enable NJTREEDIR to find the njtree program. This can be done in (at least) three ways: 1. Make sure the njtree executable is in your path (i.e. 'which njtree' returns a valid program 2. define an environmental variable NJTREEDIR which points to a directory containing the 'njtree' app: In bash export NJTREEDIR=/home/progs/treesoft/njtree or In csh/tcsh setenv NJTREEDIR /home/progs/treesoft/njtree 3. include a definition of an environmental variable NJTREEDIR in every script that will BEGIN {$ENV{NJTREEDIR} = '/home/progs/treesoft/njtree'; } use Bio::Tools::Run::Phylo::Njtree::Best; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::Njtree::Best; use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM @NJTREE_BEST_PARAMS @NJTREE_BEST_SWITCHES %OK_FIELD); use strict; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::TreeIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { @NJTREE_BEST_PARAMS = qw(C p F c k a d l L b); @NJTREE_BEST_SWITCHES = qw(P S A r D s g N); # Authorize attribute fields foreach my $attr ( @NJTREE_BEST_PARAMS, @NJTREE_BEST_SWITCHES ) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'njtree'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{NJTREEDIR}) if $ENV{NJTREEDIR}; } =head2 new Title : new Usage : my $njtree_best = Bio::Tools::Run::Phylo::Njtree::Best->new(); Function: Builds a new Bio::Tools::Run::Phylo::Njtree::Best Returns : Bio::Tools::Run::Phylo::Njtree::Best Args : -alignment => the Bio::Align::AlignI object -tree => the Bio::Tree::TreeI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -executable => where the njtree executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($aln, $tree, $st, $exe, $ubl) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES EXECUTABLE)], @args); defined $aln && $self->alignment($aln); defined $tree && $self->tree($tree); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); return $self; } =head2 prepare Title : prepare Usage : my $rundir = $njtree_best->prepare(); Function: prepare the njtree_best analysis using the default or updated parameters the alignment parameter and species tree must have been set Returns : value of rundir Args : L object, L object [optional] =cut sub prepare { my ($self,$aln,$tree) = @_; unless ( $self->save_tempfiles ) { # brush so we don't get plaque buildup ;) $self->cleanup(); } $tree = $self->tree unless $tree; $aln = $self->alignment unless $aln; if( ! $aln ) { $self->warn("Must have supplied a valid alignment file in order to run njtree_best"); return 0; } if( ! $tree ) { $self->warn("Must have supplied a valid species tree file in order to run njtree_best"); return 0; } my ($tempdir) = $self->tempdir(); my $tempalnFH; if( ! ref($aln) && -e $aln ) { $self->{_tempalnfile} = $aln; } else { ($tempalnFH,$self->{_tempalnfile}) = $self->io->tempfile ('-dir' => $tempdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $alnout = Bio::AlignIO->new('-format' => 'fasta', '-fh' => $tempalnFH); $aln->set_displayname_flat(1); $alnout->write_aln($aln); $alnout->close(); undef $alnout; close($tempalnFH); } my ($temptreeFH); if( ! ref($tree) && -e $tree ) { $self->{_temptreefile} = $tree; } else { ($temptreeFH,$self->{_temptreefile}) = $self->io->tempfile ('-dir' => $tempdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $treeout = Bio::TreeIO->new('-format' => 'newick', '-fh' => $temptreeFH); $treeout->write_tree($tree); $treeout->close(); close($temptreeFH); } $self->{_prepared} = 1; $self->{_njtree_best_params} = $self->_setparams(); return $tempdir; } =head2 run Title : run Usage : my $nhx_tree = $njtree_best->run(); Function: run the njtree_best analysis using the default or updated parameters the alignment parameter must have been set Returns : L object [optional] Args : L object L object =cut sub run { my ($self,$aln,$tree) = @_; $self->prepare($aln,$tree) unless (defined($self->{_prepared})); my ($rc,$nhx_tree) = (1); my ($tmpdir) = $self->tempdir(); my $outfile = $self->outfile_name; { my $commandstring; my $exit_status; #./njtree best [other_params] -f species_file.nh -p tree -o inputfile.best.nhx inputfile.nucl.mfa my $njtree_executable = $self->executable; $commandstring = $njtree_executable." best "; $commandstring .= $self->{_njtree_best_params}; $commandstring .= " -f $self->{_temptreefile} -p tree -o "; unless ($self->outfile_name ) { my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile_name($outfile); } $commandstring .= $self->outfile_name; $commandstring .= " $self->{_tempalnfile} "; $self->throw("unable to find or run executable for 'njtree'") unless $njtree_executable && -e $njtree_executable && -x _; open(RUN, "$commandstring |") or $self->throw("Cannot run $commandstring"); my @output = ; $exit_status = close(RUN); $self->error_string(join('',@output)); if( (grep { /^\[ /io } @output) || !$exit_status) { $self->warn("There was an error - see error_string for the program output"); $rc = 0; } eval { $nhx_tree = Bio::TreeIO->new(-file => "$tmpdir/$outfile", -format => 'nhx'); }; if( $@ ) { $self->warn($self->error_string); } } unless ( $self->save_tempfiles ) { $self->cleanup(); } return ($rc,$nhx_tree); } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = $attr; # aliasing $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string { my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe 2>&1` ; $string =~ /Version\:\s+(\d+.\d+.\d+)/m; return $1 || undef; } =head2 alignment Title : alignment Usage : $njtree_best->align($aln); Function: Get/Set the L object Returns : L object Args : [optional] L Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub alignment { my ($self,$aln) = @_; if( defined $aln ) { if( -e $aln ) { $self->{'_alignment'} = $aln; } elsif( !ref($aln) || ! $aln->isa('Bio::Align::AlignI') ) { $self->warn("Must specify a valid Bio::Align::AlignI object to the alignment function not $aln"); return undef; } else { $self->{'_alignment'} = $aln; } } return $self->{'_alignment'}; } =head2 tree Title : tree Usage : $njtree_best->tree($tree, %params); Function: Get/Set the L object Returns : L Args : [optional] $tree => L, [optional] %parameters => hash of tree-specific parameters Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub tree { my ($self, $tree, %params) = @_; if( defined $tree ) { if( ! ref($tree) || ! $tree->isa('Bio::Tree::TreeI') ) { $self->warn("Must specify a valid Bio::Tree::TreeI object to the alignment function"); } $self->{'_tree'} = $tree; } return $self->{'_tree'}; } =head2 check_names Title : check_names Usage : Function: Example : Returns : Args : =cut sub check_names { my $self = shift; my $tree = $self->tree; my $aln = $self->alignment; if( ! $aln ) { $self->warn("must have supplied a valid alignment file in order to run njtree_best"); return 0; } if( ! $tree ) { $self->warn("must have supplied a valid species tree file in order to run njtree_best"); return 0; } foreach my $leaf ($tree->get_leaf_nodes) { my $id = $leaf->id; $id =~ s/\-\*.+//; # njtree does not consider anything after a \-\* $self->{_treeids}{$id} = 1; } foreach my $seq ($aln->each_seq) { my $id = $seq->id; $id =~ s/.+\_//; # njtree only looks at the right side of the \_ $self->{_alnids}{$id} = 1; } foreach my $alnid (keys %{$self->{_alnids}}) { $self->{_unmappedids}{$alnid} = 1 unless (defined($self->{_treeids}{$alnid})); } if (defined($self->{_unmappedids})) { my $count = scalar(keys%{$self->{_unmappedids}}); my $unmapped = join(",",keys %{$self->{_unmappedids}}); $self->warn("$count unmapped ids between the aln and the tree $unmapped"); } } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for njtree_best program Example : Returns : parameter string to be passed to njtree_best during align or profile_align Args : name of calling object =cut sub _setparams { my ($self) = @_; my ($attr, $value,$param_string); $param_string = ''; my $laststr; for $attr ( @NJTREE_BEST_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = $attr; $attr_key = ' -'.$attr_key; $param_string .= $attr_key .' '.$value; } for $attr ( @NJTREE_BEST_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = $attr; $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } return $param_string; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $njtree_best->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $njtree_best->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut sub DESTROY { my $self= shift; unless ( $self->save_tempfiles ) { $self->cleanup(); } $self->SUPER::DESTROY(); } 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Phast/000077500000000000000000000000001342734133000226715ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Phast/PhastCons.pm000066400000000000000000000363621342734133000251430ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::Phylo::Phast::PhastCons # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phast::PhastCons - Wrapper for footprinting using phastCons =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Phast::PhastCons; # Make a PhastCons factory $factory = Bio::Tools::Run::Phylo::Phast::PhastCons->new(); # Pass the factory an alignment and the corresponding species tree $align_filename = 't/data/apes.multi_fasta'; $species_tree_filename = 't/data/apes.newick'; @features = $factory->run($align_filename, $species_tree_filename); # or get a Bio::Align::AlignI (SimpleAlign) object from somewhere, and # generate the species tree automatically using a Bio::DB::Taxonomy database $tdb = Bio::DB::Taxonomy->new(-source => 'entrez'); @features = $factory->run($aln_obj, $tdb); # @features is an array of Bio::SeqFeature::Annotated, one feature per # alignment sequence and prediction =head1 DESCRIPTION This is a wrapper for running the phastCons application by Adam Siepel. You can get details here: http://compgen.bscb.cornell.edu/~acs/software.html phastCons is used for phylogenetic footprinting/ shadowing. Currently the interface is extremely simplified, allowing only one analysis method. The focus here is on ease of use, allowing phastCons to estimate as many parameters as possible and having it output just the 'most conserved' blocks it detects. You can, however, try supplying normal phastCons arguments to new(), or calling arg-named methods (excluding initial hyphens and converting others to underscores, eg. $factory-Eindels_only(1) to set the --indels-only arg). The particular analysis carried out here is to: 1. Use phyloFit to generate a tree model for initialization of the nonconserved model from the supplied alignment (all data) and species tree 2. Run phastCons in 'training' mode for parameter estimation using all the alignment data and the model from step 1 3. Run phastCons with the trees from step 2 to discover the most conserved regions See the 'HowTo' at http://compgen.bscb.cornell.edu/~acs/phastCons-HOWTO.html for details on how to improve results. WARNING: the API is likely to change in the future to allow for alternative analysis types. You will need to enable this phastCons wrapper to find the phast programs (at least phastCons and phyloFit). This can be done in (at least) three ways: 1. Make sure the phastCons and phyloFit executables are in your path. 2. Define an environmental variable PHASTDIR which is a directory which contains the phastCons and phyloFit applications: In bash: export PHASTDIR=/home/username/phast/bin In csh/tcsh: setenv PHASTDIR /home/username/phast/bin 3. Include a definition of an environmental variable PHASTDIR in every script that will use this PhastCons wrapper module, e.g.: BEGIN { $ENV{PHASTDIR} = '/home/username/phast/bin' } use Bio::Tools::Run::Phylo::Phast::PhastCons; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::Phast::PhastCons; use strict; use Cwd; use File::Basename; use Clone qw(clone); use Bio::AlignIO; use Bio::Tools::Run::Phylo::Phast::PhyloFit; use Bio::FeatureIO; use Bio::Annotation::SimpleValue; use base qw(Bio::Tools::Run::Phylo::PhyloBase); our $PROGRAM_NAME = 'phastCons'; our $PROGRAM_DIR = $ENV{'PHASTDIR'}; # methods and their synonyms from the phastCons args we support our %PARAMS = (rho => 'R', nrates => 'k', transitions => 't', target_coverage => 'C', expected_length => ['E', 'expected_lengths'], lnl => 'L', log => 'g', max_micro_indel => 'Y', indel_params => 'D', lambda => 'l', extrapolate => 'e', hmm => 'H', catmap => 'c', states => 'S', reflect_strand => 'U', require_informative => 'M', not_informative => 'F'); our %SWITCHES = (quiet => 'q', indels => 'I', indels_only => 'J', FC => 'X', coding_potential => 'p', ignore_missing => 'z'); # just to be explicit, args we don't support (yet) or we handle ourselves our %UNSUPPORTED = (estimate_trees => 'T', estimate_rho => 'O', gc => 'G', msa_format => 'i', score => 's', no_post_probs => 'n', seqname => 'N', refidx => 'r', idpref => 'P', help => 'h', alias => 'A', most_conserved => ['V', 'viterbi']); =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 new Title : new Usage : $factory = Bio::Tools::Run::Phylo::Phast::PhastCons->new(@params) Function: Creates a new PhastCons factory Returns : Bio::Tools::Run::Phylo::Phast::PhastCons Args : Optionally, provide any of the following (defaults are not to use, see the same-named methods for information on what each option does): { -target_coverage => number between 0 and 1 AND -expected_length => int } -rho => number between 0 and 1 -quiet => boolean (turn on or off program output to console) Most other options understood by phastCons can be supplied as key => value pairs in this way. Options that don't normally take a value should be given a value of 1. You can type the keys as you would on the command line (eg. '--indels-only' => 1) or with only a single hyphen to start and internal hyphens converted to underscores (eg. -indels_only => 1) to avoid having to quote the key. These options can NOT be used with this wrapper currently: estimate_trees / T estimate_rho / O gc / G msa_format / i score / s no_post_probs / n seqname / N idpref / P help / h alias / A most_conserved / V / viterbi refidx / r =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => {(map { $_ => $PARAMS{$_} } keys %PARAMS), (map { $_ => $SWITCHES{$_} } keys %SWITCHES)}, -create => 1); return $self; } =head2 target_coverage Title : target_coverage Usage : $factory->target_coverage(0.25); Function: Constrain transition parameters such that the expected fraction of sites in conserved elements is the supplied value. Returns : number (default undef) Args : None to get, number (between 0 and 1) to set =cut sub target_coverage { my ($self, $num) = @_; if (defined ($num)) { ($num > 0 && $num < 1) || $self->throw("target_coverage value must be between 0 and 1, exclusive"); $self->{coverage} = $num; } return $self->{coverage} || return; } =head2 expected_length Title : expected_length Usage : $factory->expected_length(5); Function: Set transition probabilities such that the expected length of a conserved element is the supplied value. target_coverage() must also be set. Returns : int (default undef) Args : None to get, int to set =cut # created automatically =head2 rho Title : rho Usage : $factory->rho(0.3); Function: Set the *scale* (overall evolutionary rate) of the model for the conserved state to be the supplied number times that of the model for the non-conserved state (default 0.3). Returns : number (default undef) Args : None to get, number (between 0 and 1) to set =cut sub rho { my ($self, $num) = @_; if (defined ($num)) { ($num > 0 && $num < 1) || $self->throw("rho value must be between 0 and 1, exclusive"); $self->{rho} = $num; } return $self->{rho} || return; } =head2 run Title : run Usage : $result = $factory->run($fasta_align_file, $newick_tree_file); -or- $result = $factory->run($align_object, $tree_object); -or- $result = $factory->run($align_object, $db_taxonomy_object); Function: Runs phastCons on an alignment to find the most conserved regions ('footprinting'). Returns : array of Bio::SeqFeature::Annotated (one feature per alignment sequence and prediction) Args : The first argument represents an alignment, the second argument a species tree. The alignment can be provided as a multi-fasta format alignment filename, or a Bio::Align::AlignI compliant object (eg. a Bio::SimpleAlign). The species tree can be provided as a newick format tree filename or a Bio::Tree::TreeI compliant object. Alternatively a Bio::DB::Taxonomy object can be supplied, in which case the species tree will be generated by using the alignment sequence names as species names and looking for those in the supplied database. In all cases, the alignment sequence names must correspond to node ids in the species tree. Multi-word species names should be joined with underscores to form the sequence names, eg. Homo_sapiens =cut sub run { my ($self, $aln, $tree) = @_; ($aln && $tree) || $self->throw("alignment and tree must be supplied"); my $aln_obj = $self->_alignment($aln); $tree = $self->_tree($tree); # if aln was a file, set the alignment id to match file name if (-e $aln) { my $aln_id = basename($aln); ($aln_id) = $aln_id =~ /^([^\.]+)/; $aln_obj->id($aln_id); } return $self->_run; } sub _run { my $self = shift; my $exe = $self->executable || return; # use phyloFit to generate tree model initialization (?) using species tree # and alignment my $pf = Bio::Tools::Run::Phylo::Phast::PhyloFit->new(-verbose => $self->verbose, -quiet => $self->quiet); my $init_mod = $pf->run($self->_alignment, $self->_tree) || $self->throw("phyloFit failed to work as expected, is it installed?"); # cd to a temp dir my $temp_dir = $self->tempdir; my $cwd = Cwd->cwd(); chdir($temp_dir) || $self->throw("Couldn't change to temp dir '$temp_dir'"); my $aln_file = $self->_write_alignment; # do training for parameter estimation my $command = $exe.$self->_setparams($aln_file, $init_mod); $self->debug("phastCons training command = $command\n"); system($command) && $self->throw("phastCons training call ($command) crashed: $?"); # do the final analysis $command = $exe.$self->_setparams($aln_file); $self->debug("phastCons command = $command\n"); system($command) && $self->throw("phastCons call ($command) crashed: $?"); # read in most_cons.bed as the result my $bedin = Bio::FeatureIO->new(-format => 'bed', -file => 'most_cons.bed'); # cd back to orig dir chdir($cwd) || $self->throw("Couldn't change back to working directory '$cwd'"); my @feats = (); my $aln = $self->_alignment; while (my $feat = $bedin->next_feature) { $feat->source_tag('phastCons'); my $sv = Bio::Annotation::SimpleValue->new(-tagname => 'predicted', -value => 1); $feat->annotation->add_Annotation($sv); # $feat->type('TF_binding_site'); causes seg fault in subsequent clone() # features are in zero-based alignment coords; make a feature for each # alignment sequence foreach my $seq ($aln->each_seq) { my $clone = clone($feat); # $clone->type('TF_binding_site'); causes massive slowdown if you later store/retrieve these features from Bio::DB::SeqFeature database # give it the correct id $clone->seq_id($seq->id); # check and correct the coords (sequence may not have the feature) my $sloc = $seq->location_from_column($feat->start + 1) || next; my $eloc = $seq->location_from_column($feat->end + 1) || next; $clone->start($sloc->start - 1); $clone->end($eloc->end - 1); push(@feats, $clone); } } return @feats; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : alignment file name for result production, AND filename of phyloFit generated init.mod file to estimate trees =cut sub _setparams { my ($self, $aln_file, $init_mod) = @_; my $param_string = $self->SUPER::_setparams(-params => [keys %PARAMS], -switches => [keys %SWITCHES], -double_dash => 1, -underscore_to_dash => 1); $param_string .= ' --no-post-probs'; my $aln_id = $self->_alignment->id; $param_string .= " --seqname $aln_id --idpref $aln_id" if $aln_id; $param_string .= ' --refidx 0'; my $input = ' --msa-format FASTA '.$aln_file; if ($init_mod) { $param_string .= ' --estimate-trees mytrees '.$input.' '.$init_mod; } else { $param_string .= $input.' --most-conserved most_cons.bed --score mytrees.cons.mod,mytrees.noncons.mod'; } return $param_string; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Phast/PhyloFit.pm000066400000000000000000000243631342734133000247750ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::Phylo::Phast::PhyloFit # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phast::PhyloFit - Wrapper for phyloFit =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Phast::PhyloFit; # Make a PhyloFit factory $factory = Bio::Tools::Run::Phylo::Phast::PhastCons->new(); # Generate an init.mod file for use by phastCons my $init_file = $factory->run($alignment, $tree); =head1 DESCRIPTION This is a wrapper for running the phyloFit application by Adam Siepel. You can get details here: http://compgen.bscb.cornell.edu/~acs/software.html Currently the interface is extremely simplified. Only the --tree form of usage is allowed (not --init-model), which means a tree must be supplied with the alignment (to run()). You can try supplying normal phyloFit arguments to new(), or calling arg-named methods (excluding initial hyphens and converting others to underscores, eg. $factory-Egaps_as_bases(1) to set the --gaps-as-bases arg). WARNING: the API may change in the future to allow for greater flexability and access to more phyloFit features. You will need to enable this PhyloFit wrapper to find the phast programs (at least phyloFit itself). This can be done in (at least) three ways: 1. Make sure the phyloFit executable is in your path. 2. Define an environmental variable PHASTDIR which is a directory which contains the phyloFit application: In bash: export PHASTDIR=/home/username/phast/bin In csh/tcsh: setenv PHASTDIR /home/username/phast/bin 3. Include a definition of an environmental variable PHASTDIR in every script that will use this PhyloFit wrapper module, e.g.: BEGIN { $ENV{PHASTDIR} = '/home/username/phast/bin' } use Bio::Tools::Run::Phylo::Phast::PhyloFit; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::Phast::PhyloFit; use strict; use Cwd; use File::Spec; use Bio::AlignIO; use Bio::TreeIO; use base qw(Bio::Tools::Run::Phylo::PhyloBase); our $PROGRAM_NAME = 'phyloFit'; our $PROGRAM_DIR = $ENV{'PHASTDIR'}; # methods and their synonyms from the phastCons args we support our %PARAMS = (subst_mod => 's', min_informative => 'I', precision => 'p', log => 'l', ancestor => 'A', nrates => 'k', alpha => 'a', rate_constants => 'K', features => 'g', catmap => 'c', do_cats => 'C', reverse_groups => 'R'); our %SWITCHES = (gaps_as_bases => 'G', quiet => 'q', EM => 'E', init_random => 'r', estimate_freqs => 'F', markov => 'N', non_overlapping => 'V'); # just to be explicit, args we don't support (yet) or we handle ourselves our %UNSUPPORTED = (msa_format => 'i', out_root => 'o', tree => 't', help => 'h', lnl => 'L', init_model => 'M', scale_only => 'B', scale_subtree => 'S', no_freqs => 'f', no_rates => 'n', post_probs => 'P', expected_subs => 'X', expected_total_subs => 'Z', column_probs => 'U', windows => 'w', windows_explicit => 'v'); =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 new Title : new Usage : $factory = Bio::Tools::Run::Phylo::Phast::PhyloFit->new() Function: creates a new PhyloFit factory Returns : Bio::Tools::Run::Phylo::Phast::PhyloFit Args : Most options understood by phastCons can be supplied as key => value pairs. Options that don't normally take a value should be given a value of 1. You can type the keys as you would on the command line (eg. '--gaps-as-bases' => 1) or with only a single hyphen to start and internal hyphens converted to underscores (eg. -gaps_as_bases => 1) to avoid having to quote the key. These options can NOT be used with this wrapper currently: msa_format / i out_root / o tree / t help / h lnl / L init_model / M scale_only / B scale_subtree / S no_freqs / f no_rates / n post_probs / P expected_subs / X expected_total_subs / Z column_probs / U windows / w windows_explicit / v =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => {(map { $_ => $PARAMS{$_} } keys %PARAMS), (map { $_ => $SWITCHES{$_} } keys %SWITCHES)}, -create => 1); return $self; } =head2 run Title : run Usage : $result = $factory->run($fasta_align_file, $newick_tree_file); -or- $result = $factory->run($align_object, $tree_object); -or- $result = $factory->run($align_object, $db_taxonomy_object); Function: Runs phyloFit on an alignment. Returns : filename of init.mod file produced Args : The first argument represents an alignment, the second argument a species tree. The alignment can be provided as a multi-fasta format alignment filename, or a Bio::Align::AlignI compliant object (eg. a Bio::SimpleAlign). The species tree can be provided as a newick format tree filename or a Bio::Tree::TreeI compliant object. Alternatively a Bio::DB::Taxonomy object can be supplied, in which case the species tree will be generated by using the alignment sequence names as species names and looking for those in the supplied database. In all cases, the alignment sequence names must correspond to node ids in the species tree. Multi-word species names should be joined with underscores to form the sequence names, eg. Homo_sapiens =cut sub run { my ($self, $aln, $tree) = @_; ($aln && $tree) || $self->throw("alignment and tree must be supplied"); $self->_alignment($aln); $tree = $self->_tree($tree); $tree->force_binary; # adjust tree node ids to convert spaces to underscores (eg. if tree # generated from taxonomy) foreach my $node ($tree->get_leaf_nodes) { my $id = $node->id; $id =~ s/ /_/g; $node->id($id); } # check node and seq names match $self->_check_names; return $self->_run; } sub _run { my $self = shift; my $exe = $self->executable || return; # cd to a temp dir my $temp_dir = $self->tempdir; my $cwd = Cwd->cwd(); chdir($temp_dir) || $self->throw("Couldn't change to temp dir '$temp_dir'"); my $aln_file = $self->_write_alignment; my $tree_file = $self->_write_tree; #...phyloFit --tree "(human,(mouse,rat))" --msa-format FASTA --out-root init alignment.fa my $command = $exe.$self->_setparams($aln_file, $tree_file); $self->debug("phyloFit command = $command\n"); system($command) && $self->throw("phyloFit call ($command) crashed: $?"); # cd back again chdir($cwd) || $self->throw("Couldn't change back to working directory '$cwd'"); return File::Spec->catfile($temp_dir, 'init.mod'); } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : alignment and tree file names =cut sub _setparams { my ($self, $aln_file, $tree_file) = @_; my $param_string = ' --tree '.$tree_file; $param_string .= ' --msa-format FASTA'; $param_string .= ' --out-root init'; # --min-informative defaults to 50, but must not be greater than the number # of bases in the alignment my $aln = $self->_alignment; my $length = $aln->length; my $min_informative = $self->min_informative || 50; if ($length < $min_informative) { $self->min_informative($length); } $param_string .= $self->SUPER::_setparams(-params => [keys %PARAMS], -switches => [keys %SWITCHES], -double_dash => 1, -underscore_to_dash => 1); $param_string .= ' '.$aln_file; return $param_string; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Phylip/000077500000000000000000000000001342734133000230575ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Phylip/Base.pm000066400000000000000000000112711342734133000242710ustar00rootroot00000000000000# $Id $ # # BioPerl module for Bio::Tools::Run::Phylo::Phylip::Base # # Please direct questions and support issues to # # Cared for by Jason Stajich # # Copyright Jason Stajich # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phylip::Base - Base object for Phylip modules =head1 SYNOPSIS # Do not use directly # This module is for setting basic data sets for the Phylip wrapper # modules =head1 DESCRIPTION This module is just a base object for Bioperl Phylip wrappers. IMPORTANT PHYLIP VERSION ISSUES By default we assume you have Phylip 3.6 installed, if you have installed Phylip 3.5 you need to set the environment variable PHYLIPVERSION =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Jason Stajich Email jason-at-bioperl.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::Phylip::Base; use vars qw(@ISA %DEFAULT %FILENAME); use strict; BEGIN { eval { require File::Spec }; if( $@) { Bio::Root::RootI->throw("Must have installed File::Spec to run Bio::Tools::Run::Phylo::Phylip tools"); } } use Bio::Root::Root; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Run::Phylo::Phylip::PhylipConf; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { %DEFAULT = ( 'VERSION' => $ENV{'PHYLIPVERSION'} || '3.6', ); %FILENAME = %Bio::Tools::Run::Phylo::Phylip::PhylipConf::FileName; } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Phylip::Base->new(); Function: Builds a new Bio::Tools::Run::Phylo::Phylip::Base object Returns : an instance of Bio::Tools::Run::Phylo::Phylip::Base Args : =cut =head2 outfile Title : outfile Usage : $obj->outfile($newval) Function: Get/Set default PHYLIP outfile name ('outfile' usually) Changing this is only necessary when you have compiled PHYLIP to use a different filename for the default 'outfile' This will not change the default output filename by PHYLIP Returns : value of outfile Args : newvalue (optional) =cut sub outfile{ my $self = shift; $self->{'_outfile'} = shift if @_; return $self->{'_outfile'} || $FILENAME{$self->version}{'OUTFILE'} } =head2 treefile Title : treefile Usage : $obj->treefile($newval) Function: Get/Set the default PHYLIP treefile name ('treefile' usually) Returns : value of treefile Args : newvalue (optional) =cut sub treefile{ my $self = shift; $self->{'_treefile'} = shift if @_; return $self->{'_treefile'} || $FILENAME{$self->version}{'TREEFILE'}; } =head2 fontfile Title : fontfile Usage : $obj->fontfile($newval) Function: Get/Set the fontfile Returns : value of fontfile (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub fontfile{ my $self = shift; return $self->{'fontfile'} = shift if @_; return $self->{'fontfile'} ; } =head2 plotfile Title : plotfile Usage : $obj->plotfile($newval) Function: Get/Set the plotfile Returns : value of plotfile (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub plotfile { my $self = shift; return $self->{'plotfile'} = shift if @_; return $self->{'plotfile'} || $FILENAME{$self->version}{'PLOTFILE'}; } =head2 version Title : version Usage : $obj->version($newval) Function: Get/Set the version Returns : value of version (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub version { my $self = shift; return $self->{'version'} = shift if @_; return $self->{'version'} || $DEFAULT{'VERSION'}; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Phylip/Consense.pm000066400000000000000000000415001342734133000251720ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Phylo::Phylip::Consense # # Created by # # Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phylip::Consense - Wrapper for the phylip program Consense =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Phylip::Consense; use Bio::Tools::Run::Phylo::Phylip::SeqBoot; use Bio::Tools::Run::Phylo::Phylip::ProtDist; use Bio::Tools::Run::Phylo::Phylip::Neighbor; use Bio::Tools::Run::Phylo::Phylip::DrawTree; #first get an alignment my $aio= Bio::AlignIO->new(-file=>$ARGV[0],-format=>"clustalw"); my $aln = $aio->next_aln; # To prevent truncation of sequence names by PHYLIP runs, use set_displayname_safe my ($aln_safe, $ref_name)=$aln->set_displayname_safe(); #next use seqboot to generate multiple aligments my @params = ('datatype'=>'SEQUENCE','replicates'=>10); my $seqboot_factory = Bio::Tools::Run::Phylo::Phylip::SeqBoot->new(@params); my $aln_ref= $seqboot_factory->run($aln); Or, for long sequence names: my $aln_ref= $seqboot_factory->run($aln_safe); #next build distance matrices and construct trees my $pd_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(); my $ne_factory = Bio::Tools::Run::Phylo::Phylip::Neighbor->new(); foreach my $a (@{$aln_ref}){ my $mat = $pd_factory->create_distance_matrix($a); push @tree, $ne_factory->create_tree($mat); } #now use consense to get a final tree my $con_factory = Bio::Tools::Run::Phylo::Phylip::Consense->new(); #you may set outgroup either by the number representing the order in #which species are entered or by the name of the species $con_factory->outgroup(1); $con_factory->outgroup('HUMAN'); my $tree = $con_factory->run(\@tree); # Restore original sequence names, after ALL phylip runs: my @nodes = $tree->get_nodes(); foreach my $nd (@nodes){ $nd->id($ref_name->{$nd->id_output}) if $nd->is_Leaf; } #now draw the tree my $draw_factory = Bio::Tools::Run::Phylo::Phylip::DrawTree->new(); my $image_filename = $draw_factory->draw_tree($tree); =head1 DESCRIPTION Wrapper for phylip consense program Taken from phylip documentation... CONSENSE reads a file of computer-readable trees and prints out (and may also write out onto a file) a consensus tree. At the moment it carries out a family of consensus tree methods called the M[l] methods (Margush and McMorris, 1981). These include strict consensus and majority rule consensus. Basically the consensus tree consists of monophyletic groups that occur as often as possible in the data. More documentation on using Consense and setting parameters may be found in the phylip package. VERSION Support This wrapper currently supports v3.5 of phylip. There is also support for v3.6 although this is still experimental as v3.6 is still under alpha release and not all functionalities maybe supported. =head1 PARAMETERS FOR Consense =head2 TYPE Title : TYPE Description : (optional) Only available in phylip v3.6 This program supports 3 types of consensus generation MRe : Majority Rule (extended) Any set of species that appears in more than 50% of the trees is included. The program then considers the other sets of species in order of the frequency with which they have appeared, adding to the consensus tree any which are compatible with it until STRICT: A set of species must appear in all input trees to be included in the strict consensus tree. MR : A set of species is included in the consensus tree if it is present in more than half of the input trees. Ml : The user is asked for a fraction between 0.5 and 1, and the program then includes in the consensus tree any set of species that occurs among the input trees more than that fraction of then time. The Strict consensus and the Majority Rule consensus are extreme cases of the M[l] consensus, being for fractions of 1 and 0.5 respectively usage: my $factory = Bio::Tools::Run::Phylo::Phylip::Consense->new(-type=>"Ml 0.7"); Defaults to MRe =head2 ROOTED Title: ROOTED Description: (optional) toggles between the default assumption that the input trees are unrooted trees and the selection that specifies that the tree is to be treated as a rooted tree and not re-rooted. Otherwise the tree will be treated as outgroup-rooted and will be re-rooted automatically at the first species encountered on the first tree (or at a species designated by the Outgroup option) usage: my $factory = Bio::Tools::Run::Phylo::Phylip::Consense->new(-rooted=>1); Defaults to unrooted =head2 OUTGROUP Title : OUTGROUP Description : (optional) It is in effect only if the Rooted option selection is not in effect. The trees will be re-rooted with a species of your choosing. usage my $factory = Bio::Tools::Run::Phylo::Phylip::Consense->new(-outgroup=>2); Defaults to first species encountered. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut #' package Bio::Tools::Run::Phylo::Phylip::Consense; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @CONSENSE_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SimpleAlign; use Bio::TreeIO; use Bio::Tools::Run::Phylo::Phylip::Base; use Bio::Tools::Run::Phylo::Phylip::PhylipConf qw(%Menu); use IO::String; use Cwd; # inherit from Phylip::Base which has some methods for dealing with # Phylip specifics @ISA = qw(Bio::Tools::Run::Phylo::Phylip::Base); # You will need to enable the Consense program. This # can be done in (at least) 3 ways: # # 1. define an environmental variable PHYLIPDIR: # export PHYLIPDIR=/home/shawnh/PHYLIP/bin # # 2. include a definition of an environmental variable PHYLIPDIR in # every script that will use Consense.pm. # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin'; # # 3. You can set the path to the program through doing: # my @params('executable'=>'/usr/local/bin/consense'); # my $Consense_factory = Bio::Tools::Run::Phylo::Phylip::Consense->new(@params); # BEGIN { @CONSENSE_PARAMS = qw(TYPE OUTGROUP ROOTED); @OTHER_SWITCHES = qw(QUIET); foreach my $attr(@CONSENSE_PARAMS,@OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $obj->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'consense'; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PHYLIPDIR}) if $ENV{PHYLIPDIR}; } sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/PROGRAM/i) { $self->executable($value); next; } if ($attr =~ /IDLENGTH/i){ $self->idlength($value); next; } $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 idlength Title : idlength Usage : $obj->idlength ($newval) Function: Returns : value of idlength Args : newvalue (optional) =cut sub idlength{ my $self = shift; if( @_ ) { my $value = shift; $self->{'idlength'} = $value; } return $self->{'idlength'}; } =head2 run Title : run Usage : $inputfilename = 't/data/prot.treefile'; $tree= $Consense_factory->run($inputfilename); or $tree= $consense_factory->run(\@tree); Function: Create bootstrap sets of alignments Example : Returns : a L Args : either a file containing trees in newick format or an array ref of L Throws an exception if argument is not either a string (eg a filename) or a Bio::Tree::TreeI object. If argument is string, throws exception if file corresponding to string name can not be found. =cut sub run{ my ($self,$input) = @_; my ($infilename); # Create input file pointer $infilename = $self->_setinput($input); if (!$infilename) { $self->throw("Problems setting up for Consense. Probably bad input data in $input !"); } # Create parameter string to pass to Consense program my $param_string = $self->_setparams(); # run Consense my $aln = $self->_run($infilename,$param_string); } ################################################# =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to Consense program Example : Returns : an array ref of Args : Name of a file containing a set of tree in newick format and a parameter string to be passed to Consense =cut sub _run { my ($self,$infile,$param_string) = @_; my $instring; my $curpath = cwd; unless( File::Spec->file_name_is_absolute($infile) ) { $infile = $self->io->catfile($curpath,$infile); } my $tmpdir = $self->tempdir; chdir($self->tempdir); # open a pipe to run Consense to bypass interactive menus if ($self->quiet() || $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; open(Consense,"| ".$self->executable .">$null"); } else { open(Consense,"| ".$self->executable); } $instring = $infile."\n".$param_string; $self->debug( "Program ".$self->executable." $instring\n"); print Consense $instring; close(Consense); # get the results my $outfile = $self->io->catfile($self->tempdir,$self->treefile); chdir($curpath); $self->throw("Consense did not create files correctly ($outfile)") unless (-e $outfile); #parse the alignments my @aln; my $tio = Bio::TreeIO->new(-file=>$outfile,-format=>"newick"); my $tree = $tio->next_tree; # Clean up the temporary files created along the way... unlink $outfile unless $self->save_tempfiles; return $tree; } sub _set_names_from_tree { my ($self,$tree) = @_; my $newick; my $ios = IO::String->new($newick); my $tio = Bio::TreeIO->new(-fh=>$ios,-format=>'newick'); $tio->write_tree($tree); my @names = $newick=~/(\w+):\d+/g; my %names; for(my $i=0; $i < $#names; $i++){ $names{$names[$i]} = $i+1; } $self->names(\%names); return; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for Consense program Example : Returns : name of file containing a trees in newick format Args : an array ref of Bio::Tree::Tree object or input file name =cut sub _setinput { my ($self, $input) = @_; my ($alnfilename,$tfh); # a phy formatted alignment file unless (ref $input) { # check that file exists or throw $alnfilename= $input; unless (-e $input) {return 0;} my $tio = Bio::TreeIO->new(-file=>$alnfilename,-format=>'newick'); my $tree = $tio->next_tree; $self->_set_names_from_tree($tree); return $alnfilename; } # $input may be a SimpleAlign Object my @input = ref($input) eq "ARRAY" ? @{$input} : ($input); ($tfh,$alnfilename) = $self->io->tempfile(-dir=>$self->tempdir); my $treeIO = Bio::TreeIO->new(-fh => $tfh, -format=>'newick'); foreach my $tree(@input){ $tree->isa('Bio::Tree::TreeI') || $self->throw('Expected a Bio::TreeI object'); $treeIO->write_tree($tree); } #get the species names in order, using the first one $self->_set_names_from_tree($input[0]); $treeIO->close(); close($tfh); undef $tfh; return $alnfilename; } =head2 names() Title : names Usage : $tree->names(\%names) Function: get/set for a hash ref for storing names in matrix with rank as values. Example : Returns : hash reference Args : hash reference =cut sub names { my ($self,$name) = @_; if($name){ $self->{'_names'} = $name; } return $self->{'_names'}; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for Consense program Example : Returns : parameter string to be passed to Consense Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); #do nothing for now $self = shift; my $param_string = ""; my $rooted = 0; #for case where type is Ml my $Ml = 0; my $frac = 0.5; my %menu = %{$Menu{$self->version}->{'CONSENSE'}}; foreach my $attr ( @CONSENSE_PARAMS) { $value = $self->$attr(); next unless (defined $value); if ($attr =~/ROOTED/i){ $rooted = 1; $param_string .= $menu{'ROOTED'}; } elsif($attr =~/OUTGROUP/i){ if($rooted == 1){ $self->warn("Outgroup option cannot be used with a rooted tree"); next; } if($value !~/^\d+$/){ # is a name my %names = %{$self->names}; $names{$value} || $self->throw("Outgroup $value not found"); $value = $names{$value}; } $param_string .=$menu{'OUTGROUP'}."$value\n"; } elsif($attr=~/TYPE/i){ if($value=~/Ml/i){ ($value,$frac) = split(/\s+/,$value); #default if not given $frac ||= 0.5; if($frac <= 0.5 || $frac > 1){ $self->warn("fraction given is out of range 0.5no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $Consense->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $codeml->cleanup(); Function: Will cleanup the tempdir directory after a Consense run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Phylip/DrawGram.pm000066400000000000000000000257221342734133000251310ustar00rootroot00000000000000# $Id $ # # BioPerl module for Bio::Tools::Run::Phylo::Phylip::DrawGram # # Please direct questions and support issues to # # Cared for by Jason Stajich # # Copyright Jason Stajich # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phylip::DrawGram - use Phylip DrawTree program to draw phylograms or phenograms =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Phylip::DrawGram; my $drawfact = Bio::Tools::Run::Phylo::Phylip::DrawGram->new(); my $treeimage = $drawfact->run($tree); =head1 DESCRIPTION This is a module for automating drawing of trees through Joe Felsenstein's Phylip suite. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Jason Stajich Email jason-at-bioperl.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::Phylip::DrawGram; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME $FONTFILE @DRAW_PARAMS @OTHER_SWITCHES %OK_FIELD %DEFAULT); use strict; use Bio::Tools::Run::Phylo::Phylip::Base; use Bio::Tools::Run::Phylo::Phylip::PhylipConf qw(%Menu); use Cwd; # inherit from Phylip::Base which has some methods for dealing with # Phylip specifics @ISA = qw(Bio::Tools::Run::Phylo::Phylip::Base); # You will need to enable the neighbor program. This # can be done in (at least) 3 ways: # # 1. define an environmental variable PHYLIPDIR: # export PHYLIPDIR=/home/shawnh/PHYLIP/bin # # 2. include a definition of an environmental variable PHYLIPDIR in # every script that will use DrawGram.pm. # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin'; # # 3. You can set the path to the program through doing: # my @params('program'=>'/usr/local/bin/drawgram'); # my $neighbor_factory = Bio::Tools::Run::Phylo::Phylip::DrawGram->new(@params) BEGIN { %DEFAULT = ('PLOTTER' => 'P', 'SCREEN' => 'N'); $DEFAULT{'FONTFILE'} = Bio::Root::IO->catfile($ENV{'PHYLIPDIR'},"font1") if $ENV{'PHYLIPDIR'}; $PROGRAMNAME = 'drawgram'; @DRAW_PARAMS = qw(PLOTTER SCREEN TREESTYLE USEBRANCHLENS LABEL_ANGLE HORIZMARGINS VERTICALMARGINS SCALE TREEDEPTH STEMLEN TIPSPACE ANCESTRALNODES FONT); @OTHER_SWITCHES = qw(QUIET); foreach my $attr(@DRAW_PARAMS,@OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : >program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAMNAME; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PHYLIPDIR}) if $ENV{PHYLIPDIR}; } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Phylip::DrawGram->new(); Function: Builds a new Bio::Tools::Run::Phylo::Phylip::DrawGram object Returns : an instance of Bio::Tools::Run::Phylo::Phylip::DrawGram Args : The available DrawGram parameters =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/PROGRAM/i) { $self->executable($value); next; } $self->$attr($value); } $self->plotter($DEFAULT{'PLOTTER'}) unless $self->plotter; $self->screen($DEFAULT{'SCREEN'}) unless $self->screen; $self->fontfile($DEFAULT{'FONTFILE'}) unless $self->fontfile; return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 run Title : run Usage : my $file = $app->run($treefile); Function: Draw a tree Returns : File containing the rendered tree Args : either a Bio::Tree::TreeI OR filename of a tree in newick format =cut sub run{ my ($self,$input) = @_; # Create input file pointer my ($infilename) = $self->_setinput($input); if (!$infilename) { $self->throw("Problems setting up for drawgram. Probably bad input data in $input !"); } # Create parameter string to pass to neighbor program my $param_string = $self->_setparams(); # run drawgram my $plotfile = $self->_run($infilename,$param_string); return $plotfile; } =head2 draw_tree Title : draw_tree Usage : my $file = $app->draw_tree($treefile); Function: This method is deprecated. Please use run instead. Returns : File containing the rendered tree Args : either a Bio::Tree::TreeI OR filename of a tree in newick format =cut sub draw_tree{ return shift->run(@_); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to drawgram program Example : Returns : Bio::Tree object Args : Name of a file the tree to draw in newick format and a parameter string to be passed to drawgram =cut sub _run { my ($self,$infile,$param_string) = @_; my $instring; my $curpath = cwd; unless( File::Spec->file_name_is_absolute($infile) ) { $infile = $self->io->catfile($curpath,$infile); } $instring = $infile . "\n"; if( ! defined $self->fontfile ) { $self->throw("You must have defined a fontfile"); } if( -e $self->io->catfile($curpath,'fontfile') ) { $instring .= $self->io->catfile($curpath,'fontfile')."\n"; } elsif( File::Spec->file_name_is_absolute($self->fontfile) ) { $instring .= $self->io->catfile($self->fontfile)."\n"; } else { $instring .= $self->io->catfile($curpath,$self->fontfile)."\n"; } chdir($self->tempdir); $instring .= $param_string; $self->debug( "Program ".$self->executable." $param_string\n"); # open a pipe to run drawgram to bypass interactive menus if ($self->quiet() || $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; open(DRAW,"|".$self->executable.">$null"); } else { open(DRAW,"|".$self->executable); } print DRAW $instring; close(DRAW); chdir($curpath); #get the results my $plotfile = $self->io->catfile($self->tempdir,$self->plotfile); $self->throw("drawgram did not create plotfile correctly ($plotfile)") unless (-e $plotfile); return $plotfile; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for drawing program Example : Returns : filename containing tree in newick format Args : Bio::Tree::TreeI object =cut sub _setinput { my ($self, $input) = @_; my $treefile; unless (ref $input) { # check that file exists or throw $treefile = $input; unless (-e $input) {return 0;} } elsif ($input->isa("Bio::Tree::TreeI")) { # Open temporary file for both reading & writing of BioSeq array my $tfh; ($tfh,$treefile) = $self->io->tempfile(-dir=>$self->tempdir); my $treeIO = Bio::TreeIO->new(-fh => $tfh, -format=>'newick'); $treeIO->write_tree($input); $treeIO->close(); close($tfh); $tfh = undef; } return $treefile; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for drawgram program Example : Returns : parameter string to be passed to drawgram Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); #do nothing for now $self = shift; my $param_string = ""; my $cat = 0; my ($hmargin,$vmargin); my %menu = %{$Menu{$self->version}->{'DRAWGRAM'}}; foreach my $attr ( @DRAW_PARAMS) { $value = $self->$attr(); next unless defined $value; my @vals; if( ref($value) ) { ($value,@vals) = @$value; } $attr = uc($attr); if( ! exists $menu{$attr} ) { $self->warn("unknown parameter $attr, known params are ". join(",",keys %menu). "\n"); } if( ref ($menu{$attr}) !~ /HASH/i ) { unless( @vals ) { $param_string .= $menu{$attr}; } else { $param_string .= sprintf($menu{$attr},$value,@vals); } next; } my $seen = 0; for my $stype ( keys %{$menu{$attr}} ) { if( $value =~ /$stype/i ) { $param_string .= sprintf($menu{$attr}->{$stype},@vals); $seen = 1; last; } } unless( $seen ) { $self->warn("Unknown requested attribute $attr, $value is not known\n"); } } $param_string .="Y\n"; return $param_string; } =head1 Bio::Tools::Run::Wrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $dragram->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $codeml->cleanup(); Function: Will cleanup the tempdir directory after a DrawGram run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Phylip/DrawTree.pm000066400000000000000000000271101342734133000251330ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::Phylo::Phylip::DrawTree # # Please direct questions and support issues to # # Cared for by Jason Stajich # # Copyright Jason Stajich # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phylip::DrawTree - use Phylip DrawTree program to draw trees =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Phylip::DrawTree; my $treedraw = Bio::Tools::Run::Phylo::Phylip::DrawTree->new(); my $treeimagefile = $drawfact->run($tree); =head1 DESCRIPTION This is a module for automating drawing of trees through Joe Felsenstein's Phylip suite. To set parameters with option you need to pass in an array reference or a string, depending on the parameter. For example: $treedraw->HORIZMARGINS(['2.00','2.5']); $treedraw->ANCESTRALNODES('C'); $treedraw->TREESTYLE('PHEN'); $treedraw->USEBRANCHLENS('N'); This can be a brittle module as the menus change in PHYLIP. It should support phylip 3.6 but no guarantees. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Jason Stajich Email jason-at-bioperl.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::Phylip::DrawTree; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME $FONTFILE @DRAW_PARAMS @OTHER_SWITCHES %OK_FIELD %DEFAULT); use strict; # inherit from Phylip::Base which has some methods for dealing with # Phylip specifics use Bio::Tools::Run::Phylo::Phylip::Base; use Cwd; @ISA = qw( Bio::Tools::Run::Phylo::Phylip::Base ); use Bio::Tools::Run::Phylo::Phylip::PhylipConf qw(%Menu); # You will need to enable the neighbor program. This # can be done in (at least) 3 ways: # # 1. define an environmental variable PHYLIPDIR: # export PHYLIPDIR=/home/shawnh/PHYLIP/bin # # 2. include a definition of an environmental variable PHYLIPDIR in # every script that will use DrawTree.pm. # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin'; # # 3. You can set the path to the program through doing: # my @params('program'=>'/usr/local/bin/drawgram'); # my $neighbor_factory = Bio::Tools::Run::Phylo::Phylip::DrawTree->new(@params) BEGIN { %DEFAULT = ('PLOTTER' => 'P', 'SCREEN' => 'N'); $DEFAULT{'FONTFILE'} = Bio::Root::IO->catfile($ENV{'PHYLIPDIR'},"font1") if $ENV{'PHYLIPDIR'}; $PROGRAMNAME="drawtree"; if (defined $ENV{'PHYLIPDIR'}) { $PROGRAMDIR = $ENV{'PHYLIPDIR'} || ''; $PROGRAM = Bio::Root::IO->catfile($PROGRAMDIR, $PROGRAMNAME.($^O =~ /mswin/i ?'.exe':'')); $DEFAULT{'FONTFILE'} = Bio::Root::IO->catfile($ENV{'PHYLIPDIR'},"font1"); } else { $PROGRAM = $PROGRAMNAME; } @DRAW_PARAMS = qw(PLOTTER SCREEN LABEL_ANGLE ROTATION TREEARC ITERATE SCALE HORIZMARGINS VERTICALMARGINS FONT ); @OTHER_SWITCHES = qw(QUIET); foreach my $attr(@DRAW_PARAMS,@OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $obj->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAMNAME; } =head2 program_dir Title : program_dir Usage : $drawfact->program_dir() Function: returns the program directory, obtained from ENV variable. Returns : string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PHYLIPDIR}) if $ENV{PHYLIPDIR}; } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Phylip::DrawTree->new(); Function: Builds a new Bio::Tools::Run::Phylo::Phylip::DrawTree object Returns : an instance of Bio::Tools::Run::Phylo::Phylip::DrawTree Args : The available DrawGram parameters =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/PROGRAM/i) { $self->executable($value); next; } $self->$attr($value); } $self->plotter($DEFAULT{'PLOTTER'}) unless $self->plotter; $self->screen($DEFAULT{'SCREEN'}) unless $self->screen; $self->fontfile($DEFAULT{'FONTFILE'}) unless $self->fontfile; return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 run Title : run Usage : my $file = $app->run($treefile); Function: Draw a tree Returns : File containing the rendered tree Args : either a Bio::Tree::TreeI OR filename of a tree in newick format =cut sub run{ my ($self,$input) = @_; # Create input file pointer my ($infilename) = $self->_setinput($input); if (!$infilename) { $self->throw("Problems setting up for drawgram. Probably bad input data in $input !"); } # Create parameter string to pass to neighbor program my $param_string = $self->_setparams(); # run drawgram my $plotfile = $self->_run($infilename,$param_string); return $plotfile; } =head2 draw_tree Title : draw_tree Usage : my $file = $app->draw_tree($treefile); Function: This method is deprecated. Please use run method. Returns : File containing the rendered tree Args : either a Bio::Tree::TreeI OR filename of a tree in newick format =cut sub draw_tree{ return shift->run(@_); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to drawgram program Example : Returns : Bio::Tree object Args : Name of a file the tree to draw in newick format and a parameter string to be passed to drawgram =cut sub _run { my ($self,$infile,$param_string) = @_; my $instring; my $curpath = cwd; unless( File::Spec->file_name_is_absolute($infile) ) { $infile = $self->io->catfile($curpath,$infile); } $instring = $infile . "\n"; if( ! defined $self->fontfile ) { $self->throw("You must have defined a fontfile"); } if( -e $self->io->catfile($curpath,'fontfile') ) { $instring .= $self->io->catfile($curpath,'fontfile')."\n"; } elsif( File::Spec->file_name_is_absolute($self->fontfile) ) { #$instring .= $self->io->catfile($self->tempdir,$self->fontfile)."\n"; $instring .= $self->io->catfile($self->fontfile)."\n"; } else { $instring .= $self->io->catfile($curpath,$self->fontfile)."\n"; } chdir($self->tempdir); $instring .= $param_string; $self->debug( "Program ".$self->executable." $param_string\n"); # open a pipe to run drawgram to bypass interactive menus if ($self->quiet() || $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; open(DRAW,"|".$self->executable.">$null"); } else { open(DRAW,"|".$self->executable); } print DRAW $instring; close(DRAW); chdir($curpath); #get the results my $plotfile = $self->io->catfile($self->tempdir,$self->plotfile); $self->throw("drawgram did not create plotfile correctly ($plotfile)") unless (-e $plotfile); return $plotfile; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for drawing program Example : Returns : filename containing tree in newick format Args : Bio::Tree::TreeI object =cut sub _setinput { my ($self, $input) = @_; my $treefile; unless (ref $input) { # check that file exists or throw $treefile = $input; unless (-e $input) {return 0;} } elsif ($input->isa("Bio::Tree::TreeI")) { # Open temporary file for both reading & writing of BioSeq array my $tfh; ($tfh,$treefile) = $self->io->tempfile(-dir=>$self->tempdir); my $treeIO = Bio::TreeIO->new(-fh => $tfh, -format=>'newick'); $treeIO->write_tree($input); $treeIO->close(); close($tfh); undef $tfh; } return $treefile; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for drawgram program Example : Returns : parameter string to be passed to drawgram Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); #do nothing for now $self = shift; my $param_string = ""; my $cat = 0; my ($hmargin,$vmargin); my %menu = %{$Menu{$self->version}->{'DRAWTREE'}}; foreach my $attr ( @DRAW_PARAMS) { $value = $self->$attr(); next unless defined $value; my @vals; if( ref($value) ) { ($value,@vals) = @$value; } $attr = uc($attr); if( ! exists $menu{$attr} ) { $self->warn("unknown parameter $attr, known params are ". join(",",keys %menu). "\n"); } if( ref ($menu{$attr}) !~ /HASH/i ) { unless( @vals ) { $param_string .= $menu{$attr}; } else { $param_string .= sprintf($menu{$attr},$value,@vals); } next; } my $seen = 0; for my $stype ( keys %{$menu{$attr}} ) { if( $value =~ /$stype/i ) { $param_string .= sprintf($menu{$attr}->{$stype},@vals); $seen = 1; last; } } unless( $seen ) { $self->warn("Unknown requested attribute $attr, $value is not known\n"); } } $param_string .="Y\n"; return $param_string; } =head1 Bio::Tools::Run::Wrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $dragram->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $codeml->cleanup(); Function: Will cleanup the tempdir directory after a DrawTree run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a Bio::Root::IO object Returns : Bio::Root::IO object Args : none See L =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Phylip/Neighbor.pm000066400000000000000000000414651342734133000251640ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Phylo::Phylip::Neighbor # # Created by # # Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phylip::Neighbor - Wrapper for the phylip program neighbor by Joseph Felsenstein for creating a phylogenetic tree(either through Neighbor or UPGMA) based on protein distances based on amino substitution rate. 14 Nov 2002 Shawn Works with Phylip version 3.6 =head1 SYNOPSIS #Create a SimpleAlign object @params = ('ktuple' => 2, 'matrix' => 'BLOSUM'); $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params); $inputfilename = 't/data/cysprot.fa'; $aln = $factory->run($inputfilename); # $aln is a SimpleAlign object. # Create the Distance Matrix # using a default PAM matrix and id name lengths limit of 30 note to # use id name length greater than the standard 10 in neighbor, you # will need to modify the neighbor source code $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params); my $matrix = $protdist_factory->run($aln); #Create the tree passing in the distance matrix @params = ('type'=>'NJ','outgroup'=>2,'lowtri'=>1, 'upptri'=>1,'subrep'=>1); my $neighbor_factory = Bio::Tools::Run::Phylo::Phylip::Neighbor->new(@params); #you can set your outgroup using either a number specifying #the rank in the matrix or you can just use the name of the #species $neighbor_factory->outgroup('ENSP00001'); #or $neighbor_factory->outgroup(1); my ($tree) = $neighbor_factory->run($matrix); # Alternatively, one can create the tree by passing in a file name # containing a phylip formatted distance matrix(using protdist) my $neighbor_factory = Bio::Tools::Run::Phylo::Phylip::Neighbor->new(@params); my ($tree) = $neighbor_factory->run('/home/shawnh/prot.dist'); # To prevent PHYLIP from truncating sequence names: # Step 1. Shelf the original names: my ($aln_safe, $ref_name)= # $aln_safe has serial names $aln->set_displayname_safe(); # $ref_name holds original names # Step 2. Run ProtDist and Neighbor: $matrix = $protdist_factory-> creat_distance_matrix($aln_safe); # Use $aln_safe instead of $aln $tree = $neighbor_factory->run($matrix); # Step 3. Retrieve orgininal OTU names: use Bio::Tree::Tree; my @nodes=$tree->get_nodes(); foreach my $nd (@nodes){ $nd->id($ref_name->{$nd->id_output}) if $nd->is_Leaf; } =head1 PARAMTERS FOR NEIGHBOR COMPUTATION =cut =head2 TYPE Title : TYPE Description : (optional) This sets the type of tree to construct, using neighbor joining or UPGMA. NJ Neighbor Joining UPGMA UPGMA Usage : @params = ('type'=>'X');#where X is one of the values above Defaults to NJ For more information on the usage of the different models, please refer to the documentation found in the phylip package. =head2 OUTGROUP (*ONLY AVAILABLE FOR NEIGHBOR JOINING) Title : OUTGROUP Description : (optional) This option selects the species to be used as the outgroup Acceptable Values: integer Usage : @params = ('outgroup'=>'X'); where X is an positive integer not more than the number of sequences Defaults to 1 =head2 LOWTRI Title : LOWTRI Description : (optional) This indicates that the distance matrix is input in Lower-triangular form (the lower-left half of the distance matrix only, without the zero diagonal elements) Usage : @params = ('lowtri'=>'X'); where X is either 1 or 0 Defaults to 0 =head2 UPPTRI Title : UPPTRI Description : (optional) This indicates that the distance matrix is input in upper-triangular form (the upper-right half of the distance matrix only, without the zero diagonal elements.) Usage : @params = ('upptri'=>'X'); where X is either 1 or 0 Defaults to 0 =head2 SUBREP Title : SUBREP Description : (optional) This is the Subreplication option. It informs the program that after each distance will be provided an integer indicating that the distance is a mean of that many replicates. Usage : @params = ('subrep'=>'X'); where X is either 1 or 0 Defaults to 0 =head2 JUMBLE Title : JUMBLE Description : (optional) This enables you to tell the program to use a random number generator to choose the input order of species. seed: an integer between 1 and 32767 and of the form 4n+1 which means that it must give a remainder of 1 when divided by 4. Each different seed leads to a different sequence of addition of species. By simply changing the random number seed and re-running programs one can look for other, and better trees. iterations: Usage : @params = ('jumble'=>'17); where 17 is the random seed Defaults to no jumble =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 CONTRIBUTORS Email:jason-at-bioperl.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut #' package Bio::Tools::Run::Phylo::Phylip::Neighbor; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @NEIGHBOR_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::TreeIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Run::Phylo::Phylip::Base; use Bio::Tools::Run::Phylo::Phylip::PhylipConf qw(%Menu); use Cwd; # inherit from Phylip::Base which has some methods for dealing with # Phylip specifics @ISA = qw(Bio::Tools::Run::Phylo::Phylip::Base); # You will need to enable the neighbor program. This # can be done in (at least) 3 ways: # # 1. define an environmental variable PHYLIPDIR: # export PHYLIPDIR=/home/shawnh/PHYLIP/bin # # 2. include a definition of an environmental variable PHYLIPDIR in # every script that will use Neighbor.pm. # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin'; # # 3. You can set the path to the program through doing: # my @params('program'=>'/usr/local/bin/neighbor'); # my $neighbor_factory = Bio::Tools::Run::Phylo::Phylip::Neighbor->new(@params); # BEGIN { $PROGRAMNAME="neighbor"; if (defined $ENV{PHYLIPDIR}) { $PROGRAMDIR = $ENV{PHYLIPDIR} || ''; $PROGRAM = Bio::Root::IO->catfile($PROGRAMDIR, $PROGRAMNAME.($^O =~ /mswin/i ?'.exe':'')); } else { $PROGRAM = $PROGRAMNAME; } @NEIGHBOR_PARAMS = qw(TYPE OUTGROUP LOWTRI UPPTRI SUBREP JUMBLE MULTIPLE); @OTHER_SWITCHES = qw(QUIET); foreach my $attr(@NEIGHBOR_PARAMS,@OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : >program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'neighbor'; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PHYLIPDIR}) if $ENV{PHYLIPDIR}; } sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~ /IDLENGTH/i){ $self->idlength($value); next; } $self->$attr($value); } if (! defined $self->idlength){ $self->idlength(10); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 idlength Title : idlength Usage : $obj->idlength ($newval) Function: Returns : value of idlength Args : newvalue (optional) =cut sub idlength{ my $self = shift; if( @_ ) { my $value = shift; $self->{'idlength'} = $value; } return $self->{'idlength'}; } =head2 run Title : run Usage : $inputfilename = 't/data/prot.dist'; $tree = $neighborfactory->run($inputfilename); or $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params); $matrix = $protdist_factory->create_distance_matrix($aln); $tree= $neighborfactory->run($matrix); Function: a Bio:Tree from a protein distance matrix created by protidst Example : Returns : Bio::Tree Args : Name of a file containing a protein distance matrix in Phylip format or a hash ref to a matrix Throws an exception if argument is not either a string (eg a filename) or a Hash. If argument is string, throws exception if file corresponding to string name can not be found. =cut sub run{ my ($self,$input) = @_; my ($temp,$infilename, $seq); my ($attr, $value, $switch); # Create input file pointer $infilename = $self->_setinput($input); if (!$infilename) {$self->throw("Problems setting up for neighbor. Probably bad input data in $input !");} # Create parameter string to pass to neighbor program my $param_string = $self->_setparams(); # run neighbor my @tree = $self->_run($infilename,$param_string); return wantarray ? @tree: \@tree; } =head2 create_tree Title : create_tree Usage : my $file = $app->create_tree($treefile); Function: This method is deprecated. Please use run method. Returns : File containing the rendered tree Args : either a Bio::Tree::TreeI OR filename of a tree in newick format =cut sub create_tree{ return shift->run(@_); } ################################################# =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to neighbor program Example : Returns : Bio::Tree object Args : Name of a file containing protein distances in Phylip format and a parameter string to be passed to neighbor =cut sub _run { my ($self,$infile,$param_string) = @_; my $instring; my $curpath = cwd; unless( File::Spec->file_name_is_absolute($infile) ) { $infile = $self->io->catfile($curpath,$infile); } $instring = $infile."\n$param_string"; $self->debug( "Program ".$self->executable."\n"); chdir($self->tempdir); #open a pipe to run neighbor to bypass interactive menus if ($self->quiet() || $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; open(NEIGHBOR,"|".$self->executable.">$null"); } else { open(NEIGHBOR,"|".$self->executable); } print NEIGHBOR $instring; close(NEIGHBOR); chdir($curpath); #get the results my $outfile = $self->io->catfile($self->tempdir,$self->outfile); my $treefile = $self->io->catfile($self->tempdir,$self->treefile); $self->throw("neighbor did not create tree correctly (expected $treefile) ") unless (-e $treefile); my $in = Bio::TreeIO->new(-file => $treefile, '-format' => 'newick'); my @tree; while (my $tree = $in->next_tree){ push @tree, $tree; } # Clean up the temporary files created along the way... unless ( $self->save_tempfiles ) { unlink $outfile; unlink $treefile; } return @tree; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for neighbor program Example : Returns : name of file containing the protein distance matrix in Phylip format Args : name of file created by protdist or ref to hash created by Bio::Tools:Run::Phylo::Phylip::ProtDist =cut sub _setinput { my ($self, $input) = @_; my ($alnfilename,$infilename, $temp, $tfh,$input_tmp,$input_fh); #If $input is not a filename it better be a HASF reference # a phy formatted alignment file created by protdist unless (ref $input) { # check that file exists or throw $alnfilename= $input; unless (-e $input) {return 0;} return $alnfilename; } my @input = ref($input) eq "ARRAY" ? @{$input} : ($input); ($tfh,$alnfilename) = $self->io->tempfile(-dir=>$self->tempdir); my $input_count = 0; foreach my $input(@input){ if ($input->isa("Bio::Matrix::PhylipDist")){ # Open temporary file for both reading & writing of distance matrix print $tfh $input->print_matrix; $input_count++; } } $self->_input_nbr($input_count); close($tfh); #get names from the first matrix, to be used in outgroup ordering my %names; $input = shift @input; #set the species names my @names = @{$input->names}; for(my $i=0; $i<= $#names; $i++){ $names{$names[$i]} = $i+1; } $self->names(\%names); return $alnfilename; } sub _input_nbr { my ($self,$val) = @_; if($val){ $self->{'_input_nbr'} = $val; } return $self->{'_input_nbr'}; } =head2 names() Title : names Usage : $tree->names(\%names) Function: get/set for a hash ref for storing names in matrix with rank as values. Example : Returns : hash reference Args : hash reference =cut sub names { my ($self,$name) = @_; if($name){ $self->{'_names'} = $name; } return $self->{'_names'}; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for neighbor program Example : Returns : parameter string to be passed to neighbor Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); #do nothing for now $self = shift; my $param_string = ""; my $type =""; my $version = $self->version; my %menu = %{$Menu{$version}->{'NEIGHBOR'}}; foreach my $attr ( @NEIGHBOR_PARAMS) { $value = $self->$attr(); next unless (defined $value && $value); if ($attr =~/TYPE/i){ if ($value=~/UPGMA/i){ $type = "UPGMA"; $param_string .= $menu{'TYPE'}{'UPGMA'}; } } elsif($attr =~ /OUTGROUP/i){ if ($type ne "UPGMA"){ if($value !~/^\d+$/){ # is a name so find the rank my %names = %{$self->names}; $names{$value} || $self->throw("Outgroup $value not found"); $value = $names{$value}; } $param_string .= $menu{'OUTGROUP'}."$value\n"; } else { $self->throw("Can't set outgroup using UPGMA. Use Neighbor-Joining instead"); } } elsif ($attr =~ /JUMBLE/i){ $self->throw("Unallowed value for random seed, need odd number") unless ($value =~ /\d+/ && ($value % 2 == 1)); $param_string .=$menu{'JUMBLE'}."$value\n"; } elsif($attr=~/MULTIPLE/i){ $param_string.=$menu{'MULTIPLE'}."$value\n"; #version 3.6 needs a random seed if($version eq "3.6"){ $param_string .= (2 * int(rand(10000)) + 1)."\n"; } } else{ $param_string .= $menu{uc $attr}; } } if (($param_string !~ $menu{'MULTIPLE'}) && (defined ($self->_input_nbr) &&($self->_input_nbr > 1))){ $param_string.=$menu{'MULTIPLE'}.$self->_input_nbr."\n"; } $param_string .=$menu{'SUBMIT'}; return $param_string; } =head2 outfile Title : outfile Usage : $obj->outfile($newval) Function: Get/Set default PHYLIP outfile name ('outfile' usually) Returns : value of outfile Args : newvalue (optional) =cut =head2 treefile Title : treefile Usage : $obj->treefile($newval) Function: Get/Set the default PHYLIP treefile name ('treefile' usually) Returns : value of treefile Args : newvalue (optional) =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Phylip/PhylipConf.pm000066400000000000000000000172161342734133000254770ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Phylo::Phylip::PhylipConf # # Created by # # Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phylip::PhylipConf =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Phylip::PhylipConf; my %menu = %{$Bio::Tools::Run::Phylo::Phylip::PhylipConf::Menu->{$version}->{'PROTDIST'}}; =head1 DESCRIPTION A configuration for managing menu configuration differences between version 3.5 and 3.6 =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 CONTRIBUTORS Email:jason-at-bioperl.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut #' package Bio::Tools::Run::Phylo::Phylip::PhylipConf; use strict; use Exporter; use vars qw(@ISA %Menu %FileName $RESOLUTIONX $RESOLUTIONY @EXPORT_OK); use base 'Exporter'; $RESOLUTIONX = 300; $RESOLUTIONY = 300; @EXPORT_OK = qw(%FileName %Menu); %FileName = ( "3.5"=>{'OUTFILE'=>'outfile', 'TREEFILE'=>'treefile', 'PLOTFILE'=>'plotfile', }, "3.6"=>{'OUTFILE'=>'outfile', 'TREEFILE'=>'outtree', 'PLOTFILE'=>'plotfile', }, ); %Menu = ( "3.5" => { 'PROTDIST' => { 'MODEL' => { 'CAT' =>"P\nP\n", 'KIMURA'=>"P\n", }, 'GENCODE'=> { 'ALLOWED'=>"UMVFY", 'OPTION' =>"C\n", }, 'CATEGORY'=>{ 'ALLOWED'=>"CHG", 'OPTION' =>"A\n", }, 'PROBCHANGE'=>"E\n", 'TRANS' =>"T\n", 'FREQ' =>"F\n", 'SUBMIT' =>"Y\n", 'MULTIPLE' =>"M\n", }, 'NEIGHBOR'=>{ 'TYPE' => { 'UPGMA'=>"N\n", }, 'OUTGROUP'=>"O\n", 'LOWTRI' =>"L\n", 'UPPTRI' =>"R\n", 'SUBREP' =>"S\n", 'JUMBLE' =>"J\n", 'SUBMIT' =>"Y\n", 'MULTIPLE' =>"M\n", }, 'PROTPARS'=>{ 'THRESHOLD'=>"T\n", 'JUMBLE' =>"J\n", 'OUTGROUP' =>"O\n", 'SUBMIT' =>"Y\n", }, 'SEQBOOT'=>{ 'DATATYPE' =>{ 'SEQUENCE'=>"", 'MORPH' =>"D\n", 'REST' =>"D\nD\n", 'GENEFREQ'=>"D\nD\nD\n", }, 'ALLELES' => "A\n", 'PERMUTE' => { 'BOOTSTRAP'=>"", 'JACKKNIFE'=>"J\n", 'PERMUTE' =>"J\nJ\n", }, 'REPLICATES'=>"R\n", 'SUBMIT' =>"Y\n", }, 'CONSENSE'=>{ 'ROOTED' => "R\n", 'OUTGROUP' => "O\n", 'SUBMIT' =>"Y\n", }, }, "3.6"=>{ 'PROTDIST'=>{ 'MODEL' => { 'PMB' =>"P\n", 'PAM' =>"P\nP\n", 'KIMURA' =>"P\nP\nP\n", 'CAT' =>"P\nP\nP\nP\n", 'JTT' =>"Y\n", }, 'GENCODE'=> { 'ALLOWED'=>"UMVFY", 'OPTION' =>"U\n", }, 'CATEGORY'=> { 'ALLOWED'=>"CHG", 'OPTION' =>"A\n", }, 'PROBCHANGE'=>"E\n", 'TRANS' =>"T\n", 'FREQ' =>"F\n", 'WEIGHTS' =>"W\n", 'SUBMIT' => "Y\n", 'MULTIPLE' =>"M\nD\n", }, 'NEIGHBOR' => { 'TYPE' => { 'UPGMA'=>"N\n", }, 'OUTGROUP'=>"O\n", 'LOWTRI' =>"L\n", 'UPPTRI' =>"R\n", 'SUBREP' =>"S\n", 'JUMBLE' =>"J\n", 'SUBMIT' =>"Y\n", 'MULTIPLE' =>"M\n", }, 'PROTPARS' => { 'THRESHOLD'=>"T\n", 'JUMBLE' =>"J\n", 'OUTGROUP' =>"O\n", 'SUBMIT' =>"Y\n", }, 'DRAWGRAM' => { 'SCALE' => "R\n", 'HORIZMARGINS' => "M\n%.2f\n%.2f\n", 'VERTICALMARGINS' => "M\n%.2f\n%.2f", 'SCREEN' => { 'Y|YES|1' => "V\nX\n", 'N|NO|0' => "V\nN\n", }, 'FONT' => "F\n%s\n", 'PAGES' => { 'L|PAGES|SIZE' => "#\nL\n%d\n%d\nM\n", 'P|PHYSICAL' => "#\nP\n%.4f\n%.4f\nM\n", 'O|OVERLAP' => "#\nO\n%.4f\n%.4f\nM\n", }, 'PLOTTER' => { 'P|POSTSCRIPT' => "P\nL\n", 'PICT' => "P\nM\n", "HP|PCL|LaserJect" => "P\nJ\n", "BMP" => "P\nW\n$RESOLUTIONX\n$RESOLUTIONY", "FIG" => "P\nF\n", "IDRAW" => "P\nA\n", "VRML" => "P\nZ\n", "PCX" => "P\nP\n3\n", }, 'ANCESTRALNODES' => { 'I|INTER|INTERMEDIETE' => "A\nI\n", 'W|WEIGHTED' => "A\nW\n", 'C|CENT|CENTERED' => "A\nC\n", 'N|INNER|INNERMOST' => "A\nN\n", 'V' => "A\nV\n", }, 'TREESTYLE' => { 'C|CLAD|CLADOGRAM' => "S\nC\n", 'P|PHEN|PHENOGRAM' => "S\nP\n", 'V|CURV|CURVOGRAM' => "S\nV\n", 'E|EURO|EUROGRAM' => "S\nE\n", 'S|SWOOP|SWOOPOGRAM' => "S\nS\n", 'O|CIRC|CIRCULAR' => "S\nO\n", }, 'TIPSPACE' => "C\n%.4f\n", 'STEMLEN' => "T\n%.4f\n", 'TREEDEPTH' => "D\n%.4f\n", 'LABEL_ANGLE' => "L\n%.4f\n", 'USEBRANCHLENS' => { '1|Y|YES' => "", '0|N|NO' => "B\n", }, }, 'DRAWTREE' => { 'SCREEN' => { 'Y|YES|1' => "V\nX\n", 'N|NO|0' => "V\nN\n", }, 'PLOTTER' => { 'L|P|POSTSCRIPT' => "P\nL\n", 'PICT' => "P\nM\n", "HP|PCL|LaserJect" => "P\nJ\n", "BMP" => "P\nW\n$RESOLUTIONX\n$RESOLUTIONY", "FIG" => "P\nF\n", "IDRAW" => "P\nA\n", "VRML" => "P\nZ\n", "PCX" => "P\nP\n3\n", }, 'LABEL_ANGLE' => { 'F|FIXED' => "L\nF\n%d\n", 'R|RADIAL' => "L\nR\n", 'A|ALONG' => "L\nA\n", 'M|MIDDLE' => "L\nM\n", }, 'ROTATION' => "R\n%d\n", 'ITERATE' => { 'E|EQUAL|DAYLIGHT' => "", 'N|NBODY|N-BODY' => "I\n", 'NO|FALSE' => "I\nI\n", }, 'TREEARC' => "I\nI\nA\n%d\n", 'SCALE' => "S\n%.2f\n", 'PAGES' => { 'L|PAGES|SIZE' => "#\nL\n%d\n%d\nM\n", 'P|PHYSICAL' => "#\nP\n%.4f\n%.4f\nM\n", 'O|OVERLAP' => "#\nO\n%.4f\n%.4f\nM\n", }, 'HORIZMARGINS' => "M\n%.2f\n%.2f\n", 'VERTICALMARGINS' => "M\n%.2f\n%.2f", }, 'SEQBOOT'=>{ 'DATATYPE' => { 'SEQUENCE'=> "", 'MORPH' =>"D\n", 'REST' =>"D\nD\n", 'GENEFREQ'=>"D\nD\nD\n", }, 'ALLELES' => "A\n", 'PERMUTE' => { 'BOOTSTRAP'=>"", 'JACKKNIFE'=>"J\n", 'PERMUTE' =>"J\nJ\n", }, 'REPLICATES'=>"R\n", 'SUBMIT' =>"Y\n", }, 'CONSENSE'=>{ 'TYPE' => { 'MRE' =>"", 'STRICT' =>"C\n", 'MR' =>"C\nC\n", 'ML' =>"C\nC\nC\n", }, 'ROOTED' => "R\n", 'OUTGROUP' => "O\n", 'SUBMIT' =>"Y\n", }, }, ); 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Phylip/ProtDist.pm000066400000000000000000000443441342734133000251760ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Phylo::Phylip::ProtDist # # Created by # # Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phylip::ProtDist - Wrapper for the phylip program protdist =head1 SYNOPSIS #Create a SimpleAlign object @params = ('ktuple' => 2, 'matrix' => 'BLOSUM'); $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params); $inputfilename = 't/data/cysprot.fa'; $aln = $factory->run($inputfilename); # $aln is a SimpleAlign object. # Create the Distance Matrix using a default PAM matrix and id name # lengths limit of 30 note to use id name length greater than the # standard 10 in protdist, you will need to modify the protdist source # code @params = ('MODEL' => 'PAM'); $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params); my ($matrix) = $protdist_factory->run($aln); # an array of Bio::Matrix::PhylipDist matrix #finding the distance between two sequences my $distance = $matrix->get_entry('protein_name_1','protein_name_2'); my @column = $matrix->get_column('protein_name_1'); my @row = $martrix->get_row('protein_name_1'); my @diag = $matrix->get_diagonal(); print $matrix->print_matrix; #Alternatively, one can create the matrix by passing in a file #name containing a multiple alignment in phylip format $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params); my ($matrix) = $protdist_factory->run('/home/shawnh/prot.phy'); # To prevent PHYLIP from truncating sequence names: # Step 1. Shelf the original names: my ($aln_safe, $ref_name)= # $aln_safe has serial names $aln->set_displayname_safe(); # $ref_name holds original names # Step 2. Run ProtDist and Neighbor: ($matrix) = $protdist_factory-> create_distance_matrix($aln_safe); # Use $aln_safe instead of $aln ($tree) = $neighbor_factory->run($matrix); # Step 3. Retrieve orgininal OTU names: use Bio::Tree::Tree; my @nodes=$tree->get_nodes(); foreach my $nd (@nodes){ $nd->id($ref_name->{$nd->id_output}) if $nd->is_Leaf; } =head1 DESCRIPTION Wrapper for protdist Joseph Felsentein for creating a distance matrix comparing protein sequences from a multiple alignment file or a L object and returns a L object; VERSION Support This wrapper currently supports v3.5 of phylip. There is also support for v3.6. =head1 PARAMETERS FOR PROTDIST COMPUTATION =head2 MODEL Title : MODEL Description : (optional) This sets the model of amino acid substitution used in the calculation of the distances. 3 different models are supported: PAM Dayhoff PAM Matrix(default) KIMURA Kimura's Distance CAT Categories Distance Usage: @params = ('model'=>'X');#where X is one of the values above Defaults to PAM For more information on the usage of the different models, please refer to the documentation defaults to Equal (0.25,0.25,0.25,0.25) found in the phylip package. Additional models in PHYLIP 3.6 PMB - Henikoff/Tillier PMB matrix JTT - Jones/Taylor/Thornton =head2 MULTIPLE Title : MULTIPLE Description: (optional) This allows multiple distance matrices to be generated from multiple MSA. Usage: @params = ('MULTIPLE'=>100) where the value specifyies the number of aligments given. =head2 ALL SUBSEQUENT PARAMETERS WILL ONLY WORK IN CONJUNCTION WITH THE Categories Distance MODEL* =head2 GENCODE Title : GENCODE Description : (optional) This option allows the user to select among various nuclear and mitochondrial genetic codes. Acceptable Values: U Universal M Mitochondrial V Vertebrate mitochondrial F Fly mitochondrial Y Yeast mitochondrial Usage : @params = ('gencode'=>'X'); where X is one of the letters above Defaults to U =head2 CATEGORY Title : CATEGORY Description : (optional) This option sets the categorization of amino acids all have groups: (Glu Gln Asp Asn), (Lys Arg His), (Phe Tyr Trp) plus: G George/Hunt/Barker: (Cys), (Met Val Leu Ileu), (Gly Ala Ser Thr Pro) C Chemical: (Cys Met), (Val Leu Ileu Gly Ala Ser Thr), (Pro) H Hall: (Cys), (Met Val Leu Ileu), (Gly Ala Ser Thr), (Pro) Usage : @params = ('category'=>'X'); where X is one of the letters above Defaults to G =head2 PROBCHANGE Title : PROBCHANGE Description : (optional) This option sets the ease of changing category of amino acid. (1.0 if no difficulty of changing,less if less easy. Can't be negative) Usage : @params = ('probchange'=>X) where 0<=X<=1 Defaults to 0.4570 =head2 TRANS Title : TRANS Description : (optional) This option sets transition/transversion ratio can be any positive number Usage : @params = ('trans'=>X) where X >= 0 Defaults to 2 =head2 FREQ Title : FREQ Description : (optional) This option sets the frequency of each base (A,C,G,T) The sum of the frequency must sum to 1. For example A,C,G,T = (0.25,0.5,0.125,0.125) Usage : @params = ('freq'=>('W','X','Y','Z') where W + X + Y + Z = 1 Defaults to Equal (0.25,0.25,0.25,0.25) =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut #' package Bio::Tools::Run::Phylo::Phylip::ProtDist; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @PROTDIST_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::Phylo::Phylip::Base; use Bio::Tools::Run::Phylo::Phylip::PhylipConf qw(%Menu); use Bio::Tools::Phylo::Phylip::ProtDist; use Cwd; # inherit from Phylip::Base which has some methods for dealing with # Phylip specifics @ISA = qw(Bio::Tools::Run::Phylo::Phylip::Base); # You will need to enable the protdist program. This # can be done in (at least) 3 ways: # # 1. define an environmental variable PHYLIPDIR: # export PHYLIPDIR=/home/shawnh/PHYLIP/bin # # 2. include a definition of an environmental variable CLUSTALDIR in # every script that will use Clustal.pm. # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin'; # # 3. You can set the path to the program through doing: # my @params('program'=>'/usr/local/bin/protdist'); # my $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params); # BEGIN { @PROTDIST_PARAMS = qw(MODEL GENCODE CATEGORY PROBCHANGE TRANS WEIGHTS FREQ MULTIPLE); @OTHER_SWITCHES = qw(QUIET); foreach my $attr(@PROTDIST_PARAMS,@OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : >program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'protdist'; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PHYLIPDIR}) if $ENV{PHYLIPDIR}; } sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/PROGRAM/i) { $self->executable($value); next; } if ($attr =~ /IDLENGTH/i){ $self->idlength($value); next; } $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 idlength Title : idlength Usage : $obj->idlength ($newval) Function: Returns : value of idlength Args : newvalue (optional) =cut sub idlength{ my $self = shift; if( @_ ) { my $value = shift; $self->{'idlength'} = $value; } return $self->{'idlength'}; } =head2 run Title : run Usage : $inputfilename = 't/data/prot.phy'; $matrix= $prodistfactory->run($inputfilename); or $seq_array_ref = \@seq_array; @seq_array is array of Seq objs $aln = $protdistfactory->align($seq_array_ref); $matrix = $protdistfactory->run($aln); Function: Create a distance matrix from a SimpleAlign object or a multiple alignment file Example : Returns : L Args : Name of a file containing a multiple alignment in Phylip format or an SimpleAlign object Throws an exception if argument is not either a string (eg a filename) or a Bio::SimpleAlign object. If argument is string, throws exception if file corresponding to string name can not be found. =cut sub run{ my ($self,$input) = @_; my ($infilename); # Create input file pointer $infilename = $self->_setinput($input); if (!$infilename) {$self->throw("Problems setting up for protdist. Probably bad input data in $input !");} # Create parameter string to pass to protdist program my $param_string = $self->_setparams(); # run protdist my @mat = $self->_run($infilename,$param_string); return wantarray ? @mat:\@mat; } ################################################# =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to protdist program Example : Returns : Bio::Tree object Args : Name of a file containing a set of multiple alignments in Phylip format and a parameter string to be passed to protdist =cut sub _run { my ($self,$infile,$param_string) = @_; my $instring; my $curpath = cwd; unless( File::Spec->file_name_is_absolute($infile) ) { $infile = $self->io->catfile($curpath,$infile); } $instring = $infile."\n$param_string"; $self->debug( "Program ".$self->executable." $instring\n"); chdir($self->tempdir); #open a pipe to run protdist to bypass interactive menus if ($self->quiet() || $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; open(PROTDIST,"|".$self->executable .">$null"); } else { open(PROTDIST,"|".$self->executable); } print PROTDIST $instring; close(PROTDIST); # get the results my $outfile = $self->io->catfile($self->tempdir,$self->outfile); chdir($curpath); $self->throw("protdist did not create matrix correctly ($outfile)") unless (-e $outfile); #Create the distance matrix here my $parser = Bio::Tools::Phylo::Phylip::ProtDist->new(-file=>$outfile); my @matrix; while (my $mat = $parser->next_matrix){ push @matrix, $mat; } # Clean up the temporary files created along the way... unlink $outfile unless $self->save_tempfiles; return @matrix; } =head2 create_distance_matrix Title : create_distance_matrix Usage : my $file = $app->create_distance_matrix($treefile); Function: This method is deprecated. Please use run method. Returns : L Args : Name of a file containing a multiple alignment in Phylip format or an SimpleAlign object Throws an exception if argument is not either a string (eg a filename) or a Bio::SimpleAlign object. If argument is string, throws exception if file corresponding to string name can not be found. =cut sub create_distance_matrix{ return shift->run(@_); } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for protdist program Example : Returns : name of file containing a multiple alignment in Phylip format Args : SimpleAlign object reference or input file name =cut sub _setinput { my ($self, $input) = @_; my ($alnfilename,$tfh); # suffix is used to distinguish alignment files from an align obkect #If $input is not a reference it better be the name of a file with the sequence/ # a phy formatted alignment file unless (ref $input) { # check that file exists or throw $alnfilename= $input; unless (-e $input) {return 0;} return $alnfilename; } my @input = ref $input eq 'ARRAY' ? @{$input} : ($input); # $input may be a SimpleAlign Object ($tfh,$alnfilename) = $self->io->tempfile(-dir=>$self->tempdir); my $alnIO = Bio::AlignIO->new(-fh => $tfh, -format=>'phylip', -idlength=>$self->idlength()); my $input_count = 0; foreach my $input(@input){ if ($input->isa("Bio::SimpleAlign")){ # Open temporary file for both reading & writing of BioSeq array $alnIO->write_aln($input); } $input_count++; } $alnIO->close(); close($tfh); $tfh = undef; $self->_input_nbr($input_count); return $alnfilename; } sub _input_nbr { my ($self,$val) = @_; if($val){ $self->{'_input_nbr'} = $val; } return $self->{'_input_nbr'}; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for protdist program Example : Returns : parameter string to be passed to protdist Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); #do nothing for now $self = shift; my $param_string = ""; my $cat = 0; my %menu = %{$Menu{$self->version}->{'PROTDIST'}}; foreach my $attr ( @PROTDIST_PARAMS) { $value = $self->$attr(); next unless (defined $value); if ($attr =~/MODEL/i){ if ($value=~/CAT/i){ $cat = 1; } $param_string .= $menu{'MODEL'}{$value}; } if($attr=~/MULTIPLE/i){ $param_string.=$menu{'MULTIPLE'}."$value\n"; } if ($cat == 1){ if($attr =~ /GENCODE/i){ my $allowed = $menu{'GENCODE'}{'ALLOWED'}; $self->throw("Unallowed value for genetic code") unless ($value =~ /[$allowed]/); $param_string .= $menu{'GENCODE'}{'OPTION'}."$value\n"; } if ($attr =~/CATEGORY/i){ my $allowed = $menu{'CATEGORY'}{'ALLOWED'}; $self->throw("Unallowed value for categorization of amino acids") unless ($value =~/[$allowed]/); $param_string .= $menu{'CATEGORY'}{'OPTION'}."$value\n"; } if ($attr =~/PROBCHANGE/i){ if (($value =~ /\d+/)&&($value >= 0) && ($value < 1)){ $param_string .= $menu{'PROBCHANGE'}."$value\n"; } else { $self->throw("Unallowed value for probability change category"); } } if ($attr =~/TRANS/i){ if (($value=~/\d+/) && ($value >=0)){ $param_string .=$menu{'TRANS'}."$value\n"; } } if ($attr =~ /FREQ/i){ my @freq = split(",",$value); if ($freq[0] !~ /\d+/){ #a letter provided (sets frequencies equally to 0.25) $param_string .=$menu{'FREQ'}.$freq[0]."\n"; } elsif ($#freq == 3) {#must have 4 digits for each base $param_string .=$menu{'FREQ'}; foreach my $f (@freq){ $param_string.="$f\n"; } } else { $self->throw("Unallowed value for base frequencies"); } } } } #set multiple option is not set and there are more than one sequence if (($param_string !~ $menu{'MULTIPLE'}) && (defined ($self->_input_nbr) &&($self->_input_nbr > 1))){ $param_string.=$menu{'MULTIPLE'}.$self->_input_nbr."\n"; } $param_string .=$menu{'SUBMIT'}; return $param_string; } =head1 Bio::Tools::Run::Wrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $protdist->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $codeml->cleanup(); Function: Will cleanup the tempdir directory after a ProtDist run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Phylip/ProtPars.pm000066400000000000000000000315531342734133000251760ustar00rootroot00000000000000# $Id$ # BioPerl module for Bio::Tools::Run::Phylo::Phylip::ProtPars # # Created by Shawn Hoon # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phylip::ProtPars - Object for creating a L object from a multiple alignment file or a SimpleAlign object 14 Nov 2002 Shawn Works with Phylip version 3.6 =head1 SYNOPSIS #Create a SimpleAlign object @params = ('ktuple' => 2, 'matrix' => 'BLOSUM'); $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params); $inputfilename = 't/data/cysprot.fa'; $aln = $factory->run($inputfilename); # $aln is a SimpleAlign object. #Create the Tree #using a threshold value of 30 and id name lengths limit of 30 #note to use id name length greater than the standard 10 in protpars, # you will need to modify the protpars source code $tree_factory = Bio::Tools::Run::Phylo::Phylip::ProtPars-> new(idlength=>30,threshold=>10,jumble=>"17,10",outgroup=>2); $tree = $tree_factory->run($aln); #Or one can pass in a file name containing a multiple alignment #in phylip format: $tree_factory = Bio::Tools::Run::Phylo::Phylip::ProtPars->new(idlength=>30,threshold=>10); $tree = $tree_factory->run("/usr/users/shawnh/COMPARA/prot.phy"); # To prevent PHYLIP from truncating sequence names: # Step 1. Shelf the original names: my ($aln_safe, $ref_name)= # $aln_safe has serial names $aln->set_displayname_safe(); # $ref_name holds original names # Step 2. Run ProtPars: $tree = $protpars_factory->run($aln_safe); # Use $aln_safe instead of $aln # Step 3. Retrieve orgininal OTU names: use Bio::Tree::Tree; my @nodes=$tree->get_nodes(); foreach my $nd (@nodes){ $nd->id($ref_name->{$nd->id_output}) if $nd->is_Leaf; } =head1 PARAMTERS FOR PROTPARS COMPUTATION =head2 THRESHOLD Title : THRESHOLD Description : (optional) This sets a threshold such that if the number of steps counted in a character is higher than the threshold, it will be taken to be the threshold value rather than the actual number of steps. You should use a positive real number greater than 1. Please see the documetation from the phylip package for more information. =head2 OUTGROUP Title : OUTGROUP Description : (optional) This specifies which species is to be used to root the tree by having it become the outgroup. Input values are integers specifying which species to use. Defaults to 1 =head2 JUMBLE Title : JUMBLE Description : (optional) This enables you to tell the program to use a random number generator to choose the input order of species. Input values is of the format: seed,iterations eg 17,10 seed: an integer between 1 and 32767 and of the form 4n+1 which means that it must give a remainder of 1 when divided by 4. Each different seed leads to a different sequence of addition of species. By simply changing the random number seed and re-running programs one can look for other, and better trees. iterations: For a value of 10, this will tell the program to try ten different orders of species in constructing the trees, and the results printed out will reflect this entire search process (that is, the best trees found among all 10 runs will be printed out, not the best trees from each individual run). =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 CONTRIBUTORS Email jason-AT-bioperl_DOT_org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::Phylip::ProtPars; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @PROTPARS_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SimpleAlign; use Cwd; use Bio::AlignIO; use Bio::TreeIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Run::Phylo::Phylip::Base; use Bio::Tools::Run::Phylo::Phylip::PhylipConf qw(%Menu); @ISA = qw(Bio::Tools::Run::Phylo::Phylip::Base); # You will need to enable the protpars program. This # can be done in (at least) two ways: # # 1. define an environmental variable PHYLIPDIR: # export PHYLIPDIR=/home/shawnh/PHYLIP/bin # # 2. include a definition of an environmental variable CLUSTALDIR in # every script that will use Clustal.pm. # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin'; # # 3. You can set the path to the program through doing: # my @params('program'=>'/usr/local/bin/protdist'); # my $protpars_factory = Bio::Tools::Run::Phylo::Phylip::ProtPars->new(@params); # BEGIN { @PROTPARS_PARAMS = qw(THRESHOLD JUMBLE OUTGROUP); @OTHER_SWITCHES = qw(QUIET); foreach my $attr(@PROTPARS_PARAMS,@OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : >program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'protpars'; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PHYLIPDIR}) if $ENV{PHYLIPDIR}; } sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~ /IDLENGTH/i){ $self->idlength($value); next; } $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 idlength Title : idlength Usage : $obj->idlength ($newval) Function: Returns : value of idlength Args : newvalue (optional) =cut sub idlength{ my $self = shift; if( @_ ) { my $value = shift; $self->{'idlength'} = $value; } return $self->{'idlength'}; } =head2 run Title : run Usage : $inputfilename = 't/data/prot.phy'; $tree = $factory->run($inputfilename); or $seq_array_ref = \@seq_array; @seq_array is array of Seq objs $aln = $factory->run($seq_array_ref); $tree = $treefactory->run($aln); Function: Create a protpars tree from a SimpleAlign object Example : Returns : L object Args : Name of a file containing a multiple alignment in Phylip format or an SimpleAlign object Throws an exception if argument is not either a string (eg a filename) or a Bio::SimpleAlign object. If argument is string, throws exception if file corresponding to string name can not be found. =cut sub run{ my ($self,$input) = @_; my ($infilename); # Create input file pointer $infilename = $self->_setinput($input); if (!$infilename) {$self->throw("Problems setting up for protpars. Probably bad input data in $input !");} # Create parameter string to pass to protpars program my $param_string = $self->_setparams(); # run protpars my $aln = $self->_run($infilename,$param_string); } =head2 create_tree Title : create_tree Usage : $inputfilename = 't/data/prot.phy'; $tree = $factory->create_tree($inputfilename); or $seq_array_ref = \@seq_array; @seq_array is array of Seq objs $aln = $factory->align($seq_array_ref); $tree = $treefactory->create_tree($aln); Function: Create a protpars tree from a SimpleAlign object Example : Returns : L object Args : Name of a file containing a multiple alignment in Phylip format or an SimpleAlign object Throws an exception if argument is not either a string (eg a filename) or a Bio::SimpleAlign object. If argument is string, throws exception if file corresponding to string name can not be found. =cut sub create_tree{ return shift->run(@_); } ################################################# =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to protpars program Example : Returns : Bio::Tree object Args : Name of a file containing a set of multiple alignments in Phylip format and a parameter string to be passed to protpars =cut sub _run { my ($self,$infile,$param_string) = @_; my $instring; my $curpath = cwd; unless( File::Spec->file_name_is_absolute($infile) ) { $infile = $self->io->catfile($curpath,$infile); } $instring = $infile."\n$param_string"; $self->debug( "Program ".$self->executable."\n"); chdir($self->tempdir); #open a pipe to run protpars to bypass interactive menus if ($self->quiet() || $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; open(PROTPARS,"|".$self->executable.">$null"); } else { open(PROTPARS,"|".$self->executable); } print PROTPARS $instring; close(PROTPARS); chdir($curpath); #get the results my $outfile = $self->io->catfile($self->tempdir,$self->outfile); my $treefile = $self->io->catfile($self->tempdir,$self->treefile); $self->throw("Protpars did not create treefile correctly") unless (-e $treefile); #create the tree my $in = Bio::TreeIO->new(-file => $treefile, '-format' => 'newick'); my $tree = $in->next_tree(); unless ( $self->save_tempfiles ) { # Clean up the temporary files created along the way... unlink $treefile; unlink $outfile; } return $tree; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for protpars program Example : Returns : name of file containing a multiple alignment in Phylip format Args : SimpleAlign object reference or input file name =cut sub _setinput { my ($self, $input, $suffix) = @_; my ($alnfilename,$infilename, $temp, $tfh,$input_tmp,$input_fh); # If $input is not a reference it better be the name of a # file with the sequence/ # a phy formatted alignment file unless (ref $input) { # check that file exists or throw $alnfilename= $input; unless (-e $input) {return 0;} return $alnfilename; } # $input may be a SimpleAlign Object if ($input->isa("Bio::Align::AlignI")) { # Open temporary file for both reading & writing of BioSeq array ($tfh,$alnfilename) = $self->io->tempfile(-dir=>$self->tempdir); my $alnIO = Bio::AlignIO->new(-fh => $tfh, -format=>'phylip',idlength=>$self->idlength()); $alnIO->write_aln($input); $alnIO->close(); close($tfh); $tfh = undef; return $alnfilename; } return 0; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for protpars program Example : Returns : parameter string to be passed to protpars Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); $self = shift; my $param_string = ""; my %menu = %{$Menu{$self->version}->{'PROTPARS'}}; for $attr ( @PROTPARS_PARAMS) { $value = $self->$attr(); next unless (defined $value); if ($attr =~/JUMBLE/i){ my ($seed,$itr) = split(",",$value); $param_string .=$menu{'JUMBLE'}."$seed\n$itr\n"; } else { $param_string .= $menu{uc $attr}."$value\n"; } } $param_string .="Y\n"; return $param_string; } 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Phylip/SeqBoot.pm000066400000000000000000000356651342734133000250100ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Phylo::Phylip::SeqBoot # # Created by # # Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phylip::SeqBoot - Wrapper for the phylip program SeqBoot =head1 SYNOPSIS #Create a SimpleAlign object @params = ('ktuple' => 2, 'matrix' => 'BLOSUM'); $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params); $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); # $aln is a SimpleAlign object. # Use seqboot to generate bootstap alignments my @params = ('datatype'=>'SEQUENCE','replicates'=>100); my $seq = Bio::Tools::Run::Phylo::Phylip::SeqBoot->new(@params); my $aln_ref = $seq->run($aln); my $aio = Bio::AlignIO->new(-file=>">alignment.bootstrap",-format=>"phylip"); foreach my $ai(@{$aln_ref}){ $aio->write_aln($ai); } # To prevent PHYLIP from truncating sequence names: # Step 1. Shelf the original names: my ($aln_safe, $ref_name)= # $aln_safe has serial names $aln->set_displayname_safe(); # $ref_name holds orginal names # Step 2. Run PHYLIP programs: $aln_ref = $seq->run($aln_safe); # Use $aln_safe instead of $aln # Step 3. Retrieve orgininal names $aio = Bio::AlignIO->new( -file=>">alignment.bootstrap", -format=>"fasta"); # FASTA output to view full names foreach my $ai(@{$aln_ref}){ my $new_aln=$ai->restore_displayname($ref_name); # Restore names $aio->write_aln($new_aln); } =head1 DESCRIPTION Wrapper for seqboot from the phylip package by Joseph Felsentein. Taken from phylip doc... "SEQBOOT is a general boostrapping tool. It is intended to allow you to generate multiple data sets that are resampled versions of the input data set. SEQBOOT can handle molecular sequences, binary characters, restriction sites, or gene frequencies." More documentation on using seqboot and setting parameters may be found in the phylip package. VERSION Support This wrapper currently supports v3.5 of phylip. There is also support for v3.6 although this is still experimental as v3.6 is still under alpha release and not all functionalities maybe supported. =head1 PARAMETERS FOR SEQBOOT =head2 MODEL Title : DATATYPE Description : (optional) This program supports 3 different datatypes SEQUENCE: Molecular Sequences MORPH : Discrete Morphological Characters REST : Restriction Sites GENEFREQ: Gene Frequencies Defaults to SEQUENCE =head2 PERMUTE Title: PERMUTE Description: (optional) 3 different resampling methods are available: BOOTSTRAP : creating a new data set by sampling N characters randomly with replacement The resulting data set has the same size as the original, but some characters have been left out and others are duplicated JACKKNIFE : Delete-half-jackknifing. It involves sampling a random half of the characters, and including them in the data but dropping the others The resulting data sets are half the size of the original, and no characters are duplicated. PERMUTE : Permuting species within characters. It involves permuting the columns of the data matrix separately. This produces data matrices that have the same number and kinds of characters but no taxonomic structure. Defaults to BOOTSTRAP =head2 REPLICATES Title : REPLICATES Description : (optional) This options allows the user to set the number of replicate data sets. Most statisticians would be happiest with 1000 to 10,000 replicates in a bootstrap, but 100 gives a good rough picture Defaults to 100 =head2 ALLELES Title : ALLELES Description : (optional) This option is to be used with gene frequencies datatype option to specify that all alleles at each locus are in the input file. Defaults to NULL =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut #' package Bio::Tools::Run::Phylo::Phylip::SeqBoot; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @SEQBOOT_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::Phylo::Phylip::Base; use Bio::Tools::Run::Phylo::Phylip::PhylipConf qw(%Menu); use Bio::Matrix::PhylipDist; use Cwd; # inherit from Phylip::Base which has some methods for dealing with # Phylip specifics @ISA = qw(Bio::Tools::Run::Phylo::Phylip::Base); # You will need to enable the SeqBoot program. This # can be done in (at least) 3 ways: # # 1. define an environmental variable PHYLIPDIR: # export PHYLIPDIR=/home/shawnh/PHYLIP/bin # # 2. include a definition of an environmental variable CLUSTALDIR in # every script that will use Clustal.pm. # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin'; # # 3. You can set the path to the program through doing: # my @params('executable'=>'/usr/local/bin/seqboot'); # my $SeqBoot_factory = Bio::Tools::Run::Phylo::Phylip::SeqBoot->new(@params); # BEGIN { @SEQBOOT_PARAMS = qw(DATATYPE PERMUTE BLOCKSIZE REPLICATES READWEIGHTS READCAT); @OTHER_SWITCHES = qw(QUIET); foreach my $attr(@SEQBOOT_PARAMS,@OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : >program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'seqboot'; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PHYLIPDIR}) if $ENV{PHYLIPDIR}; } sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/PROGRAM/i) { $self->executable($value); next; } if ($attr =~ /IDLENGTH/i){ $self->idlength($value); next; } $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 idlength Title : idlength Usage : $obj->idlength ($newval) Function: Returns : value of idlength Args : newvalue (optional) =cut sub idlength{ my $self = shift; if( @_ ) { my $value = shift; $self->{'idlength'} = $value; } return $self->{'idlength'}; } =head2 run Title : run Usage : $inputfilename = 't/data/prot.phy'; $matrix= $seqboot_factory->run($inputfilename); or $seq_array_ref = \@seq_array; @seq_array is array of Seq objs $aln = $clustalw_factory->align($seq_array_ref); $aln_ref = $SeqBootfactory->run($aln); Function: Create bootstrap sets of alignments Example : Returns : an array ref of L Args : Name of a file containing a multiple alignment in Phylip format or an SimpleAlign object Throws an exception if argument is not either a string (eg a filename) or a Bio::SimpleAlign object. If argument is string, throws exception if file corresponding to string name can not be found. =cut sub run{ my ($self,$input) = @_; my ($infilename); # Create input file pointer $infilename = $self->_setinput($input); if (!$infilename) {$self->throw("Problems setting up for seqboot. Probably bad input data in $input !");} # Create parameter string to pass to SeqBoot program my $param_string = $self->_setparams(); # run SeqBoot my $aln = $self->_run($infilename,$param_string); return $aln; } ################################################# =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to SeqBoot program Example : Returns : an array ref of Args : Name of a file containing a set of multiple alignments in Phylip format and a parameter string to be passed to SeqBoot =cut sub _run { my ($self,$infile,$param_string) = @_; my $instring; my $curpath = cwd; unless( File::Spec->file_name_is_absolute($infile) ) { $infile = $self->io->catfile($curpath,$infile); } #odd random seed my $rand = (2 * int(rand(10000)) + 1); if ($self->version == 3.5){ $instring = $infile."\n$rand\n$param_string"; } else { $instring = $infile."\n$param_string$rand\n"; } $self->debug( "Program ".$self->executable." $instring\n"); chdir($self->tempdir); #open a pipe to run SeqBoot to bypass interactive menus if ($self->quiet() || $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; open(SeqBoot,"|".$self->executable .">$null"); } else { open(SeqBoot,"|".$self->executable); } print SeqBoot $instring; close(SeqBoot); # get the results my $outfile = $self->io->catfile($self->tempdir,$self->outfile); chdir($curpath); $self->throw("SeqBoot did not create files correctly ($outfile)") unless (-e $outfile); #parse the alignments my @aln; my @parse_params; push @parse_params, ('-interleaved' => 1) if $self->version == 3.6; my $aio = Bio::AlignIO->new(-file=>$outfile,-format=>"phylip", @parse_params); while (my $aln = $aio->next_aln){ push @aln, $aln; } # Clean up the temporary files created along the way... unlink $outfile unless $self->save_tempfiles; return \@aln; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for SeqBoot program Example : Returns : name of file containing a multiple alignment in Phylip format Args : SimpleAlign object reference or input file name =cut sub _setinput { my ($self, $input) = @_; my ($alnfilename,$tfh); # a phy formatted alignment file unless (ref $input) { # check that file exists or throw $alnfilename= $input; unless (-e $input) {return 0;} return $alnfilename; } my @input = ref($input) eq 'ARRAY' ? @{$input}: ($input); ($tfh,$alnfilename) = $self->io->tempfile(-dir=>$self->tempdir); my $alnIO = Bio::AlignIO->new(-fh => $tfh, -format=>'phylip', -idlength=>$self->idlength()); foreach my $input(@input){ # $input should be a Bio::Align::AlignI $input->isa("Bio::Align::AlignI") || $self->throw("Expecting a Bio::Align::AlignI object"); # Open temporary file for both reading & writing of BioSeq array $alnIO->write_aln($input); } $alnIO->close(); close($tfh); return $alnfilename; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for SeqBoot program Example : Returns : parameter string to be passed to SeqBoot Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); #do nothing for now $self = shift; my $param_string = ""; my $cat = 0; my $gene_freq = 0; my %menu = %{$Menu{$self->version}->{'SEQBOOT'}}; foreach my $attr ( @SEQBOOT_PARAMS) { $value = $self->$attr(); next unless (defined $value); if ($attr =~/REPLICATES/i){ if( $value !~ /(\d+(\.\d+)?)/ ) { $self->warn("Expected a number in $attr\n"); next; } $param_string .= $menu{'REPLICATES'}."$value\n"; } elsif($attr=~/DATATYPE/i){ $gene_freq = 1 if $value =~/GENEFREQ/i; $param_string .= $menu{'DATATYPE'}{uc $value}; } else { if($attr =~/ALLELES/i){ if(!$gene_freq){ $self->warn("Alleles options only be used with alleles option"); return; } $param_string .=$menu{uc $attr}; } } } $param_string .= $menu{'SUBMIT'}; return $param_string; } =head1 Bio::Tools::Run::Wrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $SeqBoot->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $codeml->cleanup(); Function: Will cleanup the tempdir directory after a SeqBoot run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Phyml.pm000066400000000000000000000705011342734133000232440ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::Phyml # # Please direct questions and support issues to # # Cared for by Heikki Lehvaslaiho # # Copyright Heikki Lehvaslaiho # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phyml - Wrapper for rapid reconstruction of phylogenies using Phyml =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Phyml; # Make a Phyml factory $factory = Bio::Tools::Run::Phylo::Phyml->new(-verbose => 2); # it defaults to protein alignment # change parameters $factory->model('Dayhoff'); # Pass the factory an alignment and run $inputfilename = 't/data/protpars.phy'; $tree = $factory->run($inputfilename); # $tree is a Bio::Tree::Tree object. # or set parameters at object creation my %args = ( -data_type => 'dna', -model => 'HKY', -kappa => 4, -invar => 'e', -category_number => 4, -alpha => 'e', -tree => 'BIONJ', -opt_topology => '0', -opt_lengths => '1', ); $factory = Bio::Tools::Run::Phylo::Phyml->new(%args); # if you need the output files do $factory->save_tempfiles(1); $factory->tempdir($workdir); # and get a Bio::Align::AlignI (SimpleAlign) object from somewhere $tree = $factory->run($aln); =head1 DESCRIPTION This is a wrapper for running the phyml application by Stephane Guindon and Olivier Gascuel. You can download it from: http://atgc.lirmm.fr/phyml/ =head2 Installing After downloading, you need to rename a the copy of the program that runs under your operating system. I.e. C into C. You will need to help this Phyml wrapper to find the C program. This can be done in (at least) three ways: =over =item 1. Make sure the Phyml executable is in your path. Copy it to, or create a symbolic link from a directory that is in your path. =item 2. Define an environmental variable PHYMLDIR which is a directory which contains the 'phyml' application: In bash: export PHYMLDIR=/home/username/phyml_v2.4.4/exe In csh/tcsh: setenv PHYMLDIR /home/username/phyml_v2.4.4/exe =item 3. Include a definition of an environmental variable PHYMLDIR in every script that will use this Phyml wrapper module, e.g.: BEGIN { $ENV{PHYMLDIR} = '/home/username/phyml_v2.4.4/exe' } use Bio::Tools::Run::Phylo::Phyml; =back =head2 Running This wrapper has been tested with PHYML v2.4.4 and v.3.0. It may work with recent Phyml releases using a date format for the format, but the wrapper hasn't been extensively tested in these cases, so for the moment only the simpler numbered versions are supported. In its current state, the wrapper supports only input of one MSA and output of one tree. It can easily be extended to support more advanced capabilities of C. Two convienience methods have been added on top of the standard BioPerl WrapperBase ones: stats() and tree_string(). You can call them to after running the phyml program to retrieve into a string the statistics and the tree in Newick format. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://bugzilla.open-bio.org/ =head1 AUTHOR - Heikki Lehvaslaiho heikki at bioperl dot org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::Phyml; use strict; use Bio::AlignIO; use File::Copy; use File::Spec; use Bio::TreeIO; use base qw(Bio::Tools::Run::WrapperBase); our $PROGRAM_NAME = 'phyml'; our $PROGRAM_DIR = $ENV{'PHYMLDIR'}; # valid substitution model names our $models; # DNA map { $models->{0}->{$_} = 1 } qw(JC69 K2P F81 HKY F84 TN93 GTR); # protein map { $models->{1}->{$_} = 1 } qw(JTT MtREV Dayhoff WAG); our $models3; # DNA map { $models3->{'nt'}->{$_} = 1 } qw(HKY85 JC69 K80 F81 F84 TN93 GTR ); # protein map { $models3->{'aa'}->{$_} = 1 } qw(LG WAG JTT MtREV Dayhoff DCMut RtREV CpREV VT Blosum62 MtMam MtArt HIVw HIVb ); =head2 new Title : new Usage : $factory = Bio::Tools::Run::Phylo::Phyml->new(@params) Function: creates a new Phyml factory Returns : Bio::Tools::Run::Phylo::Phyml Args : Optionally, provide any of the following (default in []): -data_type => 'dna' or 'protein', [protein] -dataset_count => integer, [1] -model => 'HKY'... , [HKY|JTT] -kappa => 'e' or float, [e] -invar => 'e' or float, [e] -category_number => integer, [1] -alpha => 'e' or float (int v3),[e] -tree => 'BIONJ' or your own, [BION] -bootstrap => integer [123] -opt_topology => boolean [1] -opt_lengths => boolean [1] -no_memory_check => boolean [1] -program_name => string =cut sub new { my ( $class, @args ) = @_; my $self = $class->SUPER::new(@args); # for consistency with other run modules, allow params to be dashless my %args = @args; while ( my ( $key, $val ) = each %args ) { if ( $key !~ /^-/ ) { delete $args{$key}; $args{ '-' . $key } = $val; } } my ( $data_type, $data_format, $dataset_count, $model, $freq, $kappa, $invar, $category_number, $alpha, $tree, $opt_topology, $opt_lengths, $opt, $search, $rand_start, $rand_starts, $rand_seed, $no_memory_check, $bootstrap, $program_name ) = $self->_rearrange( [ qw( DATA_TYPE DATA_FORMAT DATASET_COUNT MODEL FREQ KAPPA INVAR CATEGORY_NUMBER ALPHA TREE OPT_TOPOLOGY OPT_LENGTHS OPT SEARCH RAND_START RAND_STARTS RAND_SEED NO_MEMORY_CHECK BOOTSTRAP PROGRAM_NAME ) ], %args ); $self->data_type($data_type) if $data_type; $self->data_format($data_format) if $data_format; $self->dataset_count($dataset_count) if $dataset_count; $self->model($model) if $model; $self->freq($freq) if $freq; $self->kappa($kappa) if $kappa; $self->invar($invar) if $invar; $self->category_number($category_number) if $category_number; $self->alpha($alpha) if $alpha; $self->tree($tree) if $tree; $self->opt_topology($opt_topology) if $opt_topology; $self->opt_lengths($opt_lengths) if $opt_lengths; $self->opt($opt) if $opt; $self->search($search) if $search; $self->rand_start($rand_start) if $rand_start; $self->rand_starts($rand_starts) if $rand_starts; $self->rand_seed($rand_seed) if $rand_seed; $self->no_memory_check($no_memory_check) if $no_memory_check; $self->bootstrap($bootstrap) if $bootstrap; $self->program_name($program_name) if $program_name; return $self; } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { my ( $self, $value ) = @_; if ( defined($value) ) { if ( $value =~ /^$PROGRAM_NAME[-a-z]*$/ ) { $PROGRAM_NAME = $value; } else { $self->throw("$value is not a valid program name"); } } $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 version Title : version Usage : exit if $prog->version < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none Phyml before 3.0 did not display the version. Assume 2.44 when can not determine it. Some releases do not state version number, only date, so the version might have to be inferred from this date. =cut sub version { my $self = shift; return $self->{'_version'} if defined $self->{'_version'}; my $exe = $self->executable || return; my $string = substr `$exe -h`, 0, 40; my ($version) = $string =~ /PhyML v([\d+\.]+)/; if ( !$version ) { $string =~ /PhyML\s+(\d{8})/; # 3 was released August 2008 $version = 3 if ( $1 && $1 >= 20080801 ); } $self->{'_version'} = $version; $version ? ( return $version ) : return '2.44'; } =head2 run Title : run Usage : $factory->run($aln_file); $factory->run($align_object); Function: Runs Phyml to generate a tree Returns : Bio::Tree::Tree object Args : file name for your input alignment in a format recognised by AlignIO, OR Bio::Align::AlignI compliant object (eg. Bio::SimpleAlign). =cut sub run { my ( $self, $in ) = @_; if ( ref $in && $in->isa("Bio::Align::AlignI") ) { $in = $self->_write_phylip_align_file($in); } elsif ( !-e $in ) { $self->throw( "When not supplying a Bio::Align::AlignI object, " . "you must supply a readable filename" ); } elsif ( -e $in ) { copy( $in, $self->tempdir ); my $name = File::Spec->splitpath($in); # name is the last item in the array $in = File::Spec->catfile( $self->tempdir, $name ); } return $self->_run($in); } =head2 stats Title : stats Usage : $factory->stats; Function: Returns the contents of the phyml '_phyml_stat.txt' output file Returns : string with statistics about the run, undef before run() Args : none =cut sub stats { my $self = shift; return $self->{_stats}; } =head2 tree_string Title : tree_string Usage : $factory->tree_string; $factory->run($align_object); Function: Returns the contents of the phyml '_phyml_tree.txt' output file Returns : string with tree in Newick format, undef before run() Args : none =cut sub tree_string { my $self = shift; return $self->{_tree}; } =head2 Getsetters These methods are used to set and get program parameters before running. =head2 data_type Title : data_type Usage : $phyml->data_type('nt'); Function: Sets sequence alphabet to 'dna' (nt in v3) or 'aa' If leaved unset, will be set automatically Returns : set value, defaults to 'protein' Args : None to get, 'dna' ('nt') or 'aa' to set. =cut sub data_type { my ( $self, $value ) = @_; if ( $self->version && $self->version >= 3 ) { if ( defined $value ) { if ( $value eq 'nt' ) { $self->{_data_type} = 'nt'; } else { $self->{_data_type} = 'aa'; } } return 'aa' unless defined $self->{_data_type}; } else { if ( defined $value ) { if ( $value eq 'dna' ) { $self->{_data_type} = '0'; } else { $self->{_data_type} = '1'; } } return '1' unless defined $self->{_data_type}; } return $self->{_data_type}; } =head2 data_format Title : data_format Usage : $phyml->data_format('s'); Function: Sets PHYLIP format to 'i' interleaved or 's' sequential Returns : set value, defaults to 'i' Args : None to get, 'i' or 's' to set. =cut sub data_format { my ( $self, $value ) = @_; if ( defined $value ) { $self->throw("PHYLIP format must be 'i' or 's'") unless $value eq 'i' or $value eq 's'; $self->{_data_format} = $value; } return $self->{_data_format} || 'i'; } =head2 dataset_count Title : dataset_count Usage : $phyml->dataset_count(3); Function: Sets dataset number to deal with Returns : set value, defaults to 1 Args : None to get, positive integer to set. =cut sub dataset_count { my ( $self, $value ) = @_; if ( defined $value ) { die "Invalid positive integer [$value]" unless $value =~ /^[-+]?\d*$/ and $value > 0; $self->{_dataset_count} = $value; } return $self->{_dataset_count} || 1; } =head2 model Title : model Usage : $phyml->model('HKY'); Function: Choose the substitution model to use. One of JC69 | K2P | F81 | HKY | F84 | TN93 | GTR (DNA) JTT | MtREV | Dayhoff | WAG (amino acids) v3.0: HKY85 (default) | JC69 | K80 | F81 | F84 | TN93 | GTR (DNA) LG (default) | WAG | JTT | MtREV | Dayhoff | DCMut | RtREV | CpREV | VT | Blosum62 | MtMam | MtArt | HIVw | HIVb (amino acids) Returns : Name of the model, v2.4.4 defaults to {HKY|JTT} Args : None to get, string to set. =cut sub model { my ( $self, $value ) = @_; if ( defined($value) ) { if ( $self->version && $self->version >= 3 ) { unless ( $value =~ /\d{6}/ ) { $self->throw( "Not a valid model name [$value] for current data type (alphabet)" ) unless $models3->{ $self->data_type }->{$value}; } } else { $self->throw( "Not a valid model name [$value] for current data type (alphabet)" ) unless $models->{ $self->data_type }->{$value}; } $self->{_model} = $value; } if ( $self->{_model} ) { return $self->{_model}; } if ( $self->version && $self->version >= 3 ) { if ( $self->data_type eq 'aa' ) { return 'LG'; # protein } else { return 'HKY85'; # DNA } } else { if ( $self->data_type ) { return 'JTT'; # protein } else { return 'HKY'; # DNA } } } =head2 kappa Title : kappa Usage : $phyml->kappa(4); Function: Sets transition/transversion ratio, leave unset to estimate Returns : set value, defaults to 'e' Args : None to get, float or integer to set. =cut sub kappa { my ( $self, $value ) = @_; if ( defined $value ) { die "Invalid number [$value]" unless $value =~ /^[-+]?\d*\.?\d*$/ or $value eq 'e'; $self->{_kappa} = $value; } return 'e' unless defined $self->{_kappa}; return 'e' if $self->{_kappa} eq 'e'; return sprintf( "%.1f", $self->{_kappa} ); } =head2 invar Title : invar Usage : $phyml->invar(.3); Function: Sets proportion of invariable sites, leave unset to estimate Returns : set value, defaults to 'e' Args : None to get, float or integer to set. =cut sub invar { my ( $self, $value ) = @_; if ( defined $value ) { die "Invalid number [$value]" unless $value =~ /^[-+]?\d*\.\d*$/ or $value eq 'e'; $self->{_invar} = $value; } return 'e' unless defined $self->{_invar}; return 'e' if $self->{_invar} eq 'e'; return sprintf( "%.1f", $self->{_invar} ); } =head2 category_number Title : category_number Usage : $phyml->category_number(4); Function: Sets number of relative substitution rate categories Returns : set value, defaults to 1 Args : None to get, integer to set. =cut sub category_number { my ( $self, $value ) = @_; if ( defined $value ) { die "Invalid postive integer [$value]" unless $value =~ /^[+]?\d*$/ and $value > 0; $self->{_category_number} = $value; } return $self->{_category_number} || 1; } =head2 alpha Title : alpha Usage : $phyml->alpha(1.0); Function: Sets gamma distribution parameter, leave unset to estimate Returns : set value, defaults to 'e' Args : None to get, float or integer to set. =cut sub alpha { my ( $self, $value ) = @_; if ( defined $value ) { die "Invalid number [$value]" unless $value =~ /^[-+]?\d*\.?\d*$/ or $value eq 'e'; $self->{_alpha} = $value; } return 'e' unless defined $self->{_alpha}; return 'e' if $self->{_alpha} eq 'e'; return sprintf( "%.1f", $self->{_alpha} ) || 'e'; } =head2 tree Title : tree Usage : $phyml->tree('/tmp/tree.nwk'); Function: Sets starting tree, leave unset to estimate a distance tree Returns : set value, defaults to 'BIONJ' Args : None to get, newick tree file name to set. =cut sub tree { my ( $self, $value ) = @_; if ( defined $value ) { die "Invalid number [$value]" unless -e $value or $value eq 'BIONJ'; $self->{_tree} = $value; } return $self->{_tree} || 'BIONJ'; } =head2 v2 options These methods can be used with PhyML v2* only. =head2 opt_topology Title : opt_topology Usage : $factory->opt_topology(1); Function: Choose to optimise the tree topology Returns : 1 or 0. Default is 1. Args : None to get, boolean to set. v2.* only =cut sub opt_topology { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [opt_topology] for to PhyML v3") if $self->version && $self->version >= 3; if ( defined($value) ) { if ($value) { $self->{_opt_topology} = 1; } else { $self->{_opt_topology} = 0; } } return $self->{_opt_topology} || 1; } =head2 opt_lengths Title : opt_lengths Usage : $factory->opt_lengths(0); Function: Choose to optimise branch lengths and rate parameters Returns : 1 or 0. Default is 1. Args : None to get, boolean to set. v2.* only =cut sub opt_lengths { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [opt_lengths] for PhyML v3") if $self->version && $self->version >= 3; if ( defined($value) ) { if ($value) { $self->{_opt_lengths} = 1; } else { $self->{_opt_lengths} = 0; } } return $self->{_opt_lengths} || 1; } =head2 v3 options These methods can be used with PhyML v3* only. =head2 freq Title : freq Usage : $phyml->freq(e); $phyml->freq("0.2, 0.6, 0.6, 0.2"); Function: Sets nucleotide frequences or asks residue to be estimated according to two models: e or d Returns : set value, Args : None to get, string to set. v3 only. =cut sub freq { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [freq] prior to PhyML v3") if $self->version < 3; if ( defined $value ) { die "Invalid value [$value]" unless $value =~ /^[\d\. ]$/ or $value eq 'e' or $value eq 'd'; $self->{_freq} = $value; } return $self->{_freq}; } =head2 opt Title : opt Usage : $factory->opt(1); Function: Optimise tree parameters: tlr|tl|tr|l|n Returns : {value|n} (default n) Args : None to get, string to set. v3.* only =cut sub opt { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [opt] prior to PhyML v3") if $self->version < 3; if ( defined($value) ) { $self->{_opt} = $value if $value =~ /tlr|tl|tr|l|n/; } return $self->{_opt} || 'n'; } =head2 search Title : search Usage : $factory->search(SPR); Function: Tree topology search operation algorithm: NNI|SPR|BEST Returns : string (defaults to NNI) Args : None to get, string to set. v3.* only =cut sub search { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [search] prior to PhyML v3") if $self->version < 3; if ( defined($value) ) { $self->{_search} = $value if $value =~ /NNI|SPR|BEST/; } return $self->{_search} || 'NNI'; } =head2 rand_start Title : rand_start Usage : $factory->rand_start(1); Function: Sets the initial SPR tree to random. Returns : boolean (defaults to false) Args : None to get, boolean to set. v3.* only; only meaningful if $prog-Esearch is 'SPR' =cut sub rand_start { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [rand_start] prior to PhyML v3") if $self->version < 3; if ( defined($value) ) { if ($value) { $self->{_rand_start} = 1; } else { $self->{_rand_start} = 0; } } return $self->{_rand_start}; } =head2 rand_starts Title : rand_starts Usage : $factory->rand_starts(10); Function: Sets the number of initial random SPR trees Returns : integer (defaults to 1) Args : None to get, integer to set. v3.* only; only valid if $prog-Esearch is 'SPR' =cut sub rand_starts { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [rand_starts] prior to PhyML v3") if $self->version < 3; if ( defined $value ) { die "Invalid number [$value]" unless $value =~ /^[-+]?\d+$/; $self->{_rand_starts} = $value; } return $self->{_rand_starts} || 1; } =head2 rand_seed Title : rand_seed Usage : $factory->rand_seed(1769876); Function: Seeds the random number generator Returns : random integer Args : None to get, integer to set. v3.* only; only valid if $prog-Esearch is 'SPR' Uses perl rand() to initialize if not explicitely set. =cut sub rand_seed { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [rand_seed] prior to PhyML v3") if $self->version < 3; if ( defined $value ) { die "Invalid number [$value]" unless $value =~ /^[-+]?\d+$/; $self->{_rand_seed} = $value; } return $self->{_rand_seed} || int rand 1000000; } =head2 no_memory_check Title : no_memory_check Usage : $factory->no_memory_check(1); Function: Returns : boolean (defaults to false) Args : None to get, integer to set. =cut sub no_memory_check { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [no_memory_check] prior to PhyML v3") if $self->version < 3; if ( defined($value) ) { if ($value) { $self->{_no_memory_check} = 1; } else { $self->{_no_memory_check} = 0; } } return $self->{_no_memory_check} || 0; } =head2 bootstrap Title : bootstrap Usage : $factory->bootstrap(100); Function: Set number of bootstraps Returns : Args : None to get, integer to set. =cut sub bootstrap { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [bootstrap] prior to PhyML v3") if $self->version < 3; if ( defined $value ) { die "Invalid number [$value]" unless $value =~ /^\d+$/; $self->{_bootstrap} = $value; } return $self->{_bootstrap}; } =head2 command Title : command Usage : $factory->command(...); Function: Returns : string Args : None to get, integer to set. =cut sub command { my ( $self, $value ) = @_; if ( defined($value) ) { if ($value =~ /$PROGRAM_NAME/ ) { $self->{_command} = $value; } else { $self->throw("$value is not a $PROGRAM_NAME command"); } } return $self->{_command} || ''; } =head2 Internal methods These methods are private and should not be called outside this class. =cut sub _run { my ( $self, $file ) = @_; my $exe = $self->executable || return; my $command; my $output_stat_file; if ( $self->version >= 3 ) { $command = $exe . " -i $file" . $self->_setparams; $output_stat_file = '_phyml_stats.txt'; } else { $command = $exe . " $file " . $self->arguments . $self->_setparams; $output_stat_file = '_phyml_stat.txt'; } $self->command($command); $self->debug("Phyml command = $command\n"); `$command`; # stats { my $stat_file = $file . $output_stat_file; open( my $FH_STAT, "<", $stat_file ) || $self->throw( "Phyml call ($command) did not give an output [$stat_file]: $?"); local $/; $self->{_stats} .= <$FH_STAT>; } #print $self->{stats}; # tree my $tree_file = $file . '_phyml_tree.txt'; { open( my $FH_TREE, "<", $tree_file ) || $self->throw("Phyml call ($command) did not give an output: $?"); local $/; $self->{_tree} .= <$FH_TREE>; } open( my $FH_TREE, "<", $tree_file ) || $self->throw("Phyml call ($command) did not give an output: $?"); my $treeio = Bio::TreeIO->new( -format => 'nhx', -fh => $FH_TREE ); my $tree = $treeio->next_tree; # could be faster to parse the tree only if needed? return $tree; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : none =cut sub _setparams { my $self = shift; my $param_string; if ( $self->version >= 3 ) { # version 3 or higher $param_string = ' -d ' . $self->data_type; $param_string .= ' -q ' if $self->data_format eq 's'; $param_string .= ' -n ' . $self->dataset_count if $self->dataset_count > 1; $param_string .= ' -b ' . $self->bootstrap if $self->bootstrap; # $param_string .= ' 0'; # no bootstrap sets $param_string .= ' -m ' . $self->model; $param_string .= ' -f ' . $self->freq if $self->freq; if ( $self->data_type eq 'dna' ) { $param_string .= ' -t ' . $self->kappa; } $param_string .= ' -v ' . $self->invar; $param_string .= ' -c ' . $self->category_number; $param_string .= ' -a ' . $self->alpha; $param_string .= ' -u ' . $self->tree if $self->tree ne 'BIONJ'; $param_string .= ' -o ' . $self->opt if $self->opt; $param_string .= ' -s ' . $self->search; if ( $self->search eq 'SPR' ) { $param_string .= ' --rand_start ' if $self->rand_start; $param_string .= ' --n_rand_starts ' . $self->rand_starts if $self->rand_starts; $param_string .= ' --r_seed ' . $self->rand_seed; } $param_string .= ' --no_memory_check ' if $self->no_memory_check; } else { # version 2 $param_string = ' ' . $self->data_type; $param_string .= ' ' . $self->data_format; $param_string .= ' ' . $self->dataset_count; $param_string .= ' 0'; # no bootstrap sets $param_string .= ' ' . $self->model; unless ( $self->data_type ) { # only for DNA $param_string .= ' ' . $self->kappa; } $param_string .= ' ' . $self->invar; $param_string .= ' ' . $self->category_number; $param_string .= ' ' . $self->alpha; $param_string .= ' ' . $self->tree; $param_string .= ' ' . $self->opt_topology; $param_string .= ' ' . $self->opt_lengths; } return $param_string; } =head2 _write_phylip_align_file Title : _write_phylip_align_file Usage : obj->__write_phylip_align_file($aln) Function: Internal (not to be used directly) Writes the alignment into the tmp directory in PHYLIP interlieved format Returns : filename Args : Bio::Align::AlignI =cut sub _write_phylip_align_file { my ( $self, $align ) = @_; my $tempfile = File::Spec->catfile( $self->tempdir, "aln$$.phylip" ); $self->data_format('i'); my $out = Bio::AlignIO->new( -file => ">$tempfile", -format => 'phylip', -interleaved => 1, -longid => 1 ); $out->write_aln($align); $out->close(); $out = undef; return $tempfile; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/QuickTree.pm000066400000000000000000000207001342734133000240430ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::QuickTree # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::QuickTree - Wrapper for rapid reconstruction of phylogenies using QuickTree =head1 SYNOPSIS use Bio::Tools::Run::Phylo::QuickTree; # Make a QuickTree factory @params = (); $factory = Bio::Tools::Run::Phylo::QuickTree->new(@params); # Pass the factory an alignment $inputfilename = 't/data/cysprot.stockholm'; $tree = $factory->run($inputfilename); # $tree is a Bio::Tree::Tree object. # or get a Bio::Align::AlignI (SimpleAlign) object from somewhere $tree = $factory->run($aln); =head1 DESCRIPTION This is a wrapper for running the QuickTree application by Kevin Howe. You can download it here: http://www.sanger.ac.uk/Software/analysis/quicktree/ Currently only input with alignments and output of trees is supported. (Ie. no support for distance matrix in/out.) You will need to enable this QuickTree wrapper to find the quicktree program. This can be done in (at least) three ways: 1. Make sure the QuickTree executable is in your path. 2. Define an environmental variable QUICKTREEDIR which is a directory which contains the 'quicktree' application: In bash: export QUICKTREEDIR=/home/username/quicktree_1.1/bin In csh/tcsh: setenv QUICKTREEDIR /home/username/quicktree_1.1/bin 3. Include a definition of an environmental variable QUICKTREEDIR in every script that will use this QuickTree wrapper module, e.g.: BEGIN { $ENV{QUICKTREEDIR} = '/home/username/quicktree_1.1/bin' } use Bio::Tools::Run::Phylo::QuickTree; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::QuickTree; use strict; use Bio::AlignIO; use Bio::TreeIO; use base qw(Bio::Tools::Run::WrapperBase); our $PROGRAM_NAME = 'quicktree'; our $PROGRAM_DIR = $ENV{'QUICKTREEDIR'}; =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 new Title : new Usage : $factory = Bio::Tools::Run::Phylo::QuickTree->new(@params) Function: creates a new QuickTree factory Returns : Bio::Tools::Run::Phylo::QuickTree Args : Optionally, provide any of the following (default in []): -upgma => boolean # Use the UPGMA method to construct the tree [0] -kimura => boolean # Use the kimura translation for pairwise # distances [0] -boot => int # Calculate bootstrap values with n iterations [0] =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); # for consistency with other run modules, allow params to be dashless my %args = @args; while (my ($key, $val) = each %args) { if ($key !~ /^-/) { delete $args{$key}; $args{'-'.$key} = $val; } } my ($upgma, $kimura, $boot) = $self->_rearrange([qw(UPGMA KIMURA BOOT)], %args); $self->upgma(1) if $upgma; $self->kimura(1) if $kimura; $self->boot($boot) if $boot; return $self; } =head2 upgma Title : upgma Usage : $factory->upgma(1); Function: Choose to use the UPGMA method to construct the tree. Returns : boolean (default 0) Args : None to get, boolean to set. =cut sub upgma { my ($self, $bool) = @_; if (defined ($bool)) { $self->{upgma} = $bool; } return $self->{upgma} || 0; } =head2 kimura Title : kimura Usage : $factory->kimura(1); Function: Choose to use the kimura translation for pairwise distances. Returns : boolean (default 0) Args : None to get, boolean to set. =cut sub kimura { my ($self, $bool) = @_; if (defined ($bool)) { $self->{kimura} = $bool; } return $self->{kimura} || 0; } =head2 boot Title : boot Usage : $factory->boot(100); Function: Choose to calculate bootstrap values with the supplied number of iterations. Returns : int (default 0) Args : None to get, int to set. =cut sub boot { my ($self, $int) = @_; if (defined ($int)) { $self->{boot} = $int; } return $self->{boot} || 0; } =head2 run Title : run Usage : $factory->run($stockholm_file); $factory->run($align_object); Function: Runs QuickTree to generate a tree Returns : Bio::Tree::Tree object Args : file name for your input alignment in stockholm format, OR Bio::Align::AlignI compliant object (eg. Bio::SimpleAlign). =cut sub run { my ($self, $in) = @_; if (ref $in && $in->isa("Bio::Align::AlignI")) { $in = $self->_writeAlignFile($in); } elsif (! -e $in) { $self->throw("When not supplying a Bio::Align::AlignI object, you must supply a readable filename"); } return $self->_run($in); } sub _run { my ($self, $file)= @_; my $exe = $self->executable || return; my $param_str = $self->arguments." ".$self->_setparams; my $command = $exe." $param_str ".$file; $self->debug("QuickTree command = $command"); open(my $result, "$command |") || $self->throw("QuickTree call ($command) crashed: $?"); my $treeio = Bio::TreeIO->new(-format => 'nhx', -fh => $result); my $tree = $treeio->next_tree; close($result); # if bootstraps were enabled, the bootstraps are the ids; convert to # bootstrap and no id if ($self->boot) { my @nodes = $tree->get_nodes; my %non_internal = map { $_ => 1 } ($tree->get_leaf_nodes, $tree->get_root_node); foreach my $node (@nodes) { next if exists $non_internal{$node}; $node->bootstrap && next; # protect ourselves incase the parser improves $node->bootstrap($node->id); $node->id(''); } } return $tree; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : none =cut sub _setparams { my $self = shift; my $param_string = '-in a -out t'; $param_string .= ' -upgma' if $self->upgma; $param_string .= ' -kimura' if $self->kimura; $param_string .= ' -boot '.$self->boot if $self->boot; return $param_string; } =head2 _writeAlignFile Title : _writeAlignFile Usage : obj->_writeAlignFile($seq) Function: Internal(not to be used directly) Returns : filename Args : Bio::Align::AlignI =cut sub _writeAlignFile{ my ($self, $align) = @_; my ($tfh, $tempfile) = $self->io->tempfile(-dir=>$self->tempdir); my $out = Bio::AlignIO->new('-fh' => $tfh, '-format' => 'stockholm'); $out->write_aln($align); $out->close(); $out = undef; close($tfh); undef $tfh; return $tempfile; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Raxml.pm000066400000000000000000000257241342734133000232450ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::Raxml # # Please direct questions and support issues to # # Copyright Brian Osborne # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Raxml =head1 SYNOPSIS # Build a Raxml factory $factory = Bio::Tools::Run::Phylo::Raxml->new(-p => 100); # Get an alignment my $alignio = Bio::AlignIO->new( -format => 'fasta', -file => '219877.cdna.fasta'); my $alnobj = $alignio->next_aln; # Analyze the aligment and get a Tree my $tree = $factory->run($alnobj); =head1 DESCRIPTION Get a Bio::Tree object using raxml given a protein or DNA alignment. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I Do not contact the module maintainer directly. Many experienced experts at bioperl-l will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Brian Osborne Email briano@bioteam.net =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::Raxml; use strict; use File::Basename; use File::Spec; use Bio::Seq; use Bio::SeqIO; use Bio::TreeIO; use Bio::AlignIO; use Bio::Root::IO; use Cwd; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); our @Raxml_PARAMS = qw(s n m a A b B c e E f g G i I J o p P q r R S t T w W x z N); our @Raxml_SWITCHES = qw(SSE3 PTHREADS PTHREADS-SSE3 HYBRID HYBRID-SSE3 F h k K M j U v X y C d D); our $PROGRAM_NAME = 'raxml'; # Specify some model if none is specified my $DEFAULTAAMODEL = 'PROTCATDAYHOFF'; my $DEFAULTNTMODEL = 'GTRCAT'; =head2 new Title : new Usage : my $treebuilder = Bio::Tools::Run::Phylo::Raxml->new(); Function: Constructor Returns : Bio::Tools::Run::Phylo::Raxml Args : Same as those used to run raxml. For example: $factory = Bio::Tools::Run::Phylo::Raxml->new(-p => 100, -SSE3 => 1) =cut sub new { my ( $class, @args ) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args( \@args, -case_sensitive => 1, -methods => [ @Raxml_PARAMS, @Raxml_SWITCHES ], -create => 1 ); my ($out,$quiet) = $self->SUPER::_rearrange( [qw(OUTFILE_NAME QUIET)], @args ); $self->outfile_name( $out || '' ); $self->quiet( $quiet ) if $quiet; $self; } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory Returns: string Args : =cut sub program_dir { undef; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string { my ( $self, $value ) = @_; $self->{'error_string'} = $value if ( defined $value ); $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe -v 2>&1`; $string =~ /raxml\s+version\s+([\d\.]+)/i; return $1 || undef; } =head2 quiet Title : quiet Usage : Function: get or set value for 'quiet' Example : Returns : Args : the value =cut sub quiet { my ( $self, $value ) = @_; $self->{'_quiet'} = $value if ( defined $value ); $self->{'_quiet'}; } =head2 run Title : run Usage : $factory->run($stockholm_file) OR $factory->run($align_object) Function: Runs Raxml to generate a tree Returns : Bio::Tree::Tree object Args : File name for your input alignment in stockholm format, OR Bio::Align::AlignI compliant object (eg. Bio::SimpleAlign). =cut sub run { my ($self, $in) = @_; if (ref $in && $in->isa("Bio::Align::AlignI")) { $in = $self->_write_alignfile($in); } elsif (! -e $in) { $self->throw("When not supplying a Bio::Align::AlignI object, you must supply a readable filename"); } $self->_run($in); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: Runs the application Returns : Tree object Args : Alignment file name =cut sub _run { my ( $self, $file ) = @_; my $exe = $self->executable || return; my $param_str = $self->arguments . " " . $self->_setparams($file); my $command = "$exe $param_str"; $self->debug("Raxml command = $command"); my $status = system($command); # raxml creates tree files with names like "RAxML_bestTree.ABDBxjjdfg3" # if rapid bootstrapping was enabled, also a tree with RAxML_bipartitions.ABDBxjjdfg3 # with support values is created, which then should be returned my $outfile = $self->f() eq 'a' ? 'RAxML_bipartitions.' : 'RAxML_bestTree.'; $outfile .= $self->outfile_name; $outfile = File::Spec->catfile( ($self->w), $outfile ) if $self->w; if ( !-e $outfile || -z $outfile ) { $self->warn("Raxml call had status of $status: $? [command $command] \n"); return undef; } my $treeio = Bio::TreeIO->new( -file => $outfile ); my $tree = $treeio->next_tree; # if bootstraps were enabled, the bootstraps are the ids; convert to # bootstrap and no id # if ($self->boot) { # my @nodes = $tree->get_nodes; # my %non_internal = map { $_ => 1 } ($tree->get_leaf_nodes, $tree->get_root_node); # foreach my $node (@nodes) { # next if exists $non_internal{$node}; # $node->bootstrap && next; # protect ourselves incase the parser improves # $node->bootstrap($node->id); # $node->id(''); # } # } $tree; } =head2 _write_alignfile Title : _write_alignfile Usage : Internal function, not to be called directly Function: Create an alignment file Returns : filename Args : Bio::Align::AlignI =cut sub _write_alignfile { my ( $self, $align ) = @_; my ( $tfh, $tempfile ) = $self->io->tempfile( -dir => '.' ); my $out = Bio::AlignIO->new( -file => ">$tempfile", -format => 'phylip' ); $out->write_aln($align); $out->close(); undef($out); close($tfh); undef($tfh); die "Alignment file $tempfile was not created" if ( ! -e $tempfile ); $tempfile; } =head2 _alphabet Title : _alphabet Usage : my $alphabet = $self->_alphabet; Function: Get the alphabet of the input alignment, defaults to 'dna' Returns : 'dna' or 'protein' Args : Alignment file =cut sub _alphabet { my ( $self, $file ) = @_; if ($file) { if ( -e $file ) { my $in = Bio::AlignIO->new( -file => $file ); my $aln = $in->next_aln; # arbitrary, the first one my $seq = $aln->get_seq_by_pos(1); my $alphabet = $seq->alphabet; $self->{_alphabet} = $alphabet; } else { die "File $file can not be found"; } } # default is 'dna' return $self->{'_alphabet'} || 'dna'; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for Raxml program Example : Returns : parameter string to be passed to Raxml Args : name of calling object =cut sub _setparams { my ( $self, $infile ) = @_; my $param_string = ''; # If 'model' is not set with '-m' check the alphabet of the input, # then specify the default model if ( !$self->m ) { my $model = ( $self->_alphabet($infile) eq 'dna' ) ? $DEFAULTNTMODEL : $DEFAULTAAMODEL; $self->m($model); } # Set default output file if no explicit output file has been given. # Raxml insists that the output file name not contain '/' and its # output directory is set using the '-w' argument. if ( !$self->outfile_name ) { my $dir = getcwd(); $self->w($dir); my ( $tfh, $outfile ) = $self->io->tempfile( -dir => $dir ); close($tfh); undef $tfh; $outfile = basename($outfile); $self->outfile_name($outfile); } for my $attr (@Raxml_PARAMS) { my $value = $self->$attr(); next unless ( defined $value ); $param_string .= ' -' . $attr . ' ' . $value . ' '; } for my $attr (@Raxml_SWITCHES) { my $value = $self->$attr(); next unless ($value); $param_string .= ' -' . $attr . ' '; } $param_string .= "-s $infile -n " . $self->outfile_name; my $null = File::Spec->devnull(); $param_string .= " > $null 2> $null" if ( $self->quiet() || $self->verbose < 0 ); $param_string; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $Raxml->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $Raxml->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy __END__ bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/SLR.pm000066400000000000000000000637161342734133000226250ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::SLR # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::SLR - Wrapper around the SLR program =head1 SYNOPSIS use Bio::Tools::Run::Phylo::SLR; use Bio::AlignIO; use Bio::TreeIO; use Bio::SimpleAlign; my $alignio = Bio::AlignIO->new (-format => 'fasta', -file => 't/data/219877.cdna.fasta'); my $aln = $alignio->next_aln; my $treeio = Bio::TreeIO->new (-format => 'newick', -file => 't/data/219877.tree'); my $tree = $treeio->next_tree; my $slr = Bio::Tools::Run::Phylo::SLR->new(); $slr->alignment($aln); $slr->tree($tree); # $rc = 1 for success, 0 for errors my ($rc,$results) = $slr->run(); my $positive_sites = $results->{'positive'}; print "# Site\tNeutral\tOptimal\tOmega\t", "lower\tupper\tLRT_Stat\tPval\tAdj.Pval\tResult\tNote\n"; foreach my $positive_site (@$positive_sites) { print $positive_site->[0], "\t", $positive_site->[1], "\t", $positive_site->[2], "\t", $positive_site->[3], "\t", $positive_site->[4], "\t", $positive_site->[5], "\t", $positive_site->[6], "\t", $positive_site->[7], "\t", $positive_site->[8], "\t", "positive\n"; } =head1 DESCRIPTION This is a wrapper around the SLR program. See http://www.ebi.ac.uk/goldman/SLR/ for more information. This module is more about generating the proper ctl file and will run the program in a separate temporary directory to avoid creating temp files all over the place. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut #' keep my emacs happy # Let the code begin... package Bio::Tools::Run::Phylo::SLR; use vars qw(@ISA %VALIDVALUES $MINNAMELEN $PROGRAMNAME $PROGRAM); use strict; use Bio::Root::Root; use Bio::AlignIO; use Bio::TreeIO; use Bio::SimpleAlign; use Bio::Tools::Run::WrapperBase; use Cwd; use File::Spec; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); =head2 Default Values INCOMPLETE DOCUMENTATION OF ALL METHODS seqfile [incodon] File from which to read alignment of codon sequences. The file should be in PAML format. treefile [intree] File from which tree should be read. The tree should be in Nexus format outfile [slr.res] File to which results are written. If the file already exists, it will be overwritten. reoptimise [1] Should the branch lengths, omega and kappa be reoptimized? 0 - no 1 - yes. kappa [2.0] Value for kappa. If 'reoptimise' is specified, the value given will be used as am initial estimate, omega [0.1] Value for omega (dN/dS). If 'reoptimise' is specified, the value given will be used as an initial estimate. codonf [0] How codon frequencies are estimated: 0: F61/F60 Estimates used are the empirical frequencies from the data. 1: F3x4 The frequencies of nucleotides at each codon position are estimated from the data and then multiplied together to get the frequency of observing a given codon. The frequency of stop codons is set to zero, and all other frequencies scaled appropriately. 2: F1x4 Nucleotide frequencies are estimated from the data (not taking into account at which position in the codon it occurs). The nucleotide frequencies are multiplied together to get the frequency of observing and then corrected for stop codons. freqtype [0] How codon frequencies are incorporated into the substitution matrix. 0: q_{ij} = pi_{j} s_{ij} 1: q_{ij} = \sqrt(pi_j/pi_i) s_{ij} 2: q_{ij} = \pi_{n} s_{ij}, where n is the nucleotide that the subsitution is to. 3: q_{ij} = s_{ij} / pi_i Option 0 is the tradition method of incorporating equilibrium frequencies into subsitution matrices (Felsenstein 1981; Goldman and Yang, 1994) Option 1 is described by Goldman and Whelan (2002), in this case with the additional parameter set to 0.5. Option 2 was suggested by Muse and Gaut (1994). Option 3 is included as an experiment, originally suggested by Bret Larget. it does not appear to describe evolution very successfully and should not be used for analyses. Kosakovsky-Pond has repeatedly stated that he finds incorporating codon frequencies in the manner of option 2 to be superior to option 0. We find that option 1 tends to perform better than either of these options. positive_only [0] If only positively selected sites are of interest, set this to "1". Calculation will be slightly faster, but information about sites under purifying selection is lost. gencode [universal] Which genetic code to use when determining whether a given mutation is synonymous or nonsynonymous. Currently only "universal" and "mammalian" mitochondrial are supported. nucleof [0] Allow for empirical exchangabilities for nucleotide substitution. 0: No adjustment. All nucleotides treated the same, modulo transition / transversion. 1: The rate at which a substitution caused a mutation from nucleotide a to nucleotide b is adjust by a constant N_{ab}. This adjustment is in addition to other adjustments (e.g. transition / transversion or base frequencies). aminof [0] Incorporate amino acid similarity parameters into substitution matrix, adjusting omega for a change between amino acid i and amino acid j. A_{ij} is a symmetric matrix of constants representing amino acid similarities. 0: Constant omega for all amino acid changes 1: omega_{ij} = omega^{A_{ij}} 2: omega_{ij} = a_{ij} log(omega) / [ 1 - exp(-a_{ij} log(omega)) ] Option 1 has the same form as the original codon subsitution model proposed by Goldman and Yang (but with potentially different constants). Option 2 has a more population genetic derivtion, with omega being interpreted as the ratio of fixation probabilities. nucfile [nuc.dat] If nucleof is non-zero, read nucleotide substitution constants from nucfile. If this file does not exist, hard coded constants are used. aminofile [amino.dat] If aminof is non-zero, read amino acid similarity constants from aminofile. If this file does not exist, hard coded constants are used. timemem [0] Print summary of real time and CPU time used. Will eventually print summary of memory use as well. ldiff [3.841459] Twice log-likelihood difference used as a threshold for calculating support (confidence) intervals for sitewise omega estimates. This value should be the quantile from a chi-square distribution with one degree of freedom corresponding to the support required. E.g. qchisq(0.95,1) = 3.841459 0.4549364 = 50% support 1.323304 = 75% support 2.705543 = 90% support 3.841459 = 95% support 6.634897 = 99% support 7.879439 = 99.5% support 10.82757 = 99.9% support paramin [] If not blank, read in parameters from file given by the argument. paramout [] If not blank, write out parameter estimates to file given. skipsitewise [0] Skip sitewise estimation of omega. Depending on other options given, either calculate maximum likelihood or likelihood fixed at parameter values given. seed [0] Seed for random number generator. If seed is 0, then previously produced seed file (~/.rng64) is used. If this does not exist, the random number generator is initialised using the clock. saveseed [1] If non-zero, save finial seed in file (~/.rng64) to be used as initial seed in future runs of program. =head2 Results Format Results file (default: slr.res) ------------ Results are presented in nine columns Site Number of sites in alignment Neutral (minus) Log-probability of observing site given that it was evolving neutrally (omega=1) Optimal (minus) Log-probability of observing site given that it was evolving at the optimal value of omega. Omega The value of omega which maximizes the log-probability of observing LRT_Stat Log-likelihood ratio statistic for non-neutral selection (or positive selection if the positive_only option is set to 1). LRT_Stat = 2 * (Neutral-Optimal) Pval P-value for non-neutral (or positive) selection at a site, unadjusted for multiple comparisons. Adj. Pval P-value for non-neutral (or positive) selection at a site, after adjusting for multiple comparisons using the Hochberg procedure (see the file "MultipleComparisons.txt" in the doc directory). Result A simple visual guide to the result. Sites detected as having been under positive selection are marked with a '+', sites under purifying selection are marked with '-'. The number of symbols Number symbols Threshold 1 95% 2 99% 3 95% after adjustment 4 99% after adjustment Occasionally the result may also contain an exclamation mark. This indicates that the observation at a site is not significantly different from random (equivalent to infinitely strong positive selection). This may indicate that the alignment at that site is bad Note The following events are flagged: Synonymous All codons at a site code for the same amino acid. Single character Only one sequence at the site is ungapped, the result of a recent insertion for example. All gaps All sequences at a site contain a gap character. Sites marked "Single character" or "All gaps" are not counted towards the number of sites for the purposes of correcting for multiple comparisons since it is not possible to detect selection from none or one observation under the assumptions made by the sitewise likelihood ratio test. =cut #' keep my emacs happy BEGIN { $MINNAMELEN = 25; $PROGRAMNAME = 'Slr_Linux_static'; if ($^O =~ /darwin/i) { $PROGRAMNAME = 'Slr_osx'; } elsif ($^O =~ /mswin/i) { $PROGRAMNAME = 'Slr_windows.exe'; } if( defined $ENV{'SLRDIR'} ) { $PROGRAM = Bio::Root::IO->catfile($ENV{'SLRDIR'},$PROGRAMNAME). ($^O =~ /mswin/i ?'_windows.exe':'');; } # valid values for parameters, the default one is always # the first one in the array # example file provided with the package %VALIDVALUES = ( 'outfile' => 'slr.res', 'reoptimise' => [ 1,0], 'kappa' => '2.0', 'omega' => '0.1', 'codonf' => [ 0, 1,2], 'freqtype' => [ 0, 1,2,3], 'positive_only' => [ 0, 1], 'gencode' => [ "universal", "mammalian"], 'nucleof' => [ 0, 1], 'aminof' => [ 0, 1,2], 'nucfile' => '', 'aminofile' => '', 'timemem' => [ 0, 1], 'ldiff' => [ 3.841459, 0.4549364,1.323304,2.705543,6.634897,7.879439,10.82757], 'paramin' => '', 'paramout' => '', 'skipsitewise' => [ 0, 1], 'seed' => [0], 'saveseed' => [ 1, 0] ); } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAMNAME; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{SLRDIR}) if $ENV{SLRDIR}; } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::SLR->new(); Function: Builds a new Bio::Tools::Run::Phylo::SLR object Returns : Bio::Tools::Run::Phylo::SLR Args : -alignment => the Bio::Align::AlignI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -tree => the Bio::Tree::TreeI object -params => a hashref of SLR parameters (all passed to set_parameter) -executable => where the SLR executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($aln, $tree, $st, $params, $exe, $ubl) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES PARAMS EXECUTABLE)], @args); defined $aln && $self->alignment($aln); defined $tree && $self->tree($tree); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); $self->set_default_parameters(); if( defined $params ) { if( ref($params) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { map { $self->set_parameter($_, $$params{$_}) } keys %$params; } } return $self; } =head2 prepare Title : prepare Usage : my $rundir = $slr->prepare($aln); Function: prepare the SLR analysis using the default or updated parameters the alignment parameter must have been set Returns : value of rundir Args : L object, L object =cut sub prepare{ my ($self,$aln,$tree) = @_; unless ( $self->save_tempfiles ) { # brush so we don't get plaque buildup ;) $self->cleanup(); } $tree = $self->tree unless $tree; $aln = $self->alignment unless $aln; if( ! $aln ) { $self->warn("must have supplied a valid alignment file in order to run SLR"); return 0; } if( ! $tree ) { $self->warn("must have supplied a valid tree file in order to run SLR"); return 0; } my ($tempdir) = $self->tempdir(); my ($tempseqFH,$tempseqfile); # Reorder the alignment according to the tree my $ct = 1; my %order; foreach my $node ($tree->get_leaf_nodes) { $order{$node->id_output} = $ct++; } my @seq; my @ids; foreach my $seq ( $aln->each_seq() ) { push @seq, $seq; push @ids, $seq->display_id; } # use the map-sort-map idiom: my @sorted = map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq; my $sorted_aln = Bio::SimpleAlign->new(); foreach (@sorted) { $sorted_aln->add_seq($_); } # Rename the leaf nodes in the tree from 1 to n $ct = 1; foreach my $node ($tree->get_leaf_nodes) { $node->id($ct++); } ($tempseqFH,$tempseqfile) = $self->io->tempfile ('-dir' => $tempdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $alnout = Bio::AlignIO->new('-format' => 'phylip', '-fh' => $tempseqFH, '-interleaved' => 0, '-idlinebreak' => 1, '-idlength' => $MINNAMELEN > $aln->maxdisplayname_length() ? $MINNAMELEN : $aln->maxdisplayname_length() +1); $alnout->write_aln($sorted_aln); $alnout->close(); undef $alnout; close($tempseqFH); my ($temptreeFH,$temptreefile); ($temptreeFH,$temptreefile) = $self->io->tempfile ('-dir' => $tempdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $treeout = Bio::TreeIO->new('-format' => 'newick', '-fh' => $temptreeFH); # We need to add a line with the num of leaves ($ct-1) and the # num of trees (1) $treeout->_print(sprintf("%d 1\n",($ct-1))); $treeout->write_tree($tree); $treeout->close(); close($temptreeFH); # now let's print the ctl file. # many of the these programs are finicky about what the filename is # and won't even run without the properly named file. my ($treevolume,$treedirectories,$treefile) = File::Spec->splitpath( $temptreefile ); my ($alnvolume,$alndirectories,$alnfile) = File::Spec->splitpath( $tempseqfile ); my $slr_ctl = "$tempdir/slr.ctl"; open(SLR, ">$slr_ctl") or $self->throw("cannot open $slr_ctl for writing"); print SLR "seqfile\: $alnfile\n"; print SLR "treefile\: $treefile\n"; my $outfile = $self->outfile_name; print SLR "outfile\: $outfile\n"; my %params = $self->get_parameters; while( my ($param,$val) = each %params ) { next if $param eq 'outfile'; print SLR "$param\: $val\n"; } close(SLR); return $tempdir; } =head2 run Title : run Usage : my ($rc,$parser) = $slr->run($aln,$tree); Function: run the SLR analysis using the default or updated parameters the alignment parameter must have been set Returns : Return code, L Args : L object, L object =cut sub run { my ($self) = shift;; my $outfile = $self->outfile_name; my $tmpdir = $self->prepare(@_); #my ($rc,$parser) = (1); my ($rc,$results) = (1); { my $cwd = cwd(); my $exit_status; chdir($tmpdir); my $slrexe = $self->executable(); $self->throw("unable to find or run executable for SLR") unless $slrexe && -e $slrexe && -x _; my $run; open($run, "$slrexe |") or $self->throw("Cannot open exe $slrexe"); my @output = <$run>; $exit_status = close($run); $self->error_string(join('',@output)); if( (grep { /\berr(or)?: /io } @output) || !$exit_status) { $self->warn("There was an error - see error_string for the program output"); $rc = 0; } eval { open RESULTS, "$tmpdir/$outfile" or die "couldnt open results file: $!\n"; my $okay = 0; my $sites; my $type = 'default'; while () { chomp $_; if ( /^\#/ ) {next;} if ( /\!/ ) {$type = 'random';} # random is last elsif ( /\+/ ) {$type = 'positive';} elsif ( /\-\s+/ ) {$type = 'negative';} elsif ( /Constant/ ) {$type = 'constant';} elsif ( /All gaps/ ) {$type = 'all_gaps';} elsif ( /Single character/ ) {$type = 'single_character';} elsif ( /Synonymous/ ) {$type = 'synonymous';} else {$type = 'default'} if ( /^\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/ ) { push @{$sites->{$type}}, [$1,$2,$3,$4,$5,$6,$7,$8,$9]; } else { $DB::single=1;1; } } $results = $sites; close RESULTS; # TODO: we could have a proper parser object # $parser = Bio::Tools::Phylo::SLR->new(-file => "$tmpdir/$outfile", # -dir => "$tmpdir"); }; if( $@ ) { $self->warn($self->error_string); } chdir($cwd); } # return ($rc,$parser); return ($rc,$results); } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 alignment Title : alignment Usage : $slr->align($aln); Function: Get/Set the L object Returns : L object Args : [optional] L Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub alignment{ my ($self,$aln) = @_; if( defined $aln ) { if( -e $aln ) { $self->{'_alignment'} = $aln; } elsif( !ref($aln) || ! $aln->isa('Bio::Align::AlignI') ) { $self->warn("Must specify a valid Bio::Align::AlignI object to the alignment function not $aln"); return undef; } else { $self->{'_alignment'} = $aln; } } return $self->{'_alignment'}; } =head2 tree Title : tree Usage : $slr->tree($tree, %params); Function: Get/Set the L object Returns : L Args : [optional] $tree => L, Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub tree { my ($self, $tree, %params) = @_; if( defined $tree ) { if( ! ref($tree) || ! $tree->isa('Bio::Tree::TreeI') ) { $self->warn("Must specify a valid Bio::Tree::TreeI object to the alignment function"); } $self->{'_tree'} = $tree; } return $self->{'_tree'}; } =head2 get_parameters Title : get_parameters Usage : my %params = $self->get_parameters(); Function: returns the list of parameters as a hash Returns : associative array keyed on parameter names Args : none =cut sub get_parameters{ my ($self) = @_; # we're returning a copy of this return %{ $self->{'_slrparams'} }; } =head2 set_parameter Title : set_parameter Usage : $slr->set_parameter($param,$val); Function: Sets a SLR parameter, will be validated against the valid values as set in the %VALIDVALUES class variable. The checks can be ignored if one turns off param checks like this: $slr->no_param_checks(1) Returns : boolean if set was success, if verbose is set to -1 then no warning will be reported Args : $param => name of the parameter $value => value to set the parameter to See also: L =cut sub set_parameter{ my ($self,$param,$value) = @_; unless (defined $self->{'no_param_checks'} && $self->{'no_param_checks'} == 1) { if ( ! defined $VALIDVALUES{$param} ) { $self->warn("unknown parameter $param will not be set unless you force by setting no_param_checks to true"); return 0; } if ( ref( $VALIDVALUES{$param}) =~ /ARRAY/i && scalar @{$VALIDVALUES{$param}} > 0 ) { unless ( grep { $value eq $_ } @{ $VALIDVALUES{$param} } ) { $self->warn("parameter $param specified value $value is not recognized, please see the documentation and the code for this module or set the no_param_checks to a true value"); return 0; } } } $self->{'_slrparams'}->{$param} = $value; return 1; } =head2 set_default_parameters Title : set_default_parameters Usage : $slr->set_default_parameters(0); Function: (Re)set the default parameters from the defaults (the first value in each array in the %VALIDVALUES class variable) Returns : none Args : boolean: keep existing parameter values =cut sub set_default_parameters{ my ($self,$keepold) = @_; $keepold = 0 unless defined $keepold; while( my ($param,$val) = each %VALIDVALUES ) { # skip if we want to keep old values and it is already set next if( defined $self->{'_slrparams'}->{$param} && $keepold); if(ref($val)=~/ARRAY/i ) { $self->{'_slrparams'}->{$param} = $val->[0]; } else { $self->{'_slrparams'}->{$param} = $val; } } } =head1 Bio::Tools::Run::WrapperBase methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut sub no_param_checks{ my ($self,$value) = @_; if( defined $value) { $self->{'no_param_checks'} = $value; } return $self->{'no_param_checks'}; } =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $slr->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut sub outfile_name { my $self = shift; if( @_ ) { return $self->{'_slrparams'}->{'outfile'} = shift @_; } unless (defined $self->{'_slrparams'}->{'outfile'}) { $self->{'_slrparams'}->{'outfile'} = 'out.res'; } return $self->{'_slrparams'}->{'outfile'}; } =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $slr->cleanup(); Function: Will cleanup the tempdir directory after an SLR run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut sub DESTROY { my $self= shift; unless ( $self->save_tempfiles ) { $self->cleanup(); } $self->SUPER::DESTROY(); } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Phylo/Semphy.pm000077500000000000000000000245601342734133000234270ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::Semphy # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Semphy - Wrapper for Semphy =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Semphy; # Make a Semphy factory $factory = Bio::Tools::Run::Phylo::Semphy->new(); # Run Semphy with an alignment my $tree = $factory->run($alignfilename); # or with alignment object $tree = $factory->run($bio_simplalign); # you can supply an initial tree as well, which can be a newick tree file, # Bio::Tree::Tree object... $tree = $factory->run($bio_simplalign, $tree_obj); # ... or Bio::DB::Taxonomy object $tree = $factory->run($bio_simplalign, $bio_db_taxonomy); # (mixtures of all the above are possible) # $tree isa Bio::Tree::Tree =head1 DESCRIPTION This is a wrapper for running the Semphy application by N. Friedman et a.. You can get details here: http://compbio.cs.huji.ac.il/semphy/. Semphy is used for phylogenetic reconstruction (making a tree with branch lengths from an aligned set of input sequences). You can try supplying normal Semphy command-line arguments to new(), eg. new(-hky => 1) or calling arg-named methods (excluding the initial hyphen(s), eg. $factory->hky(1) to set the --hky switch to true). Note that Semphy args are case-sensitive. To distinguish between Bioperl's -verbose and the Semphy's --verbose, you must set Semphy's verbosity with -semphy_verbose or the semphy_verbose() method. You will need to enable this Semphy wrapper to find the Semphy program. This can be done in (at least) three ways: 1. Make sure the Semphy executable is in your path. 2. Define an environmental variable SEMPHYDIR which is a directory which contains the Semphy application: In bash: export SEMPHYDIR=/home/username/semphy/ In csh/tcsh: setenv SEMPHYDIR /home/username/semphy 3. Include a definition of an environmental variable SEMPHYDIR in every script that will use this Semphy wrapper module, e.g.: BEGIN { $ENV{SEMPHYDIR} = '/home/username/semphy/' } use Bio::Tools::Run::Phylo::Semphy; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::Semphy; use strict; use File::Spec; use Bio::AlignIO; use Bio::TreeIO; use base qw(Bio::Tools::Run::Phylo::PhyloBase); our $PROGRAM_NAME = 'semphy'; our $PROGRAM_DIR = $ENV{'SEMPHYDIR'}; # methods for the semphy args we support our %PARAMS = (outputfile => 'o', treeoutputfile => 'T', constraint => 'c', gaps => 'g', seed => 'r', Logfile => 'l', alphabet => 'a', ratio => 'z', ACGprob => 'p', BPrepeats => 'BPrepeats', BPconsensus => 'BPconsensus', SEMPHY => 'S', modelfile => 'modelfile', alpha => 'A', categories => 'C', semphy_verbose => 'semphy_verbose'); our %SWITCHES = (homogeneousRatesDTME => 'homogeneousRatesDTME', NJ => 'J', pairwiseGammaDTME => 'pairwiseGammaDTME', commonAlphaDTME => 'commonAlphaDTME', rate4siteDTME => 'rate4siteDTME', posteriorDTME => 'posteriorDTME', BPonUserTree => 'BPonUserTree', nucjc => 'nucjc', aaJC => 'aaJC', k2p => 'k2p', hky => 'hky', day => 'day', jtt => 'jtt', rev => 'rev', wag => 'wag', cprev => 'cprev', homogeneous => 'H', optimizeAlpha => 'O', bbl => 'n', likelihood => 'L', PerPosLike => 'P', PerPosPosterior => 'B', rate => 'R'); # just to be explicit, args we don't support (yet) or we handle ourselves our @UNSUPPORTED = qw(h help full-help s sequence t tree); =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 new Title : new Usage : $factory = Bio::Tools::Run::Phylo::Semphy->new() Function: creates a new Semphy factory Returns : Bio::Tools::Run::Phylo::Semphy Args : Most options understood by Semphy can be supplied as key => value pairs, with a true value for switches. These options can NOT be used with this wrapper (they are handled internally or don't make sense in this context): -h | --help | --fill-help -s | --sequence -t | --tree To distinguish between Bioperl's -verbose and the Semphy's --verbose, you must set Semphy's verbosity with -semphy_verbose =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => {(map { $_ => $PARAMS{$_} } keys %PARAMS), (map { $_ => $SWITCHES{$_} } keys %SWITCHES), quiet => 'quiet'}, -create => 1, -case_sensitive => 1); return $self; } =head2 run Title : run Usage : $result = $factory->run($fasta_align_file); -or- $result = $factory->run($align_object); -or- $result = $factory->run($fasta_align_file, $newick_tree_file); -or- $result = $factory->run($align_object, $tree_object); -or- $result = $factory->run($align_object, $db_taxonomy_object); Function: Runs Semphy on an alignment. Returns : Bio::Tree::Tree Args : The first argument represents an alignment, the second (optional) argument a species tree (to set an initial tree: normally the -t option to Semphy). The alignment can be provided as a multi-fasta format alignment filename, or a Bio::Align::AlignI compliant object (eg. a Bio::SimpleAlign). The species tree can be provided as a newick format tree filename or a Bio::Tree::TreeI compliant object. Alternatively a Bio::DB::Taxonomy object can be supplied, in which case the species tree will be generated by using the alignment sequence names as species names and looking for those in the supplied database. In all cases where an initial tree was supplied, the alignment sequence names must correspond to node ids in the species tree. =cut sub run { my ($self, $aln, $tree) = @_; $aln || $self->throw("alignment must be supplied"); $self->_alignment($aln); if ($tree) { $self->_tree($tree); # check node and seq names match $self->_check_names; } return $self->_run; } sub _run { my $self = shift; my $exe = $self->executable || return; my $aln_file = $self->_write_alignment; # generate a semphy-friendly tree file my $tree = $self->_tree; my $tree_file = ''; if ($tree) { $tree = $self->_write_tree; } unless ($self->T) { my ($tfh, $tempfile) = $self->io->tempfile(-dir => $self->tempdir); $self->T($tempfile); close($tfh); } my $command = $exe.$self->_setparams($aln_file, $tree_file); $self->debug("semphy command = $command\n"); open(my $pipe, "$command |") || $self->throw("semphy call ($command) failed to start: $? | $!"); my $error = ''; while (<$pipe>) { print unless $self->quiet; $error .= $_; } close($pipe) || ($error ? $self->warn("semphy call ($command) failed: $error") : $self->throw("semphy call ($command) crashed: $?")); my $result_file = $self->T(); my $tio = Bio::TreeIO->new(-format => 'newick', -file => $result_file); my $result_tree = $tio->next_tree; return $result_tree; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : alignment and tree file names =cut sub _setparams { my ($self, $aln_file, $tree_file) = @_; my $param_string = ' -s '.$aln_file; $param_string .= ' -t '.$tree_file if $tree_file; my %methods = map { $_ => $_ } keys %PARAMS; $methods{'semphy_verbose'} = 'verbose'; $param_string .= $self->SUPER::_setparams(-params => \%methods, -switches => [keys %SWITCHES], -double_dash => 1); $param_string .= ' 2>&1'; my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 1>$null" if $self->quiet; return $param_string; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Primate.pm000066400000000000000000000272361342734133000224700ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Primate # # Please direct questions and support issues to # # Cared for by # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Wrapper for Primate, Guy Slater's near exact match finder for short sequence tags. =head1 SYNOPSIS use Bio::Tools::Run::Primate; use Bio::SeqIO; my $query = "primer.fa"; my $target = "contig.fa"; my @params = ("query" => $query,"target" => $target,"m"=>0); my $fact = Bio::Tools::Run::Primate->new(@params); my @feat = $fact->run; foreach my $feat(@feat) { print $feat->seqname."\t".$feat->primary_tag."\t".$feat->start. "\t".$feat->end."\t".$feat->strand."\t".$feat->seq->seq."\n"; } =head1 DESCRIPTION Primate is available under to ensembl-nci package at http://cvsweb.sanger.ac.uk/cgi-bin/cvsweb.cgi/ensembl-nci/?cvsroot=Ensembl =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Primate; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR @PRIMATE_PARAMS $PROGRAMNAME @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::SeqIO; use Bio::SeqFeature::Generic; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { @PRIMATE_PARAMS = qw(V Q T M B QUERY TARGET OUTFILE PROGRAM EXECUTABLE); @OTHER_SWITCHES = qw(QUIET VERBOSE); # Authorize attribute fields foreach my $attr ( @PRIMATE_PARAMS,@OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'primate'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PRIMATEDIR}) if $ENV{PRIMATEDIR}; } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Primate->new() Function: Builds a new Bio::Tools::Run::Primate objet Returns : Bio::Tools::Run::Primate Args : query => the L object or a file path target => the L object or a file path m => the number of mismatches allowed, default 1(integer) b => [TRUE|FALSE] find best match, default FALSE executable=>where the program sits =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if($attr =~/^q$/i){ $self->query($value); } if($attr =~/^t$/i){ $self->target($value); } $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 version Title : version Usage : $primate->version Function: Determine the version number of the program Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe = $self->executable(); return undef unless defined $exe; my $string = `$exe -v ` ; $string =~ /\(([\d.]+)\)/; return $1 || undef; } =head2 search Title : search Usage : DEPRECATED. Use $factory->run() instead Function: Perform a primate search Returns : Array of L Args : =cut sub search { return shift->run(@_); } =head2 run Title : run Usage : @feat = $factory->run(); Function: Perform a primate search Returns : Array of L Args : =cut sub run{ my ($self,$target) = @_; $target = $target ||$self->target; $target || $self->throw("Need a target sequence"); $self->query || $self->throw("Need a query sequence"); # Create input file pointer my ($query_file,$target_file)= $self->_setinput($self->query,$target); if (!($query_file && $target_file)) {$self->throw("Unable to create temp files for query and target !");} # Create parameter string to pass to primate program my $param_string = $self->_setparams(); # run primate my @feats= $self->_run($query_file,$target_file,$param_string); return @feats; } ################################################# #INTERNAL METHODS =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to dba program Returns : array of L Args : path to query and target file and parameter string =cut sub _run { my ($self,$query_file,$target_file,$param_string) = @_; my $instring; $self->debug( "Program ".$self->executable."\n"); my ($tfh,$outfile) = $self->io->tempfile(-dir=>$self->tempdir); close($tfh); # this is to make sure we don't have # open filehandles undef $tfh; my $commandstring = $self->executable. " $param_string -q $query_file -t $target_file > $outfile"; $self->debug( "primate command = $commandstring"); my $status = system($commandstring); $self->throw( "primate call ($commandstring) crashed: $? \n") unless $status==0; #parse pff format and return a Bio::Search::HSP::GenericHSP array my @feats = $self->_parse_results($outfile); return @feats; } =head2 _parse_results Title : _parse_results Usage : Internal function, not to be called directly Function: Passes primate output Returns : array of L Args : the name of the output file =cut sub _parse_results { my ($self,$outfile) = @_; $outfile||$self->throw("No outfile specified"); my @feats; my %query = $self->_query_seq(); open(OUT,$outfile); while(my $entry = ){ chomp($entry); if($entry =~ /primate/ ) { my ($dummy,$tagname, $seqname, $strand,$seq_end,$mismatch) = split(" " , $entry ); #map primate coordinates to Seq coordinates my $seq_start = $seq_end- length($query{$tagname})+2; $seq_end++; my $feature = Bio::SeqFeature::Generic->new( -seq_id => $seqname, -strand => $strand, -score => $mismatch, -start => $seq_start, -end => $seq_end, -frame => 1, -source => 'primate', -primary => $tagname); $feature->attach_seq($self->_target_seq); push @feats,$feature; } } return @feats; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input files for primate Returns : name of file containing query and target Args : query and target (either a filename or a L =cut sub _setinput { my ($self, $query,$target) = @_; my ($query_file,$target_file,$tfh1,$tfh2); my @query = ref ($query) eq "ARRAY" ? @{$query} : ($query); foreach my $query(@query){ if(ref($query)&& $query->isa("Bio::PrimarySeqI")){ ($tfh1,$query_file) = $self->io->tempfile(-dir=>$self->tempdir); my $out1 = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'fasta'); my %query; $query{$query->primary_id} = $query->seq; $self->_query_seq(\%query); $out1->write_seq($query) || return 0; close ($tfh1); undef $tfh1; } elsif (-e $query){ my $in = Bio::SeqIO->new(-file => $query , '-format' => 'fasta'); ($tfh1,$query_file) = $self->io->tempfile(-dir=>$self->tempdir); my $out1 = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'fasta'); my %query; while(my $seq1 = $in->next_seq()){ $out1->write_seq($seq1) || return 0; $query{$seq1->primary_id} = $seq1->seq; } close($tfh1); undef $tfh1; $self->_query_seq(\%query); } else { return 0; } } if(ref($target) && $target->isa("Bio::PrimarySeqI")){ ($tfh2,$target_file) = $self->io->tempfile(-dir=>$self->tempdir); my $out1 = Bio::SeqIO->new(-fh=> $tfh2 , '-format' => 'Fasta'); $out1->write_seq($target)|| return 0; $self->_target_seq($target); close($tfh2); undef $tfh2; } elsif (-e $target){ my $in = Bio::SeqIO->new(-file => $target , '-format' => 'fasta'); ($tfh2,$target_file) = $self->io->tempfile(-dir=>$self->tempdir); my $out = Bio::SeqIO->new(-fh=> $tfh2 , '-format' => 'fasta'); my $seq1 = $in->next_seq() || return 0; $out->write_seq($seq1); close($tfh2); undef $tfh2; $self->_target_seq($seq1); } else { return 0; } return $query_file,$target_file; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for primate program Returns : parameter string to be passed to primate Args : the param array =cut sub _setparams { my ($attr, $value, $self); $self = shift; my $param_string = ""; for $attr ( @PRIMATE_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = lc $attr; #put params in format expected by dba $attr_key = ' -'.$attr_key; if(($attr_key !~/QUERY/i) && ($attr_key !~/TARGET/i)){ $param_string .= $attr_key.' '.$value; } } if ($self->quiet() || $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " >$null "; } return $param_string; } =head2 _query_seq() Title : _query_seq Usage : Internal function, not to be called directly Function: get/set for the query sequence Returns : a hash of seq with key the query tag Args : optional =cut sub _query_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_query_seq'} = $seq; } return %{$self->{'_query_seq'}}; } =head2 _target_seq() Title : _target_seq Usage : Internal function, not to be called directly Function: get/set for the target sequence Returns : L Args : optional =cut sub _target_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_target_seq'} = $seq; } return $self->{'_target_seq'}; } 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Primer3.pm000066400000000000000000000701161342734133000224030ustar00rootroot00000000000000# # This is the original copyright statement. I have relied on Chad's module # extensively for this module. # # Copyright (c) 1997-2001 bioperl, Chad Matsalla. All Rights Reserved. # This module is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # # Copyright Chad Matsalla # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code # # But I have modified lots of it, so I guess I should add: # # Copyright (c) 2003 bioperl, Rob Edwards. All Rights Reserved. # This module is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # # Copyright Rob Edwards # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Primer3 - Create input for and work with the output from the program primer3 =head1 SYNOPSIS Bio::Tools::Primer3 creates the input files needed to design primers using primer3 and provides mechanisms to access data in the primer3 output files. This module provides a bioperl interface to the program primer3. See http://frodo.wi.mit.edu/primer3/primer3_code.html for details and to download the software. This module only works for primer3 release 1 but is not guaranteed to work with earlier versions. # design some primers. # the output will be put into temp.out use Bio::Tools::Run::Primer3; use Bio::SeqIO; my $seqio = Bio::SeqIO->new(-file=>'data/dna1.fa'); my $seq = $seqio->next_seq; my $primer3 = Bio::Tools::Run::Primer3->new(-seq => $seq, -outfile => "temp.out", -path => "/usr/bin/primer3_core"); # or after the fact you can change the program_name $primer3->program_name('my_suprefast_primer3'); unless ($primer3->executable) { print STDERR "primer3 can not be found. Is it installed?\n"; exit(-1) } # what are the arguments, and what do they mean? my $args = $primer3->arguments; print "ARGUMENT\tMEANING\n"; foreach my $key (keys %{$args}) {print "$key\t", $$args{$key}, "\n"} # set the maximum and minimum Tm of the primer $primer3->add_targets('PRIMER_MIN_TM'=>56, 'PRIMER_MAX_TM'=>90); # design the primers. This runs primer3 and returns a # Bio::Tools::Run::Primer3 object with the results $results = $primer3->run; # see the Bio::Tools::Run::Primer3 pod for # things that you can get from this. For example: print "There were ", $results->number_of_results, " primers\n"; Bio::Tools::Run::Primer3 creates the input files needed to design primers using primer3 and provides mechanisms to access data in the primer3 output files. This module provides a bioperl interface to the program primer3. See http://www-genome.wi.mit.edu/genome_software/other/primer3.html for details and to download the software. This module is based on one written by Chad Matsalla (bioinformatics1@dieselwurks.com). I have ripped some of his code, and added a lot of my own. I hope he is not mad at me! =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://www.bioperl.org/MailList.html - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Rob Edwards redwards@utmem.edu Based heavily on work of Chad Matsalla bioinformatics1@dieselwurks.com =head1 CONTRIBUTORS Shawn Hoon shawnh-at-stanford.edu Jason Stajich jason-at-bioperl.org Brian Osborne osborne1-at-optonline.net =head1 SEE ALSO L =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Primer3; use vars qw(@ISA); use strict; use Bio::Root::Root; use Bio::Tools::Primer3; use Bio::Tools::Run::WrapperBase; use File::Spec; use vars qw($AUTOLOAD @ISA @PRIMER3_PARAMS $PROGRAMNAME %OK_FIELD); @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { $PROGRAMNAME = 'primer3'; @PRIMER3_PARAMS=qw( PROGRAM EXCLUDED_REGION INCLUDED_REGION PRIMER_COMMENT PRIMER_DNA_CONC PRIMER_EXPLAIN_FLAG PRIMER_FILE_FLAG PRIMER_FIRST_BASE_INDEX PRIMER_GC_CLAMP PRIMER_INTERNAL_OLIGO_DNA_CONC PRIMER_INTERNAL_OLIGO_EXCLUDED_REGION PRIMER_INTERNAL_OLIGO_INPUT PRIMER_INTERNAL_OLIGO_MAX_GC PRIMER_INTERNAL_OLIGO_MAX_MISHYB PRIMER_INTERNAL_OLIGO_MAX_POLY_X PRIMER_INTERNAL_OLIGO_MAX_SIZE PRIMER_INTERNAL_OLIGO_MAX_TM PRIMER_INTERNAL_OLIGO_MIN_GC PRIMER_INTERNAL_OLIGO_MIN_QUALITY PRIMER_INTERNAL_OLIGO_MIN_SIZE PRIMER_INTERNAL_OLIGO_MIN_TM PRIMER_INTERNAL_OLIGO_MISHYB_LIBRARY PRIMER_INTERNAL_OLIGO_OPT_GC_PERCENT PRIMER_INTERNAL_OLIGO_OPT_SIZE PRIMER_INTERNAL_OLIGO_OPT_TM PRIMER_INTERNAL_OLIGO_SALT_CONC PRIMER_INTERNAL_OLIGO_SELF_ANY PRIMER_INTERNAL_OLIGO_SELF_END PRIMER_IO_WT_COMPL_ANY PRIMER_IO_WT_COMPL_END PRIMER_IO_WT_END_QUAL PRIMER_IO_WT_GC_PERCENT_GT PRIMER_IO_WT_GC_PERCENT_LT PRIMER_IO_WT_NUM_NS PRIMER_IO_WT_REP_SIM PRIMER_IO_WT_SEQ_QUAL PRIMER_IO_WT_SIZE_GT PRIMER_IO_WT_SIZE_LT PRIMER_IO_WT_TM_GT PRIMER_IO_WT_TM_LT PRIMER_LEFT_INPUT PRIMER_LIBERAL_BASE PRIMER_MAX_DIFF_TM PRIMER_MAX_END_STABILITY PRIMER_MAX_GC PRIMER_MAX_MISPRIMING PRIMER_MAX_POLY_X PRIMER_MAX_SIZE PRIMER_MAX_TM PRIMER_MIN_END_QUALITY PRIMER_MIN_GC PRIMER_MIN_QUALITY PRIMER_MIN_SIZE PRIMER_MIN_TM PRIMER_MISPRIMING_LIBRARY PRIMER_NUM_NS_ACCEPTED PRIMER_NUM_RETURN PRIMER_OPT_GC_PERCENT PRIMER_OPT_SIZE PRIMER_OPT_TM PRIMER_PAIR_MAX_MISPRIMING PRIMER_PAIR_WT_COMPL_ANY PRIMER_PAIR_WT_COMPL_END PRIMER_PAIR_WT_DIFF_TM PRIMER_PAIR_WT_IO_PENALTY PRIMER_PAIR_WT_PRODUCT_SIZE_GT PRIMER_PAIR_WT_PRODUCT_SIZE_LT PRIMER_PAIR_WT_PRODUCT_TM_GT PRIMER_PAIR_WT_PRODUCT_TM_LT PRIMER_PAIR_WT_PR_PENALTY PRIMER_PAIR_WT_REP_SIM PRIMER_PICK_ANYWAY PRIMER_PICK_INTERNAL_OLIGO PRIMER_PRODUCT_MAX_TM PRIMER_PRODUCT_MIN_TM PRIMER_PRODUCT_OPT_SIZE PRIMER_PRODUCT_OPT_TM PRIMER_PRODUCT_SIZE_RANGE PRIMER_QUALITY_RANGE_MAX PRIMER_QUALITY_RANGE_MIN PRIMER_RIGHT_INPUT PRIMER_SALT_CONC PRIMER_SELF_ANY PRIMER_SELF_END PRIMER_SEQUENCE_ID PRIMER_SEQUENCE_QUALITY PRIMER_START_CODON_POSITION PRIMER_TASK PRIMER_WT_COMPL_ANY PRIMER_WT_COMPL_END PRIMER_WT_END_QUAL PRIMER_WT_END_STABILITY PRIMER_WT_GC_PERCENT_GT PRIMER_WT_GC_PERCENT_LT PRIMER_WT_NUM_NS PRIMER_WT_POS_PENALTY PRIMER_WT_REP_SIM PRIMER_WT_SEQ_QUAL PRIMER_WT_SIZE_GT PRIMER_WT_SIZE_LT PRIMER_WT_TM_GT PRIMER_WT_TM_LT SEQUENCE TARGET PRIMER_DEFAULT_PRODUCT PRIMER_DEFAULT_SIZE PRIMER_INSIDE_PENALTY PRIMER_INTERNAL_OLIGO_MAX_TEMPLATE_MISHYB PRIMER_OUTSIDE_PENALTY PRIMER_LIB_AMBIGUITY_CODES_CONSENSUS PRIMER_MAX_TEMPLATE_MISPRIMING PRIMER_PAIR_MAX_TEMPLATE_MISPRIMING PRIMER_PAIR_WT_TEMPLATE_MISPRIMING PRIMER_WT_TEMPLATE_MISPRIMING ); foreach my $attr (@PRIMER3_PARAMS) {$OK_FIELD{$attr}++} } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new() Title : new() Usage : my $primer3 = Bio::Tools::Run::Primer3->new(-file=>$file) to read a primer3 output file. my $primer3 = Bio::Tools::Run::Primer3->new(-seq=>sequence object) design primers against sequence Function: Start primer3 working and adds a sequence. At the moment it will not clear out the old sequence, but I suppose it should. Returns : Does not return anything. If called with a filename will allow you to retrieve the results Args : -seq (optional) Bio::Seq object of sequence. This is required to run primer3 but can be added later with add_targets() -outfile file name to output results to (can also be added with $primer3->outfile_name -path path to primer3 executable, including program name, e.g. "/usr/bin/primer3_core". This can also be set with program_name and program_dir -verbose (optional) set verbose output Notes : =cut sub new { my($class,%args) = @_; my $self = $class->SUPER::new(%args); $self->io->_initialize_io(); $self->program_name($args{-program}) if defined $args{'-program'}; if ($args{'-verbose'}) {$self->{'verbose'}=1} if ($args{'-seq'}) { $self->{'seqobject'}=$args{'-seq'}; my @input; push (@input, ("PRIMER_SEQUENCE_ID=".$self->{'seqobject'}->id), ("SEQUENCE=".$self->{'seqobject'}->seq)); $self->{'primer3_input'}=\@input; } if ($args{'-outfile'}) {$self->{_outfilename}=$args{'-outfile'}} if ($args{'-path'}) { my (undef,$path,$prog) = File::Spec->splitpath($args{'-path'}); # For Windows system, $path better (Letter disk not truncated) if ( $^O =~ m{mswin}i ) { require File::Basename; $path = File::Basename::dirname( $args{'-path'} ); $prog = File::Basename::basename( $args{'-path'} ); } $self->program_dir($path); $self->program_name($prog); } return $self; } =head2 program_name Title : program_name Usage : $primer3->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { my $self = shift; return $self->{'program_name'} = shift @_ if @_; return $self->{'program_name'} if $self->{'program_name'}; for (qw(primer3 primer3_core)) { if ($self->io->exists_exe($_)) { $PROGRAMNAME = $_; last; } } # don't set permanently, use global return $PROGRAMNAME; } =head2 program_dir Title : program_dir Usage : $primer3->program_dir($dir) Function: returns the program directory, which may also be obtained from ENV variable. Returns : string Args : =cut sub program_dir { my ($self, $dir) = @_; if ($dir) { $self->{'program_dir'}=$dir; } # we need to stop here if we know what the answer is, otherwise we can # never set it and then call it later return $self->{'program_dir'} if $self->{'program_dir'}; if ($ENV{PRIMER3}) { $self->{'program_dir'} = Bio::Root::IO->catfile($ENV{PRIMER3}); } else { $self->{'program_dir'} = Bio::Root::IO->catfile('usr','local','bin'); } return $self->{'program_dir'} } =head2 add_targets() Title : add_targets() Usage : $primer3->add_targets(key=>value) Function: Add any legal value to the input command line. Returns : Returns the number of arguments added. Args : Use $primer3->arguments to find a list of all the values that are allowed, or see the primer3 docs. Notes : This will only do limited error checking at the moment, but it should work. =cut sub add_targets { my ($self, %args)=@_; my $added_args; # a count of what we have added. my $inputarray = $self->{'primer3_input'}; foreach my $key (keys %args) { # we will allow them to add a sequence before checking for arguments if ((uc($key) eq "-SEQ") || (uc($key) eq "-SEQUENCE")) { # adding a new sequence. We need to separate them with an = $self->{'seqobject'}=$args{$key}; if (defined $$inputarray[0]) {push (@$inputarray, "=")} push (@$inputarray, ("PRIMER_SEQUENCE_ID=". $self->{'seqobject'}->id),("SEQUENCE=".$self->{'seqobject'}->seq)); next; } unless ($self->{'no_param_checks'}) { unless ($OK_FIELD{$key}) { $self->warn("Parameter $key is not a valid Primer3 parameter"); next} } if (uc($key) eq "INCLUDED_REGION") { # this must be a comma separated start, length. my $sequencelength; # we don't have a length, hence we need to add the length of the # sequence less the start. foreach my $input (@$inputarray) { if ($input =~ /SEQUENCE=(.*)/) {$sequencelength=length($1)} } if (!$args{$key}) {$args{$key}="0," . $sequencelength} elsif ($args{$key} !~ /\,/) { my $length_of_included = $sequencelength-$args{$key}; $args{$key} .= ",".$length_of_included; } } elsif (uc($key) eq "PRIMER_MIN_SIZE") { # minimum size must be less than MAX size and greater than zero if (exists $args{"PRIMER_MAX_SIZE"}) { unless ($args{"PRIMER_MAX_SIZE"} > $args{"PRIMER_MIN_SIZE"}) { $self->warn('Maximum primer size (PRIMER_MAX_SIZE) must be greater than minimum primer size (PRIMER_MIN_SIZE)'); } } if ($args{$key} < 0) { $self->warn('Minimum primer size (PRIMER_MIN_SIZE) must be greater than 0'); } } elsif ($key eq "PRIMER_MAX_SIZE") { if ($args{$key}>35) {$self->warn('Maximum primer size (PRIMER_MAX_SIZE) must be less than 35')} } elsif (uc($key) eq "SEQUENCE") { # Add seqobject if not present, since it is checked for by Bio::Tools::Primer3->next_primer() $self->{'seqobject'}=Bio::Seq->new(-seq=>$args{$key}) if not defined($self->{'seqobject'}); } # need a check to see whether this is already in the array # and finally add the argument to the list. my $toadd=uc($key)."=".$args{$key}; my $replaced; # don't add it if it is replacing something! my @new_array; foreach my $input (@$inputarray) { my ($array_key, $array_value) = split '=', $input; if (uc($array_key) eq uc($key)) {push @new_array, $toadd; $replaced=1} else {push @new_array, $input} } unless ($replaced) {push @new_array, $toadd} @$inputarray=@new_array; if ($self->{'verbose'}) {print STDERR "Updated ", uc($key), " to $args{$key}\n"} $added_args++; } $self->{'primer3_input'}=$inputarray; return $added_args; } =head2 run() Title : run() Usage : $primer3->run(); Function: Run the primer3 program with the arguments that you have supplied. Returns : A Bio::Tools::Primer3 object containing the results. Args : None. Note : See the Bio::Tools::Primer3 documentation for those functions. =cut sub run { my($self) = @_; my $executable = $self->executable; my $input = $self->{'primer3_input'}; unless ($executable && -e $executable) { $self->throw("Executable was not found. Do not know where primer3 is!") if !$executable; $self->throw("$executable was not found. Do not know where primer3 is!"); exit(-1); } # note that I write this to a temp file because we need both read # and write access to primer3, therefore, # we can't use a simple pipe. if ($self->{'verbose'}) {print STDERR "TRYING\n", join "\n", @{$self->{'primer3_input'}}, "=\n"} # make a temporary file and print the instructions to it. my ($temphandle, $tempfile) = $self->io->tempfile; print $temphandle join "\n", @{$self->{'primer3_input'}}, "=\n"; close($temphandle); my $executable_command = $executable; if ( $executable =~ m{^[^\'\"]+(.+)[^\'\"]+$} ) { $executable_command = "\"$executable\" < \"$tempfile\"|"; } open (RESULTS, $executable_command) || $self->throw("Can't open RESULTS"); if ($self->{'_outfilename'}) { # I can't figure out how to use either of these to write the results out. # neither work, what am I doing wrong or missing in the docs? # $self->{output}=$self->_initialize_io(-file=>$self->{'outfile'}); # $self->{output}=$self->io; # OK, for now, I will just do it myself, because I need this to # check the parser :) open (OUT, ">".$self->{'_outfilename'}) || $self->throw("Can't open ".$self->{'_outfilename'}." for writing"); } my @results; while () { if ($self->{'_outfilename'}) { # this should work, but isn't #$self->{output}->_print($_); print OUT $_; } chomp; next if( $_ eq '='); # skip over bolderio record terminator my ($return, $value) = split('=',$_); $self->{'results'}->{$return} = $value; } close RESULTS; # close the output file if ($self->{'_outfilename'}) { close OUT; } $self->cleanup; # convert the results to individual results $self->{results_obj} = Bio::Tools::Primer3->new; $self->{results_obj}->_set_variable('results', $self->{results}); $self->{results_obj}->_set_variable('seqobject', $self->{seqobject}); # Bio::Tools::Primer3::_separate needs a hash of the primer3 arguments, # with the arg as the key and the value as the value (surprise!). my %input_hash = map {split '='} @{$self->{'primer3_input'}}; $self->{results_obj}->_set_variable('input_options', \%input_hash); $self->{results_separated}= $self->{results_obj}->_separate(); return $self->{results_obj}; } =head2 arguments() Title : arguments() Usage : $hashref = $primer3->arguments(); Function: Describes the options that you can set through Bio::Tools::Run::Primer3, with a brief (one line) description of what they are and their default values Returns : A string (if an argument is supplied) or a reference to a hash. Args : If supplied with an argument will return a string of its description. If no arguments are supplied, will return all the arguments as a reference to a hash Notes : Much of this is taken from the primer3 README file, and you should read that file for a more detailed description. =cut sub arguments { my ($self, $required) = @_; unless ($self->{'input_options'}) {$self->_input_args} if ($required) {return ${$self->{'input_options'}}{'$required'}} else {return $self->{'input_options'}} } =head2 version Title : version Usage : $v = $prog->version(); Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return unless my $exe = $self->executable; if (!defined $self->{'_progversion'}) { my $string = `$exe -about 2>&1`; my $v; if ($string =~ m{primer3\s+release\s+([\d\.]+)}) { $self->{'_progversion'} = $1; } } return $self->{'_progversion'} || undef; } =head2 _input_args() Title : _input_args() Usage : an internal method to set the input arguments for Primer3 Function: Define a hash with keys for each of the input arguments and values as a short one line description Returns : A reference to a hash. Args : None. Notes : Much of this is taken from the primer3 README file, and you should read that file for a more detailed description. =cut sub _input_args { my($self) = shift; # just return functions that we can set and what they are my %hash=( 'PRIMER_SEQUENCE_ID'=>'(string, optional) an id. Optional. Note must be present if PRIMER_FILE_FLAG is set', 'SEQUENCE'=>'(nucleotide sequence, REQUIRED) The sequence itself. Cannot contain newlines', 'INCLUDED_REGION'=>'(interval, optional) Where to pick primers from. In form ,. Based on zero indexing!', 'TARGET'=>'(interval list, default empty) Regions that must be included in the product. The value should be a space-separated list of ,', 'EXCLUDED_REGION'=>'(interval list, default empty) Regions that must NOT be included in the product. The value should be a space-separated list of ,', 'PRIMER_COMMENT'=>'(string) This is ignored, so we will just save, and return it', 'PRIMER_SEQUENCE_QUALITY'=>'(quality list, default empty) A list of space separated integers with one per base. Could adapt a Phred object to this.', 'PRIMER_LEFT_INPUT'=>'(nucleotide sequence, default empty) If you know the left primer sequence, put it here', 'PRIMER_RIGHT_INPUT'=>'(nucleotide sequence, default empty) If you know the right primer sequence, put it here', 'PRIMER_START_CODON_POSITION'=>'(int, default -1000000) Location of known start codons for designing in frame primers.', 'PRIMER_PICK_ANYWAY'=>'boolean, default 0) Pick a primer, even if we have violated some constraints.', 'PRIMER_MISPRIMING_LIBRARY'=>'(string, optional) A file containing sequences to avoid amplifying. Should be fasta format, but see primer3 docs for constraints.', 'PRIMER_MAX_MISPRIMING'=>'(decimal,9999.99, default 12.00) Weighting for PRIMER_MISPRIMING_LIBRARY', 'PRIMER_PAIR_MAX_MISPRIMING'=>'(decimal,9999.99, default 24.00 Weighting for PRIMER_MISPRIMING_LIBRARY', 'PRIMER_PRODUCT_MAX_TM'=>'(float, default 1000000.0) The maximum allowed Tm of the product.', 'PRIMER_PRODUCT_MIN_TM'=>'(float, default -1000000.0) The minimum allowed Tm of the product', 'PRIMER_EXPLAIN_FLAG'=>'(boolean, default 0) If set it will print a bunch of information out.', 'PRIMER_PRODUCT_SIZE_RANGE'=>'(size range list, default 100-300) space separated list of product sizes eg - -', 'PRIMER_DEFAULT_PRODUCT' => '(size range list, default 100-300)', 'PRIMER_PICK_INTERNAL_OLIGO'=>'(boolean, default 0) if set, a hybridization probe will be selected', 'PRIMER_GC_CLAMP'=>'(int, default 0) Number of Gs and Cs at the 3 prime end.', 'PRIMER_OPT_SIZE'=>'(int, default 20) Optimal primer size. Primers will be close to this value in length', 'PRIMER_DEFAULT_SIZE' => '(int, default 20)', 'PRIMER_MIN_SIZE'=>'(int, default 18) Minimum size. Must be 0 < PRIMER_MIN_SIZE < PRIMER_MAX_SIZE ', 'PRIMER_MAX_SIZE'=>'(int, default 27) Maximum size. Must be < 35.', 'PRIMER_OPT_TM'=>'(float, default 60.0C) Optimum Tm of a primer.', 'PRIMER_MIN_TM'=>'(float, default 57.0C) Minimum Tm of a primer', 'PRIMER_MAX_TM'=>'(float, default 63.0C) Maximum Tm of a primer', 'PRIMER_MAX_DIFF_TM'=>'(float, default 100.0C) acceptable difference in Tms', 'PRIMER_MIN_GC'=>'(float, default 20.0%) Minimum allowable GCs', 'PRIMER_OPT_GC_PERCENT'=>'(float, default 50.0%) Optimal GCs', 'PRIMER_MAX_GC'=>'(float, default 80.0%) Maximum allowable GCs', 'PRIMER_SALT_CONC'=>'(float, default 50.0 mM) Salt concentration required for Tm calcs.', 'PRIMER_DNA_CONC'=>'(float, default 50.0 nM) DNA concentration required for Tm calcs. ', 'PRIMER_NUM_NS_ACCEPTED'=>'(int, default 0) Maximum number of unknown bases (N) allowable in any primer.', 'PRIMER_SELF_ANY'=>'(decimal,9999.99, default 8.00) Maximum aligment score for within and between primers when checking for hairpin loops', 'PRIMER_SELF_END'=>'(decimal 9999.99, default 3.00) Maximum aligment score for within and between primers when checking for primer dimers', 'PRIMER_FILE_FLAG'=>'(boolean, default 0) Output .for and .rev with all acceptable forward and reverse primers', 'PRIMER_MAX_POLY_X'=>'(int, default 5) The maximum allowable length of a mononucleotide repeat.', 'PRIMER_LIBERAL_BASE'=>'(boolean, default 0) Use IUPAC codes (well, just change them to N). Note must also set PRIMER_NUM_NS_ACCEPTED', 'PRIMER_NUM_RETURN'=>'(int, default 5) Number of primers to return', 'PRIMER_FIRST_BASE_INDEX'=>'(int, default 0) Index of the first base. Do not change this or allow it to be changed, as we will have to mess with subseqs and whatnot.', 'PRIMER_MIN_QUALITY'=>'(int, default 0) Minimum sequence quality calculated from PRIMER_SEQUENCE_QUALITY', 'PRIMER_MIN_END_QUALITY'=>'(int, default 0) Minimum sequence quality calculated from PRIMER_SEQUENCE_QUALITY at 5 prime 5 bases', 'PRIMER_QUALITY_RANGE_MIN'=>'(int, default 0) Minimum sequence quality calculated from PRIMER_SEQUENCE_QUALITY', 'PRIMER_QUALITY_RANGE_MAX'=>'(int, default 100) Maximum sequence quality calculated from PRIMER_SEQUENCE_QUALITY', 'PRIMER_MAX_END_STABILITY'=>'(float 999.9999, default 100.0) Maximum stability for the five 3 prime bases of a primer. Bigger numbers mean more stable 3 prime ends.', 'PRIMER_PRODUCT_OPT_TM'=>'(float, default 0.0) Optimum melting temperature for the PCR product. 0 means no optimum.', 'PRIMER_PRODUCT_OPT_SIZE'=>'(int, default 0) Optimum size for the PCR product. 0 means no optimum.', 'PRIMER_TASK'=>'(string, default pick_pcr_primers) Choose from pick_pcr_primers, pick_pcr_primers_and_hyb_probe, pick_left_only, pick_right_only, pick_hyb_probe_only', 'PRIMER_WT_TM_GT'=>'(float, default 1.0) Penalty weight for primers with Tm over PRIMER_OPT_TM.', 'PRIMER_WT_TM_LT'=>'(float, default 1.0) Penalty weight for primers with Tm under PRIMER_OPT_TM.', 'PRIMER_WT_SIZE_LT'=>'(float, default 1.0) Penalty weight for primers shorter than PRIMER_OPT_SIZE.', 'PRIMER_WT_SIZE_GT'=>'(float, default 1.0) Penalty weight for primers longer than PRIMER_OPT_SIZE.', 'PRIMER_WT_GC_PERCENT_LT'=>'(float, default 1.0) Penalty weight for primers with GC percent greater than PRIMER_OPT_GC_PERCENT.', 'PRIMER_WT_GC_PERCENT_GT'=>'(float, default 1.0) Penalty weight for primers with GC percent greater than PRIMER_OPT_GC_PERCENT.', 'PRIMER_WT_COMPL_ANY'=>'(float, default 0.0)', 'PRIMER_WT_COMPL_END'=>'(float, default 0.0)', 'PRIMER_WT_NUM_NS'=>'(float, default 0.0)', 'PRIMER_WT_REP_SIM'=>'(float, default 0.0)', 'PRIMER_WT_SEQ_QUAL'=>'(float, default 0.0)', 'PRIMER_WT_END_QUAL'=>'(float, default 0.0)', 'PRIMER_WT_POS_PENALTY'=>'(float, default 0.0)', 'PRIMER_WT_END_STABILITY'=>'(float, default 0.0)', 'PRIMER_PAIR_WT_PR_PENALTY'=>'(float, default 1.0)', 'PRIMER_PAIR_WT_IO_PENALTY'=>'(float, default 0.0)', 'PRIMER_PAIR_WT_DIFF_TM'=>'(float, default 0.0)', 'PRIMER_PAIR_WT_COMPL_ANY'=>'(float, default 0.0)', 'PRIMER_PAIR_WT_COMPL_END'=>'(float, default 0.0)', 'PRIMER_PAIR_WT_PRODUCT_TM_LT'=>'(float, default 0.0)', 'PRIMER_PAIR_WT_PRODUCT_TM_GT'=>'(float, default 0.0)', 'PRIMER_PAIR_WT_PRODUCT_SIZE_GT'=>'(float, default 0.0)', 'PRIMER_PAIR_WT_PRODUCT_SIZE_LT'=>'(float, default 0.0)', 'PRIMER_PAIR_WT_REP_SIM'=>'(float, default 0.0)', 'PRIMER_INTERNAL_OLIGO_EXCLUDED_REGION'=>'(interval list, default empty) Internal oligos must ignore these regions', 'PRIMER_INTERNAL_OLIGO_INPUT'=>'(nucleotide sequence, default empty) Known sequence of an internal oligo', 'PRIMER_INTERNAL_OLIGO_OPT_SIZE'=>'(int, default 20)', 'PRIMER_INTERNAL_OLIGO_MIN_SIZE'=>'(int, default 18)', 'PRIMER_INTERNAL_OLIGO_MAX_SIZE'=>'(int, default 27)', 'PRIMER_INTERNAL_OLIGO_OPT_TM'=>'(float, default 60.0 degrees C)', 'PRIMER_INTERNAL_OLIGO_OPT_GC_PERCENT'=>'(float, default 50.0%)', 'PRIMER_INTERNAL_OLIGO_MIN_TM'=>'(float, default 57.0 degrees C)', 'PRIMER_INTERNAL_OLIGO_MAX_TM'=>'(float, default 63.0 degrees C)', 'PRIMER_INTERNAL_OLIGO_MIN_GC'=>'(float, default 20.0%)', 'PRIMER_INTERNAL_OLIGO_MAX_GC'=>'(float, default 80.0%)', 'PRIMER_INTERNAL_OLIGO_SALT_CONC'=>'(float, default 50.0 mM)', 'PRIMER_INTERNAL_OLIGO_DNA_CONC'=>'(float, default 50.0 nM)', 'PRIMER_INTERNAL_OLIGO_SELF_ANY'=>'(decimal 9999.99, default 12.00)', 'PRIMER_INTERNAL_OLIGO_MAX_POLY_X'=>'(int, default 5)', 'PRIMER_INTERNAL_OLIGO_SELF_END'=>'(decimal 9999.99, default 12.00)', 'PRIMER_INTERNAL_OLIGO_MISHYB_LIBRARY'=>'(string, optional)', 'PRIMER_INTERNAL_OLIGO_MAX_MISHYB'=>'(decimal,9999.99, default 12.00)', 'PRIMER_INTERNAL_OLIGO_MIN_QUALITY'=>'(int, default 0)', 'PRIMER_IO_WT_TM_GT'=>'(float, default 1.0)', 'PRIMER_IO_WT_TM_LT'=>'(float, default 1.0)', 'PRIMER_IO_WT_GC_PERCENT_GT'=>'(float, default 1.0)', 'PRIMER_IO_WT_GC_PERCENT_LT'=>'(float, default 1.0)', 'PRIMER_IO_WT_SIZE_LT'=>'(float, default 1.0)', 'PRIMER_IO_WT_SIZE_GT'=>'(float, default 1.0)', 'PRIMER_IO_WT_COMPL_ANY'=>'(float, default 0.0)', 'PRIMER_IO_WT_COMPL_END'=>'(float, default 0.0)', 'PRIMER_IO_WT_NUM_NS'=>'(float, default 0.0)', 'PRIMER_IO_WT_REP_SIM'=>'(float, default 0.0)', 'PRIMER_IO_WT_SEQ_QUAL'=>'(float, default 0.0)', 'PRIMER_IO_WT_END_QUAL'=>'(float, default 0.0)', 'PRIMER_INSIDE_PENALTY' => '(float, default -1.0)', 'PRIMER_INTERNAL_OLIGO_MAX_TEMPLATE_MISHYB' => '(decimal 9999.99, default 12.00)', 'PRIMER_OUTSIDE_PENALTY' => '(float, default 0.0)', 'PRIMER_LIB_AMBIGUITY_CODES_CONSENSUS' => '(boolean, default 1)', 'PRIMER_MAX_TEMPLATE_MISPRIMING' => '(decimal,9999.99, default -1.00)', 'PRIMER_PAIR_MAX_TEMPLATE_MISPRIMING' => '(decimal,9999.99, default -1.00)', 'PRIMER_PAIR_WT_TEMPLATE_MISPRIMING' => '(float, default 0.0)', 'PRIMER_WT_TEMPLATE_MISPRIMING' => '(float, default 0.0)' ); $self->{'input_options'}=\%hash; return \%hash; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Prints.pm000077500000000000000000000152021342734133000223370ustar00rootroot00000000000000# Copyright Balamurugan Kumarasamy # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Prints =head1 SYNOPSIS Build a Prints factory my @params = ('DB',$dbfile); my $factory = Bio::Tools::Run::Prints->new($params); # Pass the factory a Bio::Seq object # @feats is an array of Bio::SeqFeature::Generic objects my @feats = $factory->run($seq); =head1 DESCRIPTION FingerPRINTScan II is a PRINTS fingerprint identification algorithm. Copyright (C) 1998,1999 Phil Scordis =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Bala Email savikalpa@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Prints; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @PRINTS_PARAMS %OK_FIELD); use strict; use Bio::SeqIO; use Bio::Root::IO; use Bio::Root::Root; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Prints; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { @PRINTS_PARAMS=qw(DB PROGRAM VERBOSE); foreach my $attr ( @PRINTS_PARAMS) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'FingerPRINTScan'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PRINTSDIR}) if $ENV{PRINTSDIR}; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new Title : new Usage : $prints->new(@params) Function: creates a new Prints factory Returns: Bio::Tools::Run::Prints Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->io->_initialize_io(); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/PROGRAM/i) { $self->executable($value); next; } $self->$attr($value); } return $self; } =head2 predict_protein_features Title : predict_protein_features() Usage : DEPRECATED. Use $obj->run($seqFile) instead. Function: Runs Prints and creates an array of featrues Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI =cut sub predict_protein_features{ return shift->run(@_); } =head2 run Title : run Usage : $obj->run($seq) Function: Runs Prints Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI, or a Fasta file name =cut sub run{ my ($self,$seq) = @_; my @feats; if (ref($seq) ){# it is an object if (ref($seq) =~ /GLOB/) { $self->throw("cannot use filehandle"); } my $infile1 = $self->_writeSeqFile($seq); $self->_input($infile1); @feats = $self->_run(); unlink $infile1; } else { #The clone object is not a seq object but a file. #Perhaps should check here or before if this file is fasta format...if not die #Here the file does not need to be created or deleted. Its already written and may be used by other runnables. $self->_input($seq); @feats = $self->_run(); } return @feats; } =head2 _input Title : _input Usage : obj->_input($seqFile) Function: Internal(not to be used directly) Returns : Args : =cut sub _input() { my ($self,$infile1) = @_; if(defined $infile1){ $self->{'input'}=$infile1; } return $self->{'input'}; } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : An array of Bio::SeqFeature::Generic objects Args : =cut sub _run { my ($self)= @_; my ($tfh,$outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; my $str =$self->executable." ".$self->DB." ".$self->_input." -fjR >".$outfile; my $status = system($str); $self->throw( "Prints call ($str) crashed: $? \n") unless $status==0; my $filehandle; if (ref ($outfile) !~ /GLOB/) { open (PRINTS, "<".$outfile) or $self->throw ("Couldn't open file ".$outfile.": $!\n"); $filehandle = \*PRINTS; } else { $filehandle = $outfile; } my $prints_parser = Bio::Tools::Prints->new(-fh=>$filehandle); my @prints_feat; while(my $prints_feat = $prints_parser->next_result){ push @prints_feat, $prints_feat; } $self->cleanup(); unlink $outfile; return @prints_feat; } =head2 _writeSeqFile Title : _writeSeqFile Usage : obj->_writeSeqFile($seq) Function: Internal(not to be used directly) Returns : Args : =cut sub _writeSeqFile{ my ($self,$seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'Fasta'); $in->write_seq($seq); $in->close(); undef $in; close($tfh); undef $tfh; return $inputfile; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Profile.pm000077500000000000000000000154101342734133000224610ustar00rootroot00000000000000# BioPerl module for Profile # Copyright Balamurugan Kumarasamy # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Profile =head1 SYNOPSIS Build a Profile factory # $paramfile is the full path to the seg binary file my @params = ('DB',$dbfile,'PROGRAM',$paramfile); my $factory = Bio::Tools::Run::Profile->new($param); # Pass the factory a Bio::PrimarySeqI object # @feats is an array of Bio::SeqFeature::Generic objects my @feats = $factory->run($seq); =head1 DESCRIPTION Wrapper module for the pfscan program =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Balamurugan Kumarasamy Email: fugui@worf.fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Profile; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @PROFILE_PARAMS %OK_FIELD); use strict; use Bio::SeqIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Profile; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { @PROFILE_PARAMS=qw(DB PROGRAM VERBOSE); foreach my $attr ( @PROFILE_PARAMS) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'pfscan'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PROFILEDIR}) if $ENV{PROFILEDIR}; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new Title : new Usage : my $factory= Bio::Tools::Run::Profile->new($param); Function: creates a new Profile factory Returns: Bio::Tools::Run::Profile Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->io->_initialize_io(); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters $self->$attr($value); } return $self; } =head2 predict_protein_features Title : predict_protein_features Usage : DEPRECATED. Use $factory->run($seq) instead. Function: Runs Profile and creates an array of featrues Returns : An array of L objects Args : A Bio::PrimarySeqI =cut sub predict_protein_features{ return shift->run(@_); } =head2 run Title : run Usage : my @feats = $factory->run($seq) Function: Runs Profile Returns : An array of L objects Args : A Bio::PrimarySeqI =cut sub run{ my ($self,$seq) = @_; my @feats; if (ref($seq) ) { if (ref($seq) =~ /GLOB/) { $self->throw("cannot use filehandle"); } my $display_id = $seq->display_id; my $infile1 = $self->_writeSeqFile($seq); $self->_input($infile1); @feats = $self->_run($display_id); unlink $infile1; } else { #The clone object is not a seq object but a file. #Perhaps should check here or before if this file is fasta format...if not die #Here the file does not need to be created or deleted. Its already written and may be used by other runnables. $self->_input($seq); @feats = $self->_run(); } return @feats; } =head2 _input Title : _input Usage : $factory->_input($seqFile) Function: get/set for input file Returns : Args : =cut sub _input() { my ($self,$infile1) = @_; if(defined $infile1){ $self->{'input'}=$infile1; } return $self->{'input'}; } =head2 _run Title : _run Usage : $factory->_run() Function: Makes a system call and runs pfscan Returns : An array of L objects Args : =cut sub _run { my ($self,$display_id)= @_; my ($tfh,$outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; my $str =$self->executable.' -fz '.$self->_input." ".$self->DB." > ".$outfile; my $status = system($str); $self->throw( "Profile call ($str) crashed: $? \n") unless $status==0; my $filehandle; if (ref ($outfile) !~ /GLOB/) { open (PROFILE, "<".$outfile) or $self->throw ("Couldn't open file ".$outfile.": $!\n"); $filehandle = \*PROFILE; } else { $filehandle = $outfile; } my $profile_parser = Bio::Tools::Profile->new(-fh=>$filehandle); my @profile_feat; while(my $profile_feat = $profile_parser->next_result){ $profile_feat->seq_id($display_id); push @profile_feat, $profile_feat; } $self->cleanup(); unlink $outfile; return @profile_feat; } =head2 _writeSeqFile Title : _writeSeqFile Usage : $factory->_writeSeqFile($seq) Function: Creates a file from the given seq object Returns : A string(filename) Args : Bio::PrimarySeqI =cut sub _writeSeqFile{ my ($self,$seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'fasta'); $in->write_seq($seq); $in->close(); undef $in; close($tfh); undef $tfh; return $inputfile; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Promoterwise.pm000066400000000000000000000245721342734133000235660ustar00rootroot00000000000000# # Please direct questions and support issues to # # Cared for by Shawn Hoon # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Promoterwise - Wrapper for aligning two sequences using promoterwise =head1 SYNOPSIS # Build a Promoterwise alignment factory my @params = ('-s'=>1,'-query_start'=>10,'-dymem'=>1); my $factory = Bio::Tools::Run::Promoterwise->new(@params); my (@fp)= $factory->run($seq1,$seq2); # each feature pair is a group of hsps foreach my $fp(@fp){ print "Hit Length: ".$fp->feature1->length."\n"; print "Hit Start: ".$fp->feature1->start."\n"; print "Hit End: ".$fp->feature1->end."\n"; print "Hsps: \n"; my @first_hsp = $fp->feature1->sub_SeqFeature; my @second_hsp = $fp->feature2->sub_SeqFeature; for ($i..$#first_hsp){ print $first_hsp[$i]->seq." ".$second_hsp[$i]->seq."\n"; } } print "end: ". $fp->feature2->start."\t".$fp->feature2->end."\n"; #Available parameters include: #( S T U V QUERY_START QUERY_END TARGET_START #TARGET_END LHWINDOW LHSEED LHALN LHSCORE LHREJECT #LHREJECT LHMAX DYMEM KBYTE DYCACHE) #For an explanation of these parameters, please see documentation #from the Wise package. =head1 DESCRIPTION Promoterwise is an alignment algorithm that relaxes the constraint that local alignments have to be co-linear. Otherwise it provides a similar model to DBA, which is designed for promoter sequence alignments by Ewan Birney. It is part of the wise2 package available at: http://www.sanger.ac.uk/software/wise2. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email: shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Promoterwise; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @PROMOTERWISE_SWITCHES @PROMOTERWISE_PARAMS @OTHER_SWITCHES %OK_FIELD); use Bio::SeqIO; use Bio::Root::Root; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Promoterwise; use strict; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase ); # Two ways to run the program ..... # 1. define an environmental variable WISEDIR # export WISEDIR =/usr/local/share/wise2.2.0 # where the wise2.2.20 package is installed # # 2. include a definition of an environmental variable WISEDIR in # every script that will use DBA.pm # $ENV{WISEDIR} = '/usr/local/share/wise2.2.20'; BEGIN { @PROMOTERWISE_PARAMS = qw( S T U V QUERY_START QUERY_END TARGET_START TARGET_END LHWINDOW LHSEED LHALN LHSCORE LHREJECT LHREJECT LHMAX DYMEM KBYTE DYCACHE); @OTHER_SWITCHES = qw(SILENT QUIET ERROROFFSTD); # Authorize attribute fields foreach my $attr ( @PROMOTERWISE_PARAMS, @PROMOTERWISE_SWITCHES, @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'promoterwise'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{WISEDIR},"/src/bin/") if $ENV{WISEDIR}; } sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return undef unless $self->executable; my $prog = $self->executable; my $string = `$prog -version`; $string =~ /(Version *)/i; return $1 || undef; } =head2 run Title : run Usage : 2 sequence objects @fp = $factory->run($seq1, $seq2); Function: run Returns : An array of Args : Name of a file containing a set of 2 fasta sequences or else 2 Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or 2 Bio::Seq objects. If arguments are strings, throws exception if file corresponding to string name can not be found. =cut sub run{ my ($self, $seq1, $seq2)=@_; my ($attr, $value, $switch); $self->io->_io_cleanup(); # Create input file pointer my ($infile1,$infile2)= $self->_setinput($seq1, $seq2); if (!($infile1 && $infile2)) {$self->throw("Bad input data (sequences need an id ) ");} # run promoterwise my @fp = $self->_run($infile1,$infile2); return @fp; } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to a promoterwise program Example : Returns : L Args : Name of a files containing 2 sequences in the order of peptide and genomic =cut sub _run { my ($self,$infile1,$infile2) = @_; my $instring; $self->debug( "Program ".$self->executable."\n"); unless ( $self->executable ) { $self->throw("Cannot run Promoterwise unless the executable is found.". " Check your environment variables or make sure ". "promoterwise is in your path."); } my $paramstring = $self->_setparams; my $commandstring = $self->executable." $infile1 $infile2 $paramstring"; # this is to capture STDERR messages which leak out when you run programs # with open(FH, "... |"); if( ( $self->silent && $self->quiet) && ($^O !~ /os2|dos|amigaos/) ) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $commandstring .= " -quiet -silent -erroroffstd 2> $null"; } $self->debug( "promoterwise command = $commandstring"); open(PW, "$commandstring |") || $self->throw( "Promoterwise call ($commandstring) crashed: $? \n"); my $pw_parser = Bio::Tools::Promoterwise->new(-fh=>\*PW, -query1_seq=>$self->_query1_seq, -query2_seq=>$self->_query2_seq); my @fp; while (my $fp = $pw_parser->next_result){ push @fp,$fp; } return @fp; } sub _setinput { my ($self, $seq1, $seq2) = @_; my ($tfh1,$tfh2,$outfile1,$outfile2); $self->throw("calling with not enough arguments") unless $seq1 && $seq2; # Not going to set _query_pep/_subject_dna_seq # if you pass in a filename unless( ref($seq1) ) { unless( -e $seq1 ) { $self->throw("Sequence1 is not a Bio::PrimarySeqI object nor file\n"); } $outfile1 = $seq1; } else { ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); my $out1 = Bio::SeqIO->new('-fh' => $tfh1, '-format' => 'fasta'); $out1->write_seq($seq1); $self->_query1_seq($seq1); # Make sure you close things - this is what creates # Out of filehandle errors close($tfh1); undef $tfh1; } unless( ref($seq2) ) { unless( -e $seq2 ) { $self->throw("Sequence2 is not a Bio::PrimarySeqI object nor file\n"); } $outfile2 = $seq2; } else { ($tfh2,$outfile2) = $self->io->tempfile(-dir=>$self->tempdir); my $out2 = Bio::SeqIO->new('-fh' => $tfh2, '-format' => 'fasta'); $out2->write_seq($seq2); $self->_query2_seq($seq2); # Make sure you close things - this is what creates # Out of filehandle errors close($tfh2); undef $tfh2; } return ($outfile1,$outfile2); } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: creates a string of params to be used in the command string Example : Returns : string of params Args : =cut sub _setparams { my ($self) = @_; my $param_string; foreach my $attr(@PROMOTERWISE_PARAMS){ my $value = $self->$attr(); next unless (defined $value); my $attr_key = ' -'.(lc $attr); $param_string .= $attr_key.' '.$value; } foreach my $attr(@PROMOTERWISE_SWITCHES){ my $value = $self->$attr(); next unless (defined $value); my $attr_key = ' -'.(lc $attr); $param_string .=$attr_key; } $param_string = $param_string." -hitoutput tab"; #specify the output option return $param_string; } =head2 _query_pep_seq Title : _query_pep_seq Usage : Internal function, not to be called directly Function: get/set for the query sequence Example : Returns : Args : =cut sub _query1_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_query1_seq'} = $seq; } return $self->{'_query1_seq'}; } =head2 _subject_dna_seq Title : _subject_dna_seq Usage : Internal function, not to be called directly Function: get/set for the subject sequence Example : Returns : Args : =cut sub _query2_seq{ my ($self,$seq) = @_; if(defined $seq){ $self->{'_query2_seq'} = $seq; } return $self->{'_query2_seq'}; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Pseudowise.pm000066400000000000000000000262521342734133000232130ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Pseudowise # # Please direct questions and support issues to # # Cared for by # # Copyright Kiran # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Pseudowise - Object for prediting pseudogenes in a given sequence given a protein and a cdna sequence =head1 SYNOPSIS # Build a pseudowise alignment factory my $factory = Bio::Tools::Run::Pseudowise->new(); # Pass the factory 3 Bio:SeqI objects (in the order of query # peptide and cdna and target_genomic) # @genes is an array of GenericSeqFeature objects my @genes = $factory->run($seq1, $seq2, $seq3); =head1 DESCRIPTION Pseudowise is a pseudogene predition program developed by Ewan Birney http://www.sanger.ac.uk/software/wise2. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Kiran Email kiran@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Pseudowise; use vars qw($AUTOLOAD @ISA $PROGRAM_NAME $PROGRAM_DIR @PSEUDOWISE_SWITCHES @PSEUDOWISE_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SeqIO; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Pseudowise; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); # You will need to enable pseudowise to find the pseudowise program. This # can be done in (at least) two ways: # # 1. define an environmental variable WISEDIR # export WISEDIR =/usr/local/share/wise2.2.0 # where the wise2.2.20 package is installed # # 2. include a definition of an environmental variable WISEDIR in # every script that will use DBA.pm # $ENV{WISEDIR} = '/usr/local/share/wise2.2.20'; BEGIN { $PROGRAM_NAME = 'pseudowise'; $PROGRAM_DIR = Bio::Root::IO->catfile($ENV{WISEDIR},"src","bin") if $ENV{WISEDIR}; @PSEUDOWISE_PARAMS = qw(SPLICE_MAX_COLLAR SPLICE_MIN_COLLAR SPLICE_SCORE_OFFSET GENESTATS NOMATCHN PARAMS KBYTE DYMEM DYDEBUG PALDEBUG ERRORLOG); @PSEUDOWISE_SWITCHES = qw(HELP SILENT QUIET ERROROFFSTD); # Authorize attribute fields foreach my $attr ( @PSEUDOWISE_PARAMS, @PSEUDOWISE_SWITCHES, @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return $PROGRAM_DIR; } sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/'PROGRAM'/i) { $self->executable($value); next; } $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return undef unless $self->executable; my $string = `pseudowise -- ` ; $string =~ /\(([\d.]+)\)/; return $1 || undef; } =head2 predict_genes Title : predict_genes Usage : DEPRECATED. Use $factory->run instead Function: Predict pseudogenes Returns : An array of Bio::Seqfeature::Generic objects Args : Name of a file containing a set of 3 fasta sequences in the order of peptide, cdna and genomic sequences or else 3 Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or 3 Bio::Seq objects. If arguments are strings, throws exception if file corresponding to string name can not be found. =cut sub predict_genes { return shift->run(@_); } =head2 run Title : run Usage : my @feats = $factory->run($seq1, $seq2, $seq3); Function: Executes pseudogene binary Returns : An array of Bio::Seqfeature::Generic objects Args : Name of a file containing a set of 3 fasta sequences in the order of peptide, cdna and genomic sequences or else 3 Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or 3 Bio::Seq objects. If arguments are strings, throws exception if file corresponding to string name can not be found. =cut sub run { my ($self,@args)=@_; my ($attr, $value, $switch); # Create input file pointer my @files = $self->_setinput(@args); if( @files !=3 || grep { !defined } @files ) { $self->throw("Bad input data (sequences need an id ) "); } my $prot_name = $args[0]->display_id; return $self->_run($prot_name, @files); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to a pseudowise program Example : Returns : nothing; pseudowise output is written to a temporary file $TMPOUTFILE Args : Name of a files containing 3 sequences in the order of peptide, cdna and genomic =cut sub _run { my ($self,$prot_name, @files) = @_; my $instring; $self->debug( "Program ".$self->executable."\n"); my ($tfh1,$outfile) = $self->io->tempfile(-dir=>$self->tempdir); my $paramstring = $self->_setparams; my $commandstring = sprintf("%s %s %s > %s", $self->executable, $paramstring, join(" ", @files), $outfile); if($self->silent || $self->quiet || ($self->verbose < 1)){ my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $commandstring .= " 2> $null"; } $self->debug( "pseudowise command = $commandstring\n"); # my $status = system($commandstring); `$commandstring`; # $self->throw( "Pseudowise call ($commandstring) crashed: $? \n") # unless $status == 0; #parse the outpur and return a Bio::Seqfeature array my $genes = $self->_parse_results($prot_name,$outfile); close($tfh1); undef $tfh1; if( $self->verbose > 0 ) { open($tfh1,$outfile) || die $!; while(<$tfh1>) { $self->debug ($_); } } return @{$genes}; } =head2 _parse_results Title : __parse_results Usage : Internal function, not to be called directly Function: Parses pseudowise output Example : Returns : an reference to an array of Seqfeatures Args : the name of the output file =cut sub _parse_results { my ($self,$prot_name,$outfile) = @_; $outfile||$self->throw("No outfile specified"); my $filehandle; if (ref ($outfile) !~ /GLOB/i ) { open ($filehandle, "<".$outfile) or $self->throw ("Couldn't open file ".$outfile.": $!\n"); } else { $filehandle = $outfile; } my @genes; #The big parsing loop - parses exons and predicted peptides my $parser = Bio::Tools::Pseudowise->new(-verbose => $self->verbose, -fh => $filehandle); while( my $f = $parser->next_feature ) { push @genes, $f; } return \@genes; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input files for pseudowise program Example : Returns : name of file containing dba data input Args : Seq objects in the order of query protein and cdna and target genomic sequence =cut sub _setinput { my ($self, $seq1, $seq2, $seq3) = @_; my ($tfh1,$tfh2,$tfh3,$outfile1,$outfile2,$outfile3); if(!($seq1->isa("Bio::PrimarySeqI") && $seq2->isa("Bio::PrimarySeqI") && $seq2->isa("Bio::PrimarySeqI"))) { $self->throw("One or more of the sequences are nor Bio::PrimarySeqI objects\n"); } my $tempdir = $self->tempdir(); ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$tempdir); ($tfh2,$outfile2) = $self->io->tempfile(-dir=>$tempdir); ($tfh3,$outfile3) = $self->io->tempfile(-dir=>$tempdir); my $out1 = Bio::SeqIO->new(-fh => $tfh1 ,'-format' => 'Fasta'); my $out2 = Bio::SeqIO->new(-fh => $tfh2, '-format' => 'Fasta'); my $out3 = Bio::SeqIO->new(-fh => $tfh3, '-format' => 'Fasta'); $out1->write_seq($seq1); $out2->write_seq($seq2); $out3->write_seq($seq3); $self->_query_pep_seq($seq1); $self->_query_cdna_seq($seq2); $self->_subject_dna_seq($seq3); close($tfh1); close($tfh2); close($tfh3); undef ($tfh1); undef ($tfh2); undef ($tfh3); return ($outfile1,$outfile2,$outfile3); } sub _setparams { my ($self) = @_; my $param_string; foreach my $attr(@PSEUDOWISE_PARAMS){ my $value = $self->$attr(); next unless (defined $value); my $attr_key = ' -'.(lc $attr); $param_string .=$attr_key.' '.$value; } foreach my $attr(@PSEUDOWISE_SWITCHES){ my $value = $self->$attr(); next unless (defined $value); my $attr_key = ' -'.(lc $attr); $param_string .=$attr_key; } return $param_string; } =head2 _query_pep_seq() Title : _query_pep_seq Usage : Internal function, not to be called directly Function: get/set for the query sequence Example : Returns : Args : =cut sub _query_pep_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_query_pep_seq'} = $seq; } return $self->{'_query_pep_seq'}; } =head2 _query_cdna_seq() Title : _query_cdna_seq Usage : Internal function, not to be called directly Function: get/set for the query sequence Example : Returns : Args : =cut sub _query_cdna_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_query_cdna_seq'} = $seq; } return $self->{'_query_cdna_seq'}; } =head2 _subject_dna_seq() Title : _subject_dna_seq Usage : Internal function, not to be called directly Function: get/set for the subject sequence Example : Returns : Args : =cut sub _subject_dna_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_subject_dna_seq'} = $seq; } return $self->{'_subject_dna_seq'}; } 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/RNAMotif.pm000066400000000000000000000335241342734133000225030ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::RNAMotif # # Please direct questions and support issues to # # Cared for by Chris Fields # # Copyright Chris Fields # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::RNAMotif - Wrapper for local execution of rnamotif, rm2ct, rmfmt, rmprune =head1 SYNOPSIS #run rnamotif|rmfmt|rm2ct my @params = ( descr => 'pyrR.descr', fmt => 'gb', setvar => 'ctx_maxlen=20', context => 1, sh => 1, ); my $factory = Bio::Tools::Run::RNAMotif->new(-program =>'rnamotif', -prune => 1, @params); # Pass the factory a Bio::Seq object or a file name # Returns a Bio::SearchIO object #my $searchio = $factory->run("B_sub.gb"); my $searchio = $factory->run($seq); while (my $result = $searchio->next_result){ while(my $hit = $result->next_hit){ while (my $hsp = $hit->next_hsp){ print join("\t", ( $r->query_name, $hit->name, $hsp->hit->start, $hsp->hit->end, $hsp->meta, $hsp->score, )), "\n"; } } } # Pass a finished report through rmfmt (-a format only) # Returns Bio::AlignIO object my $aio = Bio::AlignIO->new(-file=>"rna.msf",-format=>'msf'); my $factory = Bio::Tools::Run::RNAMotif->new('program'=>'rmfmt', 'a' => 1); my $alnin = $factory->run('trna.rnamotif'); my $aln = $alnin->next_aln; $aio->write_aln($aln); =head1 DESCRIPTION Wrapper module for Tom Macke and David Cases's RNAMotif suite of programs. This allows running of rnamotif, rmprune, rm2ct, and rmfmt. Binaries are available at http://www.scripps.edu/mb/case/casegr-sh-3.5.html. This wrapper allows for one to save output to an optional named file or tempfile using the '-outfile_name' or '-tempfile' parameters; this is primarily for saving output from the rm2ct program, which currently does not have a parser available. If both a named output file and tempfile flag are set, the output file name is used. The default setting is piping output into a filehandle for parsing (or output to STDERR, for rm2ct which requires '-verbose' set to 1). WARNING: At this time, there is very little checking of parameter settings, so one could have an error if setting the worng parameter for a program. Future versions will likely add some error checking. =head1 NOTES ON PROGRAM PARAMETERS All program parameters are currently supported. Of note, the 'D' parameter, used for setting the value of a variable to a value, is changed to 'set_var' to avoid name collisions with 'd' (used for dumping internal data structures). =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Chris Fields Email: cjfields-at-uiuc-dot-edu =head1 CONTRIBUTORS cjfields-at-uiuc-dot-edu =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::RNAMotif; use strict; use Bio::SeqIO; use Bio::Root::Root; use Bio::SearchIO; use Bio::AlignIO; use Bio::Tools::Run::WrapperBase; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); # will move parameters to each program, use this for _set_params my %RNAMOTIF_PROGS =( rnamotif => [qw(c d h N O p s v context sh setvar I xdfname pre post descr xdescr fmt fmap )], rm2ct => [qw(t)], rmfmt => [qw(a l la smax td)], rmprune => [] # no params ); my %RNAMOTIF_SWITCHES = map {$_ => 1} qw(c d h p s v l a la context sh); # order is important here my @RNAMOTIF_PARAMS=qw(program prune c sh N d h p s v context setvar O I xdfname pre post descr xdescr fmt fmap l a la t); =head2 new Title : new Usage : my $wrapper = Bio::Tools::Run::RNAMotif->new(@params) Function: creates a new RNAMotif factory Returns: Bio::Tools::Run::RNAMotif Args : list of parameters -tempfile => set tempfile flag (default 0) -outfile_name => set file to send output to (default none) -prune => set rmprune postprocess flag (default 0) =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($out, $tf) = $self->_rearrange([qw(OUTFILE_NAME TEMPFILE)], @args); $self->io->_initialize_io(); if ($tf && !$out) { my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile_name($outfile); } else { $out ||= ''; $self->outfile_name($out); } $tf && $self->tempfile($tf); $self->_set_from_args(\@args, -methods => [@RNAMOTIF_PARAMS], -create => 1 ); return $self; } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { my ($self) = shift; return $self->program(@_); } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{RNAMOTIFDIR}) if $ENV{RNAMOTIFDIR}; } =head2 version Title : version Usage : $v = $prog->version(); Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return undef unless $self->executable; return $self->{'_progversion'} if $self->{'_progversion'}; my $string = `rnamotif -v 2>&1`; my $v; if ($string =~ m{([\d.]+)}) { $v = $1; } return $self->{'_progversion'} = $v || $string; } =head2 run Title : run Usage : $obj->run($seqFile) Function: Runs RNAMotif programs, returns Bio::SearchIO/Bio::AlignIO Returns : Depends on program: 'rnamotif' - returns Bio::SearchIO 'rmfmt -a' - returns Bio::AlignIO all others - sends output to outfile, tempfile, STDERR Use search() (for Bio::SearchIO stream) or get_AlignIO() (for Bio::AlignIO stream) for a uniform Bioperl object interface. Args : A Bio::PrimarySeqI or file name Note : This runs any RNAMotif program set via program() =cut sub run { my ($self,@seq) = @_; $self->throw ("Must pass a file name or a list of Bio::PrimarySeqI objects") if (!@seq); if (ref $seq[0] && $seq[0]->isa("Bio::PrimarySeqI") ){# it is an object my $infile1 = $self->_writeSeqFile(@seq); return $self->_run($infile1); } else { return $self->_run(@seq); } } =head2 search Title : search Usage : $searchio = $obj->search($seqFile) Function: Runs 'rnamotif' on seqs, returns Bio::SearchIO Returns : A Bio::SearchIO Args : A Bio::PrimarySeqI or file name Note : Runs 'rnamotif' only, regardless of program setting; all other parameters loaded =cut sub search { my ($self,@seq) = @_; $self->throw ("Must pass a file name or a list of Bio::PrimarySeqI objects") if (!@seq); if (ref $seq[0] && $seq[0]->isa("Bio::PrimarySeqI") ){# it is an object my $infile1 = $self->_writeSeqFile(@seq); return $self->_run($infile1); } else { return $self->_run(@seq); } } =head2 get_AlignIO Title : get_AlignIO Usage : $aln = $obj->get_AlignIO($seqFile) Function: Runs 'rmfmt -a' on file, returns Bio::AlignIO Returns : A Bio::AlignIO Args : File name Note : Runs 'rmfmt -a' only, regardless of program setting; only file name and outfile (if any) are set =cut sub get_AlignIO { my ($self,@seq) = @_; $self->throw ("Must pass a file name") if (!@seq && ref($seq[0])); return $self->_run(@seq); } =head2 tempfile Title : tempfile Usage : $obj->tempfile(1) Function: Set tempfile flag. When set, writes output to a tempfile; this is overridden by outfile_name() if set Returns : Boolean setting (or undef if not set) Args : [OPTIONAL] Boolean =cut sub tempfile { my $self = shift; return $self->{'_tempfile'} = shift if @_; return $self->{'_tempfile'}; } =head2 prune Title : prune Usage : $obj->prune(1) Function: Set rmprune flag. When set, follows any searches with a call to rmprune (this deletes some redundant sequence hits) Returns : Boolean setting (or undef if not set) Args : [OPTIONAL] Boolean =cut sub prune { my $self = shift; return $self->{'_prune'} = shift if @_; return $self->{'_prune'}; } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : Args : =cut sub _run { my ($self,$file,$prog)= @_; return unless $self->executable; $self->io->_io_cleanup(); my ($str, $progname, $outfile) = ($prog || $self->executable, $self->program_name, $self->outfile_name); my $param_str = $self->_setparams($file); my $descr = ($self->can('descr')) ? $self->descr : ($self->can('xdescr')) ? $self->xdescr : $self->throw("Must have a descriptor present!"); $str .= " $param_str"; $self->debug("RNAMotif command: $str\n"); # rnamotif => SearchIO object # rmfmt -a => AlignIO object # all others sent to outfile, tempfile, or STDERR (upon verbose = 1) my $obj = ($progname eq 'rnamotif' || $progname eq 'rmprune' ) ? Bio::SearchIO->new(-verbose => $self->verbose, -format => "rnamotif", -version => $self->version, -database => $file, -model => $descr) : ($progname eq 'rmfmt' && $self->can('a') && $self->a) ? Bio::AlignIO->new(-verbose => $self->verbose, -format =>'fasta') : undef; my @args; # file-based if ($outfile) { local $SIG{CHLD} = 'DEFAULT'; my $status = system($str); if($status || !-e $outfile ) { my $error = ($!) ? "$! Status: $status" : "Status: $status"; $self->throw( "RNAMotif call crashed: $error \n[command $str]\n"); return undef; } if ($obj && ref($obj)) { $obj->file($outfile); @args = (-file => $outfile); } # fh-based } else { open(my $fh,"$str |") || $self->throw("RNAMotif call ($str) crashed: $?\n"); if ($obj && ref($obj)) { $obj->fh($fh); @args = (-fh => $fh); } else { # dump to debugging my $io; while(<$fh>) {$io .= $_;} close($fh); $self->debug($io); return 1; } } # initialize SearchIO/AlignIO...um...IO # (since file/fh set post obj construction) $obj->_initialize_io(@args) if $obj && ref($obj); return $obj || 1; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: creates a string of params to be used in the command string Example : Returns : string of params Args : =cut sub _setparams { my ($self, $file) = @_; my $progname = $self->program_name; # small sanity check $self->throw("Unknown program: $progname") if (!exists $RNAMOTIF_PROGS{$progname} ); my $param_string; my $outfile = ($self->outfile_name) ? ' > '.$self->outfile_name : ''; my @params; foreach my $attr (@RNAMOTIF_PARAMS) { next if ($attr =~/PROGRAM|DB|PRUNE/i); my $value = $self->$attr(); next unless (defined $value); my $attr_key = '-'.$attr; if (exists $RNAMOTIF_SWITCHES{$attr}) { push @params, $attr_key; } else { if ($attr eq 'setvar') { push @params, '-D'.$value; } else { push @params, $attr_key.' '.$value; } } } $param_string = join ' ', @params; $param_string .= ' '.$file; if ($self->prune && $self->program_name eq 'rnamotif') { $param_string .= ' | rmprune'; } $param_string .= $outfile; return $param_string; } =head2 _writeSeqFile Title : _writeSeqFile Usage : obj->_writeSeqFile($seq) Function: Internal(not to be used directly) Returns : writes passed Seq objects to tempfile, to be used as input for program Args : =cut sub _writeSeqFile { my ($self,@seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'fasta'); foreach my $s(@seq){ $in->write_seq($s); } $in->close(); $in = undef; close($tfh); undef $tfh; return $inputfile; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/RepeatMasker.pm000066400000000000000000000267161342734133000234540ustar00rootroot00000000000000# BioPerl module for RepeatMasker # # Please direct questions and support issues to # # Cared for by Shawn Hoon # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::RepeatMasker - Wrapper for RepeatMasker Program =head1 SYNOPSIS use Bio::Tools::Run::RepeatMasker; my @params=("mam" => 1,"noint"=>1); my $factory = Bio::Tools::Run::RepeatMasker->new(@params); $in = Bio::SeqIO->new(-file => "contig1.fa", -format => 'fasta'); my $seq = $in->next_seq(); #return an array of Bio::SeqFeature::FeaturePair objects my @feats = $factory->run($seq); # or $factory->run($seq); my @feats = $factory->repeat_features; #return the masked sequence, a Bio::SeqI object my $masked_seq = $factory->run; =head1 DESCRIPTION To use this module, the RepeatMasker program (and probably database) must be installed. RepeatMasker is a program that screens DNA sequences for interspersed repeats known to exist in mammalian genomes as well as for low complexity DNA sequences. For more information, on the program and its usage, please refer to http://www.repeatmasker.org/. Having installed RepeatMasker, you must let Bioperl know where it is. This can be done in (at least) three ways: 1. Make sure the RepeatMasker executable is in your path. 2. Define an environmental variable REPEATMASKERDIR which is a directory which contains the RepeatMasker executable: In bash: export REPEATMASKERDIR=/home/username/RepeatMasker/ In csh/tcsh: setenv REPEATMASKERDIR /home/username/RepeatMasker/ 3. Include a definition of an environmental variable REPEATMASKERDIR in every script that will use this RepeatMasker wrapper module, e.g.: BEGIN { $ENV{REPEATMASKERDIR} = '/home/username/RepeatMasker/' } use Bio::Tools::Run::RepeatMasker; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a "_". =cut package Bio::Tools::Run::RepeatMasker; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @RM_SWITCHES @RM_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SeqFeature::Generic; use Bio::SeqFeature::FeaturePair; use Bio::Root::Root; use Bio::Tools::Run::WrapperBase; use Bio::Tools::RepeatMasker; # Let the code begin... @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase ); BEGIN { @RM_PARAMS = qw(DIR DIV LIB CUTOFF PARALLEL GC FRAG SPECIES MAXSIZE ); @RM_SWITCHES = qw(NOLOW LOW L NOINT INT NORNA ALU M MUS ROD RODENT MAM MAMMAL COW AR ARABIDOPSIS DR DROSOPHILA EL ELEGANS IS_ONLY IS_CLIP NO_IS RODSPEC E EXCLN NO_ID FIXED XM U GFF ACE POLY X XSMALL SMALL INV A ALIGNMENTS PRIMSPEC W WUBLAST S Q QQ GCCALC NOCUT); @OTHER_SWITCHES = qw(NOISY QUIET SILENT); # Authorize attribute fields foreach my $attr ( @RM_PARAMS, @RM_SWITCHES, @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'RepeatMasker'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{REPEATMASKERDIR}) if $ENV{REPEATMASKERDIR}; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new Title : new Usage : $rm->new($seq) Function: creates a new wrapper Returns: Bio::Tools::Run::RepeatMasker Args : self =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); # Need to check that filehandle is not left open here... while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters $self->$attr($value); } # unless ($self->executable()) { # if( $self->verbose >= 0 ) { # warn "RepeatMasker program not found as ".($self->executable||''). # " or not executable. \n"; # } # } return $self; } =head2 version Title : version Usage : Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return $self->{'_version'} if( defined $self->{'_version'} ); my $exe = $self->executable; return undef unless $exe; my $string = `$exe -- ` ; if( $string =~ /\(([\d.]+)\)/ || $string =~ /RepeatMasker\s+version\s+(\S+)/ ) { return $self->{'_version'} = $1; } else { return $self->{'_version'} = undef; } } =head2 run Title : run Usage : $rm->run($seq); Function: Run Repeatmasker on the sequence set as the argument Returns : an array of repeat features that are Bio::SeqFeature::FeaturePairs Args : Bio::PrimarySeqI compliant object =cut sub run { my ($self,$seq) = @_; my ($infile); $infile = $self->_setinput($seq); my $param_string = $self->_setparams(); my @repeat_feats = $self->_run($infile,$param_string); return @repeat_feats; } =head2 mask Title : mask Usage : $rm->mask($seq) Function: This method is deprecated. Call run() instead Example : Returns : an array of repeat features that are Bio::SeqFeature::FeaturePairs Args : Bio::PrimarySeqI compliant object =cut sub mask{ return shift->run(@_); } =head2 _run Title : _run Usage : $rm->_run ($filename,$param_string) Function: internal function that runs the repeat masker Example : Returns : an array of repeat features Args : the filename to the input sequence and the parameter string =cut sub _run { my ($self,$infile,$param_string) = @_; my $instring; $self->debug( "Program ".$self->executable."\n"); my $outfile = $infile.".out"; my $cmd_str = $self->executable." $param_string ". $infile; $self->debug("repeat masker command = $cmd_str"); if ($self->quiet || $self->verbose <=0){ my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $cmd_str.=" 2> $null 1>$null"; } my $status = system($cmd_str); $self->throw("Repeat Masker Call($cmd_str) crashed: $?\n") unless $status == 0; unless (open (RM, $outfile)) { $self->throw("Cannot open RepeatMasker outfile for parsing"); } my $rpt_parser = Bio::Tools::RepeatMasker->new(-fh=>\*RM); my @rpt_feat; while(my $rpt_feat = $rpt_parser->next_result){ push @rpt_feat, $rpt_feat; } $self->repeat_features(\@rpt_feat); #get masked sequence my $masked = $infile.".masked"; my $seqio = Bio::SeqIO->new(-file=>$masked,-format=>'FASTA'); $self->masked_seq($seqio->next_seq); return @rpt_feat; } =head2 masked_seq Title : masked_seq Usage : $rm->masked_seq($seq) Function: get/set for masked sequence Example : Returns : the masked sequence Args : Bio::Seq object =cut sub masked_seq { my ($self,$seq) = @_; if($seq){ $self->{'_masked_seq'} = $seq; } return $self->{'_masked_seq'}; } =head2 repeat_features Title : repeat_features Usage : $rm->repeat_features(\@rf) Function: get/set for repeat features array Example : Returns : the array of repeat features Args : =cut sub repeat_features { my ($self,$rf) = @_; if($rf) { $self->{'_rf'} = $rf; } return @{$self->{'_rf'}}; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for repeatmasker program Example : Returns : parameter string to be passed to repeatmasker Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); $self = shift; my $param_string = ""; for $attr ( @RM_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = lc $attr; #put params in format expected by dba $attr_key = ' -'.$attr_key; $param_string .= $attr_key.' '.$value; } for $attr ( @RM_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; #put switches in format expected by dba $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } return $param_string; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: writes input sequence to file and return the file name Example : Returns : string Args : a Bio::PrimarySeqI compliant object =cut sub _setinput { my ($self,$seq) = @_; $seq->isa("Bio::PrimarySeqI") || $self->throw("Need a Bio::PrimarySeq compliant object for RepeatMasker"); # my $in = Bio::SeqIO->new(-file => $infilename , '-format' => 'Fasta'); my ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); my $out1 = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'fasta'); $out1->write_seq($seq); close($tfh1); undef $tfh1; return ($outfile1); } =head1 Bio::Tools::Run::Wrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $codeml->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $codeml->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Samtools.pm000077500000000000000000000123421342734133000226630ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Samtools # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Samtools - a run wrapper for the samtools suite *BETA* =head1 SYNOPSIS # convert a sam to a bam $samt = Bio::Tools::Run::Samtools( -command => 'view', -sam_input => 1, -bam_output => 1 ); $samt->run( -bam => "mysam.sam", -out => "mysam.bam" ); # sort it $samt = Bio::Tools::Run::Samtools( -command => 'sort' ); $samt->run( -bam => "mysam.bam", -pfx => "mysam.srt" ); # now create an assembly $assy = Bio::IO::Assembly->new( -file => "mysam.srt.bam", -refdb => "myref.fas" ); =head1 DESCRIPTION This is a wrapper for running samtools, a suite of large-alignment reading and manipulation programs available at L. =head1 RUNNING COMMANDS To run a C command, construct a run factory, specifying the desired command using the C<-command> argument in the factory constructor, along with options specific to that command (see L): $samt = Bio::Tools::Run::Samtools->new( -command => 'view', -sam_input => 1, -bam_output => 1); To execute, use the C method. Input and output files are specified in the arguments of C (see L): $samt->run( -bam => "mysam.sam", -out => "mysam.bam" ); =head1 OPTIONS C is complex, with many subprograms (commands) and command-line options and file specs for each. This module attempts to provide commands and options comprehensively. You can browse the choices like so: $samt = Bio::Tools::Run::Samtools->new( -command => 'pileup' ); # all samtools commands @all_commands = $samt->available_parameters('commands'); @all_commands = $samt->available_commands; # alias # just for pileup @pup_params = $samt->available_parameters('params'); @pup_switches = $samt->available_parameters('switches'); @pup_all_options = $samt->available_parameters(); Reasonably mnemonic names have been assigned to the single-letter command line options. These are the names returned by C, and can be used in the factory constructor like typical BioPerl named parameters. See L for the gory details. =head1 FILES When a command requires filenames, these are provided to the C method, not the constructor (C). To see the set of files required by a command, use C or the alias C: $samt = Bio::Tools::Run::Samtools->new( -command => 'view' ); @filespec = $samt->filespec; This example returns the following array: bam >out This indicates that the bam/sam file (bam) and the output file (out) MUST be specified in the C argument list: $samt->run( -bam => 'mysam.sam', -out => 'mysam.cvt' ); If files are not specified per the filespec, text sent to STDOUT and STDERR is saved and is accessible with C<$bwafac->stdout()> and C<$bwafac->stderr()>. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Samtools; use strict; use warnings; use Bio::Root::Root; use Bio::Tools::Run::Samtools::Config; # currently an AssemblerBase object, but the methods we need from # there should really go in an updated WrapperBase.../maj use base qw(Bio::Tools::Run::WrapperBase Bio::Root::Root); use Bio::Tools::Run::WrapperBase::CommandExts; our $program_name = 'samtools'; our $use_dash = 1; our $join = ' '; =head2 new Title : new Usage : my $obj = new Bio::Tools::Run::Samtools(); Function: Builds a new Bio::Tools::Run::Samtools object Returns : an instance of Bio::Tools::Run::Samtools Args : =cut sub new { my ($class, @args) = @_; $program_dir ||= $ENV{SAMTOOLSDIR}; my $self = $class->SUPER::new(@args); return $self; } sub run { shift->_run(@_) } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Samtools/000077500000000000000000000000001342734133000223205ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Samtools/Config.pm000077500000000000000000000116501342734133000240710ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Samtools::Config # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Samtools::Config - configurator for Bio::Tools::Run::Samtools =head1 SYNOPSIS Not used directly. =head1 DESCRIPTION Exports global configuration variables (as required by L) to Samtools.pm. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Samtools::Config; use strict; use warnings; no warnings qw(qw); use Exporter; our (@ISA, @EXPORT, @EXPORT_OK); push @ISA, 'Exporter'; @EXPORT = qw( $program_dir @program_commands %command_prefixes @program_params @program_switches %param_translation %command_files ); @EXPORT_OK = qw(); our $program_dir; our @program_commands = qw( view sort index merge faidx pileup fixmate rmdup fillmd ); # composite commands: pseudo-commands that run a # sequence of commands # composite command prefix => list of prefixes of commands this # composite command runs # # prefixes only for commands that take params/switches... our %command_prefixes = ( 'view' => 'view', 'sort' => 'srt', 'index' => 'idx', 'merge' => 'mrg', 'faidx' => 'fai', 'pileup' => 'pup', 'fillmd' => 'fmd' ); our @program_params = qw( command view|tab_delim view|out_file view|pass_flags view|filt_flags view|refseq view|qual_threshold view|library view|read_group srt|mem_hint mrg|headers_in pup|refseq pup|map_qcap pup|ref_list pup|site_list pup|theta pup|n_haplos pup|exp_hap_diff pup|indel_prob ); our @program_switches = qw( view|bam_output view|uncompressed view|add_header view|only_header view|sam_input srt|sort_by_names mrg|sort_by_names pup|qual_last_col pup|sam_input pup|indels_only pup|call_cons pup|genot_L fmd|match_with_eq ); our %param_translation = ( 'view|tab_delim' => 't', 'view|out_file' => 'o', 'view|pass_flags' => 'f', 'view|refseq' => 'T', 'view|filt_flags' => 'F', 'view|qual_threshold' => 'q', 'view|library' => 'l', 'view|read_group' => 'r', 'view|bam_output' => 'b', 'view|uncompressed' => 'u', 'view|add_header' => 'h', 'view|only_header' => 'H', 'view|sam_input' => 'S', 'srt|mem_hint' => 'm', 'srt|sort_by_names' => 'n', 'mrg|headers_in' => 'h', 'mrg|sort_by_names' => 'n', 'pup|refseq' => 'f', 'pup|map_qcap' => 'M', 'pup|ref_list' => 't', 'pup|site_list' => 'l', 'pup|theta' => 'T', 'pup|n_haplos' => 'N', 'pup|exp_hap_diff' => 'f', 'pup|indel_prob' => 'I', 'pup|qual_last_col' => 's', 'pup|sam_input' => 'S', 'pup|indels_only' => 'i', 'pup|call_cons' => 'c', 'pup|genot_L' => 'g', 'fmd|match_with_eq' => 'e' ); # # the order in the arrayrefs is the order required # on the command line # # the strings in the arrayrefs (less special chars) # become the keys for named parameters to run_maq # # special chars: # # '#' implies optional # '*' implies variable number of this type # <|> implies stdin/stdout redirect # our %command_files = ( 'view' => [qw( bam #*rgn >out )], 'sort' => [qw( bam >pfx )], 'index' => [qw( bam )], 'merge' => [qw( obm *ibm )], 'faidx' => [qw( fas #*rgn )], 'pileup' => [qw( bam >out )], 'fixmate' => [qw( ibm obm )], 'rmdup' => [qw( ibm obm )], 'fillmd' => [qw( bam fas )] ); 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Seg.pm000077500000000000000000000145411342734133000216030ustar00rootroot00000000000000# Copyright Balamurugan Kumarasamy # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Seg - Object for identifying low complexity regions in a given protein seequence. =head1 SYNOPSIS # Build a Seg factory # $paramfile is the full path to the seg binary file my @params = ('PROGRAM',$paramfile); my $factory = Bio::Tools::Run::Seg->new($param); # Pass the factory a Bio::Seq object # @feats is an array of Bio::SeqFeature::Generic objects my @feats = $factory->run($seq); =head1 DESCRIPTION Seg is a program which identifies low complexity regions in proteins. It was developed by Wootton and Federhen at NCBI. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Bala Email savikalpa@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Seg; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @SEG_PARAMS %OK_FIELD); use strict; use Bio::SeqIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Seg; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { @SEG_PARAMS=qw(PROGRAM VERBOSE); foreach my $attr ( @SEG_PARAMS) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'seg'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string, or undef if $SEGDIR not in ENV Args : None =cut sub program_dir { return Bio::Root::IO->catfile($ENV{SEGDIR}) if $ENV{SEGDIR}; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new Title : new Usage : $rm->new(@params) Function: creates a new Seg factory Returns: Bio::Tools::Run::Seg Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); $self->$attr($value); } return $self; } =head2 predict_protein_features Title : predict_protein_features() Usage : DEPRECATED Use $obj->run($seq) instead Function: Runs Seg and creates an array of featrues Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI =cut sub predict_protein_features{ return shift->run(@_); } =head2 run Title : run Usage : $obj->run($seq) Function: Runs Seg and creates an array of featrues Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI =cut sub run{ my ($self,$seq) = @_; my @feats; if (ref($seq) ) { # it is an object if (ref($seq) =~ /GLOB/) { $self->throw("cannot use filehandle"); } my $infile1 = $self->_writeSeqFile($seq); $self->_input($infile1); @feats = $self->_run(); unlink $infile1; } else { #The seq object is not a seq object but a file. #Here the file does not need to be created. $self->_input($seq); @feats = $self->_run(); } return @feats; } =head2 _input Title : _input Usage : obj->_input($seqFile) Function: Internal (not to be used directly) Returns : Args : =cut sub _input { my ($self,$infile1) = @_; if(defined $infile1){ $self->{'input'}=$infile1; } return $self->{'input'}; } =head2 _run Title : _run Usage : $obj->_run() Function: Internal (not to be used directly) Returns : An array of Bio::SeqFeature::Generic objects Args : None =cut sub _run { my ($self)= @_; my ($tfh1,$outfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $str =$self->executable." ".$self->_input." -l > ".$outfile; my $status = system($str); $self->throw( "Seg call ($str) crashed: $? \n") unless $status==0; my $filehandle; if (ref ($outfile) !~ /GLOB/) { open (SEG, "<".$outfile) or $self->throw ("Couldn't open file ".$outfile.": $!\n"); $filehandle = \*SEG; } else { $filehandle = $outfile; } my $seg_parser = Bio::Tools::Seg->new(-fh=>$filehandle); my @seg_feat; while(my $seg_feat = $seg_parser->next_result){ push @seg_feat, $seg_feat; } # free resources $self->cleanup(); unlink $outfile; close($tfh1); undef $tfh1; return @seg_feat; } =head2 _writeSeqFile Title : _writeSeqFile Usage : obj->_writeSeqFile($seq) Function: Internal (not to be used directly) Returns : string - Fasta filename to which $seq was written Args : Bio::Seq object =cut sub _writeSeqFile{ my ($self,$seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'Fasta'); $in->write_seq($seq); $in->close(); close($tfh); undef $tfh; return $inputfile; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Signalp.pm000077500000000000000000000160261342734133000224620ustar00rootroot00000000000000# Wrapper module for SignalP Bio::Tools::Run::Signalp # # Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Signalp # originally written by Marc Sohrmann (ms2@sanger.ac.uk) # Written in BioPipe by Balamurugan Kumarasamy # Please direct questions and support issues to # # Cared for by the Fugu Informatics team (fuguteam@fugu-sg.org) =head1 NAME Bio::Tools::Run::Signalp =head1 SYNOPSIS Build a Signalp factory my $factory = Bio::Tools::Run::Signalp->new(); # Pass the factory a Bio::Seq object # @feats is an array of Bio::SeqFeature::Generic objects my @feats = $factory->run($seq); =head1 DESCRIPTION wrapper module for Signalp program =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Signalp originally written by Marc Sohrmann (ms2@sanger.ac.uk) Written in BioPipe by Balamurugan Kumarasamy Contributions by David Vilanova (david.vilanova@urbanet.ch) Shawn Hoon (shawnh@fugu-sg.org) # Please direct questions and support issues to # Cared for by the Fugu Informatics team (fuguteam@fugu-sg.org) =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Signalp; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @SIGNALP_PARAMS %OK_FIELD); use strict; use Bio::SeqIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Signalp; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { @SIGNALP_PARAMS=qw(PROGRAM VERBOSE); foreach my $attr ( @SIGNALP_PARAMS) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'signalp'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{SIGNALPDIR}) if $ENV{SIGNALPDIR}; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; return $self->$attr if $self->$attr; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new Title : new Usage : my $factory= Bio::Tools::Run::Signalp->new(); Function: creates a new Signalp factory Returns: Bio::Tools::Run::Signalp Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/PROGRAM/i) { $self->executable($value); next; } $self->$attr($value); } return $self; } =head2 predict_protein_features Title : predict_protein_features() Usage : DEPRECATED. Use $factory->run($seq) instead Function: Runs Signalp and creates an array of featrues Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI =cut sub predict_protein_features{ return shift->run(@_); } =head2 run Title : run() Usage : my $feats = $factory->run($seq) Function: Runs Signalp Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI =cut sub run { my ($self,$seq) = @_; my @feats; if (ref($seq) ) { if (ref($seq) =~ /GLOB/) { $self->throw("cannot use filehandle"); } my $infile1 = $self->_writeSeqFile($seq); $self->_input($infile1); @feats = $self->_run(); unlink $infile1; } else { my $in = Bio::SeqIO->new(-file => $seq, '-format' =>'fasta'); my $infile1; while ( my $tmpseq = $in->next_seq() ) { $infile1 = $self->_writeSeqFile($tmpseq); } $self->_input($infile1); @feats = $self->_run(); } return @feats; } =head2 _input Title : _input Usage : $factory->_input($seqFile) Function: get/set for input file Returns : Args : =cut sub _input() { my ($self,$infile1) = @_; $self->{'input'} = $infile1 if(defined $infile1); return $self->{'input'}; } =head2 _run Title : _run Usage : $factory->_run() Function: Makes a system call and runs signalp Returns : An array of Bio::SeqFeature::Generic objects Args : =cut sub _run { my ($self)= @_; my ($tfh1,$outfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $str =$self->executable." -t euk -trunc 50 ".$self->{'input'}." > ".$outfile; my $status = system($str); $self->throw( "Signalp call ($str) crashed: $? \n") unless $status==0; my $filehandle; if (ref ($outfile) !~ /GLOB/) { open (SIGNALP, "<".$outfile) or $self->throw ("Couldn't open file ".$outfile.": $!\n"); $filehandle = \*SIGNALP; } else { $filehandle = $outfile; } my $signalp_parser = Bio::Tools::Signalp->new(-fh=>$filehandle); my @signalp_feat; while(my $signalp_feat = $signalp_parser->next_result){ push @signalp_feat, $signalp_feat; } $self->cleanup(); close($tfh1); undef $tfh1; unlink $outfile; return @signalp_feat; } =head2 _writeSeqFile Title : _writeSeqFile Usage : $factory->_writeSeqFile($seq) Function: Creates a file from the given seq object Returns : A string(filename) Args : Bio::PrimarySeqI =cut sub _writeSeqFile{ my ($self,$seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'fasta'); $in->write_seq($seq); $in->close(); close($tfh); undef $tfh; return $inputfile; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Simprot.pm000066400000000000000000000351651342734133000225240ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Simprot # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Simprot - Wrapper around the Simprot program. Wrapper for the calculation of a multiple sequence alignment from a phylogenetic tree =head1 SYNOPSIS use Bio::Tools::Run::Simprot; use Bio::TreeIO; my $treeio = Bio::TreeIO->new( -format => 'nh', -file => 't/data/tree.nh'); my $tree = $treeio->next_tree; my $simprot = Bio::Tools::Run::Simprot->new(); $simprot->tree($tree); my ($rc,$aln,$seq) = $simprot->run(); =head1 DESCRIPTION This is a wrapper around the Simprot program by Andy Pang, Andrew D Smith, Paulo AS Nuin and Elisabeth RM Tillier. Simprot allows for several models of amino acid substitution (PAM, JTT and PMB), allows for gamma distributed sites rates according to Yang's model, and implements a parameterised Qian and Goldstein distribution model for insertion and deletion. See http://www.uhnres.utoronto.ca/labs/tillier/software.htm for more information. =head2 Helping the module find your executable You will need to enable SIMPROTDIR to find the simprot program. This can be done in (at least) three ways: 1. Make sure the simprot executable is in your path (i.e. 'which simprot' returns a valid program 2. define an environmental variable SIMPROTDIR which points to a directory containing the 'simprot' app: In bash export SIMPROTDIR=/home/progs/simprot or In csh/tcsh setenv SIMPROTDIR /home/progs/simprot 3. include a definition of an environmental variable SIMPROTDIR in every script that will BEGIN {$ENV{SIMPROTDIR} = '/home/progs/simprot'; } use Bio::Tools::Run::Simprot; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Simprot; use vars qw(@ISA %VALIDVALUES $PROGRAMNAME $PROGRAM); use strict; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::SeqIO; use Bio::TreeIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); # valid values for parameters, the default one is always # the first one in the array BEGIN { %VALIDVALUES = ( 'branch' => '1', 'eFactor' => '3', 'indelFrequncy' => '0.03', 'maxIndel' => '2048', 'subModel' => [ 2,0,1], # 0:PAM, 1:JTT, 2:PMB 'rootLength' => '50', 'alpha' => '1', 'Benner' => '0', 'interleaved' => '1', 'variablegamma' => '0', 'bennerk' => '-2', ); } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'simprot'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{SIMPROTDIR}) if $ENV{SIMPROTDIR}; } =head2 new Title : new Usage : my $simprot = Bio::Tools::Run::Simprot->new(); Function: Builds a new Bio::Tools::Run::Simprot Returns : Bio::Tools::Run::Simprot Args : -alignment => the Bio::Align::AlignI object -tree => the Bio::Tree::TreeI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -executable => where the simprot executable resides -params => A reference to a hash where keys are parameter names and hash values are the associated parameter values See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($aln, $tree, $st, $params, $exe, $ubl) = $self->_rearrange([qw(TREE SAVE_TEMPFILES PARAMS EXECUTABLE)], @args); defined $tree && $self->tree($tree); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); $self->set_default_parameters(); if( defined $params ) { if( ref($params) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { map { $self->set_parameter($_, $$params{$_}) } keys %$params; } } return $self; } =head2 set_parameters Title : set_parameters Usage : $codeml->set_parameters($parameter, $value); Function: (Re)set the SimProt parameters Returns : none Args : First argument is the parameter name Second argument is the parameter value =cut sub set_parameter{ my ($self,$param,$value) = @_; unless (defined $self->{'no_param_checks'} && $self->{'no_param_checks'} == 1) { if ( ! defined $VALIDVALUES{$param} ) { $self->warn("unknown parameter $param will not be set unless you force by setting no_param_checks to true"); return 0; } if ( ref( $VALIDVALUES{$param}) =~ /ARRAY/i && scalar @{$VALIDVALUES{$param}} > 0 ) { unless ( grep { $value eq $_ } @{ $VALIDVALUES{$param} } ) { $self->warn("parameter $param specified value $value is not recognized, please see the documentation and the code for this module or set the no_param_checks to a true value"); return 0; } } } $self->{'_simprotparams'}->{$param} = $value; return 1; } =head2 set_default_parameters Title : set_default_parameters Usage : $codeml->set_default_parameters(0); Function: (Re)set the default parameters from the defaults (the first value in each array in the %VALIDVALUES class variable) Returns : none Args : boolean: keep existing parameter values =cut sub set_default_parameters{ my ($self,$keepold) = @_; $keepold = 0 unless defined $keepold; while( my ($param,$val) = each %VALIDVALUES ) { # skip if we want to keep old values and it is already set next if( defined $self->{'_simprotparams'}->{$param} && $keepold); if(ref($val)=~/ARRAY/i ) { $self->{'_simprotparams'}->{$param} = $val->[0]; } else { $self->{'_simprotparams'}->{$param} = $val; } } } =head2 get_parameters Title : get_parameters Usage : my %params = $self->get_parameters(); Function: returns the list of parameters as a hash Returns : associative array keyed on parameter names Args : none =cut sub get_parameters{ my ($self) = @_; # we're returning a copy of this return %{ $self->{'_simprotparams'} }; } =head2 prepare Title : prepare Usage : my $rundir = $simprot->prepare(); Function: prepare the simprot analysis using the default or updated parameters the alignment parameter and species tree must have been set Returns : value of rundir Args : L object, L object [optional] =cut sub prepare { my ($self,$tree) = @_; unless ( $self->save_tempfiles ) { # brush so we don't get plaque buildup ;) $self->cleanup(); } $tree = $self->tree unless $tree; if( ! $tree ) { $self->warn("must have supplied a valid species tree file in order to run simprot"); return 0; } my ($tempdir) = $self->tempdir(); my ($temptreeFH); if( ! ref($tree) && -e $tree ) { $self->{_temptreefile} = $tree; } else { ($temptreeFH,$self->{_temptreefile}) = $self->io->tempfile ('-dir' => $tempdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $treeout = Bio::TreeIO->new('-format' => 'newick', '-fh' => $temptreeFH); $treeout->write_tree($tree); $treeout->close(); close($temptreeFH); } $self->{_prepared} = 1; my %params = $self->get_parameters; while( my ($param,$val) = each %params ) { $self->{_simprot_params} .=" \-\-$param\=$val"; } return $tempdir; } =head2 run Title : run Usage : my $nhx_tree = $simprot->run(); Function: run the simprot analysis using the default or updated parameters the alignment parameter must have been set Returns : L object [optional] Args : L object L object =cut sub run { my ($self,$tree) = @_; $self->prepare($tree) unless (defined($self->{_prepared})); my ($rc,$aln,$seq) = (1); my ($tmpdir) = $self->tempdir(); my $outfile; { my $commandstring; my $exit_status; my $simprot_executable = $self->executable; $commandstring .= $simprot_executable; $commandstring .= $self->{_simprot_params}; $commandstring .= " --tree=". $self->{_temptreefile} . " "; my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile_name($outfile); my $seqfile; ($tfh, $seqfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $commandstring .= "--alignment=". $self->outfile_name . " "; $commandstring .= "--sequence=". $seqfile . " "; $self->throw("unable to find or run executable for 'simprot'") unless $simprot_executable && -e $simprot_executable && -x _; open(RUN, "$commandstring |") or $self->throw("Cannot run $commandstring"); my @output = ; $exit_status = close(RUN); $self->error_string(join('',@output)); if( (grep { /^\[ /io } @output) || !$exit_status) { $self->warn("There was an error - see error_string for the program output"); $rc = 0; } eval { $aln = Bio::AlignIO->new(-file => "$outfile",-format => 'fasta'); $seq = Bio::SeqIO->new(-file => "$seqfile", -format => 'fasta'); }; if( $@ ) { $self->warn($self->error_string); } } unless ( $self->save_tempfiles ) { $self->cleanup(); } return ($rc,$aln,$seq); } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string { my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe 2>&1` ; $string =~ /Version\:\s+(\d+.\d+.\d+)/m; return $1 || undef; } =head2 alignment Title : alignment Usage : $simprot->align($aln); Function: Get/Set the L object Returns : L object Args : [optional] L Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub alignment { my ($self,$aln) = @_; if( defined $aln ) { if( -e $aln ) { $self->{'_alignment'} = $aln; } elsif( !ref($aln) || ! $aln->isa('Bio::Align::AlignI') ) { $self->warn("Must specify a valid Bio::Align::AlignI object to the alignment function not $aln"); return undef; } else { $self->{'_alignment'} = $aln; } } return $self->{'_alignment'}; } =head2 tree Title : tree Usage : $simprot->tree($tree, %params); Function: Get/Set the L object Returns : L Args : [optional] $tree => L, [optional] %parameters => hash of tree-specific parameters Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub tree { my ($self, $tree, %params) = @_; if( defined $tree ) { if( ! ref($tree) || ! $tree->isa('Bio::Tree::TreeI') ) { $self->warn("Must specify a valid Bio::Tree::TreeI object to the alignment function"); } $self->{'_tree'} = $tree; } return $self->{'_tree'}; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $simprot->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $simprot->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut sub DESTROY { my $self= shift; unless ( $self->save_tempfiles ) { $self->cleanup(); } $self->SUPER::DESTROY(); } 1; # Needed to keep compiler happy bioperl-run-release-1-7-3/lib/Bio/Tools/Run/StandAloneBlast.pm000066400000000000000000000541051342734133000241000ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::StandAloneBlast # # Copyright Peter Schattner # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::StandAloneBlast - Object for the local execution of the NCBI BLAST program suite (blastall, blastpgp, bl2seq). There is experimental support for WU-Blast and NCBI rpsblast. =head1 SYNOPSIS # Local-blast "factory object" creation and blast-parameter # initialization: @params = (-database => 'swissprot', -outfile => 'blast1.out'); $factory = Bio::Tools::Run::StandAloneBlast->new(@params); # Blast a sequence against a database: $str = Bio::SeqIO->new(-file=>'t/amino.fa', -format => 'Fasta'); $input = $str->next_seq(); $input2 = $str->next_seq(); $blast_report = $factory->blastall($input); # Run an iterated Blast (psiblast) of a sequence against a database: $factory->j(3); # 'j' is blast parameter for # of iterations $factory->outfile('psiblast1.out'); $factory = Bio::Tools::Run::StandAloneBlast->new(@params); $blast_report = $factory->blastpgp($input); # Use blast to align 2 sequences against each other: $factory = Bio::Tools::Run::StandAloneBlast->new(-outfile => 'bl2seq.out'); $factory->bl2seq($input, $input2); # Experimental support for WU-Blast 2.0 my $factory = Bio::Tools::Run::StandAloneBlast->new(-program =>"wublastp", -database =>"swissprot", -e => 1e-20); my $blast_report = $factory->wublast($seq); # Experimental support for NCBI rpsblast my $factory = Bio::Tools::Run::StandAloneBlast->new(-db => 'CDD/Cog', -expect => 0.001); $factory->F('T'); # turn on SEG filtering of query sequence my $blast_report = $factory->rpsblast($seq); # Use the experimental fast Blast parser, 'blast_pull' my $factory = Bio::Tools::Run::StandAloneBlast->new(-_READMETHOD =>'blast_pull', @other_params); # Various additional options and input formats are available, # see the DESCRIPTION section for details. =head1 DESCRIPTION This DESCRIPTION only documents Bio::Tools::Run::StandAloneBlast, a Bioperl object for running the NCBI standAlone BLAST package. Blast itself is a large & complex program - for more information regarding BLAST, please see the BLAST documentation which accompanies the BLAST distribution. BLAST is available from ftp://ncbi.nlm.nih.gov/blast/. A source of confusion in documenting a BLAST interface is that the term "program" is used in - at least - three different ways in the BLAST documentation. In this DESCRIPTION, "program" will refer to the BLAST routine set by the BLAST C<-p> parameter that can be set to blastn, blastp, tblastx etc. We will use the term Blast "executable" to refer to the various different executable files that may be called - ie. blastall, blastpgp or bl2seq. In addition, there are several BLAST capabilities, which are also referred to as "programs", and are implemented by using specific combinations of BLAST executables, programs and parameters. They will be referred by their specific names - eg PSIBLAST and PHIBLAST. Before running StandAloneBlast it is necessary: to install BLAST on your system, to edit set the environmental variable $BLASTDIR or your $PATH variable to point to the BLAST directory, and to ensure that users have execute privileges for the BLAST program. If the databases which will be searched by BLAST are located in the data subdirectory of the blast program directory (the default installation location), StandAloneBlast will find them; however, if the database files are located in any other location, environmental variable $BLASTDATADIR will need to be set to point to that directory. The use of the StandAloneBlast module is as follows: Initially, a local blast "factory object" is created. The constructor may be passed an optional array of (non-default) parameters to be used by the factory, eg: @params = (-program => 'blastn', -database => 'ecoli.nt'); $factory = Bio::Tools::Run::StandAloneBlast->new(@params); Any parameters not explicitly set will remain as the defaults of the BLAST executable. Note each BLAST executable has somewhat different parameters and options. See the BLAST Documentation for a description or run the BLAST executable from the command line followed solely with a "-" to see a list of options and default values for that executable; eg Eblastall -. BLAST parameters can be changed and/or examined at any time after the factory has been created. The program checks that any parameter/switch being set/read is valid. Except where specifically noted, StandAloneBlast uses the same single-letter, case-sensitive parameter names as the actual blast program. Currently no checks are included to verify that parameters are of the proper type (e.g. string or numeric) or that their values are within the proper range. As an example, to change the value of the Blast parameter 'e' ('e' is the parameter for expectation-value cutoff) $expectvalue = 0.01; $factory->e($expectvalue); Note that for improved script readibility one can modify the name of the (ncbi) BLAST parameters as desired as long as the initial letter (and case) of the parameter are preserved, e.g.: $factory->expectvalue($expectvalue); Unfortunately, some of the BLAST parameters are not the single letter one might expect (eg "iteration round" in blastpgp is 'j'). Again one can check by using, for example: > blastpgp - Wublast parameters need to be complete (ie. don't truncate them to their first letter), but are case-insensitive. Once the factory has been created and the appropriate parameters set, one can call one of the supported blast executables. The input sequence(s) to these executables may be fasta file(s) as described in the BLAST documentation. $inputfilename = 't/testquery.fa'; $blast_report = $factory->blastall($inputfilename); In addition, sequence input may be in the form of either a Bio::Seq object or (a reference to) an array of Bio::Seq objects, e.g.: $input = Bio::Seq->new(-id => "test query", -seq => "ACTACCCTTTAAATCAGTGGGGG"); $blast_report = $factory->blastall($input); NOTE: Use of the BPlite method has been deprecated and is no longer supported. For blastall and non-psiblast blastpgp runs, report object is a L object, selected by the user with the parameter _READMETHOD. The leading underscore is needed to distinguish this option from options which are passed to the BLAST executable. The default parser is Bio::SearchIO::blast. In any case, the "raw" blast report is also available. The filename is set by the 'outfile' parameter and has the default value of "blastreport.out". For psiblast execution in the BLAST "jumpstart" mode, the program must be passed (in addition to the query sequence itself) an alignment containing the query sequence (in the form of a SimpleAlign object) as well as a "mask" specifying at what residues position-specific scoring matrices (PSSMs) are to used and at what residues default scoring matrices (eg BLOSUM) are to be used. See psiblast documentation for more details. The mask itself is a string of 0's and 1's which is the same length as each sequence in the alignment and has a "1" at locations where (PSSMs) are to be used and a "0" at all other locations. So for example: $str = Bio::AlignIO->new(-file => "cysprot.msf", -format => 'msf'); $aln = $str->next_aln(); $len = $aln->length_aln(); $mask = '1' x $len; # simple case where PSSM's to be used at all residues $report = $factory->blastpgp("cysprot1.fa", $aln, $mask); For bl2seq execution, StandAloneBlast.pm can be combined with AlignIO.pm to directly produce a SimpleAlign object from the alignment of the two sequences produced by bl2seq as in: # Get 2 sequences $str = Bio::SeqIO->new(-file=>'t/amino.fa' , -format => 'Fasta'); my $seq3 = $str->next_seq(); my $seq4 = $str->next_seq(); # Run bl2seq on them $factory = Bio::Tools::Run::StandAloneBlast->new(-program => 'blastp', -outfile => 'bl2seq.out'); my $bl2seq_report = $factory->bl2seq($seq3, $seq4); # Use AlignIO.pm to create a SimpleAlign object from the bl2seq report $str = Bio::AlignIO->new(-file=> 'bl2seq.out',-format => 'bl2seq'); $aln = $str->next_aln(); For more examples of syntax and use of StandAloneBlast.pm, the user is encouraged to run the scripts standaloneblast.pl in the bioperl examples/tools directory and StandAloneBlast.t in the bioperl t/ directory. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: https://github.com/bioperl/bioperl-live/issues =head1 AUTHOR - Peter Schattner Email schattner at alum.mit.edu =head1 MAINTAINER - Torsten Seemann Email torsten at infotech.monash.edu.au =head1 CONTRIBUTORS Sendu Bala bix@sendu.me.uk (reimplementation) =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::StandAloneBlast; use strict; use warnings; use Bio::Root::IO; use Bio::Seq; use Bio::SeqIO; use Bio::SearchIO; use File::Spec; use base qw(Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); our $AUTOLOAD; our $DEFAULTBLASTTYPE = 'NCBI'; our $DEFAULTREADMETHOD = 'BLAST'; # If local BLAST databases are not stored in the standard # /data directory, the variable BLASTDATADIR will need to be # set explicitly our $DATADIR = $ENV{'BLASTDATADIR'} || $ENV{'BLASTDB'}; if (! defined $DATADIR && defined $ENV{'BLASTDIR'}) { my $dir = Bio::Root::IO->catfile($ENV{'BLASTDIR'}, 'data'); if (-d $dir) { $DATADIR = $dir; } elsif ($ENV{'BLASTDIR'} =~ /bin/) { $dir = $ENV{'BLASTDIR'}; $dir =~ s/bin/data/; $DATADIR = $dir if -d $dir; } } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::StandAloneBlast->new(); Function: Builds a newBio::Tools::Run::StandAloneBlast object Returns : Bio::Tools::Run::StandAloneNCBIBlast or StandAloneWUBlast Args : -quiet => boolean # make program execution quiet -_READMETHOD => 'BLAST' (default, synonym 'SearchIO') || 'blast_pull' # the parsing method, case insensitive Essentially all BLAST parameters can be set via StandAloneBlast.pm. Some of the most commonly used parameters are listed below. All parameters have defaults and are optional except for -p in those programs that have it. For a complete listing of settable parameters, run the relevant executable BLAST program with the option "-" as in blastall - Note that the input parameters (-i, -j, -input) should not be set directly by you: this module sets them when you call one of the executable methods. Blastall -p Program Name [String] Input should be one of "blastp", "blastn", "blastx", "tblastn", or "tblastx". -d Database [String] default = nr The database specified must first be formatted with formatdb. Multiple database names (bracketed by quotations) will be accepted. An example would be -d "nr est" -e Expectation value (E) [Real] default = 10.0 -o BLAST report Output File [File Out] Optional, default = ./blastreport.out ; set by StandAloneBlast.pm -S Query strands to search against database (for blast[nx], and tblastx). 3 is both, 1 is top, 2 is bottom [Integer] default = 3 Blastpgp (including Psiblast) -j is the maximum number of rounds (default 1; i.e., regular BLAST) -h is the e-value threshold for including sequences in the score matrix model (default 0.001) -c is the "constant" used in the pseudocount formula specified in the paper (default 10) -B Multiple alignment file for PSI-BLAST "jump start mode" Optional -Q Output File for PSI-BLAST Matrix in ASCII [File Out] Optional rpsblast -d Database [String] default = (none - you must specify a database) The database specified must first be formatted with formatdb. Multiple database names (bracketed by quotations) will be accepted. An example would be -d "Cog Smart" -e Expectation value (E) [Real] default = 10.0 -o BLAST report Output File [File Out] Optional, default = ./blastreport.out ; set by StandAloneBlast.pm Bl2seq -p Program name: blastp, blastn, blastx. For blastx 1st argument should be nucleotide [String] default = blastp -o alignment output file [File Out] default = stdout -e Expectation value (E) [Real] default = 10.0 -S Query strands to search against database (blastn only). 3 is both, 1 is top, 2 is bottom [Integer] default = 3 WU-Blast -p Program Name [String] Input should be one of "wublastp", "wublastn", "wublastx", "wutblastn", or "wutblastx". -d Database [String] default = nr The database specified must first be formatted with xdformat. -E Expectation value (E) [Real] default = 10.0 -o BLAST report Output File [File Out] Optional, default = ./blastreport.out ; set by StandAloneBlast.pm =cut sub new { my ($caller, @args) = @_; my $class = ref($caller) || $caller; # Because of case-sensitivity issues, ncbi and wublast methods are # mutually exclusive. We can't load ncbi methods if we start with wublast # (and vice versa) since wublast e() and E() should be the same thing, # whilst they must be different things in ncbi blast. # # Solution: split StandAloneBlast out into two more modules for NCBI and WU if ($class =~ /NCBI|WU/) { return $class->SUPER::new(@args); } my %args = @args; my $blasttype = $DEFAULTBLASTTYPE; while (my ($attr, $value) = each %args) { if ($attr =~/^-?\s*program\s*$|^-?p$/) { if ($value =~ /^wu*/) { $blasttype = 'WU'; } } } my $module = "Bio::Tools::Run::StandAlone${blasttype}Blast"; Bio::Root::Root->_load_module($module); return $module->new(@args); } =head2 executable Title : executable Usage : my $exe = $blastfactory->executable('blastall'); Function: Finds the full path to the executable Returns : string representing the full path to the exe Args : [optional] name of executable to set path to [optional] boolean flag whether or not warn when exe is not found =cut sub executable { my ($self, $exename, $exe, $warn) = @_; $exename = 'blastall' unless (defined $exename || $self =~ /WUBlast/); $self->program_name($exename); if( defined $exe && -x $exe ) { $self->{'_pathtoexe'}->{$exename} = $exe; } unless( defined $self->{'_pathtoexe'}->{$exename} ) { my $f = $self->program_path($exename); $exe = $self->{'_pathtoexe'}->{$exename} = $f if(-e $f && -x $f ); # This is how I meant to split up these conditionals --jason # if exe is null we will execute this (handle the case where # PROGRAMDIR pointed to something invalid) unless( $exe ) { # we didn't find it in that last conditional if( ($exe = $self->io->exists_exe($exename)) && -x $exe ) { $self->{'_pathtoexe'}->{$exename} = $exe; } else { $self->warn("Cannot find executable for $exename") if $warn; $self->{'_pathtoexe'}->{$exename} = undef; } } } return $self->{'_pathtoexe'}->{$exename}; } =head2 program_dir Title : program_dir Usage : my $dir = $factory->program_dir(); Function: Abstract get method for dir of program. Returns : string representing program directory Args : none =cut sub program_dir { my $self = shift; $self =~ /NCBIBlast/? $ENV{'BLASTDIR'}: $ENV{'WUBLASTDIR'}; } sub program_name { my $self = shift; if (@_) { $self->{program_name} = shift } return $self->{program_name} || ''; } sub program { my $self = shift; if( wantarray ) { return ($self->executable, $self->p()); } else { return $self->executable(@_); } } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file(s) for Blast executable Example : Returns : name of file containing Blast data input Args : Seq object reference or input file name =cut sub _setinput { my ($self, $executable, $input1, $input2) = @_; my ($seq, $temp, $infilename1, $infilename2,$fh ) ; # If $input1 is not a reference it better be the name of a file with # the sequence/ alignment data... $self->io->_io_cleanup(); SWITCH: { unless (ref $input1) { $infilename1 = (-e $input1) ? $input1 : 0 ; last SWITCH; } # $input may be an array of BioSeq objects... if (ref($input1) =~ /ARRAY/i ) { ($fh,$infilename1) = $self->io->tempfile(TEMPLATE=>'blastquery-XXXXXX', SUFFIX=>'.fasta'); $temp = Bio::SeqIO->new(-fh=> $fh, -format => 'fasta'); foreach $seq (@$input1) { unless ($seq->isa("Bio::PrimarySeqI")) {return 0;} $seq->display_id($seq->display_id); $temp->write_seq($seq); } close $fh; $fh = undef; last SWITCH; } # $input may be a single BioSeq object... elsif ($input1->isa("Bio::PrimarySeqI")) { ($fh,$infilename1) = $self->io->tempfile(TEMPLATE=>'blastquery-XXXXXX', SUFFIX=>'.fasta'); # just in case $input1 is taken from an alignment and has spaces (ie # deletions) indicated within it, we have to remove them - otherwise # the BLAST programs will be unhappy my $seq_string = $input1->seq(); $seq_string =~ s/\W+//g; # get rid of spaces in sequence $input1->seq($seq_string); $temp = Bio::SeqIO->new(-fh=> $fh, '-format' => 'fasta'); $temp->write_seq($input1); close $fh; undef $fh; last SWITCH; } $infilename1 = 0; # Set error flag if you get here } unless ($input2) { return $infilename1; } SWITCH2: { unless (ref $input2) { $infilename2 = (-e $input2) ? $input2 : 0 ; last SWITCH2; } if ($input2->isa("Bio::PrimarySeqI") && $executable eq 'bl2seq' ) { ($fh,$infilename2) = $self->io->tempfile(TEMPLATE=>'blastquery-XXXXXX', SUFFIX=>'.fasta'); $temp = Bio::SeqIO->new(-fh=> $fh, '-format' => 'Fasta'); $temp->write_seq($input2); close $fh; undef $fh; last SWITCH2; } # Option for using psiblast's pre-alignment "jumpstart" feature elsif ($input2->isa("Bio::SimpleAlign") && $executable eq 'blastpgp' ) { # a bit of a lie since it won't be a fasta file ($fh,$infilename2) = $self->io->tempfile(TEMPLATE=>'blastquery-XXXXXX', SUFFIX=>'.fasta'); # first we retrieve the "mask" that determines which residues should # by scored according to their position and which should be scored # using the non-position-specific matrices my @mask = split("", shift ); # get mask # then we have to convert all the residues in every sequence to upper # case at the positions that we want psiblast to use position specific # scoring foreach $seq ( $input2->each_seq() ) { my @seqstringlist = split("",$seq->seq()); for (my $i = 0; $i < scalar(@mask); $i++) { unless ( $seqstringlist[$i] =~ /[a-zA-Z]/ ) {next} $seqstringlist[$i] = $mask[$i] ? uc $seqstringlist[$i]: lc $seqstringlist[$i] ; } my $newseqstring = join("", @seqstringlist); $seq->seq($newseqstring); } # Now we need to write out the alignment to a file # in the "psi format" which psiblast is expecting $input2->map_chars('\.','-'); $temp = Bio::AlignIO->new(-fh=> $fh, '-format' => 'psi'); $temp->write_aln($input2); close $fh; undef $fh; last SWITCH2; } $infilename2 = 0; # Set error flag if you get here } return ($infilename1, $infilename2); } =head1 Bio::Tools::Run::WrapperBase methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $tcoffee->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $tcoffee->cleanup(); Function: Will cleanup the tempdir directory after a PAML run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a Bio::Root::IO object Returns : Bio::Root::IO Args : none =cut 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/StandAloneBlastPlus.pm000077500000000000000000001154721342734133000247540ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::StandAloneBlastPlus # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::StandAloneBlastPlus - Compute with NCBI's blast+ suite *ALPHA* =head1 SYNOPSIS B: This module is related to the L system in name (and inspiration) only. You must use this module directly. # existing blastdb: $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'mydb' ); # create blastdb from fasta file and attach $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'mydb', -db_data => 'myseqs.fas', -create => 1 ); # create blastdb from BioPerl sequence collection objects $alnio = Bio::AlignIO->new( -file => 'alignment.msf' ); $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'mydb', -db_data => $alnio, -create => 1 ); @seqs = $alnio->next_aln->each_seq; $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'mydb', -db_data => \@seqs, -create => 1 ); # create database with masks $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'my_masked_db', -db_data => 'myseqs.fas', -masker => 'dustmasker', -mask_data => 'maskseqs.fas', -create => 1 ); # create a mask datafile separately $mask_file = $fac->make_mask( -data => 'maskseqs.fas', -masker => 'dustmasker' ); # query database for metadata $info_hash = $fac->db_info; $num_seq = $fac->db_num_sequences; @mask_metadata = @{ $fac->db_filter_algorithms }; # perform blast methods $result = $fac->tblastn( -query => $seqio ); # see Bio::Tools::Run::StandAloneBlastPlus::BlastMethods # for many more details =head1 DESCRIPTION B This module requires BLAST+ v. 2.2.24+ and higher. Until the API stabilizes for BLAST+, consider this module highly experimental. This module along with L allows the user to perform BLAST functions using the external program suite C (available at L), using BioPerl objects and L facilities. This wrapper can prepare BLAST databases as well as run BLAST searches. It can also be used to run C programs independently. This module encapsulates object construction and production of databases and masks. Blast analysis methods (C, etc>) are contained in L. =head1 USAGE The basic mantra is to (1) create a BlastPlus factory using the C constructor, and (2) perform BLAST analyses by calling the desired BLAST program by name off the factory object. The blast database itself and any masking data are attached to the factory object (step 1). Query sequences and any parameters associated with particular programs are provided to the blast method call (step 2), and are run against the attached database. =head2 Factory construction/initialization The factory needs to be told where the blast+ programs live. The C environment variable will be checked for the default executable directory. The program directory can be set for individual factory instances with the C parameter. All the blast+ programs must be accessible from that directory (i.e., as executable files or symlinks). Either the database or BLAST subject data must be specified at object construction. Databases can be pre-existing formatted BLAST dbs, or can be built directly from fasta sequence files or BioPerl sequence object collections of several kinds. The key constructor parameters are C, C, C. To specify a pre-existing BLAST database, use C alone: $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -DB_NAME => 'mydb' ); The directory can be specified along with the basename, or separately with C: $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -DB_NAME => '~/home/blast/mydb' ); #same as $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -DB_NAME => 'mydb', -DB_DIR => '~/home/blast' ); To create a BLAST database de novo, see L. If you wish to apply pre-existing mask data (i.e., the final ASN1 output from one of the blast+ masker programs), to the database before querying, specify it with C: $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -DB_NAME => 'mydb', -MASK_FILE => 'mymaskdata.asn' ); =head2 Creating a BLAST database There are several options for creating the database de novo using attached data, both before and after factory construction. If a temporary database (one that can be deleted by the C method) is desired, leave out the C<-db_name> parameter. If C<-db_name> is specified, the database will be preserved with the basename specified. Use C<-create => 1> to create a new database (otherwise the factory will look for an existing database). Use C<-overwrite => 1> to create and overwrite an existing database. Note that the database is not created immediately on factory construction. It will be created if necessary on the first use of a factory BLAST method, or you can force database creation by executing $fac->make_db(); =over =item * Specify data during construction With a FASTA file: $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'mydb', -db_data => 'myseqs.fas', -create => 1 ); With another BioPerl object collection: $alnio = Bio::AlignIO->new( -file => 'alignment.msf' ); $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'mydb', -db_data => $alnio, -create => 1 ); @seqs = $alnio->next_aln->each_seq; $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'mydb', -db_data => \@seqs, -create => 1 ); Other collections (e.g., L) are valid. If a certain type does not work, please submit an enhancement request. To create temporary databases, leave out the C<-db_name>, e.g. $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_data => 'myseqs.fas', -create => 1 ); To get the tempfile basename, do: $dbname = $fac->db; =item * Specify data post-construction Use the explicit attribute setters: $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -create => 1 ); $fac->set_db_data('myseqs.fas'); $fac->make_db; =back =head2 Creating and using mask data The blast+ mask utilities C, C, and C are available. Masking can be rolled into database creation, or can be executed later. If your mask data is already created and in ASN1 format, set the C<-mask_file> attribute on construction (see L). To create a mask from raw data or an existing database and apply the mask upon database creation, construct the factory like so: $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'my_masked_db', -db_data => 'myseqs.fas', -masker => 'dustmasker', -mask_data => 'maskseqs.fas', -create => 1 ); The masked database will be created during C. The C<-mask_data> parameter can be a FASTA filename or any BioPerl sequence object collection. If the datatype ('nucl' or 'prot') of the mask data is not compatible with the selected masker, an exception will be thrown with a message to that effect. To create a mask ASN1 file that can be used in the C<-mask_file> parameter separately from the attached database, use the C method directly: $mask_file = $fac->make_mask(-data => 'maskseqs.fas', -masker => 'dustmasker'); # segmasker can use a blastdb as input $mask_file = $fac->make_mask(-mask_db => 'mydb', -masker => 'segmasker') $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'my_masked_db', -db_data => 'myseqs.fas', -mask_file => $mask_file -create => 1 ); =head2 Getting database information To get a hash containing useful metadata on an existing database (obtained by running C, use C: # get info on the attached database.. $info = $fac->db_info; # get info on another database $info = $fac->db_info('~/home/blastdbs/another'); To get a particular info element for the attached database, just call the element name off the factory: $num_seqs = $fac->db_num_sequences; # info on all the masks applied to the db, if any: @masking_info = @{ $fac->db_filter_algorithms }; =head2 Accessing the L factory The blast+ programs are actually executed by a L wrapper instance. This instance is available for peeking and poking in the L C attribute. For convenience, C methods can be run from the C object, and are delegated to the C attribute. For example, to get the blast+ program to be executed, examine either $fac->factory->command or $fac->command Similarly, the current parameters for the C factory are @parameters = $fac->get_parameters =head2 Cleaning up temp files Temporary analysis files produced under a single factory instances can be unlinked by running $fac->cleanup; Tempfiles are generally not removed unless this method is explicitly called. C only unlinks "registered" files and databases. All temporary files are automatically registered; in particular, "anonymous" databases (such as $fac->Bio::Tools::Run::StandAloneBlastPlus->new( -db_data => 'myseqs.fas', -create => 1 ); without a C<-db_name> specification) are registered for cleanup. Any file or database can be registered with an internal method: $fac->_register_temp_for_cleanup('testdb'); =head2 Other Goodies =over =item You can check whether a given basename points to a properly formatted BLAST database by doing $is_good = $fac->check_db('putative_db'); =item User parameters can be passed to the underlying blast+ programs (if you know what you're doing) with C and C: $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'customdb', -db_data => 'myseqs.fas', -db_make_args => [ '-taxid_map' => 'seq_to_taxa.txt' ], -masker => 'windowmasker', -mask_data => 'myseqs.fas', -mask_make_args => [ '-dust' => 'T' ], -create => 1 ); =item You can prevent exceptions from being thrown by failed blast+ program executions by setting C. Examine the error with C: $fac->no_throw_on_crash(1); $fac->make_db; if ($fac->stderr =~ /Error:/) { #handle error ... } =back =head1 SEE ALSO L, L =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 CONTRIBUTORS =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::StandAloneBlastPlus; use strict; our $AUTOLOAD; # Object preamble - inherits from Bio::Root::Root use lib '../../..'; use Bio::Root::Root; use Bio::SeqIO; use Bio::Tools::GuessSeqFormat; use Bio::Tools::Run::StandAloneBlastPlus::BlastMethods; use File::Temp 0.22; use IO::String; use base qw(Bio::Root::Root); unless ( eval "require Bio::Tools::Run::BlastPlus" ) { Bio::Root::Root->throw("This module requires 'Bio::Tools::Run::BlastPlus'"); } my %AVAILABLE_MASKERS = ( 'windowmasker' => 'nucl', 'dustmasker' => 'nucl', 'segmasker' => 'prot' ); # NOTE: After testing all possible output formats, only 'maskinfo_asn1_text' # is currently working correctly as input for makeblastdb '-mask_data' argument, # the others return an 'Unknown encoding for mask data' error my %MASKER_ENCODING = ( 'windowmasker' => 'maskinfo_asn1_text', 'dustmasker' => 'maskinfo_asn1_text', 'segmasker' => 'maskinfo_asn1_text' ); my $bp_class = 'Bio::Tools::Run::BlastPlus'; # what's the desire here? # # * factory object (created by new()) # - points to some blast db entity, so all functions run off the # the factory (except bl2seq?) use the associated db # # * create a blast formatted database: # - specify a file, or an AlignI object # - store for later, or store in a tempfile to throw away # - object should store its own database pointer # - provide masking options based on the maskers provided # # * perform database actions via db-oriented blast+ commands # via the object # # * perform blast searches against the database # - blastx, blastp, blastn, tblastx, tblastn # - specify Bio::Seq objects or files as queries # - output the results as a file or as a Bio::Search::Result::BlastResult # * perform 'special' (i.e., ones I don't know) searches # - psiblast, megablast, rpsblast, rpstblastn # some of these are "tasks" under particular programs # check out psiblast, why special (special 'iteration' handling in # ...::BlastResult) # check out rpsblast, megablast # # * perform bl2seq # - return the alignment directly as a convenience, using Bio::Search # functions # lazy db formatting: makeblastdb only on first blast request... # ParameterBaseI delegation : use AUTOLOAD # # =head2 new Title : new Usage : my $obj = new Bio::Tools::Run::StandAloneBlastPlus(); Function: Builds a new Bio::Tools::Run::StandAloneBlastPlus object Returns : an instance of Bio::Tools::Run::StandAloneBlastPlus Args : named argument (key => value) pairs: -db : blastdb name =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($db_name, $db_data, $db_dir, $db_make_args, $mask_file, $mask_data, $mask_make_args, $masker, $create, $overwrite, $is_remote, $prog_dir, $program_dir) = $self->_rearrange([qw( DB_NAME DB_DATA DB_DIR DB_MAKE_ARGS MASK_FILE MASK_DATA MASK_MAKE_ARGS MASKER CREATE OVERWRITE REMOTE PROG_DIR PROGRAM_DIR )], @args); # parm taint checks if ($db_name) { $self->throw("DB name contains invalid characters") unless $db_name =~ m{^[a-z0-9_/:.+-]+$}i; } if ( $db_dir ) { $self->throw("DB directory (DB_DIR) not found") unless (-d $db_dir); $self->{'_db_dir'} = $db_dir; } else { $self->{'_db_dir'} = '.'; } $program_dir ||= $prog_dir; # alias # now handle these systematically (bug #3003) # allow db_name to include path info # let db_dir act as root if present and db_name is a relative path # db property contains the pathless name only if ($db_name) { my ($v,$d,$f) = File::Spec->splitpath($db_name); $self->throw("No DB name at the end of path '$db_name'") unless $f; $f =~ s/\..*$//; # tolerant of extensions, but ignore them $self->{_db} = $f; # now establish db_path property as the internal authority on # db location... if ( File::Spec->file_name_is_absolute($db_name) ) { $self->throw("Path specified in DB name ('$d') does not exist") unless !$d || (-d $d); $self->{_db_path} = File::Spec->catfile($d,$f); $self->{_db_dir} = $d; # ignore $db_dir, give heads-up $self->warn("DB name is an absolute path; setting db_dir to '".$self->db_dir."'") if $db_dir; } else { $d = File::Spec->catdir($self->db_dir, $d); $self->throw("Path specified by DB_DIR+DB_NAME ('$d') does not exist") unless !$d || (-d $d); $self->{_db_path} = File::Spec->catfile($d,$f); } } if ($masker) { $self->throw("Masker '$masker' not available") unless grep /^$masker$/, keys %AVAILABLE_MASKERS; $self->{_masker} = $masker; } if ($program_dir) { $self->throw("Can't find program directory '$program_dir'") unless -d $program_dir; $self->{_program_dir} = $program_dir; } elsif ($ENV{BLASTPLUSDIR}) { $self->{_program_dir} = $ENV{BLASTPLUSDIR}; } $Bio::Tools::Run::BlastPlus::program_dir = $self->{_program_dir} || $Bio::Tools::Run::BlastPlus::program_dir; $self->set_db_make_args( $db_make_args) if ( $db_make_args ); $self->set_mask_make_args( $mask_make_args) if ($mask_make_args); $self->{'_create'} = $create; $self->{'_overwrite'} = $overwrite; $self->{'_is_remote'} = $is_remote; $self->{'_db_data'} = $db_data; $self->{'_mask_file'} = $mask_file; $self->{'_mask_data'} = $mask_data; # check db if (defined $self->check_db and $self->check_db == 0 and !$self->is_remote) { $self->throw("DB '".$self->db."' can't be found. To create, set -create => 1.") unless ($create || $overwrite); } if (!$self->db) { # allow this to pass; catch lazily at make_db... if (!$self->db_data) { $self->debug('No database or db data specified. '. 'To create a new database, provide '. '-db_data => [fasta|\@seqs|$seqio_object]') } # no db specified; create temp db $self->{_create} = 1; if ($self->db_dir) { my $fh = File::Temp->new(TEMPLATE => 'DBXXXXX', DIR => $self->db_dir, UNLINK => 1); my ($v,$d,$f) = File::Spec->splitpath($fh->filename); $self->{_db} = $f; $self->{_db_path} = $fh->filename; $self->_register_temp_for_cleanup($self->db_path); $fh->close; } else { $self->{_db_dir} = File::Temp->newdir('DBDXXXXX'); $self->{_db} = 'DBTEMP'; $self->{_db_path} = File::Spec->catfile($self->db_dir, $self->db); } } return $self; } =head2 db() Title : db Usage : $obj->db($newval) Function: contains the basename of the local blast database Example : Returns : value of db (a scalar string) Args : readonly =cut sub db { shift->{_db} } sub db_name { shift->{_db} } sub set_db_name { shift->{_db} = shift } sub db_dir { shift->{_db_dir} } sub set_db_dir { shift->{_db_dir} = shift } sub db_path { shift->{_db_path} } sub db_data { shift->{_db_data} } sub set_db_data { shift->{_db_data} = shift } sub db_type { shift->{_db_type} } sub masker { shift->{_masker} } sub set_masker { shift->{_masker} = shift } sub mask_file { shift->{_mask_file} } sub set_mask_file { shift->{_mask_file} = shift } sub mask_data { shift->{_mask_data} } sub set_mask_data { shift->{_mask_data} = shift } =head2 factory() Title : factory Usage : $obj->factory($newval) Function: attribute containing the Bio::Tools::Run::BlastPlus factory Example : Returns : value of factory (Bio::Tools::Run::BlastPlus object) Args : readonly =cut sub factory { shift->{_factory} } sub create { shift->{_create} } sub overwrite { shift->{_overwrite} } sub is_remote { shift->{_is_remote} } =head2 program_version() Title : program_version Usage : $version = $bedtools_fac->program_version() Function: Returns the program version (if available) Returns : string representing location and version of the program Note : this works around the WrapperBase::version() method conflicting with the -version parameter for SABlast (good argument for not having getter/setters for these) =cut =head2 package_version() Title : package_version Usage : $version = $bedtools_fac->package_version() Function: Returns the BLAST+ package version (if available) Returns : string representing BLAST+ package version (may differ from version()) =cut sub program_version { my $self = shift; my $fac = $self->factory; $fac->program_version(@_) if $fac; } sub package_version { my $self = shift; my $fac = $self->factory; $fac->package_version(@_) if $fac; } =head1 DB methods =head2 make_db() Title : make_db Usage : Function: create the blast database (if necessary), imposing masking if specified Returns : true on success Args : =cut # should also provide facility for creating subdatabases from # existing databases (i.e., another format for $data: the name of an # existing blastdb...) sub make_db { my $self = shift; my @args = @_; return 1 if ( $self->check_db && !$self->overwrite ); # already there or force make $self->throw('No database or db data specified. '. 'To create a new database, provide '. '-db_data => [fasta|\@seqs|$seqio_object]') unless $self->db_data; # db_data can be: fasta file, array of seqs, Bio::SeqIO object my $data = $self->db_data; $data = $self->_fastize($data); my $testio = Bio::SeqIO->new(-file=>$data, -format=>'fasta'); $self->{_db_type} = ($testio->next_seq->alphabet =~ /.na/) ? 'nucl' : 'prot'; $testio->close; my ($v,$d,$name) = File::Spec->splitpath($data); $name =~ s/\.fas$//; $self->{_db} ||= $name; $self->{_db_path} = File::Spec->catfile($self->db_dir,$self->db); # <#######[ # deal with creating masks here, # and provide correct parameters to the # makeblastdb ... # accomodate $self->db_make_args here -- allow them # to override defaults, or allow only those args # that are not specified here? my $usr_db_args ||= $self->db_make_args; my %usr_args = @$usr_db_args if $usr_db_args; my %db_args = ( -in => $data, -dbtype => $self->db_type, -out => $self->db_path, -title => $self->db, -parse_seqids => 1 # necessary for masking ); # usr arg override if (%usr_args) { $db_args{$_} = $usr_args{$_} for keys %usr_args; } # do masking if requested # if the (masker and mask_data) OR mask_file attributes of this # object are set, assume that masking is desired # if ($self->mask_file) { # the actual masking data is provided $db_args{'-mask_data'} = $self->mask_file; } elsif ($self->masker && $self->mask_data) { # build the mask $db_args{'-mask_data'} = $self->make_mask(-data => $self->mask_data); $self->throw("Masker error: message is '".$self->stderr."'") unless $db_args{'-mask_data'}; $self->{_mask_data} = $db_args{'-mask_data'}; } $self->{_factory} = $bp_class->new( -command => 'makeblastdb', %db_args ); $self->factory->no_throw_on_crash($self->no_throw_on_crash); return $self->factory->_run; } =head2 make_mask() Title : make_mask Usage : Function: create masking data based on specified parameters Returns : mask data filename (scalar string) Args : =cut # mask program usage (based on blast+ manual) # # program dbtype opn # windowmasker nucl mask overrep data, low-complexity (optional) # dustmasker nucl mask low-complexity # segmasker prot sub make_mask { my $self = shift; my @args = @_; my ($data, $mask_db, $make_args, $masker) = $self->_rearrange([qw( DATA MASK_DB MAKE_ARGS MASKER)], @args); my (%mask_args,%usr_args,$db_type); my $infmt = 'fasta'; $self->throw("make_mask requires -data argument") unless $data; $masker ||= $self->masker; $self->throw("no masker specified and no masker default set in object") unless $masker; my $usr_make_args ||= $self->mask_make_args; %usr_args = @$usr_make_args if $usr_make_args; unless (grep /^$masker$/, keys %AVAILABLE_MASKERS) { $self->throw("Masker '$masker' not available"); } if ($self->check_db($data)) { unless ($masker eq 'segmasker') { $self->throw("Masker '$masker' can't use a blastdb as primary input"); } unless ($self->db_info($data)->{_db_type} eq $AVAILABLE_MASKERS{$masker}) { $self->throw("Masker '$masker' is incompatible with input db sequence type"); } $infmt = 'blastdb'; } else { $data = $self->_fastize($data); my $sio = Bio::SeqIO->new(-file=>$data); my $s = $sio->next_seq; my $type; if ($s->alphabet =~ /.na/) { $type = 'nucl'; } elsif ($s->alphabet =~ /protein/) { $type = 'prot'; } else { $type = 'UNK'; } unless ($type eq $AVAILABLE_MASKERS{$masker}) { $self->throw("Masker '$masker' is incompatible with sequence type '$type'"); } } # check that sequence type and masker program match: # now, need to provide reasonable default masker arg settings, # and override these with $usr_make_args as necessary and appropriate my $mh = File::Temp->new(TEMPLATE=>'MSKXXXXX', UNLINK => 0, DIR => $self->db_dir); my $mask_outfile = $mh->filename; $mh->close; $self->_register_temp_for_cleanup(File::Spec->catfile($self->db_dir,$mask_outfile)); # NOTE: '-outfmt' argument must not be included in the default args because # it conflicts with windowmasker '-mk_counts' argument %mask_args = ( -in => $data, -parse_seqids => 1, ); # usr arg override if (%usr_args) { $mask_args{$_} = $usr_args{$_} for keys %usr_args; } # masker-specific pipelines my $status; for ($masker) { m/dustmasker/ && do { $mask_args{'-out'} = $mask_outfile; $mask_args{'-outfmt'} = $MASKER_ENCODING{$masker}; $self->{_factory} = $bp_class->new(-command => $masker, %mask_args); $self->factory->no_throw_on_crash($self->no_throw_on_crash); $status = $self->factory->_run; last; }; m/windowmasker/ && do { # check mask_db if present if ($mask_db) { unless ($self->check_db($mask_db)) { $self->throw("Mask database '$mask_db' is not present or valid"); } } my $cth = File::Temp->new(TEMPLATE=>'MCTXXXXX', DIR => $self->db_dir); my $ct_file = $cth->filename; $cth->close; $mask_args{'-out'} = $ct_file; $mask_args{'-mk_counts'} = 'true'; $self->{_factory} = $bp_class->new(-command => $masker, %mask_args); $self->factory->no_throw_on_crash($self->no_throw_on_crash); $status = $self->factory->_run; last unless $status; delete $mask_args{'-mk_counts'}; $mask_args{'-ustat'} = $ct_file; $mask_args{'-out'} = $mask_outfile; $mask_args{'-outfmt'} = $MASKER_ENCODING{$masker}; if ($mask_db) { $mask_args{'-in'} = $mask_db; $mask_args{'-infmt'} = 'blastdb'; } $self->factory->reset_parameters(%mask_args); $self->factory->no_throw_on_crash($self->no_throw_on_crash); $status = $self->factory->_run; last; }; m/segmasker/ && do { $mask_args{'-infmt'} = $infmt; $mask_args{'-out'} = $mask_outfile; $mask_args{'-outfmt'} = $MASKER_ENCODING{$masker}; $self->{_factory} = $bp_class->new(-command => $masker, %mask_args); $self->factory->no_throw_on_crash($self->no_throw_on_crash); $status = $self->factory->_run; last; }; do { $self->throw("Masker program '$masker' not recognized"); }; } return $status ? $mask_outfile : $status; } =head2 db_info() Title : db_info Usage : Function: get info for database (via blastdbcmd -info); add factory attributes Returns : hash of database attributes Args : [optional] db name (scalar string) (default: currently attached db) =cut sub db_info { my $self = shift; my $db = shift; $db ||= $self->db_path; unless ($db) { $self->warn("db_info: db not specified and no db attached"); return; } if ($self->is_remote) { $self->warn("db_info: sorry, can't get info for remote database (complain to NCBI)"); return; } if ($db eq $self->db and $self->{_db_info}) { return $self->{_db_info}; # memoized } my $db_info_text; $self->{_factory} = $bp_class->new( -command => 'blastdbcmd', -info => 1, -db => $db ); $self->factory->no_throw_on_crash(1); $self->factory->_run(); $self->factory->no_throw_on_crash(0); if ($self->factory->stderr =~ /No alias or index file found/) { $self->warn("db_info: Couldn't find database ".$self->db."; make with make_db()"); return; } $db_info_text = $self->factory->stdout; # parse info into attributes my $infh = IO::String->new($db_info_text); my %attr; while (<$infh>) { /Database: (.*)/ && do { $attr{db_info_name} = $1; next; }; /([0-9,]+) sequences; ([0-9,]+) total/ && do { $attr{db_num_sequences} = $1; $attr{db_total_bases} = $2; $attr{db_num_sequences} =~ s/,//g; $attr{db_total_bases} =~ s/,//g; next; }; /Date: (.*?)\s+Longest sequence: ([0-9,]+)/ && do { $attr{db_date} = $1; # convert to more usable date object $attr{db_longest_sequence} = $2; $attr{db_longest_sequence} =~ s/,//g; next; }; /Algorithm ID/ && do { my $alg = $attr{db_filter_algorithms} = []; while (<$infh>) { if (/\s*([0-9]+)\s+([a-z0-9_]+)\s+(.*)/i) { my ($alg_id, $alg_name, $alg_opts) = ($1, $2, $3); $alg_opts =~ s/\s+$//; push @$alg, { algorithm_id => $alg_id, algorithm_name => $alg_name, algorithm_opts => $alg_opts }; } else { last; } } next; }; } # get db type if ( -e $db.'.psq' ) { $attr{_db_type} = 'prot'; } elsif (-e $db.'.nsq') { $attr{_db_type} = 'nucl'; } else { $attr{_db_type} = 'UNK'; # bork } if ($db eq $self->db) { $self->{_db_type} = $attr{_db_type}; $self->{_db_info_text} = $db_info_text; $self->{_db_info} = \%attr; } return \%attr; } =head2 set_db_make_args() Title : set_db_make_args Usage : Function: set the DB make arguments attribute with checking Returns : true on success Args : arrayref or hashref of named arguments =cut sub set_db_make_args { my $self = shift; my $args = shift; $self->throw("Arrayref or hashref required at DB_MAKE_ARGS") unless ref($args) =~ /^ARRAY|HASH$/; if (ref($args) eq 'HASH') { my @a = %$args; $args = \@a; } $self->throw("Named args required for DB_MAKE_ARGS") unless !(@$args % 2); $self->{'_db_make_args'} = $args; return 1; } sub db_make_args { shift->{_db_make_args} } =head2 set_mask_make_args() Title : set_mask_make_args Usage : Function: set the masker make arguments attribute with checking Returns : true on success Args : arrayref or hasref of named arguments =cut sub set_mask_make_args { my $self = shift; my $args = shift; $self->throw("Arrayref or hashref required at MASK_MAKE_ARGS") unless ref($args) =~ /^ARRAY|HASH$/; if (ref($args) eq 'HASH') { my @a = %$args; $args = \@a; } $self->throw("Named args required at MASK_MAKE_ARGS") unless !(@$args % 2); $self->{'_mask_make_args'} = $args; return 1; } sub mask_make_args { shift->{_mask_make_args} } =head2 check_db() Title : check_db Usage : Function: determine if database with registered name and dir exists Returns : 1 if db present, 0 if not present, undef if name/dir not set Args : [optional] db name (default is 'registered' name in $self->db) [optional] db directory (default is 'registered' dir in $self->db_dir) =cut sub check_db { my $self = shift; my ($db) = @_; my $db_path; if ($db) { my ($v,$d,$f) = File::Spec->splitpath($db); $f =~ s/\..*$//; # ignore extensions $db_path = File::Spec->catfile($d||'.',$f); } else { $db_path = $self->db_path; } if ( $db_path ) { $self->{_factory} = $bp_class->new( -command => 'blastdbcmd', -info => 1, -db => $db_path ); # $DB::single=1; $self->factory->no_throw_on_crash(1); $self->factory->_run(); $self->factory->no_throw_on_crash(0); return 0 if ($self->factory->stderr =~ /No alias or index file found/); return 1; } return; } =head2 no_throw_on_crash() Title : no_throw_on_crash Usage : $fac->no_throw_on_crash($newval) Function: set to prevent an exeception throw on a failed blast program execution Example : Returns : value of no_throw_on_crash (boolean) Args : on set, new value (boolean) =cut sub no_throw_on_crash { my $self = shift; return $self->{'no_throw_on_crash'} = shift if @_; return $self->{'no_throw_on_crash'}; } =head1 Internals =head2 _fastize() Title : _fastize Usage : Function: convert a sequence collection to a temporary fasta file (sans gaps) Returns : fasta filename (scalar string) Args : sequence collection =cut sub _fastize { my $self = shift; my $data = shift; for ($data) { !ref && do { # suppose a fasta file name $self->throw('Sequence file not found') unless -e $data; my $guesser = Bio::Tools::GuessSeqFormat->new(-file => $data); $self->throw('Sequence file not in FASTA format') unless $guesser->guess eq 'fasta'; last; }; (ref eq 'ARRAY') && (ref $$data[0]) && ($$data[0]->isa('Bio::Seq') || $$data[0]->isa('Bio::PrimarySeq')) && do { my $fh = File::Temp->new(TEMPLATE => 'DBDXXXXX', UNLINK => 0, DIR => $self->db_dir, SUFFIX => '.fas'); my $fname = $fh->filename; $fh->close; $self->_register_temp_for_cleanup($fname); my $fasio = Bio::SeqIO->new(-file=>">$fname", -format=>"fasta") or $self->throw("Can't create temp fasta file"); for (@$data) { my $s = $_->seq; my $a = $_->alphabet; $s =~ s/[$Bio::PrimarySeq::GAP_SYMBOLS]//g; $_->seq( $s ); $_->alphabet($a); $fasio->write_seq($_); } $fasio->close; $data = $fname; last; }; ref && do { # some kind of object my ($fmt) = ref($data) =~ /.*::(.*)/; if ($fmt eq 'fasta') { $data = $data->file; # use the fasta file directly } else { # convert my $fh = File::Temp->new(TEMPLATE => 'DBDXXXXX', UNLINK => 0, DIR => $self->db_dir, SUFFIX => '.fas'); my $fname = $fh->filename; $fh->close; $self->_register_temp_for_cleanup($fname); my $fasio = Bio::SeqIO->new(-file=>">$fname", -format=>"fasta") or $self->throw("Can't create temp fasta file"); require Bio::PrimarySeq; if ($data->isa('Bio::AlignIO')) { my $aln = $data->next_aln; for ($aln->each_seq) { # must de-gap my $s = $_->seq; my $a = $_->alphabet; $s =~ s/[$Bio::PrimarySeq::GAP_SYMBOLS]//g; $_->seq( $s ); $_->alphabet($a); $fasio->write_seq($_) } } elsif ($data->isa('Bio::SeqIO')) { while (local $_ = $data->next_seq) { my $s = $_->seq; my $a = $_->alphabet; $s =~ s/[$Bio::PrimarySeq::GAP_SYMBOLS]//g; $_->seq( $s ); $_->alphabet($a); $fasio->write_seq($_); } } elsif ($data->isa('Bio::Align::AlignI')) { for( $data->each_seq) { my $s = $_->seq; my $a = $_->alphabet; $s =~ s/[$Bio::PrimarySeq::GAP_SYMBOLS]//g; $_->seq( $s ); $_->alphabet($a); $fasio->write_seq($_) } } elsif ($data->isa('Bio::Seq') || $data->isa('Bio::PrimarySeq')) { my $s = $data->seq; my $a = $data->alphabet; $s =~ s/[$Bio::PrimarySeq::GAP_SYMBOLS]//g; $data->seq($s); $data->alphabet($a); $fasio->write_seq($data); } else { $self->throw("Can't handle sequence container object ". "of type '".ref($data)."'"); } $fasio->close; $data = $fname; } last; }; } return $data; } =head2 _register_temp_for_cleanup() Title : _register_temp_for_cleanup Usage : Function: register a file for cleanup with cleanup() method Returns : true on success Args : a file name or a blastdb basename (scalar string) =cut sub _register_temp_for_cleanup { my $self = shift; my @files = @_; for (@files) { my ($v, $d, $n) = File::Spec->splitpath($_); $_ = File::Spec->catfile($self->db_dir, $n) unless length($d); push @{$self->{_cleanup_list}}, File::Spec->rel2abs($_); } return 1; } =head2 cleanup() Title : cleanup Usage : Function: unlink files registered for cleanup Returns : true on success Args : =cut sub cleanup { my $self = shift; return unless $self->{_cleanup_list}; my $self_file = ''; if (exists $self->{_results}->{_file}) { $self_file = $self->{_results}->{_file}; } for (@{$self->{_cleanup_list}}) { # Close $self_file filehandle if it appears on the cleanup list, # to avoid 'permission denied' errors when unlinking if ($self_file ne '' and $_ =~ m/$self_file$/) { close $self->{_results}->_fh; } m/(\.[a-z0-9_]+)+$/i && do { unlink $_; next; }; do { # catch all index files if ( -e $_.".psq" ) { unlink glob($_.".p*"); unlink glob($_.".??.p*"); } elsif ( -e $_.".nsq" ) { unlink glob($_.".n*"); unlink glob($_.".??.n*"); } else { unlink $_; } next; }; } return 1; } =head2 AUTOLOAD In this module, C delegates L and L methods (including those of L) to the C attribute: $fac->stderr gives you $fac->factory->stderr If $AUTOLOAD isn't pointing to a WrapperBase method, then AUTOLOAD attempts to return a C attribute: e.g. $fac->db_num_sequences works by looking in the $fac->db_info() hash. Finally, if $AUTOLOAD is pointing to a blast query method, AUTOLOAD runs C with the C<-method> parameter appropriately set. =cut sub AUTOLOAD { my $self = shift; my @args = @_; my $method = $AUTOLOAD; $method =~ s/.*:://; my @ret; if (grep /^$method$/, @Bio::Tools::Run::StandAloneBlastPlus::BlastMethods) { push @args, ('-method_args' => ['-remote' => 1] ) if ($self->is_remote); return $self->run( -method => $method, @args ); } if ($self->factory and $self->factory->can($method)) { # factory method return $self->factory->$method(@args); } if ($self->db_info and grep /^$method$/, keys %{$self->db_info}) { return $self->db_info->{$method}; } # else, fail $self->throw("Can't locate method '$method' in class ".ref($self)); } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/StandAloneBlastPlus/000077500000000000000000000000001342734133000244015ustar00rootroot00000000000000bioperl-run-release-1-7-3/lib/Bio/Tools/Run/StandAloneBlastPlus/BlastMethods.pm000077500000000000000000000276751342734133000273540ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::StandAloneBlastPlus::BlastMethods # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::StandAloneBlastPlus::BlastMethods - Provides BLAST methods to StandAloneBlastPlus =head1 SYNOPSIS # create a factory: $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'testdb' ); # get your results $result = $fac->blastn( -query => 'query_seqs.fas', -outfile => 'query.bls', -method_args => [ '-num_alignments' => 10 ] ); $result = $fac->tblastx( -query => $an_alignment_object, -outfile => 'query.bls', -outformat => 7 ); # do a bl2seq $fac->bl2seq( -method => 'blastp', -query => $seq_object_1, -subject => $seq_object_2 ); =head1 DESCRIPTION This module provides the BLAST methods (blastn, blastp, psiblast, etc.) to the L object. =head1 USAGE This POD describes the use of BLAST methods against a L factory object. The object itself has extensive facilities for creating, formatting, and masking BLAST databases; please refer to L POD for these details. Given a C factory, such as $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'testdb' ); you can run the desired BLAST method directly from the factory object, against the database currently attached to the factory (in the example, C). C<-query> is a required argument: $result = $fac->blastn( -query => 'query_seqs.fas' ); Here, C<$result> is a L object. Other details: =over =item * The blast output file can be named explicitly: $result = $fac->blastn( -query => 'query_seqs.fas', -outfile => 'query.bls' ); =item * The output format can be specified: $result = $fac->blastn( -query => 'query_seqs.fas', -outfile => 'query.bls', -outformat => 7 ); #tabular =item * Additional arguments to the method can be specified: $result = $fac->blastn( -query => 'query_seqs.fas', -outfile => 'query.bls', -method_args => [ '-num_alignments' => 10 , '-evalue' => 100 ]); =item * HTML output can be created using this workaround: $result = $fac->blastn( -query => 'query_seqs.fas', -outfile => 'query.bls', -method_args => [ -html => ' ' ); =item * To get the name of the blast output file, do $file = $fac->blast_out; =item * To clean up the temp files (you must do this explicitly): $fac->cleanup; =back =head2 bl2seq() Running C is similar, but both C<-query> and C<-subject> are required, and the attached database is ignored. The blast method must be specified explicitly with the C<-method> parameter: $fac->bl2seq( -method => 'blastp', -query => $seq_object_1, -subject => $seq_object_2 ); Other parameters ( C<-method_args>, C<-outfile>, and C<-outformat> ) are valid. =head2 Return values The return value is always a L object on success, undef on failure. =head1 SEE ALSO L, L =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us Describe contact details here =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... # note: providing methods directly to the namespace... package Bio::Tools::Run::StandAloneBlastPlus; use strict; use warnings; use Bio::SearchIO; use lib '../../../..'; use Bio::Tools::Run::BlastPlus; use File::Temp; use File::Copy; use File::Spec; our @BlastMethods = qw( blastp blastn blastx tblastn tblastx psiblast rpsblast rpstblastn ); =head2 run() Title : run Usage : Function: Query the attached database using a specified blast method Returns : Bio::Search::Result::BlastResult object Args : key => value: -method => $method [blastp|blastn|blastx|tblastx|tblastn| rpsblast|psiblast|rpstblastn] -query => $query_sequences (a fasta file name or BioPerl sequence object or sequence collection object) -outfile => $blast_report_file (optional: default creates a tempfile) -outformat => $format_code (integer in [0..10], see blast+ docs) -method_args => [ -key1 => $value1, ... ] (additional arguments for the given method) =cut sub run { my $self = shift; my @args = @_; my ( $method, $query, $outfile, $outformat, $method_args ) = $self->_rearrange( [ qw( METHOD QUERY OUTFILE OUTFORMAT METHOD_ARGS ) ], @args ); my $ret; my ( %blast_args, %usr_args ); unless ($method) { $self->throw("Blast run: method not specified, use -method"); } unless ($query) { $self->throw("Blast run: query data required, use -query"); } unless ($outfile) { # create a tempfile name my $fh = File::Temp->new( TEMPLATE => 'BLOXXXXX', DIR => $self->db_dir, UNLINK => 0 ); $outfile = $fh->filename; $fh->close; $self->_register_temp_for_cleanup($outfile); } if ($outformat) { unless ( $outformat =~ /^"?[0-9]{1,2}/ ) { $self->throw( "Blast run: output format code should be integer 0-10"); } $blast_args{'-outfmt'} = $outformat; } if ($method_args) { $self->throw( "Blast run: method arguments must be name => value pairs") unless ! ( @$method_args % 2 ); %usr_args = @$method_args; } # make db if necessary $self->make_db unless $self->check_db or $self->is_remote or $usr_args{'-subject'} or $usr_args{'-SUBJECT'}; # no db nec if this is bl2seq... $self->{_factory} = Bio::Tools::Run::BlastPlus->new( -command => $method ); if (%usr_args) { my @avail_parms = $self->factory->available_parameters('all'); while ( my ( $key, $value ) = each %usr_args ) { $key =~ s/^-//; unless ( grep /^$key$/, @avail_parms ) { $self->throw( "Blast run: parameter '$key' is not available for method '$method'" ); } } } # remove a leading ./ on remote databases. Something adds that in the # factory, easier to remove here. my $db = $self->db_path; if ( $self->is_remote ) { $db =~ s#^\./##; } $blast_args{-db} = $db; $blast_args{-query} = $self->_fastize($query); $blast_args{-out} = $outfile; # user arg override if (%usr_args) { $blast_args{$_} = $usr_args{$_} for keys %usr_args; } # override for bl2seq; if ( $blast_args{'-db'} && $blast_args{'-subject'} ) { delete $blast_args{'-db'}; } $self->factory->set_parameters(%blast_args); $self->factory->no_throw_on_crash( $self->no_throw_on_crash ); my $status = $self->_run; return $status unless $status; # kludge to demodernize the bl2seq output if ( $blast_args{'-subject'} ) { unless ( _demodernize($outfile) ) { $self->throw("Demodernization failed!"); } } # if here, success for ($method) { m/^(t|psi|rps|rpst)?blast[npx]?/ && do { $ret = Bio::SearchIO->new( -file => $outfile ); $self->{_blastout} = $outfile; $self->{_results} = $ret; $ret = $ret->next_result; last; }; do { 1; # huh? }; } return $ret; } =head2 bl2seq() Title : bl2seq Usage : Function: emulate bl2seq using blast+ programs Returns : Bio::Search::Result::BlastResult object Args : key => value -method => $blast_method [blastn|blastp|blastx| tblastn|tblastx] -query => $query (fasta file or BioPerl sequence object -subject => $subject (fasta file or BioPerl sequence object) -outfile => $blast_report_file -method_args => [ $key1 => $value1, ... ] (additional method parameters) =cut sub bl2seq { my $self = shift; my @args = @_; my ( $method, $query, $subject, $outfile, $outformat, $method_args ) = $self->_rearrange( [ qw( METHOD QUERY SUBJECT OUTFILE OUTFORMAT METHOD_ARGS ) ], @args ); unless ($method) { $self->throw("bl2seq: blast method not specified, use -method"); } unless ($query) { $self->throw("bl2seq: query data required, use -query"); } unless ($subject) { $self->throw("bl2seq: subject data required, use -subject"); } $subject = $self->_fastize($subject); my @run_args; if ($method_args) { @run_args = @$method_args; } return $self->run( -method => $method, -query => $query, -outfile => $outfile, -outformat => $outformat, -method_args => [ @run_args, '-subject' => $subject ] ); } =head2 next_result() Title : next_result Usage : $result = $fac->next_result; Function: get the next BLAST result Returns : Bio::Search::Result::BlastResult object Args : none =cut sub next_result() { my $self = shift; return unless $self->{_results}; return $self->{_results}->next_result; } =head2 rewind_results() Title : rewind_results Usage : $fac->rewind_results; Function: rewind BLAST results Returns : true on success Args : =cut sub rewind_results { my $self = shift; return unless $self->blast_out; $self->{_results} = Bio::SearchIO->new( -file => $self->blast_out ); return 1; } =head2 blast_out() Title : blast_out Usage : $file = $fac->blast_out Function: get the filename of the blast report file Returns : scalar string Args : none =cut sub blast_out { shift->{_blastout} } # =head2 _demodernize() # Title : _demodernize # Usage : # Function: # Returns : # Args : # =cut sub _demodernize { my $file = shift; my $tf = File::Temp->new(); open( my $f, $file ); while (<$f>) { s/^Subject=\s+/>/; print $tf $_; } $tf->close; copy( $tf->filename, $file ); } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/StandAloneNCBIBlast.pm000066400000000000000000000467111342734133000245400ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::StandAloneBlast # # Copyright Peter Schattner # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::StandAloneNCBIBlast - Object for the local execution of the NCBI BLAST program suite (blastall, blastpgp, bl2seq). With experimental support for NCBI rpsblast. =head1 SYNOPSIS # Do not use directly; see Bio::Tools::Run::StandAloneBlast =head1 DESCRIPTION See Bio::Tools::Run::StandAloneBlast =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: https://github.com/bioperl/bioperl-live/issues =head1 AUTHOR - Peter Schattner Email schattner at alum.mit.edu =head1 MAINTAINER - Torsten Seemann Email torsten at infotech.monash.edu.au =head1 CONTRIBUTORS Sendu Bala bix@sendu.me.uk (reimplementation) =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::StandAloneNCBIBlast; use strict; use warnings; use base qw(Bio::Tools::Run::StandAloneBlast); our $AUTOLOAD; our $DEFAULTREADMETHOD = 'BLAST'; # If local BLAST databases are not stored in the standard # /data directory, the variable BLASTDATADIR will need to be # set explicitly our $DATADIR = $Bio::Tools::Run::StandAloneBlast::DATADIR; our %GENERAL_PARAMS = (i => 'input', o => 'outfile', p => 'program', d => 'database'); our @BLASTALL_PARAMS = qw(A B C D E F G K L M O P Q R S W X Y Z a b e f l m q r t v w y z n); our @BLASTALL_SWITCH = qw(I g J T U n V s); our @BLASTPGP_PARAMS = qw(A B C E F G H I J K L M N O P Q R S T U W X Y Z a b c e f h j k l m q s t u v y z); our @RPSBLAST_PARAMS = qw(F I J L N O P T U V X Y Z a b e l m v y z); our @BL2SEQ_PARAMS = qw(A D E F G I J M S T U V W X Y a e g j m q r t); our @OTHER_PARAMS = qw(_READMETHOD); =head2 new Title : new Usage : my $obj = Bio::Tools::Run::StandAloneBlast->new(); Function: Builds a newBio::Tools::Run::StandAloneBlast object Returns : Bio::Tools::Run::StandAloneBlast Args : -quiet => boolean # make program execution quiet -_READMETHOD => 'BLAST' (default, synonym 'SearchIO') || 'blast_pull' # the parsing method, case insensitive Essentially all BLAST parameters can be set via StandAloneBlast.pm. Some of the most commonly used parameters are listed below. All parameters have defaults and are optional except for -p in those programs that have it. For a complete listing of settable parameters, run the relevant executable BLAST program with the option "-" as in blastall - Note that the input parameters (-i, -j, -input) should not be set directly by you: this module sets them when you call one of the executable methods. Blastall -p Program Name [String] Input should be one of "blastp", "blastn", "blastx", "tblastn", or "tblastx". -d Database [String] default = nr The database specified must first be formatted with formatdb. Multiple database names (bracketed by quotations) will be accepted. An example would be -d "nr est" -e Expectation value (E) [Real] default = 10.0 -o BLAST report Output File [File Out] Optional, default = ./blastreport.out ; set by StandAloneBlast.pm -S Query strands to search against database (for blast[nx], and tblastx). 3 is both, 1 is top, 2 is bottom [Integer] default = 3 Blastpgp (including Psiblast) -j is the maximum number of rounds (default 1; i.e., regular BLAST) -h is the e-value threshold for including sequences in the score matrix model (default 0.001) -c is the "constant" used in the pseudocount formula specified in the paper (default 10) -B Multiple alignment file for PSI-BLAST "jump start mode" Optional -Q Output File for PSI-BLAST Matrix in ASCII [File Out] Optional rpsblast -d Database [String] default = (none - you must specify a database) The database specified must first be formatted with formatdb. Multiple database names (bracketed by quotations) will be accepted. An example would be -d "Cog Smart" -e Expectation value (E) [Real] default = 10.0 -o BLAST report Output File [File Out] Optional, default = ./blastreport.out ; set by StandAloneBlast.pm Bl2seq -p Program name: blastp, blastn, blastx. For blastx 1st argument should be nucleotide [String] default = blastp -o alignment output file [File Out] default = stdout -e Expectation value (E) [Real] default = 10.0 -S Query strands to search against database (blastn only). 3 is both, 1 is top, 2 is bottom [Integer] default = 3 =cut sub new { my ($caller, @args) = @_; my $self = $caller->SUPER::new(@args); # StandAloneBlast is special in that "one can modify the name of # the (ncbi) BLAST parameters as desired as long as the initial letter (and # case) of the parameter are preserved". We handle this by truncating input # args to their first char my %args = @args; @args = (); while (my ($attr, $value) = each %args) { $attr =~ s/^-//; $attr = substr($attr, 0, 1) unless $attr =~ /^_/; push(@args, $attr, $value); } $self->_set_from_args(\@args, -methods => {(map { $_ => $GENERAL_PARAMS{$_} } keys %GENERAL_PARAMS), (map { $_ => $_ } (@OTHER_PARAMS, @BLASTALL_PARAMS, @BLASTALL_SWITCH, @BLASTPGP_PARAMS, @RPSBLAST_PARAMS, @BL2SEQ_PARAMS))}, -code => { map { $_ => 'my $self = shift; if (@_) { my $value = shift; if ($value && $value ne \'F\') { $value = \'T\'; } else { $value = \'F\'; } $self->{\'_\'.$method} = $value; } return $self->{\'_\'.$method} || return;' } @BLASTALL_SWITCH }, # these methods can take boolean or 'T' and 'F' -create => 1, -force => 1, -case_sensitive => 1); my ($tfh, $tempfile) = $self->io->tempfile(); my $outfile = $self->o || $self->outfile || $tempfile; $self->o($outfile); close($tfh); $self->_READMETHOD($DEFAULTREADMETHOD) unless $self->_READMETHOD; return $self; } # StandAloneBlast is special in that "one can modify the name of # the (ncbi) BLAST parameters as desired as long as the initial letter (and # case) of the parameter are preserved". We handle this with AUTOLOAD # redirecting to the automatically created methods from _set_from_args() ! sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; my $orig = $attr; $attr = substr($attr, 0, 1); $self->can($attr) || $self->throw("Unallowed parameter: $orig !"); return $self->$attr(@_); } =head2 blastall Title : blastall Usage : $blast_report = $factory->blastall('t/testquery.fa'); or $input = Bio::Seq->new(-id=>"test query", -seq=>"ACTACCCTTTAAATCAGTGGGGG"); $blast_report = $factory->blastall($input); or $seq_array_ref = \@seq_array; # where @seq_array is an array of Bio::Seq objects $blast_report = $factory->blastall($seq_array_ref); Returns : Reference to a Blast object containing the blast report. Args : Name of a file or Bio::Seq object or an array of Bio::Seq object containing the query sequence(s). Throws an exception if argument is not either a string (eg a filename) or a reference to a Bio::Seq object (or to an array of Seq objects). If argument is string, throws exception if file corresponding to string name can not be found. =cut sub blastall { my ($self, $input1) = @_; $self->io->_io_cleanup(); my $executable = 'blastall'; # Create input file pointer my $infilename1 = $self->_setinput($executable, $input1) || $self->throw("$input1 not Bio::Seq object or array of Bio::Seq objects or file name!"); $self->i($infilename1); my $blast_report = $self->_generic_local_blast($executable); } =head2 blastpgp Title : blastpgp Usage : $blast_report = $factory-> blastpgp('t/testquery.fa'); or $input = Bio::Seq->new(-id=>"test query", -seq=>"ACTADDEEQQPPTCADEEQQQVVGG"); $blast_report = $factory->blastpgp ($input); or $seq_array_ref = \@seq_array; # where @seq_array is an array of Bio::Seq objects $blast_report = $factory-> blastpgp(\@seq_array); Returns : Reference to a Bio::SearchIO object containing the blast report Args : Name of a file or Bio::Seq object. In psiblast jumpstart mode two additional arguments are required: a SimpleAlign object one of whose elements is the query and a "mask" to determine how BLAST should select scoring matrices see DESCRIPTION above for more details. Throws an exception if argument is not either a string (eg a filename) or a reference to a Bio::Seq object (or to an array of Seq objects). If argument is string, throws exception if file corresponding to string name can not be found. Returns : Reference to Bio::SearchIO object containing the blast report. =cut sub blastpgp { my $self = shift; my $executable = 'blastpgp'; my $input1 = shift; my $input2 = shift; # used by blastpgp's -B option to specify which # residues are position aligned my $mask = shift; my ($infilename1, $infilename2 ) = $self->_setinput($executable, $input1, $input2, $mask); if (!$infilename1) {$self->throw("$input1 not Bio::Seq object or array of Bio::Seq objects or file name!");} $self->i($infilename1); # set file name of sequence to be blasted to inputfilename1 (-i param of blastpgp) if ($input2) { unless ($infilename2) {$self->throw("$input2 not SimpleAlign Object in pre-aligned psiblast\n");} $self->B($infilename2); # set file name of partial alignment to inputfilename2 (-B param of blastpgp) } my $blast_report = $self->_generic_local_blast($executable); } =head2 rpsblast Title : rpsblast Usage : $blast_report = $factory->rpsblast('t/testquery.fa'); or $input = Bio::Seq->new(-id=>"test query", -seq=>"MVVLCRADDEEQQPPTCADEEQQQVVGG"); $blast_report = $factory->rpsblast($input); or $seq_array_ref = \@seq_array; # where @seq_array is an array of Bio::Seq objects $blast_report = $factory->rpsblast(\@seq_array); Args : Name of a file or Bio::Seq object or an array of Bio::Seq object containing the query sequence(s). Throws an exception if argument is not either a string (eg a filename) or a reference to a Bio::Seq object (or to an array of Seq objects). If argument is string, throws exception if file corresponding to string name can not be found. Returns : Reference to a Bio::SearchIO object containing the blast report =cut sub rpsblast { my ($self, $input1) = @_; $self->io->_io_cleanup(); my $executable = 'rpsblast'; # Create input file pointer my $infilename1 = $self->_setinput($executable, $input1) || $self->throw("$input1 not Bio::Seq object or array of Bio::Seq objects or file name!"); $self->i($infilename1); my $blast_report = $self->_generic_local_blast($executable); } =head2 bl2seq Title : bl2seq Usage : $factory-> bl2seq('t/seq1.fa', 't/seq2.fa'); or $input1 = Bio::Seq->new(-id=>"test query1", -seq=>"ACTADDEEQQPPTCADEEQQQVVGG"); $input2 = Bio::Seq->new(-id=>"test query2", -seq=>"ACTADDEMMMMMMMDEEQQQVVGG"); $blast_report = $factory->bl2seq ($input1, $input2); Returns : Reference to a BPbl2seq object containing the blast report. Args : Names of 2 files or 2 Bio::Seq objects containing the sequences to be aligned by bl2seq. Throws an exception if argument is not either a pair of strings (eg filenames) or references to Bio::Seq objects. If arguments are strings, throws exception if files corresponding to string names can not be found. =cut sub bl2seq { my $self = shift; my $executable = 'bl2seq'; my $input1 = shift; my $input2 = shift; # Create input file pointer my ($infilename1, $infilename2 ) = $self->_setinput($executable, $input1, $input2); if (!$infilename1){$self->throw(" $input1 not Seq Object or file name!");} if (!$infilename2){$self->throw("$input2 not Seq Object or file name!");} $self->i($infilename1); # set file name of first sequence to # be aligned to inputfilename1 # (-i param of bl2seq) $self->j($infilename2); # set file name of first sequence to # be aligned to inputfilename2 # (-j param of bl2seq) my $blast_report = $self->_generic_local_blast($executable); } =head2 _generic_local_blast Title : _generic_local_blast Usage : internal function not called directly Returns : Bio::SearchIO Args : Reference to calling object and name of BLAST executable =cut sub _generic_local_blast { my $self = shift; my $executable = shift; # Create parameter string to pass to Blast program my $param_string = $self->_setparams($executable); # run Blast my $blast_report = $self->_runblast($executable, $param_string); } =head2 _runblast Title : _runblast Usage : Internal function, not to be called directly Function: makes actual system call to Blast program Example : Returns : Report Bio::SearchIO object in the appropriate format Args : Reference to calling object, name of BLAST executable, and parameter string for executable =cut sub _runblast { my ($self, $executable, $param_string) = @_; my ($blast_obj, $exe); if (! ($exe = $self->executable($executable)) ) { $self->warn("cannot find path to $executable"); return; } # Use double quotes if executable path have empty spaces if ($exe =~ m/ /) { $exe = "\"$exe\""; } my $commandstring = $exe.$param_string; $self->debug("$commandstring\n"); system($commandstring) && $self->throw("$executable call crashed: $? | $! | $commandstring\n"); # set significance cutoff to set expectation value or default value # (may want to make this value vary for different executables) my $signif = $self->e() || 1e-5; # get outputfilename my $outfile = $self->o(); # this should allow any blast SearchIO parser (not just 'blast_pull' or 'blast', # but 'blastxml' and 'blasttable'). Fall back to 'blast' if not stipulated. my $method = $self->_READMETHOD; if ($method =~ /^(?:blast|SearchIO)/i ) { $method = 'blast' if $method =~ m{SearchIO}i; $blast_obj = Bio::SearchIO->new(-file => $outfile, -format => $method); } # should these be here? They have been deprecated... elsif ($method =~ /BPlite/i ) { if ($executable =~ /bl2seq/i) { # Added program info so BPbl2seq can compute strand info $self->throw("Use of Bio::Tools::BPbl2seq is deprecated; use Bio::SearchIO modules instead"); } elsif ($executable =~ /blastpgp/i && defined $self->j() && $self->j() > 1) { $self->throw("Use of Bio::Tools::BPpsilite is deprecated; use Bio::SearchIO modules instead"); } elsif ($executable =~ /blastall|rpsblast/i) { $self->throw("Use of Bio::Tools::BPlite is deprecated; use Bio::SearchIO modules instead"); } else { $self->warn("Unrecognized executable $executable"); } } else { $self->warn("Unrecognized readmethod $method"); } return $blast_obj; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for Blast program Example : Returns : parameter string to be passed to Blast Args : Reference to calling object and name of BLAST executable =cut sub _setparams { my ($self, $executable) = @_; my ($attr, $value, @execparams); if ($executable eq 'blastall') { @execparams = (@BLASTALL_PARAMS, @BLASTALL_SWITCH); } elsif ($executable eq 'blastpgp') { @execparams = @BLASTPGP_PARAMS; } elsif ($executable eq 'rpsblast') { @execparams = @RPSBLAST_PARAMS; } elsif ($executable eq 'bl2seq' ) { @execparams = @BL2SEQ_PARAMS; } # we also have all the general params push(@execparams, keys %GENERAL_PARAMS); my $database = $self->d; if ($database && $executable ne 'bl2seq') { # Need to prepend datadirectory to database name my @dbs = split(/ /, $database); for my $i (0..$#dbs) { # (works with multiple databases) if (! (-e $dbs[$i].".nin" || -e $dbs[$i].".pin") && ! (-e $dbs[$i].".nal" || -e $dbs[$i].".pal") ) { $dbs[$i] = File::Spec->catdir($DATADIR, $dbs[$i]); } } $self->d('"'.join(" ", @dbs).'"'); } # workaround for problems with shell metacharacters [bug 2707] # simply quoting does not always work! my $tmp = $self->o; $self->o(quotemeta($tmp)) if ($tmp && $^O !~ /^MSWin/); my $param_string = $self->SUPER::_setparams(-params => [@execparams], -dash => 1); $self->o($tmp) if ($tmp && $^O !~ /^MSWin/); $self->d($database) if $database; if ($self->quiet()) { $param_string .= ' 2> '.File::Spec->devnull; } return $param_string; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/StandAloneWUBlast.pm000066400000000000000000000221661342734133000243560ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::StandAloneBlast # # Copyright Peter Schattner # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::StandAloneWUBlast - Object for the local execution of WU-Blast. =head1 SYNOPSIS # Do not use directly; use Bio::Tools::Run::StandAloneBlast =head1 DESCRIPTION See Bio::Tools::Run::StandAloneBlast =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: https://github.com/bioperl/bioperl-live/issues =head1 AUTHOR - Peter Schattner Email schattner at alum.mit.edu =head1 MAINTAINER - Torsten Seemann Email torsten at infotech.monash.edu.au =head1 CONTRIBUTORS Sendu Bala bix@sendu.me.uk (reimplementation) =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::StandAloneWUBlast; use strict; use base qw(Bio::Tools::Run::StandAloneBlast); our $AUTOLOAD; our $DEFAULTREADMETHOD = 'BLAST'; # If local BLAST databases are not stored in the standard # /data directory, the variable BLASTDATADIR will need to be # set explicitly our $DATADIR = $Bio::Tools::Run::StandAloneBlast::DATADIR; our %GENERAL_PARAMS = (i => 'input', o => 'outfile', p => 'program', d => 'database'); our @WUBLAST_PARAMS = qw(e s e2 s2 w t x m y z l k h v b q r matrix filter wordmask filter maskextra hitdist wink ctxfactor gape gaps gape2 gaps2 gapw gapx olf golf olmax golmax gapdecayrate topcombon topcomboe sumstatsmethod hspsepqmax hspsepsmax gapsepqmax gapsepsmax altscore hspmax gspmax qoffset nwstart nwlen qrecmin qrecmax dbrecmin dbrecmax vdbdescmax dbchunks sort_by_pvalue cpus putenv getenv progress); our @WUBLAST_SWITCH = qw(kap sump poissonp lcfilter lcmask echofilter stats nogap gapall pingpong nosegs postsw span2 span1 span prune consistency links ucdb gi noseqs qtype qres sort_by_pvalue sort_by_count sort_by_highscore sort_by_totalscore sort_by_subjectlength mmio nonnegok novalidctxok shortqueryok notes warnings errors endputenv getenv endgetenv abortonerror abortonfatal); our @OTHER_PARAMS = qw(_READMETHOD); =head2 new Title : new Usage : my $obj = Bio::Tools::Run::StandAloneBlast->new(); Function: Builds a newBio::Tools::Run::StandAloneBlast object Returns : Bio::Tools::Run::StandAloneBlast Args : -quiet => boolean # make program execution quiet -_READMETHOD => 'BLAST' (default, synonym 'SearchIO') || 'blast_pull' # the parsing method, case insensitive Essentially all BLAST parameters can be set via StandAloneBlast.pm. Some of the most commonly used parameters are listed below. All parameters have defaults and are optional except for -p. -p Program Name [String] Input should be one of "wublastp", "wublastn", "wublastx", "wutblastn", or "wutblastx". -d Database [String] default = nr The database specified must first be formatted with xdformat. -E Expectation value (E) [Real] default = 10.0 -o BLAST report Output File [File Out] Optional, default = ./blastreport.out ; set by StandAloneBlast.pm =cut sub new { my ($caller, @args) = @_; my $self = $caller->SUPER::new(@args); $self->_set_from_args(\@args, -methods => {(map { $_ => $GENERAL_PARAMS{$_} } keys %GENERAL_PARAMS), (map { $_ => $_ } (@OTHER_PARAMS, @WUBLAST_PARAMS, @WUBLAST_SWITCH))}, -create => 1, -force => 1); my ($tfh, $tempfile) = $self->io->tempfile(); my $outfile = $self->o || $self->outfile || $tempfile; $self->o($outfile); close($tfh); $self->_READMETHOD($DEFAULTREADMETHOD) unless $self->_READMETHOD; return $self; } # We let get/setter method names be case-insensitve sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; my $orig = $attr; $attr = lc($attr); $self->can($attr) || $self->throw("Unallowed parameter: $orig !"); return $self->$attr(@_); } =head2 wublast Title : wublast Usage : $blast_report = $factory->wublast('t/testquery.fa'); or $input = Bio::Seq->new(-id=>"test query", -seq=>"ACTACCCTTTAAATCAGTGGGGG"); $blast_report = $factory->wublast($input); or $seq_array_ref = \@seq_array; # where @seq_array is an array of Bio::Seq objects $blast_report = $factory->wublast(\@seq_array); Returns : Reference to a Blast object Args : Name of a file or Bio::Seq object or an array of Bio::Seq object containing the query sequence(s). Throws an exception if argument is not either a string (eg a filename) or a reference to a Bio::Seq object (or to an array of Seq objects). If argument is string, throws exception if file corresponding to string name can not be found. =cut sub wublast { my ($self, $input1) = @_; $self->io->_io_cleanup(); my $executable = 'wublast'; # Create input file pointer my $infilename1 = $self->_setinput($executable, $input1) || $self->throw("$input1 not Bio::Seq object or array of Bio::Seq objects or file name!"); $self->i($infilename1); my $blast_report = $self->_generic_local_wublast($executable); } =head2 _generic_local_wublast Title : _generic_local_wublast Usage : internal function not called directly Returns : Blast object Args : Reference to calling object and name of BLAST executable =cut sub _generic_local_wublast { my $self = shift; my $executable = shift; # Create parameter string to pass to Blast program my $param_string = $self->_setparams($executable); $param_string = " ".$self->database." ".$self->input." ".$param_string; # run Blast my $blast_report = $self->_runwublast($executable, $param_string); } =head2 _runwublast Title : _runwublast Usage : Internal function, not to be called directly Function: makes actual system call to WU-Blast program Example : Returns : Report Blast object Args : Reference to calling object, name of BLAST executable, and parameter string for executable =cut sub _runwublast { my ($self, $executable, $param_string) = @_; my ($blast_obj, $exe); if (! ($exe = $self->executable($self->p))){ $self->warn("cannot find path to $executable"); return; } # Use double quotes if executable path have empty spaces if ($exe =~ m/ /) { $exe = "\"$exe\""; } my $commandstring = $exe.$param_string; $self->debug("$commandstring\n"); system($commandstring) && $self->throw("$executable call crashed: $? | $! | $commandstring\n"); # get outputfilename my $outfile = $self->o(); $blast_obj = Bio::SearchIO->new(-file => $outfile, -format => 'blast'); return $blast_obj; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for Blast program Example : Returns : parameter string to be passed to Blast Args : Reference to calling object and name of BLAST executable =cut sub _setparams { my ($self, $executable) = @_; my ($attr, $value, @execparams); @execparams = @WUBLAST_PARAMS; # of the general params, wublast only takes outfile at # this stage (we add in program, input and database manually elsewhere) push(@execparams, 'o'); # workaround for problems with shell metacharacters [bug 2707] # simply quoting does not always work! # Fixed so Windows files are not quotemeta'd my $tmp = $self->o; $self->o(quotemeta($tmp)) if ($tmp && $^O !~ /^MSWin/); my $param_string = $self->SUPER::_setparams(-params => [@execparams], -switches => \@WUBLAST_SWITCH, -dash => 1); $self->o($tmp) if ($tmp && $^O !~ /^MSWin/); if ($self->quiet()) { $param_string .= ' 2> '.File::Spec->devnull; } return $param_string; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Tmhmm.pm000077500000000000000000000173611342734133000221520ustar00rootroot00000000000000# # # Copyright Balamurugan Kumarasamy # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code # =head1 NAME Bio::Tools::Run::Tmhmm - Object for identifying transmembrane helixes in a given protein seequence. =head1 SYNOPSIS # Build a Tmhmm factory # $paramfile is the full path to the seg binary file my @params = ('PROGRAM',$paramfile); my $factory = Bio::Tools::Run::Tmhmm->new($param); # Pass the factory a Bio::Seq object # @feats is an array of Bio::SeqFeature::Generic objects my @feats = $factory->run($seq); =head1 DESCRIPTION Tmhmm is a program for identifying transmembrane helices in proteins. You must have the environmental variable TMHMMDIR set to the base directory where I and it's associated data/option files reside (NOT the bin directory where the actual executable resides) =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Bala Email savikalpa@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Tmhmm; use vars qw($AUTOLOAD @ISA $PROGRAMNAME @TMHMM_PARAMS %OK_FIELD); use strict; use Cwd; use Bio::SeqIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Tmhmm; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Tools::Run::WrapperBase); BEGIN { $PROGRAMNAME = 'tmhmm' . ($^O =~ /mswin/i ?'.exe':''); @TMHMM_PARAMS=qw(PROGRAM VERBOSE NOPLOT); foreach my $attr ( @TMHMM_PARAMS) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAMNAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable, in this case it is the tmhmm installation directory, not the location of the executable. Returns: string Args : =cut sub program_dir { return $ENV{TMHMMDIR} || ''; } =head2 program_path Title : program_path Usage : my $path = $factory->program_path(); Function: Builds path for executable Returns : string representing the full path to the exe Args : none =cut sub program_path { my ($self) = @_; my @path; if ($self->program_dir) { my $program_dir = $self->program_dir; $program_dir =~ s/\/bin//; push @path, $program_dir; } push @path, 'bin'; push @path, $self->program_name.($^O =~ /mswin/i ?'.exe':''); return File::Spec->catfile(@path); } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new Title : new Usage : $rm->new(@params) Function: creates a new Tmhmm factory Returns: Bio::Tools::Run::Tmhmm Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters $self->$attr($value); } return $self; } =head2 predict_protein_features Title : predict_protein_features() Usage : DEPRECATED Use $obj->run($seq) instead Function: Runs Tmhmm and creates an array of featrues Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI =cut sub predict_protein_features{ return shift->run(@_); } =head2 executable Title : executable Usage : my $exe = $tmhmm->executable('tmhmm'); Function: Finds the full path to the 'tmhmm' executable Returns : string representing the full path to the exe Args : [optional] name of executable to set path to [optional] boolean flag whether or not warn when exe is not found =cut sub executable { my $self = shift; my $exe = $self->SUPER::executable(@_) || return; # even if its executable, we still need the environment variable to have # been set if (! $ENV{TMHMMDIR}) { $self->warn("Environment variable TMHMMDIR must be set, even if the tmhmm executable is in your path"); return undef; } return $exe; } =head2 run Title : run() Usage : $obj->run($seq) Function: Runs Tmhmm and creates an array of featrues Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI =cut sub run { my ($self,$seq) = @_; my @feats; if (ref($seq) ) { # it is an object if (ref($seq) =~ /GLOB/) { $self->throw("cannot use filehandle"); } my $infile1 = $self->_writeSeqFile($seq); $self->_input($infile1); @feats = $self->_run(); unlink $infile1; } else { # The clone object is not a seq object but a file. Perhaps # should check here or before if this file is fasta format...if # not die Here the file does not need to be created or # deleted. Its already written and may be used by other # runnables. $self->_input($seq); @feats = $self->_run(); } return @feats; } =head2 _input Title : _input Usage : obj->_input($seqFile) Function: Internal(not to be used directly) Returns : Args : =cut sub _input() { my ($self,$infile1) = @_; if (defined $infile1){ $self->{'input'}=$infile1; } return $self->{'input'}; } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : An array of Bio::SeqFeature::Generic objects Args : =cut sub _run { my ($self)= @_; my ($tfh1,$outfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $str = $self->executable || return; if( $self->NOPLOT ) { $str .= " --noplot"; } $str .= " -basedir=".$self->program_dir." -workdir=".$self->tempdir()." ".$self->_input." > ".$outfile; my $status = system($str); $self->throw( "Tmhmm call ($str) crashed: $? \n") unless $status==0; my $filehandle; if (ref ($outfile) !~ /GLOB/) { open (TMHMM, "<".$outfile) or $self->throw ("Couldn't open file ".$outfile.": $!\n"); $filehandle = \*TMHMM; } else { $filehandle = $outfile; } my $tmhmm_parser = Bio::Tools::Tmhmm->new(-fh=>$filehandle); my @tmhmm_feat; while(my $tmhmm_feat = $tmhmm_parser->next_result){ push @tmhmm_feat, $tmhmm_feat; } # free resources $self->cleanup(); unlink $outfile; close($tfh1); undef $tfh1; return @tmhmm_feat; } =head2 _writeSeqFile Title : _writeSeqFile Usage : obj->_writeSeqFile($seq) Function: Internal(not to be used directly) Returns : Args : =cut sub _writeSeqFile{ my ($self,$seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'Fasta'); $in->write_seq($seq); close($tfh); undef $tfh; return $inputfile; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/TribeMCL.pm000066400000000000000000000701061342734133000224620ustar00rootroot00000000000000# BioPerl module for TribeMCL # # Please direct questions and support issues to # # Cared for by Shawn Hoon # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::TribeMCL =head1 SYNOPSIS use Bio::Tools::Run::TribeMCL; use Bio::SearchIO; # 3 methods to input the blast results # straight forward raw blast output (NCBI or WU-BLAST) my @params = ('inputtype'=>'blastfile'); # OR # markov program format # protein_id1 protein_id2 evalue_magnitude evalue_factor # for example: # proteins ENSP00000257547 and ENSP00000261659 # with a blast score evalue of 1e-50 # and proteins O42187 and ENSP00000257547 # with a blast score evalue of 1e-119 # entry would be my @array = [[qw(ENSP00000257547 ENSP00000261659 1 50)], [qw(O42187 ENSP00000257547 1 119)]]; my @params = ('pairs'=>\@array,I=>'2.0'); # OR # pass in a searchio object # slowest of the 3 methods as it does more rigourous parsing # than required for us here my $sio = Bio::SearchIO->new(-format=>'blast', -file=>'blast.out'); my @params=('inputtype'=>'searchio',I=>'2.0'); # you can specify the path to the executable manually in the following way my @params=('inputtype'=>'blastfile',I=>'2.0', 'mcl'=>'/home/shawn/software/mcl-02-150/src/shmcl/mcl', 'matrix'=>'/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix'); my $fact = Bio::Tools::Run::TribeMCL->new(@params); # OR $fact->matrix_executable('/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix'); $fact->mcl_executable('/home/shawn/software/mcl-02-150/src/shmcl/mcl'); # to run my $fact = Bio::Tools::Run::TribeMCL->new(@params); # Run the program # returns an array reference to clusters where members are the ids # for example :2 clusters with 3 members per cluster: # $fam = [ [mem1 mem2 mem3],[mem1 mem2 mem3]] # pass in either the blastfile path/searchio obj/the array ref to scores my $fam = $fact->run($sio); # print out your clusters for (my $i = 0; $i [$i]})." members\n"; foreach my $member (@{$fam->[$i]}){ print "\t$member\n"; } } =head1 DESCRIPTION TribeMCL is a method for clustering proteins into related groups, which are termed 'protein families'. This clustering is achieved by analysing similarity patterns between proteins in a given dataset, and using these patterns to assign proteins into related groups. In many cases, proteins in the same protein family will have similar functional properties. TribeMCL uses a novel clustering method (Markov Clustering or MCL) which solves problems which normally hinder protein sequence clustering. Reference: Enright A.J., Van Dongen S., Ouzounis C.A; Nucleic Acids Res. 30(7):1575-1584 (2002) You will need tribe-matrix (the program used to generate the matrix for input into mcl) and mcl (the clustering software) available at: http://www.ebi.ac.uk/research/cgg/tribe/ or http://micans.org/mcl/. Future Work in this module: Port the tribe-matrix program into perl so that we can enable have a SearchIO kinda module for reading and writing mcl data format =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a "_". =cut # Let the code begin... package Bio::Tools::Run::TribeMCL; use vars qw($AUTOLOAD @ISA $PROGRAMDIR @TRIBEMCL_PARAMS @MATRIXPROGRAM_PARAMS @MCLPROGRAM_PARAMS @OTHER_SWITCHES %OK_FIELD $MATRIXPROGRAM_NAME $MCLPROGRAM_NAME $MCLPROGRAM $MATRIXPROGRAM ); use strict; use Bio::SeqIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Cluster::SequenceFamily; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Run::WrapperBase; use Bio::Annotation::DBLink; use Bio::Seq; use Bio::Species; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); # You will need to enable mcl and tribe-matrix to use this wrapper. This # can be done in (at least) two ways: # # 1. define an environmental variable TRIBEDIR # export TRIBEDIR =/usr/local/share/mclbin/ # where the tribe-matrix and mcl programs are located. #you probably need to copy them individually to the same directory #if the installation puts them in different directories. or simply put them in #your standard /usr/local/bin # # 2. include a definition of an environmental variable TRIBEDIR in # every script that will use TRIBEMCL.pm # $ENV{TRIBEDIR} = '/usr/local/share/mclbin/; # #3. Manually set the path to the executabes in your code: # #my @params=('inputtype'=>'blastfile',I=>'2.0',' # mcl'=>'/home/shawn/software/mcl-02-150/src/shmcl/mcl',' # matrix'=>'/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix'); #my $fact = Bio::Tools::Run::TribeMCL->new(@params); #or #$fact->matrix_executable('/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix'); #$fact->mcl_executable('/home/shawn/software/mcl-02-150/src/shmcl/mcl'); BEGIN { $MATRIXPROGRAM_NAME = 'tribe-matrix'; $MCLPROGRAM_NAME = 'mcl'; if (defined $ENV{TRIBEDIR}) { $PROGRAMDIR = $ENV{TRIBEDIR} || ''; $MCLPROGRAM = Bio::Root::IO->catfile($PROGRAMDIR,$MCLPROGRAM_NAME.($^O =~ /mswin/i ?'.exe':'')); $MATRIXPROGRAM = Bio::Root::IO->catfile($PROGRAMDIR,$MATRIXPROGRAM_NAME.($^O =~ /mswin/i ?'.exe':'')); } @TRIBEMCL_PARAMS = qw(inputtype hsp hit scorefile blastfile description_file searchio pairs mcl matrix weight description family_tag use_db); @MATRIXPROGRAM_PARAMS = qw(ind out chunk); @MCLPROGRAM_PARAMS = qw(I t P R pct o); @OTHER_SWITCHES = qw(verbose quiet); # Authorize attribute fields foreach my $attr (@TRIBEMCL_PARAMS, @MATRIXPROGRAM_PARAMS, @MCLPROGRAM_PARAMS, @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/MCL/i) { $self->mcl_executable($value); next; } if ($attr =~ /MATRIX/i){ $self->matrix_executable($value); next; } $self->$attr($value); } defined($self->weight) || $self->weight(200); return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 mcl_executable Title : mcl_executable Usage : $self->mcl_executable() Function: get set for path to mcl executable Returns : String or undef if not installed Args : [optional] string of path to executable [optional] boolean to warn on missing executable status =cut sub mcl_executable{ my ($self,$exe,$warn) = @_; if( defined $exe ) { $self->{'_mcl_exe'} = $exe; } unless( defined $self->{'_mcl_exe'} ) { # this would be the name set in the BEGIN block if( $MCLPROGRAM && -e $MCLPROGRAM && -x $MCLPROGRAM ) { $self->{'_mcl_exe'} = $MCLPROGRAM; } else { my $exe; if( ( $exe = $self->io->exists_exe($MCLPROGRAM_NAME) ) && -x $exe ) { $self->{'_mcl_exe'} = $exe; } else { $self->warn("Cannot find executable for $MCLPROGRAM_NAME") if $warn; $self->{'_mcl_exe'} = undef; } } } $self->{'_mcl_exe'}; } =head2 matrix_executable Title : matrix_executable Usage : $self->matrix_executable() Function: get set for path to tribe-matrix executable Returns : String or undef if not installed Args : [optional] string of path to executable [optional] boolean to warn on missing executable status =cut sub matrix_executable{ my ($self,$exe,$warn) = @_; if( defined $exe ) { $self->{'_matrix_exe'} = $exe; } unless( defined $self->{'_matrix_exe'} ) { # this would be the name set in the BEGIN block if( $MATRIXPROGRAM && -e $MATRIXPROGRAM && -x $MATRIXPROGRAM ) { $self->{'_matrix_exe'} = $MATRIXPROGRAM; } else { my $exe; if( ( $exe = $self->io->exists_exe($MATRIXPROGRAM_NAME) ) && -x $exe ) { $self->{'_matrix_exe'} = $exe; } else { $self->warn("Cannot find executable for $MATRIXPROGRAM_NAME") if $warn; $self->{'_matrix_exe'} = undef; } } } $self->{'_matrix_exe'}; } =head2 run Title : run Usage : $self->run() Function: runs the clustering Returns : Array Ref of clustered Ids Args : =cut sub run { my ($self,$input) = @_; if($self->description_file){ $self->_setup_description($self->description_file); } my $file = $self->_setup_input($input); defined($file) || $self->throw("Error setting up input "); #run tribe-matrix to generate matrix for mcl my ($index_file, $mcl_infile) = $self->_run_matrix($file); $self->throw("tribe-matrix not run properly as index file is missing") unless (-e $index_file); $self->throw("tribe-matrix not run properly as matrix file is missing") unless (-e $mcl_infile); #run mcl my $clusters = $self->_run_mcl($index_file,$mcl_infile); my $families; if($self->description){ my %consensus = $self->_consensifier($clusters); $families = $self->_generate_families($clusters,\%consensus); } else { $families = $self->_generate_families($clusters); } return @{$families}; } sub _generate_families { my ($self,$clusters,$consensus) = @_; my $family_tag = $self->family_tag || "TribeFamily"; my @fam; if($consensus){ my %description = %{$self->description}; my %consensus = %{$consensus}; for(my $i = 0; $i < scalar(@{$clusters}); $i++){ my @mem; foreach my $member (@{$clusters->[$i]}){ my $mem = Bio::Seq->new(-id=>$member, -alphabet=>"protein", -desc=>$description{$member}->[1]); my $annot = Bio::Annotation::DBLink->new(-database=>$description{$member}->[0], -primary_id=>$member); $mem->annotation->add_Annotation('dblink',$annot); #store species information my $taxon_str = $description{$member}->[2]; #parse taxon info into hash my %taxon; $taxon_str=~s/=;/=undef;/g if $taxon_str; %taxon = map{split '=',$_}split';',$taxon_str if $taxon_str; my $name = $taxon{'taxon_common_name'}; my @classification = $taxon{'taxon_classification'} ? split(':',$taxon{'taxon_classification'}) : (); my $tax_id = $taxon{'taxon_id'}; my $sub_species = $taxon{'taxon_sub_species'}; my $species = Bio::Species->new(); $species->common_name($name) if $name; #*** should this actually be scientific_name() ? $species->sub_species($sub_species) if $sub_species; $species->ncbi_taxid($tax_id) if $tax_id; $species->classification(@classification) if @classification; $mem->species($species); push @mem, $mem; } my $id = $family_tag."_".$i; my $fam = Bio::Cluster::SequenceFamily->new(-family_id=>$id, -description=>$consensus{$i}{desc}, -annotation_score=>$consensus{$i}{conf}, -members=>\@mem); push @fam, $fam; } return \@fam; } else { for(my $i = 0; $i < scalar(@{$clusters}); $i++){ my @mem; foreach my $member (@{$clusters->[$i]}){ my $mem = Bio::Seq->new(-id=>$member, -alphabet=>"protein"); push @mem, $mem; } my $id = $family_tag."_".$i; my $fam = Bio::Cluster::SequenceFamily->new(-family_id=>$id, -members=>\@mem); push @fam, $fam; } return \@fam; } } sub _consensifier { my ($self,$clusters) = @_; eval { require "Algorithm/Diff.pm"; }; if($@){ $self->warn("Algorith::Diff is needed to run TribeMCL"); return undef; } my %description = %{$self->description}; my %consensus; my $best_annotation; my %use_db; if($self->use_db){ foreach my $key(split(',',$self->use_db)){ $use_db{$key}++; } } CLUSTER: for(my $i = 0; $i < scalar(@{$clusters}); $i++){ my @desc; my @orig_desc; my $total_members = scalar(@{$clusters->[$i]}); foreach my $member(@{$clusters->[$i]}){ #if specify which dbs to use for consensifying if($self->use_db){ if($use_db{$description{$member}->[0]}){ push @desc, $description{$member}->[1] if $description{$member}->[1]; push @orig_desc, $description{$member}->[1] if $description{$member}->[1]; } } else { push @desc, $description{$member}->[1] if $description{$member}->[1]; push @orig_desc, $description{$member}->[1] if $description{$member}->[1]; } } if($#desc < 0){ #truly unknown $consensus{$i}{desc} = "UNKNOWN"; $consensus{$i}{conf} = 0; next CLUSTER; } if($#desc == 0){#only a single description $consensus{$i}{desc} = grep(/S+/,@desc); $consensus{$i}{desc} = $consensus{$i}{desc} || "UNKNOWN"; if ($consensus{$i}{desc} eq "UNKNOWN") { $consensus{$i}{conf} = 0; } else { $consensus{$i}{conf} = 100 * int(1/$total_members); } next CLUSTER; } #all the same desc my %desc = (); foreach my $desc (@desc) { $desc{$desc}++; } if ( (keys %desc) == 1 ) { my ($best_annotation,) = keys %desc; my $n = grep($_ eq $best_annotation, @desc); my $perc= int( 100*($n/$total_members) ); $consensus{$i}{desc} = $best_annotation; $consensus{$i}{conf} = $perc; next CLUSTER; } my %lcshash = (); my %lcnext = (); while (@desc) { # do an all-against-all LCS (longest commong substring) of the # descriptions of all members; take the resulting strings, and # again do an all-against-all LCS on them, until we have nothing # left. The LCS's found along the way are in lcshash. # # Incidentally, longest common substring is a misnomer, since it # is not guaranteed to occur in either of the original strings. It # is more like the common parts of a Unix diff ... for (my $i=0;$i<@desc;$i++) { for (my $j=$i+1;$j<@desc;$j++){ my @list1=split(" ",$desc[$i]); my @list2=split(" ",$desc[$j]); my @lcs=Algorithm::Diff::LCS(\@list1,\@list2); my $lcs=join(" ",@lcs); $lcshash{$lcs}=1; $lcnext{$lcs}=1; } } @desc=keys(%lcnext); undef %lcnext; } my ($best_score, $best_perc)=(0, 0); my @all_cands=sort {length($b) <=>length($a)} keys %lcshash ; foreach my $candidate_consensus (@all_cands) { my @temp=split(" ",$candidate_consensus); my $length=@temp; # num of words in annotation # see how many members of cluster contain this LCS: my ($lcs_count)=0; foreach my $orig_desc (@orig_desc) { my @list1=split(" ",$candidate_consensus); my @list2=split(" ",$orig_desc); my @lcs=Algorithm::Diff::LCS(\@list1,\@list2); my $lcs=join(" ",@lcs); if ($lcs eq $candidate_consensus || index($orig_desc,$candidate_consensus) != -1 # addition; # many good (single word) annotations fall out otherwise ) { $lcs_count++; # Following is occurs frequently, as LCS is _not_ the longest # common substring ... so we can't use the shortcut either # if ( index($orig_desc,$candidate_consensus) == -1 ) { # warn "lcs:'$lcs' eq cons:'$candidate_consensus' and # orig:'$orig_desc', but index == -1\n" # } } } my $perc_with_desc=(($lcs_count/$total_members))*100; my $perc=($lcs_count/$total_members)*100; my $score=$perc + ($length*14); # take length into account as well $score = 0 if $length==0; if (($perc_with_desc >= 40) && ($length >= 1)) { if ($score > $best_score) { $best_score=$score; $best_perc=$perc; $best_annotation=$candidate_consensus; } } } if ($best_perc==0 || $best_perc >= 100 ) { $best_perc='NA'; } if ($best_annotation eq '') { $best_annotation = 'AMBIGUOUS'; } $consensus{$i}{desc} = $best_annotation; $consensus{$i}{conf} = $best_perc; } return %consensus; } sub _setup_description { my ($self,$file) = @_; my $goners='().-'; my $spaces= ' ' x length($goners); my $filter = "tr '$goners' '$spaces' < $file"; open (FILE,"$filter | ") || die "$filter: $!"; my %description; while(){ chomp; my ($db,$acc,$description,$taxon_str) = split("\t",$_); $description || $self->throw("Wrongly formated description file"); $description = $self->_apply_edits($description); if($description{$acc}){ $self->warn("Duplicated entry $acc in description file, overwriting.."); } $description{$acc} = [$db,$description,$taxon_str]; } $self->description(\%description); } sub as_words { #add ^ and $ to regexp my (@words); my @newwords=(); foreach my $word (@words) { push @newwords, "^$word\$" }; } sub _apply_edits { my ($self,$desc) = @_; my @deletes = ( 'FOR\$', 'SIMILAR TO\$', 'SIMILAR TO PROTEIN\$', 'RIKEN.*FULL.*LENGTH.*ENRICHED.*LIBRARY', '\w*\d{4,}','HYPOTHETICAL PROTEIN' ); my @newwords = &as_words(qw(NOVEL PUTATIVE PREDICTED UNNAMED UNNMAED ORF CLONE MRNA CDNA EST RIKEN FIS KIAA\d+ \S+RIK IMAGE HSPC\d+ FOR HYPOTETICAL HYPOTHETICAL)); push @deletes, @newwords; foreach my $re ( @deletes ) { $desc=~s/$re//g; } #Apply some fixes to the annotation: $desc=~s/EC (\d+) (\d+) (\d+) (\d+)/EC $1.$2.$3.$4/; $desc=~s/EC (\d+) (\d+) (\d+)/EC $1.$2.$3.-/; $desc=~s/EC (\d+) (\d+)/EC $1\.$2.-.-/; $desc=~s/(\d+) (\d+) KDA/$1.$2 KDA/; return $desc; } =head2 _run_mcl Title : _run_mcl Usage : $self->_run_mcl() Function: internal function for running the mcl program Returns : Array Ref of clustered Ids Args : Index_file name, matrix input file name =cut sub _run_mcl { my ($self,$ind_file,$infile) = @_; my $exe = $self->mcl_executable || $self->throw("mcl not found."); my $cmd = $exe . " $infile"; unless (defined $self->o) { my ($fh,$o) = $self->io->tempfile(-dir=>$self->tempdir); $self->o($o); # file handle not use later so deleted close($fh); undef $fh; } unless (defined $self->I) { $self->I(3.0); } foreach my $param (@MCLPROGRAM_PARAMS) { if (defined $self->$param) { $cmd .= " -$param ".$self->$param; } } if($self->quiet || ($self->verbose < 0)){ $cmd .= " -V all"; if( $^O !~ /Mac/) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $cmd .= " 2> $null"; } } my $status = system($cmd); $self->throw( "mcl call ($cmd) crashed: $? \n") unless $status==0; my $families = $self->_parse_mcl($ind_file,$self->o); return $families; } =head2 _run_matrix Title : _run_matrix Usage : $self->_run_matrix() Function: internal function for running the tribe-matrix program Returns : index filepath and matrix file path Args : filepath of parsed ids and scores =cut sub _run_matrix { my ($self,$parse_file) = @_; my $exe = $self->matrix_executable || $self->throw("tribe-matrix not found."); my $cmd = $exe . " $parse_file"; unless (defined $self->ind) { my ($fh,$indexfile) = $self->io->tempfile(-dir=>$self->tempdir); $self->ind($indexfile); # file handle not use later so deleted close($fh); undef $fh; } unless (defined $self->out) { my ($fh,$matrixfile) = $self->io->tempfile(-dir=>$self->tempdir); $self->out($matrixfile); # file handle not use later so deleted close($fh); undef $fh; } foreach my $param (@MATRIXPROGRAM_PARAMS) { if (defined $self->$param) { $cmd .= " -$param ".$self->$param; } } my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $cmd .= " > $null"; my $status = system($cmd); $self->throw( "tribe-matrix call ($cmd) crashed: $? \n") unless $status==0; return ($self->ind,$self->out); } =head2 _setup_input Title : _setup_input Usage : $self->_setup_input() Function: internal function for running setting up the inputs needed for running mcl Returns : filepath of parsed ids and scores Args : =cut sub _setup_input { my ($self,$input) = @_; my ($type,$rc); my ($tfh,$outfile) = $self->io->tempfile(-dir=>$self->tempdir); $type = $self->inputtype(); if($type=~/scorefile/i){ -e $self->scorefile || $self->throw("Scorefile doesn't seem to exist or accessible"); return $self->scorefile; } if($type =~/blastfile/i){ $self->blastfile($input); $rc = $self->_parse_blastfile($self->blastfile,$tfh); } elsif($type=~/searchio/i){ $self->searchio($input); $rc = $self->_get_from_searchio($self->searchio,$tfh); } elsif($type=~/pairs/i) { $self->pairs($input); for my $line (@{ $self->pairs }){ print $tfh join("\t",@{$line}), "\n"; $rc++; } } elsif($type =~/hsp/i) { $self->hsp($input); $rc = $self->_get_from_hsp($self->hsp,$tfh); } elsif($type=~/hit/i){ $self->hit($input); $rc = $self->_get_from_hit($self->hit,$tfh); } else { $self->throw("Must set inputtype to either blastfile,searchio or ". "paris using \$fact->blastfile |\$fact->searchio| \$fact->pairs"); } unless ( $rc ) { $self->throw("Need inputs for running tribe mcl, nothing provided"); } close($tfh); $tfh= undef; return $outfile; } =head2 _get_from_hsp Title : _get_from_hsp Usage : $self->_get_from_hsp() Function: internal function for getting blast scores from hsp Returns : array ref to ids and score [protein1 protein2 magnitude factor] Args : L =cut sub _get_from_hsp { my ($self,$hsp,$tfh) = @_; my @array; my $count; foreach my $pair (@{$hsp}){ my $sig = $pair->score; $sig =~ s/^e-/1e-/g; my $expect=sprintf("%e",$sig); if ($expect==0){ my $wt = $self->weight; $expect=sprintf("%e","1e-$wt"); } my $first=(split("e-",$expect))[0]; my $second=(split("e-",$expect))[1]; print $tfh join("\t", $pair->feature1->seq_id, $pair->feature2->seq_id,int($first), int($second) ), "\n"; $count++; } return $count; } sub _get_from_hit { my ($self,$hit,$tfh) = @_; my $count; foreach my $pair(@{$hit}){ my $sig = $pair->raw_score; $sig =~s/^e-/1e-/g; my $expect = sprintf("%e",$sig); if ($expect==0){ my $wt = $self->weight; $expect=sprintf("%e","1e-$wt"); } my $first=(split("e-",$expect))[0]; my $second=(split("e-",$expect))[1]; print $tfh join("\t",$pair->name,$pair->description,int($first),int($second)),"\n"; $count++; } return $count; } =head2 _get_from_searchio Title : _get_from_searchio Usage : $self->_get_from_searchio() Function: internal function for parsing blast scores from searchio object Returns : array ref to ids and score [protein1 protein2 magnitude factor] Args : L =cut sub _get_from_searchio { my ($self,$sio,$tfh) = @_; my @array; my $count; while( my $result = $sio->next_result ) { while( my $hit = $result->next_hit ) { while( my $hsp = $hit->next_hsp ) { my $sig = $hsp->evalue; $sig =~ s/^e-/1e-/g; my $expect=sprintf("%e",$sig); if ($expect==0){ my $wt = $self->weight; $expect=sprintf("%e","1e-$wt"); } my $first=(split("e-",$expect))[0]; my $second=(split("e-",$expect))[1]; print $tfh join("\t", $hsp->feature1->seq_id, $hsp->feature2->seq_id, int($first), int($second) ), "\n"; $count++; } } } return $count; } =head2 _parse_blastfile Title : _parse_blastfile Usage : $self->_parse_blastfile() Function: internal function for quickly parsing blast evalue scores from raw blast output file Returns : array ref to ids and score [protein1 protein2 magnitude factor] Args : file path =cut sub _parse_blastfile { my ($self, $file,$tfh) = @_; open(FILE,$file) || $self->throw("Cannot open Blast Output File"); my ($query,$reference,$first,$second); my @array; my $count; my $weight = $self->weight; while(){ if(/Query=\s+(\S+)/){ $query = $1; } if(/^>(\S+)/){ $reference = $1; } if (/Expect = (\S+)/){ my $raw=$1; $raw=~s/^e-/1e-/g; my $expect=sprintf("%e",$raw); if ($expect==0){ $expect=sprintf("%e","1e-$weight"); } $first=(split("e-",$expect))[0]; $second=(split("e-",$expect))[1]; print $tfh join("\t", $query, $reference, int($first), int($second)), "\n"; $count++; } } return $count; } =head2 _parse_mcl Title : _parse_mcl Usage : $self->_parse_mcl() Function: internal function for quickly parsing mcl output and generating the array of clusters Returns : Array Ref of clustered Ids Args : index file path, mcl output file path =cut sub _parse_mcl { my ($self,$ind,$mcl) = @_; open (MCL,$mcl) || $self->throw("Error, cannot open $mcl for parsing"); my $i =-1; my $start; my (@cluster,@out); while() { if ($start) { chomp($_); $cluster[$i] = join(" ",$cluster[$i],"$_"); } if(/^\d+/){ $start = 1; $i++; $cluster[$i] = join(" ",$cluster[$i] || '',"$_"); } if (/\$$/){ $start = 0; } last if /^\(mclruninfo/; } open (IND,$ind) || $self->throw("Cannot open $ind for parsing"); my %hash; while(){ /^(\S+)\s+(\S+)/; $hash{$1}=$2; } for (my $j=0;$j<$i+1;$j++) { my @array=split(" ",$cluster[$j]); for (my $p=1;$p<$#array;$p++){ push @{$out[$array[0]]}, $hash{$array[$p]}; } } return \@out; } 1; bioperl-run-release-1-7-3/lib/Bio/Tools/Run/Vista.pm000066400000000000000000000517631342734133000221570ustar00rootroot00000000000000# Please direct questions and support issues to # # Cared for by Shawn Hoon # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Vista Wrapper for Vista =head1 SYNOPSIS use Bio::Tools::Run::Vista; use Bio::Tools::Run::Alignment::Lagan; use Bio::AlignIO; my $sio = Bio::SeqIO->new(-file=>$ARGV[0],-format=>'genbank'); my @seq; my $reference = $sio->next_seq; push @seq, $reference; while(my $seq = $sio->next_seq){ push @seq,$seq; } my @features = grep{$_->primary_tag eq 'CDS'} $reference->get_SeqFeatures; my $lagan = Bio::Tools::Run::Alignment::Lagan->new; my $aln = $lagan->mlagan(\@seq,'(fugu (mouse human))'); my $vis = Bio::Tools::Run::Vista->new('outfile'=>"outfile.pdf", 'title' => "My Vista Plot", 'annotation'=>\@features, 'annotation_format'=>'GFF', 'min_perc_id'=>75, 'min_length'=>100, 'plotmin' => 50, 'tickdist' => 2000, 'window'=>40, 'numwindows'=>4, 'start'=>50, 'end'=>1500, 'tickdist'=>100, 'bases'=>1000, 'java_param'=>"-Xmx128m", 'num_pages'=>1, 'color'=> {'EXON'=>'100 0 0', 'CNS'=>'0 0 100'}, 'quiet'=>1); my $referenceid= 'human'; $vis->run($aln,$referenceid); #alternative one can choose pairwise alignments to plot #where the second id in each pair is the reference sequence $vis->run($aln,([mouse,human],[fugu,human],[mouse,fugu])); =head1 DESCRIPTION Pls see Vista documentation for plotfile options Wrapper for Vista : C. Mayor, M. Brudno, J. R. Schwartz, A. Poliakov, E. M. Rubin, K. A. Frazer, L. S. Pachter, I. Dubchak. VISTA: Visualizing global DNA sequence alignments of arbitrary length. Bioinformatics, 2000 Nov;16(11):1046-1047. Get it here: http://www-gsd.lbl.gov/vista/VISTAdownload2.html On the command line, it is assumed that this can be executed: java Vista plotfile Some of the code was adapted from MLAGAN toolkit M. Brudno, C.B. Do, G. Cooper, M.F. Kim, E. Davydov, NISC Sequencing Consortium, E.D. Green, A. Sidow and S. Batzoglou LAGAN and Multi-LAGAN: Efficient Tools for Large-Scale Multiple Alignment of Genomic DNA, Genome Research, in press get lagan here: http://lagan.stanford.edu/ =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Vista; use vars qw($AUTOLOAD @ISA %DEFAULT_VALUES %EPONINE_PARAMS @VISTA_PARAMS $EPOJAR $JAVA $PROGRAMDIR $PROGRAMNAME $PROGRAM %OK_FIELD); use strict; use Bio::Root::Root; use Bio::Seq; use Bio::Root::IO; use Bio::Tools::Run::WrapperBase; use File::Copy; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { $PROGRAMNAME = 'java'; if( ! defined $PROGRAMDIR ) { $PROGRAMDIR = $ENV{'JAVA_HOME'} || $ENV{'JAVA_DIR'}; } if (defined $PROGRAMDIR) { foreach my $progname ( [qw(java)],[qw(bin java)] ) { my $f = Bio::Root::IO->catfile($PROGRAMDIR, @$progname); if( -e $f && -x $f ) { $PROGRAM = $f; last; } } } %DEFAULT_VALUES= ('java' => 'java', 'min_perc_id' => 75, 'min_length' => 100, 'plotmin' => 50, 'bases' => 10000, 'tickdist' => 2000, 'resolution'=> 25, 'window' => 40, 'title' => 'VISTA PLOT', 'numwindows'=>4); @VISTA_PARAMS=qw(JAVA JAVA_PARAM OUTFILE MIN_PERC_ID QUIET VERBOSE ANNOTATION_FORMAT REGION_FILE REGION_FILE_DIR SCORE_FILE SCORE_FILE_DIR ALIGNMENT_FILE_DIR ALIGNMENT_FILE CONTIGS_FILE DIFFS PLOTFILE MIN_LENGTH PLOTMIN ANNOTATION BASES TICKDIST RESOLUTION TITLE PAPER WINDOW NUMWINDOWS START END NUM_PLOT_LINES LEGEND FILENAME NUM_PAGES AXIS_LABEL TICKS_FILE COLOR USE_ORDER GAPS SNPS_FILE REPEATS_FILE FILTER_REPEATS); foreach my $attr ( @VISTA_PARAMS) { $OK_FIELD{$attr}++; } } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $self->debug( "************ attr: $attr\n"); $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new Title : new Usage : my $vis = Bio::Tools::Run::Vista->new('outfile'=>$out, 'title' => "My Vista Plot", 'annotation'=>\@features, 'annotation_format'=>'GFF', 'min_perc_id'=>75, 'min_length'=>100, 'plotmin' => 50, 'tickdist' => 2000, 'window'=>40, 'numwindows'=>4, 'start'=>50, 'end'=>1500, 'tickdist'=>100, 'bases'=>1000, 'color'=> {'EXON'=>'100 0 0', 'CNS'=>'0 0 100'}, 'quiet'=>1); Function: Construtor for Vista wrapper Args : outfile - location of the pdf generated annotation - either a file or and array ref of Bio::SeqFeatureI indicating the exons regmin -region min =cut sub new { my ($caller, @args) = @_; # chained new my $self = $caller->SUPER::new(@args); # so that tempfiles are cleaned up foreach my $key(keys %DEFAULT_VALUES){ $self->$key($DEFAULT_VALUES{$key}); } while (@args) { my $attr = shift @args; my $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters $self->$attr($value); } return $self; } =head2 java Title : java Usage : $obj->java('/usr/opt/java130/bin/java'); Function: Get/set method for the location of java VM Args : File path (optional) =cut sub executable { shift->java(@_); } sub java { my ($self, $exe,$warn) = @_; if( defined $exe ) { $self->{'_pathtojava'} = $exe; } unless( defined $self->{'_pathtojava'} ) { if( $PROGRAM && -e $PROGRAM && -x $PROGRAM ) { $self->{'_pathtojava'} = $PROGRAM; } else { my $exe; if( ( $exe = $self->io->exists_exe($PROGRAMNAME) ) && -x $exe ) { $self->{'_pathtojava'} = $exe; } else { $self->warn("Cannot find executable for $PROGRAMNAME") if $warn; $self->{'_pathtojava'} = undef; } } } $self->{'_pathtojava'}; } =head2 run Title : run Usage : my @genes = $self->run($seq) Function: runs Vista Returns : A boolean 1 if no errors Args : Argument 1: Bio::Align::Align required Argument 2: a string or number, which is the sequence id of the reference sequence or the rank of the sequence in the alignment =cut sub run{ my ($self,$align,$ref) = @_; $ref ||=1; my $infile = $self->_setinput($align,$ref); return $self->_run_Vista($infile); } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: writes input sequence to file and return the file name Example : Returns : string Args : =cut sub _setinput { my ($self,$sim_aln,$ref) = @_; my($pairs,$files) = $self->_mf2bin($sim_aln,$ref); my $plotfile = $self->_make_plotfile($sim_aln,$pairs,$files); return $plotfile; } sub _parse_multi_fasta { my ($self,$file) = @_; my %seq; open(FASTA, $file) || $self->throw("Couldn't open $file"); my $last; my $count = 0; while (my $line = ) { chomp $line; next if $line=~/^$/; if (substr($line, 0, 1) eq ">") { $_ = substr($line, 1); /\w+/g; $seq{$&} = ""; $last = $&; } else { $seq{$last}.=$line; } print STDERR $count."\n"; $count++; } my @seq; foreach my $key(keys %seq){ my $seq = Bio::Seq->new(-id=>$key,-seq=>$seq{$key}); push @seq,$seq; } return @seq; } #adapted from mlagan utils mf2bin.pl sub _mf2bin { my ($self,$sim,$ref)= @_; my @seq; if(!ref $sim){ @seq = $self->_parse_multi_fasta($sim); } else { ($sim && $sim->isa("Bio::Align::AlignI")) || $self->throw("Expecting a Bio::Align::AlignI"); @seq = $sim->each_seq; } my $reference; my @files; my @pairs; if(ref($ref) eq 'ARRAY'){ my @ref; foreach my $set(@$ref){ my ($reference) = grep{$_->id eq $set->[1]}@seq; my ($other) = grep{$_->id eq $set->[0]}@seq; my ($pair,$file) = $self->_pack_bin($reference,$other); push @pairs, @$pair; push @files, @$file; push @ref,$set->[1]; } $self->_coordinate(\@ref); return \@pairs,\@files; } #figure out the reference sequence elsif($ref =~/^\d+$/){ #its a rank index $reference = $seq[$ref-1]; my $tmp = $ref; $ref = $reference->id; splice @seq,($tmp-1),1; } else { #its an id foreach my $i(0..$#seq){ if($seq[$i]->id =~/$ref/){ $reference = $seq[$i]; splice @seq,($i),1; last; } } } $self->_coordinate([$ref]); # pack bin # format from Alex Poliakov's glass2bin.pl script my %base_code = ('-' => 0, 'A' => 1, 'C' => 2, 'T' => 3, 'G' => 4, 'N' => 5, 'a' => 1, 'c' => 2, 't' => 3, 'g' => 4, 'n' => 5); my @ref= (split ('',$reference->seq)); foreach my $seq2(@seq){ my ($tfh1,$outfile) = $self->io->tempfile(-dir=>$self->tempdir); my @seq2= (split('', $seq2->seq)); foreach my $index(0..$#ref){ unless($ref[$index] eq '-' && $seq2[$index] eq '-'){ print $tfh1 pack("H2",$base_code{$ref[$index]}.$base_code{$seq2[$index]}); } } close ($tfh1); undef ($tfh1); push @files, $outfile; push @pairs,[$reference->id,$seq2->id]; } return \@pairs,\@files; } sub _pack_bin { my ($self,$first,$sec) = @_; my @first = (split('',$first->seq)); my @sec = (split('',$sec->seq)); # pack bin # format from Alex Poliakov's glass2bin.pl script my %base_code = ('-' => 0, 'A' => 1, 'C' => 2, 'T' => 3, 'G' => 4, 'N' => 5, 'a' => 1, 'c' => 2, 't' => 3, 'g' => 4, 'n' => 5); my @files; my @pairs; my ($tfh1,$outfile) = $self->io->tempfile(-dir=>$self->tempdir); foreach my $index(0..$#first){ unless($first[$index] eq '-' && $sec[$index] eq '-'){ print $tfh1 pack("H2",$base_code{$first[$index]}.$base_code{$sec[$index]}); } } close ($tfh1); undef ($tfh1); push @files, $outfile; push @pairs,[$first->id,$sec->id]; return \@pairs,\@files; } sub _make_plotfile { my ($self,$sim_aln,$pairs,$files) = @_; my ($tfh1,$plotfile) = $self->io->tempfile(-dir=>$self->tempdir); my @ids = map{$_->id}$sim_aln->each_seq; print $tfh1 "TITLE ".$self->title."\n\n"; print $tfh1 "OUTPUT ".$self->outfile."\n\n" ; print $tfh1 "SEQUENCES "; print $tfh1 join(" ",@ids)."\n\n"; foreach my $index(0..$#$pairs){ print $tfh1 "ALIGN ".$files->[$index]." BINARY\n"; print $tfh1 " SEQUENCES ".$pairs->[$index]->[0]." ".$pairs->[$index]->[1]."\n"; print $tfh1 " REGIONS ".$self->min_perc_id." ".$self->min_length."\n"; print $tfh1 " MIN ".$self->plotmin."\n"; print $tfh1 " DIFFS ". $self->diffs ."\n\n" if $self->diffs; if($self->region_file||$self->region_file_dir){ my $file = " REGION_FILE "; $file.=$self->region_file_dir."/" if $self->region_file_dir; $file.=$pairs->[$index]->[0]."_".$pairs->[$index]->[1].".region\n\n"; print $tfh1 $file; } if($self->score_file || $self->score_file_dir){ my $file = " SCORE_FILE "; $file.=$self->score_file_dir."/" if $self->score_file_dir; $file.=$pairs->[$index]->[0]."_".$pairs->[$index]->[1].".score\n\n"; print $tfh1 $file; } if($self->alignment_file || $self->alignment_file_dir){ my $file = " ALIGNMENT_FILE "; $file.=$self->alignment_file_dir."/" if $self->alignment_file_dir; $file.=$pairs->[$index]->[0]."_".$pairs->[$index]->[1].".alignment\n\n"; print $tfh1 $file; } print $tfh1 " CONTIGS_FILE ". $self->contigs_file ."\n\n" if $self->contigs_file; print $tfh1 " USE_ORDER ". $self->use_order."\n\n" if $self->use_order; print $tfh1 "END \n\n"; } my $annotation_file; if((ref $self->annotation eq 'ARRAY')&& $self->annotation->[0]->isa("Bio::SeqFeatureI")){ $annotation_file = $self->_dump2gff($self->annotation); $self->annotation_format('GFF'); } elsif($self->annotation){ $annotation_file = $self->annotation; } $annotation_file .= " GFF" if $self->annotation_format=~/GFF/i; print $tfh1 "GENES ".$annotation_file." \n\n" if $annotation_file; print $tfh1 "LEGEND on\n\n"; print $tfh1 "COORDINATE ".join(" ",@{$self->_coordinate})."\n\n"; print $tfh1 "PAPER letter\n\n"; print $tfh1 "BASES ".$self->bases."\n\n"; print $tfh1 "TICK_DIST ".$self->tickdist."\n\n"; print $tfh1 "RESOLUTION ".$self->resolution."\n\n"; print $tfh1 "WINDOW ".$self->window."\n\n"; print $tfh1 "NUM_WINDOWS ".$self->numwindows."\n\n"; print $tfh1 "AXIS_LABEL ".$self->axis_label ."\n\n" if $self->axis_label; print $tfh1 "TICKS_FILE ".$self->ticks_file ."\n\n" if $self->ticks_file; print $tfh1 "SNPS_FILE"." ".$self->snps_file."\n\n" if $self->snps_file; print $tfh1 "GAPS ".$self->gaps ."\n\n"if $self->gaps; print $tfh1 "REPEATS_FILE ".$self->repeats_file ."\n\n" if $self->repeats_file; print $tfh1 "FILTER_REPEATS ".$self->filter_repeats ."\n\n" if $self->filter_repeats; print $tfh1 "NUM_PAGES ".$self->num_pages ."\n\n" if $self->num_pages; print $tfh1 "START ".$self->start ."\n\n" if $self->start; print $tfh1 "END ".$self->end ."\n\n" if $self->end; my $color = $self->color; if(ref $color eq 'HASH'){ foreach my $region_type (keys %$color){ print $tfh1 "COLOR ".$region_type." ".$color->{$region_type}."\n\n"; } } close ($tfh1); undef $tfh1; if($self->plotfile) {#saving plotfile copy($plotfile,$self->plotfile); } else { $self->plotfile($plotfile); } return $self->plotfile; } sub _dump2gff { my ($self,$feat) = @_; my ($tfh1,$file) = $self->io->tempfile(-dir=>$self->tempdir); foreach my $f(@$feat){ print $tfh1 $f->gff_string."\n"; } close ($tfh1); undef $tfh1; return $file; } sub _run_Vista { my ($self,$infile) = @_; #run Vista $self->debug( "Running Vista\n"); my $java = $self->java; my $param = $self->java_param || ''; my $cmd = $java." ".$param.' Vista '; $cmd .= " -q " if $self->quiet || $self->verbose < 0; $cmd .= " -d " if $self->debug; $cmd .= $infile; $self->debug($cmd); my $status = system ($cmd); $self->throw("Problem running Vista: $? \n") if $status != 0; return 1; } sub _coordinate { my ($self,$val) = @_; if($val){ $self->{'_coordinate'} = $val; } return $self->{'_coordinate'}; } =head2 outfile Title : outfile Usage : $obj->outfile Function : Get/Set method outfile Args : =cut =head2 min_perc_id Title : min_perc_id Usage : $obj->min_perc_id Function : Get/Set method min_perc_id Args : =cut =head2 quiet Title : quiet Usage : $obj->quiet Function : Get/Set method quiet Args : =cut =head2 verbose Title : verbose Usage : $obj->verbose Function : Get/Set method verbose Args : =cut =head2 annotation_format Title : annotation_format Usage : $obj->annotation_format Function : Get/Set method annotation_format Args : =cut =head2 region_file Title : region_file Usage : $obj->region_file Function : Get/Set method region_file Args : =cut =head2 score_file Title : score_file Usage : $obj->score_file Function : Get/Set method score_file Args : =cut =head2 alignment_file Title : alignment_file Usage : $obj->alignment_file Function : Get/Set method alignment_file Args : =cut =head2 contigs_file Title : contigs_file Usage : $obj->contigs_file Function : Get/Set method contigs_file Args : =cut =head2 diffs Title : diffs Usage : $obj->diffs Function : Get/Set method diffs Args : =cut =head2 plotfile Title : plotfile Usage : $obj->plotfile Function : Get/Set method plotfile Args : =cut =head2 min_length Title : min_length Usage : $obj->min_length Function : Get/Set method min_length Args : =cut =head2 plotmin Title : plotmin Usage : $obj->plotmin Function : Get/Set method plotmin Args : =cut =head2 annotation Title : annotation Usage : $obj->annotation Function : Get/Set method annotation Args : =cut =head2 bases Title : bases Usage : $obj->bases Function : Get/Set method bases Args : =cut =head2 tickdist Title : tickdist Usage : $obj->tickdist Function : Get/Set method tickdist Args : =cut =head2 resolution Title : resolution Usage : $obj->resolution Function : Get/Set method resolution Args : =cut =head2 title Title : title Usage : $obj->title Function : Get/Set method title Args : =cut =head2 window Title : window Usage : $obj->window Function : Get/Set method window Args : =cut =head2 numwindows Title : numwindows Usage : $obj->numwindows Function : Get/Set method numwindows Args : =cut =head2 start Title : start Usage : $obj->start Function : Get/Set method start Args : =cut =head2 end Title : end Usage : $obj->end Function : Get/Set method end Args : =cut =head2 num_plot_lines Title : num_plot_lines Usage : $obj->num_plot_lines Function : Get/Set method num_plot_lines Args : =cut =head2 legend Title : legend Usage : $obj->legend Function : Get/Set method legend Args : =cut =head2 filename Title : filename Usage : $obj->filename Function : Get/Set method filename Args : =cut =head2 axis_label Title : axis_label Usage : $obj->axis_label Function : Get/Set method axis_label Args : =cut =head2 ticks_file Title : ticks_file Usage : $obj->ticks_file Function : Get/Set method ticks_file Args : =cut =head2 color Title : color Usage : $obj->color Function : Get/Set method color Args : =cut =head2 use_order Title : use_order Usage : $obj->use_order Function : Get/Set method use_order Args : =cut =head2 gaps Title : gaps Usage : $obj->gaps Function : Get/Set method gaps Args : =cut =head2 snps_file Title : snps_file Usage : $obj->snps_file Function : Get/Set method snps_file Args : =cut =head2 repeats_file Title : repeats_file Usage : $obj->repeats_file Function : Get/Set method repeats_file Args : =cut =head2 filter_repeats Title : filter_repeats Usage : $obj->filter_repeats Function : Get/Set method filter_repeats Args : =cut 1; __END__ bioperl-run-release-1-7-3/lib/Bio/Tools/Run/tRNAscanSE.pm000066400000000000000000000154351342734133000227660ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::tRNAscanSE # # Please direct questions and support issues to # # Cared for by Bioperl # # Copyright Bioperl, Mark Johnson # # Special thanks to Chris Fields, Sendu Bala # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::tRNAscanSE - Wrapper for local execution of tRNAscan-SE =head1 SYNOPSIS my $factory = Bio::Tools::Run::tRNAscanSE->new(-program => 'tRNAscan-SE'); # Pass the factory Bio::Seq objects, # returns a Bio::Tools::tRNAscanSE object my $factory = $factory->run($seq); or my $factory = $factory->run(@seq); =head1 DESCRIPTION Wrapper module for tRNAscan-SE. tRNAscan-SE is open source and available at L. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark Johnson Email: johnsonm-at-gmail-dot-com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::tRNAscanSE; use strict; use warnings; use Bio::SeqIO; use Bio::Root::Root; use Bio::Tools::Run::WrapperBase; use Bio::Tools::tRNAscanSE; use English; use IPC::Run; # Should be okay on WIN32 (See IPC::Run Docs) use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); our @params = (qw(program)); our @tRNAscanSE_switches = (qw(A B C G O P)); =head2 program_name Title : program_name Usage : $factory>program_name() Function: gets/sets the program name Returns: string Args : string =cut sub program_name { my ($self, $val) = @_; $self->program($val) if $val; return $self->program(); } =head2 program_dir Title : program_dir Usage : $factory->program_dir() Function: gets/sets the program dir Returns: string Args : string =cut sub program_dir { my ($self, $val) = @_; $self->{'_program_dir'} = $val if $val; return $self->{'_program_dir'}; } =head2 new Title : new Usage : $tRNAscanSE->new(@params) Function: creates a new tRNAscanSE factory Returns: Bio::Tools::Run::tRNAscanSE Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->io->_initialize_io(); $self->_set_from_args( \@args, -methods => [ @params, @tRNAscanSE_switches, ], -create => 1, ); unless (defined($self->program())) { $self->throw('Must specify program'); } return $self; } =head2 run Title : run Usage : $obj->run($seq_file) Function: Runs tRNAscan-SE Returns : A Bio::Tools::tRNAscanSE object Args : An array of Bio::PrimarySeqI objects =cut sub run{ my ($self, @seq) = @_; unless (@seq) { $self->throw("Must supply at least one Bio::PrimarySeqI"); } foreach my $seq (@seq) { unless ($seq->isa('Bio::PrimarySeqI')) { $self->throw("Object does not implement Bio::PrimarySeqI"); } } my $program_name = $self->program_name(); my $file_name = $self->_write_seq_file(@seq); return $self->_run($file_name); } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : An instance of Bio::Tools::tRNAscanSE Args : file name =cut sub _run { my ($self, $seq_file_name) = @_; my @cmd = ( $self->executable(), split(/\s+/, $self->_setparams()), $seq_file_name, ); my $cmd = join(' ', @cmd); $self->debug("tRNAscan-SE Command = $cmd"); my $program_name = $self->program_name(); my ($program_stderr); my ($output_fh, $output_file_name) = $self->io->tempfile(-dir=> $self->tempdir()); my @ipc_args = (\@cmd, \undef, '>', $output_file_name, '2>', \$program_stderr); # Run the program via IPC::Run so: # 1) The console doesn't get cluttered up with the program's STDERR/STDOUT # 2) We don't have to embed STDERR/STDOUT redirection in $cmd # 3) We don't have to deal with signal handling (IPC::Run should take care # of everything automagically. eval { IPC::Run::run(@ipc_args) || die $CHILD_ERROR;; }; if ($EVAL_ERROR) { $self->throw("tRNAscan-SE call crashed: $EVAL_ERROR"); } $self->debug(join("\n", 'tRNAscanSE STDERR:', $program_stderr)) if $program_stderr; return Bio::Tools::tRNAscanSE->new(-file => $output_file_name); } sub _setparams { my ($self) = @_; my $param_string = $self->SUPER::_setparams( -params => [ ], -switches => [ @tRNAscanSE_switches, ], -dash => 1 ); # Kill leading and trailing whitespace $param_string =~ s/^\s+//g; $param_string =~ s/\s+$//g; return $param_string; } =head2 _write_seq_file Title : _write_seq_file Usage : obj->_write_seq_file($seq) or obj->_write_seq_file(@seq) Function: Internal(not to be used directly) Returns : Name of a temp file containing program output Args : One or more Bio::PrimarySeqI objects =cut sub _write_seq_file { my ($self, @seq) = @_; my ($fh, $file_name) = $self->io->tempfile(-dir=>$self->tempdir()); my $out = Bio::SeqIO->new(-fh => $fh , '-format' => 'Fasta'); foreach my $seq (@seq){ $out->write_seq($seq); } close($fh); $out->close(); return $file_name; } 1; bioperl-run-release-1-7-3/packages/000077500000000000000000000000001342734133000171525ustar00rootroot00000000000000bioperl-run-release-1-7-3/packages/install-samtools.sh000077500000000000000000000003261342734133000230170ustar00rootroot00000000000000#!/bin/bash wget https://github.com/samtools/samtools/releases/download/1.3.1/samtools-1.3.1.tar.bz2 tar -xjvf samtools-1.3.1.tar.bz2 cd samtools-1.3.1 && make prefix=$HOME/local && make prefix=$HOME/local install bioperl-run-release-1-7-3/scripts/000077500000000000000000000000001342734133000170635ustar00rootroot00000000000000bioperl-run-release-1-7-3/scripts/bp_blast2tree.pl000066400000000000000000000110631342734133000221510ustar00rootroot00000000000000#!/usr/bin/perl # Author: Jason Stajich # Purpose: Blast Report -> MSA -> Tree # This needs lots more error checking, cmdline input of parameters # and support for other treebuilding -- only Phylip Neighbor for # protein data is supported # Also proper pulling in of the correct sequence data from the # alignment - multiple hits on different parts of a protein aren't # going to work properly right now. So this is mostly and example # starting point which needs a lot more work to be made robust. use strict; use warnings; use Bio::AlignIO; use Bio::Tools::Run::Alignment::Clustalw; use Bio::Tools::Run::Phylo::Phylip::ProtDist; use Bio::Tools::Run::Phylo::Phylip::Neighbor; use Bio::Tools::Run::Phylo::Molphy::ProtML; use Bio::Tools::Run::Phylo::Phylip::ProtPars; use Bio::SearchIO; use Bio::SimpleAlign; use Bio::PrimarySeq; use Bio::TreeIO; use Getopt::Long; my $IDLENGTH = 12; # we could in fact pull the tree out of the guide tree calculated # by Clustalw in the alignment, but I believe that is UPGMA # which would *NOT* be a good thing to be giving people. my $aln_factory = Bio::Tools::Run::Alignment::Clustalw->new ('ktuple' => 2, "quiet" => 1, 'matrix' => 'BLOSUM'); my ($report,$format,$tree_method,$cutoff,$keepall); $format = 'blast'; $tree_method = 'neighbor'; $cutoff = '0.01'; GetOptions( 'h|help' => sub { exec('perldoc', $0); exit(0); }, 'i|input:s' => \$report, 'f|format:s' => \$format, 'm|method:s' => \$tree_method, 'e|evalue:s' => \$cutoff, 'k|keepall:s' => \$keepall, # keep all HSPs not just best ); unless( $format =~ /blast|fasta|hmmer/i ) { die("Must request a valid search report format (fasta,blast,hmmer)"); } unless ( $tree_method =~ /nj|neighbor/i || $tree_method =~ /protml|ml/i ) { die("Must request a valid tree building method (neighbor,protml)"); } my (@alns,@seqs); my $in = new Bio::SearchIO(-file => $report, -format => $format); while( my $r = $in->next_result ) { # Let's build the simplest system first die("Cannot process report that does not contain protein sequence") unless ($r->algorithm =~ /HMMER|BLASTP|FASTP/i ); my @seqs; while( my $hit = $r->next_hit ) { next if $hit->significance > $cutoff; while( my $hsp = $hit->next_hsp ) { next if $hsp->evalue > $cutoff; my $seq = $hsp->get_aln->get_seq_by_pos(2)->seq(); push @seqs, new Bio::PrimarySeq(-seq => $seq, -id => $hsp->hit->seq_id, -desc => $r->algorithm . " best align to ". $hsp->query->seq_id ); last unless $keepall; } } push @alns, $aln_factory->align(\@seqs); } my $out = new Bio::AlignIO(-format => 'phylip', -interleaved => 1, -idlength => $IDLENGTH, -file => ">alignfile.phy"); $out->write_aln(@alns); $out = undef; # these need to be parameterized for cmdline arguments my @params = ('idlength'=>$IDLENGTH, 'model'=>'cat', 'gencode'=>'U', 'category'=>'H', 'probchange'=>'0.4', 'trans'=>'5', 'freq'=>'0.25,0.5,0.125,0.125'); my $dist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params); $dist_factory->quiet(1); @params = ('type'=>'NJ', 'outgroup'=>1, 'upptri'=>1, 'jumble'=>17); my $tree_factory = Bio::Tools::Run::Phylo::Phylip::Neighbor->new(@params); $tree_factory->quiet(1); my $count = 1; my $outtrees = new Bio::TreeIO(-file => ">trees.tre", -format => 'newick'); foreach my $aln ( @alns ) { # NOTE NOTE NOTE # This interface is probably going to change from create_tree to # next_tree per some discussions I'm having with Shawn - we may need # to tweak any scripts before you publish # also may move the create_distance_matrix method around some # and need to write in the switched support for Molphy's ProtML my $matrix = $dist_factory->create_distance_matrix($aln); my @seqnames = keys %$matrix; open my $MATRIX, '>', "Group$count.dist" or die "Could not write file 'Group$count.dist': $!\n"; printf $MATRIX "%4d\n",scalar @seqnames; for(my $i =0; $i< (scalar @seqnames - 1); $i++ ) { printf $MATRIX "%-12s ", $seqnames[$i]; for( my $j = $i+1; $j < scalar @seqnames; $j++ ) { print $MATRIX $matrix->{$seqnames[$i]}->{$seqnames[$j]}," "; } print $MATRIX "\n"; } close $MATRIX; my $tree = $tree_factory->create_tree("Group$count.dist"); $outtrees->write_tree($tree); $count++; } =head1 NAME tree_from_seqsearch - builds a phylogenetic tree based on a sequence search (FastA,BLAST,HMMER) =head1 DESCRIPTION This script requires that the bioperl-run pkg be also installed. =cut bioperl-run-release-1-7-3/scripts/multi_hmmsearch.PLS000066400000000000000000000067401342734133000226330ustar00rootroot00000000000000#!/usr/bin/perl # $Id: multi_hmmsearch.PLS,v 1.3 2006-07-04 22:23:36 mauricio Exp $ use strict; use warnings; =head1 NAME multi_hmmsearch - perform a hmmsearch into multiple FASTA files using an INDEX file =head1 SYNOPSIS multi_hmmsearch -p hmm_file [-i] -f index_file =head1 DESCRIPTION Not technically a Bio::Tools::Run script as this doesn't use any Bioperl or Bioperl-run components but it's useful. =head2 Mandatory Options: -p HMM profile to use for the search. -f INDEX file that contains a list of FASTA files for the multiple search. =head2 Special Options: -i Create a new index file with the resulting hmms files. This is useful if you want to pass this list as input arguments into another programs. -h Show this documentation. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Mauricio Herrera Cuadra =cut # Modules, pragmas and variables to use use Getopt::Long; use vars qw($opt_p $opt_i $opt_f $opt_h $index_file); # Gets options from the command line GetOptions qw(-p=s -i -f=s -h); # Print documentation if help switch was given exec('perldoc', $0) and exit() if $opt_h; # If no mandatory options are given prints an error and exits if (!$opt_p) { print "ERROR: No HMM profile has been specified.\n Use '-h' switch for documentation.\n" and exit(); } elsif (!$opt_f) { print "ERROR: No INDEX file has been specified.\n Use '-h' switch for documentation.\n" and exit(); } # Locates hmmsearch in the filesystem my $hmmsearch = `which hmmsearch`; chomp $hmmsearch; # Creates a directory for writing the resulting files mkdir("multi", 0755) unless -e "multi" and -d "multi"; # Creates a new INDEX file if the option was given if ($opt_i) { my $prefix = $opt_f; $prefix =~ s/\.INDEX$//; $index_file = "$prefix.hmms.INDEX"; open(HMMSINDEX, ">", $index_file) or die("Unable to create file: $index_file ($!)"); } # Opens the INDEX file sent as input open(FH, "<", $opt_f) or die("Unable to open INDEX file: $opt_f ($!)"); print "==> Opening INDEX file:\t\t\t\t$opt_f\n"; print "==> HMM profile file is:\t\t\t$opt_p\n"; # Cycle that extracts one line for every loop until finding the end of # file while (my $line = ) { # Deletes the new line characters from the line chomp $line; # Gets the name for the result file my $out = $line; $out =~ s/^split\///; $out =~ s/\.faa$//; # Performs the hmmsearch for the FASTA file in turn print "--> Performing hmmsearch in file:\t\t$line\n"; system("$hmmsearch $opt_p $line > multi/$out.hmms"); print "==> hmmsearch results stored in file:\t\tmulti/$out.hmms\n"; # Prints the result file name into the new INDEX file if the # option was given print HMMSINDEX "multi/$out.hmms\n" if $opt_i; } # Closes INDEX files close(FH); if ($opt_i) { print "==> New INDEX stored in file:\t\t\t$index_file\n"; close(HMMSINDEX); } # Exits the program exit(); bioperl-run-release-1-7-3/scripts/panalysis.PLS000066400000000000000000000562601342734133000214570ustar00rootroot00000000000000#!/usr/bin/perl # # A client showing how to use Bio::Tools::Run::Analysis module, # a module for executing and controlling local or remote analysis tools. # It also calls methods from Bio::Tools::Run::AnalysisFactory module. # # It has many options in order to cover as many methods as # possible. Because of that, it can be also used as a fully # functional command-line client for accessing various analysis # tools. # # Usage: ./panalysis.PLS -h # or: perldoc panalysis.PLS # # martin.senger@gmail.com # July 2002 # # $Id: panalysis.PLS,v 1.10 2006-07-04 22:23:36 mauricio Exp $ #----------------------------------------------------------------------------- use strict; use warnings; sub get_usage { return <<"END_OF_USAGE"; Usage: panalysis.PLS [options] [input-data] where 'options' are: -A access method (default 'soap') -l where are the analyses -n name of an analysis -j ID of a previously created job -L list all available analyses -c list all available categories -C show all analyses in given category -i, -I show specification of data inputs -o, -O show specification of results -a show specification of the analysis -d show analysis metadata (XML) -b create job from [input-data] (default: create a job also without -b option if there is no -j option and if there are some 'input-data' on the command-line) -x create job from [input-data] and run it -w create job from [input-data], run it and wait for it -x -j run a previously created job -w -j run a previously created job and wait for it -k -j kill a previously created job -s show job status -t show all job times -T show some job times (all, created, begun, finished, elapsed) -e show job last event (XML) -r retrieve all results -R retrieve named results; comma-separated list, each item: = =@[filename-template] =?[filename-template] where 'filename-template' can contain: * ... will be replaced by a unique number \$ANALYSIS ... will be replaced by an analysis name \$RESULT ... will be replaced by a result name any other characters (suitable for filenames) -z at the end remove job and all its results -h this help -v, -V show version(s) -q be less verbose where 'input-data' are: =... =@... Environment variables: HTTPPROXY HTTP proxy server HTTPTIMEOUT HTTP timeout (0 means no timeout at all) RESULT_FILENAME_TEMPLATE template for inventing filenames for results For more details type: perldoc panalysis.PLS END_OF_USAGE } BEGIN { # add path to the directory with this script my $mylib; ($mylib = $0) =~ s|/[^/]+$||; unshift @INC, $mylib; # be prepare for command-line options/arguments use Getopt::Std; # general options use vars qw/ $opt_h $opt_v $opt_V $opt_q /; # specialized options use vars qw/ $opt_A $opt_l $opt_n $opt_j /; # service use vars qw/ $opt_L $opt_c $opt_C /; # factory use vars qw/ $opt_d $opt_i $opt_I $opt_o $opt_O $opt_a /; # metadata use vars qw/ $opt_x $opt_w $opt_k $opt_s $opt_e $opt_t $opt_T $opt_b /; # job use vars qw/ $opt_r $opt_R /; # results use vars qw/ $opt_z /; # cleaning my $switches = 'ACjlnRT'; # switches taking an argument (a value) getopt ($switches); use vars qw($VERSION $Revision); # set the version for version checking $VERSION = do { my @r = (q[$Revision: 1.10 $] =~ /\d+/g); sprintf "%d.%-02d", @r }; $Revision = q[$Id: panalysis.PLS,v 1.10 2006-07-04 22:23:36 mauricio Exp $]; # help wanted? if ($opt_h) { print get_usage; exit 0; } # print version of this script and exit if ($opt_v) { print "$0 $VERSION\n"; exit 0; } } use Bio::Tools::Run::Analysis; # to access analysis tools directly use Bio::Tools::Run::AnalysisFactory; # to access list/factory of analysis tools # --- create a factory object; # the new() method understands the following parameters: # -location (taken from '-l' option if given) # -access (taken from '-A' option, default is 'soap') # # Additionally, it uses env. variable HTTPPROXY to create parameter # '-httpproxy', and env. variable HTTPTIMEOUT to set max HTTP timeout. # my @access = ('-access', $opt_A) if defined $opt_A; my @location = ('-location', $opt_l) if defined $opt_l; my @httpproxy = ('-httpproxy', $ENV{'HTTPPROXY'}) if defined $ENV{'HTTPPROXY'}; my @timeout = ('-timeout', $ENV{'HTTPTIMEOUT'}) if defined $ENV{'HTTPTIMEOUT'}; my $factory = new Bio::Tools::Run::AnalysisFactory (@location, @httpproxy, @timeout); # --- create an analysis (service) object; # the new() method understands the following parameters: # -location (taken from '-l' option if given) # -access (taken from '-A' option, default is 'soap') # -name (taken from '-n' option; mandatory!, no default value) # -destroy_on_exit (set to true if '-z' option given) # -httpproxy (taken from an env.variable) # -timeout (taken from an env.variable) # my @name = ('-name', $opt_n) if defined $opt_n; my @destroy = ('-destroy_on_exit', 0) unless $opt_z; my $service = new Bio::Tools::Run::Analysis (@name, @location, @httpproxy, @timeout, @destroy); die "Stopped. No success in accessing analysis factory.\n" unless $factory; die "Stopped. No success in accessing analysis tools.\n" unless $service; # --- print class and version of "real-workers" and exit if ($opt_V) { print ref $factory, " ", $factory->VERSION . "\n"; print ref $service, " ", $service->VERSION . "\n"; exit 0; } # # --- here are methods of the "directory service" (factory) # # what categories are available? if ($opt_c) { my $msg = "Available categories"; &msg ("$msg\n" . '-' x length ($msg) . "\n"); print join ("\n", sort @{ $factory->available_categories }), "\n"; } # what analyses are available? if ($opt_L) { my $msg = "Available analyses"; &msg ("$msg\n" . '-' x length ($msg) . "\n"); print join ("\n", sort @{ $factory->available_analyses }), "\n"; } # what analyses are available in a particular category? if ($opt_C) { my $msg = "Available analyses in category '$opt_C':"; &msg ("$msg\n" . '-' x length ($msg) . "\n"); print join ("\n", sort @{ $factory->available_analyses ($opt_C) }), "\n"; } # # --- here are methods describing one analysis # # print full analysis metadata in XML # ('$service->describe' returns an XML string) print $service->describe . "\n" if $opt_d; # print major characteristics of an analysis # ('$service->analysis_spec' returns a hash reference) if ($opt_a) { my $rh_spec = $service->analysis_spec; my $msg = "Specification of analysis"; &msg ("$msg\n" . '-' x length ($msg) . "\n"); my ($key, $value); print "Analysis '$opt_n':\n"; while (($key, $value) = each %{ $rh_spec }) { print "\t$key => $value\n"; } } # print input specification (either full, or just input data names) # ('$service->input_spec' returns a reference to an array of hashes) if ($opt_i or $opt_I) { my $ra_spec = $service->input_spec; my $msg = "Specification of inputs"; &msg ("$msg\n" . '-' x length ($msg) . "\n"); my ($key, $value); foreach (sort { $$a{'name'} cmp $$b{'name'} } @$ra_spec) { print $$_{'name'},"\n"; if ($opt_I) { while (($key, $value) = each %{ $_ }) { unless ($key eq 'name') { if (ref $value eq 'ARRAY') { # for 'allowed values' print "\t$key => " . join (", ", @$value) . "\n"; } else { print "\t$key => $value\n"; } } } } } } # print result specification (either full, or just names of results) # ('$service->result_spec' returns a reference to an array of hashes) if ($opt_o or $opt_O) { my $ra_spec = $service->result_spec; my $msg = "Specification of results"; &msg ("$msg\n" . '-' x length ($msg) . "\n"); my ($key, $value); foreach (sort { $$a{'name'} cmp $$b{'name'} } @$ra_spec) { print $$_{'name'},"\n"; if ($opt_O) { while (($key, $value) = each %{ $_ }) { print "\t$key => $value\n" unless ($key eq 'name'); } } } } # # --- let's create a job # my $job; if ($opt_j) { # ... either by re-creating a previous job $job = $service->create_job ($opt_j); if ($opt_x) { $job->run; } elsif ($opt_w) { $job->wait_for; } elsif ($opt_k) { $job->terminate; } } else { # ... or creating a new job using given input data if ($opt_x) { $job = $service->run (\@ARGV); } elsif ($opt_w) { $job = $service->wait_for (\@ARGV); } elsif ($opt_b or @ARGV > 0) { $job = $service->create_job (\@ARGV); } # often you need to know the JOB's ID to be able to come back # later and ask for results, status, events etc. - so I print it # here even in quiet mode (option -q) - but to STDERR in order not # to intervene with redirected real results print STDERR "JOB ID: " , $job->id . "\n" if $job; } # # --- having a job, ask it for something # if ($job) { print "JOB STATUS: " . $job->status . "\n" if $opt_s; print "LAST EVENT: " . $job->last_event . "\n" if $opt_e; # ...get job times (all of them in one go, formatted) if ($opt_t) { my $rh_times = $job->times (1); # '1' means 'formatted' print "TIMES:\n"; print "\tCreated: " . $$rh_times{'created'} . "\n" if $$rh_times{'created'}; print "\tStarted: " . $$rh_times{'started'} . "\n" if $$rh_times{'started'}; print "\tEnded: " . $$rh_times{'ended'} . "\n" if $$rh_times{'ended'}; print "\tElapsed: " . $$rh_times{'elapsed'} . "\n" if defined $$rh_times{'elapsed'}; } # ...get individual job characteristics (both formatted and raw) if ($opt_T) { print "CREATED: " . $job->created (1) . " (" . $job->created . ")\n" if $opt_T =~ /a|c/; print "STARTED: " . $job->started (1) . " (" . $job->started . ")\n" if $opt_T =~ /a|b/; print "ENDED: " . $job->ended (1) . " (" . $job->ended . ")\n" if $opt_T =~ /a|f/; print "ELAPSED: " . $job->elapsed . "\n" if $opt_T =~ /a|e/; } # retrieve results my $rh_results; if ($opt_R) { $rh_results = $job->results (split /\s*,\s*/, $opt_R); } elsif ($opt_r) { $rh_results = $job->results ('?'); } if ($rh_results) { foreach my $name (sort keys %$rh_results) { my $msg = "RESULT: $name"; &msg ("$msg\n" . '-' x length ($msg) . "\n"); if (ref $$rh_results{$name}) { # ... this is probably what you do not want (binary on terminal); # unless you wisely used: -R result_name=filename print join ("\n", @{ $$rh_results{$name} }) . "\n"; } else { print $$rh_results{$name} . "\n"; } } } } sub msg { print shift unless $opt_q; } __END__ =head1 NAME panalysis.PLS - An example/tutorial script how to access analysis tools =head1 SYNOPSIS # run an analysis with your sequence in a local file ./panalysis.PLS -n 'edit.seqret'-w -r \ sequence_direct_data=@/home/testdata/my.seq See more examples in the text below. =head1 DESCRIPTION A client showing how to use C module, a module for executing and controlling local or remote analysis tools. It also calls methods from the C module, a module providing lists of available analyses. Primarily, this client is meant as an example how to use analysis modules, and also to test them. However, because it has a lot of options in order to cover as many methods as possible, it can be also used as a fully functional command-line client for accessing various analysis tools. =head2 Defining location and access method C is independent on the access method to the remote analyses (the analyses running on a different machines). The method used to communicate with the analyses is defined by the C<-A> option, with the default value I. The other possible values (not yet supported, but coming soon) are I and I. Each access method may have different meaning for parameter C<-l> defining a location of services giving access to the analysis tools. For example, the I access expects a URL of a Web Service in the C<-l> option, while the I access may find here a stringified Interoperable Object Reference (IOR). A default location for the I access is C which represents services running at European Bioinformatics Institute on top of over hundred EMBOSS analyses (and on top of few others). =head2 Available analyses C can show a list of available analyses (from the given location using given access method). The C<-L> option shows all analyses, the C<-c> option lists all available categories (a category is a group of analyses with similar functionality or processing similar type of data), and finally the C<-C> option shows only analyses available within the given category. Note, that all these functions are provided by module C (respectively, by one of its access-dependent sub-classes). The module has also a I method C which is not used by this script. =head2 Service A C is a higher level of abstraction of an analysis tool. It understands a well defined interface (module C, a fact which allows this script to be independent on the access protocol to various services. The service name must be given by the C<-n> option. This option can be omitted only if you invoked just the C methods (described above). Each service (representing an analysis tool, a program, or an application) has its description, available by using options C<-a> (analysis name, type, etc.), C<-i>, C<-I> (specification of analysis input data, most important are their names), and C<-o>, C<-O> (result names and their types). The option C<-d> gives the most detailed description in the XML format. The service description is nice but the most important is to use the service for invoking an underlying analysis tool. For each invocation, the service creates a C and feeds it with input data. There are three stages: (a) create a job, (b) run the job, and (c) wait for its completion. Correspondingly. there are three options: the C<-b> which just creates (builds) a job, the C<-x> which creates a job and executes it, and finally C<-w> which creates a job, runs it and blocks the client until the job is finished. Always only one of these options is used (so it does not make sense to use more of them, the C priorities them in the order C<-x>, C<-w>, and C<-b>). All of these options take input data from the command-line (see next section about it) and all of them return (internally) an object representing a job. There are many methods (options) dealing with the job objects (see one after next section about them). Last note in this section: the C<-b> option is actually optional - a job is created even without this option when there are some input data found on the command-line. You I to use it, however, if you do not pass any data to an analysis tool (an example would be the famous C service). =head2 Input data Input data are given as name/value pairs, put on the command-line with equal sign between name and value. If the I part starts with an un-escaped character C<@>, it is used as a local file name and the C reads the file and uses its contents instead. Examples: panalysis.PLS -n edit.seqret -w -r sequence_direct_data='tatatctcccc' osformat=embl panalysis.PLS ... sequence_direct_data=@/my/data/my.seq The names of input data come from the C that can be shown by the C<-i> or C<-I> options. The input specification (when using option C<-I>) shows also - for some inputs - a list of allowed values. The specification, however, does not tell what input data are mutually exclusive, or what other constrains apply. If there is a conflict, an error message is produced later (before the job starts). Input data are used when any of the options C<-b>, C<-x>, or C<-w> is present, but option C<-j> is not present (see next section about this job option). =head2 Job Each service (defined by a name given in the C<-n> option) can be executed one or more times, with the same, but usually with different input data. Each execution creates a I. Actually, the job is created even before execution (remember that option C<-b> builds a job but does not execute it yet). Any job, executed or not, is persistent and can be used again later from another invocation of the C script. Unless you explicitly destroy the job using option C<-z>. A job created by options C<-b>, C<-x> and C<-w> (and by input data) can be accessed in the same C invocation using various job-related options, the most important are C<-r> and C<-R> for retrieving results from the finished job. However, you can also re-create a job created by a previous invocation. Assuming that you know the job ID (the C prints it always on the standard error when a new job is created), use option C<-j> to re-create the job. Example: ./panalysis.PLS -n 'edit.seqret' sequence_direct_data=@/home/testdata/my.seq It prints: JOB ID: edit.seqret/bb494b:ef55e47c99:-8000 Next invocation (asking to run the job, to wait for its completion and to show job status) can be: ./panalysis.PLS -n 'edit.seqret' -j edit.seqret/bb494b:ef55e47c99:-800 -w -s And again later another invocation can ask for results: ./panalysis.PLS -n 'edit.seqret' -j edit.seqret/bb494b:ef55e47c99:-800 -r Here is a list of all job options (except for results, they are in the next section): =over 4 =item Job execution and termination There are the same options C<-x> and C<-w> for executing a job and for executing it and waiting for its completion, as they were described above. But now, the options act on a job given by the C<-j> option, now they do not use any input data from the command-line (the input data had to be used when the job was created). Additionally, there is a C<-k> option to kill a running job. =item Job characteristics Other options tell about the job status (C<-s>, about the job execution times (C<-t> and C<-T>, and about the last available event what happened with the job (C<-e>). Note that the event notification is not yet fully implemented, so this option will change in the future to reflect more notification capabilities. =back =head2 Results Of course, the most important on the analysis tools are their results. The results are named (in the similar way as the input data) and they can be retrieved all in one go using option C<-r> (so you do not need to know their names actually), or by specifying (all or some) result names using the C<-R> option. If a result does not exist (either not yet, or the name is wrong) an undef value is returned (no error message produced). Some results are better to save directly into files instead to show them in the terminal window (this applies to the I results, mostly containing images). The C helps to deal with binary results by saving them automatically to local files (actually it is the module C and its submodules who do help with the binary data). So why not to use a traditional shell re-direction to a file? There are two reasons. First, a job can produce more than one result, so they would be mixed together. But mainly, because each result can consist of several parts whose number is not known in advance and which cannot be mixed together in one file. Again, this is typical for the binary data returning images - an invocation can produce many images. The C<-r> option retrieves all available results and treat them as described by the C<'?'> format below. The C<-R> option has a comma-separated list of result names, each of the names can be either a simple name (as specified by the C obtainable using the C<-o> or C<-O> options), or a equal-sign-separated name/format construct suggesting what to do with the result. The possibilities are: =over 4 =item result-name It prints given result on the standard output. =item result-name=filename It saves the given result into given file. =item result-name=@ It saves the given result into a file whose name is automatically invented, and it guarantees that the same name will not be used in the next invocation. =item result=name=@template It saves the given result into a file whose name is given by the C