Genome-Model-Tools-Music-0.04000755000765000024 012013522176 16043 5ustar00nnutterstaff000000000000Genome-Model-Tools-Music-0.04/Build.PL000444000765000024 416312013522176 17500 0ustar00nnutterstaff000000000000#!/usr/bin/perl # Use local perl, not some perl on an application server! use Config; use Module::Build; BEGIN { unshift @INC, "$ENV{PWD}/blib/lib"; } my $class = Module::Build->subclass ( class => 'Pod::Builder', code => <<'EOS', sub ACTION_clean { # FIXME: is this safe? use File::Path qw/rmtree/; rmtree "./_build"; rmtree "./blib"; rmtree "./cmd-bindoc"; unlink "./Build"; unlink "./MYMETA.yml"; } sub ACTION_cmd_docs { use File::Copy qw/copy/; $ENV{ANSI_COLORS_DISABLED} = 1; eval { local @INC = @INC; unshift @INC, 'blib/lib'; die $@ if $@; eval "use Genome::Model::Tools::Music"; die $@ if $@; foreach my $exec ('genome','gmt') { UR::Namespace::Command::Update::Doc->execute( class_name => 'Genome::Model::Tools', targets => [ 'Genome::Model::Tools::Music' ], executable_name => $exec, output_path => 'cmd-bindoc', output_format => 'pod', ); } }; die "failed to extract pod: $!: $@" if ($@); } sub ACTION_docs { my $self = shift; $self->depends_on('code'); $self->depends_on('cmd_docs'); $self->depends_on('manpages', 'html'); } sub man1page_name { my ($self, $file) = @_; $file =~ s/.pod$//; return $self->SUPER::man1page_name($file); } EOS ); my $build = $class->new( module_name => 'Genome::Model::Tools::Music', license => 'lgpl', dist_abstract => 'find mutations of significance in cancer', install_path => { 't' => $Config{vendorlib} . "/Genome/Model/Tools" }, build_requires => { 'Genome' => '0.06', 'Text::CSV_XS' => '', 'Regexp::Common' => '', 'Statistics::Distributions' => '', }, requires => { 'Genome' => '0.06', 'Text::CSV_XS' => '', 'Regexp::Common' => '', 'Statistics::Distributions' => '', }, bindoc_dirs => ['cmd-bindoc'], ); $build->add_build_element('R'); $build->create_build_script; Genome-Model-Tools-Music-0.04/Changes000444000765000024 374312013522176 17502 0ustar00nnutterstaff000000000000Revision history for Genome::Model::Tools::Music 0.04 2012-08-15 - Implemented generalized linear models for regression analyses against clinical data - Added tools to generate typical visualizations like Kaplan-Meier survival estimates, and mutation status matrices - Support for TCGA Mutation Annotation Format (MAF) version 2.3 - Performance improvements in mutation rate calculations, and more efficient memory usage - Added support for wiggle track format files describing coverage, if BAMs are unavailable 0.03 2012-04-29 [FEATURES] - All: Added support for updated MAF format v2.2 (Mar 28th, 2012) - All: Output header descriptions are friendlier, and comma-delimited outputs are now tab-delimited - All: Added optional skipping of noncoding and silent muts for most modules - ClinicalCorrelation: Added generalized linear model for phenotype-covariate relationships - ClinicalCorrelation: Optional use of a sample-vs-gene matrix as input, instead of a MAF - SMG: Uses the R multicore package to parallelize its tests - SMG: Provide per-gene BMR modifiers to adjust for varying mutation accumulation rates across the genome - CalcBmr: Use mutation-recurrence rates instead of mutation rates for SMG testing - CalcBmr: Implemented clustered-sample BMRs for subgroup-wise or sample-wise SMG tests - CalcBmr: Added an optional mutation category for truncation mutations - Pfam: Now supports NCBI Build 37 variant loci [BUG FIXES] - All: Modules no longer remove chr-prefixes from user-provided chromosome names - CalcBmr: Print a warning if the MAF contains a variant from a sample not in the bam-list - calcRoiCovg: Fixed a potential memory leak - CosmicOmim: Now treats chromosomes 23 and 24 in the COSMIC DB file as chromosomes X and Y, respectively - CosmicOmim: Nucleotide changes corrected for splice site variantsGenome-Model-Tools-Music-0.04/INSTALL000444000765000024 6712013522176 17174 0ustar00nnutterstaff000000000000perl Build.PL ./Build ./Build test sudo Build install Genome-Model-Tools-Music-0.04/LICENSE000444000765000024 1674312013522176 17240 0ustar00nnutterstaff000000000000 GNU LESSER GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. This version of the GNU Lesser General Public License incorporates the terms and conditions of version 3 of the GNU General Public License, supplemented by the additional permissions listed below. 0. Additional Definitions. As used herein, "this License" refers to version 3 of the GNU Lesser General Public License, and the "GNU GPL" refers to version 3 of the GNU General Public License. "The Library" refers to a covered work governed by this License, other than an Application or a Combined Work as defined below. An "Application" is any work that makes use of an interface provided by the Library, but which is not otherwise based on the Library. Defining a subclass of a class defined by the Library is deemed a mode of using an interface provided by the Library. A "Combined Work" is a work produced by combining or linking an Application with the Library. The particular version of the Library with which the Combined Work was made is also called the "Linked Version". The "Minimal Corresponding Source" for a Combined Work means the Corresponding Source for the Combined Work, excluding any source code for portions of the Combined Work that, considered in isolation, are based on the Application, and not on the Linked Version. The "Corresponding Application Code" for a Combined Work means the object code and/or source code for the Application, including any data and utility programs needed for reproducing the Combined Work from the Application, but excluding the System Libraries of the Combined Work. 1. Exception to Section 3 of the GNU GPL. You may convey a covered work under sections 3 and 4 of this License without being bound by section 3 of the GNU GPL. 2. Conveying Modified Versions. If you modify a copy of the Library, and, in your modifications, a facility refers to a function or data to be supplied by an Application that uses the facility (other than as an argument passed when the facility is invoked), then you may convey a copy of the modified version: a) under this License, provided that you make a good faith effort to ensure that, in the event an Application does not supply the function or data, the facility still operates, and performs whatever part of its purpose remains meaningful, or b) under the GNU GPL, with none of the additional permissions of this License applicable to that copy. 3. Object Code Incorporating Material from Library Header Files. The object code form of an Application may incorporate material from a header file that is part of the Library. You may convey such object code under terms of your choice, provided that, if the incorporated material is not limited to numerical parameters, data structure layouts and accessors, or small macros, inline functions and templates (ten or fewer lines in length), you do both of the following: a) Give prominent notice with each copy of the object code that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the object code with a copy of the GNU GPL and this license document. 4. Combined Works. You may convey a Combined Work under terms of your choice that, taken together, effectively do not restrict modification of the portions of the Library contained in the Combined Work and reverse engineering for debugging such modifications, if you also do each of the following: a) Give prominent notice with each copy of the Combined Work that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the Combined Work with a copy of the GNU GPL and this license document. c) For a Combined Work that displays copyright notices during execution, include the copyright notice for the Library among these notices, as well as a reference directing the user to the copies of the GNU GPL and this license document. d) Do one of the following: 0) Convey the Minimal Corresponding Source under the terms of this License, and the Corresponding Application Code in a form suitable for, and under terms that permit, the user to recombine or relink the Application with a modified version of the Linked Version to produce a modified Combined Work, in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source. 1) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (a) uses at run time a copy of the Library already present on the user's computer system, and (b) will operate properly with a modified version of the Library that is interface-compatible with the Linked Version. e) Provide Installation Information, but only if you would otherwise be required to provide such information under section 6 of the GNU GPL, and only to the extent that such information is necessary to install and execute a modified version of the Combined Work produced by recombining or relinking the Application with a modified version of the Linked Version. (If you use option 4d0, the Installation Information must accompany the Minimal Corresponding Source and Corresponding Application Code. If you use option 4d1, you must provide the Installation Information in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source.) 5. Combined Libraries. You may place library facilities that are a work based on the Library side by side in a single library together with other library facilities that are not Applications and are not covered by this License, and convey such a combined library under terms of your choice, if you do both of the following: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities, conveyed under the terms of this License. b) Give prominent notice with the combined library that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 6. Revised Versions of the GNU Lesser General Public License. The Free Software Foundation may publish revised and/or new versions of the GNU Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library as you received it specifies that a certain numbered version of the GNU Lesser General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that published version or of any later version published by the Free Software Foundation. If the Library as you received it does not specify a version number of the GNU Lesser General Public License, you may choose any version of the GNU Lesser General Public License ever published by the Free Software Foundation. If the Library as you received it specifies that a proxy can decide whether future versions of the GNU Lesser General Public License shall apply, that proxy's public statement of acceptance of any version is permanent authorization for you to choose that version for the Library. Genome-Model-Tools-Music-0.04/MANIFEST000444000765000024 252012013522176 17330 0ustar00nnutterstaff000000000000Build.PL Changes INSTALL lib/Genome/Model/Tools/Music.pm lib/Genome/Model/Tools/Music/Base.pm lib/Genome/Model/Tools/Music/Bmr.pm lib/Genome/Model/Tools/Music/Bmr/Base.pm lib/Genome/Model/Tools/Music/Bmr/CalcBmr.pm lib/Genome/Model/Tools/Music/Bmr/CalcCovg.pm lib/Genome/Model/Tools/Music/Bmr/CalcCovgHelper.pm lib/Genome/Model/Tools/Music/Bmr/CalcWigCovg.pm lib/Genome/Model/Tools/Music/ClinicalCorrelation.pm lib/Genome/Model/Tools/Music/ClinicalCorrelation.pm.R lib/Genome/Model/Tools/Music/CosmicOmim.pm lib/Genome/Model/Tools/Music/Galaxy.pm lib/Genome/Model/Tools/Music/MutationRelation.pm lib/Genome/Model/Tools/Music/MutationRelation.pm.R lib/Genome/Model/Tools/Music/PathScan.pm lib/Genome/Model/Tools/Music/PathScan/CombinePvals.pm lib/Genome/Model/Tools/Music/PathScan/PathScan.pm lib/Genome/Model/Tools/Music/PathScan/PopulationPathScan.pm lib/Genome/Model/Tools/Music/Pfam.pm lib/Genome/Model/Tools/Music/Play.pm lib/Genome/Model/Tools/Music/Plot.pm lib/Genome/Model/Tools/Music/Plot/MutationRelation.pm lib/Genome/Model/Tools/Music/Plot/MutationRelation.pm.R lib/Genome/Model/Tools/Music/Proximity.pm lib/Genome/Model/Tools/Music/Smg.pm lib/Genome/Model/Tools/Music/Smg.pm.R lib/Genome/Model/Tools/Music/Survival.pm lib/Genome/Model/Tools/Music/Survival.pm.R LICENSE MANIFEST This list of files MANIFEST.SKIP META.yml README t/Play.t META.json Genome-Model-Tools-Music-0.04/MANIFEST.SKIP000444000765000024 44412013522176 20060 0ustar00nnutterstaff000000000000^MYMETA.yml$ ^.git ^debian/ ^ubuntu-lucid/ ^alt/ ^dist-maint/ ^MANIFEST.bak$ ^_build/ ^Build$ \.tar\.gz$ ^blib ^genome-bindoc ^gmt-bindoc ^cmd-bindoc ^i ^.modulebuildrc ^gmt-web rebuild-debian-install galaxy.xml ^MYMETA\.json$ t/Music.t # floating point causes this to fail on other machines Genome-Model-Tools-Music-0.04/META.json000444000765000024 1103312013522176 17637 0ustar00nnutterstaff000000000000{ "abstract" : "find mutations of significance in cancer", "author" : [ "unknown" ], "dynamic_config" : 1, "generated_by" : "Module::Build version 0.3901, CPAN::Meta::Converter version 2.120630", "license" : [ "open_source" ], "meta-spec" : { "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", "version" : "2" }, "name" : "Genome-Model-Tools-Music", "prereqs" : { "build" : { "requires" : { "Genome" : "0.06", "Regexp::Common" : "0", "Statistics::Distributions" : "0", "Text::CSV_XS" : "0" } }, "configure" : { "requires" : { "Module::Build" : "0.39" } }, "runtime" : { "requires" : { "Genome" : "0.06", "Regexp::Common" : "0", "Statistics::Distributions" : "0", "Text::CSV_XS" : "0" } } }, "provides" : { "Genome::Model::Tools::Music" : { "file" : "lib/Genome/Model/Tools/Music.pm", "version" : "0.04" }, "Genome::Model::Tools::Music::Base" : { "file" : "lib/Genome/Model/Tools/Music/Base.pm", "version" : "0" }, "Genome::Model::Tools::Music::Bmr" : { "file" : "lib/Genome/Model/Tools/Music/Bmr.pm", "version" : "0" }, "Genome::Model::Tools::Music::Bmr::Base" : { "file" : "lib/Genome/Model/Tools/Music/Bmr/Base.pm", "version" : "0" }, "Genome::Model::Tools::Music::Bmr::CalcBmr" : { "file" : "lib/Genome/Model/Tools/Music/Bmr/CalcBmr.pm", "version" : "0" }, "Genome::Model::Tools::Music::Bmr::CalcCovg" : { "file" : "lib/Genome/Model/Tools/Music/Bmr/CalcCovg.pm", "version" : "0" }, "Genome::Model::Tools::Music::Bmr::CalcCovgHelper" : { "file" : "lib/Genome/Model/Tools/Music/Bmr/CalcCovgHelper.pm", "version" : "0" }, "Genome::Model::Tools::Music::Bmr::CalcWigCovg" : { "file" : "lib/Genome/Model/Tools/Music/Bmr/CalcWigCovg.pm", "version" : "0" }, "Genome::Model::Tools::Music::ClinicalCorrelation" : { "file" : "lib/Genome/Model/Tools/Music/ClinicalCorrelation.pm", "version" : "0" }, "Genome::Model::Tools::Music::CosmicOmim" : { "file" : "lib/Genome/Model/Tools/Music/CosmicOmim.pm", "version" : 0 }, "Genome::Model::Tools::Music::Galaxy" : { "file" : "lib/Genome/Model/Tools/Music/Galaxy.pm", "version" : "0" }, "Genome::Model::Tools::Music::MutationRelation" : { "file" : "lib/Genome/Model/Tools/Music/MutationRelation.pm", "version" : "0" }, "Genome::Model::Tools::Music::PathScan" : { "file" : "lib/Genome/Model/Tools/Music/PathScan.pm", "version" : "0" }, "Genome::Model::Tools::Music::PathScan::CombinePvals" : { "file" : "lib/Genome/Model/Tools/Music/PathScan/CombinePvals.pm", "version" : 0 }, "Genome::Model::Tools::Music::PathScan::PathScan" : { "file" : "lib/Genome/Model/Tools/Music/PathScan/PathScan.pm", "version" : 0 }, "Genome::Model::Tools::Music::PathScan::PopulationPathScan" : { "file" : "lib/Genome/Model/Tools/Music/PathScan/PopulationPathScan.pm", "version" : 0 }, "Genome::Model::Tools::Music::Pfam" : { "file" : "lib/Genome/Model/Tools/Music/Pfam.pm", "version" : "0" }, "Genome::Model::Tools::Music::Play" : { "file" : "lib/Genome/Model/Tools/Music/Play.pm", "version" : "0" }, "Genome::Model::Tools::Music::Plot" : { "file" : "lib/Genome/Model/Tools/Music/Plot.pm", "version" : "0" }, "Genome::Model::Tools::Music::Plot::MutationRelation" : { "file" : "lib/Genome/Model/Tools/Music/Plot/MutationRelation.pm", "version" : "0" }, "Genome::Model::Tools::Music::Proximity" : { "file" : "lib/Genome/Model/Tools/Music/Proximity.pm", "version" : "0" }, "Genome::Model::Tools::Music::Smg" : { "file" : "lib/Genome/Model/Tools/Music/Smg.pm", "version" : "0" }, "Genome::Model::Tools::Music::Survival" : { "file" : "lib/Genome/Model/Tools/Music/Survival.pm", "version" : "0" } }, "release_status" : "stable", "resources" : { "license" : [ "http://opensource.org/licenses/lgpl-license.php" ] }, "version" : "0.04" } Genome-Model-Tools-Music-0.04/META.yml000444000765000024 626112013522176 17456 0ustar00nnutterstaff000000000000--- abstract: 'find mutations of significance in cancer' author: - unknown build_requires: Genome: 0.06 Regexp::Common: 0 Statistics::Distributions: 0 Text::CSV_XS: 0 configure_requires: Module::Build: 0.39 dynamic_config: 1 generated_by: 'Module::Build version 0.3901, CPAN::Meta::Converter version 2.120630' license: open_source meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: 1.4 name: Genome-Model-Tools-Music provides: Genome::Model::Tools::Music: file: lib/Genome/Model/Tools/Music.pm version: 0.04 Genome::Model::Tools::Music::Base: file: lib/Genome/Model/Tools/Music/Base.pm version: 0 Genome::Model::Tools::Music::Bmr: file: lib/Genome/Model/Tools/Music/Bmr.pm version: 0 Genome::Model::Tools::Music::Bmr::Base: file: lib/Genome/Model/Tools/Music/Bmr/Base.pm version: 0 Genome::Model::Tools::Music::Bmr::CalcBmr: file: lib/Genome/Model/Tools/Music/Bmr/CalcBmr.pm version: 0 Genome::Model::Tools::Music::Bmr::CalcCovg: file: lib/Genome/Model/Tools/Music/Bmr/CalcCovg.pm version: 0 Genome::Model::Tools::Music::Bmr::CalcCovgHelper: file: lib/Genome/Model/Tools/Music/Bmr/CalcCovgHelper.pm version: 0 Genome::Model::Tools::Music::Bmr::CalcWigCovg: file: lib/Genome/Model/Tools/Music/Bmr/CalcWigCovg.pm version: 0 Genome::Model::Tools::Music::ClinicalCorrelation: file: lib/Genome/Model/Tools/Music/ClinicalCorrelation.pm version: 0 Genome::Model::Tools::Music::CosmicOmim: file: lib/Genome/Model/Tools/Music/CosmicOmim.pm version: 0 Genome::Model::Tools::Music::Galaxy: file: lib/Genome/Model/Tools/Music/Galaxy.pm version: 0 Genome::Model::Tools::Music::MutationRelation: file: lib/Genome/Model/Tools/Music/MutationRelation.pm version: 0 Genome::Model::Tools::Music::PathScan: file: lib/Genome/Model/Tools/Music/PathScan.pm version: 0 Genome::Model::Tools::Music::PathScan::CombinePvals: file: lib/Genome/Model/Tools/Music/PathScan/CombinePvals.pm version: 0 Genome::Model::Tools::Music::PathScan::PathScan: file: lib/Genome/Model/Tools/Music/PathScan/PathScan.pm version: 0 Genome::Model::Tools::Music::PathScan::PopulationPathScan: file: lib/Genome/Model/Tools/Music/PathScan/PopulationPathScan.pm version: 0 Genome::Model::Tools::Music::Pfam: file: lib/Genome/Model/Tools/Music/Pfam.pm version: 0 Genome::Model::Tools::Music::Play: file: lib/Genome/Model/Tools/Music/Play.pm version: 0 Genome::Model::Tools::Music::Plot: file: lib/Genome/Model/Tools/Music/Plot.pm version: 0 Genome::Model::Tools::Music::Plot::MutationRelation: file: lib/Genome/Model/Tools/Music/Plot/MutationRelation.pm version: 0 Genome::Model::Tools::Music::Proximity: file: lib/Genome/Model/Tools/Music/Proximity.pm version: 0 Genome::Model::Tools::Music::Smg: file: lib/Genome/Model/Tools/Music/Smg.pm version: 0 Genome::Model::Tools::Music::Survival: file: lib/Genome/Model/Tools/Music/Survival.pm version: 0 requires: Genome: 0.06 Regexp::Common: 0 Statistics::Distributions: 0 Text::CSV_XS: 0 resources: license: http://opensource.org/licenses/lgpl-license.php version: 0.04 Genome-Model-Tools-Music-0.04/README000444000765000024 263212013522176 17063 0ustar00nnutterstaff000000000000Genome::Music The README is used to introduce the module and provide instructions on how to install the module, any machine dependencies it may have (for example C compilers and installed libraries) and any other information that should be provided before the module is installed. A README file is required for CPAN modules since CPAN extracts the README file from a module distribution so that people browsing the archive can use it to get an idea of the module's uses. It is usually a good idea to provide version information here so that people can decide whether fixes for the module are worth downloading. INSTALLATION To install this module, run the following commands: perl Makefile.PL make make test make install SUPPORT AND DOCUMENTATION After installing, you can find documentation for this module with the perldoc command. perldoc Genome::Music You can also look for information at: RT, CPAN's request tracker http://rt.cpan.org/NoAuth/Bugs.html?Dist=Genome::Music AnnoCPAN, Annotated CPAN documentation http://annocpan.org/dist/Genome::Music CPAN Ratings http://cpanratings.perl.org/d/Genome::Music Search CPAN http://search.cpan.org/dist/Genome::Music/ COPYRIGHT AND LICENCE Copyright (C) 2010 The Genome Center at Washington University This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. Genome-Model-Tools-Music-0.04/lib000755000765000024 012013522176 16611 5ustar00nnutterstaff000000000000Genome-Model-Tools-Music-0.04/lib/Genome000755000765000024 012013522176 20023 5ustar00nnutterstaff000000000000Genome-Model-Tools-Music-0.04/lib/Genome/Model000755000765000024 012013522176 21063 5ustar00nnutterstaff000000000000Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools000755000765000024 012013522176 22163 5ustar00nnutterstaff000000000000Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music.pm000444000765000024 650612013522176 23745 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music; use strict; use warnings; use Genome; #bugfix version needs to be encoded as extra precision after the decimal point ie: .0401 our $VERSION = '0.04'; class Genome::Model::Tools::Music { is => ['Command::Tree'], doc => 'Mutational Significance in Cancer (Cancer Mutation Analysis)' }; sub _doc_manual_body { return < command runs all of the sub-commands serially on a selected input set. EOS } sub _doc_copyright_years { (2007,2011); } sub _doc_license { my $self = shift; my (@y) = $self->_doc_copyright_years; return <(1)', } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music000755000765000024 012013522176 23243 5ustar00nnutterstaff000000000000Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Base.pm000444000765000024 254512013522176 24616 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::Base; use strict; use warnings; use Genome; our $VERSION = $Genome::Model::Tools::Music::VERSION; class Genome::Model::Tools::Music::Base { is => ['Command::V2'], is_abstract => 1, attributes_have => [ file_format => { is => 'Text', is_optional => 1, } ], doc => "Mutational Significance In Cancer (Cancer Mutation Analysis)" }; sub _doc_copyright_years { (2010,2011); } sub _doc_license { my $self = shift; my (@y) = $self->_doc_copyright_years; return <(1), B(1) EOS } sub _doc_manual_body { my $help = shift->help_detail; $help =~ s/\n+$/\n/g; return $help; } sub help_detail { return "This tool is part of the MuSiC suite. See:\n" . join("\n",shift->_doc_see_also) . "\n"; } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Bmr.pm000444000765000024 222312013522176 24455 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::Bmr; use warnings; use strict; use Genome; our $VERSION = $Genome::Model::Tools::Music::VERSION; class Genome::Model::Tools::Music::Bmr { #is => ['Command::Tree','Genome::Model::Tools::Music::Base'], is => ['Command::Tree'], doc => "Calculate gene coverages and background mutation rates." }; sub _doc_copyright_years { (2010,2011); } sub _doc_license { my $self = shift; my (@y) = $self->_doc_copyright_years; return <(1), B(1), B(1), B(1) EOS } sub _doc_manual_body { return shift->help_detail; } sub help_detail { return "These tools are part of the MuSiC suite.\n"; } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/ClinicalCorrelation.pm000444000765000024 4525512013522176 27711 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::ClinicalCorrelation; use warnings; use strict; use Carp; use Genome; use IO::File; use POSIX qw( WIFEXITED ); our $VERSION = $Genome::Model::Tools::Music::VERSION; class Genome::Model::Tools::Music::ClinicalCorrelation { is => 'Genome::Model::Tools::Music::Base', has_input => [ bam_list => { is => 'Text', doc => "Tab delimited list of BAM files [sample_name, normal_bam, tumor_bam] (See Description)", }, maf_file => { is => 'Text', is_optional => 1, doc => "List of mutations using TCGA MAF specification v2.3", }, output_file => { is_output => 1, is => 'Text', doc => "Results of clinical-correlation tool. Will have suffix added for data type", }, clinical_correlation_matrix_file => { is => 'Text', is_optional => 1, doc => "Specify a file to store the sample-vs-gene matrix created during calculations", }, input_clinical_correlation_matrix_file => { is => 'Text', is_optional => 1, doc => "Instead of creating this from the MAF, input the sample-vs-gene matrix for calculations", }, genetic_data_type => { is => 'Text', is_optional => 1, default => "gene", doc => "Correlate clinical data to \"gene\" or \"variant\" level data", }, numeric_clinical_data_file => { is => 'Text', is_optional => 1, doc => "Table of samples (y) vs. numeric clinical data category (x)", }, numerical_data_test_method => { is => 'Text', is_optional => 1, default => 'cor', doc => "Either 'cor' for Pearson Correlation or 'wilcox' for the Wilcoxon Rank-Sum Test for numerical clinical data", }, categorical_clinical_data_file => { is => 'Text', is_optional => 1, doc => "Table of samples (y) vs. categorical clinical data category (x)", }, glm_model_file => { is => 'Text', is_optional => 1, doc => "File outlining the type of model, response variable, covariants, etc. for the GLM analysis. (See DESCRIPTION)", }, glm_clinical_data_file => { is => 'Text', is_optional => 1, doc => "Clinical traits, mutational profiles, other mixed clinical data (See DESCRIPTION)", }, use_maf_in_glm => { is => 'Boolean', is_optional => 1, default => 0, doc => "Create a variant matrix from the MAF file as variant input to GLM analysis.", }, skip_non_coding => { is => 'Boolean', is_optional => 1, default => 1, doc => "Skip non-coding mutations from the provided MAF file", }, skip_silent => { is => 'Boolean', is_optional => 1, default => 1, doc => "Skip silent mutations from the provided MAF file", }, ], doc => "Correlate phenotypic traits against mutated genes, or against individual variants", }; sub help_synopsis { return <bam_list; my $output_file = $self->output_file; my $genetic_data_type = $self->genetic_data_type; # check genetic data type unless( $genetic_data_type =~ /^gene|variant$/i ) { $self->error_message("Please enter either \"gene\" or \"variant\" for the --genetic-data-type parameter."); return; } # load clinical data and analysis types my %clinical_data; if( $self->numeric_clinical_data_file ) { $clinical_data{'numeric'} = $self->numeric_clinical_data_file; } if( $self->categorical_clinical_data_file ) { $clinical_data{'categ'} = $self->categorical_clinical_data_file; } if( $self->glm_clinical_data_file ) { $clinical_data{'glm'} = $self->glm_clinical_data_file; } my $glm_model = $self->glm_model_file; # declarations my @all_sample_names; # names of all the samples, no matter if it's mutated or not # parse out the sample names from the bam-list which should match the names in the MAF file my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!\n"; while( my $line = $sampleFh->getline ) { next if ( $line =~ m/^#/ ); chomp( $line ); my ( $sample ) = split( /\t/, $line ); push( @all_sample_names, $sample ); } $sampleFh->close; # loop through clinical data files for my $datatype ( keys %clinical_data ) { my $test_method; my $full_output_filename; if( $datatype =~ /numeric/i ) { $full_output_filename = $output_file . ".numeric.csv"; $test_method = $self->numerical_data_test_method; } if( $datatype =~ /categ/i ) { $full_output_filename = $output_file . ".categorical.csv"; $test_method = "fisher"; } if( $datatype =~ /glm/i ) { $full_output_filename = $output_file . ".glm.csv"; $test_method = "glm"; } #read through clinical data file to see which samples are represented and create input matrix for R my %samples; my $matrix_file; my $samples = \%samples; my $clin_fh = new IO::File $clinical_data{$datatype},"r"; unless( $clin_fh ) { die "failed to open $clinical_data{$datatype} for reading: $!"; } my $header = $clin_fh->getline; while( my $line = $clin_fh->getline ) { chomp $line; my ( $sample ) = split( /\t/, $line ); $samples{$sample}++; } #create correlation matrix unless it's glm analysis without using a maf file unless(( $datatype =~ /glm/i && !$self->use_maf_in_glm ) || $self->input_clinical_correlation_matrix_file ) { if( $genetic_data_type =~ /^gene$/i ) { $matrix_file = $self->create_sample_gene_matrix_gene( $samples, $clinical_data{$datatype}, @all_sample_names ); } elsif( $genetic_data_type =~ /^variant$/i ) { $matrix_file = $self->create_sample_gene_matrix_variant( $samples, $clinical_data{$datatype}, @all_sample_names ); } else { $self->error_message( "Please enter either \"gene\" or \"variant\" for the --genetic-data-type parameter." ); return; } } if( $self->input_clinical_correlation_matrix_file ) { $matrix_file = $self->input_clinical_correlation_matrix_file; } unless( defined $matrix_file ) { $matrix_file = "'*'"; } #set up R command my $R_cmd = "R --slave --args < " . __FILE__ . ".R $test_method "; if( $datatype =~ /glm/i ) { $R_cmd .= "$glm_model $clinical_data{$datatype} $matrix_file $full_output_filename"; } else { $R_cmd .= "$clinical_data{$datatype} $matrix_file $full_output_filename"; } #run R command print "R_cmd:\n$R_cmd\n"; WIFEXITED( system $R_cmd ) or croak "Couldn't run: $R_cmd ($?)"; } return( 1 ); } sub create_sample_gene_matrix_gene { my ( $self, $samples, $clinical_data_file, @all_sample_names ) = @_; my $output_matrix = $self->clinical_correlation_matrix_file; #create a hash of mutations from the MAF file my ( %mutations, %all_genes, @all_genes ); #parse the MAF file and fill up the mutation status hashes my $maf_fh = IO::File->new( $self->maf_file ) or die "Couldn't open MAF file!\n"; while( my $line = $maf_fh->getline ) { next if( $line =~ m/^(#|Hugo_Symbol)/ ); chomp $line; my @cols = split( /\t/, $line ); my ( $gene, $mutation_class, $sample ) = @cols[0,8,15]; #check that the mutation class is valid if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) { $self->error_message( "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\nPlease use TCGA MAF v2.3.\n" ); return; } #check if sample exists in clinical data unless( defined $samples->{$sample} ) { warn "Sample Name: $sample from MAF file does not exist in Clinical Data File"; next; } # If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region if(( $self->skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) || ( $self->skip_silent && $mutation_class =~ m/^Silent$/ )) { print "Skipping $mutation_class mutation in gene $gene.\n"; next; } $all_genes{$gene}++; $mutations{$sample}{$gene}++; } $maf_fh->close; #sort @all_genes for consistency @all_genes = sort keys %all_genes; #write the input matrix for R code to a file my $matrix_fh; unless (defined $output_matrix) { $output_matrix = Genome::Sys->create_temp_file_path(); } $matrix_fh = new IO::File $output_matrix,"w"; unless ($matrix_fh) { die "Failed to create matrix file $output_matrix!: $!"; } #print input matrix file header my $header = join("\t","Sample",@all_genes); $matrix_fh->print("$header\n"); #print mutation relation input matrix for my $sample (sort @all_sample_names) { $matrix_fh->print($sample); for my $gene (@all_genes) { if (exists $mutations{$sample}{$gene}) { $matrix_fh->print("\t$mutations{$sample}{$gene}"); } else { $matrix_fh->print("\t0"); } } $matrix_fh->print("\n"); } return $output_matrix; } sub create_sample_gene_matrix_variant { my ( $self, $samples, $clinical_data_file, @all_sample_names ) = @_; my $output_matrix = $self->clinical_correlation_matrix_file; #create hash of mutations from the MAF file my ( %variants_hash, %all_variants ); #parse the MAF file and fill up the mutation status hashes my $maf_fh = IO::File->new( $self->maf_file ) or die "Couldn't open MAF file!\n"; while( my $line = $maf_fh->getline ) { next if( $line =~ m/^(#|Hugo_Symbol)/ ); chomp $line; my @cols = split( /\t/, $line ); my ( $gene, $chr, $start, $stop, $mutation_class, $mutation_type, $ref, $var1, $var2, $sample ) = @cols[0,4..6,8..12,15]; #check that the mutation class is valid if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) { $self->error_message( "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\nPlease use TCGA MAF v2.3.\n" ); return; } unless( exists $samples->{$sample} ) { warn "Sample Name: $sample from MAF file does not exist in Clinical Data File"; next; } # If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region if(( $self->skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) || ( $self->skip_silent && $mutation_class =~ m/^Silent$/ )) { print "Skipping $mutation_class mutation in gene $gene.\n"; next; } my $var; my $variant_name; if( $ref eq $var1 ) { $var = $var2; $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var; $variants_hash{$sample}{$variant_name}++; $all_variants{$variant_name}++; } elsif( $ref eq $var2 ) { $var = $var1; $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var; $variants_hash{$sample}{$variant_name}++; $all_variants{$variant_name}++; } elsif( $ref ne $var1 && $ref ne $var2 ) { $var = $var1; $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var; $variants_hash{$sample}{$variant_name}++; $all_variants{$variant_name}++; $var = $var2; $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var; $variants_hash{$sample}{$variant_name}++; $all_variants{$variant_name}++; } } $maf_fh->close; #sort variants for consistency my @variant_names = sort keys %all_variants; #write the input matrix for R code to a file my $matrix_fh; unless( defined $output_matrix ) { $output_matrix = Genome::Sys->create_temp_file_path(); } $matrix_fh = new IO::File $output_matrix,"w"; unless( $matrix_fh ) { die "Failed to create matrix file $output_matrix!: $!"; } #print input matrix file header my $header = join( "\t", "Sample", @variant_names ); $matrix_fh->print("$header\n"); #print mutation relation input matrix for my $sample ( sort @all_sample_names ) { $matrix_fh->print( $sample ); for my $variant ( @variant_names ) { if( exists $variants_hash{$sample}{$variant} ) { $matrix_fh->print("\t$variants_hash{$sample}{$variant}"); } else { $matrix_fh->print("\t0"); } } $matrix_fh->print("\n"); } return $output_matrix; } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/ClinicalCorrelation.pm.R000444000765000024 2340212013522176 30077 0ustar00nnutterstaff000000000000#determine which test method to use method = as.character(commandArgs()[4]); if (method == "glm") { ### This program is for Generalized Linear Model (GLM) analysis ### Y = X + covar1 + covar2 ...... ### Usually (not limited), ### Y is clinical trait (quantitative or binary) ### X is variant/gene/mutation etc. ### covar means covariate ### by Qunyuan Zhang (qunyuan@wustl.edu), 02/16/2012 updated ### input options model.file = as.character(commandArgs()[5]); y.file = as.character(commandArgs()[6]); x.file = as.character(commandArgs()[7]); out.file = as.character(commandArgs()[8]); x.names="*"; ### to run it on command line, type # R --no-save < glm.R model.file y.file x.file * out.csv # or (if you want to define x variables) # R --no-save < glm.R model.csv y.file x.file x1|x2|x3 out.csv # or (if x.file has been merged into y.file) # R --no-save < glm.R model.csv y.file * x1|x2|x3 out.csv # or (if you have defined x variable in model.file) # R --no-save < glm.R model.csv y.file * * out.csv ### about model.file # column "type": Q=quantitative trait, B=binary trait # column "y": trait name # colum "x": variant/gene name; if x=NA or blank, it will determined by x.file and x.names # column "cvar": covariate(s) # tab delimited ### about y.file # trait data file, column 1 must be sample id # tab delimited ### about x.file # usually mutation/variant/geene data file # the first column must be sample id (the same as in y.file, ordered or not) # tab delimited # x.file="*" if x.file already merged into y.file ### about x.names # x.names="*" will use all column names in x.file as x variable names # or you can define it in the format x.names="gene1|gene2|gene3" # self-defined x.names have to be found in column names of y.file and/or x.file #################### myglm fuction ############## myglm=function(z,trait,variant,covar=NA,ytype) { if (nchar(covar)==0 | is.na(covar) | is.null(covar)) { model=formula(paste(trait,"~",variant)) } else { model=formula(paste(trait,"~",variant,"+",covar)) } if (ytype=="B") fit=glm(formula=model,data=z,family=binomial(link = "logit")) if (ytype=="Q") fit=glm(formula=model,data=z,family=gaussian(link = "identity")) fit } ################################################# ### data input ##### read.table(model.file,colClasses="character",na.strings = c("","NA"),sep="\t",header=T)->md read.table(y.file,na.strings = c("","NA"),sep="\t",header=T)->y if (x.names!="*") x.names=strsplit(x.names,split="[|]")[[1]] if (x.file!="*") { read.table(x.file,na.strings = c("","NA"),sep="\t",header=T)->x xid=colnames(x)[1] xs=colnames(x)[-1] if (x.names!="*") {x=x[,c(xid,x.names)];xs=colnames(x)[-1]} x.names=xs yid=colnames(y)[1] ysid = ! (colnames(y) %in% xs) y=y[,ysid] if (sum(ysid)==1) {y=data.frame(id=y);colnames(y)[1]=yid} #y=merge(y,x,by.x = xid, by.y = yid) y=merge(x,y,by.x = xid, by.y = yid) } ######### analysis ########## tt=NULL for (i in c(1:nrow(md))) { ytype=md[i,1];yi=md[i,2];xs=md[i,3];covi=md[i,4];memo=md[i,5] if (!is.na(xs) & nchar(xs)>0) xs=strsplit(xs,split="[|]")[[1]] if (is.na(xs)[1]|nchar(xs)[1]==0) xs=x.names if (length(covi)==0) covi=NA for (xi in xs) { print(yi); print(xi); print(covi); print("******") if (ytype=="Q") try(anova(myglm(y,yi,xi,covi,ytype),test="F"))->fit if (ytype=="B") try(anova(myglm(y,yi,xi,covi,ytype),test="Chisq"))->fit if (class(fit)[1]!="try-error") { fit=as.matrix(fit) if (xi %in% rownames(fit)) tt=rbind(tt, cbind(yi,ytype,xi,as.data.frame(t(fit[xi,])),covi,memo)) } } } #"yi","ytype","xi","Df","Deviance","Resid. Df","Resid. Dev","F","Pr(>F)","covi","memo" if (ytype=="Q") colnames(tt) = c("y","y_type","x","degrees_freedom","deviance","residual_degrees_freedom","residual_deviance","F_statistic","p-value","covariants","memo"); if (ytype=="B") colnames(tt) = c("y","y_type","x","degrees_freedom","deviance","residual_degrees_freedom","residual_deviance","p-value","covariants","memo"); write.table(tt,file=out.file,quote=F,sep="\t",row.names=F); } else { # else, we process numerical or categorical clinical data correlation clinical_data = as.character(commandArgs()[5]); mutation_matrix = as.character(commandArgs()[6]); output_file = as.character(commandArgs()[7]); # FUNCTION finds the correlation between two variables cor2=function(ty,tx,method) { id=intersect(!is.na(ty),!is.na(tx)); ty=ty[id]; tx=tx[id]; if(method=="cor") { tst=cor.test(tx,ty); s=tst$est; p=tst$p.value; } if(method=="wilcox") #x must be (0,1) mutation data { tst=wilcox.test(x=ty[tx==0],y=ty[tx>=1]) s=tst$stat p=tst$p.value } if(method=="chisq") { tst=chisq.test(tx,ty); s=tst$stat; p=tst$p.value; } if(method=="fisher") { tst=fisher.test(tx,ty) s=tst$p.value p=tst$p.value } if(method=="anova") { tst=summary(aov(ty~tx,as.data.frame(cbind(tx,ty)))) s=tst[[1]]$F[1] p=tst[[1]]$Pr[1] } tt=c(p,s); tt; } # END cor2 # FUNCTION runs correlation test on matrixes of data cor2test =function(y,x=NULL,method="cor",cutoff=1,sep="\t",outf=NULL) { if (!is.null(x)) { if (length(x)==1) {read.table(x,header=T,sep=sep)->x;} if (length(y)==1) {read.table(y,header=T,sep=sep)->y;} colnames(y)[1]="id"; colnames(x)[1]="id"; tt=character(0); for (vi in colnames(x)[-1]) { for (vj in colnames(y)[-1]) { tx=x[,c("id",vi)]; tx=tx[!is.na(tx[,vi]),]; tx=tx[!duplicated(tx[,"id"]),]; ty=y[,c("id",vj)]; ty=ty[!is.na(ty[,vj]),]; ty=ty[!duplicated(ty[,"id"]),]; xy=merge(tx,ty,by.x="id",by.y="id"); tx=xy[,2]; ty=xy[,3]; n=length(xy[,"id"]); rst=try(cor2(ty,tx,method)); if (class(rst)=="try-error") {p=NA;s=NA;} else {p=rst[1];s=rst[2];} t=c(vi,vj,method,n,s,p) tt=rbind(tt,t); } #end vj } #end vi rownames(tt)=NULL; colnames(tt)=c("x","y","method","n","s","p"); tt=as.data.frame(tt); tt[,"s"]=as.character(tt[,"s"]); tt[,"s"]=as.numeric(tt[,"s"]); tt[,"p"]=as.character(tt[,"p"]); tt[,"p"]=as.numeric(tt[,"p"]); fdr=p.adjust(tt[,"p"],method="fdr"); bon=p.adjust(tt[,"p"],method="bon"); tt=cbind(tt,fdr,bon); tt=tt[order(tt[,"p"]),]; } if (is.null(x)) { if (length(y)==1) {read.table(y,header=T,sep=sep)->y;} x=y; nxy=ncol(y)-1; colnames(y)[1]="id"; colnames(x)[1]="id" tt=character(0); for (i in c(1:(nxy-1))) { for (j in c((i+1):nxy)) { vi=colnames(x)[-1][i]; vj=colnames(y)[-1][j]; tx=x[,c("id",vi)]; tx=tx[!is.na(tx[,vi]),]; tx=tx[!duplicated(tx[,"id"]),] ty=y[,c("id",vj)]; ty=ty[!is.na(ty[,vj]),]; ty=ty[!duplicated(ty[,"id"]),]; xy=merge(tx,ty,by.x="id",by.y="id"); tx=xy[,2]; ty=xy[,3]; n=length(xy[,"id"]); rst=try(cor2(ty,tx,method)); if (class(rst)=="try-error") {p=NA;s=NA;} else {p=rst[1];s=rst[2];} t=c(vi,vj,method,n,s,p); tt=rbind(tt,t); } #end vj } #end vi rownames(tt)=NULL; colnames(tt)=c("x","y","method","n","s","p"); tt=as.data.frame(tt); tt[,"s"]=as.character(tt[,"s"]); tt[,"s"]=as.numeric(tt[,"s"]); tt[,"p"]=as.character(tt[,"p"]); tt[,"p"]=as.numeric(tt[,"p"]); fdr=p.adjust(tt[,"p"],method="fdr"); bon=p.adjust(tt[,"p"],method="bon"); tt=cbind(tt,fdr,bon); tt=tt[order(tt[,"p"]),]; } if (!is.null(outf)) { colnames(tt)=c("x","y","method","n","s","p","fdr","bon"); #The amount of precision that R prints with is somehow machine dependent (or the R version?) tt[,"s"] = sapply(tt[,"s"], sprintf, fmt="%.4E"); tt[,"p"] = sapply(tt[,"p"], sprintf, fmt="%.4E"); tt[,"fdr"] = sapply(tt[,"fdr"], sprintf, fmt="%.2E"); tt[,"bon"] = sapply(tt[,"bon"], sprintf, fmt="%.2E"); #The ordering should be done after reformatting the precision (duh) tt=tt[order(tt[,"x"]),]; tt=tt[order(tt[,"p"]),]; write.table(tt,file=outf,quote=FALSE,row.names=FALSE,sep=","); } invisible(tt); } #END cor2test #run correlation test using function cor2test(y = clinical_data, x = mutation_matrix, method = method, outf = output_file); } Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/CosmicOmim.pm000444000765000024 14431212013522176 26042 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::CosmicOmim; #__STANDARD PERL PACKAGES use warnings; use strict; use Genome; use FileHandle; use Text::CSV_XS; class Genome::Model::Tools::Music::CosmicOmim { is => 'Genome::Model::Tools::Music::Base', has_input => [ maf_file => { is => 'Path', file_format => 'maf', doc => 'list of annotated mutations in MAF format (or any file with MAF+annotation headers)', }, output_file => { is => 'Path', is_output => 1, doc => 'Output file contains the input file with two columns appended to the end, corresponding to cosmic and omim mutation comparisons, respectively', }, reference_build => { is => 'Text', default => 'Build37', doc => 'Put either "Build36" or "Build37"', } ], has_optional_input => [ omimaa_dir => { is => 'Path', default => Genome::Sys->dbpath( 'omim', 'latest' ), doc => 'omim amino acid mutation database folder', }, cosmic_dir => { is => 'Path', default => Genome::Sys->dbpath( 'cosmic', 'latest' ), doc => 'cosmic amino acid mutation database folder', }, verbose => { is => 'Boolean', default => 0, doc => 'Use this to display the larger working output', }, wu_annotation_headers => { is => 'Boolean', default => 0, doc => "Use this if input MAF contains WUSTL annotation format headers", }, aa_range => { is => 'Integer', default => 2, doc => "Set how close a 'near' match is when searching for amino acid near hits", }, nuc_range => { is => 'Integer', default => 5, doc => "Set how close a 'near' match is when searching for nucleotide position near hits", }, show_known_hits => { is => 'Boolean', default => 1, doc => "When a finding is novel, show known AA in that gene", }, ], doc => 'Compare the amino acid changes of supplied mutations to COSMIC and OMIM databases.' }; sub help_synopsis { return <maf_file; my $cosmic_dir = $self->cosmic_dir; my $basename = $self->output_file; my $ref_build = $self->reference_build; unless($ref_build =~ m/build36/i xor $ref_build =~ m/build37/i){ $self->error_message("You must either specify reference_build as either \"Build36\" or \"Build37\""); die $self->error_message; } my $omimaa_dir = $self->omimaa_dir; my $omimaa; if (-d $omimaa_dir){ $omimaa = "$omimaa_dir/omim_aa.csv"; unless (-e $omimaa) { $omimaa = "$omimaa_dir/OMIM_aa_will.csv"; } } my $cosmic_database_file = 'Cosmic_Database.tsv'; my $cosmic_database; if (-d $cosmic_dir){ $cosmic_database = "$cosmic_dir/$cosmic_database_file"; } my $verbose = $self->verbose; my $wuheaders = $self->wu_annotation_headers; $self->status_message("Using $omimaa as omima db file"); $self->status_message("Using $cosmic_dir as cosmic db folder"); my $aa_range = $self->aa_range; my $nuc_range = $self->nuc_range; ##################### # MAIN PROCESSING # ##################### #Set Stats hash that counts everything my %stats; $stats{'OMIMDB'} = 0; $stats{'OMIM'}{'doublematch'} = 0; $stats{'OMIM'}{'ntmatch'} = 0; $stats{'OMIM'}{'aamatch'} = 0; $stats{'OMIM'}{'posmatch'} = 0; $stats{'OMIM'}{'ntposmatch'} = 0; $stats{'OMIM'}{'aaposmatch'} = 0; $stats{'OMIM'}{'nearmatch'} = 0; $stats{'OMIM'}{'ntnearmatch'} = 0; $stats{'OMIM'}{'aanearmatch'} = 0; $stats{'OMIM'}{'novel'} = 0; $stats{'OMIM'}{'ntnovel'} = 0; $stats{'OMIM'}{'aanovel'} = 0; $stats{'OMIM'}{'silent'} = 0; $stats{'OMIM'}{'nomatch'} = 0; $stats{'COSMICDB'} = 0; $stats{'COSMIC'}{'doublematch'} = 0; $stats{'COSMIC'}{'ntmatch'} = 0; $stats{'COSMIC'}{'aamatch'} = 0; $stats{'COSMIC'}{'posmatch'} = 0; $stats{'COSMIC'}{'ntposmatch'} = 0; $stats{'COSMIC'}{'aaposmatch'} = 0; $stats{'COSMIC'}{'nearmatch'} = 0; $stats{'COSMIC'}{'ntnearmatch'} = 0; $stats{'COSMIC'}{'aanearmatch'} = 0; $stats{'COSMIC'}{'novel'} = 0; $stats{'COSMIC'}{'ntnovel'} = 0; $stats{'COSMIC'}{'aanovel'} = 0; $stats{'COSMIC'}{'silent'} = 0; $stats{'COSMIC'}{'nomatch'} = 0; #__PARSE MUTATION DATA my $fh = new FileHandle; unless ($fh->open (qq{$mut_file})) { die "Could not open mutation project file '$mut_file' for reading"; } if ($verbose) {print "Parsing mutation file...\n";} my $mutation = ParseMutationFile($fh, $mut_file, $wuheaders); $fh->close; if ($verbose) {print "Done Parsing Mutation File! Yippee!\n";} #__READ IN OMIM FILE my %omimaa; if (defined($omimaa) && -e $omimaa) { open(OMIMAA,$omimaa) || die "Could not open omim file '$omimaa'"; if ($verbose) {print "Loading OMIM Database\n";} my $omimaa_header = ; while() { $stats{'OMIMDB'}++; chomp; my ($gene, $omim_entry, $position, $aa_ori, $aa_mut, $description, $diseases) = split("\t"); $omimaa{$gene}{$omim_entry}{$position}{residue1} = $aa_ori; $omimaa{$gene}{$omim_entry}{$position}{residue2} = $aa_mut; $omimaa{$gene}{$omim_entry}{$position}{description} = $description; } if ($verbose) {print "Finished Loading OMIM Database\n";} close(OMIMAA); } #__READ IN COSMIC FILE my %cosmic_gene; my %cosmic_position; my %cosmic_position_only; my %cosmic_tissue; my %cosmic_histology; my %aa_count; my %residue_match; if (defined($cosmic_database) && -e $cosmic_database) { my ($gene_col, $chr_col, $start_col, $stop_col, $chr_col_37, $start_col_37, $stop_col_37, $amino_col, $nucleo_col, $somatic_col, $primary_tissue_col, $tissue_sub_1_col, $tissue_sub_2_col, $histology_col, $histology_sub_1_col, $histology_sub_2_col, $gene, $chr, $start, $stop, $chr_37, $start_37, $stop_37, $amino, $nucleo, $somatic, $primary_tissue, $tissue_sub_1, $tissue_sub_2, $histology, $histology_sub_1, $histology_sub_2); open(COSMIC,$cosmic_database) || die "Could not open omim file '$cosmic_database'"; if ($verbose) {print "Loading COSMIC Database\n";} my $cosmic_header = ; chomp($cosmic_header); my @parser = split(/\t/, $cosmic_header); my $parsecount = 0; my %parsehash; foreach my $item (@parser) { $parsehash{$item} = $parsecount; $parsecount++; } $gene_col = $parsehash{'Gene'}; $chr_col = $parsehash{'Chromosome'}; $start_col = $parsehash{'Genome Start'}; $stop_col = $parsehash{'Genome Stop'}; $chr_col_37 = $parsehash{'Chromosome Build37'}; $start_col_37 = $parsehash{'Genome Start Build37'}; $stop_col_37 = $parsehash{'Genome Stop Build37'}; $amino_col = $parsehash{'Amino Acid'}; $nucleo_col = $parsehash{'Nucleotide'}; $somatic_col = $parsehash{'Somatic Status'}; $primary_tissue_col = $parsehash{'Primary_Tissue'}; $tissue_sub_1_col = $parsehash{'Tissue_subtype_1'}; $tissue_sub_2_col = $parsehash{'Tissue_subtype_2'}; $histology_col = $parsehash{'Histology'}; $histology_sub_1_col = $parsehash{'Histology_subtype_1'}; $histology_sub_2_col = $parsehash{'Histology_subtype_2'}; while(my $line = ) { $stats{'COSMICDB'}++; chomp($line); my @parser = split(/\t/, $line); $gene = $parser[$gene_col]; if ($ref_build =~ m/build36/i) { $chr = $parser[$chr_col]; $start = $parser[$start_col]; $stop = $parser[$stop_col]; } else { $chr = $parser[$chr_col_37]; $start = $parser[$start_col_37]; $stop = $parser[$stop_col_37]; } $chr =~ s/NotListed/ /; $start =~ s/NotListed/ /; $stop =~ s/NotListed/ /; $chr =~ s/23/X/; $chr =~ s/24/Y/; $amino = $parser[$amino_col]; $nucleo = $parser[$nucleo_col]; $somatic = $parser[$somatic_col]; $primary_tissue = $parser[$primary_tissue_col]; $tissue_sub_1 = $parser[$tissue_sub_1_col]; $tissue_sub_2 = $parser[$tissue_sub_2_col]; $histology = $parser[$histology_col]; $histology_sub_1 = $parser[$histology_sub_1_col]; $histology_sub_2 = $parser[$histology_sub_2_col]; my ($residue1, $res_start, $residue2, $res_stop, $new_residue); if ($amino eq 'E597A') { $amino = 'p.E597A'; } if ($amino =~ m/^p/) { unless ($amino =~ m/^p\./) { $amino =~ s/^p/p\./; } } if ($amino =~ m/^P\./) { $amino =~ s/^P\./p\./; } $amino =~ s/\s+$//; $amino =~ s/^\s+//; ($residue1, $res_start, $residue2, $res_stop, $new_residue) = AA_Check($amino); if (defined $res_start){ if ($res_start == $res_stop){ $residue_match{$gene}{$res_start}{$res_stop}{$residue1}{$residue2}++; my $addition = $residue1.$res_start.$residue2; $aa_count{$gene}{$addition}++; } else { my $addition = $residue1.$res_start."-".$res_stop.$residue2; $aa_count{$gene}{$addition}++; } } else { if ($amino =~ m/^p\.\?/ || $amino =~ m/^p\.\>/ || $amino eq 'p.INS' || $amino eq 'p.DEL' || $amino eq 'p.fs*?' || $amino eq 'p.fs') { my $addition = $amino; $addition =~ s/p\.//; $aa_count{$gene}{$addition}++; } else { print "$residue1, $res_start, $residue2, $res_stop, $new_residue\n"; die "$amino not found\n"; } } if (defined($nucleo)) { $cosmic_gene{$gene}{$amino}{$nucleo}++; } else { $cosmic_gene{$gene}{$amino}++; } if (defined($chr) && defined($start) && defined($stop) && $chr ne ' ' && $start ne ' ' && $stop ne ' ') { $cosmic_position{$chr}{$start}{$stop}{$gene}++; if (defined($nucleo)) { $cosmic_position_only{$chr}{$start}{$stop}{$nucleo}++; } else { $cosmic_position_only{$chr}{$start}{$stop}++; } } $cosmic_tissue{$gene}{$amino} = "$primary_tissue\t$tissue_sub_1\t$tissue_sub_2"; $cosmic_histology{$gene}{$amino} = "$histology\t$histology_sub_1\t$histology_sub_2"; } close(COSMIC); if ($verbose) {print "Finished Loading COSMIC Database! Hooray!\n";} } my %cosmic_results; my %omim_results; my $summary_file = $basename; unless (open(SUMMARY,">$summary_file")) { die "Could not open output file '$summary_file' for writing"; } unless ($fh->open (qq{$mut_file})) { die "Could not open mutation project file '$mut_file' for reading"; } my %fileline; my $i = 1; while (my $filehandleline = <$fh>) { chomp $filehandleline; while ($filehandleline =~ /^#/) { print SUMMARY "$filehandleline\n"; $filehandleline = <$fh>; chomp $filehandleline; } $fileline{$i} = $filehandleline; $i++; } $fh->close; print SUMMARY "$fileline{'1'}\tInput_MAF_Line_Number\tCosmic_Results\tOMIM_Results\n"; if ($verbose) {print "Starting COSMIC/OMIM to Mutation File Comparisons\n";} foreach my $hugo (sort keys %{$mutation}) { foreach my $sample (keys %{$mutation->{$hugo}}) { foreach my $line_num (keys %{$mutation->{$hugo}->{$sample}}) { if ($verbose) {print ".";} #report that we are starting a sample #read in the alleles. The keys may change with future file formats. If so, a new version should be added to # my ($entrez_gene_id, $line, $aa_change,$transcript,$mstatus,$Variant_Type,$Chromosome,$Start_position,$End_position,$Reference_Allele,$Tumor_Seq_Allele1,$gene) = my ($entrez_gene_id, $line, $aa_change,$transcript,$Variant_Type,$Chromosome,$Start_position,$End_position,$Reference_Allele,$Tumor_Seq_Allele1,$Tumor_Seq_Allele2,$gene) = ( $mutation->{$hugo}->{$sample}->{$line_num}->{ENTREZ_GENE_ID}, $mutation->{$hugo}->{$sample}->{$line_num}->{file_line}, $mutation->{$hugo}->{$sample}->{$line_num}->{AA_CHANGE}, $mutation->{$hugo}->{$sample}->{$line_num}->{TRANSCRIPT}, # $mutation->{$hugo}->{$sample}->{$line_num}->{MUTATION_STATUS}, $mutation->{$hugo}->{$sample}->{$line_num}->{VARIANT_TYPE}, $mutation->{$hugo}->{$sample}->{$line_num}->{CHROMOSOME}, $mutation->{$hugo}->{$sample}->{$line_num}->{START_POSITION}, $mutation->{$hugo}->{$sample}->{$line_num}->{END_POSITION}, $mutation->{$hugo}->{$sample}->{$line_num}->{REFERENCE_ALLELE}, $mutation->{$hugo}->{$sample}->{$line_num}->{TUMOR_SEQ_ALLELE1}, $mutation->{$hugo}->{$sample}->{$line_num}->{TUMOR_SEQ_ALLELE2}, $mutation->{$hugo}->{$sample}->{$line_num}->{HUGO_SYMBOL}, ); # if ($mstatus){ #Annotate the allele's effect on all known (ie transcript without the 'unknown' status) transcripts my $proper_allele; if ($Reference_Allele ne $Tumor_Seq_Allele1) { $proper_allele = $Tumor_Seq_Allele1; } elsif($Reference_Allele ne $Tumor_Seq_Allele2) { $proper_allele = $Tumor_Seq_Allele2; } else { warn "line $line_num has both Tumor_Seq_Allele1 ($Tumor_Seq_Allele1) and Tumor_Seq_Allele2 ($Tumor_Seq_Allele2) as the Reference ($Reference_Allele)\n"; $proper_allele = $Tumor_Seq_Allele2; } #LOOK FOR ONLY SINGLE CHARACTER PROPER ALLELE TYPES - A, C, T, G, 0, or - unless($Reference_Allele ne $proper_allele) { die "Ref allele: $Reference_Allele same as mutation allele: $proper_allele ('line num' $line_num)"; } unless($Reference_Allele =~ /[ACTG0\-]/ && $proper_allele =~ /[ACTG0\-]/) { die "Read in improper alleles from mutation file ref: $Reference_Allele var: $proper_allele ('line num' $line_num)"; } ## SHOULD NEVER BE '--' BECAUSE THE CODE WILL THINK THIS IS A DINUCLEOTIDE POLYMORPHISM INSTEAD OF AN INDEL, SO CHANGE THESE $Reference_Allele = '-' if $Reference_Allele eq '--'; $proper_allele = '-' if $proper_allele eq '--'; chomp($line_num); my %results_hash; #parse the amino acid string my ($residue1, $res_start, $residue2, $res_stop, $new_residue) = AA_Check($aa_change); if(defined($Start_position) && defined($End_position) && (!$residue2 || $residue2 eq ' ')){ if ($verbose) {print "Skipping Non-coding (Silent) Mutation";} my $find_type = $self->CheckPositionMatch($Start_position, $End_position, $Reference_Allele, $proper_allele, \%cosmic_position_only, \%cosmic_position); my $createspreadsheet; if ($find_type =~ m/no_match/) { $createspreadsheet = "$fileline{$line_num}\t$line_num\tSkipped - Silent Mutation\tSkipped - Silent Mutation"; $stats{'COSMIC'}{'silent'}++; $stats{'OMIM'}{'silent'}++; } elsif ($find_type =~ m/position/) { my ($findtype,$cosmic_genes,$chr,$gen_start,$gen_stop) = split(":",$find_type); $results_hash{NT}{POSITION}{COSMIC}{$transcript}=": Nucleotide -> Cosmic Gene(s):$cosmic_genes Position:$chr:$gen_start-$gen_stop"; my $matchtype_cosmic; ($cosmic_results{$line_num}, $matchtype_cosmic) = score_results(\%results_hash, "COSMIC"); $stats{'COSMIC'}{$matchtype_cosmic}++; $stats{'OMIM'}{'silent'}++; $createspreadsheet = "$fileline{$line_num}\t$line_num\t$cosmic_results{$line_num}\tSkipped - Silent Mutation"; } elsif ($find_type =~ m/position_nucleotide/) { my ($findtype,$cosmic_genes,$chr,$gen_start,$gen_stop,$reference,$mutant) = split(":",$find_type); $results_hash{NT}{MATCH}{COSMIC}{$transcript} = ": Nucleotide -> Cosmic Gene(s): $cosmic_genes Position:$chr:$gen_start-$gen_stop,ref:$reference,mut:$mutant"; my $matchtype_cosmic; ($cosmic_results{$line_num}, $matchtype_cosmic) = score_results(\%results_hash, "COSMIC"); $stats{'COSMIC'}{$matchtype_cosmic}++; $stats{'OMIM'}{'silent'}++; $createspreadsheet = "$fileline{$line_num}\t$line_num\t$cosmic_results{$line_num}\tSkipped - Silent Mutation"; } elsif ($find_type =~ m/near_match/) { my ($findtype,$diff_position) = split(":",$find_type); $results_hash{NT}{ALMOST}{COSMIC}{$transcript}=": Nucleotide $diff_position bases away"; my $matchtype_cosmic; ($cosmic_results{$line_num}, $matchtype_cosmic) = score_results(\%results_hash, "COSMIC"); $stats{'COSMIC'}{$matchtype_cosmic}++; $stats{'OMIM'}{'silent'}++; $createspreadsheet = "$fileline{$line_num}\t$line_num\t$cosmic_results{$line_num}\tSkipped - Silent Mutation"; } print SUMMARY "$createspreadsheet\n"; next; #skip silent mutations } #look for mutation file gene ($hugo) to match cosmic file gene (%cosmic_gene{gene}), note matched name in $cosmic_hugo my $cosmic_hugo; my $uc_hugo = uc($hugo); if (exists($cosmic_gene{$hugo})) { $cosmic_hugo = $hugo; } elsif (exists($cosmic_gene{$uc_hugo})) { # check for UPPERCASE hugo match $cosmic_hugo = $uc_hugo; } else { #if cosmic key needs to be uppercase to match mutation file (for example, maf default is all uppercase) foreach my $key (keys %cosmic_gene) { # check for UPPERCASE keys match if ($uc_hugo eq uc($key)) { $cosmic_hugo = $key; } } } #genes that didn't find a match go here, will check for position matches later unless (defined($cosmic_hugo)) { if (-e "$cosmic_dir/$hugo\.csv") { #database flatfile has only genes with AA changes, check source files for gene existance $results_hash{NT}{NOVEL}{COSMIC}{$transcript}=": Gene $hugo in Cosmic but No Amino Acid Results for Gene"; } else { $results_hash{NT}{NOVEL}{COSMIC}{$transcript}=": Gene $hugo not in Cosmic Database"; } } #retrieve COSMIC match if (!defined($aa_change) || $aa_change eq 'NULL') { warn "We skipped silent mutations, how do we have undel or NULL amino acids at non-silent sites? Gene:$hugo AA:$aa_change\n"; } #Start checks,First check position (then check amino acid) my $cosmic_find_type; my @aa_holder; if(defined($Start_position) && defined($End_position) && $Start_position ne ' ' && $End_position ne ' ') { my $find_type = $self->CheckPositionMatch($Start_position, $End_position, $Reference_Allele, $proper_allele, \%cosmic_position_only, \%cosmic_position); if ($find_type =~ m/no_match/) { $results_hash{NT}{NOVEL}{COSMIC}{$transcript}=": Nucleotide"; } elsif ($find_type =~ m/position/) { my ($findtype,$cosmic_genes,$chr,$gen_start,$gen_stop) = split(":",$find_type); $results_hash{NT}{POSITION}{COSMIC}{$transcript}=": Nucleotide -> Cosmic Gene(s):$cosmic_genes Position:$chr:$gen_start-$gen_stop"; } elsif ($find_type =~ m/position_nucleotide/) { my ($findtype,$cosmic_genes,$chr,$gen_start,$gen_stop,$reference,$mutant) = split(":",$find_type); $results_hash{NT}{MATCH}{COSMIC}{$transcript} = ": Nucleotide -> Cosmic Gene(s): $cosmic_genes Position:$chr:$gen_start-$gen_stop,ref:$reference,mut:$mutant"; } elsif ($find_type =~ m/near_match/) { my ($findtype,$diff_position) = split(":",$find_type); $results_hash{NT}{ALMOST}{COSMIC}{$transcript}=": Nucleotide $diff_position bases away"; } } #check amino acid here if ($cosmic_hugo && exists($cosmic_gene{$cosmic_hugo})) { $cosmic_find_type = 'no_match'; foreach my $key (keys %{$aa_count{$cosmic_hugo}}) { if ($key =~ m/\S+/) { push(@aa_holder,"$key ($aa_count{$cosmic_hugo}{$key})"); } } unless (@aa_holder) { @aa_holder = "AA?Unknown?"; } if ($res_start && $res_stop && exists($residue_match{$cosmic_hugo}) && exists($residue_match{$cosmic_hugo}{$res_start}) && exists($residue_match{$cosmic_hugo}{$res_start}{$res_stop})) { #match amino acid $cosmic_find_type = 'position'; $results_hash{AA}{POSITION}{COSMIC}{$transcript}=": Amino Acid -> Matched $cosmic_hugo, $res_start, $res_stop"; if ($residue1 && $residue2 && ($residue_match{$cosmic_hugo}{$res_start}{$res_stop}{$residue1}{$residue2} || $residue_match{$cosmic_hugo}{$res_start}{$res_stop}{uc($residue1)}{$residue2} || $residue_match{$cosmic_hugo}{$res_start}{$res_stop}{$residue1}{uc($residue2)} || $residue_match{$cosmic_hugo}{$res_start}{$res_stop}{uc($residue1)}{uc($residue2)})) { # matches both amino acid and amino acid position $cosmic_find_type = 'position_aminoacid'; my $addition; if ($res_start == $res_stop){ $addition = $residue1.$res_start.$residue2; } else { $addition = $residue1.$res_start."-".$res_stop.$residue2; } $results_hash{AA}{MATCH}{COSMIC}{$transcript} = ": Amino Acid -> Matched $cosmic_hugo, $addition"; } } if($cosmic_find_type && $cosmic_find_type eq 'no_match') { if ($self->show_known_hits) { $results_hash{AA}{NOVEL}{COSMIC}{$transcript}=": Amino Acid -> Known AA = @aa_holder"; } else { $results_hash{AA}{NOVEL}{COSMIC}{$transcript}=": Amino Acid -> Known AA Not Shown"; } my $iter_start = $res_start - $aa_range; my $iter_stop = $res_stop + $aa_range; my $iter; for($iter = $iter_start; $iter <= $iter_stop; $iter++) { if ($res_start && $res_stop && exists($residue_match{$cosmic_hugo}) && exists($residue_match{$cosmic_hugo}{$iter})) { if ($self->show_known_hits) { $results_hash{AA}{ALMOST}{COSMIC}{$transcript} = ": Amino Acid -> Known AA for Gene = @aa_holder"; } else { $results_hash{AA}{ALMOST}{COSMIC}{$transcript} = ": Amino Acid -> Known AA Not Shown"; } } } } } #retrieve OMIM match my $omim_find_type; my $omim = \%omimaa; if (exists($omim->{$hugo})) { $omim_find_type = FindOMIM(\%omimaa,$hugo,$res_start,$res_stop,$residue1,$residue2,$aa_range); } #Add OMIM result to the results hash if(defined($omim_find_type)) { if ($omim_find_type eq 'position_aminoacid') { $results_hash{AA}{MATCH}{OMIM}{$transcript} =": Amino Acid"; } elsif ($omim_find_type eq 'position') { $results_hash{AA}{POSITION}{OMIM}->{$transcript}=": Amino Acid"; } elsif ($omim_find_type eq 'almost') { $results_hash{AA}{ALMOST}{OMIM}->{$transcript}=": Amino Acid"; } else { $results_hash{AA}{NOVEL}{OMIM}->{$transcript}=": Amino Acid"; } } else { $results_hash{AA}{NOVEL}{OMIM}->{$transcript}=": Amino Acid - OMIM Gene Name Not Found"; } #now check to see what the 'best' cosmic score was my $matchtype_cosmic; ($cosmic_results{$line_num}, $matchtype_cosmic) = score_results(\%results_hash, "COSMIC"); $stats{'COSMIC'}{$matchtype_cosmic}++; #now check to see what the 'best' omim score was my $matchtype_omim; ($omim_results{$line_num}, $matchtype_omim) = score_results(\%results_hash, "OMIM"); $stats{'OMIM'}{$matchtype_omim}++; my $createspreadsheet = "$fileline{$line_num}\t$line_num\t$cosmic_results{$line_num}\t$omim_results{$line_num}"; print SUMMARY "$createspreadsheet\n"; # } } } } close(SUMMARY); if ($verbose) {print "Finished COSMIC/OMIM to Mutation File Comparisons! HAPPY HAPPY JOY JOY!\n";} print "\n"; print "Number of Genes in OMIM: $stats{'OMIMDB'}\n"; print "Number of AA and NT Matches: $stats{'OMIM'}{'doublematch'}\n"; print "Number of NT only Matches: $stats{'OMIM'}{'ntmatch'}\n"; print "Number of AA only Matches: $stats{'OMIM'}{'aamatch'}\n"; print "Number of AA and NT Position Matches: $stats{'OMIM'}{'posmatch'}\n"; print "Number of NT only Position Matches: $stats{'OMIM'}{'ntposmatch'}\n"; print "Number of AA only Position Matches: $stats{'OMIM'}{'aaposmatch'}\n"; print "Number of NT and AA Novel Sites with Matches in Near Proximity: $stats{'OMIM'}{'nearmatch'}\n"; print "Number of NT only Novel Sites with Matches in Near Proximity: $stats{'OMIM'}{'ntnearmatch'}\n"; print "Number of AA only Novel Sites with Matches in Near Proximity: $stats{'OMIM'}{'aanearmatch'}\n"; print "Number of NT and AA Novel Sites with Nothing in Near Proximity: $stats{'OMIM'}{'novel'}\n"; print "Number of NT Novel Sites with Nothing in Near Proximity: $stats{'OMIM'}{'ntnovel'}\n"; print "Number of AA Novel Sites with Nothing in Near Proximity: $stats{'OMIM'}{'aanovel'}\n"; print "Number of Silent Mutations Skipped: $stats{'OMIM'}{'silent'}\n"; print "Number of Lines that Exited with No Hit: $stats{'OMIM'}{'nomatch'}\n"; print "\n"; print "Number of Genes in COSMIC: $stats{'COSMICDB'}\n"; print "Number of AA and NT Matches: $stats{'COSMIC'}{'doublematch'}\n"; print "Number of NT only Matches: $stats{'COSMIC'}{'ntmatch'}\n"; print "Number of AA only Matches: $stats{'COSMIC'}{'aamatch'}\n"; print "Number of AA and NT Position Matches: $stats{'COSMIC'}{'posmatch'}\n"; print "Number of NT only Position Matches: $stats{'COSMIC'}{'ntposmatch'}\n"; print "Number of AA only Position Matches: $stats{'COSMIC'}{'aaposmatch'}\n"; print "Number of NT and AA Novel Sites with Matches in Near Proximity: $stats{'COSMIC'}{'nearmatch'}\n"; print "Number of NT only Novel Sites with Matches in Near Proximity: $stats{'COSMIC'}{'ntnearmatch'}\n"; print "Number of AA only Novel Sites with Matches in Near Proximity: $stats{'COSMIC'}{'aanearmatch'}\n"; print "Number of NT and AA Novel Sites with Nothing in Near Proximity: $stats{'COSMIC'}{'novel'}\n"; print "Number of NT Novel Sites with Nothing in Near Proximity: $stats{'COSMIC'}{'ntnovel'}\n"; print "Number of AA Novel Sites with Nothing in Near Proximity: $stats{'COSMIC'}{'aanovel'}\n"; print "Number of Silent Mutations Skipped: $stats{'COSMIC'}{'silent'}\n"; print "Number of Lines that Exited with No Hit: $stats{'COSMIC'}{'nomatch'}\n"; return 1; } ################################################################################ # # # S U B R O U T I N E S # # # ################################################################################ sub CheckPositionMatch { my $self = shift; my $genomic_start = shift; my $genomic_stop = shift; my $nt1 = shift; my $nt2 = shift; my $cp_only = shift; my %cosmic_position_only = %{$cp_only}; my $cp_all = shift; my %cosmic_position = %{$cp_all}; my $verbose = $self->verbose; my $nuc_range = $self->nuc_range; my $find_type = 'no_match'; foreach my $chr (sort keys %cosmic_position_only) { # Test that it at least matches position foreach my $gen_start (keys %{$cosmic_position_only{$chr}}) { my $diff_start = $gen_start - $genomic_start; if ($gen_start == $genomic_start) { foreach my $gen_stop (keys %{$cosmic_position_only{$chr}{$gen_start}}) { my $diff_stop = $gen_stop - $genomic_stop; if ($gen_stop == $genomic_stop) { my $cosmic_genes; if (keys %{$cosmic_position{$chr}{$gen_start}{$gen_stop}}) { my @cosmic_genes = keys %{$cosmic_position{$chr}{$gen_start}{$gen_stop}}; $cosmic_genes = join(",",@cosmic_genes); } if ($find_type eq 'no_match' || $find_type eq 'near_match') { $find_type = "position:$cosmic_genes:$chr:$gen_start:$gen_stop"; } # Test that it matches both foreach my $nucleo (keys %{$cosmic_position_only{$chr}{$gen_start}{$gen_stop}}) { my ($start,$stop,$type_length,$type,$reference,$mutant) = parse_nucleotide($nucleo,$verbose); if($reference && $mutant && $reference eq $nt1 && $mutant eq $nt2) { $find_type = "position_nucleotide:$cosmic_genes:$chr:$gen_start:$gen_stop:$reference:$mutant"; return $find_type; } } } elsif ($diff_stop <= $nuc_range && $diff_stop >= -$nuc_range) { if ($find_type eq 'no_match') { $find_type = "near_match:$diff_stop"; } } } } elsif ($diff_start <= $nuc_range && $diff_start >= -$nuc_range) { if ($find_type eq 'no_match') { $find_type = "near_match:$diff_start"; } } } } return $find_type; } sub FindOMIM { my ($omim, $hugo,$res_start, $res_stop, $residue1, $residue2, $aa_range) = @_; my $return_value = 'no_match'; unless (exists($omim->{$hugo})) { warn "No omim entry for: $hugo"; return $return_value; } foreach my $sample (keys %{$omim->{$hugo}}) { # Test that it at least matches position if (exists($omim->{$hugo}{$sample}{$res_start})) { $return_value = 'position'; # Test that it matches both if (exists($omim->{$hugo}{$sample}{$res_start}{residue1}) && exists($omim->{$hugo}{$sample}{$res_start}{residue2}) && defined($omim->{$hugo}{$sample}{$res_start}{residue1}) && defined($omim->{$hugo}{$sample}{$res_start}{residue2}) && uc($omim->{$hugo}{$sample}{$res_start}{residue1}) eq uc($residue1) && uc($omim->{$hugo}{$sample}{$res_start}{residue2}) eq uc($residue2)) { return 'position_aminoacid'; } } elsif ($return_value eq 'no_match') { my $iter_start = $res_start - $aa_range; my $iter_stop = $res_stop + $aa_range; my $iter; for($iter = $iter_start; $iter <= $iter_stop; $iter++) { if (exists($omim->{$hugo}{$sample}{$iter})) { $return_value = 'almost'; } } } } return $return_value; } sub score_results { my ($results, $database) = @_; my $matchtype; if(exists($results->{NT}{MATCH}->{$database}) && exists($results->{AA}{MATCH}->{$database})) { #best hit was a DOUBLE MATCH. Huzzah! $matchtype = 'doublematch'; my ($transcript) = keys %{$results->{NT}{MATCH}{$database}}; my ($transcript2) = keys %{$results->{AA}{MATCH}{$database}}; my $ret_value = "NT and AA Match".$results->{AA}{MATCH}->{$database}{$transcript2}." and ".$results->{NT}{MATCH}->{$database}{$transcript}; return ($ret_value, $matchtype); } elsif(exists($results->{NT}{MATCH}->{$database})) { #best hit was a MATCH. Huzzah! $matchtype = 'ntmatch'; my ($transcript) = keys %{$results->{NT}{MATCH}{$database}}; my $ret_value = "Match".$results->{NT}{MATCH}->{$database}{$transcript}; return ($ret_value, $matchtype); } elsif(exists($results->{AA}{MATCH}->{$database})) { #best hit was a MATCH. Huzzah! $matchtype = 'aamatch'; my ($transcript) = keys %{$results->{AA}{MATCH}{$database}}; my $ret_value = "Match".$results->{AA}{MATCH}->{$database}{$transcript}; return ($ret_value, $matchtype); } elsif(exists($results->{NT}{POSITION}->{$database}) && exists($results->{AA}{POSITION}->{$database})) { #best hit was a position match $matchtype = 'posmatch'; my ($transcript) = keys %{$results->{NT}{POSITION}{$database}}; my ($transcript2) = keys %{$results->{AA}{POSITION}{$database}}; my $ret_value = "NT and AA Position Match".$results->{AA}{POSITION}->{$database}{$transcript2}." and ".$results->{NT}{POSITION}->{$database}{$transcript}; return ($ret_value, $matchtype); } elsif(exists($results->{NT}{POSITION}->{$database})) { #best hit was a position match $matchtype = 'ntposmatch'; my ($transcript) = keys %{$results->{NT}{POSITION}{$database}}; my $ret_value = "Position Match".$results->{NT}{POSITION}->{$database}{$transcript}; return ($ret_value, $matchtype); } elsif(exists($results->{AA}{POSITION}->{$database})) { #best hit was a position match $matchtype = 'aaposmatch'; my ($transcript) = keys %{$results->{AA}{POSITION}{$database}}; my $ret_value = "Position Match".$results->{AA}{POSITION}->{$database}{$transcript}; return ($ret_value, $matchtype); } elsif(exists($results->{NT}{ALMOST}->{$database}) && exists($results->{AA}{ALMOST}->{$database})) { #best hit was near a position match $matchtype = 'nearmatch'; my ($transcript) = keys %{$results->{NT}{ALMOST}{$database}}; my ($transcript2) = keys %{$results->{AA}{ALMOST}{$database}}; my $ret_value = "NT and AA Novel, but near match".$results->{AA}{ALMOST}->{$database}{$transcript2}." and ".$results->{NT}{ALMOST}->{$database}{$transcript}; return ($ret_value, $matchtype); } elsif(exists($results->{NT}{ALMOST}->{$database})) { #best hit was near a position match $matchtype = 'ntnearmatch'; my ($transcript) = keys %{$results->{NT}{ALMOST}{$database}}; my $ret_value = "Novel, but near match".$results->{NT}{ALMOST}->{$database}{$transcript}; return ($ret_value, $matchtype); } elsif(exists($results->{AA}{ALMOST}->{$database})) { #best hit was near a position match $matchtype = 'aanearmatch'; my ($transcript) = keys %{$results->{AA}{ALMOST}{$database}}; my $ret_value = "Novel, but near match".$results->{AA}{ALMOST}->{$database}{$transcript}; return ($ret_value, $matchtype); } elsif(exists($results->{AA}{NOVEL}->{$database}) && exists($results->{NT}{NOVEL}->{$database})) { #no hits, novel $matchtype = 'novel'; my ($transcript) = keys %{$results->{AA}{NOVEL}{$database}}; my ($transcript2) = keys %{$results->{NT}{NOVEL}{$database}}; my $ret_value = "Novel".$results->{AA}{NOVEL}->{$database}{$transcript}. " and ".$results->{NT}{NOVEL}->{$database}{$transcript2}; return ($ret_value, $matchtype); } elsif(exists($results->{AA}{NOVEL}->{$database})) { #no hits, novel $matchtype = 'aanovel'; my ($transcript) = keys %{$results->{AA}{NOVEL}{$database}}; my $ret_value = "Novel".$results->{AA}{NOVEL}->{$database}{$transcript}; return ($ret_value, $matchtype); } elsif(exists($results->{NT}{NOVEL}->{$database})) { #no hits, novel $matchtype = 'ntnovel'; my ($transcript) = keys %{$results->{NT}{NOVEL}{$database}}; my $ret_value = "Novel".$results->{NT}{NOVEL}->{$database}{$transcript}; return ($ret_value, $matchtype); } else { #it was a nomatch! this shouldn't happen. $matchtype = 'nomatch'; my ($transcript) = keys %{$results->{NOMATCH}}; my $ret_value = (defined($transcript) && $results->{NOMATCH}{$transcript}) ? $results->{NOMATCH}{$transcript} : "Unknown/NULL"; return ($ret_value, $matchtype); } } sub parse_nucleotide { my ($string, $verbose) = @_; my ($change, $modifier); my ($start,$stop,$type_length,$type,$reference,$mutant); ($start,$stop,$change) = $string =~ /^c\. (\d+) _ (\d+) (.*) $/x; unless (defined $change) { ($start,$change) = $string =~ /^c\. (\d+) (.*) $/x; } if ($string =~ /^c\.\w* \Q?\E.* /x) { #ambiguous entry return; } if( defined $change) { #first check to make sure its not intronic if ($change =~ /^ (-|\+) (\d+) (.*) $/x) { if ($verbose) {print "Mutation Appears Intronic";} $change = $3; } #This could be one of several possiblities #First distinguish between > notation and del18 notation ($reference,$mutant) = $change =~ /^ (\D*) > (\D+) $/x; if(defined($reference) && $reference eq '') { $reference = undef; } if(defined $mutant) { #Then we expect that this format is correct. if((defined($stop) && $stop eq $start) || !defined($stop)) { #We have a snp $type = 'SNP'; $type_length = 1; } else { #assuming that if it is listed explicitly it is an indel $type = 'indel'; if(defined $reference) { $type_length = length $reference; } else { $type_length = abs($start-$stop)+1; } } return ($start, $stop, $type_length, $type, $reference, $mutant); } else { #did not guess right. Should be either del15 or insAAT type of format $type = substr $change, 0,3; $modifier = substr $change, 3, (length($change) - 1); if($type =~ /^ (del|ins|delins) $/xi) { if($type eq 'delins') { $type = 'indel'; } #then insertion if($modifier =~ /^ (\d+) $/x) { #it is a digit $type_length = $1; return ($start, $stop, $type_length, $type, $reference, $mutant); } else { #it is a sequence $type_length = length $modifier; return ($start, $stop, $type_length, $type, $reference, $modifier); } } else { #unrecognized format warn "Unable to parse nucleotide format in: $string\n"; return; } } } else { warn "Unable to parse nucleotide format in: $string\n"; return; } } sub ParseMutationFile { my ($fh,$file,$wuheaders) = @_; my $source = 'mutation_csv'; # 'CSV' my $keyfields = 'HUGO_SYMBOL:TUMOR_SAMPLE_ID:file_line_num'; my $field_subset_array = undef; my $header_translation; if ($wuheaders) { $header_translation = { 'chromosome_name' => 'CHROMOSOME', 'start' => 'START_POSITION', 'stop' => 'END_POSITION', 'reference' => 'REFERENCE_ALLELE', 'variant' => 'TUMOR_SEQ_ALLELE1', # 'variant' => 'TUMOR_SEQ_ALLELE2', 'type' => 'VARIANT_TYPE', 'gene_name' => 'HUGO_SYMBOL', 'transcript_name' => 'TRANSCRIPT', 'strand' => 'TUMOR_SAMPLE_ID', #meaningless proxy 'amino_acid_change' => 'AA_CHANGE', }; } else { $header_translation = { 'Chromosome' => 'CHROMOSOME', 'Start_position' => 'START_POSITION', 'End_position' => 'END_POSITION', 'Reference_Allele' => 'REFERENCE_ALLELE', 'Tumor_Seq_Allele1' => 'TUMOR_SEQ_ALLELE1', 'Tumor_Seq_Allele2' => 'TUMOR_SEQ_ALLELE2', 'Variant_Type' => 'VARIANT_TYPE', 'Hugo_Symbol' => 'HUGO_SYMBOL', 'transcript_name' => 'TRANSCRIPT', 'Strand' => 'TUMOR_SAMPLE_ID', #meaningless proxy 'amino_acid_change' => 'AA_CHANGE', }; } my $header_skip = 0; my $no_header = 0; my $header_fields = undef; my $ucheader_fields = undef; my $separator = "\t"; my $no_spaces = 0; my $line_number_field = 'file_line_num'; my $line_field = 'file_line'; my $line_num = 1; my $record = {}; my $csv = Text::CSV_XS->new({'sep_char' => $separator}); #parse MAF header my $header = <$fh>; while ($header =~ /^#/) { $header = <$fh>; } unless ($header =~ m/Hugo_Symbol/) { die "Header field \"Hugo_Symbol\" not found. Therefore, your file is assumed to not have a header. If this is in error, please contact the authors to fix this line in the code. Otherwise, please include a header in your input file. Thank you.\n"; } $line_num++; $csv->parse($header); my @header_fields = $csv->fields(); if ($ucheader_fields) { @header_fields = map { uc($_) } @header_fields; } if(defined($no_spaces) && $no_spaces) { @header_fields = map { $_ =~ s/ /_/g } @header_fields; } unless(defined($separator)) { $header =~ s/\t/,/gx; } unshift( @header_fields, $line_field); # Add 'extra' fields of the input line unshift( @header_fields, $line_number_field); # and the line number # Translate the header names, if a translation is given for (my $h = 0; $h <= $#header_fields; $h++) { $header_fields[$h] = (exists($header_translation->{$header_fields[$h]})) ? $header_translation->{$header_fields[$h]} : $header_fields[$h]; } my (@key_fields) = split(':',$keyfields); my %key_fields; @key_fields{ @key_fields } = ( 0 .. $#key_fields ); # Construct field name to position lookup my %header_fields; @header_fields{ @header_fields } = ( 0 .. $#header_fields ); # Construct a subset of the fields--the default is the complete set of fields my %field_subset; if (defined($field_subset_array)) { @field_subset{ @{$field_subset_array} } = @header_fields{ @{$field_subset_array } }; } else { @field_subset{ @header_fields } = ( 0 .. $#header_fields ); } # Construct a list of fields that are not key (are values only) my @value_fields; foreach my $field (@header_fields[ (values %field_subset ) ]) { unless (exists($key_fields{$field})) { push @value_fields, ($field); } } #__PARSE FILE my $line; while ($line = <$fh>) { chomp ($line); my $temp = $line; #edit for punctuation characters in annotation section of MAF $temp =~ s/"//g; $temp =~ s/'//g; unless(defined($separator)) { #maintain original default behavior of handling both tabs and #commas $temp =~ s/\t/,/gx; } $csv->parse($temp); my @values = $csv->fields(); if(defined($no_spaces) && $no_spaces) { @values = map { $_ =~ s/ /_/g } @values; } unshift (@values, $line); # Add 'extra' fields of the input line unshift (@values, $line_num++); # and the line number my $sub_record; # Construct the hierarchical key structure $sub_record = $record; foreach my $sub_key (@key_fields) { unless (exists($sub_record->{ $values[ $header_fields{ $sub_key } ] } )) { $sub_record->{ $values[ $header_fields{ $sub_key } ] } = {}; } $sub_record = $sub_record->{ $values[ $header_fields{ $sub_key } ] }; } # Get the hash array slice of the values @{$sub_record}{ @value_fields } = @values[ @header_fields{@value_fields} ]; } #__DUMP PARSED RESULTS AND STOP IF JUST CHECKING Process($record); #__RETURN DATA STRUCTS return ($record); } sub Process { my ($input) = @_; my ($output) = {}; foreach my $hugo (keys (%{$input})) { foreach my $sample (keys (%{$input->{$hugo}})) { foreach my $line_num (keys (%{$input->{$hugo}->{$sample}})) { $output->{$hugo}->{$sample}->{$line_num} = $input->{$hugo}->{$sample}->{$line_num}; my ($residue1, $res_start, $residue2, $res_stop, $new_residue) = AA_Check( $input->{$hugo}->{$sample}->{$line_num}->{AA_CHANGE} , $line_num ); $output->{$hugo}->{$sample}->{$line_num}->{res_start} = $res_start; $output->{$hugo}->{$sample}->{$line_num}->{res_stop} = $res_stop; $output->{$hugo}->{$sample}->{$line_num}->{residue1} = $residue1; $output->{$hugo}->{$sample}->{$line_num}->{residue2} = $residue2; $output->{$hugo}->{$sample}->{$line_num}->{new_residue} = $new_residue; } } } #__DUMP PARSED RESULTS AND STOP IF JUST CHECKING return ($output); } sub AA_Check { my ($AminoAcidChange_string, $line_num) = @_; my ($residue1, $res_start, $residue2, $res_stop, $new_residue); unless (defined($AminoAcidChange_string)) { return ($residue1, $res_start, $residue2, $res_stop, $new_residue); } unless ($AminoAcidChange_string =~ m/^p\./) { return ($residue1, $res_start, $residue2, $res_stop, $new_residue); } #__FORMULATE ERROR STRING JUST IN CASE my $string = "'$AminoAcidChange_string' is not a valid AminoAcidChange"; $string .= " on line $line_num" if defined $line_num; #__VALIDATE $AminoAcidChange_string =~ s/^p\.//x; if ($AminoAcidChange_string =~ /^ (\D+) (\d+) _ (\D+) (\d+) (.*) $/x ) { ($residue1, $res_start, $residue2, $res_stop, $new_residue) = ($1, $2, $3, $4, $5); } elsif ($AminoAcidChange_string =~ /^ (\D+) (\d+) (\D+) (.*) $/x ) { ($residue1, $res_start, $residue2, $new_residue) = ($1, $2, $3, $4); $res_stop = $res_start; } elsif ($AminoAcidChange_string =~ /^ (\d+) (.*) $/x ) { ($res_start, $residue2, $res_stop, $new_residue) = ($1, $2); $residue1 = '*'; $res_stop = $res_start; $new_residue = $residue2; } elsif ($AminoAcidChange_string =~ /^ (\D+) (\d+) $/x ) { ($residue1, $res_start) = ($1, $2); $new_residue = ' '; $residue2 = ' '; $res_stop = $res_start; } if (defined($new_residue)) { $new_residue =~ s/^ > //x; } $new_residue ||= ''; return ($residue1, $res_start, $residue2, $res_stop, $new_residue); } Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Galaxy.pm000444000765000024 356412013522176 25173 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::Galaxy; use strict; use warnings; use Genome; use File::Basename; our $VERSION = $Genome::Model::Tools::Music::VERSION; class Genome::Model::Tools::Music::Galaxy { is => "Genome::Model::Tools::Music::Play", has_input => [ output_bundle => { is => 'Text', doc => 'Location where Galaxy would like the bundle of Music outputs to be saved', }, ], has => [ output_dir => { is_input => 0, is_output => 0, is_optional => 1, default => '', }, ], }; sub execute { my $self = shift; my $output_dir = Genome::Sys->create_temp_directory(); $self->output_dir($output_dir); $self->bam_list($self->create_new_bam_list); $self->SUPER::_execute_body(@_); my $tar_path = $self->output_bundle; my $cmd = "tar -cf $tar_path -C $output_dir ."; my $rv = Genome::Sys->shellcmd( cmd => $cmd, ); return 1; } sub create_new_bam_list { my $self = shift; my $original_bam_file = $self->bam_list; my $output_dir = Genome::Sys->create_temp_directory(); my $new_bam_list = "$output_dir/bam_list"; my @bams = Genome::Sys->read_file($original_bam_file); my $new_bam_fh = Genome::Sys->open_file_for_writing($new_bam_list); for (@bams) { chomp; my ($sample, $first_bam, $second_bam) = split("\t", $_); my $first_new_path = $self->index_and_link_bam_file($output_dir, $first_bam); my $second_new_path = $self->index_and_link_bam_file($output_dir, $second_bam); $new_bam_fh->print("$sample\t$first_new_path\t$second_new_path\n"); } $new_bam_fh->close(); return $new_bam_list; } sub index_and_link_bam_file { my $self = shift; my $output_dir = shift; my $bam = shift; my $filename = $output_dir . "/" . basename($bam); Genome::Sys->create_symlink($bam, $filename); Genome::Sys->shellcmd( cmd => "samtools index $filename"); return $filename; } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/MutationRelation.pm000444000765000024 1741612013522176 27265 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::MutationRelation; use warnings; use strict; use Carp; use Genome; use IO::File; use POSIX qw( WIFEXITED ); our $VERSION = $Genome::Model::Tools::Music::VERSION; class Genome::Model::Tools::Music::MutationRelation { is => 'Command::V2', has_input => [ bam_list => { is => 'Text', doc => "Tab delimited list of BAM files [sample_name, normal_bam, tumor_bam] (See Description)" }, maf_file => { is => 'Text', doc => "List of mutations in MAF format" }, mutation_matrix_file => { is => 'Text', doc => "Optionally store the sample-vs-gene matrix used during calculations.", is_optional => 1 }, output_file => { is => 'Text', doc => "Results of mutation-relation tool", is_output => 1 }, permutations => { is => 'Number', doc => "Number of permutations used to determine P-values", is_optional => 1, default => 100 }, gene_list => { is => 'Text', doc => "List of genes to test, typically SMGs. If unspecified, all genes in MAF are tested.", is_optional => 1 }, skip_non_coding => { is => 'Boolean', doc => "Skip non-coding mutations from the provided MAF file", is_optional => 1, default => 1 }, skip_silent => { is => 'Boolean', doc => "Skip silent mutations from the provided MAF file", is_optional => 1, default => 1 }, ], doc => "Identify relationships of mutation concurrency or mutual exclusivity in genes across cases.", }; sub help_synopsis { return <bam_list; my $maf_file = $self->maf_file; my $output_file = $self->output_file; my $gene_list = $self->gene_list; my $permutations = $self->permutations; my @all_sample_names; # names of all the samples, no matter if it's mutated or not my @genes_to_test; # the genes which will be tested for relationships my $skip_non_coding = $self->skip_non_coding; my $skip_silent = $self->skip_silent; # Parse out the names of the samples which should match the names in the MAF file my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!\n"; my $line_count = 0; while( my $line = $sampleFh->getline ) { $line_count++; next if ( $line =~ m/^#/ ); chomp( $line ); my ( $sample ) = split( /\t/, $line ); if ($sample) { push( @all_sample_names, $sample ); } else { warn("could not parse sample name from line $line_count of --bam_list"); } } $sampleFh->close; # If user-specified, parse out the names of the genes that we are limiting our tests to if( defined $gene_list ) { my $geneFh = IO::File->new( $gene_list ) or die "Couldn't open $gene_list. $!\n"; while( my $line = $geneFh->getline ) { next if ( $line =~ m/^#/ ); chomp( $line ); my ( $gene ) = split( /\t/, $line ); push( @genes_to_test, $gene ); } $geneFh->close; } # Create sample-gene matrix my $matrix_file = $self->create_sample_gene_matrix($maf_file, \@all_sample_names, \@genes_to_test, $skip_non_coding, $skip_silent); # Perform mutation-relation test using R my $R_cmd = "R --slave --args < " . __FILE__ . ".R $matrix_file $permutations $output_file"; print "$R_cmd\n"; WIFEXITED(system $R_cmd) or croak "Couldn't run: $R_cmd ($?)"; return(1); } sub create_sample_gene_matrix { my $self = shift; my ( $maf_file, $all_sample_names_ref, $genes_to_test_ref, $skip_non_coding, $skip_silent ) = @_; my @all_sample_names = @{$all_sample_names_ref}; my @genes_to_test = @{$genes_to_test_ref}; # Create hash of mutations from the MAF file my %mutations; my %all_genes; # Parse the MAF file my $mafFh = IO::File->new( $maf_file ) or die "Couldn't open $maf_file. $!\n"; while( my $line = $mafFh->getline ) { next if( $line =~ m/^(#|Hugo_Symbol)/ ); chomp $line; my @cols = split( /\t/, $line ); my ( $gene, $mutation_class, $sample ) = ( $cols[0], $cols[8], $cols[15] ); # If the mutation classification is odd, quit with error if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) { print STDERR "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\n"; print STDERR "Please use TCGA MAF Specification v2.3.\n"; return undef; } # If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region if(( $skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) || ( $skip_silent && $mutation_class =~ m/^Silent$/ )) { print "Skipping $mutation_class mutation in gene $gene.\n"; next; } $all_genes{$gene}++; $mutations{$sample}{$gene}++; } $mafFh->close; # If the user specified a gene list, then check for genes that are not in the MAF if( scalar( @genes_to_test ) > 0 ) { for( my $i = 0; $i < scalar( @genes_to_test ); ++$i ) { unless( defined $all_genes{$genes_to_test[$i]} ) { print "Skipping ", $genes_to_test[$i], " from specified gene-list since it was not found in the MAF file\n"; splice( @genes_to_test, $i, 1 ); } } } else { @genes_to_test = sort keys %all_genes; } # Write the input matrix to a file for use by the R code my $matrix_file; unless( $matrix_file = $self->mutation_matrix_file ) { $matrix_file = Genome::Sys->create_temp_file_path(); } my $matrix_fh = new IO::File $matrix_file,"w"; # Print input matrix file header my $header = join("\t","Sample",@genes_to_test); $matrix_fh->print("$header\n"); # Print mutation relation input matrix for my $sample (sort @all_sample_names) { $matrix_fh->print($sample); for my $gene (@genes_to_test) { if (exists $mutations{$sample}{$gene}) { $matrix_fh->print("\t1"); } else { $matrix_fh->print("\t0"); } } $matrix_fh->print("\n"); } return $matrix_file; } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/MutationRelation.pm.R000444000765000024 606012013522176 27436 0ustar00nnutterstaff000000000000#get command line arguments mutation_file = as.character(commandArgs()[4]); permutations = as.numeric(commandArgs()[5]); output_file = as.character(commandArgs()[6]); # FUNCTION prepare mutation matrix data for permutation seq.vjoint=function(x) { tt=NULL; for (i in 1:length(x)) { tt=c(tt,rep(names(x[i]),x[i])); } tt; } # END seq.vjoint # FUNCTION randomly permute the mutation matrix, keeping mutation-number/sample unchanged matrix.sample.v1=function(x,keep.freq=T) { colSums(x)->pool; rowSums(x)->freq; freq=freq[order(freq,decreasing=T)]; tt=NULL; for (i in c(1:length(freq))) { pooli=seq.vjoint(pool); xi=NULL; for (j in 1:freq[i]) { if (length(pooli)>1) sample(pooli,1)->temp; if (length(pooli)==1) pooli->temp; xi=c(xi,temp); pooli=pooli[pooli!=temp]; } tt=rbind(tt,cbind(i,xi)); pool[xi]=pool[xi]-1; pool=pool[pool>0]; } tt=table(tt[,1],tt[,2]); tt[tt>1]=1; tt; } # END matrix.sample.v1 # FUNCTION calculate the probability of having no correlation between any two genes prob.concur=function(x) { x>0->x; cols=colnames(x); cols=cols[order(cols)]; n=length(cols); tt=NULL; for (i in 1:(n-1)) { for (j in (i+1):n) { vi=cols[i]; vj=cols[j]; sum(x[,vi] & x[,vj])->nand; sum(x[,vi] != x[,vj])->nexc; ni=sum(x[,vi]); nj=sum(x[,vj]); temp=cbind(vi,vj,ni,nj,nand,nexc); tt=rbind(tt,temp); } } tt=as.data.frame(tt,stringsAsFactors=F); tt$nand=as.numeric(tt[,"nand"]); tt$nexc=as.numeric(tt[,"nexc"]); tt; } # END prob.concur # FUNCTION performing mutation correlation mut_cor_permu_test=function(x=NULL,n.permu=100,seed=NULL,mut.file=NULL,out.file=NULL,out.rdata=NULL) { if (!is.null(mut.file)) read.table(mut.file,header=T)->x x=x[,-1];x[x>0]=1 x=x[rowSums(x)>0,colSums(x)>0] prob.concur(x)->pc0 if (!is.null(seed)) set.seed(seed) pp=0;en=0 for (i in 1:n.permu) { doit=1; while(doit==1) { xi=matrix.sample.v1(x); if (max(xi)==1) doit=0; } prob.concur(xi)->pci; en=en+pci[,c("nand","nexc")]; as.numeric(pci$nand>=pc0$nand)->pand; as.numeric(pci$nexc>=pc0$nexc)->pexc; pp=pp+cbind(pand,pexc); } en=en/n.permu;pp=pp/n.permu; pci[,c("nand","nexc")]=en; pci=cbind(pci,pp); tt=merge(pc0,pci,by.x=c("vi","vj"),by.y=c("vi","vj")); if (!is.null(out.rdata)) save(tt,file=out.rdata,compress=T); names(tt)= c("Gene1","Gene2","CntGene1","CntGene2","AndCnt","XorCnt","Perm_CntGene1","Perm_CntGene2","Perm_AndCnt","Perm_XorCnt","Pvalue_And","Pvalue_Xor") if (!is.null(out.file)) write.table(tt,file=out.file,quote=F,sep="\t",row.names=F); invisible (tt); } #END mut_cor_permu_test #run test using mut_cor_permu_test mut_cor_permu_test(mut.file=mutation_file,n.permu=permutations,out.file=output_file); Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/PathScan.pm000444000765000024 4662012013522176 25467 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::PathScan; use warnings; use strict; use Genome; use Genome::Model::Tools::Music::PathScan::PopulationPathScan; use IO::File; our $VERSION = $Genome::Model::Tools::Music::VERSION; class Genome::Model::Tools::Music::PathScan { is => 'Command::V2', has_input => [ gene_covg_dir => { is => 'Text', doc => "Directory containing per-gene coverage files (Created using music bmr calc-covg)" }, bam_list => { is => 'Text', doc => "Tab delimited list of BAM files [sample_name, normal_bam, tumor_bam] (See Description)" }, pathway_file => { is => 'Text', doc => "Tab-delimited file of pathway information (See Description)" }, maf_file => { is => 'Text', doc => "List of mutations using TCGA MAF specifications v2.3" }, output_file => { is => 'Text', doc => "Output file that will list the significant pathways and their p-values", is_output => 1 }, bmr => { is => 'Number', doc => "Background mutation rate in the targeted regions", is_optional => 1, default => 1.0E-6 }, genes_to_ignore => { is => 'Text', doc => "Comma-delimited list of genes whose mutations should be ignored", is_optional => 1 }, min_mut_genes_per_path => { is => 'Number', doc => "Pathways with fewer mutated genes than this, will be ignored", is_optional => 1, default => 1 }, skip_non_coding => { is => 'Boolean', doc => "Skip non-coding mutations from the provided MAF file", is_optional => 1, default => 1 }, skip_silent => { is => 'Boolean', doc => "Skip silent mutations from the provided MAF file", is_optional => 1, default => 1 }, ], doc => "Find signifcantly mutated pathways in a cohort given a list of somatic mutations.", }; sub help_synopsis { return <gene_covg_dir; my $bam_list = $self->bam_list; my $pathway_file = $self->pathway_file; my $maf_file = $self->maf_file; my $output_file = $self->output_file; my $bgd_mut_rate = $self->bmr; my $genes_to_ignore = $self->genes_to_ignore; my $min_mut_genes_per_path = $self->min_mut_genes_per_path; my $skip_non_coding = $self->skip_non_coding; my $skip_silent = $self->skip_silent; # Check on all the input data before starting work print STDERR "MAF file not found or is empty: $maf_file\n" unless( -s $maf_file ); print STDERR "Directory with gene coverages not found: $covg_dir\n" unless( -e $covg_dir ); print STDERR "List of samples not found or is empty: $bam_list\n" unless( -s $bam_list ); print STDERR "Pathway info file not found or is empty: $pathway_file\n" unless( -s $pathway_file ); return undef unless( -s $maf_file && -e $covg_dir && -s $bam_list && -s $pathway_file ); # Build a hash to quickly lookup the genes whose mutations should be ignored my %ignored_genes = (); if( defined $genes_to_ignore ) { %ignored_genes = map { $_ => 1 } split( /,/, $genes_to_ignore ); } # PathScan uses a helluva lot of hashes - all your RAM are belong to it my %sample_gene_hash; # sample => array of genes (based on maf) my %gene_path_hash; # gene => array of pathways (based on path_file) my %path_hash; # pathway => all the information about the pathways in the database my %sample_path_hash; # sample => pathways (based on %sample_gene_hash and %gene_path_hash) my %path_sample_hits_hash; # path => sample => hits,mutated_genes my %gene_sample_cov_hash; # gene => sample => coverage my @all_sample_names; # names of all the samples, no matter if it's mutated or not my %id_gene_hash; # entrez id => gene (based on first two columns in MAF) # Parse out the names of the samples which should match the names in the MAF file my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!\n"; while( my $line = $sampleFh->getline ) { next if ( $line =~ m/^#/ ); chomp( $line ); my ( $sample ) = split( /\t/, $line ); push( @all_sample_names, $sample ); } $sampleFh->close; # Read coverage data calculated by the Music::Bmr::CalcCovg $covg_dir =~ s/(\/)+$//; # Remove trailing forward slashes if any read_CoverageFiles( $covg_dir, \@all_sample_names, \%gene_sample_cov_hash ); #build gene => average_coverage hash for population test my %gene_cov_hash; foreach my $gene ( keys %gene_sample_cov_hash ) { my $total_cov = 0; my $sample_num = scalar( @all_sample_names ); $total_cov += $gene_sample_cov_hash{$gene}{$_} foreach( @all_sample_names ); $gene_cov_hash{$gene} = int( $total_cov / $sample_num ); } #build %sample_gene_hash based on maf my $maf_fh = IO::File->new( $maf_file ); while( my $line = $maf_fh->getline ) { next if( $line =~ m/^(#|Hugo_Symbol)/ ); chomp( $line ); my @cols = split( /\t/, $line ); my ( $gene, $entrez_id, $mutation_class, $tumor_sample ) = ( $cols[0], $cols[1], $cols[8], $cols[15] ); # If the mutation classification is odd, quit with error if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|Silent|In_Frame_Ins|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) { print STDERR "Unrecognized Variant_Classification $mutation_class in MAF file.\n"; print STDERR "Please use TCGA MAF Specification v2.3.\n"; return undef; } # If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region if(( $skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) || ( $skip_silent && $mutation_class =~ m/^Silent$/ )) { print STDERR "Skipping $mutation_class mutation in gene $gene.\n"; next; } # Check that the user followed instructions and named each sample correctly unless( grep( /^$tumor_sample$/, @all_sample_names )) { print STDERR "Sample $tumor_sample in MAF file does not match any in $bam_list\n"; return undef; } next if( defined $ignored_genes{$gene} ); # Ignore variants in genes that user wants ignored $id_gene_hash{$entrez_id} = $gene unless( $entrez_id eq '' or $entrez_id == 0 or $entrez_id !~ m/^\d+$/ ); push( @{$sample_gene_hash{$tumor_sample}}, $gene ) unless( grep /^$gene$/, @{$sample_gene_hash{$tumor_sample}} ); } $maf_fh->close; my $path_fh = IO::File->new( $pathway_file ); while( my $line = $path_fh->getline ) { chomp( $line ); next if( $line =~ /^(#|ID)/ ); #Skip headers my ( $path_id, $name, $class, $gene_line, $diseases, $drugs, $description ) = split( /\t/, $line ); my @genes = split( /\|/, $gene_line ); #Each gene is in the format "EntrezID:GeneSymbol" $diseases =~ s/\|/, /g; #Change the separators to commas $drugs =~ s/\|/, /g; #Change the separators to commas $path_hash{$path_id}{name} = $name unless( $name eq '' ); $path_hash{$path_id}{class} = $class unless( $class eq '' ); $path_hash{$path_id}{diseases} = $diseases unless( $diseases eq '' ); $path_hash{$path_id}{drugs} = $drugs unless( $drugs eq '' ); $path_hash{$path_id}{description} = $description unless( $description eq '' ); @{$path_hash{$path_id}{gene}} = (); foreach my $gene ( @genes ) { my ( $entrez_id, $gene_symbol ) = split( /:/, $gene ); unless( $entrez_id eq '' or $entrez_id == 0 or $entrez_id !~ m/^\d+$/ ) { # Use the gene name from the MAF file if the entrez ID matches $gene_symbol = $id_gene_hash{$entrez_id} if( defined $id_gene_hash{$entrez_id} ); } push( @{$gene_path_hash{$gene_symbol}}, $path_id ) unless( grep /^$path_id$/, @{$gene_path_hash{$gene_symbol}} ); unless( grep /^$gene_symbol$/, @{$path_hash{$path_id}{gene}} ) { push( @{$path_hash{$path_id}{gene}}, $gene_symbol ); } } } $path_fh->close; #build a sample => pathway hash foreach my $sample ( keys %sample_gene_hash ) { foreach my $gene ( @{$sample_gene_hash{$sample}} ) { if( defined $gene_path_hash{$gene} ) { foreach my $pathway ( @{$gene_path_hash{$gene}} ) { push( @{$sample_path_hash{$sample}}, $pathway ) unless( grep /^$pathway$/, @{$sample_path_hash{$sample}} ); } } } } #build path_sample_hits_hash, for population test foreach my $sample ( keys %sample_path_hash ) { foreach my $path ( @{$sample_path_hash{$sample}} ) { my $hits = 0; my @mutated_genes = (); #Mutated genes in this sample belonging to this pathway my @mutated_genes_in_sample = @{$sample_gene_hash{$sample}}; foreach my $gene ( @{$path_hash{$path}{gene}} ) { if( grep /^$gene$/, @mutated_genes_in_sample ) #if this gene is mutated in this sample (in maf) { $hits++; push( @mutated_genes, $gene ); } } if( $hits > 0 ) { $path_sample_hits_hash{$path}{$sample}{hits} = $hits; $path_sample_hits_hash{$path}{$sample}{mutated_genes} = \@mutated_genes; } } } #Calculation of p value my %data; #For printing my @pvals; foreach my $path ( sort keys %path_hash ) { my @pathway_genes = @{$path_hash{$path}{gene}}; my @gene_sizes = (); foreach my $gene ( @pathway_genes ) { if( defined $gene_cov_hash{$gene} ) { my $avg_cov = int( $gene_cov_hash{$gene} ); push( @gene_sizes, $avg_cov ) if( $avg_cov > 3 ); } } #If this pathway doesn't have any gene coverage, skip it next unless( scalar( @gene_sizes ) > 0 ); my @num_hits_per_sample; #store hits info for each patient my @mutated_samples = sort keys %{$path_sample_hits_hash{$path}}; foreach my $sample ( @all_sample_names ) { my $hits = 0; #if this sample has mutation if( grep /^$sample$/, @mutated_samples ) { $hits = $path_sample_hits_hash{$path}{$sample}{hits}; } push( @num_hits_per_sample, $hits ); } #If this pathway doesn't have any mutated genes in any samples, skip it next unless( scalar( @num_hits_per_sample ) > 0 ); my $hits_ref = \@num_hits_per_sample; ########### MCW ADDED # FIND MAX NUMBER OF HITS IN A SAMPLE my $max_hits = 0; foreach my $hits_in_sample ( @num_hits_per_sample ) { $max_hits = $hits_in_sample if( $hits_in_sample > $max_hits ); } ########### MCW ADDED my $pop_obj = Genome::Model::Tools::Music::PathScan::PopulationPathScan->new( \@gene_sizes ); if( scalar( @gene_sizes ) >= 3 ) { ########### MCW ADDED if( $max_hits > 15 ) { $pop_obj->assign( 5 ); } else { $pop_obj->assign( 3 ); } ########### MCW ADDED #$pop_obj->assign(3); } elsif( @gene_sizes == 2 ) { $pop_obj->assign( 2 ); } else { $pop_obj->assign( 1 ); } $pop_obj->preprocess( $bgd_mut_rate, $hits_ref ); #mwendl's new fix my $pval = $pop_obj->population_pval_approx($hits_ref); $data{$pval}{$path}{samples} = \@mutated_samples; $data{$pval}{$path}{hits} = $hits_ref; push( @pvals, $pval ); # For calculation of FDR } # Calculate False Discovery Rates (Benjamini-Hochberg FDR) for the p-values my $pval_cnt = scalar( @pvals ); my %fdr_hash; for( my $i = 0; $i < $pval_cnt; $i++ ) { my $fdr = $pvals[$i] * $pval_cnt / ( $pval_cnt - $i ); $fdr = 1 if $fdr > 1; $fdr_hash{$pvals[$i]} = $fdr; } # Print two output files, one more detailed than the other my $out_fh = IO::File->new( $output_file, ">" ); my $out_detailed_fh = IO::File->new( "$output_file\_detailed", ">" ); $out_fh->print( "Pathway\tName\tClass\tSamples_Affected\tTotal_Variations\tp-value\tFDR\n" ); foreach my $pval ( sort { $a <=> $b } keys %data ) { foreach my $path ( sort keys %{$data{$pval}} ) { # Skip this pathway if it has fewer affected genes than the user wants my %mutated_gene_hash; my @samples = @{$data{$pval}{$path}{samples}}; foreach my $sample ( @samples ) { foreach my $gene ( @{$path_sample_hits_hash{$path}{$sample}{mutated_genes}} ) { $mutated_gene_hash{$gene}++; } } next unless ( scalar( keys %mutated_gene_hash ) >= $min_mut_genes_per_path ); # Print detailed output to a separate output file $out_detailed_fh->print( "Pathway: $path\n" ); $out_detailed_fh->print( "Name: ", $path_hash{$path}{name}, "\n" ) if( defined $path_hash{$path}{name} ); $out_detailed_fh->print( "Class: ", $path_hash{$path}{class}, "\n" ) if( defined $path_hash{$path}{class} ); $out_detailed_fh->print( "Diseases: ", $path_hash{$path}{diseases}, "\n" ) if( defined $path_hash{$path}{diseases} ); $out_detailed_fh->print( "Drugs: ", $path_hash{$path}{drugs}, "\n" ) if( defined $path_hash{$path}{drugs} ); $out_detailed_fh->print( "P-value: $pval\n", "FDR: ", $fdr_hash{$pval}, "\n" ); $out_detailed_fh->print( "Description: ", $path_hash{$path}{description}, "\n" ); my @hits = @{$data{$pval}{$path}{hits}}; foreach my $sample ( @samples ) { my @mutated_genes = @{$path_sample_hits_hash{$path}{$sample}{mutated_genes}}; $out_detailed_fh->print( "$sample:" ); $out_detailed_fh->print( join ",", @mutated_genes ); $out_detailed_fh->print( "\n" ); } my ( $mutSampleCnt, $totalMutGenes ) = ( 0, 0 ); $out_detailed_fh->print( "Samples with mutations (#hits): " ); for( my $i = 0; $i < scalar( @all_sample_names ); ++$i ) { if( $hits[$i] > 0 ) { $out_detailed_fh->print( "$all_sample_names[$i]($hits[$i]) " ); $mutSampleCnt++; $totalMutGenes += $hits[$i]; } } $out_detailed_fh->print( "\n\n" ); # Print tabulated output to the main output file my ( $path_name, $path_class ) = ( "-", "-" ); $path_name = $path_hash{$path}{name} if( defined $path_hash{$path}{name} ); $path_class = $path_hash{$path}{class} if( defined $path_hash{$path}{class} ); $out_fh->print( "$path\t$path_name\t$path_class\t$mutSampleCnt\t$totalMutGenes\t", "$pval\t", $fdr_hash{$pval}, "\n" ); } } $out_detailed_fh->close; $out_fh->close; return 1; } # Reads files for each sample which are formatted as tab-separated lines each showing the number of # bases with sufficient coverage in a gene. sub read_CoverageFiles { my ( $covg_dir, $all_samples_ref, $gene_sample_cov_hash_ref ) = ( $_[0], $_[1], $_[2] ); # Read per-gene covered base counts for each sample foreach my $sample ( @{$all_samples_ref} ) { # If the file doesn't exist, quit with error. The Music::Bmr::CalcCovg step is incomplete unless( -s "$covg_dir/$sample.covg" ) { print STDERR "Couldn't find $sample.covg in $covg_dir. (music bmr calc-covg possibly incomplete)\n"; exit 1; } my $covgFh = IO::File->new( "$covg_dir/$sample.covg" ); while( my $line = $covgFh->getline ) { next if( $line =~ m/^#/ ); my ( $gene, undef, $covd_bases ) = split( /\t/, $line ); $gene_sample_cov_hash_ref->{$gene}{$sample} = $covd_bases; } $covgFh->close; } } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Pfam.pm000444000765000024 1061512013522176 24644 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::Pfam; use warnings; use strict; use Genome; use IO::File; use IPC::Cmd qw/can_run/; our $VERSION = $Genome::Model::Tools::Music::VERSION; class Genome::Model::Tools::Music::Pfam { is => 'Genome::Model::Tools::Music::Base', has_input => [ maf_file => { is => 'Text', is_input => 1, doc => "List of mutations using TCGA MAF specification v2.3", }, output_file => { is => 'Text', is_output => 1, doc => "Output MAF file with an extra column that reports Pfam annotation domains", }, reference_build => { is => 'Text', default => 'Build37', doc => "Options are 'Build36' or 'Build37'. This parameter ensures appropriate annotation of domains", valid_values => ['Build36', 'Build37'], }, ], doc => "Add Pfam annotation to a MAF file", }; sub help_synopsis { return <maf_file; my $reference_build = $self->reference_build; my $output_file = $self->output_file; #parse the MAF file and output a new MAF with Pfam info appended my $maf_fh = IO::File->new( $maf_file ) or die "Couldn't open MAF file!\n"; my $out_fh = IO::File->new( $output_file, ">" )or die "Couldn't open $output_file for writing!\n"; while( my $line = $maf_fh->getline ) { chomp $line; my @cols = split( /\t/, $line ); my ( $chr, $start, $stop ) = @cols[4..6]; #print out any headers directly to the output file if( $line =~ m/^(#|Hugo_Symbol)/ ) { $out_fh->print( $line ); $out_fh->print(( $line =~ m/^Hugo_Symbol/ ) ? "\tPfam_Annotation_Domains\n" : "\n" ); next; } #construct a tabix command my $db_path = Genome::Sys->dbpath( 'pfam', 'latest' ) or die "Cannot find the pfam db path."; my $tabix = can_run( 'tabix' ) or die "Cannot find the tabix command. It can be obtained from http://sourceforge.net/projects/samtools/files/tabix"; my $tabix_cmd = "$tabix"; if( $reference_build eq 'Build36' ) { $tabix_cmd .= " $db_path/pfam.annotation.build36.gz $chr:$start-$stop - |"; } elsif( $reference_build eq 'Build37' ) { $tabix_cmd .= " $db_path/pfam.annotation.build37.gz $chr:$start-$stop - |"; } else { die "Please specify either 'Build36' or 'Build37' for the --reference-build parameter."; } #run tabix command my %domains; open( TABIX, $tabix_cmd ) or die "Cannot run 'tabix'. Please check it is in your PATH. It can be installed from the samtools project. $!"; while( my $tabline = ) { chomp $tabline; my ( undef, undef, undef, $csv_domains ) = split( /\t/, $tabline ); my @domains = split( /,/, $csv_domains ); for my $domain ( @domains ) { $domains{$domain}++; } } close(TABIX); #print output to new file my $all_domains = join( ",", sort keys %domains ); my $output_line = "$line\t"; unless( $all_domains eq "" ) { $output_line .= "$all_domains\n"; } else { $output_line .= "NA\n"; } $out_fh->print( $output_line ); } return( 1 ); } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Play.pm000444000765000024 2630612013522176 24672 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::Play; use strict; use warnings; use Genome; our $VERSION = $Genome::Model::Tools::Music::VERSION; class Genome::Model::Tools::Music::Play { is => 'Command::V2', has_input => [ bam_list => { is => 'Text', doc => 'Tab delimited list of BAM files [sample_name normal_bam tumor_bam]' }, roi_file => { is => 'Text', doc => 'Tab delimited list of ROIs [chr start stop gene_name]' }, reference_sequence => { is => 'Text', doc => 'Path to reference sequence in FASTA format' }, output_dir => { is => 'Text', is_output => 1, doc => 'Directory where output files and subdirectories will be written', }, maf_file => { is => 'Text', doc => 'List of mutations using TCGA MAF specifications v2.3' }, pathway_file => { is => 'Text', doc => 'Tab-delimited file of pathway information', }, ], has_optional_input => [ numeric_clinical_data_file => { is => 'Text', doc => 'Table of samples (y) vs. numeric clinical data category (x)', }, categorical_clinical_data_file => { is => 'Text', doc => 'Table of samples (y) vs. categorical clinical data category (x)', }, numerical_data_test_method => { is => 'Text', default => 'cor', doc => "Either 'cor' for Pearson Correlation or 'wilcox' for the Wilcoxon Rank-Sum Test for numerical clinical data.", }, glm_model_file => { is => 'Text', doc => 'File outlining the type of model, response variable, covariants, etc. for the GLM analysis. (See DESCRIPTION).', }, glm_clinical_data_file => { is => 'Text', doc => 'Clinical traits, mutational profiles, other mixed clinical data (See DESCRIPTION).', }, use_maf_in_glm => { is => 'Boolean', default => 0, doc => 'Set this flag to use the variant matrix created from the MAF file as variant input to GLM analysis.', }, omimaa_dir => { is => 'Path', doc => 'omim amino acid mutation database folder', default => Genome::Sys->dbpath('omim', 'latest'), }, cosmic_dir => { is => 'Path', doc => 'cosmic amino acid mutation database folder', default => Genome::Sys->dbpath('cosmic', 'latest'), }, verbose => { is => 'Boolean', default => 1, doc => 'turn on to display larger working output', }, clinical_correlation_matrix_file => { is => 'Text', doc => 'Optionally store the sample-vs-gene matrix used internally during calculations.', }, mutation_matrix_file => { is => 'Text', doc => 'Optionally store the sample-vs-gene matrix used during calculations.', }, permutations => { is => 'Number', doc => 'Number of permutations used to determine P-values', }, normal_min_depth => { is => 'Integer', doc => 'The minimum read depth to consider a Normal BAM base as covered', }, tumor_min_depth => { is => 'Integer', doc => 'The minimum read depth to consider a Tumor BAM base as covered', }, min_mapq => { is => 'Integer', doc => 'The minimum mapping quality of reads to consider towards read depth counts', }, show_skipped => { is => 'Boolean', default => 0, doc => 'Report each skipped mutation, not just how many', }, genes_to_ignore => { is => 'Text', doc => 'Comma-delimited list of genes to ignore for background mutation rates', }, bmr => { is => 'Number', doc => 'Background mutation rate in the targeted regions', }, max_proximity => { is => 'Text', doc => 'Maximum AA distance between 2 mutations', }, bmr_modifier_file => { is => 'Text', doc => 'Tab delimited list of values per gene that modify BMR before testing [gene_name bmr_modifier]', }, skip_low_mr_genes => { is => 'Boolean', default => 1, doc => "Skip testing genes with MRs lower than the background MR" }, max_fdr => { is => 'Number', default => 0.20, doc => 'The maximum allowed false discovery rate for a gene to be considered an SMG', }, genetic_data_type => { is => 'Text', doc => 'Data in matrix file must be either "gene" or "variant" type data', }, wu_annotation_headers => { is => 'Boolean', doc => 'Use this to default to wustl annotation format headers', }, bmr_groups => { is => 'Integer', default => 1, doc => 'Number of clusters of samples with comparable BMRs', }, separate_truncations => { is => 'Boolean', default => 0, doc => 'Group truncational mutations as a separate category', }, merge_concurrent_muts => { is => 'Boolean', default => 0, doc => 'Multiple mutations of a gene in the same sample are treated as 1', }, skip_non_coding => { is => 'Boolean', default => 1, doc => 'Skip non-coding mutations from the provided MAF file', }, skip_silent => { is => 'Boolean', default => 1, doc => 'Skip silent mutations from the provided MAF file', }, min_mut_genes_per_path => { is => 'Integer', default => 1, doc => 'Pathways with fewer mutated genes than this will be ignored', }, processors => { is => 'Integer', default => 1, doc => "Number of processors to use in SMG (requires 'foreach' and 'doMC' R packages)", }, aa_range => { is => 'Integer', default => 2, doc => "Set how close a 'near' match is when searching for amino acid near hits", }, nuc_range => { is => 'Integer', default => 5, doc => "Set how close a 'near' match is when searching for nucleotide position near hits", }, reference_build => { is => 'Text', default => 'Build37', doc => 'Put either "Build36" or "Build37"', }, show_known_hits => { is => 'Boolean', default => 1, doc => "When a finding is novel, show known AA in that gene", }, ], has_calculated_optional => [ gene_covg_dir => { calculate_from => ['output_dir'], calculate => q{ $output_dir . '/gene_covgs'; }, }, gene_mr_file => { calculate_from => ['output_dir'], calculate => q{ $output_dir . '/gene_mrs'; }, }, gene_list => { is => 'Text', doc => 'List of genes to test in B(1), typically SMGs. (Uses output from running B(1).)', calculate_from => ['output_dir'], calculate => q{ $output_dir . '/smg'; }, }, input_clinical_correlation_matrix_file => { is => 'Text', is_optional => 1, doc => "Instead of calculating this from the MAF, input the sample-vs-gene matrix used internally during calculations.", }, ], has_constant => [ cmd_list_file => { #If a workflow version of this tool is written, these parameters might be more useful is => 'Text', default_value => undef, is_optional => 1, }, cmd_prefix => { is => 'Text', default_value => undef, is_optional => 1, }, ], doc => 'Run the full suite of MuSiC tools sequentially.', }; sub help_synopsis { return <(1)."; } sub _doc_authors { return " Thomas B. Mooney, M.S."; } sub _doc_see_also { return <(1), B(1), B(1), B(1), B(1), B(1), B(1), B(1) EOS } sub execute { my $self = shift; my @no_dependencies = ('Proximity', 'ClinicalCorrelation', 'CosmicOmim', 'Pfam'); my @bmr = ('Bmr::CalcCovg', 'Bmr::CalcBmr'); my @depend_on_bmr = ('PathScan', 'Smg'); my @depend_on_smg = ('MutationRelation'); for my $command_name (@no_dependencies, @bmr, @depend_on_bmr, @depend_on_smg) { my $command = $self->_create_command($command_name) or return; $self->_run_command($command) or return; } return 1; } sub _create_command { my $self = shift; my $command_name = shift; my $command_module = join('::', 'Genome::Model::Tools::Music', $command_name); my $command_meta = $command_module->__meta__; my %params; for my $property ($command_meta->_legacy_properties()) { next unless exists $property->{is_input} and $property->{is_input}; my $property_name = $property->property_name; if($property_name eq 'output_file') { $params{$property_name} = $self->output_dir . '/' . $command_module->command_name_brief; } elsif(!$property->is_optional or defined $self->$property_name) { $params{$property_name} = $self->$property_name; } } my $command = $command_module->create(%params); unless($command) { $self->error_message('Failed to create command for ' . $command_name); return; } return $command; } sub _run_command { my $self = shift; my $command = shift; my $command_name = $command->command_name; $self->status_message('Running ' . $command_name . '...'); my $rv = eval { $command->execute() }; if($@) { my $error = $@; $self->error_message('Error running ' . $command_name . ': ' . $error); return; } elsif(not $rv) { $self->error_message('Command ' . $command_name . ' did not return a true value.'); return; } else { $self->status_message('Completed ' . $command_name . '.'); return 1; } } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Plot.pm000444000765000024 212012013522176 24647 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::Plot; use warnings; use strict; use Genome; our $VERSION = $Genome::Model::Tools::Music::VERSION; class Genome::Model::Tools::Music::Plot { is => ['Command::Tree'], doc => "Generate relevant plots and visualizations for MuSiC." }; sub _doc_copyright_years { (2010,2012); } sub _doc_license { my $self = shift; my (@y) = $self->_doc_copyright_years; return <(1), B(1) EOS } sub _doc_manual_body { return shift->help_detail; } sub help_detail { return "These tools are part of the MuSiC suite.\n"; } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Proximity.pm000444000765000024 3150012013522176 25761 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::Proximity; use warnings; use strict; use IO::File; our $VERSION = $Genome::Model::Tools::Music::VERSION; class Genome::Model::Tools::Music::Proximity { is => 'Command::V2', has_input => [ maf_file => { is => 'Text', doc => "List of mutations using TCGA MAF specifications v2.3" }, output_dir => { is => 'Text', doc => "Directory where output files will be written" }, max_proximity => { is => 'Text', doc => "Maximum allowed AA distance between 2 mutations", is_optional => 1, default => 7 }, skip_non_coding => { is => 'Boolean', doc => "Skip non-coding mutations from the provided MAF file", is_optional => 1, default => 1 }, skip_silent => { is => 'Boolean', doc => "Skip silent mutations from the provided MAF file", is_optional => 1, default => 1 }, ], has_output => [ output_file => {is => 'Text', doc => "TODO"}, ], doc => "Perform a proximity analysis on a list of mutations." }; sub help_detail { return <maf_file; my $output_dir = $self->output_dir; my $max_proximity = $self->max_proximity; my $skip_non_coding = $self->skip_non_coding; my $skip_silent = $self->skip_silent; $output_dir =~ s/(\/)+$//; # Remove trailing forward slashes if any # Check on all the input data before starting work print STDERR "MAF file not found or is empty: $maf_file\n" unless( -s $maf_file ); print STDERR "Output directory not found: $output_dir\n" unless( -e $output_dir ); return undef unless( -s $maf_file && -e $output_dir ); # Output of this script will be written to this location in the output directory my $out_file = "$output_dir/proximity_report"; $self->output_file($out_file); # Parse the header row in the MAF file my $maf_fh = IO::File->new( $maf_file ) or die "Couldn't open $maf_file. $!"; my $maf_header = $maf_fh->getline; $maf_header = $maf_fh->getline while( $maf_header =~ /^#/ ); # Skip commented lines chomp( $maf_header ); unless( $maf_header =~ /^Hugo_Symbol/ ) { print STDERR "Could not find column headers in $maf_file\n"; return undef; } # Check whether the required additional MAF columns were included in the MAF unless( $maf_header =~ m/c_position/ and $maf_header =~ m/amino_acid_change/ and $maf_header =~ m/transcript_name/ ) { print STDERR "Could not find required additional columns in $maf_file\n"; return undef; } # Find the indexes of all the MAF columns my $idx = 0; my %col_idx = map {($_, $idx++)} split( /\t/, $maf_header ); # A hash to store statuses, and a hash to store variants and AA positions my %status; my $status = \%status; my %aa_mutations; # Load relevant data from MAF into hash while( my $line = $maf_fh->getline ) { chomp $line; my @cols = split( /\t/, $line ); # Fetch data from the generic MAF columns my ( $gene, $chr, $start, $stop, $mutation_class, $mutation_type, $ref_allele, $var_allele, $var2, $sample ) = ( $cols[0], $cols[4], $cols[5], $cols[6], $cols[8], $cols[9], $cols[10], $cols[11], $cols[12], $cols[15] ); $var_allele = $var2 if ( $var_allele eq $ref_allele ); # Different centers interpret the 2 variant columns differently # Fetch data from the required additional MAF columns my ( $c_position, $aa_change, $transcript ) = ( $cols[$col_idx{c_position}], $cols[$col_idx{amino_acid_change}], $cols[$col_idx{transcript_name}] ); # Create a key to uniquely identify each variant my $variant_key = join( "\t", $gene, $chr, $start, $stop, $ref_allele, $var_allele, $sample ); #check that the mutation class is acceptable if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) { print STDERR "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\n"; print STDERR "Please use TCGA MAF Specification v2.3.\n"; return undef; } # If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region if(( $skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) || ( $skip_silent && $mutation_class =~ m/^Silent$/ )) { print "Skipping $mutation_class mutation in gene $gene.\n"; $status{synonymous_mutations_skipped}++; next; } # Determine amino acid position and load into hash my @mutated_aa_positions = (); @mutated_aa_positions = $self->get_amino_acid_pos( $variant_key, $mutation_class, $c_position, $aa_change, $status ); # Record data in hash if mutated aa position found if( scalar( @mutated_aa_positions ) > 0 ) { push( @{$aa_mutations{$transcript}{$variant_key}{mut_AAs}}, @mutated_aa_positions ); } } $maf_fh->close; # Evaluate proximity of mutated amino acids for each transcript for my $transcript ( keys %aa_mutations ) { # For each variant hitting that transcript for my $variant ( keys %{$aa_mutations{$transcript}} ) { # Initialize the search my @affected_amino_acids = @{$aa_mutations{$transcript}{$variant}{mut_AAs}}; my $mutations_within_proximity = 0; # Variable for summing # of mutations within proximity my $min_proximity = $max_proximity + 1; # Current minimum proximity # For each OTHER variant hitting the transcript for my $other_variant ( keys %{$aa_mutations{$transcript}} ) { # Ignore the current mutation next if $variant eq $other_variant; # Get affected amino acids from OTHER variant my @other_affected_amino_acids = @{$aa_mutations{$transcript}{$other_variant}{mut_AAs}}; # Compare distances between amino acids my $found_close_one = 0; for my $other_variant_aa ( @other_affected_amino_acids ) { for my $variant_aa ( @affected_amino_acids ) { my $distance = abs($other_variant_aa - $variant_aa); # If distance is within range if( $distance <= $max_proximity ) { $found_close_one++; $min_proximity = $distance if $distance < $min_proximity; } } } # Note that this variant is within proximity if applicable $mutations_within_proximity++ if $found_close_one; } # Now, save results in hash if there are any if ($mutations_within_proximity) { $aa_mutations{$transcript}{$variant}{muts_within_range} = $mutations_within_proximity; $aa_mutations{$transcript}{$variant}{min_proximity} = $min_proximity; } } } # Print results my $out_fh = IO::File->new( $out_file, ">" ) or die "Couldn't open $out_file. $!"; $out_fh->print( "Mutations_Within_Proximity\tNearest_Mutation\tGene\tTranscript\tAffected_Amino_Acid(s)\tChr\tStart\tStop\tRef_Allele\tVar_Allele\tSample\n" ); for my $transcript ( keys %aa_mutations ) { for my $variant ( keys %{$aa_mutations{$transcript}} ) { if( exists $aa_mutations{$transcript}{$variant}{muts_within_range} ) { my ( $gene, $chr, $start, $stop, $ref_allele, $var_allele, $sample ) = split( /\t/, $variant ); my $affected_amino_acids = join( ",", sort @{$aa_mutations{$transcript}{$variant}{mut_AAs}} ); my $line = join( "\t", $aa_mutations{$transcript}{$variant}{muts_within_range}, $aa_mutations{$transcript}{$variant}{min_proximity}, $gene, $transcript, $affected_amino_acids, $chr, $start, $stop, $ref_allele, $var_allele, $sample ); $out_fh->print( "$line\n" ); } } } $out_fh->close(); return 1; } ################################################################################ =head2 get_amino_acid_pos This subroutine deducts the amino acid position within the transcript using the c_position and amino_acid_position columns in the MAF. =cut ################################################################################ sub get_amino_acid_pos { # Parse arguments my $self = shift; my ( $variant_key, $mut_class, $c_position, $aa_change, $status ) = @_; # Initialize variables my $tx_start = my $tx_stop = 0; my $aa_position_start = my $aa_position_stop = 0; my $inferred_aa_start = my $inferred_aa_stop = 0; my $aa_pos = my $inferred_aa_pos = 0; # Amino acid position determination if( $aa_change && $aa_change ne "NULL" && substr( $aa_change, 0, 1 ) ne "e" ) { $aa_pos = $aa_change; $aa_pos =~ s/[^0-9]//g; } # Parse out c_position if applicable ## if( $c_position && $c_position ne "NULL" ) { # If multiple results, parse both ## if( $c_position =~ '_' && !( $mut_class =~ 'splice' )) { ($tx_start, $tx_stop) = split( /\_/, $c_position ); $tx_start =~ s/[^0-9]//g; $tx_stop =~ s/[^0-9]//g; if( $tx_stop < $tx_start ) { $inferred_aa_start = $tx_stop / 3; $inferred_aa_start = sprintf( "%d", $inferred_aa_start ) + 1 if( $tx_stop % 3 ) ; $inferred_aa_stop = $tx_start / 3; $inferred_aa_stop = sprintf( "%d", $inferred_aa_stop ) + 1 if( $tx_start % 3 ); } else { $inferred_aa_start = $tx_start / 3; $inferred_aa_start = sprintf( "%d", $inferred_aa_start ) + 1 if( $tx_start % 3 ); $inferred_aa_stop = $tx_stop / 3; $inferred_aa_stop = sprintf( "%d", $inferred_aa_stop ) + 1 if( $tx_stop % 3 ); } } else { my ( $tx_pos ) = split( /[\+\-\_]/, $c_position ); $tx_pos =~ s/[^0-9]//g; $tx_start = $tx_stop = $tx_pos; if($tx_pos) { $inferred_aa_pos = $tx_pos / 3; $inferred_aa_pos = sprintf( "%d", $inferred_aa_pos ) + 1 if( $tx_pos % 3 ); $inferred_aa_start = $inferred_aa_stop = $inferred_aa_pos; } } } # If we inferred aa start stop, proceed with it ## if( $inferred_aa_start && $inferred_aa_stop ) { $aa_position_start = $inferred_aa_start; $aa_position_stop = $inferred_aa_stop; $status->{aa_position_inferred}++; } # Otherwise if we inferred aa position ## elsif( $aa_pos ) { $aa_position_start = $aa_pos; $aa_position_stop = $aa_pos; $status->{c_position_not_available}++; } # Otherwise we were unable to infer the info ## else { $status->{aa_position_not_found}++; $self->status_message( "Amino acid position not found for variant: $variant_key" ); return; } # Proceed if we have aa_position_start and stop ## my %mutated_aa_positions; if( $aa_position_start && $aa_position_stop ) { for( my $this_aa_pos = $aa_position_start; $this_aa_pos <= $aa_position_stop; $this_aa_pos++ ) { $mutated_aa_positions{$this_aa_pos}++; } } my @mutated_aa_positions = keys %mutated_aa_positions; return @mutated_aa_positions; } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Smg.pm000444000765000024 2422312013522176 24507 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::Smg; use warnings; use strict; use Genome; use IO::File; use Carp; use POSIX qw( WIFEXITED ); our $VERSION = $Genome::Model::Tools::Music::VERSION; class Genome::Model::Tools::Music::Smg { is => 'Command::V2', has_input => [ gene_mr_file => { is => 'Text', doc => "File with per-gene mutation rates (Created using \"music bmr calc-bmr\")" }, output_file => { is => 'Text', is_output => 1, doc => "Output file that will list significantly mutated genes and their p-values" }, ], has_optional_input => [ max_fdr => { is => 'Number', default => 0.20, doc => "The maximum allowed false discovery rate for a gene to be considered an SMG" }, skip_low_mr_genes => { is => 'Boolean', default => 1, doc => "Skip testing genes with MRs lower than the background MR" }, bmr_modifier_file => { is => 'Text', doc => "Tab delimited multipliers per gene that modify BMR before testing [gene_name bmr_modifier]" }, processors => { is => 'Integer', default => 1, doc => "Number of processors to use (requires 'foreach' and 'doMC' R packages)" }, ], doc => "Identify significantly mutated genes." }; sub help_synopsis { return <gene_mr_file; my $output_file = $self->output_file; my $output_file_detailed = $output_file . "_detailed"; my $max_fdr = $self->max_fdr; my $skip_low_mr_genes = $self->skip_low_mr_genes; my $bmr_modifier_file = $self->bmr_modifier_file; my $processors = $self->processors; # Check on all the input data before starting work print STDERR "Gene mutation rate file not found or is empty: $gene_mr_file\n" unless( -s $gene_mr_file ); print STDERR "BMR modifier file not found or is empty: $bmr_modifier_file\n" unless( !defined $bmr_modifier_file || -s $bmr_modifier_file ); return undef unless( -s $gene_mr_file && ( !defined $bmr_modifier_file || -s $bmr_modifier_file )); # If BMR modifiers were provided, then load them, and create another gene_mr_file with modified BMRs if( defined $bmr_modifier_file ) { my $inBmrModFh = IO::File->new( $bmr_modifier_file ) or die "Couldn't open $bmr_modifier_file. $!\n"; my %bmr_modifier = (); while( my $line = $inBmrModFh->getline ) { next if( $line =~ m/^#/ ); chomp( $line ); my ( $gene, $modifier ) = split( /\t/, $line ); ( $modifier > 0 ) or die "$modifier is an invalid bmr-modifier. Please fix values in $bmr_modifier_file.\n"; $bmr_modifier{$gene} = $modifier; } $inBmrModFh->close; my $new_gene_mr_file = Genome::Sys->create_temp_file_path; ( $new_gene_mr_file ) or die "Couldn't create a temp file. $!"; my $inMrFh = IO::File->new( $gene_mr_file ) or die "Couldn't open $gene_mr_file. $!\n"; my $outMrFh = IO::File->new( $new_gene_mr_file, ">" ) or die "Couldn't open $new_gene_mr_file. $!\n"; while( my $line = $inMrFh->getline ) { if( $line =~ m/^#/ ) { $outMrFh->print( $line ); next; } chomp( $line ); my ( $gene, $type, $covd_bps, $mut_cnt, $bmr ) = split( /\t/, $line ); $bmr = $bmr * $bmr_modifier{$gene} if( defined $bmr_modifier{$gene} ); $outMrFh->print( "$gene\t$type\t$covd_bps\t$mut_cnt\t$bmr\n" ); } $outMrFh->close; $inMrFh->close; $gene_mr_file = $new_gene_mr_file; } # Collect per-gene mutation rates for reporting in results later my ( %gene_muts, %gene_bps, %mut_classes_hash ); my $inMrFh = IO::File->new( $gene_mr_file ) or die "Couldn't open $gene_mr_file. $!\n"; while( my $line = $inMrFh->getline ) { next if( $line =~ m/^#/ ); my ( $gene, $type, $covd_bps, $mut_cnt, undef ) = split( /\t/, $line ); # Warn user about cases where there could be fewer covered bps than mutations detected ( $mut_cnt <= $covd_bps ) or warn "More $type seen in $gene than there are bps with sufficient coverage!\n"; if( $type eq "Overall" or $type eq "Indels" or $type eq "Truncations" ) { $gene_muts{$gene}{$type} = $mut_cnt; $gene_bps{$gene} = $covd_bps; $mut_classes_hash{$type} = 1 unless( $type eq "Overall" ); } elsif( $type =~ m/(Transitions|Transversions)$/ ) { $gene_muts{$gene}{SNVs} += $mut_cnt; $mut_classes_hash{SNVs} = 1; } else { die "Unrecognized mutation class in gene-mr-file. $!\n"; } } $inMrFh->close; my @mut_classes = sort keys %mut_classes_hash; # Create a temporary intermediate file to hold the p-values my $pval_file = Genome::Sys->create_temp_file_path; ( $pval_file ) or die "Couldn't create a temp file. $!"; # Call R for Fisher combined test, Likelihood ratio test, and convolution test on each gene my $smg_cmd = "R --slave --args < " . __FILE__ . ".R $gene_mr_file $pval_file smg_test $processors $skip_low_mr_genes"; WIFEXITED( system $smg_cmd ) or croak "Couldn't run: $smg_cmd ($?)"; # Call R for calculating FDR on the p-values calculated in the SMG test my $fdr_cmd = "R --slave --args < " . __FILE__ . ".R $pval_file $output_file_detailed calc_fdr $processors $skip_low_mr_genes"; WIFEXITED( system $fdr_cmd ) or croak "Couldn't run: $fdr_cmd ($?)"; # Parse the R output to identify the SMGs (significant by at least 2 of 3 tests) my $smgFh = IO::File->new( $output_file_detailed ) or die "Couldn't open $output_file_detailed. $!\n"; my ( @newLines, @smgLines ); my $header = "#Gene\t" . join( "\t", @mut_classes ); $header .= "\tTot Muts\tCovd Bps\tMuts pMbp\tP-value FCPT\tP-value LRT\tP-value CT\tFDR FCPT\tFDR LRT\tFDR CT\n"; while( my $line = $smgFh->getline ) { chomp( $line ); if( $line =~ m/^Gene\tp.fisher\tp.lr\tp.convol\tfdr.fisher\tfdr.lr\tfdr.convol$/ ) { push( @newLines, $header ); push( @smgLines, $header ); } else { my ( $gene, @pq_vals ) = split( /\t/, $line ); my ( $p_fcpt, $p_lrt, $p_ct, $q_fcpt, $q_lrt, $q_ct ) = @pq_vals; my @mut_cnts; foreach( @mut_classes ) { # If a mutation count is a fraction, round down the digits after the decimal point push( @mut_cnts, (( $gene_muts{$gene}{$_} =~ m/\./ ) ? sprintf( "%.2f", $gene_muts{$gene}{$_} ) : $gene_muts{$gene}{$_} )); } my $mut_per_mbp = ( $gene_bps{$gene} ? sprintf( "%.2f", ( $gene_muts{$gene}{Overall} / $gene_bps{$gene} * 1000000 )) : 0 ); push( @newLines, join( "\t", $gene, @mut_cnts, $gene_muts{$gene}{Overall}, $gene_bps{$gene}, $mut_per_mbp, @pq_vals ) . "\n" ); # If the FDR of at least two of these tests is less than the maximum allowed, we consider it an SMG if(( $q_fcpt <= $max_fdr && $q_lrt <= $max_fdr ) || ( $q_fcpt <= $max_fdr && $q_ct <= $max_fdr ) || ( $q_lrt <= $max_fdr && $q_ct <= $max_fdr )) { push( @smgLines, join( "\t", $gene, @mut_cnts, $gene_muts{$gene}{Overall}, $gene_bps{$gene}, $mut_per_mbp, @pq_vals ) . "\n" ); } } } $smgFh->close; # Add per-gene SNV and Indel counts to the detailed R output, and make the header friendlier my $outDetFh = IO::File->new( $output_file_detailed, ">" ) or die "Couldn't open $output_file_detailed. $!\n"; $outDetFh->print( @newLines ); $outDetFh->close; # Do the same for only the genes that we consider SMGs my $outFh = IO::File->new( $output_file, ">" ) or die "Couldn't open $output_file. $!\n"; $outFh->print( @smgLines ); $outFh->close; return 1; } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Smg.pm.R000444000765000024 1561412013522176 24713 0ustar00nnutterstaff000000000000############################################# ### Functions for testing significance of ### ### per-gene categorized mutation rates ### ############################################# # Fetch command line arguments args = commandArgs(); input_file = as.character(args[4]); output_file = as.character(args[5]); run_type = as.character(args[6]); processors = as.numeric(args[7]); skip_low_mr_genes = as.numeric(args[8]); # See if we have the necessary packages installed to run in parallel is.installed <- function( mypkg ) is.element( mypkg, installed.packages()[,1] ); parallel = FALSE; if( processors > 1 & is.installed( 'doMC' ) & is.installed( 'foreach' )) { parallel = TRUE; } gethist <- function( xmax, n, p, ptype = "positive_log" ) { dbinom( 0:xmax, n, p ) -> ps; ps = ps[ps > 0]; lastp = 1 - sum( ps ); if( lastp > 0 ) ps = c( ps, lastp ); if( ptype == "positive_log" ) ps = -log( ps ); return( ps ); } binit <- function( x, hmax, bin, dropbin = T ) { bs = as.integer( x / bin ); bs[bs > hmax/bin] = hmax / bin; bs[is.na( bs )] = hmax / bin; tapply( exp(-x), as.factor( bs ), sum ) -> bs; bs = bs[bs>0]; bs = -log( bs ); if( dropbin ) bs = as.numeric( bs ); return( bs ); } convolute_b <- function( a, b ) { tt = NULL; for( j in b ) { tt = c( tt, ( a + j )); } return( tt ); } mut_class_test <- function( x, xmax = 100, hmax = 25, bin = 0.001 ) { x = as.data.frame( x ); colnames( x ) = c( "Class", "n", "x", "e" ); x$p = NA; x$lh0 = NA; x$lh1 = NA; tot_muts = x[( x$Class == "Overall" ),]$x; tot_bps = x[( x$Class == "Overall" ),]$n; overall_bmr = x[( x$Class == "Overall" ),]$e; # Remove the row containing overall MR and BMR because we don't want it to be a tested category x = x[( x$Class != "Overall" ),]; # If user wants to skip testing genes with low MRs, measure the relevant MRs of this gene gene_mr = 0; indel_mr = 0; indel_bmr = 0; trunc_mr = 0; trunc_bmr = 0; if( skip_low_mr_genes == 1 ) { if( tot_bps > 0 ) { gene_mr = tot_muts / tot_bps; } if( x[( x$Class == "Indels" ),]$n > 0 ) { indel_mr = x[( x$Class == "Indels" ),]$x / x[( x$Class == "Indels" ),]$n; } indel_bmr = x[( x$Class == "Indels" ),]$e; if( nrow( x[( x$Class == "Truncations" ),] ) > 0 ) { if( x[( x$Class == "Truncations" ),]$n > 0 ) { trunc_mr = x[( x$Class == "Truncations" ),]$x / x[( x$Class == "Truncations" ),]$n; } trunc_bmr = x[( x$Class == "Truncations" ),]$e; } } # Set pvals of 1 for genes with zero mutations, zero covered bps, or zero overall BMR if( tot_muts <= 0 | tot_bps <= 0 | overall_bmr <= 0 ) { p.fisher = 1; p.lr = 1; p.convol = 1; qc = 1; } # If user wants to skip testing genes with low MRs, give them pvals of 1 else if( skip_low_mr_genes == 1 & gene_mr < overall_bmr & indel_mr <= indel_bmr & trunc_mr <= trunc_bmr ) { p.fisher = 1; p.lr = 1; p.convol = 1; qc = 1; } else { # Skip testing mutation categories that have zero BMR, or if this gene has #muts >= #covd bps x = x[( x$n > 0 & x$n > x$x & x$e > 0 ),]; rounded_mut_cnts = round(x$x); for( i in 1:nrow(x) ) { x$p[i] = binom.test( rounded_mut_cnts[i], x$n[i], x$e[i], alternative = "greater" )$p.value; x$lh0[i] = dbinom( rounded_mut_cnts[i], x$n[i], x$e[i], log = T ); x$lh1[i] = dbinom( rounded_mut_cnts[i], x$n[i], x$x[i] / x$n[i], log = T ); ni = x$n[i]; ei = x$e[i]; gethist( xmax, ni, ei, ptype = "positive_log" ) -> bi; binit( bi, hmax, bin ) -> bi; if( i == 1 ) { hist0 = bi; } if( i > 1 & i < nrow(x) ) { hist0 = convolute_b( hist0, bi ); binit( hist0, hmax, bin ) -> hist0; } if( i == nrow(x)) { hist0 = convolute_b( hist0, bi ); } } # Fisher combined p-value q = ( -2 ) * sum( log( x$p )); df = 2 * length( x$p ); p.fisher = 1 - pchisq( q, df ); # Likelihood ratio test q = 2 * ( sum( x$lh1 ) - sum( x$lh0 )); df = sum( x$lh1 != 0 ); if( df > 0 ) p.lr = 1 - pchisq( q, df ); if( df == 0 ) p.lr = 1; # Convolution test bx = -sum( x[,"lh0"] ); p.convol = sum( exp( -hist0[hist0>=bx] )); qc = sum( exp( -hist0 )); } # Return results rst = list( x = cbind( x, tot_muts, p.fisher, p.lr, p.convol, qc )); return( rst ); } dotest <- function( idx, mut, zgenes ) { step = round( length( zgenes ) / processors ); start = step * ( idx - 1 ) + 1; stop = step * idx; if( idx == processors ) { stop = length( zgenes ); } tt = NULL; for( Gene in zgenes[start:stop] ) { mutgi = mut[mut$Gene==Gene,]; mut_class_test( mutgi[,2:5], hmax = 25, bin = 0.001 ) -> z; tt = rbind( tt, cbind( Gene, unique( z$x[,(9:11)] ))); } return( tt ); } combineresults <- function( a, b ) { return( rbind( a, b )); } smg_test <- function( gene_mr_file, pval_file ) { read.delim( gene_mr_file ) -> mut; colnames( mut ) = c( "Gene", "Class", "Bases", "Mutations", "BMR" ); mut$BMR = as.numeric( as.character( mut$BMR )); tt = NULL; # Run in parallel if we have the needed packages, or fall back to the old way if( parallel ) { library( 'doMC' ); library( 'foreach' ); registerDoMC(); cat( "Parallel backend installed - splitting across", processors, "cores\n" ); options( cores = processors ); mcoptions <- list( preschedule = TRUE ); zgenes = unique( as.character( mut$Gene )); tt = foreach( idx = 1:processors, .combine="combineresults", .options.multicore = mcoptions ) %dopar% { dotest( idx, mut, zgenes ); } write.table( tt, file = pval_file, quote = FALSE, row.names = F, sep = "\t" ); } else { for( Gene in unique( as.character( mut$Gene ))) { mutgi = mut[mut$Gene==Gene,]; mut_class_test( mutgi[,2:5], hmax = 25, bin = 0.001 ) -> z; tt = rbind( tt, cbind( Gene, unique( z$x[,(9:11)] ))); } write.table( tt, file = pval_file, quote = FALSE, row.names = F, sep = "\t" ); } } smg_fdr <- function( pval_file, fdr_file ) { read.table( pval_file, header = T, sep = "\t" ) -> x; #Calculate FDR measure and write FDR output p.adjust( x[,2], method="BH" ) -> fdr.fisher; p.adjust( x[,3], method="BH" ) -> fdr.lr; p.adjust( x[,4], method="BH" ) -> fdr.convol; x = cbind( x, fdr.fisher, fdr.lr, fdr.convol ); #Rank SMGs starting with lowest convolution test FDR, and then by Likelihood Ratio FDR x = x[order( fdr.convol, fdr.lr ),]; write.table( x, file = fdr_file, quote = FALSE, row.names = F, sep = "\t" ); } # Figure out which function needs to be invoked and call it if( run_type == "smg_test" ) { smg_test( input_file, output_file ); } if( run_type == "calc_fdr" ) { smg_fdr( input_file, output_file ); } Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Survival.pm000444000765000024 4544412013522176 25604 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::Survival; use warnings; use strict; use Carp; use Genome; use IO::File; use POSIX qw( WIFEXITED ); our $VERSION = $Genome::Model::Tools::Music::VERSION; class Genome::Model::Tools::Music::Survival { is => 'Genome::Model::Tools::Music::Base', has_input => [ bam_list => { is => 'Text', doc => "List of sample names to be included in the analysis. (See Description)", }, maf_file => { is => 'Text', is_optional => 1, doc => "List of mutations in MAF format", }, output_dir => { is => 'Text', is_output => 1, doc => "Directory where output files will be written", }, genetic_data_type => { is => 'Text', is_optional => 1, default => "gene", doc => "Correlate clinical data to \"gene\" or \"variant\" level data", }, numeric_clinical_data_file => { is => 'Text', is_optional => 1, doc => "Table of samples (y) vs. numeric clinical data category (x)", }, categorical_clinical_data_file => { is => 'Text', is_optional => 1, doc => "Table of samples (y) vs. categorical clinical data category (x)", }, glm_clinical_data_file => { is => 'Text', is_optional => 1, doc => "Clinical traits, mutational profiles, other mixed clinical data (See DESCRIPTION).", }, phenotypes_to_include => { is => 'Text', is_optional => 1, doc => "Include only these genes and/or phenotypes in the anlaysis. (COMMA-DELIMITED)", }, legend_placement => { is => 'Text', is_optional => 1, default => 'bottomleft', doc => "Choose one of 'bottomleft', 'topleft', 'topright', or 'bottomright'.", }, skip_non_coding => { is => 'Boolean', is_optional => 1, default => 1, doc => "Skip non-coding mutations from the provided MAF file", }, skip_silent => { is => 'Boolean', is_optional => 1, default => 1, doc => "Skip silent mutations from the provided MAF file", }, ], doc => "Create survival plots and P-values for clinical and mutational phenotypes.", }; sub help_synopsis { return <bam_list; my $genetic_data_type = $self->genetic_data_type; my $legend_placement = $self->legend_placement; # handle phenotype inclusions my @phenotypes_to_include; my @clinical_phenotypes_to_include; my @mutated_genes_to_include; if ($self->phenotypes_to_include) { @phenotypes_to_include = split /,/,$self->phenotypes_to_include; } # check genetic data type unless ($genetic_data_type =~ /^gene|variant$/i) { $self->error_message("Please enter either \"gene\" or \"variant\" for the --genetic-data-type parameter."); return; } # load clinical data and analysis types my %clinical_data; if ($self->numeric_clinical_data_file) { $clinical_data{'numeric'} = $self->numeric_clinical_data_file; } if ($self->categorical_clinical_data_file) { $clinical_data{'categ'} = $self->categorical_clinical_data_file; } if ($self->glm_clinical_data_file) { $clinical_data{'glm'} = $self->glm_clinical_data_file; } # create array of all sample names possibly included from clinical data and MAF my @all_sample_names; # names of all the samples, no matter if they are mutated or not my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!\n"; while( my $line = $sampleFh->getline ) { next if ( $line =~ m/^#/ ); chomp( $line ); my ( $sample ) = split( /\t/, $line ); push( @all_sample_names, $sample ); } $sampleFh->close; # loop through clinical data files and assemble survival data hash (vital_status and days_to_last_followup required); my %survival_data; my $vital_status_flag = 0; my $days_to_last_follow_flag = 0; for my $clin_file (keys %clinical_data) { #check filehandle my $clin_fh = new IO::File $clinical_data{$clin_file},"r"; unless ($clin_fh) { $self->error_message("Failed to open $clinical_data{$clin_file} for reading: $!"); return; } #initiate variables to hold column info my %phenotypes_to_print; my $vital_status_col = 0; my $days_to_last_follow_col = 0; #parse header and record column locations for needed data my $header = $clin_fh->getline; my @header_fields = split /\t/,$header; for (my $i = 1; $i <= $#header_fields; $i++) { #sample ID should be in first column of file my $field = $header_fields[$i]; if ($field =~ /vital_status|vitalstatus/i) { $vital_status_col = $i; $vital_status_flag++; } if ($field =~ /days_to_last_(follow_up|followup)|daystolastfollowup/i) { $days_to_last_follow_col = $i; $days_to_last_follow_flag++; } if (scalar grep { /^$field$/i } @phenotypes_to_include) { $phenotypes_to_print{$field} = $i; } } #read through clinical data file and store needed data in a hash while (my $line = $clin_fh->getline) { chomp $line; my @fields = split /\t/,$line; my $sample = $fields[0]; unless (scalar grep { m/^$sample$/ } @all_sample_names) { $self->status_message("Skipping sample $sample. (Sample is not in --bam-list)."); next; } if ($vital_status_col) { my $vital_status; if ($fields[$vital_status_col] =~ /^(0|living)$/i) { $vital_status = 0; } elsif ($fields[$vital_status_col] =~ /^(1|deceased)$/i) { $vital_status = 1; } else { $vital_status = "NA"; } $survival_data{$sample}{'vital_status'} = $vital_status; } if ($days_to_last_follow_col) { $survival_data{$sample}{'days'} = $fields[$days_to_last_follow_col]; } for my $pheno (keys %phenotypes_to_print) { $survival_data{$sample}{$pheno} = $fields[$phenotypes_to_print{$pheno}]; } } $clin_fh->close; # record phenotypes included from clinical data push @clinical_phenotypes_to_include, keys %phenotypes_to_print; } # check for necessary header fields unless ($vital_status_flag) { $self->error_message('Clinical data does not seem to contain a column labeled "vital_status".'); return; } unless ($days_to_last_follow_flag) { $self->error_message('Clnical data does not seem to contain a column labeled "days_to_last_followup".'); return; } # create temporary files for R command my $survival_data_file = Genome::Sys->create_temp_file_path(); my $mutation_matrix = Genome::Sys->create_temp_file_path(); # print survival data (temp file) my $surv_fh = new IO::File $survival_data_file,"w" or die "Couldn't open survival data filehandle."; print $surv_fh join("\t","Sample","Days_To_Last_Followup","Vital_Status"); if (@clinical_phenotypes_to_include) { print $surv_fh "\t" . join("\t",@clinical_phenotypes_to_include); } print $surv_fh "\n"; for my $sample (keys %survival_data) { unless (exists $survival_data{$sample}{'days'}) { $survival_data{$sample}{'days'} = "NA"; } unless (exists $survival_data{$sample}{'vital_status'}) { $survival_data{$sample}{'vital_status'} = "NA"; } print $surv_fh join("\t",$sample,$survival_data{$sample}{'days'},$survival_data{$sample}{'vital_status'}); for my $pheno (@clinical_phenotypes_to_include) { unless (exists $survival_data{$sample}{$pheno}) { $survival_data{$sample}{$pheno} = "NA"; } print $surv_fh "\t" . $survival_data{$sample}{$pheno}; } print $surv_fh "\n"; } $surv_fh->close; # find if any of the "phenotypes_to_include" are genes, and if so, limit the MAF mutation matrix to those genes my %clinical_pheno_to_include; @clinical_pheno_to_include{@clinical_phenotypes_to_include} = (); for my $item (@phenotypes_to_include) { push @mutated_genes_to_include,$item unless exists $clinical_pheno_to_include{$item}; } my $mutated_genes_to_include = \@mutated_genes_to_include; # create mutation matrix file if( $genetic_data_type =~ /^gene$/i ) { $self->create_sample_gene_matrix_gene( $mutation_matrix, $mutated_genes_to_include, @all_sample_names ); } elsif( $genetic_data_type =~ /^variant$/i ) { $self->create_sample_gene_matrix_variant( $mutation_matrix, $mutated_genes_to_include, @all_sample_names ); } else { $self->error_message( "Please enter either \"gene\" or \"variant\" for the --genetic-data-type parameter." ); return; } # check and prepare output directory my $output_dir = $self->output_dir . "/"; unless (-e $output_dir) { $self->status_message("Creating output directory: $output_dir..."); unless(mkdir $output_dir) { $self->error_message("Failed to create output directory: $!"); return; } } # set up R command my $R_cmd = "R --slave --args < " . __FILE__ . ".R " . join( " ", $survival_data_file, $mutation_matrix, $legend_placement, $output_dir ); print "R_cmd:\n$R_cmd\n"; #run R command WIFEXITED( system $R_cmd ) or croak "Couldn't run: $R_cmd ($?)"; return(1); } sub create_sample_gene_matrix_gene { my ( $self, $mutation_matrix, $mutated_genes_to_include, @all_sample_names ) = @_; #create hash of mutations from the MAF file my ( %mutations, %all_genes ); #parse the MAF file and fill up the mutation status hashes my $maf_fh = IO::File->new( $self->maf_file ) or die "Couldn't open MAF file!\n"; while( my $line = $maf_fh->getline ) { next if( $line =~ m/^(#|Hugo_Symbol)/ ); chomp $line; my @cols = split( /\t/, $line ); my ( $gene, $mutation_class, $sample ) = @cols[0,8,15]; #check that the mutation class is valid if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) { $self->error_message( "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\nPlease use TCGA MAF v2.3.\n" ); return; } #check to see if this gene is on the list (if there is a list at all) if( defined @{$mutated_genes_to_include} ) { next unless( scalar grep { m/^$gene$/ } @{$mutated_genes_to_include} ); } # If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region if(( $self->skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) || ( $self->skip_silent && $mutation_class =~ m/^Silent$/ )) { print "Skipping $mutation_class mutation in gene $gene.\n"; next; } $all_genes{$gene}++; $mutations{$sample}{$gene}++; } $maf_fh->close; #sort @all_genes for consistency in header and loops my @all_genes = sort keys %all_genes; #write the input matrix for R code to a temp file my $matrix_fh = new IO::File $mutation_matrix,"w" or die "Failed to create matrix file $mutation_matrix!: $!"; #print input matrix file header my $header = join( "\t", "Sample", @all_genes ); $matrix_fh->print( "$header\n" ); #print mutation relation input matrix for my $sample ( sort @all_sample_names ) { $matrix_fh->print( $sample ); for my $gene ( @all_genes ) { if( exists $mutations{$sample}{$gene} ) { $matrix_fh->print( "\t$mutations{$sample}{$gene}" ); } else { $matrix_fh->print( "\t0" ); } } $matrix_fh->print( "\n" ); } } sub create_sample_gene_matrix_variant { my ( $self, $mutation_matrix, $mutated_genes_to_include, @all_sample_names ) = @_; #create hash of mutations from the MAF file my ( %variants_hash, %all_variants ); #parse the MAF file and fill up the mutation status hashes my $maf_fh = IO::File->new( $self->maf_file ) or die "Couldn't open MAF file!\n"; while( my $line = $maf_fh->getline ) { next if( $line =~ m/^(#|Hugo_Symbol)/ ); chomp $line; my @cols = split( /\t/, $line ); my ( $gene, $chr, $start, $stop, $mutation_class, $mutation_type, $ref, $var1, $var2, $sample ) = @cols[0,4..6,8..12,15]; #check that the mutation class is valid if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) { $self->error_message( "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\nPlease use TCGA MAF v2.3.\n" ); return; } #check to see if this gene is on the list (if there is a list at all) if( defined @{$mutated_genes_to_include} ) { next unless (scalar grep { m/^$gene$/ } @{$mutated_genes_to_include}); } # If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region if(( $self->skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) || ( $self->skip_silent && $mutation_class =~ m/^Silent$/ )) { print "Skipping $mutation_class mutation in gene $gene.\n"; next; } my $var; my $variant_name; if( $ref eq $var1 ) { $var = $var2; $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var; $variants_hash{$sample}{$variant_name}++; $all_variants{$variant_name}++; } elsif( $ref eq $var2 ) { $var = $var1; $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var; $variants_hash{$sample}{$variant_name}++; $all_variants{$variant_name}++; } elsif( $ref ne $var1 && $ref ne $var2 ) { $var = $var1; $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var; $variants_hash{$sample}{$variant_name}++; $all_variants{$variant_name}++; $var = $var2; $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var; $variants_hash{$sample}{$variant_name}++; $all_variants{$variant_name}++; } } $maf_fh->close; #sort variants for consistency my @variant_names = sort keys %all_variants; #write the input matrix for R code to a file my $matrix_fh = new IO::File $mutation_matrix,"w" or die "Failed to create matrix file $mutation_matrix!: $!"; #print input matrix file header my $header = join("\t","Sample",@variant_names); $matrix_fh->print("$header\n"); #print mutation relation input matrix for my $sample (sort @all_sample_names) { $matrix_fh->print($sample); for my $variant (@variant_names) { if (exists $variants_hash{$sample}{$variant}) { $matrix_fh->print("\t$variants_hash{$sample}{$variant}"); } else { $matrix_fh->print("\t0"); } } $matrix_fh->print("\n"); } } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Survival.pm.R000444000765000024 545112013522176 25756 0ustar00nnutterstaff000000000000### Survival analysis for mutation data ### ### original location of code: /gscuser/qzhang/gstat/survival/survival.R ### example input file: /gscuser/qzhang/gstat/survival/tcga.tsv ### Run it on command line like below ### for example, R --no-save --args < survival.R vital_status.input mut_matrix.input legend.placement output_dir & ### clinical data /vital status input file, first three columns are sample_ID, survival_time, vital_status (0=living, 1=deceased) ######################## read input arguments clinical.survival.data=commandArgs()[4]; mut.data=commandArgs()[5]; legend.placement=commandArgs()[6]; out.dir=commandArgs()[7]; ######################## read and prepare data vitals = read.table(clinical.survival.data,header=T); mut_matrix = read.table(mut.data,header=T); x = merge(vitals,mut_matrix,by.x=1,by.y=1); write.table(x,file=paste(out.dir,"survival_analysis_data_matrix.csv",sep="/"),quote=F,append=F,row.names=F,sep="\t") colnames(x)[-c(1:3)]->phenos if (class(x[,phenos])=="integer" & length(unique(x[,phenos]))<6) x[,phenos] [x[,phenos]>1]=1 ######################### survival analysis library(survival) logr=NULL for (phenotype in phenos) { #clean data loopdata <- x; loopdata <- loopdata[!is.na(loopdata[,phenotype]),]; loopdata <- loopdata[!is.na(loopdata[,3]),]; loopdata <- loopdata[!is.na(loopdata[,2]),]; status=loopdata[,3]; time=loopdata[,2]; x1=loopdata[,phenotype]; base.class = as.vector(sort(unique(x1)))[1]; coxph(Surv(time, status) ~ x1, loopdata) -> co; summary(co)->co; co$conf->cox; co$logtest[3]->p; co$coef[5]->indv.p; rownames(cox) = sub("x1","",rownames(cox)); if (length(rownames(cox))==1 && rownames(cox)[1]=="") { rownames(cox)[1] = "1"; } logr=rbind(logr,cbind(base.class,rownames(cox),phenotype,cox,indv.p,p)) mfit.by <- survfit(Surv(time, status == 1) ~ x1, data = loopdata) ## file name for plot bitmap(file=paste(out.dir,"/",phenotype,"_survival_plot.png",sep="")) ## create survival plot plot(mfit.by,lty=1:10,ylab="Survival Probability",xlab="Time",col=c(1:10)) if (dim(table(x1))>1) { title(paste(phenotype,", P=",signif(p,3),sep="")); } else { title(paste(phenotype)); } legend(x=legend.placement, legend=names(table(x1)), lty = 1:10, col=c(1:10)) dev.off() } ########################## calculate fdr logr=logr[,-5]; if (length(phenos) < 2) { logr=(t(logr)); } fdr=p.adjust(as.numeric(logr[,"p"]),"fdr") logr=cbind(logr,fdr) ######################### print output colnames(logr)[1:9]=c("base.class","comparison.class","phenotype","hazard.ratio","lower.95","upper.95","2-class-p-value","p-value","fdr") logr=logr[order(logr[,"p-value"]),] write.table(logr,file=paste(out.dir,"survival_analysis_test_results.csv",sep="/"),quote=F,append=F,row.names=F,sep="\t") Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Bmr000755000765000024 012013522176 23763 5ustar00nnutterstaff000000000000Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Bmr/Base.pm000444000765000024 115112013522176 25326 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::Bmr::Base; use strict; use warnings; use Genome; our $VERSION = $Genome::Model::Tools::Music::VERSION; class Genome::Model::Tools::Music::Bmr::Base { is => ['Genome::Model::Tools::Music::Base'], is_abstract => 1, attributes_have => [ file_format => { is => 'Text', is_optional => 1, } ], doc => "Mutational Significance In Cancer (BMR Calculations)" }; sub _doc_authors { return " Cyriac Kandoth, Ph.D."; } sub _doc_see_also { return <(1), B(1), B(1) EOS } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Bmr/CalcBmr.pm000444000765000024 7510312013522176 26007 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::Bmr::CalcBmr; use warnings; use strict; use Genome; use IO::File; use Bit::Vector; use List::Util qw( min sum ); our $VERSION = $Genome::Model::Tools::Music::VERSION; # These constants let us use space-efficient arrays instead of hashes, while keeping the code fairly readable use constant { AT_Transitions => 0, AT_Transversions => 1, CG_Transitions => 2, CG_Transversions => 3, CpG_Transitions => 4, CpG_Transversions => 5, Indels => 6, Truncations => 7, Overall => 8, covd_bases => 0, mutations => 1, bmr => 2 }; class Genome::Model::Tools::Music::Bmr::CalcBmr { is => 'Genome::Model::Tools::Music::Bmr::Base', has_input => [ roi_file => { is => 'Text', doc => "Tab delimited list of ROIs [chr start stop gene_name] (See DESCRIPTION)" }, reference_sequence => { is => 'Text', doc => "Path to reference sequence in FASTA format" }, bam_list => { is => 'Text', doc => "Tab delimited list of BAM files [sample_name normal_bam tumor_bam] (See DESCRIPTION)" }, output_dir => { is => 'Text', doc => "Directory where output files will be written (Use the same one used with calc-covg)" }, maf_file => { is => 'Text', doc => "List of mutations using TCGA MAF specification v2.3" }, bmr_groups => { is => 'Integer', doc => "Number of clusters of samples with comparable BMRs (See DESCRIPTION)", is_optional => 1, default => 1 }, show_skipped => { is => 'Boolean', doc => "Report each skipped mutation, not just how many", is_optional => 1, default => 0 }, separate_truncations => { is => 'Boolean', doc => "Group truncational mutations as a separate category", is_optional => 1, default => 0 }, merge_concurrent_muts => { is => 'Boolean', doc => "Multiple mutations of a gene in the same sample are treated as 1", is_optional => 1, default => 0 }, genes_to_ignore => { is => 'Text', doc => "Comma-delimited list of genes to ignore for background mutation rates", is_optional => 1 }, skip_non_coding => { is => 'Boolean', doc => "Skip non-coding mutations from the provided MAF file", is_optional => 1, default => 1 }, skip_silent => { is => 'Boolean', doc => "Skip silent mutations from the provided MAF file", is_optional => 1, default => 1 }, ], has_output => [ bmr_output => { is => 'Number', doc => "TODO" }, gene_mr_file => { is => 'Text', doc => "TODO" }, ], doc => "Calculates mutation rates given per-gene coverage (from \"music bmr calc-covg\"), and a mutation list", }; sub help_synopsis { return <(1), B(1), B(1) EOS } sub execute { my $self = shift; my $roi_file = $self->roi_file; my $ref_seq = $self->reference_sequence; my $bam_list = $self->bam_list; my $output_dir = $self->output_dir; my $maf_file = $self->maf_file; my $show_skipped = $self->show_skipped; my $bmr_groups = $self->bmr_groups; my $separate_truncations = $self->separate_truncations; my $merge_concurrent_muts = $self->merge_concurrent_muts; my $genes_to_ignore = $self->genes_to_ignore; my $skip_non_coding = $self->skip_non_coding; my $skip_silent = $self->skip_silent; # Check on all the input data before starting work print STDERR "ROI file not found or is empty: $roi_file\n" unless( -s $roi_file ); print STDERR "Reference sequence file not found: $ref_seq\n" unless( -e $ref_seq ); print STDERR "List of BAMs not found or is empty: $bam_list\n" unless( -s $bam_list ); print STDERR "Output directory not found: $output_dir\n" unless( -e $output_dir ); print STDERR "MAF file not found or is empty: $maf_file\n" unless( -s $maf_file ); return undef unless( -s $roi_file && -e $ref_seq && -s $bam_list && -e $output_dir && -s $maf_file ); # Check on the files we expect to find within the provided output directory $output_dir =~ s/(\/)+$//; # Remove trailing forward slashes if any my $gene_covg_dir = "$output_dir/gene_covgs"; # Should contain per-gene coverage files per sample my $total_covgs_file = "$output_dir/total_covgs"; # Should contain overall coverages per sample print STDERR "Directory with per-gene coverages not found: $gene_covg_dir\n" unless( -e $gene_covg_dir ); print STDERR "Total coverages file not found or is empty: $total_covgs_file\n" unless( -s $total_covgs_file ); return undef unless( -e $gene_covg_dir && -s $total_covgs_file ); # Outputs of this script will be written to these locations in the output directory my $overall_bmr_file = "$output_dir/overall_bmrs"; my $gene_mr_file = "$output_dir/gene_mrs"; $self->gene_mr_file( $gene_mr_file ); # Build a hash to quickly lookup the genes to be ignored for overall BMRs my %ignored_genes = (); if( defined $genes_to_ignore ) { %ignored_genes = map { $_ => 1 } split( /,/, $genes_to_ignore ); } # Parse out the names of the samples which should match the names of the coverage files needed my ( @all_sample_names, %sample_idx ); my $idx = 0; my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!"; while( my $line = $sampleFh->getline ) { next if ( $line =~ m/^#/ ); chomp( $line ); my ( $sample ) = split( /\t/, $line ); push( @all_sample_names, $sample ); $sample_idx{$sample} = $idx++; } $sampleFh->close; # If the reference sequence FASTA file hasn't been indexed, do it my $ref_seq_idx = "$ref_seq.fai"; Genome::Sys->shellcmd( cmd => "samtools faidx $ref_seq" ) unless( -e $ref_seq_idx ); # Parse gene names and ROIs. Mutations outside these ROIs will be skipped my ( @all_gene_names, %gene_idx ); $idx = 0; my $roi_bitmask = $self->create_empty_genome_bitmask( $ref_seq_idx ); my $roiFh = IO::File->new( $roi_file ) or die "Couldn't open $roi_file. $!"; while( my $line = $roiFh->getline ) { next if( $line =~ m/^#/ ); chomp $line; my ( $chr, $start, $stop, $gene ) = split( /\t/, $line ); if( !$roi_bitmask->{$chr} or $start > $roi_bitmask->{$chr}->Size ) { print STDERR "Skipping invalid ROI bitmask $chr:$start-$stop\n"; next; } $roi_bitmask->{$chr}->Interval_Fill( $start, $stop ); unless( defined $gene_idx{$gene} ) { push( @all_gene_names, $gene ); $gene_idx{$gene} = $idx++; } } $roiFh->close; # These are the various categories that each mutation will be classified into my @mut_classes = ( AT_Transitions, AT_Transversions, CG_Transitions, CG_Transversions, CpG_Transitions, CpG_Transversions, Indels ); push( @mut_classes, Truncations ) if( $separate_truncations ); # Save the actual class names for reporting purposes, because the elements above are really just numerical constants my @mut_class_names = qw( AT_Transitions AT_Transversions CG_Transitions CG_Transversions CpG_Transitions CpG_Transversions Indels ); push( @mut_class_names, 'Truncations' ) if( $separate_truncations ); my @sample_mr; # Stores per sample covg and mutation information foreach my $sample ( @all_sample_names ) { $sample_mr[$sample_idx{$sample}][$_][mutations] = 0 foreach( @mut_classes ); $sample_mr[$sample_idx{$sample}][$_][covd_bases] = 0 foreach( @mut_classes ); } # Load the covered base-counts per sample from the output of "music bmr calc-covg" print STDERR "Loading per-sample coverages stored in $total_covgs_file\n"; my $sample_cnt_in_total_covgs_file = 0; my $totCovgFh = IO::File->new( $total_covgs_file ) or die "Couldn't open $total_covgs_file. $!"; while( my $line = $totCovgFh->getline ) { next unless( $line =~ m/^\S+\t\d+\t\d+\t\d+\t\d+$/ and $line !~ m/^#/ ); chomp( $line ); ++$sample_cnt_in_total_covgs_file; my ( $sample, $covd_bases, $covd_at_bases, $covd_cg_bases, $covd_cpg_bases ) = split( /\t/, $line ); $sample_mr[$sample_idx{$sample}][AT_Transitions][covd_bases] = $covd_at_bases; $sample_mr[$sample_idx{$sample}][AT_Transversions][covd_bases] = $covd_at_bases; $sample_mr[$sample_idx{$sample}][CG_Transitions][covd_bases] = $covd_cg_bases; $sample_mr[$sample_idx{$sample}][CG_Transversions][covd_bases] = $covd_cg_bases; $sample_mr[$sample_idx{$sample}][CpG_Transitions][covd_bases] = $covd_cpg_bases; $sample_mr[$sample_idx{$sample}][CpG_Transversions][covd_bases] = $covd_cpg_bases; $sample_mr[$sample_idx{$sample}][Indels][covd_bases] = $covd_bases; $sample_mr[$sample_idx{$sample}][Truncations][covd_bases] = $covd_bases if( $separate_truncations ); } $totCovgFh->close; unless( $sample_cnt_in_total_covgs_file == scalar( @all_sample_names )) { print STDERR "Mismatching number of samples in $total_covgs_file and $bam_list\n"; return undef; } my @gene_mr; # Stores per gene covg and mutation information foreach my $gene ( @all_gene_names ) { foreach my $sample ( @all_sample_names ) { $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$_][mutations] = 0 foreach( @mut_classes ); $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$_][covd_bases] = 0 foreach( @mut_classes ); } } # Sum up the per-gene covered base-counts across samples from the output of "music bmr calc-covg" print STDERR "Loading per-gene coverage files stored under $gene_covg_dir/\n"; foreach my $sample ( @all_sample_names ) { my $sample_covg_file = "$gene_covg_dir/$sample.covg"; my $sampleCovgFh = IO::File->new( $sample_covg_file ) or die "Couldn't open $sample_covg_file. $!"; while( my $line = $sampleCovgFh->getline ) { next unless( $line =~ m/^\S+\t\d+\t\d+\t\d+\t\d+\t\d+$/ and $line !~ m/^#/ ); chomp( $line ); my ( $gene, undef, $covd_bases, $covd_at_bases, $covd_cg_bases, $covd_cpg_bases ) = split( /\t/, $line ); $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][AT_Transitions][covd_bases] += $covd_at_bases; $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][AT_Transversions][covd_bases] += $covd_at_bases; $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][CG_Transitions][covd_bases] += $covd_cg_bases; $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][CG_Transversions][covd_bases] += $covd_cg_bases; $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][CpG_Transitions][covd_bases] += $covd_cpg_bases; $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][CpG_Transversions][covd_bases] += $covd_cpg_bases; $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][Indels][covd_bases] += $covd_bases; $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][Truncations][covd_bases] += $covd_bases if( $separate_truncations ); } $sampleCovgFh->close; } # Run "joinx ref-stats" to classify SNVs as being at AT, CG, or CpG sites in the reference print STDERR "Running 'joinx ref-stats' to read reference FASTA and identify SNVs at AT, CG, CpG sites\n"; my $maf_bed = Genome::Sys->create_temp_file_path(); my $mafBedFh = IO::File->new( $maf_bed, ">" ) or die "Temporary file could not be created. $!"; my $mafFh = IO::File->new( $maf_file ) or die "Couldn't open $maf_file. $!"; while( my $line = $mafFh->getline ) { next if( $line =~ m/^(#|Hugo_Symbol)/ ); chomp $line; my @cols = split( /\t/, $line ); my ( $chr, $start, $stop, $mutation_type, $ref, $var1, $var2 ) = @cols[4..6,9..12]; if( $mutation_type =~ m/^(SNP|DNP|ONP|TNP)$/ ) { $mafBedFh->print( "$chr\t" . ( $start - 2 ) . "\t" . ( $start + 1 ) . "\n" ); } } $mafFh->close; $mafBedFh->close; my $refstats_file = Genome::Sys->create_temp_file_path(); Genome::Sys->shellcmd( cmd => "joinx ref-stats --ref-bases --bed $maf_bed --fasta $ref_seq --output $refstats_file" ); # Parse through the ref-stats output and load it into hashes for quick lookup later my ( %ref_base, %cpg_site ); my $refStatsFh = IO::File->new( $refstats_file ) or die "Couldn't open $refstats_file. $!"; while( my $line = $refStatsFh->getline ) { next if( $line =~ m/^#/ ); chomp $line; my ( $chr, undef, $pos, undef, undef, undef, $ref ) = split( /\t/, $line ); my $locus = "$chr\t" . ( $pos - 1 ); $ref_base{$locus} = substr( $ref, 1, 1 ); $cpg_site{$locus} = 1 if( $ref =~ m/CG/ ); } $refStatsFh->close; # Create a hash to help classify SNVs my %classify; $classify{$_} = AT_Transitions foreach( qw( AG TC )); $classify{$_} = AT_Transversions foreach( qw( AC AT TA TG )); $classify{$_} = CG_Transitions foreach( qw( CT GA )); $classify{$_} = CG_Transversions foreach( qw( CA CG GC GT )); # Parse through the MAF file and categorize each somatic mutation print STDERR "Parsing MAF file to classify mutations\n"; my %skip_cnts; $mafFh = IO::File->new( $maf_file ) or die "Couldn't open $maf_file. $!"; while( my $line = $mafFh->getline ) { next if( $line =~ m/^(#|Hugo_Symbol)/ ); chomp $line; my @cols = split( /\t/, $line ); my ( $gene, $chr, $start, $stop, $mutation_class, $mutation_type, $ref, $var1, $var2, $sample ) = @cols[0,4..6,8..12,15]; # Skip mutations in samples that are not in the provided bam list unless( defined $sample_idx{$sample} ) { $skip_cnts{"belong to unrecognized samples"}++; print STDERR "Skipping unrecognized sample ($sample not in BAM list): $gene, $chr:$start-$stop\n" if( $show_skipped ); next; } # If the mutation classification is odd, quit with error if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) { print STDERR "Unrecognized Variant_Classification \"$mutation_class\" in MAF file: $gene, $chr:$start-$stop\n"; print STDERR "Please use TCGA MAF Specification v2.3.\n"; return undef; } # If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region if(( $skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) || ( $skip_silent && $mutation_class =~ m/^Silent$/ )) { $skip_cnts{"are classified as $mutation_class"}++; print STDERR "Skipping $mutation_class mutation: $gene, $chr:$start-$stop\n" if( $show_skipped ); next; } # If the mutation type is odd, quit with error if( $mutation_type !~ m/^(SNP|DNP|TNP|ONP|INS|DEL|Consolidated)$/ ) { print STDERR "Unrecognized Variant_Type \"$mutation_type\" in MAF file: $gene, $chr:$start-$stop\n"; print STDERR "Please use TCGA MAF Specification v2.3.\n"; return undef; } # Skip mutations that were consolidated into others (E.g. SNP consolidated into a TNP) if( $mutation_type =~ m/^Consolidated$/ ) { $skip_cnts{"are consolidated into another"}++; print STDERR "Skipping consolidated mutation: $gene, $chr:$start-$stop\n" if( $show_skipped ); next; } # Skip mutations that fall completely outside any of the provided regions of interest if( $self->count_bits( $roi_bitmask->{$chr}, $start, $stop ) == 0 ) { $skip_cnts{"are outside any ROIs"}++; print STDERR "Skipping mutation that falls outside ROIs: $gene, $chr:$start-$stop\n" if( $show_skipped ); next; } # Skip mutations whose gene names don't match any of those in the ROI list unless( defined $gene_idx{$gene} ) { $skip_cnts{"have unrecognized gene names"}++; print STDERR "Skipping unrecognized gene name (not in ROI file): $gene, $chr:$start-$stop\n" if( $show_skipped ); next; } my $class = ''; # Check if the mutation is the truncating type, if the user wanted a separate category of those if( $separate_truncations && $mutation_class =~ m/^(Nonsense_Mutation|Splice_Site|Frame_Shift_Del|Frame_Shift_Ins)/ ) { $class = Truncations; } # Classify the mutation as AT/CG/CpG Transition, AT/CG/CpG Transversion elsif( $mutation_type =~ m/^(SNP|DNP|ONP|TNP)$/ ) { # ::TBD:: For DNPs and TNPs, we use only the first base for mutation classification $ref = substr( $ref, 0, 1 ); $var1 = substr( $var1, 0, 1 ); $var2 = substr( $var2, 0, 1 ); # If the alleles are anything but A, C, G, or T then quit with error if( $ref !~ m/[ACGT]/ || $var1 !~ m/[ACGT]/ || $var2 !~ m/[ACGT]/ ) { print STDERR "Unrecognized allele in column Reference_Allele, Tumor_Seq_Allele1, or Tumor_Seq_Allele2: $gene, $chr:$start-$stop\n"; print STDERR "Please use TCGA MAF Specification v2.3.\n"; return undef; } # Use the classify hash to find whether this SNV is an AT/CG Transition/Transversion $class = $classify{ "$ref$var1" } if( defined $classify{ "$ref$var1" } ); $class = $classify{ "$ref$var2" } if( defined $classify{ "$ref$var2" } ); # Check if the ref base in the MAF matched what we fetched from the ref-seq my $locus = "$chr\t$start"; if( defined $ref_base{$locus} && $ref_base{$locus} ne $ref ) { print STDERR "Reference allele $ref for $gene variant at $chr:$start-$stop is " . $ref_base{$locus} . " in the FASTA. Using it anyway.\n"; } # Check if a C or G reference allele belongs to a CpG pair in the refseq if(( $ref eq 'C' || $ref eq 'G' ) && defined $cpg_site{$locus} ) { $class = (( $class == CG_Transitions ) ? CpG_Transitions : CpG_Transversions ); } } # Classify it as an indel (excludes splice-site and frame-shift if user wanted truncations separately) elsif( $mutation_type =~ m/^(INS|DEL)$/ ) { $class = Indels; } # The user's gene exclusion list affects only the overall BMR calculations $sample_mr[$sample_idx{$sample}][$class][mutations]++ unless( defined $ignored_genes{$gene} ); $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][mutations]++; } $mafFh->close; # Display statistics related to parsing the MAF print STDERR "Finished Parsing the MAF file to classify mutations\n"; foreach my $skip_type ( sort {$skip_cnts{$b} <=> $skip_cnts{$a}} keys %skip_cnts ) { print STDERR "Skipped " . $skip_cnts{$skip_type} . " mutation(s) that $skip_type\n" if( defined $skip_cnts{$skip_type} ); } # If the user wants, merge together concurrent mutations of a gene in the same sample if( $merge_concurrent_muts ) { foreach my $sample ( @all_sample_names ) { foreach my $gene ( @all_gene_names ) { next unless( defined $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}] ); my $num_muts = 0; $num_muts += $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$_][mutations] foreach( @mut_classes ); if( $num_muts > 1 ) { foreach my $class ( @mut_classes ) { my $muts_in_class = $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][mutations]; # Num of muts of gene in this class $sample_mr[$sample_idx{$sample}][$class][mutations] -= $muts_in_class; # Take it out of the sample total $muts_in_class /= $num_muts; # Turn it into a fraction of the total number of muts in this gene $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][mutations] = $muts_in_class; # Use the fraction as the num muts of gene in this class $sample_mr[$sample_idx{$sample}][$class][mutations] += $muts_in_class; # Add the same fraction to the sample total } } } } } # Calculate per-sample BMRs, and also subtract out covered bases in genes the user wants ignored foreach my $sample ( @all_sample_names ) { my $tot_muts = 0; foreach my $class ( @mut_classes ) { # Subtract the covered bases in this class that belong to the genes to be ignored # ::TBD:: Some of these bases may also belong to another gene (on the other strand maybe?), and those should not be subtracted foreach my $ignored_gene ( keys %ignored_genes ) { $sample_mr[$sample_idx{$sample}][$class][covd_bases] -= $gene_mr[$sample_idx{$sample}][$gene_idx{$ignored_gene}][$class][covd_bases] if( defined $gene_mr[$sample_idx{$sample}][$gene_idx{$ignored_gene}] ); } $tot_muts += $sample_mr[$sample_idx{$sample}][$class][mutations]; } $sample_mr[$sample_idx{$sample}][Overall][bmr] = $tot_muts / $sample_mr[$sample_idx{$sample}][Indels][covd_bases]; } # Cluster samples into bmr-groups using k-means clustering my @sample_bmrs = map { $sample_mr[$sample_idx{$_}][Overall][bmr] } @all_sample_names; my @bmr_clusters = k_means( $bmr_groups, \@sample_bmrs ); # Calculate overall BMRs for each cluster of samples, and print them to file my %cluster_bmr; # Stores per cluster categorized BMR my $totBmrFh = IO::File->new( $overall_bmr_file, ">" ) or die "Couldn't open $overall_bmr_file. $!"; $totBmrFh->print( "#User-specified genes skipped in these calculations: $genes_to_ignore\n" ) if( defined $genes_to_ignore ); my ( $covered_bases_sum, $mutations_sum ) = ( 0, 0 ); for( my $i = 0; $i < scalar( @bmr_clusters ); ++$i ) { my @samples_in_cluster = map { $all_sample_names[$_] } @{$bmr_clusters[$i]}; unless( $bmr_groups == 1 ) { $totBmrFh->print( "#BMR sub-group ", $i + 1, " (", scalar( @{$bmr_clusters[$i]} ), " samples)\n" ); $totBmrFh->print( "#Samples: ", join( ",", @samples_in_cluster ), "\n" ); } $totBmrFh->print( "#Mutation_Class\tCovered_Bases\tMutations\tOverall_BMR\n" ); my ( $tot_covd_bases, $tot_muts ) = ( 0, 0 ); foreach my $class ( @mut_classes ) { my ( $covd_bases, $mutations ) = ( 0, 0 ); foreach my $sample ( @samples_in_cluster ) { $covd_bases += $sample_mr[$sample_idx{$sample}][$class][covd_bases]; $mutations += $sample_mr[$sample_idx{$sample}][$class][mutations]; } $tot_covd_bases = $covd_bases if( $class == Indels ); # Save this to calculate overall BMR below # Calculate overall BMR for this mutation class and print it to file $cluster_bmr{$i}[$class][bmr] = ( $covd_bases == 0 ? 0 : ( $mutations / $covd_bases )); $totBmrFh->print( join( "\t", $mut_class_names[$class], $covd_bases, $mutations, $cluster_bmr{$i}[$class][bmr] ), "\n" ); $tot_muts += $mutations; } $totBmrFh->print( join( "\t", "Overall_BMR", $tot_covd_bases, $tot_muts, $tot_muts / $tot_covd_bases ), "\n\n" ); $covered_bases_sum += $tot_covd_bases; $mutations_sum += $tot_muts; } $totBmrFh->close; $self->bmr_output( $mutations_sum / $covered_bases_sum ); # Print out a file containing per-gene mutation counts and covered bases for use by "music smg" my $geneBmrFh = IO::File->new( $gene_mr_file, ">" ) or die "Couldn't open $gene_mr_file. $!"; $geneBmrFh->print( "#Gene\tMutation_Class\tCovered_Bases\tMutations\tBMR\n" ); foreach my $gene ( sort @all_gene_names ) { my ( $tot_covd_bases, $tot_muts ) = ( 0, 0 ); for( my $i = 0; $i < scalar( @bmr_clusters ); ++$i ) { my @samples_in_cluster = map { $all_sample_names[$_] } @{$bmr_clusters[$i]}; foreach my $class ( @mut_classes ) { my ( $covd_bases, $mutations ) = ( 0, 0 ); foreach my $sample( @samples_in_cluster ) { if( defined $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}] ) { $covd_bases += $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][covd_bases]; $mutations += $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][mutations]; } } my $rename_class = $mut_class_names[$class]; $rename_class = ( $rename_class . "_SubGroup" . ( $i + 1 )) if( $bmr_groups > 1 ); $geneBmrFh->print( join( "\t", $gene, $rename_class, $covd_bases, $mutations, $cluster_bmr{$i}[$class][bmr] ), "\n" ); $tot_muts += $mutations; $tot_covd_bases += $covd_bases if( $class == Indels ); } } $geneBmrFh->print( join( "\t", $gene, "Overall", $tot_covd_bases, $tot_muts, $self->bmr_output ), "\n" ); } $geneBmrFh->close; return 1; } # Creates an empty whole genome bitmask based on the given reference sequence index sub create_empty_genome_bitmask { my ( $self, $ref_seq_idx_file ) = @_; my %genome; my $refFh = IO::File->new( $ref_seq_idx_file ) or die "Couldn't open $ref_seq_idx_file. $!"; while( my $line = $refFh->getline ) { my ( $chr, $length ) = split( /\t/, $line ); $genome{$chr} = Bit::Vector->new( $length + 1 ); # Adding a base for 1-based coordinates } $refFh->close; return \%genome; } # Counts the number of bits that are set in the given region of a Bit:Vector sub count_bits { my ( $self, $vector, $start, $stop ) = @_; my $count = 0; for my $pos ( $start..$stop ) { ++$count if( $vector->bit_test( $pos )); } return $count; } # Given a list of numerical values, returns k clusters based on k-means clustering sub k_means { my ( $k, $list_ref ) = @_; my @vals = @{$list_ref}; my $num_vals = scalar( @vals ); # Start with the first k values as the centroids my @centroids = @vals[0..($k-1)]; my @prev_centroids = map { 0 } @centroids; my @groups = (); my $diff_means = 1; # Arbitrary non-zero value # Repeat until the difference between these centroids and the previous ones, converges to zero while( $diff_means > 0 ) { @groups = (); # Group values into clusters based on closest centroid for( my $i = 0; $i < $num_vals; ++$i ) { my @distances = map { abs( $vals[$i] - $_ ) } @centroids; my $closest = min( @distances ); for( my $j = 0; $j < $k; ++$j ) { if( $distances[$j] == $closest ) { push( @{$groups[$j]}, $i ); last; } } } # Calculate means to be the new centroids, and the sum of differences $diff_means = 0; for( my $i = 0; $i < $k; ++$i ) { $centroids[$i] = sum( map {$vals[$_]} @{$groups[$i]} ); $centroids[$i] /= scalar( @{$groups[$i]} ); $diff_means += abs( $centroids[$i] - $prev_centroids[$i] ); } # Save the current centroids for comparisons with those in the next iteration @prev_centroids = @centroids; } return @groups; } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Bmr/CalcCovg.pm000444000765000024 3357412013522176 26173 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::Bmr::CalcCovg; use warnings; use strict; use IO::File; our $VERSION = $Genome::Model::Tools::Music::VERSION; class Genome::Model::Tools::Music::Bmr::CalcCovg { is => 'Genome::Model::Tools::Music::Bmr::Base', has_input => [ roi_file => { is => 'Text', doc => "Tab delimited list of ROIs [chr start stop gene_name] (See Description)" }, reference_sequence => { is => 'Text', doc => "Path to reference sequence in FASTA format" }, bam_list => { is => 'Text', doc => "Tab delimited list of BAM files [sample_name normal_bam tumor_bam] (See Description)" }, output_dir => { is => 'Text', doc => "Directory where output files and subdirectories will be written", is_output => 1}, cmd_list_file => { is => 'Text', doc => "A file to write calcRoiCovg commands to (See Description)", is_optional => 1 }, cmd_prefix => { is => 'Text', doc => "A command that submits a job to your cluster (See Description)", is_optional => 1 }, normal_min_depth => { is => 'Integer', doc => "The minimum read depth to consider a Normal BAM base as covered", is_optional => 1}, tumor_min_depth => { is => 'Integer', doc => "The minimum read depth to consider a Tumor BAM base as covered", is_optional => 1}, min_mapq => { is => 'Integer', doc => "The minimum mapping quality of reads to consider towards read depth counts", is_optional => 1}, ], has_output => [ gene_covg_dir => { is => 'Text', doc => "Directory where per-sample gene coverage files are located"}, ], doc => "Uses calcRoiCovg.c to count covered bases per-gene for each given tumor-normal pair of BAMs." }; sub help_synopsis { return <(1), B(1), B(1) EOS } sub execute { my $self = shift; my $roi_file = $self->roi_file; my $ref_seq = $self->reference_sequence; my $bam_list = $self->bam_list; my $output_dir = $self->output_dir; my $cmd_list_file = $self->cmd_list_file; my $cmd_prefix = $self->cmd_prefix; my $normal_min_depth = $self->normal_min_depth; my $tumor_min_depth = $self->tumor_min_depth; my $min_mapq = $self->min_mapq; my $optional_params = ""; if ($normal_min_depth) { $optional_params .= " --normal-min-depth $normal_min_depth"; } if ($tumor_min_depth) { $optional_params .= " --tumor-min-depth $tumor_min_depth"; } if ($min_mapq) { $optional_params .= " --min-mapq $min_mapq"; } # Check on all the input data before starting work print STDERR "ROI file not found or is empty: $roi_file\n" unless( -s $roi_file ); print STDERR "Reference sequence file not found: $ref_seq\n" unless( -e $ref_seq ); print STDERR "List of BAMs not found or is empty: $bam_list\n" unless( -s $bam_list ); print STDERR "Output directory not found: $output_dir\n" unless( -e $output_dir ); return undef unless( -s $roi_file && -e $ref_seq && -s $bam_list && -e $output_dir ); # Outputs of this script will be written to these locations in the output directory $output_dir =~ s/(\/)+$//; # Remove trailing forward slashes if any my $roi_covg_dir = "$output_dir/roi_covgs"; # Stores output from calcRoiCovg per sample my $gene_covg_dir = "$output_dir/gene_covgs"; # Stores per-gene coverages per sample my $tot_covg_file = "$output_dir/total_covgs"; # Stores total coverages per sample $self->gene_covg_dir($gene_covg_dir); # Check whether the annotated regions of interest are clumped together by chromosome my $roiFh = IO::File->new( $roi_file ) or die "ROI file could not be opened. $!\n"; my @chroms = ( "" ); while( my $line = $roiFh->getline ) # Emulate Unix's uniq command on the chromosome column { my ( $chrom ) = ( $line =~ m/^(\S+)/ ); push( @chroms, $chrom ) if( $chrom ne $chroms[-1] ); } $roiFh->close; my %chroms = map { $_ => 1 } @chroms; # Get the actual number of unique chromosomes if( scalar( @chroms ) != scalar( keys %chroms )) { print STDERR "ROIs from the same chromosome must be listed adjacent to each other in file. "; print STDERR "If in UNIX, try:\nsort -k 1,1 $roi_file\n"; return undef; } # If the reference sequence FASTA file hasn't been indexed, do it my $ref_seq_idx = "$ref_seq.fai"; system( "samtools faidx $ref_seq" ) unless( -e $ref_seq_idx ); # Create the output directories unless they already exist mkdir $roi_covg_dir unless( -e $roi_covg_dir ); mkdir $gene_covg_dir unless( -e $gene_covg_dir ); my ( $cmdFh, $totCovgFh ); if( defined $cmd_list_file ) { $cmdFh = IO::File->new( $cmd_list_file, ">" ); print "Creating a list of parallelizable jobs at $cmd_list_file.\n"; print "After successfully running all the jobs in $cmd_list_file,\n", "be sure to run this script a second time (without defining the cmd-list-file argument) to merge results in roi_covgs.\n"; } else { $totCovgFh = IO::File->new( $tot_covg_file, ">" ); $totCovgFh->print( "#Sample\tCovered_Bases\tAT_Bases_Covered\tCG_Bases_Covered\tCpG_Bases_Covered\n" ); } # Parse through each pair of BAM files provided and run calcRoiCovg as necessary my $bamFh = IO::File->new( $bam_list ); while( my $line = $bamFh->getline ) { next if( $line =~ m/^#/ ); chomp( $line ); my ( $sample, $normal_bam, $tumor_bam ) = split( /\t/, $line ); $normal_bam = '' unless( defined $normal_bam ); $tumor_bam = '' unless( defined $tumor_bam ); print STDERR "Normal BAM for $sample not found: \"$normal_bam\"\n" unless( -e $normal_bam ); print STDERR "Tumor BAM for $sample not found: \"$tumor_bam\"\n" unless( -e $tumor_bam ); next unless( -e $normal_bam && -e $tumor_bam ); # Construct the command that calculates coverage per ROI my $calcRoiCovg_cmd = "\'gmt music bmr calc-covg-helper --normal-tumor-bam-pair \"$line\" --roi-file \"$roi_file\" ". "--reference-sequence \"$ref_seq\" --output-file \"$roi_covg_dir/$sample.covg\"$optional_params\'"; # If user only wants the calcRoiCovg commands, write them to file and skip running calcRoiCovg if( defined $cmd_list_file ) { $calcRoiCovg_cmd = $cmd_prefix . " $calcRoiCovg_cmd" if( defined $cmd_prefix ); $cmdFh->print( "$calcRoiCovg_cmd\n" ); next; } # If the calcRoiCovg output was already generated, then don't rerun it if( -s "$roi_covg_dir/$sample.covg" ) { print "$sample.covg found in $roi_covg_dir. Skipping re-calculation.\n"; } # Run the calcRoiCovg command on this tumor-normal pair. This could take a while else { my %params = ( normal_tumor_bam_pair => $line, roi_file => $roi_file, reference_sequence => $ref_seq, output_file => $roi_covg_dir."/".$sample.".covg", ); if ($normal_min_depth) { $params{"normal_min_depth"} = $normal_min_depth; } if ($tumor_min_depth) { $params{"tumor_min_depth"} = $tumor_min_depth; } if ($min_mapq) { $params{"min_mapq"} = $min_mapq; } my $cmd = Genome::Model::Tools::Music::Bmr::CalcCovgHelper->create(%params); my $rv = $cmd->execute; if(!$rv) { print STDERR "Failed to execute: $calcRoiCovg_cmd\n"; next; } else { print "$sample.covg generated and stored to $roi_covg_dir.\n"; } } # Read the calcRoiCovg output and count covered bases per gene my %geneCovg = (); my ( $tot_covd, $tot_at_covd, $tot_cg_covg, $tot_cpg_covd ); my $roiCovgFh = IO::File->new( "$roi_covg_dir/$sample.covg" ); while( my $line = $roiCovgFh->getline ) { chomp( $line ); if( $line =~ m/^#NonOverlappingTotals/ ) { ( undef, undef, undef, $tot_covd, $tot_at_covd, $tot_cg_covg, $tot_cpg_covd ) = split( /\t/, $line ); } elsif( $line !~ m/^#/ ) { my ( $gene, undef, $length, $covd, $at_covd, $cg_covd, $cpg_covd ) = split( /\t/, $line ); $geneCovg{$gene}{len} += $length; $geneCovg{$gene}{covd_len} += $covd; $geneCovg{$gene}{at} += $at_covd; $geneCovg{$gene}{cg} += $cg_covd; $geneCovg{$gene}{cpg} += $cpg_covd; } } $roiCovgFh->close; # Write the per-gene coverages to a file named after this sample_name my $geneCovgFh = IO::File->new( "$gene_covg_dir/$sample.covg", ">" ); $geneCovgFh->print( "#Gene\tLength\tCovered\tAT_covd\tCG_covd\tCpG_covd\n" ); foreach my $gene ( sort keys %geneCovg ) { $geneCovgFh->print( join( "\t", $gene, $geneCovg{$gene}{len}, $geneCovg{$gene}{covd_len}, $geneCovg{$gene}{at}, $geneCovg{$gene}{cg}, $geneCovg{$gene}{cpg} ), "\n" ); } $geneCovgFh->close; # Write total coverages for this sample to a file $totCovgFh->print( "$sample\t$tot_covd\t$tot_at_covd\t$tot_cg_covg\t$tot_cpg_covd\n" ); } $bamFh->close; $cmdFh->close if( defined $cmd_list_file ); $totCovgFh->close unless( defined $cmd_list_file ); return 1; } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Bmr/CalcCovgHelper.pm000444000765000024 1604112013522176 27321 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::Bmr::CalcCovgHelper; use warnings; use strict; use IO::File; our $VERSION = $Genome::Model::Tools::Music::VERSION; class Genome::Model::Tools::Music::Bmr::CalcCovgHelper { is => 'Genome::Model::Tools::Music::Bmr::Base', has_input => [ roi_file => { is => 'Text', doc => "Tab delimited list of ROIs [chr start stop gene_name] (See Description)" }, reference_sequence => { is => 'Text', doc => "Path to reference sequence in FASTA format" }, normal_tumor_bam_pair => { is => 'Text', doc => "Tab delimited line with sample name, path to normal bam file, and path to tumor bam file (See Description)" }, output_file => { is => 'Text', doc => "Output file path. Specify either output-file or output-directory.", is_optional => 1}, output_dir => { is => 'Text', doc => "Output directory path. Specify either output-file or output-directory", is_optional => 1}, normal_min_depth => { is => 'Integer', doc => "The minimum read depth to consider a Normal BAM base as covered", is_optional => 1, default => 6}, tumor_min_depth => { is => 'Integer', doc => "The minimum read depth to consider a Tumor BAM base as covered", is_optional => 1, default => 8}, min_mapq => { is => 'Integer', doc => "The minimum mapping quality of reads to consider towards read depth counts", is_optional => 1, default => 20}, ], has_calculated_optional => [ sample_name => { calculate_from => ['normal_tumor_bam_pair'], calculate => q {my @bams = split /\t/, $normal_tumor_bam_pair; return $bams[0];}, }, final_output_file => { is_output => 1, calculate_from => ['output_file','output_dir','sample_name'], calculate => q {if ($output_file) {return $output_file;} elsif ($output_dir){return $output_dir."/".$sample_name;} else {die "Either output-file or output-dir must be specified."}}, }, normal_bam => { calculate_from => ['normal_tumor_bam_pair'], calculate => q {my @bams = split /\t/, $normal_tumor_bam_pair; return $bams[1];}, }, tumor_bam => { calculate_from => ['normal_tumor_bam_pair'], calculate => q {my @bams = split /\t/, $normal_tumor_bam_pair; return $bams[2];}, }, ], doc => "Uses calcRoiCovg.c to count covered bases per-gene for a tumor-normal pair of BAMs." }; sub help_synopsis { return <(1), B(1), B(1) EOS } sub execute { my $self = shift; my $roi_file = $self->roi_file; my $ref_seq = $self->reference_sequence; my $tumor_bam = $self->tumor_bam; my $normal_bam = $self->normal_bam; my $output_file = $self->final_output_file; my $normal_min_depth = $self->normal_min_depth; my $tumor_min_depth = $self->tumor_min_depth; my $min_mapq = $self->min_mapq; # Check on all the input data before starting work print STDERR "ROI file not found or is empty: $roi_file\n" unless( -s $roi_file ); print STDERR "Reference sequence file not found: $ref_seq\n" unless( -e $ref_seq ); print STDERR "Normal BAM file not found or is empty: $normal_bam\n" unless( -s $normal_bam ); print STDERR "Tumor BAM file not found or is empty: $tumor_bam\n" unless( -s $tumor_bam ); return undef unless( -s $roi_file && -e $ref_seq && -s $normal_bam && -s $tumor_bam ); # Check whether the annotated regions of interest are clumped together by chromosome my $roiFh = IO::File->new( $roi_file ) or die "ROI file could not be opened. $!\n"; my @chroms = ( "" ); while( my $line = $roiFh->getline ) # Emulate Unix's uniq command on the chromosome column { my ( $chrom ) = ( $line =~ m/^(\S+)/ ); push( @chroms, $chrom ) if( $chrom ne $chroms[-1] ); } $roiFh->close; my %chroms = map { $_ => 1 } @chroms; # Get the actual number of unique chromosomes if( scalar( @chroms ) != scalar( keys %chroms )) { print STDERR "ROIs from the same chromosome must be listed adjacent to each other in file. "; print STDERR "If in UNIX, try:\nsort -k 1,1 $roi_file\n"; return undef; } # If the reference sequence FASTA file hasn't been indexed, do it my $ref_seq_idx = "$ref_seq.fai"; system( "samtools faidx $ref_seq" ) unless( -e $ref_seq_idx ); $normal_bam = '' unless( defined $normal_bam ); $tumor_bam = '' unless( defined $tumor_bam ); print STDERR "Normal BAM not found: \"$normal_bam\"\n" unless( -e $normal_bam ); print STDERR "Tumor BAM not found: \"$tumor_bam\"\n" unless( -e $tumor_bam ); next unless( -e $normal_bam && -e $tumor_bam ); # Construct the command that calculates coverage per ROI my $calcRoiCovg_cmd = "calcRoiCovg $normal_bam $tumor_bam $roi_file $ref_seq $output_file $normal_min_depth $tumor_min_depth $min_mapq"; # If the calcRoiCovg output was already generated, then don't rerun it if( -s $output_file ) { print "Output file $output_file found. Skipping re-calculation.\n"; } # Run the calcRoiCovg command on this tumor-normal pair. This could take a while elsif( system( "$calcRoiCovg_cmd" ) != 0 ) { print STDERR "Failed to execute: $calcRoiCovg_cmd\n"; return; } else { print "$output_file generated and stored.\n"; return 1; } } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Bmr/CalcWigCovg.pm000444000765000024 2322212013522176 26627 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::Bmr::CalcWigCovg; use warnings; use strict; use Genome; use IO::File; our $VERSION = $Genome::Model::Tools::Music::VERSION; class Genome::Model::Tools::Music::Bmr::CalcWigCovg { is => 'Genome::Model::Tools::Music::Bmr::Base', has_input => [ roi_file => { is => 'Text', doc => "Tab-delimited list of ROIs [chr start stop gene_name] (See Description)" }, reference_sequence => { is => 'Text', doc => "Path to reference sequence in FASTA format" }, wig_list => { is => 'Text', doc => "Tab-delimited list of WIG files [sample_name wig_file] (See Description)" }, output_dir => { is => 'Text', doc => "Directory where output files and subdirectories will be written", is_output => 1}, ], doc => "Count covered bases per-gene for each given wiggle track format file." }; sub help_synopsis { return <(1), B(1), B(1) EOS } sub execute { my $self = shift; my $roi_file = $self->roi_file; my $ref_seq = $self->reference_sequence; my $wig_list = $self->wig_list; my $output_dir = $self->output_dir; # Check on all the input data before starting work print STDERR "ROI file not found or is empty: $roi_file\n" unless( -s $roi_file ); print STDERR "Reference sequence file not found: $ref_seq\n" unless( -e $ref_seq ); print STDERR "List of WIGs not found or is empty: $wig_list\n" unless( -s $wig_list ); print STDERR "Output directory not found: $output_dir\n" unless( -e $output_dir ); return undef unless( -s $roi_file && -e $ref_seq && -s $wig_list && -e $output_dir ); # Outputs of this script will be written to these locations in the output directory $output_dir =~ s/(\/)+$//; # Remove trailing forward slashes if any my $roi_covg_dir = "$output_dir/roi_covgs"; # Stores output from calcRoiCovg per sample my $gene_covg_dir = "$output_dir/gene_covgs"; # Stores per-gene coverages per sample my $tot_covg_file = "$output_dir/total_covgs"; # Stores total coverages per sample # If the reference sequence FASTA file hasn't been indexed, do it my $ref_seq_idx = "$ref_seq.fai"; unless( -e $ref_seq_idx ) { print "Reference fasta index not found. Creating one at: $ref_seq.fai\n"; system( "samtools faidx $ref_seq" ) or die "Failed to run samtools! $!\n"; } # Create a temporary 0-based ROI BED-file that we can use with joinx, and also measure gene lengths my %geneLen = (); my $roi_bed = Genome::Sys->create_temp_file_path(); my $roiBedFh = IO::File->new( $roi_bed, ">" ) or die "Temporary ROI BED file could not be created. $!\n"; my $roiFh = IO::File->new( $roi_file ) or die "ROI file could not be opened. $!\n"; while( my $line = $roiFh->getline ) { chomp( $line ); my ( $chr, $start, $stop, $gene ) = split( /\t/, $line ); --$start; unless( $start >= 0 && $start < $stop ) { print STDERR "Invalid ROI: $line\nPlease use 1-based loci and ensure that start <= stop\n"; return undef; } $geneLen{$gene} += ( $stop - $start ); $roiBedFh->print( "$chr\t$start\t$stop\t$gene\n" ); } $roiFh->close; $roiBedFh->close; # Also create a merged BED file where overlapping ROIs are joined together into contiguous regions # ::TODO:: Use joinx instead of mergeBed, because we'd rather add an in-house dependency my $merged_roi_bed = Genome::Sys->create_temp_file_path(); system( "mergeBed -i $roi_bed | joinx sort -s - -o $merged_roi_bed" );# or die "Failed to run mergeBed or joinx!\n$roi_bed\n$merged_roi_bed\n $!\n"; # Create the output directories unless they already exist mkdir $roi_covg_dir unless( -e $roi_covg_dir ); mkdir $gene_covg_dir unless( -e $gene_covg_dir ); # This is a file that will report the overall non-overlapping coverages per WIG my $totCovgFh = IO::File->new( $tot_covg_file, ">" ); $totCovgFh->print( "#Sample\tCovered_Bases\tAT_Bases_Covered\tCG_Bases_Covered\tCpG_Bases_Covered\n" ); # Parse through each pair of WIG files provided and run calcRoiCovg as necessary my $wigFh = IO::File->new( $wig_list ); while( my $line = $wigFh->getline ) { next if( $line =~ m/^#/ ); chomp( $line ); my ( $sample, $wig_file ) = split( /\t/, $line ); $wig_file = '' unless( defined $wig_file ); print STDERR "Wiggle track format file for $sample not found: \"$wig_file\"\n" unless( -e $wig_file ); next unless( -e $wig_file ); # Use joinx to parse the WIG file and return per-ROI coverages of AT, CG (non-CpG), and CpG system( "joinx wig2bed -Zc $wig_file | joinx sort -s | joinx intersect -F \"I A3\" $roi_bed - | joinx ref-stats - $ref_seq | cut -f 1-7 > $roi_covg_dir/$sample.covg" );# or die "Failed to run joinx to calculate per-gene coverages in $sample! $!\n"; # Read the joinx formatted coverage file and count covered bases per gene my %geneCovg = (); my $roiCovgFh = IO::File->new( "$roi_covg_dir/$sample.covg" ); while( my $line = $roiCovgFh->getline ) { chomp( $line ); if( $line !~ m/^#/ ) { my ( undef, undef, undef, $gene, $at_covd, $cg_covd, $cpg_covd ) = split( /\t/, $line ); $geneCovg{$gene}{covd} += ( $at_covd + $cg_covd + $cpg_covd ); $geneCovg{$gene}{at} += $at_covd; $geneCovg{$gene}{cg} += $cg_covd; $geneCovg{$gene}{cpg} += $cpg_covd; } } $roiCovgFh->close; # Write the per-gene coverages to a file named after this sample_name my $geneCovgFh = IO::File->new( "$gene_covg_dir/$sample.covg", ">" ); $geneCovgFh->print( "#Gene\tLength\tCovered\tAT_covd\tCG_covd\tCpG_covd\n" ); foreach my $gene ( sort keys %geneLen ) { if( defined $geneCovg{$gene} ) { $geneCovgFh->print( join( "\t", $gene, $geneLen{$gene}, $geneCovg{$gene}{covd}, $geneCovg{$gene}{at}, $geneCovg{$gene}{cg}, $geneCovg{$gene}{cpg} ), "\n" ); } else { $geneCovgFh->print( "$gene\t" . $geneLen{$gene} . "\t0\t0\t0\t0\n" ); } } $geneCovgFh->close; # Measure coverage stats on the merged ROI file, so that bps across the genome are not counted twice my $merged_roi_bed_covg = Genome::Sys->create_temp_file_path(); system( "joinx wig2bed -Zc $wig_file | joinx sort -s | joinx intersect $merged_roi_bed - | joinx ref-stats - $ref_seq | cut -f 1-6 > $merged_roi_bed_covg" );# or die "Failed to run joinx to calculate overall coverages in $sample! $!\n"; # Read the joinx formatted coverage file and sum up the coverage stats per region my ( $tot_covd, $tot_at_covd, $tot_cg_covg, $tot_cpg_covd ); my $totRoiCovgFh = IO::File->new( $merged_roi_bed_covg ); while( my $line = $totRoiCovgFh->getline ) { chomp( $line ); if( $line !~ m/^#/ ) { my ( $chr, $start, $stop, $at_covd, $cg_covd, $cpg_covd ) = split( /\t/, $line ); $tot_covd += ( $at_covd + $cg_covd + $cpg_covd ); $tot_at_covd += $at_covd; $tot_cg_covg += $cg_covd; $tot_cpg_covd += $cpg_covd; } } $totRoiCovgFh->close; $totCovgFh->print( "$sample\t$tot_covd\t$tot_at_covd\t$tot_cg_covg\t$tot_cpg_covd\n" ); } $wigFh->close; $totCovgFh->close; return 1; } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/PathScan000755000765000024 012013522176 24744 5ustar00nnutterstaff000000000000Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/PathScan/CombinePvals.pm000444000765000024 10743512013522176 30073 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::PathScan::CombinePvals; #__STANDARD PERL PACKAGES use strict; use Carp; use Statistics::Distributions; #__CONSTANT OF PI -- NEEDED IN RAMANUJAN APPROX FOR POISSON PROBABILITY MASSES # (SEE "PATHSCAN TEST" NOTES PP 29-31) # use constant PI => 4*atan2 1, 1; # use constant LOG_PI_OVER_2 => log (PI) / 2; ################################################################################ ## ## ## I N T R O D U C T O R Y P O D D O C U M E N T A T I O N ## ## ## ################################################################################ =head1 NAME CombinePvals - combining probabilities from independent tests of significance into a single aggregate figure =head1 SYNOPSIS use CombinePvals; my $obj = CombinePvals->new ($reference_to_list_of_pvals); my $pval = $obj->method_name; my $pval = $obj->method_name (@arguments); =head1 DESCRIPTION There are a variety of circumstances under which one might have a number of different kinds of tests and/or separate instances of the same kind of test for one particular null hypothesis, where each of these tests returns a p-value. The problem is how to properly condense this list of probabilities into a single value so as to be able to make a statistical inference, e.g. whether to reject the null hypothesis. This problem was examined heavily starting about the 1930s, during which time numerous mathematical contintencies were treated, e.g. dependence vs. independence of tests, optimality, inter-test weighting, computational efficiency, continuous vs. discrete tests and combinations thereof, etc. There is quite a large mathematical literature on this topic (see L below) and any one particular situation might incur some of the above subtleties. This package concentrates on some of the more straightforward scenarios, furnishing various methods for combining p-vals. The main consideration will usually be the trade-off between the exactness of the p-value (according to strict frequentist modeling) and the computational efficiency, or even its actual feasibility. Tests should be chosen with this factor in mind. Note also that this scenario of combining p-values (many tests of a single hypothesis) is fundamentally different from that where a given hypothesis is tested multiple times. The latter instance usually calls for some method of multiple testing correction. =head1 REFERENCES Here is an abbreviated list of the substantive works on the topic of combining probabilities. =over =item * Birnbaum, A. (1954) I, Journal of the American Statistical Association B<49>(267), 559-574. =item * David, F. N. and Johnson, N. L. (1950) I, Biometrika B<37>(1/2), 42-49. =item * Fisher, R. A. (1958) I, 13-th Ed. Revised, Hafner Publishing Co., New York. =item * Lancaster, H. O. (1949) I, Biometrika B<36>(3/4), 370-382. =item * Littell, R. C. and Folks, J. L. (1971) I, Journal of the American Statistical Association B<66>(336), 802-806. =item * Pearson, E. S. (1938) I, Biometrika B<30>(12), 134-148. =item * Pearson, E. S. (1950) I, Biometrika B<37>(3/4), 383-398. =item * Pearson, K. (1933) I Biometrika B<25>(3/4), 379-410. =item * Van Valen, L. (1964) I, Nature B<201>(4919), 642. =item * Wallis, W. A. (1942) I, Econometrica B<10>(3/4), 229-248. =item * Zelen, M. and Joel, L. S. (1959) I, Annals of Mathematical Statistics B<30>(4), 885-895. =back =head1 AUTHOR Michael C. Wendl S Copyright (C) 2009 Washington University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. =head1 GENERAL REMARKS ON METHODS The available methods are listed below. Each of computational techniques assumes that tests, as well as their associated p-values, are independent of one another and none considers any form of differential weighting. =cut ################################################################################ ## ## ## P R O G R A M M E R N O T E S ## ## ## ################################################################################ # # The obj schematic resembles: # # $obj = { # # #__PROBABILITY VALUES FOR THE INDIVIDUAL TESTS # pvals => [0.103, 0.078, 0.03, 0.2,...], # # #__PRODUCT OF THE INDIVIDUAL PROBABILITY VALUES # big_q => 0.103 * 0.078 * 0.03 * 0.2 * ..., # # #__THE ACTUAL NUMBER OF TESTS (SAVED FOR CONVENIENCE) # num_tests = integer, # # #__BINOMIAL COEFFICIENTS # # this will only be defined when passing multiple lists of genes, i.e. # for the approximate "binning" solution - we only define the symmetric # half of pascal's triangle # # binom_coeffs => [[1], [1], [1,2], [1,3], [1,4,6], [1,5,10], ....], # # }; ################################################################################ ## ## ## P U B L I C M E T H O D S ## ## ## ################################################################################ =head1 CONSTRUCTOR METHODS These methods return an object in the CombinePvals class. =cut ################################ # BEGIN: CONSTRUCTOR METHODS # ################################ # === # NEW create a new object # === ~~~~~~~~~~~~~~~~~~~ =head2 new This is the usual object constructor, which takes a mandatory, but otherwise un-ordered (reference to a) list of the p-values obtained by a set of independent tests. my $obj = CombinePvals->new ([0.103, 0.078, 0.03, 0.2,...]); The method checks to make sure that all elements are actual p-values, i.e. they are real numbers and they have values bounded by 0 and 1. =cut sub new { my $class = shift; my ($pvals) = @_; #__OBJECT TEMPLATE my $self = {}; #__PROCESS PVALS IF THEY'RE SPECIFIED if (defined $pvals && $pvals) { #__MAKE SURE THIS IS A LIST croak "argument must be list reference" unless ref $pvals eq "ARRAY"; #__SAVE LIST $self->{'pvals'} = $pvals; #__PROCESS THE INPUT my ($big_q, $num_tests) = (1, 0); foreach my $pval (@{$pvals}) { #__MAKE SURE THIS IS A PVAL croak "'$pval' is not a p-val" unless &is_a_pval ($pval); #__COUNT NUMBER OF TESTS $num_tests++; #__SAVE THE PRODUCT --- THIS IS THE DERIVED TEST STATISTIC $big_q *= $pval; } $self->{'big_q'} = $big_q; $self->{'num_tests'} = $num_tests; #__OTHERWISE CROAK } else { croak "must specify a list of pvals as an argument"; } #__BLESS INTO CLASS AND RETURN OBJECT bless $self, $class; return $self; } ############################## # END: CONSTRUCTOR METHODS # ############################## =head1 EXACT ENUMERATIVE PROCEDURES FOR STRICTLY DISCRETE DISTRIBUTIONS When all the individual p-vals are derived from tests based on discrete distributions, the "standard" continuum methods cannot be used in the strictest sense. Both Wallis (1942) and Lancaster (1949) discuss the option of full enumeration, which will only be feasible when there are a limited number of p-values and their range is not too large. Feasibility experiments are suggested, depending upon the type of hardware and size of calculation. =cut # Again, these methods do some rudimentary checking, but the calling # program is responsible for making sure all elements are actual # p-values, i.e. real numbers, have values bounded by 0 and 1, # etc. # They are also responsible for making sure all p-values are # listed in decreasing order of extremity, as illustrated # below. #################################################################### # BEGIN: EXACT ENUMERATIVE PROCEDURES FOR DISCRETE DISTRIBUTIONS # #################################################################### # ==================== # EXACT ENUM ARBITRARY # ==================== # # exact enumerative solution for a set of p-values obtained from an # arbitrary set of not-necessarily-the-same *discrete* distributions =head2 exact_enum_arbitrary This routine is designed for combining p-values from completely arbitrary discrete probability distributions. It takes a list-of-lists data structure, each list being the probability tails I (i.e. as a probability cummulative density function) associated with each individual test. However, the ordering of the lists themselves is not important. For instance, Wallis (1942) gives the example of two binomials, a one-tailed test having tail values of 0.0625, 0.3125, 0.6875, 0.9375, and 1, and a two-tailed test having tail values 0.125, 0.625, and 1. We would then call this method using my $pval = $obj->exact_enum_arbitrary ( [0.0625, 0.3125, 0.6875, 0.9375, 1], [0.125, 0.625, 1] ); The internal computational method is relatively straightforard and described in detail by Wallis (1942). Note that this method does "all-by-all" multiplication, so it is the least efficient, although entirely exact. =cut sub exact_enum_arbitrary { my $obj = shift; my (@pvals_lists) = @_; my $pval = 0; #__NUMBER OF LISTS SHOULD BE SAME AS NUMBER OF PVALS PASSED TO CONSTRUCTOR my $num_lists = scalar @pvals_lists; croak "number of lists passed ($num_lists) not equal to number of tests" . "in 'new' constructor ($obj->{'num_tests'})" unless $num_lists == $obj->{'num_tests'}; #__CHECK LIST INPUT my $list_num = 0; foreach my $list (@pvals_lists) { $list_num++; my $previous_pval = 0; foreach my $test_pval (@{$list}) { #__MAKE SURE THIS IS A PVAL croak "'$test_pval' in distribution $list_num is not a p-val" unless &is_a_pval ($test_pval); #__MAKE SURE THIS PVAL IS LARGER THAN PREVIOUS ONE: I.E. THIS IS A C.D.F. croak "distribution $list_num is not in ascending order (not a CDF)" unless $test_pval > $previous_pval; #__RESET $previous_pval = $test_pval; } } #__COMBINE INDIVIDUAL-TEST-P-VALS INTO A SINGLE P-VAL $pval = $obj->_recursive_exact_enum_arbitrary ([@pvals_lists], 0, 1, 1); #__RETURN P-VAL return $pval; } # ==================== # EXACT ENUM IDENTICAL # ==================== # # exact enumerative solution for a set of p-values obtained from one, # or rather, a set of identical *discrete* distributions =head2 exact_enum_identical This routine is designed for combining a set of p-values that all come from a single probability distribution. NOT IMPLEMENTED YET =cut ################################################################## # END: EXACT ENUMERATIVE PROCEDURES FOR DISCRETE DISTRIBUTIONS # ################################################################## =head1 TRANSFORMS FOR CONTINUOUS DISTRIBUTIONS The mathematical literature furnishes several straightforward options for combining p-vals if all of the distributions underlying all of the individual tests are continuous. =cut #################################################### # BEGIN: TRANSFORMS FOR CONTINUOUS DISTRIBUTIONS # #################################################### # =========================== # FISHER CHI-SQUARE TRANSFORM # =========================== # # Fisher's solution using the chi-square transform, valid strictly for # continuum distributions, but can be used approximately for discrete # distributions. Accuracy increases with the support of the distributions. =head2 fisher_chisq_transform This routine implements R.A. Fisher's (1958, originally 1932) chi-square transform method for combining p-vals from continuous distributions, which is essentially a CPU-efficient approximation of K. Pearson's log-based result (see e.g. Wallis (1942) pp 232). Note that the underlying distributions are not actually relevant, so no arguments are passed. my $pval = $obj->fisher_chisq_transform; This is certainly the fastest and easiest method for combining p-vals, but its accuracy for discrete distributions will not usually be very good. For such cases, an exact or a corrected method are better choices. =cut sub fisher_chisq_transform { my $obj = shift; #__GO THROUGH LIST OF INDIVIDUAL-TEST-P-VALS ACCUMULATING FISHER'S LOG # TRANSFORM TO CHI-SQUARE STATISTIC # my $chisq = 0; # foreach my $test_pval (@{$obj->{'pvals'}}) { #### $chisq += - 2 * log ($test_pval); # $chisq -= 2 * log ($test_pval); # } #__WALLIS (1942) MAKES THIS CLEVER SIMPLIFICATION my $chisq = -2 * log ($obj->{'big_q'}); #__TRANSFORM: TWICE THE DEGREES OF FREEDOM my $dof = 2 * $obj->{'num_tests'}; #__NOW GET P-VAL FROM A CHI-SQUARE TEST my $pval = Statistics::Distributions::chisqrprob ($dof, $chisq); #__RETURN P-VAL return $pval; } ################################################## # END: TRANSFORMS FOR CONTINUOUS DISTRIBUTIONS # ################################################## =head1 CORRECTION PROCEDURES FOR DISCRETE DISTRIBUTIONS: LANCASTER'S MODELS Enumerative procedures quickly become infeasible if the number of tests and/or the support of each test grow large. A number of procedures have been described for correcting the methodologies designed for continuum testing, mostly in the context of applying so-called continuity corrections. Essentially, these seek to "spread" dicrete data out into a pseudo-continuous configuration as appropriate as possible, and then apply standard transforms. Accuracy varies and should be suitably established in each case. The methods in this section are due to H.O. Lancaster (1949), who discussed two corrections based upon the idea of describing how a chi-square transformed statistic varies between the points of a discrete distribution. Unfortunately, these methods require one to pass some extra information to the routines, i.e. not only the CDF (the p-val of each test), but the CDF value associated with the next-most-extreme statistic. These two pieces of information are the basis of interpolating. For example, if an underlying distribution has the possible tail values of 0.0625, 0.3125, 0.6875, 0.9375, 1 and the test itself has a value of 0.6875, then you would pass I 0.3125 I 0.6875 to the routine. I While there generally will be some extra inconvenience in obtaining this information, the accuracy is much improved over Fisher's method. =cut # PROGRAMMING NOTE ON LANCASTER'S METHODS # # Each method differs substantively by only a few lines of code, so there # are a lot of extra lines here that are required to offer the user 3 # individually-named methods. This should be fixed when time permits, for # example, perhaps pass the name of the correction method as an argument too. ################################################################################ # BEGIN: CORRECTION PROCEDURES FOR DISCRETE DISTRIBUTIONS: LANCASTER'S MODELS # ################################################################################ # ========================================================== # LANCASTER'S MEAN-CONTINUITY-CORRECTED CHI-SQUARE TRANSFORM # ========================================================== =head2 lancaster_mean_corrected_transform This method is based on the mean value of the chi-squared transformed statistic. my $pval = $obj->lancaster_mean_corrected_transform (@cdf_pairs); Its accuracy is good, but the method is not strictly defined if one of the tests has either the most extreme or second-to-most-extreme statistic. =cut sub lancaster_mean_corrected_transform { my $obj = shift; my (@fxm1_and_fx_pvals) = @_; #__NUMBER OF CDF PAIRS SHOULD BE SAME AS NUMBER OF PVALS PASSED TO CONSTRUCTOR my $num_lists = scalar @fxm1_and_fx_pvals; croak "number of pairs passed ($num_lists) not equal to number of tests" . "in 'new' constructor ($obj->{'num_tests'})" unless $num_lists == $obj->{'num_tests'}; #__ACCUMULATE LANCASTER'S MEAN CHI-SQUARED STATISTIC my ($chisq, $list_num) = (0, 0); foreach my $fxm1_and_fx_pair (@fxm1_and_fx_pvals) { $list_num++; my ($fxm1, $fx) = @{$fxm1_and_fx_pair}; #__MAKE SURE BOTH ARE PVALS croak "'$fxm1' in cdf pair $list_num is not a p-val" unless &is_a_pval ($fxm1); croak "'$fx' in cdf pair $list_num is not a p-val" unless &is_a_pval ($fx); #__MAKE SURE THEY'RE ORDERED AS EXPECTED croak "cdf pair $list_num is not in ascending order" unless $fx > $fxm1; #__MEAN CORECTION $chisq += 2 * (1 - ($fx * log($fx) - $fxm1 * log($fxm1))/($fx - $fxm1)); } #__TRANSFORM: TWICE THE DEGREES OF FREEDOM my $dof = 2 * $obj->{'num_tests'}; #__NOW GET P-VAL FROM A CHI-SQUARE TEST my $pval = Statistics::Distributions::chisqrprob ($dof, $chisq); #__RETURN P-VAL return $pval; } # ============================================================ # LANCASTER'S MEDIAN-CONTINUITY-CORRECTED CHI-SQUARE TRANSFORM # ============================================================ =head2 lancaster_median_corrected_transform This method is based on the median value of the chi-squared transformed statistic. my $pval = $obj->lancaster_median_corrected_transform (@cdf_pairs); Its accuracy may sometimes be not quite as good as when using the average, but the method is strictly defined for I values of the statistic. =cut sub lancaster_median_corrected_transform { my $obj = shift; my (@fxm1_and_fx_pvals) = @_; #__NUMBER OF CDF PAIRS SHOULD BE SAME AS NUMBER OF PVALS PASSED TO CONSTRUCTOR my $num_lists = scalar @fxm1_and_fx_pvals; croak "number of pairs passed ($num_lists) not equal to number of tests" . "in 'new' constructor ($obj->{'num_tests'})" unless $num_lists == $obj->{'num_tests'}; #__ACCUMULATE LANCASTER'S MEAN CHI-SQUARED STATISTIC my ($chisq, $list_num) = (0, 0); foreach my $fxm1_and_fx_pair (@fxm1_and_fx_pvals) { $list_num++; my ($fxm1, $fx) = @{$fxm1_and_fx_pair}; #__MAKE SURE BOTH ARE PVALS croak "'$fxm1' in cdf pair $list_num is not a p-val" unless &is_a_pval ($fxm1); croak "'$fx' in cdf pair $list_num is not a p-val" unless &is_a_pval ($fx); #__MAKE SURE THEY'RE ORDERED AS EXPECTED croak "cdf pair $list_num is not in ascending order" unless $fx > $fxm1; #__MEDIAN CORRECTION if ($fxm1) { $chisq -= 2 * log (($fx + $fxm1)/2); } else { $chisq += 2 * (1 - log ($fx)); } } #__TRANSFORM: TWICE THE DEGREES OF FREEDOM my $dof = 2 * $obj->{'num_tests'}; #__NOW GET P-VAL FROM A CHI-SQUARE TEST my $pval = Statistics::Distributions::chisqrprob ($dof, $chisq); #__RETURN P-VAL return $pval; } # =========================================================== # LANCASTER'S MIXED-CONTINUITY-CORRECTED CHI-SQUARE TRANSFORM # =========================================================== =head2 lancaster_mixed_corrected_transform This method is a mixture of both the mean and median methods. Specifically, mean correction is used wherever it is well-defined, otherwise median correction is used. my $pval = $obj->lancaster_mixed_corrected_transform (@cdf_pairs); This will be a good way to handle certain cases. =cut sub lancaster_mixed_corrected_transform { my $obj = shift; my (@fxm1_and_fx_pvals) = @_; #__NUMBER OF CDF PAIRS SHOULD BE SAME AS NUMBER OF PVALS PASSED TO CONSTRUCTOR my $num_lists = scalar @fxm1_and_fx_pvals; croak "number of pairs passed ($num_lists) not equal to number of tests" . "in 'new' constructor ($obj->{'num_tests'})" unless $num_lists == $obj->{'num_tests'}; #__ACCUMULATE LANCASTER'S MEAN CHI-SQUARED STATISTIC my ($chisq, $list_num) = (0, 0); foreach my $fxm1_and_fx_pair (@fxm1_and_fx_pvals) { $list_num++; my ($fxm1, $fx) = @{$fxm1_and_fx_pair}; #__MAKE SURE BOTH ARE PVALS croak "'$fxm1' in cdf pair $list_num is not a p-val" unless &is_a_pval ($fxm1); croak "'$fx' in cdf pair $list_num is not a p-val" unless &is_a_pval ($fx); #__MAKE SURE THEY'RE ORDERED AS EXPECTED # # NOTE: WE ALLOW FOR EQUIVALENCE OF ADJACENT VALUES (WITHIN FLOATING-POINT # PRECISION) FOR THOSE CASES WHERE THE CDF IS LOCALLY EXTREMELY # ASYMPTOTIC croak "cdf pair $list_num is not in ascending order" if $fx < $fxm1; # unless $fx > $fxm1; #__NO CORRECTION NEEDED IF VALS IDENTICAL WITHIN FLOATING-POINT PRECISION if ($fx == $fxm1) { $chisq -= 2 * log ($fx); # Fisher #__ELSE APPLY LANCASTER'S CONTINUITY CORRECTION (MIXED DEPENDING UPON VALS) } else { #__USE LANCASTER'S MEAN IF POSSIBLE if ($fxm1) { $chisq += 2 * (1 - ($fx * log($fx) - $fxm1 * log($fxm1))/($fx - $fxm1)); #__OTHERWISE USE LANCASTER'S MEDIAN } else { $chisq += 2 * (1 - log ($fx)); } } } #__TRANSFORM: TWICE THE DEGREES OF FREEDOM my $dof = 2 * $obj->{'num_tests'}; #__NOW GET P-VAL FROM A CHI-SQUARE TEST my $pval = Statistics::Distributions::chisqrprob ($dof, $chisq); #__RETURN P-VAL return $pval; } ############################################################################### # END: CORRECTION PROCEDURES FOR DISCRETE DISTRIBUTIONS: LANCASTER'S MODELS # ############################################################################### ################################################################################ ## ## ## S E M I - P R I V A T E M E T H O D S ## ## ## ## methods that are not ordinarily called externally but can be if needed ## ## because they are cast according to the object-oriented interface ## ## ## ################################################################################ =head2 additional methods The basic functionality of this package is encompassed in the methods described above. However, some lower-level functions can also sometimes be useful. =cut # ====================== # EXACT_ENUM_ARBITRARY_2 2-distrib precursor of exact_enum_arbitrary # ====================== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ =head3 exact_enum_arbitrary_2 Hard-wired precursor of I for 2 distributions. Does no pre-checking, but may be useful for comparing to the output of the general program. =cut sub exact_enum_arbitrary_2 { my $obj = shift; my (@pvals_lists) = @_; my $pval = 0; #__NUMBER OF LISTS SHOULD BE SAME AS NUMBER OF PVALS PASSED TO CONSTRUCTOR my $num_lists = scalar @pvals_lists; croak "number of lists passed ($num_lists) not equal to number of tests" . "in 'new' constructor ($obj->{'num_tests'})" unless $num_lists == $obj->{'num_tests'}; croak "you must have exactly 2 tests" unless $obj->{'num_tests'} == 2; #__TWO-LIST SPECIAL CASE my $list1 = $pvals_lists[0]; my $list2 = $pvals_lists[1]; #__TRAVERSE LIST 1 for (my $i = 0; $i <= $#{$list1}; $i++) { #__TAIL VALUE AND THE PROBABILITY OF THIS TAIL VALUE my $ptail1 = $list1->[$i]; my $probability_of_ptail1; if ($i > 0) { $probability_of_ptail1 = $list1->[$i] - $list1->[$i-1]; } else { $probability_of_ptail1 = $list1->[$i]; } #__TRAVERSE LIST 2 for (my $j = 0; $j <= $#{$list2}; $j++) { #__TAIL VALUE AND THE PROBABILITY OF THIS TAIL VALUE my $ptail2 = $list2->[$j]; my $probability_of_ptail2; if ($j > 0) { $probability_of_ptail2 = $list2->[$j] - $list2->[$j-1]; } else { $probability_of_ptail2 = $list2->[$j]; } #__PRODUCT OF TAIL VALUES AND THE PROBABILITY OF THIS PRODUCT my $product_tail_pval = $ptail1 * $ptail2; my $probability_of_product_tail_pval = $probability_of_ptail1 * $probability_of_ptail2; #__TALLY TO RESULTANT COMPOUND P-VAL IF SIGNIFICANT $pval += $probability_of_product_tail_pval if $product_tail_pval <= $obj->{'big_q'}; } } #__RETURN P-VAL return $pval; } # ====================== # EXACT_ENUM_ARBITRARY_3 3-distrib precursor of exact_enum_arbitrary # ====================== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ =head3 exact_enum_arbitrary_3 Hard-wired precursor of I for 3 distributions. Does no pre-checking, but may be useful for comparing to the output of the general program. =cut sub exact_enum_arbitrary_3 { my $obj = shift; my (@pvals_lists) = @_; my $pval = 0; #__NUMBER OF LISTS SHOULD BE SAME AS NUMBER OF PVALS PASSED TO CONSTRUCTOR my $num_lists = scalar @pvals_lists; croak "number of lists passed ($num_lists) not equal to number of tests" . "in 'new' constructor ($obj->{'num_tests'})" unless $num_lists == $obj->{'num_tests'}; croak "you must have exactly 3 tests" unless $obj->{'num_tests'} == 3; #__THREE-LIST SPECIAL CASE my $list1 = $pvals_lists[0]; my $list2 = $pvals_lists[1]; my $list3 = $pvals_lists[2]; #__TRAVERSE LIST 1 for (my $i = 0; $i <= $#{$list1}; $i++) { #__TAIL VALUE AND THE PROBABILITY OF THIS TAIL VALUE my $ptail1 = $list1->[$i]; my $probability_of_ptail1; if ($i > 0) { $probability_of_ptail1 = $list1->[$i] - $list1->[$i-1]; } else { $probability_of_ptail1 = $list1->[$i]; } #__TRAVERSE LIST 2 for (my $j = 0; $j <= $#{$list2}; $j++) { #__TAIL VALUE AND THE PROBABILITY OF THIS TAIL VALUE my $ptail2 = $list2->[$j]; my $probability_of_ptail2; if ($j > 0) { $probability_of_ptail2 = $list2->[$j] - $list2->[$j-1]; } else { $probability_of_ptail2 = $list2->[$j]; } #__TRAVERSE LIST 3 for (my $k = 0; $k <= $#{$list3}; $k++) { #__TAIL VALUE AND THE PROBABILITY OF THIS TAIL VALUE my $ptail3 = $list3->[$k]; my $probability_of_ptail3; if ($k > 0) { $probability_of_ptail3 = $list3->[$k] - $list3->[$k-1]; } else { $probability_of_ptail3 = $list3->[$k]; } #__PRODUCT OF TAIL VALUES AND THE PROBABILITY OF THIS PRODUCT my $product_tail_pval = $ptail1 * $ptail2 * $ptail3; my $probability_of_product_tail_pval = $probability_of_ptail1 * $probability_of_ptail2 * $probability_of_ptail3; #__TALLY TO RESULTANT COMPOUND P-VAL IF SIGNIFICANT $pval += $probability_of_product_tail_pval if $product_tail_pval <= $obj->{'big_q'}; } } } #__RETURN P-VAL return $pval; } # ============ # BINOM COEFFS calculate binomial coefficients # ============ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ =head3 binom_coeffs Calculates the binomial coefficients needed in the binomial (convolution) approximate solution. $pmobj->binom_coeffs; The internal data structure is essentially the symmetric half of the appropriately-sized Pascal triangle. Considerable memory is saved by not storing the full triangle. =cut ### This is called automatically, if necessary, before the probability ### calculation, so there is not typically a need to call it ### manually. sub binom_coeffs { my $obj = shift; croak "need to know most populous list first" unless defined $obj->{'most_populous_list'}; carp "already have binomial coefficients" if defined $obj->{'binom_coeffs'}; #__SET-UP FOR COEFFICIENTS my $bin_coeffs = [1]; $obj->{'binom_coeffs'} = []; push (@{$obj->{'binom_coeffs'}}, $bin_coeffs); #__CALCULATE COEFFICIENTS UP TO THAT REQUIRED BY THE MOST POPULOUS LIST for (my $i = 1; $i <= $obj->{'most_populous_list'}; $i++) { $bin_coeffs = &next_bin_coeff_row ($i, $bin_coeffs); push (@{$obj->{'binom_coeffs'}}, $bin_coeffs); } } ################################################################################ ## ## ## M E T H O D S M E A N T T O B E P R I V A T E ## ## ## ## methods that cannot be called in a contextually meaningful way from an ## ## external application using the object-oriented interface ## ## ## ################################################################################ # ========================================================================== # ROUTINE FOR DETERMINING WHETHER A VARIABLE REPRESENTS A LEGITIMATE P-VALUE # ========================================================================== sub is_a_pval { my ($val) = @_; # print "VAL IS '$val'\n"; #__MUST BE A FLOAT (REGEXP: PERL COOKBOOK CHAP 2.1) & MUST BE BOUNDED BY 0 AND 1 if ($val =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/ && $val >= 0 && $val <= 1) { return 1; ################## # if ($val =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/) { # print "VAL '$val' IS REAL\n"; # if ($val >= 0) { # print " VAL '$val' >= 0\n"; # if ($val <= 1) { # print " VAL '$val' <= 1\n"; # return 1; # } else { # print " VAL '$val' NOT <= 1\n"; # # chop $val; # # if ($val <= 1) { # # print " VAL '$val' NOW <= 1\n"; # # } else { # # print " VAL '$val' STILL NOT <= 1\n"; # # } # return 0; # } # } else { # print " VAL '$val' NOT >= 0\n"; # return 0; # } ################## #__ELSE IT IS NOT A PVAL } else { # print "VAL '$val' IS NOT REAL\n"; return 0; } } # ================================================================ # RECURSIVE EXACT PVAL CALCULATION FOR ARITRARY TEST DISTRIBUTIONS # ================================================================ sub _recursive_exact_enum_arbitrary { my $obj = shift; my ($list_of_pvals_lists, $prev_recursion_level, $product_tail_pval, $probability_of_product_tail_pval) = @_; #__THE CURRENT LIST my $current_list = $prev_recursion_level; my $list = $list_of_pvals_lists->[$current_list]; #__THE CURRENT LEVEL OF RECURSION my $curr_recursion_level = $prev_recursion_level + 1; my $local_pval = 0; #__LOOP AT THE CURRENT RECURSION LEVEL for (my $i = 0; $i <= $#{$list}; $i++) { #__TAIL VALUE AND THE PROBABILITY OF THIS TAIL VALUE my $ptail = $list->[$i]; my $probability_of_ptail; if ($i > 0) { $probability_of_ptail = $list->[$i] - $list->[$i-1]; } else { $probability_of_ptail = $list->[$i]; } #__RECURSE FURTHER IF NECESSARY if ($curr_recursion_level < $obj->{'num_tests'}) { #__RECURSE AND ACCUMULATE P-VAL CONTRIBUTIONS $local_pval += $obj->_recursive_exact_enum_arbitrary ( $list_of_pvals_lists, $curr_recursion_level, $product_tail_pval * $ptail, $probability_of_product_tail_pval * $probability_of_ptail ); #__ELSE WE'RE "AT THE BOTTOM" SO TAKE THE NECESSARY PRODUCT } else { #__FINAL PRODUCTS my $local_product_tail_pval = $product_tail_pval * $ptail; my $local_probability_of_product_tail_pval = $probability_of_product_tail_pval * $probability_of_ptail; #__TALLY IF CONDITION IS SATISFIED if ($local_product_tail_pval <= $obj->{'big_q'}) { $local_pval += $local_probability_of_product_tail_pval; #__ELSE SKIP REST OF THIS DISTRIBUTION CUZ SUCCEEDING VALS ARE ALL LARGER } else { last; } } } #__RETURN RESULT TO THE ANTECEDENT LEVEL return $local_pval; } # ================== # NEXT BIN COEFF ROW compute 1/2 row i of binomial coefficients given row i-1 # ================== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sub next_bin_coeff_row { my ($i, $im1_row) = @_; #__FIRST ELEMENT I,0 IN ALL ROWS DEFINED AS UNITY my $next_row = [1]; #__IF I > 1 COMPUTE REST OF NEXT (I) ROW USING PASCAL TRIANGLE ON PREV (I-1) ROW if ($i > 1) { #__COMPUTE STOPPING POINT BASED ON SYMMETRY my $i_mirror = $i / 2; my $i_end = int $i_mirror; #__FILL IN INTERMEDIATE ELEMENTS I,1 TO I,I_END USING PASCALS TRIANGLE for (my $iposition = 1; $iposition <= $i_end; $iposition++) { #__START WITH "LEFT SIDE" VAL OF PREVIOUS ROW my $element = $im1_row->[$iposition-1]; #__COMPUTE NEXT ELEMENT USING THE PASCAL TRIANGLE METHOD if ($iposition == $i_mirror) { $element += $im1_row->[$iposition-1]; } else { $element += $im1_row->[$iposition]; } #__SAVE push (@{$next_row}, $element); } } #__RETURN LIST OF HALF SYMETRIC NEXT ROW OF BINOMIAL COEFFICIENTS return $next_row; } # ======== # BINCOEFF return binomial coefficient using 1/2 symetric stored table # ======== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # RETURNS C_{$i_top, $k_bottom} sub bincoeff { my $obj = shift; my ($i_top, $k_bottom) = @_; #__SYMMETRY CUTOFF IS HALF THE VALUE OF THE I'TH ROW my $i_half = $i_top / 2; #__IF I,K IS WITHIN THE STORED SYMMETRIC HALF OF TRIANGLE THEN SIMPLY RETURN VAL if ($k_bottom <= $i_half) { return $obj->{'binom_coeffs'}->[$i_top]->[$k_bottom]; #__ELSE COMPUTE SYMMETRIC REFLECTION OF NON-STORED COMPONENT AND RETURN THAT VAL } else { my $k_reflect = $i_top - $k_bottom; return $obj->{'binom_coeffs'}->[$i_top]->[$k_reflect]; } } ################################################################################ ## ## ## T R A I L I N G P O D D O C U M E N T A T I O N ## ## ## ################################################################################ ################################################################################ ## ## ## - E N D - ## ## ## ################################################################################ 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/PathScan/PathScan.pm000444000765000024 13161312013522176 27205 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::PathScan::PathScan; # DEBUG # print "USING LOCAL MCW VERSION OF REGULAR +/-\n"; # DEBUG #__STANDARD PERL PACKAGES use strict; use Carp; #__CONSTANT OF PI -- NEEDED IN RAMANUJAN APPROX FOR POISSON PROBABILITY MASSES # (SEE "PATH-SCAN TEST" NOTES PP 29-31) use constant PI => 4*atan2 1, 1; use constant LOG_PI_OVER_2 => log (PI) / 2; ################################################################################ ## ## ## I N T R O D U C T O R Y P O D D O C U M E N T A T I O N ## ## ## ################################################################################ =head1 NAME PathScan - the Path-Scan significance test for mutations in groups of putative cancer genes =head1 SYNOPSIS use PathScan; my $pmobj = PathScan->new ($list_of_gene_lengths); my $pval = $pmobj->path_scan ($actual_hits, $background_mutation_rate); =head1 DESCRIPTION This package calculates the so-called path-scan statistic P-value for sets of putative cancer genes under the null hypothesis that somatic mutations found in data are the result of a random process characterized by the background mutation rate. This test is applied to, for example, a biologically-relevant group of genes, say all the genes in a particular pathway, for which somatic mutation data are available. A low p-value would imply that the null hypothesis should be rejected. In other words, the result suggests that the mutation configuration in this pathway is probably not the result of a strictly random process. =head2 Nature of the Path-Scan Test This statistic considers individual genes in a "binary" fashion, i.e. a gene is either mutated (has one or more mutations) or it is not mutated. I This is the "path-scan" aspect of the test. Why is such information discarded? The somatic background mutation rate is typically very small compared to the size of the average gene. Consequently, the expected number of mutations in any given gene is very low, much less than one, in fact. Under the null hypothesis, most genes will have no mutations. Genes with one (or just a few) may be interesting, but when many genes in a biologically-relevant group (say a pathway) have one (or just a few) mutations, that could be a sign of some underlying I process. In other words, this test is useful in cases where many genes in a group might each contribute a small component (i.e. a small fitness advantage) in the context of the disease process. What this test is not concerned with (and will I detect) is the case where a single, specific gene has a non-random association and it reflects this fact via a large number of mutations. Other single-gene tests should presumably flag such cases. The path-scan test should, therefore, be thought of as just one tool within a larger statistical "toolbox". =head2 Assumptions in the Test The main assumption is that a single background mutation rate applies to the set of genes of interest. That is, the rate does not vary among genes, among chromosomes (if more than one hosts genes of interest), etc. =head1 BUGS AND OPPORTUNITIES FOR EXTENSION Coefficients are recalculated for every individual test, but it would be good for these to persist between tests, adding more as necessary (i.e. if a subsequent test involves more genes than the current one). =head1 AUTHOR Michael C. Wendl S Copyright (C) 2007, 2008 Washington University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. =head1 METHODS The available methods are as follows. =cut ################################################################################ ## ## ## P R O G R A M M E R N O T E S ## ## ## ################################################################################ # # The obj schematic resembles: # # $obj = { # # #__LIST OF GENE LENGTHS (RAW DATA) # # one list-within-a-list implies doing the exact solution, while 2 or more # lists-within-a-list imply doing the approximate "binning" approach # # genes => [ # [3500, 4234, 5609, 4550, 10763, 9879,...], # [33500, 44234, 35609, 34550, 110763, 49879,...], # : # ], # # # #__CONTEXT: 0=EXACT OR >0 FOR APPROXIMATE # # this indicator is 0 in the exact context, but in the approximate context # it gives the number of lists that have been passed # # context_approx => 1, # # # #__EXPECTATION # # this is the expected number of mutated genes and is only used for the # asymptotic (Poisson) model --- it will computed whenever the gene sizes # are passed as a single list (i.e. as when we do the exact solution), but # it is only used for the Poisson solution # # expected_genes => 0.097, # # # #__TOTAL COUNT OF GENES (FOR CONVENIENCE) # # this is the total count of all genes, whether in a single list (exact) # context, or multiple lists (approximate "binning" context) # # num_genes => 35, # # # #__AVERAGE GENE LENGTHS AND SIZES OF LISTS # # these will only be defined when passing multiple lists of genes, i.e. # for the approximate "binning" context - each element in 'avg_lengths' and # 'list_sizes' give, respectively, the average length and the number of # genes in the corresponding list of gene raw data above # # avg_lengths => [5445.454, 48984.39, ...], # list_sizes => [21, 34, 32, 54, 19, ...], # # # #__TOTAL LENGTH OF ALL GENES CONCATENATED AND SIZE OF MOST POPULOUS LIST # # this is the total length of all genes, whether in a single list (exact) # context, or multiple lists (approximate "binning" context), and the number # of elements in the most populous list, respectively -- if there is just # one list, then the latter will be equivalent to the value in 'num_genes' # # total_length => 75983, # most_populous_list = 35, # # # #__LIST OF CORRESPONDING "MODIFIED" BERNOULLI "PROBABILITY OF FAILURE" VALUES # # in the exact-solution context, each element is the bernoulli probability # for the corresponding gene length in the single list-within-a-list (above) # but in the approximate-"binning"-solution-context, each value is the # bernoulli probability for the average gene length in the corresponding # list of genes, e.g. the zero-th value is for the zero-th list, the first # value for the first list, etc. - the code will interpret the context # automatically # # mpvals => [0.003596, 0.004243, 0.005625, 0.004560, 0.01082, 0.009928,...], # # # #__BACKGROUND MUTATION RATE I.E. PROBABILITY OF A MUTATION IN A GIVEN NUCLEO # mutation_prob => 0.000001, # # # #__BINOMIAL COEFFICIENTS # # this will only be defined when passing multiple lists of genes, i.e. # for the approximate "binning" solution - we only define the symmetric # half of pascal's triangle # # binom_coeffs => [[1], [1], [1,2], [1,3], [1,4,6], [1,5,10], ....], # # }; ################################################################################ ## ## ## P U B L I C M E T H O D S ## ## ## ################################################################################ # === # NEW create a new path-scan object # === ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ =head2 new This is the usual object constructor, which optionally takes all the gene-length data as input. If you want to use the exact probability solution or the asymptotic approximate solution, pass all lengths in a single list reference my $pmobj = PathScan->new ([3434, 54565, 6445, ...]); but if you want to use the convolution approximation method, divide the list of gene sizes into the desired number of bins and pass each of these as a reference my $pmobj = PathScan->new ([3434, 54565], [6445, ...]); In other words, the way you pass these arguments at partially determines the context in which you will obtain your P-value for this set of genes. The latter choice is typically betetr, as it gives good accuracy and good computational efficiency. Conversely, the exact solution is identically correct, but can be difficult to compute. The asymptotic approximation is always computationally efficient, but not necessarily accurate for small test sets. =cut sub new { my $class = shift; my (@list_of_gene_length_lists) = @_; #__OBJECT TEMPLATE my $self = {}; #__PROCESS GENE LENGTH DATA IF SPECIFIED &store_genes ($self, @list_of_gene_length_lists) if scalar @list_of_gene_length_lists; #__BLESS INTO CLASS AND RETURN OBJECT bless $self, $class; return $self; } # ========== # PATH SCAN path-scan (tail) probability value # ========== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ =head2 path_scan This function calculates the path-scan statistic in one of the appropriate contexts (exact or convolution approximation, as described above). It takes the actual number of "hits" you've observed in the data, i.e. the number of genes that have a mutated status. my $pval = $pmobj->path_scan (7); If you have not yet done the pre-processing with respect to the background mutation rate (see below), then pre-processing can be executed implicitly by passing the rate as the second argument. my $pval = $pmobj->path_scan (7, 0.000001); =cut sub path_scan { my $obj = shift; my ($actual_hits, $mutation_prob) = @_; #__SOME BASIC CHECKS croak "argument list: '$actual_hits' must be an integer" unless $actual_hits =~ /^-?\d+$/; croak "argument list: '$actual_hits' cant be negative" unless $actual_hits >= 0; croak "argument list: '$actual_hits' cant exceed number of " . "genes '$obj->{'num_genes'}'" if $actual_hits > $obj->{'num_genes'}; #__PREPROCESS IF NECESSARY if ($mutation_prob) { $obj->preprocess ($mutation_prob); } #__CALCULATE BINOMIAL COEFFICIENTS IF NECESSARY if ($obj->{'context_approx'}) { $obj->binom_coeffs unless defined $obj->{'binom_coeffs'}; } #__CALCULATE P-VAL WITH THE MINIMUM OF EFFORT my $half_of_num_genes = $obj->{'num_genes'} / 2; my $path_scan_pval = 0; #__COMPUTE ACTUAL TAIL IF NUM HITS IS IN THIS RANGE if ($actual_hits > $half_of_num_genes) { #__COMPUTE THE ACTUAL UPPER TAIL for (my $k = $actual_hits; $k <= $obj->{'num_genes'}; $k++) { my $pval; if ($obj->{'context_approx'}) { $pval = $obj->p_value_binomial_approx ($k); } else { $pval = $obj->p_value_exact ($k); } $path_scan_pval += $pval; } #__RETURN RESULT return $path_scan_pval; #__COMPUTE 1 - COMPLIMENTARY TAIL NUM HITS IS IN THIS RANGE } else { #__COMPUTE COMPLIMENTARY (LOWER) TAIL for (my $k = 0; $k < $actual_hits; $k++) { my $pval; if ($obj->{'context_approx'}) { $pval = $obj->p_value_binomial_approx ($k); } else { $pval = $obj->p_value_exact ($k); } $path_scan_pval += $pval; } #__RETURN 1 - THIS VAL AS THE RESULT return 1 - $path_scan_pval; } } # ============= # CDF_TRUNCATED truncated cummulative distribution function # ============= ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # NOTE ON INDEX OF p_value_binomial_approx FUNCTION # # we have the identity for the probability masses of # # P(0) + P(1) + P(2) + ... + P(M) = 1 # # where M is the total number of genes, so that the tailed p-values in the # CDF list, ordered from most extreme to least extreme are # # P(K >= 0) = 1 # P(K >= 1) = 1 - P(0) = P(K >= 0) - P(0) # P(K >= 2) = 1 - P(0) - P(1) = P(K >= 1) - P(1) # P(K >= 3) = 1 - P(0) - P(1) - P(2) = P(K >= 2) - P(2) # : : : : : : : # P(K >= k) = = P(K >= k-1) - P(k-1) # # So, the index of p_value_binomial_approx is '$k-1' and each succeeding # element is UNSHIFTED onto the list to get the desired ordering from most # extreme to least extreme. # # Method "cdf_asymptot" has the same indexing for its probability calling # function. =head2 cdf_truncated This function returns the cummulative distribution in the context of the convolution approximation I such that it contains only enough information to process the given number of hits. my $pvals_list = $pmobj->cdf_truncated ($hits); The list is ordered from most extreme to least extreme probability tail values, i.e. the last value in the list is always unity. However, tailed p-values more extreme than that associated with the argument are not, in fact, calculated, but rather are replaced with the flag -1. This saves processing time and also reduces the chances of numerical overflow for large pathways, as the full CDF must ultimately raise an "mval" (>1) to a power equal to the number of genes in the pathway. The method assumes you have already done the pre-processing with respect to the background mutation rate. =cut sub cdf_truncated { my $obj = shift; my ($hits) = @_; #__MAKE SURE HITS IS AN INTEGER BETWEEN 0 AND THE NUMBER OF GENES croak "hit number must be integer > 0" unless $hits =~ /^\d+$/ && $hits >= 0; croak "hit number must be less than total number of genes" unless $hits <= $obj->{'num_genes'}; #__INCREMENT BY ONE FOR ANY METHODS THAT MAY ALSO NEED THE NEXT MORE EXTREME # TAILED P-VALUE, E.G. THE LANCASTER CORRECTION IN THE POPULATION CALCULATION, # UNLESS WE'RE ALREADY AT THE TOTAL NUMBER OF GENES, E.G. FOR A 1-GENE PATHWAY # LIKE HSA04112 $hits++ unless $hits == $obj->{'num_genes'}; #__MAKE SURE PRE-PROCESSING HAS ALREADY BEEN DONE croak "cannot call 'cdf_truncated' without first pre-processing" unless defined $obj->{'mutation_prob'}; #__CALCULATE BINOMIAL COEFFICIENTS IF NECESSARY if ($obj->{'context_approx'}) { $obj->binom_coeffs unless defined $obj->{'binom_coeffs'}; } #__CALCULATE BINOMIAL COEFFICIENTS IF NECESSARY # if ($obj->{'context_approx'}) { # $obj->binom_coeffs unless defined $obj->{'binom_coeffs'}; # } #__CDF INITIALIZED WITH UNITY --- MORE EXTREME VALS WILL BE UNSHIFTED IN FRONT my $path_scan_pval = 1; my $cdf = [$path_scan_pval]; #__COMPUTE CDF VALS PROGRESSIVELY MORE EXTREME PUSHING EACH TO FRONT OF LIST for (my $k = 1; $k <= $obj->{'num_genes'}; $k++) { #__COMPUTE ACTUAL TAILED P-VALUE IF WE'RE WITHIN TRUNCATION RANGE if ($k <= $hits) { $path_scan_pval -= $obj->p_value_binomial_approx ($k-1); unshift @{$cdf}, $path_scan_pval; #__OTHERWISE JUST INSERT A FLAG TO FUNCTION AS A PLACEHOLDER FOR OTHER # METHODS THAT EXPECT THE *FORM* OF THE LIST TO BE A FULL CDF } else { unshift @{$cdf}, -1; } } #__RETURN TRUNCATED CDF return $cdf; } # === # CDF cummulative distribution function # === ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ =head2 cdf This function returns the cummulative distribution in one of the appropriate contexts (exact or convolution approximation, as described above). There are no arguments, my $pvals_list = $pmobj->cdf; unless you have not yet done the pre-processing with respect to the background mutation rate (see below), in which case the pre-processing can be executed implicitly by passing the rate as the sole argument. my $pvals_list = $pmobj->cdf (0.000001); The list is ordered from most extreme to least extreme probability tail values, i.e. the last value in the list is always unity. =cut sub cdf { my $obj = shift; my ($mutation_prob) = @_; my $cdf = []; #__PREPROCESS IF NECESSARY if ($mutation_prob) { $obj->preprocess ($mutation_prob); } #__CALCULATE BINOMIAL COEFFICIENTS IF NECESSARY if ($obj->{'context_approx'}) { $obj->binom_coeffs unless defined $obj->{'binom_coeffs'}; } #__CALCULATE P-VAL WITH THE MINIMUM OF EFFORT my $path_scan_pval = 0; #__COMPUTE CDF STARTING WITH MOST THE EXTREME STATE WORKING TOWARD LEAST EXTREME for (my $k = $obj->{'num_genes'}; $k >= 0; $k--) { my $pval; if ($obj->{'context_approx'}) { $pval = $obj->p_value_binomial_approx ($k); } else { $pval = $obj->p_value_exact ($k); } $path_scan_pval += $pval; push @{$cdf}, $path_scan_pval; } return $cdf; } # ============ # CDF_ASYMPTOT cummulative distribution function using asymptotic analysis # ============ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ =head2 cdf_asymptot This function returns the cummulative distribution based on asymptotic analysis. There are no arguments, i.e. my $pvals_list = $pmobj->cdf_asymptot; unless you have not yet done the pre-processing with respect to the background mutation rate (see below), in which case the pre-processing can be executed implicitly by passing the rate as the sole argument. my $pvals_list = $pmobj->cdf_asymptot (0.000001); The list is ordered from most extreme to least extreme probability tail values, i.e. the last value in the list is always unity. Note that asymptotic analysis gives a function (the Poisson) having infinite support. The infinite tail probability for all values past the most extreme physical case are all bundled into that most extreme p-value. =cut sub cdf_asymptot { my $obj = shift; my ($mutation_prob) = @_; #__CDF INITIALIZED WITH UNITY --- MORE EXTREME VALS WILL BE UNSHIFTED IN FRONT my $path_scan_pval = 1; my $cdf = [$path_scan_pval]; #__PREPROCESS IF NECESSARY if ($mutation_prob) { $obj->preprocess ($mutation_prob); } #__COMPUTE CDF VALS PROGRESSIVELY MORE EXTREME PUSHING EACH TO FRONT OF LIST # SEE PROGRAMMING NOTES OF CDF_TRUNCATED METHOD THAT EXPLAIN INDEXING OF THE # PROBABILITY CALL for (my $k = 1; $k <= $obj->{'num_genes'}; $k++) { $path_scan_pval -= $obj->p_value_asymptot_approx ($k-1); unshift @{$cdf}, $path_scan_pval; } return $cdf; } # =================== # PATH SCAN ASYMPTOT asymptotic path-scan probability value (CDF) # =================== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ =head2 path_scan_asymptot This function calculates the path-scan statistic in the asymptotic (Poisson) context. It takes the actual number of "hits" you've observed in the data, i.e. the number of genes that have a mutated status. my $pval = $pmobj->path_scan_asymptot (7); If you have not yet done the pre-processing with respect to the background mutation rate (see below), then pre-processing can be executed implicitly by passing the rate as the second argument. my $pval = $pmobj->path_scan_asymptot (7, 0.000001); You must set up the object, somewhat paradoxically, I. (This is a consequence of how data are stored internally within the object.) =cut # The Poisson distribution has infinite support, so to be rigorous, we # should sum probability masses to a very large number. Strictly speaking, # we get the same (rigorous) result by simply computing the complement and # subtracting that from one. sub path_scan_asymptot { my $obj = shift; my ($actual_hits, $mutation_prob) = @_; #__SOME BASIC CHECKS croak "argument list: '$actual_hits' must be an integer" unless $actual_hits =~ /^-?\d+$/; croak "argument list: '$actual_hits' cant be negative" unless $actual_hits >= 0; croak "argument list: '$actual_hits' cant exceed number of " . "genes '$obj->{'num_genes'}'" if $actual_hits > $obj->{'num_genes'}; #__PREPROCESS IF NECESSARY if ($mutation_prob) { $obj->preprocess ($mutation_prob); } #__POISSON HAS INFINITE SUPPORT SO COMPUTE COMPLIMENTARY (LOWER) TAIL AS PVAL my $path_scan_pval = 0; for (my $k = 0; $k < $actual_hits; $k++) { $path_scan_pval += $obj->p_value_asymptot_approx ($k); } #__RETURN 1 - THIS VAL AS THE RESULT return 1 - $path_scan_pval; } ################################################################################ ## ## ## S E M I - P R I V A T E M E T H O D S ## ## ## ## methods that are not ordinarily called externally but can be if needed ## ## because they are cast according to the object-oriented interface ## ## ## ################################################################################ =head2 additional methods The basic functionality of this package is encompassed in the methods described above. However, some lower-level functions can also sometimes be useful. =cut # ============= # P VALUE EXACT returns probability of exactly k genes mutated by chance # ============= ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # number of required recursion levels is based on number of hits = k =head3 p_value_exact This function returns the exact value of the probability I for a specific number of hits. $pval_exact = $pmobj->p_value_exact (7); You must make sure to call this only if you've configured the object in the exact context (see above). =cut sub p_value_exact { my $obj = shift; my ($k) = @_; #__SOME BASIC CHECKS croak "argument list: '$k' must be an integer" unless $k =~ /^-?\d+$/; croak "argument list: '$k' cant be negative" unless $k >= 0; croak "argument list: '$k' cant exceed number of genes '$obj->{'num_genes'}'" if $k > $obj->{'num_genes'}; #__COMPUTE THE LEADING TERM my $leading_term = exp (- $obj->{'mutation_prob'} * $obj->{'total_length'}); #__THIS IS ALSO THE EXACT P-VALUE IN THE SPECIFIC CASE OF K=0 return $leading_term if $k == 0; #__CALCULATE EXACT P-VALUE FOR K>0 my $total_subpart_of_pval = $obj->_recursive_exact_pval_calculation ($k, 0, 1, 1); return $leading_term * $total_subpart_of_pval; } # ======================= # P VALUE BINOMIAL APPROX returns approx prob of k genes mutated by chance # ======================= ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # NOTE: we have not implemented the approximate method for the case of just # a single bin, although a simple solution exists (path-scan test notes pp 16) # because the assumption is that, if the user just passes a single list, they # want the exact solution. In other words, we assume that the approximate # solution is only desired when there are at least 2 lists. # # number of required recursion levels is based on the number of lists # that the data have been binned into =head3 p_value_binomial_approx This function returns the convolution approximated value (i.e. using the binomial binning approximation) of the probability I for a specific number of hits. $pval_exact = $pmobj->p_value_binomial_approx (7); You must make sure to call this only if you've configured the object in the approximate binomial context (see above). Also, you must explicitly calculate the necessary binomial coefficients beforehand (see C). =cut sub p_value_binomial_approx { my $obj = shift; my ($k) = @_; #__SOME BASIC CHECKS croak "argument list: '$k' must be an integer" unless $k =~ /^-?\d+$/; croak "argument list: '$k' cant be negative" unless $k >= 0; croak "argument list: '$k' cant exceed number of genes '$obj->{'num_genes'}'" if $k > $obj->{'num_genes'}; croak "need to first calculate binomial coefficients" unless defined $obj->{'binom_coeffs'}; #__COMPUTE THE LEADING TERM my $leading_term = exp (- $obj->{'mutation_prob'} * $obj->{'total_length'}); #__THIS IS ALSO THE EXACT P-VALUE IN THE SPECIFIC CASE OF K=0 return $leading_term if $k == 0; #__COMPUTE RECURSIVE PORTION OF THE SOLUTION my $num_lists = $obj->{'context_approx'}; my $total_subpart_of_pval = $obj->_recursive_binom_pval_calculation ($num_lists - 1, 0, $k); # THIS CHUNK DOES NOT WORK IN GENERAL BECAUSE A CDF IS NOT GUARENTEED TO START # FROM K=0 # # #__CALCULATE EXACT P-VALUE FOR K>0 # my $prob_mass = $leading_term * $obj->{'binomial_multiplier'} * # $total_subpart_of_pval; #__TAKE CARE OF THE POWER TERM FOR NEXT ITERATION (STEP-BY-STEP BUILD-UP) # -1 MODIFIER RECONCILES MATH LIST NUMBERING WITH PERL LIST NUMBERING # $obj->{'binomial_multiplier'} *= $obj->{'mpvals'}->[$num_lists-1]; #__RETURN P # return $prob_mass; #__RETURN (-1 MODIFIER RECONCILES MATH LIST NUMBERING WITH PERL LIST NUMBERING) return $leading_term * $obj->{'mpvals'}->[$num_lists-1]**$k * $total_subpart_of_pval; } # ======================= # P VALUE ASYMPTOT APPROX returns poisson approx prob of k genes mutated # ======================= ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ =head3 p_value_asymptot_approx This function returns the asymptotic approximated value (i.e. using the Poisson limit approximation) of the probability I for a specific number of hits. $pval_exact = $pmobj->p_value_asymptot_approx (7); Somewhat paradoxically, you must make sure to call this only if you've configured the object in the exact context (see above). =cut sub p_value_asymptot_approx { my $obj = shift; my ($k) = @_; #__SOME BASIC CHECKS croak "argument list: '$k' must be an integer" unless $k =~ /^-?\d+$/; croak "argument list: '$k' cant be negative" unless $k >= 0; croak "argument list: '$k' cant exceed number of genes '$obj->{'num_genes'}'" if $k > $obj->{'num_genes'}; #__SOME SHORTHAND my $avg = $obj->{'expected_genes'}; #__ASYMPTOTIC PROBABILITY MASS my $pmass; if ($k) { #__PROBABILITY MASS my $ramanuj = log($k * (1 + 4*$k*(1 + 2*$k))) / 6; my $arg_h0 = $k * log ($avg / $k) - $avg + $k; $arg_h0 -= $ramanuj; $arg_h0 -= LOG_PI_OVER_2; $pmass = exp ($arg_h0); } else { $pmass = exp (-$avg); } #__RETURN PROBABILITY MASS return $pmass; } # =========== # STORE GENES run some basic consistency checks on raw data then store them # =========== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # CONFIGURED FOR BOTH EXACT AND APPROXIMATE CONTEXTS # # this has to take the whole data struct (list-of-lists) because it can be # called externally and we don't expect the user to have to do the proper # looping =head3 store_genes Stores the raw gene length data. Use this if you did not pass these data to C before you call any calculation methods. Works in the same way as C, described above. Specifically, the context is partially determined by whether you pass a single list (exact context or asymptotic approximation) $pmobj->store_genes ([3434, 54565, 6445, ...]); or more than one list (convolution approximate context) $pmobj->store_genes ([3434, 54565], [6445, ...]); =cut sub store_genes { my $obj = shift; my (@list_of_gene_length_lists) = @_; #__INIT my ($total_length, $most_populous_list) = (0, 0); $obj->{'num_genes'} = 0; #__DETERMINE WHETHER USER WANTS TO RUN IN EXACT CONTEXT OR APPROXIMATE CONTEXT # IF APPROXIMATE THE RECORD THE NUMBER OF LISTS $obj->{'context_approx'} = 0; if (scalar @list_of_gene_length_lists > 1) { $obj->{'context_approx'} = scalar @list_of_gene_length_lists; } #__PROCESS THE DATA foreach my $gene_list (@list_of_gene_length_lists) { #__MAKE SURE THIS IS A LIST croak "argument must be list reference" unless ref $gene_list eq "ARRAY"; #__PROCESS EACH GENE IN THIS LIST my $list_concat_length = 0; foreach my $length (@{$gene_list}) { #__MAKE SURE THIS LENGTH IS A POSITIVE INTEGER croak "argument list: '$length' must be an integer" unless $length =~ /^-?\d+$/; croak "argument list: '$length' must be positive" unless $length > 0; #__TALLY THIS LENGTH TO THE TOTAL LENGTH OF THIS LIST $list_concat_length += $length; } #__SOME ADDITIONAL INFORMATION NEEDED IF IN APPROXIMATE CONTEXT if ($obj->{'context_approx'}) { #__RECORD THE AVERAGE LENGTH IN THIS LIST AND THE LIST SIZE my $num_elements = scalar @{$gene_list}; push @{$obj->{'avg_lengths'}}, $list_concat_length/$num_elements; push @{$obj->{'list_sizes'}}, $num_elements; #__DISCERN WHICH SUB-LIST HAS THE MOST MEMBERS $most_populous_list = $num_elements if $num_elements > $most_populous_list; } #__TALLY THE NUMBER OF GENES IN THIS LIST TO THE GRAND TOTAL $obj->{'num_genes'} += scalar @{$gene_list}; #__SAVE THIS GENE LIST TO THE OBJECT push @{$obj->{'genes'}}, $gene_list; #__TALLY TO OVERALL LENGTH OF ALL THE GENES $total_length += $list_concat_length; } #__RECORD GRAND TOTAL LENGTH OF ALL GENES CONCATENATED TOGETHER $obj->{'total_length'} = $total_length; #__RECORD SIZE OF MOST POPULOUS LIST $obj->{'most_populous_list'} = $most_populous_list; # # NOTES # # Here, we should probably now make sure to erase # $obj->{'mutation_prob'} and $obj->{'mpvals'} = $mpvals # if they exist, because their contents would no longer be valid (i.e. as # based on an "old" set of gene lengths) # } # ============ # BINOM COEFFS calculate binomial coefficients # ============ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ =head3 binom_coeffs Calculates the binomial coefficients needed in the binomial (convolution) approximate solution. $pmobj->binom_coeffs; The internal data structure is essentially the symmetric half of the appropriately-sized Pascal triangle. Considerable memory is saved by not storing the full triangle. =cut ### This is called automatically, if necessary, before the probability ### calculation, so there is not typically a need to call it ### manually. sub binom_coeffs { my $obj = shift; croak "need to know most populous list first" unless defined $obj->{'most_populous_list'}; carp "already have binomial coefficients" if defined $obj->{'binom_coeffs'}; #__SET-UP FOR COEFFICIENTS my $bin_coeffs = [1]; $obj->{'binom_coeffs'} = []; push (@{$obj->{'binom_coeffs'}}, $bin_coeffs); #__CALCULATE COEFFICIENTS UP TO THAT REQUIRED BY THE MOST POPULOUS LIST for (my $i = 1; $i <= $obj->{'most_populous_list'}; $i++) { $bin_coeffs = &next_bin_coeff_row ($i, $bin_coeffs); push (@{$obj->{'binom_coeffs'}}, $bin_coeffs); } } # ========== # PREPROCESS calculate list of modified bernoulli probabilities for each gene # ========== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # CONFIGURED FOR BOTH EXACT AND APPROXIMATE CONTEXTS =head3 preprocess Calculates the Bernoulli kernel probabilities for the individual genes or gene bins $pmobj_binom->preprocess ($background_mutation_rate); The data structure can be re-configured to run the test with different background mutation rates by just re-calling this routine with a different value $pmobj_binom->preprocess ($new_background_mutation_rate); =cut sub is_float { my $val = shift; if ($val =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/) { return 1; } else { return 0; } } sub preprocess { my $obj = shift; my ($mutation_prob) = @_; #__PRELIMINARY VALIDATION croak "need background mutation rate" unless $mutation_prob; croak "background mutation '$mutation_prob' rate must be a p-val" unless (Genome::Model::Tools::Music::PathScan::PathScan::is_float($mutation_prob) && $mutation_prob > 0 && $mutation_prob < 1); croak "preprocessing: no data" unless defined $obj->{'genes'}; #__JUST RETURN SILENTLY IF WE'RE ABOUT TO COMPUTE WHAT HAS ALREADY BEEN DONE # carp "data already preprocessed" if defined $obj->{'mpvals'}; # # NOTE: IF THE CALLER WANTS TO NOW ANALYZE A NEW BACKGROUND MUTATION RATE # TEST FOR EQUIVALENCE TO THE OLD ONE USING "STRING" CONTEXT if (defined $obj->{'mpvals'} && $obj->{'mutation_prob'} eq $mutation_prob) { return; } #__CALCULATE THE MODIFIED BERNOULLI PROBABILITY VALUE FOR EACH GENE my $mpvals = []; #__CALCULATE IN THIS FASHION IF IN APPROXIMATE CONTEXT if ($obj->{'context_approx'}) { foreach my $length (@{$obj->{'avg_lengths'}}) { my $pval = exp ($mutation_prob * $length) - 1; if ($pval > 0) { push @{$mpvals}, $pval; } else { croak "unexpected bernoulli pval '$pval' from average gene length '$length'"; } } #__ALSO CALCULATE EXPECTED NUMBER OF GENES BEING MUTATED IN CASE WE NEED TO # REVERT TO ASYMPTOTIC ANALYSIS my $expected_mutated_genes = 0; foreach my $gene_list (@{$obj->{'genes'}}) { foreach my $length (@{$gene_list}) { $expected_mutated_genes += 1 - exp (- $mutation_prob * $length); } } $obj->{'expected_genes'} = $expected_mutated_genes; #__ELSE CALCULATE IN THIS FASHION IF IN EXACT CONTEXT OR IF USING THE # ASYMPTOTIC (POISSON) APPROXIMATION } else { my $expected_mutated_genes = 0; #__THERE IS ONLY 1 LIST OF GENES HERE (INDEX 0 IN THE DATA STRUCTURE) foreach my $length (@{$obj->{'genes'}->[0]}) { #__ACTUAL PVAL FOR THIS GENE BEING MUTATED TALLIED TOWARD EXPECTED VALUE $expected_mutated_genes += 1 - exp (- $mutation_prob * $length); #__ALGORITHMIC MODEL PVALUE (SEE NOTES) my $pval = exp ($mutation_prob * $length) - 1; #__STORE ALGORITHMIC MODELED PVALUE if ($pval > 0) { push @{$mpvals}, $pval; } else { croak "unexpected bernoulli pval '$pval' from gene length '$length'"; } } #__STORE EXPECTED NUMBER OF MUTATED GENES $obj->{'expected_genes'} = $expected_mutated_genes; } #__STORE RESULT $obj->{'mutation_prob'} = $mutation_prob; $obj->{'mpvals'} = $mpvals; } ################################################################################ ## ## ## M E T H O D S M E A N T T O B E P R I V A T E ## ## ## ## methods that cannot be called in a contextually meaningful way from an ## ## external application using the object-oriented interface ## ## ## ################################################################################ # ================================ # RECURSIVE EXACT PVAL CALCULATION recursion for handling summation of terms # ================================ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sub _recursive_exact_pval_calculation { my $obj = shift; my ($k, $prev_recursion_level, $start_index, $coeff_for_this_level) = @_; #__THE CURRENT LEVEL OF RECURSION my $curr_recursion_level = $prev_recursion_level + 1; my $local_contribution = 0; #__LOOP AT THE CURRENT RECURSION LEVEL USING APPROPRIATE START/STOP INDECES for (my $i = $start_index; $i <= $obj->{'num_genes'} - $k + $curr_recursion_level; $i++) { #__RECURSE FURTHER IF NECESSARY: NOT if ($curr_recursion_level < $k) { #__RECURSION: NOTE LAST ARG RESOLVES DIFFERENCE BETWEEN PHYSICAL GENE # NUMBERING (STARTING AT 1) VS NUMBERING IN PERL LIST (STARTING AT 0) $local_contribution += $obj->_recursive_exact_pval_calculation ( $k, $curr_recursion_level, $i+1, # see mcw notes $coeff_for_this_level * $obj->{'mpvals'}->[$i-1] # resolve number ); #__ELSE WE'RE "AT THE BOTTOM" SO TAKE THE NECESSARY PRODUCT: NOTE AGAIN # WE RESOLVE DIFFERENCE BETWEEN PHYSICAL GENE NUMBERING (STARTING AT 1) # VS NUMBERING IN PERL LIST (STARTING AT 0) } else { $local_contribution += $coeff_for_this_level * $obj->{'mpvals'}->[$i-1]; } } #__RETURN RESULT TO THE ANTECEDENT LEVEL return $local_contribution; } # ================================ # RECURSIVE BINOM PVAL CALCULATION recursion for handling summation of terms # ================================ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # HERE $obj->{'mpvals'}->[$i-1] IS THE BERNOULLI PROBABILITY FOR THE i-th BIN # I.E. THE -1 MODIFIER RECONCILES MATH LIST NUMBERING WITH PERL LIST NUMBERING # # note that the summation structure in the solution means that binomial # coefficients that are out-of-bounds can nevertheless be called upon, so # we must explicitly test when this is the case and skip these (see code below) sub _recursive_binom_pval_calculation { my $obj = shift; my ($recurs_limit, $prev_recursion_level, $k_stop_index) = @_; #__THE CURRENT LEVEL OF RECURSION my $curr_recursion_level = $prev_recursion_level + 1; my $local_contribution = 0; #__SUBSCRIPT my $current_list_number = $recurs_limit + 1 - $curr_recursion_level; #__EFFICIENCY -- WE DO NOT NEED TO COMPUTE THE POWER TERM WITHIN THE LOOP # THE POWER VALUE CORRESPONDS TO THE LOOP ITERATOR, SO WE CAN INCREMENT # THESE TOGETHER -- SEE SOLUTION NOTES my $fraction_coeff = 1; my $multiplier = $obj->{'mpvals'}->[$current_list_number-1] / $obj->{'mpvals'}->[$current_list_number]; #__LOOP AT THE CURRENT RECURSION LEVEL USING APPROPRIATE START/STOP INDECES for (my $k = 0; $k <= $k_stop_index; $k++) { #__RECURSE FURTHER IF NECESSARY: NOT if ($curr_recursion_level < $recurs_limit) { #__RECURSION: CURRENT ITERATION IN THIS LOOP IS STOP INDEX FOR NEXT if ( $obj->{'list_sizes'}->[$current_list_number] >= $k_stop_index - $k ) { $local_contribution += $fraction_coeff * &bincoeff ($obj, $obj->{'list_sizes'}->[$current_list_number], $k_stop_index - $k) * $obj->_recursive_binom_pval_calculation ( $recurs_limit, $curr_recursion_level, $k ); } #__ELSE WE'RE "AT THE BOTTOM" SO TAKE THE NECESSARY PRODUCT: NOTE AGAIN # WE RESOLVE DIFFERENCE BETWEEN PHYSICAL GENE NUMBERING (STARTING AT 1) # VS NUMBERING IN PERL LIST (STARTING AT 0) } else { if ( $obj->{'list_sizes'}->[$current_list_number-1] >= $k && $obj->{'list_sizes'}->[$current_list_number] >= $k_stop_index - $k ) { $local_contribution += $fraction_coeff * &bincoeff ($obj, $obj->{'list_sizes'}->[$current_list_number-1], $k) * &bincoeff ($obj, $obj->{'list_sizes'}->[$current_list_number], $k_stop_index - $k); } } #__TAKE CARE OF THE POWER TERM FOR NEXT ITERATION (STEP-BY-STEP BUILD-UP) $fraction_coeff *= $multiplier; } #__RETURN RESULT TO THE ANTECEDENT LEVEL return $local_contribution; } # ================== # NEXT BIN COEFF ROW compute 1/2 row i of binomial coefficients given row i-1 # ================== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sub next_bin_coeff_row { my ($i, $im1_row) = @_; #__FIRST ELEMENT I,0 IN ALL ROWS DEFINED AS UNITY my $next_row = [1]; #__IF I > 1 COMPUTE REST OF NEXT (I) ROW USING PASCAL TRIANGLE ON PREV (I-1) ROW if ($i > 1) { #__COMPUTE STOPPING POINT BASED ON SYMMETRY my $i_mirror = $i / 2; my $i_end = int $i_mirror; #__FILL IN INTERMEDIATE ELEMENTS I,1 TO I,I_END USING PASCALS TRIANGLE for (my $iposition = 1; $iposition <= $i_end; $iposition++) { #__START WITH "LEFT SIDE" VAL OF PREVIOUS ROW my $element = $im1_row->[$iposition-1]; #__COMPUTE NEXT ELEMENT USING THE PASCAL TRIANGLE METHOD if ($iposition == $i_mirror) { $element += $im1_row->[$iposition-1]; } else { $element += $im1_row->[$iposition]; } #__SAVE push (@{$next_row}, $element); } } #__RETURN LIST OF HALF SYMETRIC NEXT ROW OF BINOMIAL COEFFICIENTS return $next_row; } # ======== # BINCOEFF return binomial coefficient using 1/2 symetric stored table # ======== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # RETURNS C_{$i_top, $k_bottom} sub bincoeff { my $obj = shift; my ($i_top, $k_bottom) = @_; #__SYMMETRY CUTOFF IS HALF THE VALUE OF THE I'TH ROW my $i_half = $i_top / 2; #__IF I,K IS WITHIN THE STORED SYMMETRIC HALF OF TRIANGLE THEN SIMPLY RETURN VAL if ($k_bottom <= $i_half) { return $obj->{'binom_coeffs'}->[$i_top]->[$k_bottom]; #__ELSE COMPUTE SYMMETRIC REFLECTION OF NON-STORED COMPONENT AND RETURN THAT VAL } else { my $k_reflect = $i_top - $k_bottom; return $obj->{'binom_coeffs'}->[$i_top]->[$k_reflect]; } } ################################################################################ ## ## ## T R A I L I N G P O D D O C U M E N T A T I O N ## ## ## ################################################################################ =head1 EXAMPLES The following examples may be helpful in using this package. In each case, assume we have first executed some required preliminary code. #__USE THE PACKAGE use PathScan; #__SOME DATA FOR AN "EXACT CONTEXT" CALCULATION my $genes_exact = [ 4000, 4000, 4000, 4000, 4000, 15000, 15000, 15000, 15000, 15000, 35000, 35000, 35000, 35000, 35000 ]; #__SOME DATA FOR AN "APPROXIMATE CONTEXT" CALCULATION my @genes_binned = ( [4000, 4000, 4000, 4000, 4000], [15000, 15000, 15000, 15000, 15000], [35000, 35000, 35000, 35000, 35000] ); =head2 simple path-scan test Here, we compare the values returned by both the exact and approximate algorithms over the whole domain of possible hits for a case where the answers should be identical. #__SET BACKGROUND MUTATION RATE my $rho = 0.00002; #__CONFIGURE OBJECTS IN "EXACT" AND "APPROXIMATE" CONTEXTS my $pmobj_exact = PathScan->new ($genes_exact); $pmobj_exact->preprocess ($rho); my $pmobj_binom = PathScan->new (@genes_binned); $pmobj_binom->preprocess ($rho); #__CALCULATE AND TALLY THE MAXIMUM DIFFERENCE my $maxdiff = 0; for (my $i = 0; $i <= scalar @{$genes_exact}; $i++) { my $pm_pval_exact = $pmobj_exact->path_scan($i); my $pm_pval_binom = $pmobj_binom->path_scan($i); my $diff = abs ($pm_pval_exact - $pm_pval_binom); $maxdiff = $diff if $diff > $maxdiff; print "$i hits: $pm_pval_exact $pm_pval_binom $diff\n"; } print "MAXIMUM DIFFERENCE IS $maxdiff\n"; =head2 testing at different background rates This example shows how to run the test for a fixed number of hits, say 7 in this case, for various different background mutation rates. #__CONFIGURE OBJECT my $pmobj_binom = PathScan->new (@genes_binned); #__CALCULATE for (my $rho = 0.00001; $rho <= 0.0001; $rho += 0.00001) { my $pm_pval_binom = $pmobj_binom->path_scan(7, $rho); print "7 hits at background $rho : P = $pm_pval_binom\n"; } Note that we did not run C explicitly, but rather let the C method call it implicitly for each new value of the background mutation rate. =head2 computing asymptotic approximate solution The asymptotic (Poisson) approximate probabiltiy value is straightforward to compute. #__SET BACKGROUND MUTATION RATE my $rho = 0.00002; #__CONFIGURE OBJECT my $pmobj_poisson = PathScan->new ($genes_exact); $pmobj_poisson->preprocess ($rho); #__P-VALUE FOR 7 OBSERVED MUTATED GENES my $pm_pval_poisson = $pmobj_exact->path_scan_asymptot (7); =head2 accessing individual probability masses The probability masses for specific numbers of hits can also be calculated. #__SET BACKGROUND MUTATION RATE my $rho = 0.00002; #__CONFIGURE OBJECT my $pmobj_exact = PathScan->new ($genes_exact); $pmobj_exact->preprocess ($rho); #__CALCULATE MASSES my $total_prob = 0; for (my $i = 0; $i <= scalar @{$genes_exact}; $i++) { my $pval_exact = $pmobj_exact->p_value_exact ($i); $total_prob += $pval_exact; print "$i hits : probability mass = $pval_exact\n"; } print "total probability = $total_prob\n"; =cut ################################################################################ ## ## ## - E N D - ## ## ## ################################################################################ 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/PathScan/PopulationPathScan.pm000444000765000024 5525012013522176 31242 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::PathScan::PopulationPathScan; #__STANDARD PERL PACKAGES use strict; use warnings; use Carp; use Genome::Model::Tools::Music::PathScan::CombinePvals; use Genome::Model::Tools::Music::PathScan::PathScan; # DEBUG # print "USING LOCAL MCW VERSION OF POPULATION PATHSCAN\n"; # DEBUG # DEBUG -- PLEASE REMOVE # use lib "/home/mwendl/work/perl_modules"; # use PostData; ################################################################################ ## ## ## I N T R O D U C T O R Y P O D D O C U M E N T A T I O N ## ## ## ################################################################################ =head1 NAME PopulationPathScan - apply PathScan test to populations rather than just single individuals =head1 SYNOPSIS use PopulationPathScan; my $obj = PopulationPathScan->new ($ref_to_list_of_gene_lengths); $obj->assign ($number_of_compartments); $obj->preprocess ($background_mutation_rate); $pval = $obj->population_pval_approx ($ref_to_list_of_hits_per_sample); $pval = $obj->population_pval_exact ($ref_to_list_of_hits_per_sample); =head1 DESCRIPTION The C package is implemented strictly as a test of a set of genes, e.g. a pathway, for a I individual. Specifically, knowing the gene lengths in the pathway, the number of genes that have at least one mutation, and the estimated background mutation rate, one can test the null hypothesis that these observed mutations are well-explained simply by the mechanism of random background mutation. However, it will often be the case that data for a pathway will be available for many individuals, meaning that we now have many tests of the given (single) hypothesis. (This should not be confused with the scenario of multiple hypothesis testing.) The set of values contains much more information than a single value, suggesting that significance must be judged on the basis of the collective result. For example, while no single p-value by itself may exceed the chosen statistical threshold, the overall set of probabilities may still give the impression of significance. Properly combining such numbers is a necessary, but not entirely trivial task. This package basically serves as a high-level interface to first perform individual tests using the methods of C, and then to properly combine the resulting p-values using the methods of C. =head1 AUTHOR Michael C. Wendl S Copyright (C) 2009 Washington University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. =head1 METHODS The available methods are listed below. =cut ################################################################################ ## ## ## P R O G R A M M E R N O T E S ## ## ## ################################################################################ # # The obj schematic resembles: # # $obj = { # # #__GENE LENGTHS IN THE POPULATION PATHSCAN TEST # gene_lengths => [474, 1038, 285, ...], # # #__THE ACTUAL NUMBER OF GENES IN TEST (SAVED FOR CONVENIENCE) # num_genes = 15, # # #__ARGUMENT LIST FOR PATHSCAN COMPUTATION (PathScan) STRUCTURE # IS DETERMINED BY THE WAY THE "assign" METHOD IS CALLED # path_scan_arg_list = [], # path_scan_arg_list = [ [], [], [] ], # # #__ASSIGN LEVEL (ESSENTIALLY THE ARGUMENT OF THE 'ASSIGN' METHOD) # assign_level = 1, # # #__CUMMULATIVE DISTRIBUTION FOR THIS SET OF GENES ORDERED MOST EXTREME # TO LEAST EXTREME --- COULD BE EITHER THE "COMPLETE" CDF, I.E. THE ENTIRE # DISTRIBUTION # cdf = [0.003, 0.0234, 0.1001, 0.23, 0.4, 0.8, 0.9, 0.94, 0.97, 0.99, 1], # # #__OR COULD BE A TRUNCATED LIST WITH JUST ENOUGH VALUES TO DO A CALCULATION # I.E. WHERE THE MORE EXTREME TAILED PROBABILITY VALUES ARE OMITTED, BEING # REPLACED BY A SIMPLE PLACEHOLDER FLAG -1 # cdf = [-1, -1, -1, -1, -1, -1, -1, 0.94, 0.97, 0.99, 1], # # #__MAXIMUM NUMBER OF MUTATED GENES TAKEN OVER ALL SAMPLES -- SEE PREPROCESS # max_hits = 5, # }; ################################################################################ ## ## ## P U B L I C M E T H O D S ## ## ## ################################################################################ # === # NEW create a new population path-scan object # === ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ =head2 new The object constructor takes a mandatory, but otherwise un-ordered reference to a list of gene lengths comprising the biological group (e.g. a pathway) whose mutation significance is to be analyzed using the PathScan paradigm. my $obj = PopulationPathScan->new ([474, 1038, 285, ...]); The method checks to make sure that all elements are legitimate lengths, i.e. integers exceeding 3. =cut sub new { my $class = shift; my ($gene_lengths) = @_; #__OBJECT TEMPLATE my $self = {}; #__PROCESS GENE LENGTHS IF THEY'RE SPECIFIED if (defined $gene_lengths && $gene_lengths) { #__MAKE SURE THIS IS A LIST croak "argument must be list reference" unless ref $gene_lengths eq "ARRAY"; #__SAVE LIST $self->{'gene_lengths'} = $gene_lengths; $self->{'num_genes'} = scalar @{$gene_lengths}; #__VALIDATE THE INPUT foreach my $gene_length (@{$gene_lengths}) { #__MAKE SURE THIS IS A LEGITIMATE LENGTH croak "'$gene_length' is not a gene length" unless $gene_length =~ /^\d+$/ && $gene_length >= 3; } #__OTHERWISE CROAK } else { croak "must specify a list of gene lengths as an argument"; } #__BLESS INTO CLASS AND RETURN OBJECT bless $self, $class; return $self; } # ====== # ASSIGN assign the manner in which genes will be internally organized # ====== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ =head2 assign This method assigns the manner in which genes will be internally organized for passing to the PathScan calculation component. The main consideration here is how the list may be compartmentalized for greater computational efficiency, though at some loss of accuracy, for the PathScan calculation. If the gene list is long, exact calculation is generally infeasible. The method takes a single argument representing the number of compartments (or sub-lists) the lengths will be divided into, e.g. 1 represents a single list, i.e. exact computation, 2 indicates two lists, 3 three lists, etc. $obj->assign (3); The values are then organized internally such that the smallest genes are grouped together, then the slightly larger ones, and so forth. Generally, 3 or 4 lists give reasonable balance between accuracy and computation (Wendl et al., in progress). =cut # THIS HAS NOT BEEN IMPLEMENTED YET # # The method can also be called without an argument # # $obj->assign; # # in which case the gene lengths will put into a number of # compartments such that each one has a maximum of 10 # values. sub assign { my $obj = shift; my ($assign_level) = @_; #__ORDER THE LIST OF VALIDATED GENE LENGTHS ACCORDING TO INCREASING SIZE @{$obj->{'gene_lengths'}} = sort _numerical_ @{$obj->{'gene_lengths'}}; sub _numerical_ {$a <=> $b} #__ASSIGN TO A SPECIFIC NUMBER OF COMPARTMENTS IF SPECIFIED if (defined $assign_level && $assign_level) { $obj->{'assign_level'} = $assign_level; #__QUICK-PROCESSING IF THERE'S NO COMPARTMENTALIZATION if ($assign_level == 1) { $obj->{'path_scan_arg_list'} = $obj->{'gene_lengths'}; return; } #__REMAINDER AFTER DIVIDING GENE LIST INTO AN INTEGER-NUMBER OF COMPARTMENTS my $remain = $obj->{'num_genes'} % $assign_level; #__LENGTH OF ALL COMPARTMENTS (EXCEPT LAST ONE IF THERE'S A REMAINDER) my $list_length = ($obj->{'num_genes'} - $remain) / $assign_level; #__BUILD-UP THE COMPARTMENTALIZED ARGUMENT LIST my ($list_number, $gene_number, $compartment) = (1, 1, []); foreach my $gene_length (@{$obj->{'gene_lengths'}}) { push @{$compartment}, $gene_length; $gene_number++; if ($gene_number > $list_length) { $list_number++; push @{$obj->{'path_scan_arg_list'}}, $compartment; ($gene_number, $compartment) = (1, []); $list_length += $remain if $list_number == $assign_level; } } #__ELSE ASSIGN SUCH THAT EACH COMPARTMENT HAS A MAXIMUM SIZE } else { croak "illegal assignment level"; } } # ========== # PREPROCESS set-up PathScan calculation and compute CDF # ========== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ =head2 preprocess This method pre-processes the population-level calculation, specifically, it sets up and executes the PathScan module to obtain the CDF associated with the given gene set and background mutation rate. It takes the latter as an argument. $obj->preprocess (0.0000027); Executing this method will take various amounts of CPU time, depending upon the level of accuracy and the number of genes in the calculation. The method optionally takes the list of the number of mutated genes in the group for each sample as a second argument, if this information is known at this point $obj->preprocess (0.0000027, [4, 5, 7, 3, 0, ...]); and it is usually better to use this form because the internals will compute only a truncated CDF that is just sufficient to process this list, rather than computing the full CDF. Not only is speed improved, but this helps avoid overflow errors for large pathways. =cut sub preprocess { my $obj = shift; my ($mutation_prob, $list_of_hits) = @_; my $max_hits = 0; #__PRELIMINARY VALIDATION OF ARGUMENT croak "need background mutation rate" unless $mutation_prob; croak "background mutation '$mutation_prob' rate must be a p-val" unless is_a_pval ($mutation_prob); #__INVOKE NEW PATHSCAN OBJECT USING PRE-COMPUTED ARGUMENT LIST FOR EXACT SOLN my $pm_obj; if ($obj->{'assign_level'} == 1) { $pm_obj = Genome::Model::Tools::Music::PathScan::PathScan->new ($obj->{'path_scan_arg_list'}); #__OR FOR APPROXIMATE SOLUTION } elsif ($obj->{'assign_level'} > 1) { $pm_obj = Genome::Model::Tools::Music::PathScan::PathScan->new (@{$obj->{'path_scan_arg_list'}}); #__ALSO FIND THE MAX NUMBER OF MUTATED GENES AMONG ALL SAMPLES IF GIVEN HITS if (defined $list_of_hits && $list_of_hits) { #__MAKE SURE THIS IS A LIST croak "argument must be list reference" unless ref $list_of_hits eq "ARRAY"; #__HARD-SET MAX HITS TO 1 IN CASE 0 SAMPLES HAVE HITS & TRIGGERS TRUNC CDF $max_hits = 1; #__FIND MAXIMUM NUMBER OF HITS foreach my $hits (@{$list_of_hits}) { #__MAKE SURE THIS IS A LEGITIMATE HIT NUMBER croak "'$hits' is not a hit number" unless $hits =~ /^\d+$/ && $hits >= 0 && $hits <= $obj->{'num_genes'}; #__RECORD MAXIMUM $max_hits = $hits if $hits > $max_hits; } #__SAVE MAX HITS TO OBJECT $obj->{'max_hits'} = $max_hits; } #__ELSE WE CANT PROCESS } else { croak "I dont understand the 'assign' level you used previously"; } #__STANDARD PREPROCESSING FOR PathScan OBJECT $pm_obj->preprocess ($mutation_prob); #__COMPUTE AND STORE CDF -- EITHER FULL OR TRUNCATED if ($max_hits) { $obj->{'cdf'} = $pm_obj->cdf_truncated ($max_hits); } else { $obj->{'cdf'} = $pm_obj->cdf; } # $obj->{'cdf'} = $pm_obj->cdf_asymptot; #__WEIRD HEURISTIC: MAKE SURE LAST VALUE IN CDF LIST IS ALWAYS IDENTICALLY UNITY # # We have seen sometimes that the last value appears to be unity, i.e. it # prints as such, but the 'is_a_pval' rejects it either on the real number # regexp, or the <= 1 condition. Here is an actual croak: # # VAL '1' IS NOT REAL # '1' in distribution 1 is not a p-val at Statistics/PopulationPathScan.pm line 395 # # Please track down this problem when you have a chance, but this practical # fix seems to work acceptably for the moment. $obj->{'cdf'}->[$#{$obj->{'cdf'}}] = 1; } # ===================== # POPULATION_PVAL_EXACT tail prob for the population using exact enumeration # ===================== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # PROGRAMMING NOTES: # # 1. remember that the cdf list returned from PathScan->cdf # is ordered from most etreme (lowest p-value, highest number of hits) to # least extreme (highest p-value = 1, lowest number of hits = 0). Therefore, # the correct p-value corresponding to the actual number of hits cannot # be naively looked-up in the list according to order, but must rather be # looked up according to the *reverse order*. For example, for the usual # binomial (0.5 + 0.5)^4 (see e.g. Wallis (1942) pp 244), we have # # cdf = [0.0625, 0.3125, 0.6875, 0.9375, 1] # position = [0, 1, 2, 3, 4] # meaning = [all 4 hit, at least 3 hit, at least 2, at least 1, at least 0] # # therefore, the actual "hit" pvalue is in position # # $obj->{'num_genes'} - $hits # # 2. CombinePvals does not yet have a method that exploits # scenarios, such as this one, where each individual p-val comes from the # _same_ distribution. Currently, we must call "exact_enum_arbitrary", # which does a full enumeration. Change methods here if the CombinePvals # class ever gets such a method. =head2 population_pval_exact This method performs the population-level calculation using exact enumeration. It takes the list of the number of mutated genes in the group for each sample, e.g. each patient's whole genome sequence, for example patient 1: 4 genes in the pathway are mutated patient 2: 5 genes in the pathway are mutated patient 3: 7 genes in the pathway are mutated patient 4: 3 genes in the pathway are mutated patient 5: 0 genes in the pathway are mutated : : : : : : : : : which is invoked as $pval = $obj->population_pval_exact ([4, 5, 7, 3, 0, ...]); Most scenarios will not actually be able to make use of this method because enumeration of all possible cases is rarely computationally feasible. This method will mostly be useful for examining small test cases. =cut sub population_pval_exact { my $obj = shift; my ($list_of_hits) = @_; #__PROCESS HITS IF THEY'RE SPECIFIED if (defined $list_of_hits && $list_of_hits) { #__MAKE SURE THIS IS A LIST croak "argument must be list reference" unless ref $list_of_hits eq "ARRAY"; #__WE NEED 2 LISTS FOR EXACT METHOD my ($default_arg_list, $cdf_list) = ([], []); #__VALIDATE AND PROCESS THE INPUT INTO ARGUMENT LISTS foreach my $hits (@{$list_of_hits}) { #__MAKE SURE THIS IS A LEGITIMATE HIT NUMBER croak "'$hits' is not a hit number" unless $hits =~ /^\d+$/ && $hits >= 0 && $hits <= $obj->{'num_genes'}; #__TAIL PVAL FOR THIS HIT NUMBER (SEE PROGRAMMING NOTE ABOVE) my $pval_x = $obj->{'cdf'}->[$obj->{'num_genes'} - $hits]; #__STORE IN DEFAULT CombinePvals ARG LIST push @{$default_arg_list}, $pval_x; #__CDFS GO IN SPECIAL ARG LIST FOR EXACT ENUMERATION push @{$cdf_list}, $obj->{'cdf'}; } #__INVOKE NEW COMBINE_PVALS OBJECT USING PRE-COMPUTED ARGUMENT LIST my $combine_obj = Genome::Model::Tools::Music::PathScan::CombinePvals->new ($default_arg_list); #__COMPUTE OVERALL "GROUP" P-VALUE BASED ON EXACT ENUMERATION ###### DEBUG # print "from PopulationPathScan --- args for new\n"; # &PostData ($default_arg_list); # print "from PopulationPathScan --- args for exact_enum_arbitrary\n"; # &PostData ($cdf_list); ###### DEBUG my $pval = $combine_obj->exact_enum_arbitrary (@{$cdf_list}); return $pval; #__OTHERWISE CROAK } else { croak "must specify a list of number of genes mutated for the sample set"; } } # ====================== # POPULATION_PVAL_APPROX tail prob for the population using Lancaster approx # ====================== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # PROGRAMMING NOTE: # # remember that the cdf list returned from PathScan->cdf # is ordered from most etreme (lowest p-value, highest number of hits) to # least extreme (highest p-value = 1, lowest number of hits = 0). Therefore, # the correct p-value corresponding to the actual number of hits cannot # be naively looked-up in the list according to order, but must rather be # looked up according to the *reverse order*. For example, for the usual # binomial (0.5 + 0.5)^4 (see e.g. Wallis (1942) pp 244), we have # # cdf = [0.0625, 0.3125, 0.6875, 0.9375, 1] # position = [0, 1, 2, 3, 4] # meaning = [all 4 hit, at least 3 hit, at least 2, at least 1, at least 0] # # therefore, the actual "hit" pvalue is in position # # $obj->{'num_genes'} - $hits # # and the next-most-extreme (lower p-value) is in position # # $obj->{'num_genes'} - $hits - 1 # # for using Lancaster's correction methods =head2 population_pval_approx This method performs the population-level calculation using Lancaster's approximate transform correction. It takes, as a mandatory argument, the list of the number of mutated genes in the group for each sample, e.g. each patient's whole genome sequence. $pval = $obj->population_pval_approx ([4, 5, 7, 3, 0, ...]); You must pass the list of hits, even if you already passed this list earlier to the pre-processing method. Most cases will use this method because exact combination of individual probability values is rarely computationally feasible. Note that Lancaster's method typically gives much better (more accurate) results than Fisher's "standard" chi-square transform. =over =item * Fisher, R. A. (1958) I, 13-th Ed. Revised, Hafner Publishing Co., New York. =item * Lancaster, H. O. (1949) I, Biometrika B<36>(3/4), 370-382. =back =cut sub population_pval_approx { my $obj = shift; my ($list_of_hits) = @_; #__PROCESS HITS IF THEY'RE SPECIFIED if (defined $list_of_hits && $list_of_hits) { #__MAKE SURE THIS IS A LIST croak "argument must be list reference" unless ref $list_of_hits eq "ARRAY"; #__WE NEED 2 LISTS FOR LANCASTER'S METHOD my ($default_arg_list, $lancaster_list) = ([], []); #__VALIDATE AND PROCESS THE INPUT INTO ARGUMENT LISTS # DEBUG #print "processing list of hits\n"; # DEBUG foreach my $hits (@{$list_of_hits}) { #__MAKE SURE THIS IS A LEGITIMATE HIT NUMBER croak "'$hits' is not a hit number" unless $hits =~ /^\d+$/ && $hits >= 0 && $hits <= $obj->{'num_genes'}; #__TAIL PVALS FOR THIS HIT NUMBER (SEE PROGRAMMING NOTE ABOVE) my $pval_x = $obj->{'cdf'}->[$obj->{'num_genes'} - $hits]; # my $pval_x_m_1 = $obj->{'cdf'}->[$obj->{'num_genes'} - $hits - 1]; my $pval_x_m_1; my $x_m_1_index = $obj->{'num_genes'} - $hits - 1; if ($x_m_1_index >= 0) { $pval_x_m_1 = $obj->{'cdf'}->[$x_m_1_index]; } else { $pval_x_m_1 = 0; # dont allow this to inadvertently loop to list end } # DEBUG # print " hit number = $hits\n"; # print " number of genes = $obj->{'num_genes'}\n"; # print " pval_x = $pval_x\n"; # print " pval_x_m_1 = $pval_x_m_1\n"; # DEBUG #__STORE IN DEFAULT CombinePvals ARG LIST # (THIS IS ACUTALLY JUST A FORMALITY IF USING LANCASTERS METHOD) push @{$default_arg_list}, $pval_x; #__STORE IN SPECIAL ARG LIST FOR LANCASTERS METHOD: P(X-1) THEN P(X) push @{$lancaster_list}, [$pval_x_m_1, $pval_x]; } #__INVOKE NEW COMBINE_PVALS OBJECT USING PRE-COMPUTED ARGUMENT LIST my $combine_obj = Genome::Model::Tools::Music::PathScan::CombinePvals->new ($default_arg_list); #__COMPUTE OVERALL "GROUP" P-VALUE BASED ON LANCASTERS TRANSFORM CORRECTION my $pval = $combine_obj->lancaster_mixed_corrected_transform (@{$lancaster_list}); return $pval; #__OTHERWISE CROAK } else { croak "must specify a list of number of genes mutated for the sample set"; } } ################################################################################ ## ## ## P R I V A T E M E T H O D S ## ## ## ################################################################################ # ========================================================================== # ROUTINE FOR DETERMINING WHETHER A VARIABLE REPRESENTS A LEGITIMATE P-VALUE # ========================================================================== sub is_a_pval { my ($val) = @_; $DB::single = 1; #__MUST BE A FLOAT (REGEXP: PERL COOKBOOK CHAP 2.1) & MUST BE BOUNDED BY 0 AND 1 if (Genome::Model::Tools::Music::PathScan::PathScan::is_float($val) && $val >= 0 && $val <= 1) { return 1; #__ELSE IT IS NOT A PVAL } else { return 0; } } ################################################################################ ## ## ## T R A I L I N G P O D D O C U M E N T A T I O N ## ## ## ################################################################################ ################################################################################ ## ## ## - E N D - ## ## ## ################################################################################ 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Plot000755000765000024 012013522176 24161 5ustar00nnutterstaff000000000000Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Plot/MutationRelation.pm000444000765000024 1000312013522176 30164 0ustar00nnutterstaff000000000000package Genome::Model::Tools::Music::Plot::MutationRelation; use warnings; use strict; use Genome; use IO::File; use List::Util qw( sum ); use Carp; use POSIX qw( WIFEXITED ); our $VERSION = $Genome::Model::Tools::Music::VERSION; class Genome::Model::Tools::Music::Plot::MutationRelation { is => 'Command::V2', has_input => [ input_matrix => { is => 'String', doc => "A gene/sample matrix generated by the mutation-relation tool" }, output_pdf => { is => 'String', doc => "An output pdf file to draw the plot to" }, ], has_optional_input => [ skip_zero_mut_samples => { is => 'Boolean', doc => "Don't plot samples that have no mutations on any gene being plotted", default => 1 }, genes_to_plot => { is => 'String', doc => "Comma-separated list of genes to plot (example: DNMT3A,NPM1)" }, plot_genes_in_order_listed => { is => 'Boolean', doc => "Plot the genes in the order that they're listed. Default is to plot them in descending order by number of mutations", default => 0 }, ], doc => "Makes plots for results from the mutation-relation tool", }; sub help_detail { return <input_matrix; my $output_pdf = $self->output_pdf; my $skip_zero_mut_samples = $self->skip_zero_mut_samples; my $genes_to_plot = $self->genes_to_plot; my $plot_genes_in_order_listed = $self->plot_genes_in_order_listed; # Load the gene names from the matrix my $headFh = IO::File->new( $input_matrix ); my @cols = split( /\t/, $headFh->getline ); # Create a hash of the gene names, which associates each with an R-friendly name my %genes_in_matrix = map{chomp; my $t = $_; s/\W/_/g; ( $t, $_ )} splice( @cols, 1 ); $headFh->close; # Discard any user-specified genes that are not in the input matrix # being sure to preserve the order of the genes in the input if( defined $genes_to_plot ) { my %genes = map{($_,1)} split( /,/, $genes_to_plot ); foreach my $gene ( keys %genes ) { unless( defined $genes_in_matrix{$gene} ) { warn "Skipping gene $gene which is not seen in the provided input-matrix\n"; delete $genes{$gene}; } } #preserve the order of genes in the input my @glist = split(",",$genes_to_plot); my @finalList; foreach my $g (@glist){ if(defined($genes{$g})){ push(@finalList, $g) } } $genes_to_plot = join(",",@finalList); } # If user didn't specify genes-to-plot, then plot every gene in the input-matrix else { $genes_to_plot = join( ",", values %genes_in_matrix ); } # Create a temporary mutation matrix with R-friendly gene names my $tmp_matrix = Genome::Sys->create_temp_file_path; ( $tmp_matrix ) or die "Couldn't create a temp file. $!"; my $inFh = IO::File->new( $input_matrix ); my $outFh = IO::File->new( $tmp_matrix, ">" ); while( my $line = $inFh->getline ) { chomp( $line ); my @cols = split( /\t/, $line ); if( $line =~ m/^Sample\t/ ) { # Handle the header line $outFh->print( join( "\t", $cols[0], map{$genes_in_matrix{$_}}@cols[1..$#cols] ) . "\n" ); } elsif( !$skip_zero_mut_samples || sum( @cols[1..$#cols] ) > 0 ) { $outFh->print( join( "\t", @cols ) . "\n" ); } } $outFh->close; $inFh->close; # Call R to create a plot for the user-specified genes my $plot_cmd = "R --slave --args < " . __FILE__ . ".R $tmp_matrix $genes_to_plot $output_pdf $plot_genes_in_order_listed"; WIFEXITED( system $plot_cmd ) or croak "Couldn't run: $plot_cmd ($?)"; return 1; } 1; Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Plot/MutationRelation.pm.R000444000765000024 742412013522176 30361 0ustar00nnutterstaff000000000000# Fetch command line arguments args = commandArgs(); input_matrix = as.character(args[4]); genes_to_plot = as.character(args[5]); output_pdf = as.character(args[6]); preserveGeneOrder = as.numeric(as.character(args[7])); sort.data.frame <- function( x, by ) { if(by[[1]] != "~") stop("Argument 'by' must be a one-sided formula.") ## Make the formula into character and remove spaces formc <- as.character(by[2]) formc <- gsub(" ", "", formc) ## If the first character is not + or -, add + if(!is.element(substring(formc, 1, 1), c("+", "-"))) formc <- paste("+", formc, sep = "") ## Extract the variables from the formula vars <- unlist(strsplit(formc, "[\\+\\-]")) vars <- vars[vars != ""] # Remove any extra "" terms ## Build a list of arguments to pass to "order" function calllist <- list() pos <- 1 # Position of + or - for(i in 1:length(vars)){ varsign <- substring(formc, pos, pos) pos <- pos + 1 + nchar(vars[i]) if(is.factor(x[, vars[i]])){ if(varsign == "-") { calllist[[i]] <- -rank(x[, vars[i]]) } else { calllist[[i]] <- rank(x[, vars[i]]) } } else { if(varsign == "-") { calllist[[i]] <- -x[, vars[i]] } else { calllist[[i]] <- x[,vars[i]] } } } return( x[do.call( "order", calllist ), ] ) } plotMutRel <- function( infile, genes, outfile, preserveGeneOrder=FALSE) { ##------------------ a = read.table(infile,row.names=1,header=T) gene_list = unlist(strsplit(genes, split=",")) print(genes) df=a[,gene_list] numSamp=length(df[,1]) numGenes=length(df) if(numGenes < 1){ return("Error: genes to plot not found in matrix") } ##adjustments to plot and text sizes for different numbers of samples samptext=0.3 genetext=0.75 pdfwidth=numSamp/10 offset=2 if(numSamp < 50){ samptext=0.3 #genetext=0.85 pdfwidth=numSamp/5 offset=1.25 } if(numSamp < 35){ samptext=0.4 } if(pdfwidth < 3) { pdfwidth = 3 } # if(numSamp >=100) { genetext=1.1 } # if(numSamp >=200) { genetext=1.1 } pdfheight=3+(0.25*numGenes-1) ##------- #sort the data using the number of mutations in each, desc sortdf = data.frame(g=gene_list[1],s=sum(df[,gene_list[1]])) for(i in 2:length(gene_list)){ sortdf = rbind(sortdf, data.frame(g=gene_list[i],s=sum(df[,gene_list[i]]))) } if(!(preserveGeneOrder)){ gene_list = as.vector(sort.data.frame(sortdf,~-s)$g) } print(preserveGeneOrder) print(gene_list) for(i in rev(gene_list)){ df = sort.data.frame(df,c("~",paste("-",i,sep=""))) } #output pdf here pdf(outfile,height=pdfheight,width=pdfwidth) par(xpd=T) plot(-100,-100, xlim=c(0,numSamp), ylim=c(-numGenes,0), axes=F, xlab="", ylab="") hspace=0.10 vspace=0.05 ##plot grey rects for(i in -(1:numGenes)){ rect( (1:numSamp)+hspace, rep(i,numSamp)+(1-vspace), (1:numSamp)+(1-hspace), rep(i,numSamp)+vspace, col="grey80", border=F ); } ##plot color rects for(i in -(1:numGenes)){ pos=which(df[,gene_list[-i]]==1) rect(pos+hspace, rep(i,length(pos))+(1-vspace), pos+(1-hspace), rep(i,length(pos))+vspace, col="darkgreen", border=F ) } ##gene labels for(i in -(1:numGenes)){ text(1,i+0.5,gene_list[-i],pos=2, cex=genetext) } ##sample labels for(i in 1:numSamp){ text(i+offset,-numGenes,row.names(df)[i],srt=90,pos=2,cex=samptext) } dev.off() } plotMutRel( input_matrix, genes_to_plot, output_pdf, preserveGeneOrder ); Genome-Model-Tools-Music-0.04/t000755000765000024 012013522176 16306 5ustar00nnutterstaff000000000000Genome-Model-Tools-Music-0.04/t/Play.t000444000765000024 217312013522176 17540 0ustar00nnutterstaff000000000000#!/usr/bin/env genome-perl use strict; use warnings; use above "Genome"; use Test::More tests => 12; BEGIN { #It's important this be loaded first before the sub is redefined. use_ok('Genome::Model::Tools::Music::Play'); }; #don't run all the commands while testing the glue that binds them--instead just record that we tried no warnings qw(redefine); sub Genome::Model::Tools::Music::Play::_run_command { my $self = shift; my $command = shift; pass('Would have run ' . $command->command_name); return 1; } use warnings qw(redefine); #fake parameters--since commands not being run (per above) my $play_cmd = Genome::Model::Tools::Music::Play->create( bam_list => 'bam.list', roi_file => 'roi.file', reference_sequence => 'reference.sequence', output_dir => Genome::Sys->create_temp_directory, maf_file => 'maf.file', genetic_data_type => 'gene', pathway_file => 'pathway.file', ); isa_ok($play_cmd, 'Genome::Model::Tools::Music::Play', 'created play comand'); my $rv = eval { $play_cmd->execute; }; if($@) { diag('Error executing: ' . $@); } ok($rv, 'Play executed successfully.');