Genome-Model-Tools-Music-0.04 000755 000765 000024 0 12013522176 16043 5 ustar 00nnutter staff 000000 000000 Genome-Model-Tools-Music-0.04/Build.PL 000444 000765 000024 4163 12013522176 17500 0 ustar 00nnutter staff 000000 000000 #!/usr/bin/perl
# Use local perl, not some perl on an application server!
use Config;
use Module::Build;
BEGIN {
unshift @INC, "$ENV{PWD}/blib/lib";
}
my $class = Module::Build->subclass (
class => 'Pod::Builder',
code => <<'EOS',
sub ACTION_clean {
# FIXME: is this safe?
use File::Path qw/rmtree/;
rmtree "./_build";
rmtree "./blib";
rmtree "./cmd-bindoc";
unlink "./Build";
unlink "./MYMETA.yml";
}
sub ACTION_cmd_docs {
use File::Copy qw/copy/;
$ENV{ANSI_COLORS_DISABLED} = 1;
eval {
local @INC = @INC;
unshift @INC, 'blib/lib';
die $@ if $@;
eval "use Genome::Model::Tools::Music";
die $@ if $@;
foreach my $exec ('genome','gmt') {
UR::Namespace::Command::Update::Doc->execute(
class_name => 'Genome::Model::Tools',
targets => [ 'Genome::Model::Tools::Music' ],
executable_name => $exec,
output_path => 'cmd-bindoc',
output_format => 'pod',
);
}
};
die "failed to extract pod: $!: $@" if ($@);
}
sub ACTION_docs {
my $self = shift;
$self->depends_on('code');
$self->depends_on('cmd_docs');
$self->depends_on('manpages', 'html');
}
sub man1page_name {
my ($self, $file) = @_;
$file =~ s/.pod$//;
return $self->SUPER::man1page_name($file);
}
EOS
);
my $build = $class->new(
module_name => 'Genome::Model::Tools::Music',
license => 'lgpl',
dist_abstract => 'find mutations of significance in cancer',
install_path => { 't' => $Config{vendorlib} . "/Genome/Model/Tools" },
build_requires => {
'Genome' => '0.06',
'Text::CSV_XS' => '',
'Regexp::Common' => '',
'Statistics::Distributions' => '',
},
requires => {
'Genome' => '0.06',
'Text::CSV_XS' => '',
'Regexp::Common' => '',
'Statistics::Distributions' => '',
},
bindoc_dirs => ['cmd-bindoc'],
);
$build->add_build_element('R');
$build->create_build_script;
Genome-Model-Tools-Music-0.04/Changes 000444 000765 000024 3743 12013522176 17502 0 ustar 00nnutter staff 000000 000000 Revision history for Genome::Model::Tools::Music
0.04 2012-08-15
- Implemented generalized linear models for regression analyses against clinical data
- Added tools to generate typical visualizations like Kaplan-Meier survival estimates, and mutation status matrices
- Support for TCGA Mutation Annotation Format (MAF) version 2.3
- Performance improvements in mutation rate calculations, and more efficient memory usage
- Added support for wiggle track format files describing coverage, if BAMs are unavailable
0.03 2012-04-29
[FEATURES]
- All: Added support for updated MAF format v2.2 (Mar 28th, 2012)
- All: Output header descriptions are friendlier, and comma-delimited outputs are now tab-delimited
- All: Added optional skipping of noncoding and silent muts for most modules
- ClinicalCorrelation: Added generalized linear model for phenotype-covariate relationships
- ClinicalCorrelation: Optional use of a sample-vs-gene matrix as input, instead of a MAF
- SMG: Uses the R multicore package to parallelize its tests
- SMG: Provide per-gene BMR modifiers to adjust for varying mutation accumulation rates across the genome
- CalcBmr: Use mutation-recurrence rates instead of mutation rates for SMG testing
- CalcBmr: Implemented clustered-sample BMRs for subgroup-wise or sample-wise SMG tests
- CalcBmr: Added an optional mutation category for truncation mutations
- Pfam: Now supports NCBI Build 37 variant loci
[BUG FIXES]
- All: Modules no longer remove chr-prefixes from user-provided chromosome names
- CalcBmr: Print a warning if the MAF contains a variant from a sample not in the bam-list
- calcRoiCovg: Fixed a potential memory leak
- CosmicOmim: Now treats chromosomes 23 and 24 in the COSMIC DB file as chromosomes X and Y, respectively
- CosmicOmim: Nucleotide changes corrected for splice site variants Genome-Model-Tools-Music-0.04/INSTALL 000444 000765 000024 67 12013522176 17174 0 ustar 00nnutter staff 000000 000000 perl Build.PL
./Build
./Build test
sudo Build install
Genome-Model-Tools-Music-0.04/LICENSE 000444 000765 000024 16743 12013522176 17240 0 ustar 00nnutter staff 000000 000000 GNU LESSER GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc.
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
This version of the GNU Lesser General Public License incorporates
the terms and conditions of version 3 of the GNU General Public
License, supplemented by the additional permissions listed below.
0. Additional Definitions.
As used herein, "this License" refers to version 3 of the GNU Lesser
General Public License, and the "GNU GPL" refers to version 3 of the GNU
General Public License.
"The Library" refers to a covered work governed by this License,
other than an Application or a Combined Work as defined below.
An "Application" is any work that makes use of an interface provided
by the Library, but which is not otherwise based on the Library.
Defining a subclass of a class defined by the Library is deemed a mode
of using an interface provided by the Library.
A "Combined Work" is a work produced by combining or linking an
Application with the Library. The particular version of the Library
with which the Combined Work was made is also called the "Linked
Version".
The "Minimal Corresponding Source" for a Combined Work means the
Corresponding Source for the Combined Work, excluding any source code
for portions of the Combined Work that, considered in isolation, are
based on the Application, and not on the Linked Version.
The "Corresponding Application Code" for a Combined Work means the
object code and/or source code for the Application, including any data
and utility programs needed for reproducing the Combined Work from the
Application, but excluding the System Libraries of the Combined Work.
1. Exception to Section 3 of the GNU GPL.
You may convey a covered work under sections 3 and 4 of this License
without being bound by section 3 of the GNU GPL.
2. Conveying Modified Versions.
If you modify a copy of the Library, and, in your modifications, a
facility refers to a function or data to be supplied by an Application
that uses the facility (other than as an argument passed when the
facility is invoked), then you may convey a copy of the modified
version:
a) under this License, provided that you make a good faith effort to
ensure that, in the event an Application does not supply the
function or data, the facility still operates, and performs
whatever part of its purpose remains meaningful, or
b) under the GNU GPL, with none of the additional permissions of
this License applicable to that copy.
3. Object Code Incorporating Material from Library Header Files.
The object code form of an Application may incorporate material from
a header file that is part of the Library. You may convey such object
code under terms of your choice, provided that, if the incorporated
material is not limited to numerical parameters, data structure
layouts and accessors, or small macros, inline functions and templates
(ten or fewer lines in length), you do both of the following:
a) Give prominent notice with each copy of the object code that the
Library is used in it and that the Library and its use are
covered by this License.
b) Accompany the object code with a copy of the GNU GPL and this license
document.
4. Combined Works.
You may convey a Combined Work under terms of your choice that,
taken together, effectively do not restrict modification of the
portions of the Library contained in the Combined Work and reverse
engineering for debugging such modifications, if you also do each of
the following:
a) Give prominent notice with each copy of the Combined Work that
the Library is used in it and that the Library and its use are
covered by this License.
b) Accompany the Combined Work with a copy of the GNU GPL and this license
document.
c) For a Combined Work that displays copyright notices during
execution, include the copyright notice for the Library among
these notices, as well as a reference directing the user to the
copies of the GNU GPL and this license document.
d) Do one of the following:
0) Convey the Minimal Corresponding Source under the terms of this
License, and the Corresponding Application Code in a form
suitable for, and under terms that permit, the user to
recombine or relink the Application with a modified version of
the Linked Version to produce a modified Combined Work, in the
manner specified by section 6 of the GNU GPL for conveying
Corresponding Source.
1) Use a suitable shared library mechanism for linking with the
Library. A suitable mechanism is one that (a) uses at run time
a copy of the Library already present on the user's computer
system, and (b) will operate properly with a modified version
of the Library that is interface-compatible with the Linked
Version.
e) Provide Installation Information, but only if you would otherwise
be required to provide such information under section 6 of the
GNU GPL, and only to the extent that such information is
necessary to install and execute a modified version of the
Combined Work produced by recombining or relinking the
Application with a modified version of the Linked Version. (If
you use option 4d0, the Installation Information must accompany
the Minimal Corresponding Source and Corresponding Application
Code. If you use option 4d1, you must provide the Installation
Information in the manner specified by section 6 of the GNU GPL
for conveying Corresponding Source.)
5. Combined Libraries.
You may place library facilities that are a work based on the
Library side by side in a single library together with other library
facilities that are not Applications and are not covered by this
License, and convey such a combined library under terms of your
choice, if you do both of the following:
a) Accompany the combined library with a copy of the same work based
on the Library, uncombined with any other library facilities,
conveyed under the terms of this License.
b) Give prominent notice with the combined library that part of it
is a work based on the Library, and explaining where to find the
accompanying uncombined form of the same work.
6. Revised Versions of the GNU Lesser General Public License.
The Free Software Foundation may publish revised and/or new versions
of the GNU Lesser General Public License from time to time. Such new
versions will be similar in spirit to the present version, but may
differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the
Library as you received it specifies that a certain numbered version
of the GNU Lesser General Public License "or any later version"
applies to it, you have the option of following the terms and
conditions either of that published version or of any later version
published by the Free Software Foundation. If the Library as you
received it does not specify a version number of the GNU Lesser
General Public License, you may choose any version of the GNU Lesser
General Public License ever published by the Free Software Foundation.
If the Library as you received it specifies that a proxy can decide
whether future versions of the GNU Lesser General Public License shall
apply, that proxy's public statement of acceptance of any version is
permanent authorization for you to choose that version for the
Library.
Genome-Model-Tools-Music-0.04/MANIFEST 000444 000765 000024 2520 12013522176 17330 0 ustar 00nnutter staff 000000 000000 Build.PL
Changes
INSTALL
lib/Genome/Model/Tools/Music.pm
lib/Genome/Model/Tools/Music/Base.pm
lib/Genome/Model/Tools/Music/Bmr.pm
lib/Genome/Model/Tools/Music/Bmr/Base.pm
lib/Genome/Model/Tools/Music/Bmr/CalcBmr.pm
lib/Genome/Model/Tools/Music/Bmr/CalcCovg.pm
lib/Genome/Model/Tools/Music/Bmr/CalcCovgHelper.pm
lib/Genome/Model/Tools/Music/Bmr/CalcWigCovg.pm
lib/Genome/Model/Tools/Music/ClinicalCorrelation.pm
lib/Genome/Model/Tools/Music/ClinicalCorrelation.pm.R
lib/Genome/Model/Tools/Music/CosmicOmim.pm
lib/Genome/Model/Tools/Music/Galaxy.pm
lib/Genome/Model/Tools/Music/MutationRelation.pm
lib/Genome/Model/Tools/Music/MutationRelation.pm.R
lib/Genome/Model/Tools/Music/PathScan.pm
lib/Genome/Model/Tools/Music/PathScan/CombinePvals.pm
lib/Genome/Model/Tools/Music/PathScan/PathScan.pm
lib/Genome/Model/Tools/Music/PathScan/PopulationPathScan.pm
lib/Genome/Model/Tools/Music/Pfam.pm
lib/Genome/Model/Tools/Music/Play.pm
lib/Genome/Model/Tools/Music/Plot.pm
lib/Genome/Model/Tools/Music/Plot/MutationRelation.pm
lib/Genome/Model/Tools/Music/Plot/MutationRelation.pm.R
lib/Genome/Model/Tools/Music/Proximity.pm
lib/Genome/Model/Tools/Music/Smg.pm
lib/Genome/Model/Tools/Music/Smg.pm.R
lib/Genome/Model/Tools/Music/Survival.pm
lib/Genome/Model/Tools/Music/Survival.pm.R
LICENSE
MANIFEST This list of files
MANIFEST.SKIP
META.yml
README
t/Play.t
META.json
Genome-Model-Tools-Music-0.04/MANIFEST.SKIP 000444 000765 000024 444 12013522176 20060 0 ustar 00nnutter staff 000000 000000 ^MYMETA.yml$
^.git
^debian/
^ubuntu-lucid/
^alt/
^dist-maint/
^MANIFEST.bak$
^_build/
^Build$
\.tar\.gz$
^blib
^genome-bindoc
^gmt-bindoc
^cmd-bindoc
^i
^.modulebuildrc
^gmt-web
rebuild-debian-install
galaxy.xml
^MYMETA\.json$
t/Music.t # floating point causes this to fail on other machines
Genome-Model-Tools-Music-0.04/META.json 000444 000765 000024 11033 12013522176 17637 0 ustar 00nnutter staff 000000 000000 {
"abstract" : "find mutations of significance in cancer",
"author" : [
"unknown"
],
"dynamic_config" : 1,
"generated_by" : "Module::Build version 0.3901, CPAN::Meta::Converter version 2.120630",
"license" : [
"open_source"
],
"meta-spec" : {
"url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
"version" : "2"
},
"name" : "Genome-Model-Tools-Music",
"prereqs" : {
"build" : {
"requires" : {
"Genome" : "0.06",
"Regexp::Common" : "0",
"Statistics::Distributions" : "0",
"Text::CSV_XS" : "0"
}
},
"configure" : {
"requires" : {
"Module::Build" : "0.39"
}
},
"runtime" : {
"requires" : {
"Genome" : "0.06",
"Regexp::Common" : "0",
"Statistics::Distributions" : "0",
"Text::CSV_XS" : "0"
}
}
},
"provides" : {
"Genome::Model::Tools::Music" : {
"file" : "lib/Genome/Model/Tools/Music.pm",
"version" : "0.04"
},
"Genome::Model::Tools::Music::Base" : {
"file" : "lib/Genome/Model/Tools/Music/Base.pm",
"version" : "0"
},
"Genome::Model::Tools::Music::Bmr" : {
"file" : "lib/Genome/Model/Tools/Music/Bmr.pm",
"version" : "0"
},
"Genome::Model::Tools::Music::Bmr::Base" : {
"file" : "lib/Genome/Model/Tools/Music/Bmr/Base.pm",
"version" : "0"
},
"Genome::Model::Tools::Music::Bmr::CalcBmr" : {
"file" : "lib/Genome/Model/Tools/Music/Bmr/CalcBmr.pm",
"version" : "0"
},
"Genome::Model::Tools::Music::Bmr::CalcCovg" : {
"file" : "lib/Genome/Model/Tools/Music/Bmr/CalcCovg.pm",
"version" : "0"
},
"Genome::Model::Tools::Music::Bmr::CalcCovgHelper" : {
"file" : "lib/Genome/Model/Tools/Music/Bmr/CalcCovgHelper.pm",
"version" : "0"
},
"Genome::Model::Tools::Music::Bmr::CalcWigCovg" : {
"file" : "lib/Genome/Model/Tools/Music/Bmr/CalcWigCovg.pm",
"version" : "0"
},
"Genome::Model::Tools::Music::ClinicalCorrelation" : {
"file" : "lib/Genome/Model/Tools/Music/ClinicalCorrelation.pm",
"version" : "0"
},
"Genome::Model::Tools::Music::CosmicOmim" : {
"file" : "lib/Genome/Model/Tools/Music/CosmicOmim.pm",
"version" : 0
},
"Genome::Model::Tools::Music::Galaxy" : {
"file" : "lib/Genome/Model/Tools/Music/Galaxy.pm",
"version" : "0"
},
"Genome::Model::Tools::Music::MutationRelation" : {
"file" : "lib/Genome/Model/Tools/Music/MutationRelation.pm",
"version" : "0"
},
"Genome::Model::Tools::Music::PathScan" : {
"file" : "lib/Genome/Model/Tools/Music/PathScan.pm",
"version" : "0"
},
"Genome::Model::Tools::Music::PathScan::CombinePvals" : {
"file" : "lib/Genome/Model/Tools/Music/PathScan/CombinePvals.pm",
"version" : 0
},
"Genome::Model::Tools::Music::PathScan::PathScan" : {
"file" : "lib/Genome/Model/Tools/Music/PathScan/PathScan.pm",
"version" : 0
},
"Genome::Model::Tools::Music::PathScan::PopulationPathScan" : {
"file" : "lib/Genome/Model/Tools/Music/PathScan/PopulationPathScan.pm",
"version" : 0
},
"Genome::Model::Tools::Music::Pfam" : {
"file" : "lib/Genome/Model/Tools/Music/Pfam.pm",
"version" : "0"
},
"Genome::Model::Tools::Music::Play" : {
"file" : "lib/Genome/Model/Tools/Music/Play.pm",
"version" : "0"
},
"Genome::Model::Tools::Music::Plot" : {
"file" : "lib/Genome/Model/Tools/Music/Plot.pm",
"version" : "0"
},
"Genome::Model::Tools::Music::Plot::MutationRelation" : {
"file" : "lib/Genome/Model/Tools/Music/Plot/MutationRelation.pm",
"version" : "0"
},
"Genome::Model::Tools::Music::Proximity" : {
"file" : "lib/Genome/Model/Tools/Music/Proximity.pm",
"version" : "0"
},
"Genome::Model::Tools::Music::Smg" : {
"file" : "lib/Genome/Model/Tools/Music/Smg.pm",
"version" : "0"
},
"Genome::Model::Tools::Music::Survival" : {
"file" : "lib/Genome/Model/Tools/Music/Survival.pm",
"version" : "0"
}
},
"release_status" : "stable",
"resources" : {
"license" : [
"http://opensource.org/licenses/lgpl-license.php"
]
},
"version" : "0.04"
}
Genome-Model-Tools-Music-0.04/META.yml 000444 000765 000024 6261 12013522176 17456 0 ustar 00nnutter staff 000000 000000 ---
abstract: 'find mutations of significance in cancer'
author:
- unknown
build_requires:
Genome: 0.06
Regexp::Common: 0
Statistics::Distributions: 0
Text::CSV_XS: 0
configure_requires:
Module::Build: 0.39
dynamic_config: 1
generated_by: 'Module::Build version 0.3901, CPAN::Meta::Converter version 2.120630'
license: open_source
meta-spec:
url: http://module-build.sourceforge.net/META-spec-v1.4.html
version: 1.4
name: Genome-Model-Tools-Music
provides:
Genome::Model::Tools::Music:
file: lib/Genome/Model/Tools/Music.pm
version: 0.04
Genome::Model::Tools::Music::Base:
file: lib/Genome/Model/Tools/Music/Base.pm
version: 0
Genome::Model::Tools::Music::Bmr:
file: lib/Genome/Model/Tools/Music/Bmr.pm
version: 0
Genome::Model::Tools::Music::Bmr::Base:
file: lib/Genome/Model/Tools/Music/Bmr/Base.pm
version: 0
Genome::Model::Tools::Music::Bmr::CalcBmr:
file: lib/Genome/Model/Tools/Music/Bmr/CalcBmr.pm
version: 0
Genome::Model::Tools::Music::Bmr::CalcCovg:
file: lib/Genome/Model/Tools/Music/Bmr/CalcCovg.pm
version: 0
Genome::Model::Tools::Music::Bmr::CalcCovgHelper:
file: lib/Genome/Model/Tools/Music/Bmr/CalcCovgHelper.pm
version: 0
Genome::Model::Tools::Music::Bmr::CalcWigCovg:
file: lib/Genome/Model/Tools/Music/Bmr/CalcWigCovg.pm
version: 0
Genome::Model::Tools::Music::ClinicalCorrelation:
file: lib/Genome/Model/Tools/Music/ClinicalCorrelation.pm
version: 0
Genome::Model::Tools::Music::CosmicOmim:
file: lib/Genome/Model/Tools/Music/CosmicOmim.pm
version: 0
Genome::Model::Tools::Music::Galaxy:
file: lib/Genome/Model/Tools/Music/Galaxy.pm
version: 0
Genome::Model::Tools::Music::MutationRelation:
file: lib/Genome/Model/Tools/Music/MutationRelation.pm
version: 0
Genome::Model::Tools::Music::PathScan:
file: lib/Genome/Model/Tools/Music/PathScan.pm
version: 0
Genome::Model::Tools::Music::PathScan::CombinePvals:
file: lib/Genome/Model/Tools/Music/PathScan/CombinePvals.pm
version: 0
Genome::Model::Tools::Music::PathScan::PathScan:
file: lib/Genome/Model/Tools/Music/PathScan/PathScan.pm
version: 0
Genome::Model::Tools::Music::PathScan::PopulationPathScan:
file: lib/Genome/Model/Tools/Music/PathScan/PopulationPathScan.pm
version: 0
Genome::Model::Tools::Music::Pfam:
file: lib/Genome/Model/Tools/Music/Pfam.pm
version: 0
Genome::Model::Tools::Music::Play:
file: lib/Genome/Model/Tools/Music/Play.pm
version: 0
Genome::Model::Tools::Music::Plot:
file: lib/Genome/Model/Tools/Music/Plot.pm
version: 0
Genome::Model::Tools::Music::Plot::MutationRelation:
file: lib/Genome/Model/Tools/Music/Plot/MutationRelation.pm
version: 0
Genome::Model::Tools::Music::Proximity:
file: lib/Genome/Model/Tools/Music/Proximity.pm
version: 0
Genome::Model::Tools::Music::Smg:
file: lib/Genome/Model/Tools/Music/Smg.pm
version: 0
Genome::Model::Tools::Music::Survival:
file: lib/Genome/Model/Tools/Music/Survival.pm
version: 0
requires:
Genome: 0.06
Regexp::Common: 0
Statistics::Distributions: 0
Text::CSV_XS: 0
resources:
license: http://opensource.org/licenses/lgpl-license.php
version: 0.04
Genome-Model-Tools-Music-0.04/README 000444 000765 000024 2632 12013522176 17063 0 ustar 00nnutter staff 000000 000000 Genome::Music
The README is used to introduce the module and provide instructions on
how to install the module, any machine dependencies it may have (for
example C compilers and installed libraries) and any other information
that should be provided before the module is installed.
A README file is required for CPAN modules since CPAN extracts the README
file from a module distribution so that people browsing the archive
can use it to get an idea of the module's uses. It is usually a good idea
to provide version information here so that people can decide whether
fixes for the module are worth downloading.
INSTALLATION
To install this module, run the following commands:
perl Makefile.PL
make
make test
make install
SUPPORT AND DOCUMENTATION
After installing, you can find documentation for this module with the
perldoc command.
perldoc Genome::Music
You can also look for information at:
RT, CPAN's request tracker
http://rt.cpan.org/NoAuth/Bugs.html?Dist=Genome::Music
AnnoCPAN, Annotated CPAN documentation
http://annocpan.org/dist/Genome::Music
CPAN Ratings
http://cpanratings.perl.org/d/Genome::Music
Search CPAN
http://search.cpan.org/dist/Genome::Music/
COPYRIGHT AND LICENCE
Copyright (C) 2010 The Genome Center at Washington University
This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
Genome-Model-Tools-Music-0.04/lib 000755 000765 000024 0 12013522176 16611 5 ustar 00nnutter staff 000000 000000 Genome-Model-Tools-Music-0.04/lib/Genome 000755 000765 000024 0 12013522176 20023 5 ustar 00nnutter staff 000000 000000 Genome-Model-Tools-Music-0.04/lib/Genome/Model 000755 000765 000024 0 12013522176 21063 5 ustar 00nnutter staff 000000 000000 Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools 000755 000765 000024 0 12013522176 22163 5 ustar 00nnutter staff 000000 000000 Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music.pm 000444 000765 000024 6506 12013522176 23745 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music;
use strict;
use warnings;
use Genome;
#bugfix version needs to be encoded as extra precision after the decimal point ie: .0401
our $VERSION = '0.04';
class Genome::Model::Tools::Music {
is => ['Command::Tree'],
doc => 'Mutational Significance in Cancer (Cancer Mutation Analysis)'
};
sub _doc_manual_body {
return < command runs all of the sub-commands serially on a selected input set.
EOS
}
sub _doc_copyright_years {
(2007,2011);
}
sub _doc_license {
my $self = shift;
my (@y) = $self->_doc_copyright_years;
return <(1)',
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music 000755 000765 000024 0 12013522176 23243 5 ustar 00nnutter staff 000000 000000 Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Base.pm 000444 000765 000024 2545 12013522176 24616 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::Base;
use strict;
use warnings;
use Genome;
our $VERSION = $Genome::Model::Tools::Music::VERSION;
class Genome::Model::Tools::Music::Base {
is => ['Command::V2'],
is_abstract => 1,
attributes_have => [
file_format => {
is => 'Text',
is_optional => 1,
}
],
doc => "Mutational Significance In Cancer (Cancer Mutation Analysis)"
};
sub _doc_copyright_years {
(2010,2011);
}
sub _doc_license {
my $self = shift;
my (@y) = $self->_doc_copyright_years;
return <(1), B(1)
EOS
}
sub _doc_manual_body {
my $help = shift->help_detail;
$help =~ s/\n+$/\n/g;
return $help;
}
sub help_detail {
return "This tool is part of the MuSiC suite. See:\n"
. join("\n",shift->_doc_see_also)
. "\n";
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Bmr.pm 000444 000765 000024 2223 12013522176 24455 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::Bmr;
use warnings;
use strict;
use Genome;
our $VERSION = $Genome::Model::Tools::Music::VERSION;
class Genome::Model::Tools::Music::Bmr {
#is => ['Command::Tree','Genome::Model::Tools::Music::Base'],
is => ['Command::Tree'],
doc => "Calculate gene coverages and background mutation rates."
};
sub _doc_copyright_years {
(2010,2011);
}
sub _doc_license {
my $self = shift;
my (@y) = $self->_doc_copyright_years;
return <(1),
B(1),
B(1),
B(1)
EOS
}
sub _doc_manual_body {
return shift->help_detail;
}
sub help_detail {
return "These tools are part of the MuSiC suite.\n";
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/ClinicalCorrelation.pm 000444 000765 000024 45255 12013522176 27711 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::ClinicalCorrelation;
use warnings;
use strict;
use Carp;
use Genome;
use IO::File;
use POSIX qw( WIFEXITED );
our $VERSION = $Genome::Model::Tools::Music::VERSION;
class Genome::Model::Tools::Music::ClinicalCorrelation {
is => 'Genome::Model::Tools::Music::Base',
has_input => [
bam_list => {
is => 'Text',
doc => "Tab delimited list of BAM files [sample_name, normal_bam, tumor_bam] (See Description)",
},
maf_file => {
is => 'Text', is_optional => 1,
doc => "List of mutations using TCGA MAF specification v2.3",
},
output_file => {
is_output => 1, is => 'Text',
doc => "Results of clinical-correlation tool. Will have suffix added for data type",
},
clinical_correlation_matrix_file => {
is => 'Text', is_optional => 1,
doc => "Specify a file to store the sample-vs-gene matrix created during calculations",
},
input_clinical_correlation_matrix_file => {
is => 'Text', is_optional => 1,
doc => "Instead of creating this from the MAF, input the sample-vs-gene matrix for calculations",
},
genetic_data_type => {
is => 'Text', is_optional => 1, default => "gene",
doc => "Correlate clinical data to \"gene\" or \"variant\" level data",
},
numeric_clinical_data_file => {
is => 'Text', is_optional => 1,
doc => "Table of samples (y) vs. numeric clinical data category (x)",
},
numerical_data_test_method => {
is => 'Text', is_optional => 1, default => 'cor',
doc => "Either 'cor' for Pearson Correlation or 'wilcox' for the Wilcoxon Rank-Sum Test for numerical clinical data",
},
categorical_clinical_data_file => {
is => 'Text', is_optional => 1,
doc => "Table of samples (y) vs. categorical clinical data category (x)",
},
glm_model_file => {
is => 'Text', is_optional => 1,
doc => "File outlining the type of model, response variable, covariants, etc. for the GLM analysis. (See DESCRIPTION)",
},
glm_clinical_data_file => {
is => 'Text', is_optional => 1,
doc => "Clinical traits, mutational profiles, other mixed clinical data (See DESCRIPTION)",
},
use_maf_in_glm => {
is => 'Boolean', is_optional => 1, default => 0,
doc => "Create a variant matrix from the MAF file as variant input to GLM analysis.",
},
skip_non_coding => {
is => 'Boolean', is_optional => 1, default => 1,
doc => "Skip non-coding mutations from the provided MAF file",
},
skip_silent => {
is => 'Boolean', is_optional => 1, default => 1,
doc => "Skip silent mutations from the provided MAF file",
},
],
doc => "Correlate phenotypic traits against mutated genes, or against individual variants",
};
sub help_synopsis {
return <bam_list;
my $output_file = $self->output_file;
my $genetic_data_type = $self->genetic_data_type;
# check genetic data type
unless( $genetic_data_type =~ /^gene|variant$/i ) {
$self->error_message("Please enter either \"gene\" or \"variant\" for the --genetic-data-type parameter.");
return;
}
# load clinical data and analysis types
my %clinical_data;
if( $self->numeric_clinical_data_file ) {
$clinical_data{'numeric'} = $self->numeric_clinical_data_file;
}
if( $self->categorical_clinical_data_file ) {
$clinical_data{'categ'} = $self->categorical_clinical_data_file;
}
if( $self->glm_clinical_data_file ) {
$clinical_data{'glm'} = $self->glm_clinical_data_file;
}
my $glm_model = $self->glm_model_file;
# declarations
my @all_sample_names; # names of all the samples, no matter if it's mutated or not
# parse out the sample names from the bam-list which should match the names in the MAF file
my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!\n";
while( my $line = $sampleFh->getline ) {
next if ( $line =~ m/^#/ );
chomp( $line );
my ( $sample ) = split( /\t/, $line );
push( @all_sample_names, $sample );
}
$sampleFh->close;
# loop through clinical data files
for my $datatype ( keys %clinical_data ) {
my $test_method;
my $full_output_filename;
if( $datatype =~ /numeric/i ) {
$full_output_filename = $output_file . ".numeric.csv";
$test_method = $self->numerical_data_test_method;
}
if( $datatype =~ /categ/i ) {
$full_output_filename = $output_file . ".categorical.csv";
$test_method = "fisher";
}
if( $datatype =~ /glm/i ) {
$full_output_filename = $output_file . ".glm.csv";
$test_method = "glm";
}
#read through clinical data file to see which samples are represented and create input matrix for R
my %samples;
my $matrix_file;
my $samples = \%samples;
my $clin_fh = new IO::File $clinical_data{$datatype},"r";
unless( $clin_fh ) {
die "failed to open $clinical_data{$datatype} for reading: $!";
}
my $header = $clin_fh->getline;
while( my $line = $clin_fh->getline ) {
chomp $line;
my ( $sample ) = split( /\t/, $line );
$samples{$sample}++;
}
#create correlation matrix unless it's glm analysis without using a maf file
unless(( $datatype =~ /glm/i && !$self->use_maf_in_glm ) || $self->input_clinical_correlation_matrix_file ) {
if( $genetic_data_type =~ /^gene$/i ) {
$matrix_file = $self->create_sample_gene_matrix_gene( $samples, $clinical_data{$datatype}, @all_sample_names );
}
elsif( $genetic_data_type =~ /^variant$/i ) {
$matrix_file = $self->create_sample_gene_matrix_variant( $samples, $clinical_data{$datatype}, @all_sample_names );
}
else {
$self->error_message( "Please enter either \"gene\" or \"variant\" for the --genetic-data-type parameter." );
return;
}
}
if( $self->input_clinical_correlation_matrix_file ) {
$matrix_file = $self->input_clinical_correlation_matrix_file;
}
unless( defined $matrix_file ) { $matrix_file = "'*'"; }
#set up R command
my $R_cmd = "R --slave --args < " . __FILE__ . ".R $test_method ";
if( $datatype =~ /glm/i ) {
$R_cmd .= "$glm_model $clinical_data{$datatype} $matrix_file $full_output_filename";
}
else {
$R_cmd .= "$clinical_data{$datatype} $matrix_file $full_output_filename";
}
#run R command
print "R_cmd:\n$R_cmd\n";
WIFEXITED( system $R_cmd ) or croak "Couldn't run: $R_cmd ($?)";
}
return( 1 );
}
sub create_sample_gene_matrix_gene {
my ( $self, $samples, $clinical_data_file, @all_sample_names ) = @_;
my $output_matrix = $self->clinical_correlation_matrix_file;
#create a hash of mutations from the MAF file
my ( %mutations, %all_genes, @all_genes );
#parse the MAF file and fill up the mutation status hashes
my $maf_fh = IO::File->new( $self->maf_file ) or die "Couldn't open MAF file!\n";
while( my $line = $maf_fh->getline ) {
next if( $line =~ m/^(#|Hugo_Symbol)/ );
chomp $line;
my @cols = split( /\t/, $line );
my ( $gene, $mutation_class, $sample ) = @cols[0,8,15];
#check that the mutation class is valid
if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) {
$self->error_message( "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\nPlease use TCGA MAF v2.3.\n" );
return;
}
#check if sample exists in clinical data
unless( defined $samples->{$sample} ) {
warn "Sample Name: $sample from MAF file does not exist in Clinical Data File";
next;
}
# If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
if(( $self->skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) ||
( $self->skip_silent && $mutation_class =~ m/^Silent$/ )) {
print "Skipping $mutation_class mutation in gene $gene.\n";
next;
}
$all_genes{$gene}++;
$mutations{$sample}{$gene}++;
}
$maf_fh->close;
#sort @all_genes for consistency
@all_genes = sort keys %all_genes;
#write the input matrix for R code to a file
my $matrix_fh;
unless (defined $output_matrix) {
$output_matrix = Genome::Sys->create_temp_file_path();
}
$matrix_fh = new IO::File $output_matrix,"w";
unless ($matrix_fh) {
die "Failed to create matrix file $output_matrix!: $!";
}
#print input matrix file header
my $header = join("\t","Sample",@all_genes);
$matrix_fh->print("$header\n");
#print mutation relation input matrix
for my $sample (sort @all_sample_names) {
$matrix_fh->print($sample);
for my $gene (@all_genes) {
if (exists $mutations{$sample}{$gene}) {
$matrix_fh->print("\t$mutations{$sample}{$gene}");
}
else {
$matrix_fh->print("\t0");
}
}
$matrix_fh->print("\n");
}
return $output_matrix;
}
sub create_sample_gene_matrix_variant {
my ( $self, $samples, $clinical_data_file, @all_sample_names ) = @_;
my $output_matrix = $self->clinical_correlation_matrix_file;
#create hash of mutations from the MAF file
my ( %variants_hash, %all_variants );
#parse the MAF file and fill up the mutation status hashes
my $maf_fh = IO::File->new( $self->maf_file ) or die "Couldn't open MAF file!\n";
while( my $line = $maf_fh->getline ) {
next if( $line =~ m/^(#|Hugo_Symbol)/ );
chomp $line;
my @cols = split( /\t/, $line );
my ( $gene, $chr, $start, $stop, $mutation_class, $mutation_type, $ref, $var1, $var2, $sample ) = @cols[0,4..6,8..12,15];
#check that the mutation class is valid
if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) {
$self->error_message( "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\nPlease use TCGA MAF v2.3.\n" );
return;
}
unless( exists $samples->{$sample} ) {
warn "Sample Name: $sample from MAF file does not exist in Clinical Data File";
next;
}
# If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
if(( $self->skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) ||
( $self->skip_silent && $mutation_class =~ m/^Silent$/ )) {
print "Skipping $mutation_class mutation in gene $gene.\n";
next;
}
my $var;
my $variant_name;
if( $ref eq $var1 ) {
$var = $var2;
$variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
$variants_hash{$sample}{$variant_name}++;
$all_variants{$variant_name}++;
}
elsif( $ref eq $var2 ) {
$var = $var1;
$variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
$variants_hash{$sample}{$variant_name}++;
$all_variants{$variant_name}++;
}
elsif( $ref ne $var1 && $ref ne $var2 ) {
$var = $var1;
$variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
$variants_hash{$sample}{$variant_name}++;
$all_variants{$variant_name}++;
$var = $var2;
$variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
$variants_hash{$sample}{$variant_name}++;
$all_variants{$variant_name}++;
}
}
$maf_fh->close;
#sort variants for consistency
my @variant_names = sort keys %all_variants;
#write the input matrix for R code to a file
my $matrix_fh;
unless( defined $output_matrix ) {
$output_matrix = Genome::Sys->create_temp_file_path();
}
$matrix_fh = new IO::File $output_matrix,"w";
unless( $matrix_fh ) {
die "Failed to create matrix file $output_matrix!: $!";
}
#print input matrix file header
my $header = join( "\t", "Sample", @variant_names );
$matrix_fh->print("$header\n");
#print mutation relation input matrix
for my $sample ( sort @all_sample_names ) {
$matrix_fh->print( $sample );
for my $variant ( @variant_names ) {
if( exists $variants_hash{$sample}{$variant} ) {
$matrix_fh->print("\t$variants_hash{$sample}{$variant}");
}
else {
$matrix_fh->print("\t0");
}
}
$matrix_fh->print("\n");
}
return $output_matrix;
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/ClinicalCorrelation.pm.R 000444 000765 000024 23402 12013522176 30077 0 ustar 00nnutter staff 000000 000000 #determine which test method to use
method = as.character(commandArgs()[4]);
if (method == "glm") {
### This program is for Generalized Linear Model (GLM) analysis
### Y = X + covar1 + covar2 ......
### Usually (not limited),
### Y is clinical trait (quantitative or binary)
### X is variant/gene/mutation etc.
### covar means covariate
### by Qunyuan Zhang (qunyuan@wustl.edu), 02/16/2012 updated
### input options
model.file = as.character(commandArgs()[5]);
y.file = as.character(commandArgs()[6]);
x.file = as.character(commandArgs()[7]);
out.file = as.character(commandArgs()[8]);
x.names="*";
### to run it on command line, type
# R --no-save < glm.R model.file y.file x.file * out.csv
# or (if you want to define x variables)
# R --no-save < glm.R model.csv y.file x.file x1|x2|x3 out.csv
# or (if x.file has been merged into y.file)
# R --no-save < glm.R model.csv y.file * x1|x2|x3 out.csv
# or (if you have defined x variable in model.file)
# R --no-save < glm.R model.csv y.file * * out.csv
### about model.file
# column "type": Q=quantitative trait, B=binary trait
# column "y": trait name
# colum "x": variant/gene name; if x=NA or blank, it will determined by x.file and x.names
# column "cvar": covariate(s)
# tab delimited
### about y.file
# trait data file, column 1 must be sample id
# tab delimited
### about x.file
# usually mutation/variant/geene data file
# the first column must be sample id (the same as in y.file, ordered or not)
# tab delimited
# x.file="*" if x.file already merged into y.file
### about x.names
# x.names="*" will use all column names in x.file as x variable names
# or you can define it in the format x.names="gene1|gene2|gene3"
# self-defined x.names have to be found in column names of y.file and/or x.file
#################### myglm fuction ##############
myglm=function(z,trait,variant,covar=NA,ytype) {
if (nchar(covar)==0 | is.na(covar) | is.null(covar)) {
model=formula(paste(trait,"~",variant))
} else {
model=formula(paste(trait,"~",variant,"+",covar))
}
if (ytype=="B") fit=glm(formula=model,data=z,family=binomial(link = "logit"))
if (ytype=="Q") fit=glm(formula=model,data=z,family=gaussian(link = "identity"))
fit
}
#################################################
### data input #####
read.table(model.file,colClasses="character",na.strings = c("","NA"),sep="\t",header=T)->md
read.table(y.file,na.strings = c("","NA"),sep="\t",header=T)->y
if (x.names!="*") x.names=strsplit(x.names,split="[|]")[[1]]
if (x.file!="*")
{
read.table(x.file,na.strings = c("","NA"),sep="\t",header=T)->x
xid=colnames(x)[1]
xs=colnames(x)[-1]
if (x.names!="*") {x=x[,c(xid,x.names)];xs=colnames(x)[-1]}
x.names=xs
yid=colnames(y)[1]
ysid = ! (colnames(y) %in% xs)
y=y[,ysid]
if (sum(ysid)==1) {y=data.frame(id=y);colnames(y)[1]=yid}
#y=merge(y,x,by.x = xid, by.y = yid)
y=merge(x,y,by.x = xid, by.y = yid)
}
######### analysis ##########
tt=NULL
for (i in c(1:nrow(md)))
{
ytype=md[i,1];yi=md[i,2];xs=md[i,3];covi=md[i,4];memo=md[i,5]
if (!is.na(xs) & nchar(xs)>0) xs=strsplit(xs,split="[|]")[[1]]
if (is.na(xs)[1]|nchar(xs)[1]==0) xs=x.names
if (length(covi)==0) covi=NA
for (xi in xs)
{
print(yi); print(xi); print(covi); print("******")
if (ytype=="Q") try(anova(myglm(y,yi,xi,covi,ytype),test="F"))->fit
if (ytype=="B") try(anova(myglm(y,yi,xi,covi,ytype),test="Chisq"))->fit
if (class(fit)[1]!="try-error")
{
fit=as.matrix(fit)
if (xi %in% rownames(fit)) tt=rbind(tt, cbind(yi,ytype,xi,as.data.frame(t(fit[xi,])),covi,memo))
}
}
}
#"yi","ytype","xi","Df","Deviance","Resid. Df","Resid. Dev","F","Pr(>F)","covi","memo"
if (ytype=="Q") colnames(tt) = c("y","y_type","x","degrees_freedom","deviance","residual_degrees_freedom","residual_deviance","F_statistic","p-value","covariants","memo");
if (ytype=="B") colnames(tt) = c("y","y_type","x","degrees_freedom","deviance","residual_degrees_freedom","residual_deviance","p-value","covariants","memo");
write.table(tt,file=out.file,quote=F,sep="\t",row.names=F);
} else {
# else, we process numerical or categorical clinical data correlation
clinical_data = as.character(commandArgs()[5]);
mutation_matrix = as.character(commandArgs()[6]);
output_file = as.character(commandArgs()[7]);
# FUNCTION finds the correlation between two variables
cor2=function(ty,tx,method)
{
id=intersect(!is.na(ty),!is.na(tx));
ty=ty[id];
tx=tx[id];
if(method=="cor")
{
tst=cor.test(tx,ty);
s=tst$est;
p=tst$p.value;
}
if(method=="wilcox") #x must be (0,1) mutation data
{
tst=wilcox.test(x=ty[tx==0],y=ty[tx>=1])
s=tst$stat
p=tst$p.value
}
if(method=="chisq")
{
tst=chisq.test(tx,ty);
s=tst$stat;
p=tst$p.value;
}
if(method=="fisher")
{
tst=fisher.test(tx,ty)
s=tst$p.value
p=tst$p.value
}
if(method=="anova")
{
tst=summary(aov(ty~tx,as.data.frame(cbind(tx,ty))))
s=tst[[1]]$F[1]
p=tst[[1]]$Pr[1]
}
tt=c(p,s);
tt;
}
# END cor2
# FUNCTION runs correlation test on matrixes of data
cor2test =function(y,x=NULL,method="cor",cutoff=1,sep="\t",outf=NULL)
{
if (!is.null(x))
{
if (length(x)==1) {read.table(x,header=T,sep=sep)->x;}
if (length(y)==1) {read.table(y,header=T,sep=sep)->y;}
colnames(y)[1]="id";
colnames(x)[1]="id";
tt=character(0);
for (vi in colnames(x)[-1])
{
for (vj in colnames(y)[-1])
{
tx=x[,c("id",vi)];
tx=tx[!is.na(tx[,vi]),];
tx=tx[!duplicated(tx[,"id"]),];
ty=y[,c("id",vj)];
ty=ty[!is.na(ty[,vj]),];
ty=ty[!duplicated(ty[,"id"]),];
xy=merge(tx,ty,by.x="id",by.y="id");
tx=xy[,2];
ty=xy[,3];
n=length(xy[,"id"]);
rst=try(cor2(ty,tx,method));
if (class(rst)=="try-error") {p=NA;s=NA;} else {p=rst[1];s=rst[2];}
t=c(vi,vj,method,n,s,p)
tt=rbind(tt,t);
} #end vj
} #end vi
rownames(tt)=NULL;
colnames(tt)=c("x","y","method","n","s","p");
tt=as.data.frame(tt);
tt[,"s"]=as.character(tt[,"s"]);
tt[,"s"]=as.numeric(tt[,"s"]);
tt[,"p"]=as.character(tt[,"p"]);
tt[,"p"]=as.numeric(tt[,"p"]);
fdr=p.adjust(tt[,"p"],method="fdr");
bon=p.adjust(tt[,"p"],method="bon");
tt=cbind(tt,fdr,bon);
tt=tt[order(tt[,"p"]),];
}
if (is.null(x))
{
if (length(y)==1) {read.table(y,header=T,sep=sep)->y;}
x=y;
nxy=ncol(y)-1;
colnames(y)[1]="id";
colnames(x)[1]="id"
tt=character(0);
for (i in c(1:(nxy-1)))
{
for (j in c((i+1):nxy))
{
vi=colnames(x)[-1][i];
vj=colnames(y)[-1][j];
tx=x[,c("id",vi)];
tx=tx[!is.na(tx[,vi]),];
tx=tx[!duplicated(tx[,"id"]),]
ty=y[,c("id",vj)];
ty=ty[!is.na(ty[,vj]),];
ty=ty[!duplicated(ty[,"id"]),];
xy=merge(tx,ty,by.x="id",by.y="id");
tx=xy[,2];
ty=xy[,3];
n=length(xy[,"id"]);
rst=try(cor2(ty,tx,method));
if (class(rst)=="try-error") {p=NA;s=NA;} else {p=rst[1];s=rst[2];}
t=c(vi,vj,method,n,s,p);
tt=rbind(tt,t);
} #end vj
} #end vi
rownames(tt)=NULL;
colnames(tt)=c("x","y","method","n","s","p");
tt=as.data.frame(tt);
tt[,"s"]=as.character(tt[,"s"]);
tt[,"s"]=as.numeric(tt[,"s"]);
tt[,"p"]=as.character(tt[,"p"]);
tt[,"p"]=as.numeric(tt[,"p"]);
fdr=p.adjust(tt[,"p"],method="fdr");
bon=p.adjust(tt[,"p"],method="bon");
tt=cbind(tt,fdr,bon);
tt=tt[order(tt[,"p"]),];
}
if (!is.null(outf))
{
colnames(tt)=c("x","y","method","n","s","p","fdr","bon");
#The amount of precision that R prints with is somehow machine dependent (or the R version?)
tt[,"s"] = sapply(tt[,"s"], sprintf, fmt="%.4E");
tt[,"p"] = sapply(tt[,"p"], sprintf, fmt="%.4E");
tt[,"fdr"] = sapply(tt[,"fdr"], sprintf, fmt="%.2E");
tt[,"bon"] = sapply(tt[,"bon"], sprintf, fmt="%.2E");
#The ordering should be done after reformatting the precision (duh)
tt=tt[order(tt[,"x"]),];
tt=tt[order(tt[,"p"]),];
write.table(tt,file=outf,quote=FALSE,row.names=FALSE,sep=",");
}
invisible(tt);
}
#END cor2test
#run correlation test using function
cor2test(y = clinical_data, x = mutation_matrix, method = method, outf = output_file);
}
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/CosmicOmim.pm 000444 000765 000024 144312 12013522176 26042 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::CosmicOmim;
#__STANDARD PERL PACKAGES
use warnings;
use strict;
use Genome;
use FileHandle;
use Text::CSV_XS;
class Genome::Model::Tools::Music::CosmicOmim {
is => 'Genome::Model::Tools::Music::Base',
has_input => [
maf_file => {
is => 'Path', file_format => 'maf',
doc => 'list of annotated mutations in MAF format (or any file with MAF+annotation headers)',
},
output_file => {
is => 'Path', is_output => 1,
doc => 'Output file contains the input file with two columns appended to the end, corresponding to cosmic and omim mutation comparisons, respectively',
},
reference_build => {
is => 'Text', default => 'Build37',
doc => 'Put either "Build36" or "Build37"',
}
],
has_optional_input => [
omimaa_dir => {
is => 'Path', default => Genome::Sys->dbpath( 'omim', 'latest' ),
doc => 'omim amino acid mutation database folder',
},
cosmic_dir => {
is => 'Path', default => Genome::Sys->dbpath( 'cosmic', 'latest' ),
doc => 'cosmic amino acid mutation database folder',
},
verbose => {
is => 'Boolean', default => 0,
doc => 'Use this to display the larger working output',
},
wu_annotation_headers => {
is => 'Boolean', default => 0,
doc => "Use this if input MAF contains WUSTL annotation format headers",
},
aa_range => {
is => 'Integer', default => 2,
doc => "Set how close a 'near' match is when searching for amino acid near hits",
},
nuc_range => {
is => 'Integer', default => 5,
doc => "Set how close a 'near' match is when searching for nucleotide position near hits",
},
show_known_hits => {
is => 'Boolean', default => 1,
doc => "When a finding is novel, show known AA in that gene",
},
],
doc => 'Compare the amino acid changes of supplied mutations to COSMIC and OMIM databases.'
};
sub help_synopsis {
return <maf_file;
my $cosmic_dir = $self->cosmic_dir;
my $basename = $self->output_file;
my $ref_build = $self->reference_build;
unless($ref_build =~ m/build36/i xor $ref_build =~ m/build37/i){
$self->error_message("You must either specify reference_build as either \"Build36\" or \"Build37\"");
die $self->error_message;
}
my $omimaa_dir = $self->omimaa_dir;
my $omimaa;
if (-d $omimaa_dir){
$omimaa = "$omimaa_dir/omim_aa.csv";
unless (-e $omimaa) {
$omimaa = "$omimaa_dir/OMIM_aa_will.csv";
}
}
my $cosmic_database_file = 'Cosmic_Database.tsv';
my $cosmic_database;
if (-d $cosmic_dir){
$cosmic_database = "$cosmic_dir/$cosmic_database_file";
}
my $verbose = $self->verbose;
my $wuheaders = $self->wu_annotation_headers;
$self->status_message("Using $omimaa as omima db file");
$self->status_message("Using $cosmic_dir as cosmic db folder");
my $aa_range = $self->aa_range;
my $nuc_range = $self->nuc_range;
#####################
# MAIN PROCESSING #
#####################
#Set Stats hash that counts everything
my %stats;
$stats{'OMIMDB'} = 0;
$stats{'OMIM'}{'doublematch'} = 0;
$stats{'OMIM'}{'ntmatch'} = 0;
$stats{'OMIM'}{'aamatch'} = 0;
$stats{'OMIM'}{'posmatch'} = 0;
$stats{'OMIM'}{'ntposmatch'} = 0;
$stats{'OMIM'}{'aaposmatch'} = 0;
$stats{'OMIM'}{'nearmatch'} = 0;
$stats{'OMIM'}{'ntnearmatch'} = 0;
$stats{'OMIM'}{'aanearmatch'} = 0;
$stats{'OMIM'}{'novel'} = 0;
$stats{'OMIM'}{'ntnovel'} = 0;
$stats{'OMIM'}{'aanovel'} = 0;
$stats{'OMIM'}{'silent'} = 0;
$stats{'OMIM'}{'nomatch'} = 0;
$stats{'COSMICDB'} = 0;
$stats{'COSMIC'}{'doublematch'} = 0;
$stats{'COSMIC'}{'ntmatch'} = 0;
$stats{'COSMIC'}{'aamatch'} = 0;
$stats{'COSMIC'}{'posmatch'} = 0;
$stats{'COSMIC'}{'ntposmatch'} = 0;
$stats{'COSMIC'}{'aaposmatch'} = 0;
$stats{'COSMIC'}{'nearmatch'} = 0;
$stats{'COSMIC'}{'ntnearmatch'} = 0;
$stats{'COSMIC'}{'aanearmatch'} = 0;
$stats{'COSMIC'}{'novel'} = 0;
$stats{'COSMIC'}{'ntnovel'} = 0;
$stats{'COSMIC'}{'aanovel'} = 0;
$stats{'COSMIC'}{'silent'} = 0;
$stats{'COSMIC'}{'nomatch'} = 0;
#__PARSE MUTATION DATA
my $fh = new FileHandle;
unless ($fh->open (qq{$mut_file})) { die "Could not open mutation project file '$mut_file' for reading"; }
if ($verbose) {print "Parsing mutation file...\n";}
my $mutation = ParseMutationFile($fh, $mut_file, $wuheaders);
$fh->close;
if ($verbose) {print "Done Parsing Mutation File! Yippee!\n";}
#__READ IN OMIM FILE
my %omimaa;
if (defined($omimaa) && -e $omimaa) {
open(OMIMAA,$omimaa) || die "Could not open omim file '$omimaa'";
if ($verbose) {print "Loading OMIM Database\n";}
my $omimaa_header = ;
while() {
$stats{'OMIMDB'}++;
chomp;
my ($gene, $omim_entry, $position, $aa_ori, $aa_mut, $description, $diseases) = split("\t");
$omimaa{$gene}{$omim_entry}{$position}{residue1} = $aa_ori;
$omimaa{$gene}{$omim_entry}{$position}{residue2} = $aa_mut;
$omimaa{$gene}{$omim_entry}{$position}{description} = $description;
}
if ($verbose) {print "Finished Loading OMIM Database\n";}
close(OMIMAA);
}
#__READ IN COSMIC FILE
my %cosmic_gene;
my %cosmic_position;
my %cosmic_position_only;
my %cosmic_tissue;
my %cosmic_histology;
my %aa_count;
my %residue_match;
if (defined($cosmic_database) && -e $cosmic_database) {
my ($gene_col, $chr_col, $start_col, $stop_col, $chr_col_37, $start_col_37, $stop_col_37, $amino_col, $nucleo_col, $somatic_col, $primary_tissue_col, $tissue_sub_1_col, $tissue_sub_2_col, $histology_col,
$histology_sub_1_col, $histology_sub_2_col, $gene, $chr, $start, $stop, $chr_37, $start_37, $stop_37, $amino, $nucleo, $somatic, $primary_tissue, $tissue_sub_1, $tissue_sub_2, $histology, $histology_sub_1,
$histology_sub_2);
open(COSMIC,$cosmic_database) || die "Could not open omim file '$cosmic_database'";
if ($verbose) {print "Loading COSMIC Database\n";}
my $cosmic_header = ;
chomp($cosmic_header);
my @parser = split(/\t/, $cosmic_header);
my $parsecount = 0;
my %parsehash;
foreach my $item (@parser) {
$parsehash{$item} = $parsecount;
$parsecount++;
}
$gene_col = $parsehash{'Gene'};
$chr_col = $parsehash{'Chromosome'};
$start_col = $parsehash{'Genome Start'};
$stop_col = $parsehash{'Genome Stop'};
$chr_col_37 = $parsehash{'Chromosome Build37'};
$start_col_37 = $parsehash{'Genome Start Build37'};
$stop_col_37 = $parsehash{'Genome Stop Build37'};
$amino_col = $parsehash{'Amino Acid'};
$nucleo_col = $parsehash{'Nucleotide'};
$somatic_col = $parsehash{'Somatic Status'};
$primary_tissue_col = $parsehash{'Primary_Tissue'};
$tissue_sub_1_col = $parsehash{'Tissue_subtype_1'};
$tissue_sub_2_col = $parsehash{'Tissue_subtype_2'};
$histology_col = $parsehash{'Histology'};
$histology_sub_1_col = $parsehash{'Histology_subtype_1'};
$histology_sub_2_col = $parsehash{'Histology_subtype_2'};
while(my $line = ) {
$stats{'COSMICDB'}++;
chomp($line);
my @parser = split(/\t/, $line);
$gene = $parser[$gene_col];
if ($ref_build =~ m/build36/i) {
$chr = $parser[$chr_col];
$start = $parser[$start_col];
$stop = $parser[$stop_col];
}
else {
$chr = $parser[$chr_col_37];
$start = $parser[$start_col_37];
$stop = $parser[$stop_col_37];
}
$chr =~ s/NotListed/ /;
$start =~ s/NotListed/ /;
$stop =~ s/NotListed/ /;
$chr =~ s/23/X/;
$chr =~ s/24/Y/;
$amino = $parser[$amino_col];
$nucleo = $parser[$nucleo_col];
$somatic = $parser[$somatic_col];
$primary_tissue = $parser[$primary_tissue_col];
$tissue_sub_1 = $parser[$tissue_sub_1_col];
$tissue_sub_2 = $parser[$tissue_sub_2_col];
$histology = $parser[$histology_col];
$histology_sub_1 = $parser[$histology_sub_1_col];
$histology_sub_2 = $parser[$histology_sub_2_col];
my ($residue1, $res_start, $residue2, $res_stop, $new_residue);
if ($amino eq 'E597A') {
$amino = 'p.E597A';
}
if ($amino =~ m/^p/) {
unless ($amino =~ m/^p\./) {
$amino =~ s/^p/p\./;
}
}
if ($amino =~ m/^P\./) {
$amino =~ s/^P\./p\./;
}
$amino =~ s/\s+$//;
$amino =~ s/^\s+//;
($residue1, $res_start, $residue2, $res_stop, $new_residue) = AA_Check($amino);
if (defined $res_start){
if ($res_start == $res_stop){
$residue_match{$gene}{$res_start}{$res_stop}{$residue1}{$residue2}++;
my $addition = $residue1.$res_start.$residue2;
$aa_count{$gene}{$addition}++;
}
else {
my $addition = $residue1.$res_start."-".$res_stop.$residue2;
$aa_count{$gene}{$addition}++;
}
}
else {
if ($amino =~ m/^p\.\?/ || $amino =~ m/^p\.\>/ || $amino eq 'p.INS' || $amino eq 'p.DEL' || $amino eq 'p.fs*?' || $amino eq 'p.fs') {
my $addition = $amino;
$addition =~ s/p\.//;
$aa_count{$gene}{$addition}++;
}
else {
print "$residue1, $res_start, $residue2, $res_stop, $new_residue\n";
die "$amino not found\n";
}
}
if (defined($nucleo)) {
$cosmic_gene{$gene}{$amino}{$nucleo}++;
}
else {
$cosmic_gene{$gene}{$amino}++;
}
if (defined($chr) && defined($start) && defined($stop) && $chr ne ' ' && $start ne ' ' && $stop ne ' ') {
$cosmic_position{$chr}{$start}{$stop}{$gene}++;
if (defined($nucleo)) {
$cosmic_position_only{$chr}{$start}{$stop}{$nucleo}++;
}
else {
$cosmic_position_only{$chr}{$start}{$stop}++;
}
}
$cosmic_tissue{$gene}{$amino} = "$primary_tissue\t$tissue_sub_1\t$tissue_sub_2";
$cosmic_histology{$gene}{$amino} = "$histology\t$histology_sub_1\t$histology_sub_2";
}
close(COSMIC);
if ($verbose) {print "Finished Loading COSMIC Database! Hooray!\n";}
}
my %cosmic_results;
my %omim_results;
my $summary_file = $basename;
unless (open(SUMMARY,">$summary_file")) {
die "Could not open output file '$summary_file' for writing";
}
unless ($fh->open (qq{$mut_file})) { die "Could not open mutation project file '$mut_file' for reading"; }
my %fileline;
my $i = 1;
while (my $filehandleline = <$fh>) {
chomp $filehandleline;
while ($filehandleline =~ /^#/) {
print SUMMARY "$filehandleline\n";
$filehandleline = <$fh>;
chomp $filehandleline;
}
$fileline{$i} = $filehandleline;
$i++;
}
$fh->close;
print SUMMARY "$fileline{'1'}\tInput_MAF_Line_Number\tCosmic_Results\tOMIM_Results\n";
if ($verbose) {print "Starting COSMIC/OMIM to Mutation File Comparisons\n";}
foreach my $hugo (sort keys %{$mutation}) {
foreach my $sample (keys %{$mutation->{$hugo}}) {
foreach my $line_num (keys %{$mutation->{$hugo}->{$sample}}) {
if ($verbose) {print ".";} #report that we are starting a sample
#read in the alleles. The keys may change with future file formats. If so, a new version should be added to
# my ($entrez_gene_id, $line, $aa_change,$transcript,$mstatus,$Variant_Type,$Chromosome,$Start_position,$End_position,$Reference_Allele,$Tumor_Seq_Allele1,$gene) =
my ($entrez_gene_id, $line, $aa_change,$transcript,$Variant_Type,$Chromosome,$Start_position,$End_position,$Reference_Allele,$Tumor_Seq_Allele1,$Tumor_Seq_Allele2,$gene) =
(
$mutation->{$hugo}->{$sample}->{$line_num}->{ENTREZ_GENE_ID},
$mutation->{$hugo}->{$sample}->{$line_num}->{file_line},
$mutation->{$hugo}->{$sample}->{$line_num}->{AA_CHANGE},
$mutation->{$hugo}->{$sample}->{$line_num}->{TRANSCRIPT},
# $mutation->{$hugo}->{$sample}->{$line_num}->{MUTATION_STATUS},
$mutation->{$hugo}->{$sample}->{$line_num}->{VARIANT_TYPE},
$mutation->{$hugo}->{$sample}->{$line_num}->{CHROMOSOME},
$mutation->{$hugo}->{$sample}->{$line_num}->{START_POSITION},
$mutation->{$hugo}->{$sample}->{$line_num}->{END_POSITION},
$mutation->{$hugo}->{$sample}->{$line_num}->{REFERENCE_ALLELE},
$mutation->{$hugo}->{$sample}->{$line_num}->{TUMOR_SEQ_ALLELE1},
$mutation->{$hugo}->{$sample}->{$line_num}->{TUMOR_SEQ_ALLELE2},
$mutation->{$hugo}->{$sample}->{$line_num}->{HUGO_SYMBOL},
);
# if ($mstatus){
#Annotate the allele's effect on all known (ie transcript without the 'unknown' status) transcripts
my $proper_allele;
if ($Reference_Allele ne $Tumor_Seq_Allele1) {
$proper_allele = $Tumor_Seq_Allele1;
}
elsif($Reference_Allele ne $Tumor_Seq_Allele2) {
$proper_allele = $Tumor_Seq_Allele2;
}
else {
warn "line $line_num has both Tumor_Seq_Allele1 ($Tumor_Seq_Allele1) and Tumor_Seq_Allele2 ($Tumor_Seq_Allele2) as the Reference ($Reference_Allele)\n";
$proper_allele = $Tumor_Seq_Allele2;
}
#LOOK FOR ONLY SINGLE CHARACTER PROPER ALLELE TYPES - A, C, T, G, 0, or -
unless($Reference_Allele ne $proper_allele) {
die "Ref allele: $Reference_Allele same as mutation allele: $proper_allele ('line num' $line_num)";
}
unless($Reference_Allele =~ /[ACTG0\-]/ && $proper_allele =~ /[ACTG0\-]/) {
die "Read in improper alleles from mutation file ref: $Reference_Allele var: $proper_allele ('line num' $line_num)";
}
## SHOULD NEVER BE '--' BECAUSE THE CODE WILL THINK THIS IS A DINUCLEOTIDE POLYMORPHISM INSTEAD OF AN INDEL, SO CHANGE THESE
$Reference_Allele = '-' if $Reference_Allele eq '--';
$proper_allele = '-' if $proper_allele eq '--';
chomp($line_num);
my %results_hash;
#parse the amino acid string
my ($residue1, $res_start, $residue2, $res_stop, $new_residue) = AA_Check($aa_change);
if(defined($Start_position) && defined($End_position) && (!$residue2 || $residue2 eq ' ')){
if ($verbose) {print "Skipping Non-coding (Silent) Mutation";}
my $find_type = $self->CheckPositionMatch($Start_position, $End_position, $Reference_Allele, $proper_allele, \%cosmic_position_only, \%cosmic_position);
my $createspreadsheet;
if ($find_type =~ m/no_match/) {
$createspreadsheet = "$fileline{$line_num}\t$line_num\tSkipped - Silent Mutation\tSkipped - Silent Mutation";
$stats{'COSMIC'}{'silent'}++;
$stats{'OMIM'}{'silent'}++;
}
elsif ($find_type =~ m/position/) {
my ($findtype,$cosmic_genes,$chr,$gen_start,$gen_stop) = split(":",$find_type);
$results_hash{NT}{POSITION}{COSMIC}{$transcript}=": Nucleotide -> Cosmic Gene(s):$cosmic_genes Position:$chr:$gen_start-$gen_stop";
my $matchtype_cosmic;
($cosmic_results{$line_num}, $matchtype_cosmic) = score_results(\%results_hash, "COSMIC");
$stats{'COSMIC'}{$matchtype_cosmic}++;
$stats{'OMIM'}{'silent'}++;
$createspreadsheet = "$fileline{$line_num}\t$line_num\t$cosmic_results{$line_num}\tSkipped - Silent Mutation";
}
elsif ($find_type =~ m/position_nucleotide/) {
my ($findtype,$cosmic_genes,$chr,$gen_start,$gen_stop,$reference,$mutant) = split(":",$find_type);
$results_hash{NT}{MATCH}{COSMIC}{$transcript} = ": Nucleotide -> Cosmic Gene(s): $cosmic_genes Position:$chr:$gen_start-$gen_stop,ref:$reference,mut:$mutant";
my $matchtype_cosmic;
($cosmic_results{$line_num}, $matchtype_cosmic) = score_results(\%results_hash, "COSMIC");
$stats{'COSMIC'}{$matchtype_cosmic}++;
$stats{'OMIM'}{'silent'}++;
$createspreadsheet = "$fileline{$line_num}\t$line_num\t$cosmic_results{$line_num}\tSkipped - Silent Mutation";
}
elsif ($find_type =~ m/near_match/) {
my ($findtype,$diff_position) = split(":",$find_type);
$results_hash{NT}{ALMOST}{COSMIC}{$transcript}=": Nucleotide $diff_position bases away";
my $matchtype_cosmic;
($cosmic_results{$line_num}, $matchtype_cosmic) = score_results(\%results_hash, "COSMIC");
$stats{'COSMIC'}{$matchtype_cosmic}++;
$stats{'OMIM'}{'silent'}++;
$createspreadsheet = "$fileline{$line_num}\t$line_num\t$cosmic_results{$line_num}\tSkipped - Silent Mutation";
}
print SUMMARY "$createspreadsheet\n";
next; #skip silent mutations
}
#look for mutation file gene ($hugo) to match cosmic file gene (%cosmic_gene{gene}), note matched name in $cosmic_hugo
my $cosmic_hugo;
my $uc_hugo = uc($hugo);
if (exists($cosmic_gene{$hugo})) {
$cosmic_hugo = $hugo;
}
elsif (exists($cosmic_gene{$uc_hugo})) { # check for UPPERCASE hugo match
$cosmic_hugo = $uc_hugo;
}
else {
#if cosmic key needs to be uppercase to match mutation file (for example, maf default is all uppercase)
foreach my $key (keys %cosmic_gene) { # check for UPPERCASE keys match
if ($uc_hugo eq uc($key)) {
$cosmic_hugo = $key;
}
}
}
#genes that didn't find a match go here, will check for position matches later
unless (defined($cosmic_hugo)) {
if (-e "$cosmic_dir/$hugo\.csv") { #database flatfile has only genes with AA changes, check source files for gene existance
$results_hash{NT}{NOVEL}{COSMIC}{$transcript}=": Gene $hugo in Cosmic but No Amino Acid Results for Gene";
}
else {
$results_hash{NT}{NOVEL}{COSMIC}{$transcript}=": Gene $hugo not in Cosmic Database";
}
}
#retrieve COSMIC match
if (!defined($aa_change) || $aa_change eq 'NULL') {
warn "We skipped silent mutations, how do we have undel or NULL amino acids at non-silent sites? Gene:$hugo AA:$aa_change\n";
}
#Start checks,First check position (then check amino acid)
my $cosmic_find_type;
my @aa_holder;
if(defined($Start_position) && defined($End_position) && $Start_position ne ' ' && $End_position ne ' ') {
my $find_type = $self->CheckPositionMatch($Start_position, $End_position, $Reference_Allele, $proper_allele, \%cosmic_position_only, \%cosmic_position);
if ($find_type =~ m/no_match/) {
$results_hash{NT}{NOVEL}{COSMIC}{$transcript}=": Nucleotide";
}
elsif ($find_type =~ m/position/) {
my ($findtype,$cosmic_genes,$chr,$gen_start,$gen_stop) = split(":",$find_type);
$results_hash{NT}{POSITION}{COSMIC}{$transcript}=": Nucleotide -> Cosmic Gene(s):$cosmic_genes Position:$chr:$gen_start-$gen_stop";
}
elsif ($find_type =~ m/position_nucleotide/) {
my ($findtype,$cosmic_genes,$chr,$gen_start,$gen_stop,$reference,$mutant) = split(":",$find_type);
$results_hash{NT}{MATCH}{COSMIC}{$transcript} = ": Nucleotide -> Cosmic Gene(s): $cosmic_genes Position:$chr:$gen_start-$gen_stop,ref:$reference,mut:$mutant";
}
elsif ($find_type =~ m/near_match/) {
my ($findtype,$diff_position) = split(":",$find_type);
$results_hash{NT}{ALMOST}{COSMIC}{$transcript}=": Nucleotide $diff_position bases away";
}
}
#check amino acid here
if ($cosmic_hugo && exists($cosmic_gene{$cosmic_hugo})) {
$cosmic_find_type = 'no_match';
foreach my $key (keys %{$aa_count{$cosmic_hugo}}) {
if ($key =~ m/\S+/) {
push(@aa_holder,"$key ($aa_count{$cosmic_hugo}{$key})");
}
}
unless (@aa_holder) {
@aa_holder = "AA?Unknown?";
}
if ($res_start && $res_stop && exists($residue_match{$cosmic_hugo}) && exists($residue_match{$cosmic_hugo}{$res_start}) && exists($residue_match{$cosmic_hugo}{$res_start}{$res_stop})) { #match amino acid
$cosmic_find_type = 'position';
$results_hash{AA}{POSITION}{COSMIC}{$transcript}=": Amino Acid -> Matched $cosmic_hugo, $res_start, $res_stop";
if ($residue1 && $residue2 && ($residue_match{$cosmic_hugo}{$res_start}{$res_stop}{$residue1}{$residue2} || $residue_match{$cosmic_hugo}{$res_start}{$res_stop}{uc($residue1)}{$residue2} || $residue_match{$cosmic_hugo}{$res_start}{$res_stop}{$residue1}{uc($residue2)} || $residue_match{$cosmic_hugo}{$res_start}{$res_stop}{uc($residue1)}{uc($residue2)})) { # matches both amino acid and amino acid position
$cosmic_find_type = 'position_aminoacid';
my $addition;
if ($res_start == $res_stop){
$addition = $residue1.$res_start.$residue2;
}
else {
$addition = $residue1.$res_start."-".$res_stop.$residue2;
}
$results_hash{AA}{MATCH}{COSMIC}{$transcript} = ": Amino Acid -> Matched $cosmic_hugo, $addition";
}
}
if($cosmic_find_type && $cosmic_find_type eq 'no_match') {
if ($self->show_known_hits) {
$results_hash{AA}{NOVEL}{COSMIC}{$transcript}=": Amino Acid -> Known AA = @aa_holder";
}
else {
$results_hash{AA}{NOVEL}{COSMIC}{$transcript}=": Amino Acid -> Known AA Not Shown";
}
my $iter_start = $res_start - $aa_range;
my $iter_stop = $res_stop + $aa_range;
my $iter;
for($iter = $iter_start; $iter <= $iter_stop; $iter++) {
if ($res_start && $res_stop && exists($residue_match{$cosmic_hugo}) && exists($residue_match{$cosmic_hugo}{$iter})) {
if ($self->show_known_hits) {
$results_hash{AA}{ALMOST}{COSMIC}{$transcript} = ": Amino Acid -> Known AA for Gene = @aa_holder";
}
else {
$results_hash{AA}{ALMOST}{COSMIC}{$transcript} = ": Amino Acid -> Known AA Not Shown";
}
}
}
}
}
#retrieve OMIM match
my $omim_find_type;
my $omim = \%omimaa;
if (exists($omim->{$hugo})) {
$omim_find_type = FindOMIM(\%omimaa,$hugo,$res_start,$res_stop,$residue1,$residue2,$aa_range);
}
#Add OMIM result to the results hash
if(defined($omim_find_type)) {
if ($omim_find_type eq 'position_aminoacid') {
$results_hash{AA}{MATCH}{OMIM}{$transcript} =": Amino Acid";
}
elsif ($omim_find_type eq 'position') {
$results_hash{AA}{POSITION}{OMIM}->{$transcript}=": Amino Acid";
}
elsif ($omim_find_type eq 'almost') {
$results_hash{AA}{ALMOST}{OMIM}->{$transcript}=": Amino Acid";
}
else {
$results_hash{AA}{NOVEL}{OMIM}->{$transcript}=": Amino Acid";
}
}
else {
$results_hash{AA}{NOVEL}{OMIM}->{$transcript}=": Amino Acid - OMIM Gene Name Not Found";
}
#now check to see what the 'best' cosmic score was
my $matchtype_cosmic;
($cosmic_results{$line_num}, $matchtype_cosmic) = score_results(\%results_hash, "COSMIC");
$stats{'COSMIC'}{$matchtype_cosmic}++;
#now check to see what the 'best' omim score was
my $matchtype_omim;
($omim_results{$line_num}, $matchtype_omim) = score_results(\%results_hash, "OMIM");
$stats{'OMIM'}{$matchtype_omim}++;
my $createspreadsheet = "$fileline{$line_num}\t$line_num\t$cosmic_results{$line_num}\t$omim_results{$line_num}";
print SUMMARY "$createspreadsheet\n";
# }
}
}
}
close(SUMMARY);
if ($verbose) {print "Finished COSMIC/OMIM to Mutation File Comparisons! HAPPY HAPPY JOY JOY!\n";}
print "\n";
print "Number of Genes in OMIM: $stats{'OMIMDB'}\n";
print "Number of AA and NT Matches: $stats{'OMIM'}{'doublematch'}\n";
print "Number of NT only Matches: $stats{'OMIM'}{'ntmatch'}\n";
print "Number of AA only Matches: $stats{'OMIM'}{'aamatch'}\n";
print "Number of AA and NT Position Matches: $stats{'OMIM'}{'posmatch'}\n";
print "Number of NT only Position Matches: $stats{'OMIM'}{'ntposmatch'}\n";
print "Number of AA only Position Matches: $stats{'OMIM'}{'aaposmatch'}\n";
print "Number of NT and AA Novel Sites with Matches in Near Proximity: $stats{'OMIM'}{'nearmatch'}\n";
print "Number of NT only Novel Sites with Matches in Near Proximity: $stats{'OMIM'}{'ntnearmatch'}\n";
print "Number of AA only Novel Sites with Matches in Near Proximity: $stats{'OMIM'}{'aanearmatch'}\n";
print "Number of NT and AA Novel Sites with Nothing in Near Proximity: $stats{'OMIM'}{'novel'}\n";
print "Number of NT Novel Sites with Nothing in Near Proximity: $stats{'OMIM'}{'ntnovel'}\n";
print "Number of AA Novel Sites with Nothing in Near Proximity: $stats{'OMIM'}{'aanovel'}\n";
print "Number of Silent Mutations Skipped: $stats{'OMIM'}{'silent'}\n";
print "Number of Lines that Exited with No Hit: $stats{'OMIM'}{'nomatch'}\n";
print "\n";
print "Number of Genes in COSMIC: $stats{'COSMICDB'}\n";
print "Number of AA and NT Matches: $stats{'COSMIC'}{'doublematch'}\n";
print "Number of NT only Matches: $stats{'COSMIC'}{'ntmatch'}\n";
print "Number of AA only Matches: $stats{'COSMIC'}{'aamatch'}\n";
print "Number of AA and NT Position Matches: $stats{'COSMIC'}{'posmatch'}\n";
print "Number of NT only Position Matches: $stats{'COSMIC'}{'ntposmatch'}\n";
print "Number of AA only Position Matches: $stats{'COSMIC'}{'aaposmatch'}\n";
print "Number of NT and AA Novel Sites with Matches in Near Proximity: $stats{'COSMIC'}{'nearmatch'}\n";
print "Number of NT only Novel Sites with Matches in Near Proximity: $stats{'COSMIC'}{'ntnearmatch'}\n";
print "Number of AA only Novel Sites with Matches in Near Proximity: $stats{'COSMIC'}{'aanearmatch'}\n";
print "Number of NT and AA Novel Sites with Nothing in Near Proximity: $stats{'COSMIC'}{'novel'}\n";
print "Number of NT Novel Sites with Nothing in Near Proximity: $stats{'COSMIC'}{'ntnovel'}\n";
print "Number of AA Novel Sites with Nothing in Near Proximity: $stats{'COSMIC'}{'aanovel'}\n";
print "Number of Silent Mutations Skipped: $stats{'COSMIC'}{'silent'}\n";
print "Number of Lines that Exited with No Hit: $stats{'COSMIC'}{'nomatch'}\n";
return 1;
}
################################################################################
# #
# S U B R O U T I N E S #
# #
################################################################################
sub CheckPositionMatch {
my $self = shift;
my $genomic_start = shift;
my $genomic_stop = shift;
my $nt1 = shift;
my $nt2 = shift;
my $cp_only = shift;
my %cosmic_position_only = %{$cp_only};
my $cp_all = shift;
my %cosmic_position = %{$cp_all};
my $verbose = $self->verbose;
my $nuc_range = $self->nuc_range;
my $find_type = 'no_match';
foreach my $chr (sort keys %cosmic_position_only) {
# Test that it at least matches position
foreach my $gen_start (keys %{$cosmic_position_only{$chr}}) {
my $diff_start = $gen_start - $genomic_start;
if ($gen_start == $genomic_start) {
foreach my $gen_stop (keys %{$cosmic_position_only{$chr}{$gen_start}}) {
my $diff_stop = $gen_stop - $genomic_stop;
if ($gen_stop == $genomic_stop) {
my $cosmic_genes;
if (keys %{$cosmic_position{$chr}{$gen_start}{$gen_stop}}) {
my @cosmic_genes = keys %{$cosmic_position{$chr}{$gen_start}{$gen_stop}};
$cosmic_genes = join(",",@cosmic_genes);
}
if ($find_type eq 'no_match' || $find_type eq 'near_match') {
$find_type = "position:$cosmic_genes:$chr:$gen_start:$gen_stop";
}
# Test that it matches both
foreach my $nucleo (keys %{$cosmic_position_only{$chr}{$gen_start}{$gen_stop}}) {
my ($start,$stop,$type_length,$type,$reference,$mutant) = parse_nucleotide($nucleo,$verbose);
if($reference && $mutant && $reference eq $nt1 && $mutant eq $nt2) {
$find_type = "position_nucleotide:$cosmic_genes:$chr:$gen_start:$gen_stop:$reference:$mutant";
return $find_type;
}
}
}
elsif ($diff_stop <= $nuc_range && $diff_stop >= -$nuc_range) {
if ($find_type eq 'no_match') {
$find_type = "near_match:$diff_stop";
}
}
}
}
elsif ($diff_start <= $nuc_range && $diff_start >= -$nuc_range) {
if ($find_type eq 'no_match') {
$find_type = "near_match:$diff_start";
}
}
}
}
return $find_type;
}
sub FindOMIM {
my ($omim, $hugo,$res_start, $res_stop, $residue1, $residue2, $aa_range) = @_;
my $return_value = 'no_match';
unless (exists($omim->{$hugo})) {
warn "No omim entry for: $hugo";
return $return_value;
}
foreach my $sample (keys %{$omim->{$hugo}}) {
# Test that it at least matches position
if (exists($omim->{$hugo}{$sample}{$res_start})) {
$return_value = 'position';
# Test that it matches both
if (exists($omim->{$hugo}{$sample}{$res_start}{residue1}) &&
exists($omim->{$hugo}{$sample}{$res_start}{residue2}) &&
defined($omim->{$hugo}{$sample}{$res_start}{residue1}) &&
defined($omim->{$hugo}{$sample}{$res_start}{residue2}) &&
uc($omim->{$hugo}{$sample}{$res_start}{residue1}) eq uc($residue1) &&
uc($omim->{$hugo}{$sample}{$res_start}{residue2}) eq uc($residue2)) {
return 'position_aminoacid';
}
}
elsif ($return_value eq 'no_match') {
my $iter_start = $res_start - $aa_range;
my $iter_stop = $res_stop + $aa_range;
my $iter;
for($iter = $iter_start; $iter <= $iter_stop; $iter++) {
if (exists($omim->{$hugo}{$sample}{$iter})) {
$return_value = 'almost';
}
}
}
}
return $return_value;
}
sub score_results {
my ($results, $database) = @_;
my $matchtype;
if(exists($results->{NT}{MATCH}->{$database}) && exists($results->{AA}{MATCH}->{$database})) {
#best hit was a DOUBLE MATCH. Huzzah!
$matchtype = 'doublematch';
my ($transcript) = keys %{$results->{NT}{MATCH}{$database}};
my ($transcript2) = keys %{$results->{AA}{MATCH}{$database}};
my $ret_value = "NT and AA Match".$results->{AA}{MATCH}->{$database}{$transcript2}." and ".$results->{NT}{MATCH}->{$database}{$transcript};
return ($ret_value, $matchtype);
}
elsif(exists($results->{NT}{MATCH}->{$database})) {
#best hit was a MATCH. Huzzah!
$matchtype = 'ntmatch';
my ($transcript) = keys %{$results->{NT}{MATCH}{$database}};
my $ret_value = "Match".$results->{NT}{MATCH}->{$database}{$transcript};
return ($ret_value, $matchtype);
}
elsif(exists($results->{AA}{MATCH}->{$database})) {
#best hit was a MATCH. Huzzah!
$matchtype = 'aamatch';
my ($transcript) = keys %{$results->{AA}{MATCH}{$database}};
my $ret_value = "Match".$results->{AA}{MATCH}->{$database}{$transcript};
return ($ret_value, $matchtype);
}
elsif(exists($results->{NT}{POSITION}->{$database}) && exists($results->{AA}{POSITION}->{$database})) {
#best hit was a position match
$matchtype = 'posmatch';
my ($transcript) = keys %{$results->{NT}{POSITION}{$database}};
my ($transcript2) = keys %{$results->{AA}{POSITION}{$database}};
my $ret_value = "NT and AA Position Match".$results->{AA}{POSITION}->{$database}{$transcript2}." and ".$results->{NT}{POSITION}->{$database}{$transcript};
return ($ret_value, $matchtype);
}
elsif(exists($results->{NT}{POSITION}->{$database})) {
#best hit was a position match
$matchtype = 'ntposmatch';
my ($transcript) = keys %{$results->{NT}{POSITION}{$database}};
my $ret_value = "Position Match".$results->{NT}{POSITION}->{$database}{$transcript};
return ($ret_value, $matchtype);
}
elsif(exists($results->{AA}{POSITION}->{$database})) {
#best hit was a position match
$matchtype = 'aaposmatch';
my ($transcript) = keys %{$results->{AA}{POSITION}{$database}};
my $ret_value = "Position Match".$results->{AA}{POSITION}->{$database}{$transcript};
return ($ret_value, $matchtype);
}
elsif(exists($results->{NT}{ALMOST}->{$database}) && exists($results->{AA}{ALMOST}->{$database})) {
#best hit was near a position match
$matchtype = 'nearmatch';
my ($transcript) = keys %{$results->{NT}{ALMOST}{$database}};
my ($transcript2) = keys %{$results->{AA}{ALMOST}{$database}};
my $ret_value = "NT and AA Novel, but near match".$results->{AA}{ALMOST}->{$database}{$transcript2}." and ".$results->{NT}{ALMOST}->{$database}{$transcript};
return ($ret_value, $matchtype);
}
elsif(exists($results->{NT}{ALMOST}->{$database})) {
#best hit was near a position match
$matchtype = 'ntnearmatch';
my ($transcript) = keys %{$results->{NT}{ALMOST}{$database}};
my $ret_value = "Novel, but near match".$results->{NT}{ALMOST}->{$database}{$transcript};
return ($ret_value, $matchtype);
}
elsif(exists($results->{AA}{ALMOST}->{$database})) {
#best hit was near a position match
$matchtype = 'aanearmatch';
my ($transcript) = keys %{$results->{AA}{ALMOST}{$database}};
my $ret_value = "Novel, but near match".$results->{AA}{ALMOST}->{$database}{$transcript};
return ($ret_value, $matchtype);
}
elsif(exists($results->{AA}{NOVEL}->{$database}) && exists($results->{NT}{NOVEL}->{$database})) {
#no hits, novel
$matchtype = 'novel';
my ($transcript) = keys %{$results->{AA}{NOVEL}{$database}};
my ($transcript2) = keys %{$results->{NT}{NOVEL}{$database}};
my $ret_value = "Novel".$results->{AA}{NOVEL}->{$database}{$transcript}. " and ".$results->{NT}{NOVEL}->{$database}{$transcript2};
return ($ret_value, $matchtype);
}
elsif(exists($results->{AA}{NOVEL}->{$database})) {
#no hits, novel
$matchtype = 'aanovel';
my ($transcript) = keys %{$results->{AA}{NOVEL}{$database}};
my $ret_value = "Novel".$results->{AA}{NOVEL}->{$database}{$transcript};
return ($ret_value, $matchtype);
}
elsif(exists($results->{NT}{NOVEL}->{$database})) {
#no hits, novel
$matchtype = 'ntnovel';
my ($transcript) = keys %{$results->{NT}{NOVEL}{$database}};
my $ret_value = "Novel".$results->{NT}{NOVEL}->{$database}{$transcript};
return ($ret_value, $matchtype);
}
else {
#it was a nomatch! this shouldn't happen.
$matchtype = 'nomatch';
my ($transcript) = keys %{$results->{NOMATCH}};
my $ret_value = (defined($transcript) && $results->{NOMATCH}{$transcript}) ? $results->{NOMATCH}{$transcript} : "Unknown/NULL";
return ($ret_value, $matchtype);
}
}
sub parse_nucleotide {
my ($string, $verbose) = @_;
my ($change, $modifier);
my ($start,$stop,$type_length,$type,$reference,$mutant);
($start,$stop,$change) = $string =~ /^c\. (\d+) _ (\d+) (.*) $/x;
unless (defined $change) {
($start,$change) = $string =~ /^c\. (\d+) (.*) $/x;
}
if ($string =~ /^c\.\w* \Q?\E.* /x) {
#ambiguous entry
return;
}
if( defined $change) {
#first check to make sure its not intronic
if ($change =~ /^ (-|\+) (\d+) (.*) $/x) {
if ($verbose) {print "Mutation Appears Intronic";}
$change = $3;
}
#This could be one of several possiblities
#First distinguish between > notation and del18 notation
($reference,$mutant) = $change =~ /^ (\D*) > (\D+) $/x;
if(defined($reference) && $reference eq '') {
$reference = undef;
}
if(defined $mutant) {
#Then we expect that this format is correct.
if((defined($stop) && $stop eq $start) || !defined($stop)) {
#We have a snp
$type = 'SNP';
$type_length = 1;
}
else {
#assuming that if it is listed explicitly it is an indel
$type = 'indel';
if(defined $reference) {
$type_length = length $reference;
}
else {
$type_length = abs($start-$stop)+1;
}
}
return ($start, $stop, $type_length, $type, $reference, $mutant);
}
else {
#did not guess right. Should be either del15 or insAAT type of format
$type = substr $change, 0,3;
$modifier = substr $change, 3, (length($change) - 1);
if($type =~ /^ (del|ins|delins) $/xi) {
if($type eq 'delins') {
$type = 'indel';
}
#then insertion
if($modifier =~ /^ (\d+) $/x) {
#it is a digit
$type_length = $1;
return ($start, $stop, $type_length, $type, $reference, $mutant);
}
else {
#it is a sequence
$type_length = length $modifier;
return ($start, $stop, $type_length, $type, $reference, $modifier);
}
}
else {
#unrecognized format
warn "Unable to parse nucleotide format in: $string\n";
return;
}
}
}
else {
warn "Unable to parse nucleotide format in: $string\n";
return;
}
}
sub ParseMutationFile {
my ($fh,$file,$wuheaders) = @_;
my $source = 'mutation_csv'; # 'CSV'
my $keyfields = 'HUGO_SYMBOL:TUMOR_SAMPLE_ID:file_line_num';
my $field_subset_array = undef;
my $header_translation;
if ($wuheaders) {
$header_translation = {
'chromosome_name' => 'CHROMOSOME',
'start' => 'START_POSITION',
'stop' => 'END_POSITION',
'reference' => 'REFERENCE_ALLELE',
'variant' => 'TUMOR_SEQ_ALLELE1',
# 'variant' => 'TUMOR_SEQ_ALLELE2',
'type' => 'VARIANT_TYPE',
'gene_name' => 'HUGO_SYMBOL',
'transcript_name' => 'TRANSCRIPT',
'strand' => 'TUMOR_SAMPLE_ID', #meaningless proxy
'amino_acid_change' => 'AA_CHANGE',
};
}
else {
$header_translation = {
'Chromosome' => 'CHROMOSOME',
'Start_position' => 'START_POSITION',
'End_position' => 'END_POSITION',
'Reference_Allele' => 'REFERENCE_ALLELE',
'Tumor_Seq_Allele1' => 'TUMOR_SEQ_ALLELE1',
'Tumor_Seq_Allele2' => 'TUMOR_SEQ_ALLELE2',
'Variant_Type' => 'VARIANT_TYPE',
'Hugo_Symbol' => 'HUGO_SYMBOL',
'transcript_name' => 'TRANSCRIPT',
'Strand' => 'TUMOR_SAMPLE_ID', #meaningless proxy
'amino_acid_change' => 'AA_CHANGE',
};
}
my $header_skip = 0;
my $no_header = 0;
my $header_fields = undef;
my $ucheader_fields = undef;
my $separator = "\t";
my $no_spaces = 0;
my $line_number_field = 'file_line_num';
my $line_field = 'file_line';
my $line_num = 1;
my $record = {};
my $csv = Text::CSV_XS->new({'sep_char' => $separator});
#parse MAF header
my $header = <$fh>;
while ($header =~ /^#/) {
$header = <$fh>;
}
unless ($header =~ m/Hugo_Symbol/) {
die "Header field \"Hugo_Symbol\" not found. Therefore, your file is assumed to not have a header. If this is in error, please contact the authors to fix this line in the code. Otherwise, please include a header in your input file. Thank you.\n";
}
$line_num++;
$csv->parse($header);
my @header_fields = $csv->fields();
if ($ucheader_fields) {
@header_fields = map { uc($_) } @header_fields;
}
if(defined($no_spaces) && $no_spaces) {
@header_fields = map { $_ =~ s/ /_/g } @header_fields;
}
unless(defined($separator)) {
$header =~ s/\t/,/gx;
}
unshift( @header_fields, $line_field); # Add 'extra' fields of the input line
unshift( @header_fields, $line_number_field); # and the line number
# Translate the header names, if a translation is given
for (my $h = 0; $h <= $#header_fields; $h++) {
$header_fields[$h] = (exists($header_translation->{$header_fields[$h]})) ?
$header_translation->{$header_fields[$h]} : $header_fields[$h];
}
my (@key_fields) = split(':',$keyfields);
my %key_fields;
@key_fields{ @key_fields } = ( 0 .. $#key_fields );
# Construct field name to position lookup
my %header_fields;
@header_fields{ @header_fields } = ( 0 .. $#header_fields );
# Construct a subset of the fields--the default is the complete set of fields
my %field_subset;
if (defined($field_subset_array)) {
@field_subset{ @{$field_subset_array} } = @header_fields{ @{$field_subset_array } };
} else {
@field_subset{ @header_fields } = ( 0 .. $#header_fields );
}
# Construct a list of fields that are not key (are values only)
my @value_fields;
foreach my $field (@header_fields[ (values %field_subset ) ]) {
unless (exists($key_fields{$field})) {
push @value_fields, ($field);
}
}
#__PARSE FILE
my $line;
while ($line = <$fh>) {
chomp ($line);
my $temp = $line;
#edit for punctuation characters in annotation section of MAF
$temp =~ s/"//g;
$temp =~ s/'//g;
unless(defined($separator)) {
#maintain original default behavior of handling both tabs and
#commas
$temp =~ s/\t/,/gx;
}
$csv->parse($temp);
my @values = $csv->fields();
if(defined($no_spaces) && $no_spaces) {
@values = map { $_ =~ s/ /_/g } @values;
}
unshift (@values, $line); # Add 'extra' fields of the input line
unshift (@values, $line_num++); # and the line number
my $sub_record;
# Construct the hierarchical key structure
$sub_record = $record;
foreach my $sub_key (@key_fields) {
unless (exists($sub_record->{ $values[ $header_fields{ $sub_key } ] } )) {
$sub_record->{ $values[ $header_fields{ $sub_key } ] } = {};
}
$sub_record = $sub_record->{ $values[ $header_fields{ $sub_key } ] };
}
# Get the hash array slice of the values
@{$sub_record}{ @value_fields } = @values[ @header_fields{@value_fields} ];
}
#__DUMP PARSED RESULTS AND STOP IF JUST CHECKING
Process($record);
#__RETURN DATA STRUCTS
return ($record);
}
sub Process {
my ($input) = @_;
my ($output) = {};
foreach my $hugo (keys (%{$input})) {
foreach my $sample (keys (%{$input->{$hugo}})) {
foreach my $line_num (keys (%{$input->{$hugo}->{$sample}})) {
$output->{$hugo}->{$sample}->{$line_num} =
$input->{$hugo}->{$sample}->{$line_num};
my ($residue1, $res_start, $residue2, $res_stop, $new_residue) =
AA_Check(
$input->{$hugo}->{$sample}->{$line_num}->{AA_CHANGE} , $line_num
);
$output->{$hugo}->{$sample}->{$line_num}->{res_start} = $res_start;
$output->{$hugo}->{$sample}->{$line_num}->{res_stop} = $res_stop;
$output->{$hugo}->{$sample}->{$line_num}->{residue1} = $residue1;
$output->{$hugo}->{$sample}->{$line_num}->{residue2} = $residue2;
$output->{$hugo}->{$sample}->{$line_num}->{new_residue} = $new_residue;
}
}
}
#__DUMP PARSED RESULTS AND STOP IF JUST CHECKING
return ($output);
}
sub AA_Check {
my ($AminoAcidChange_string, $line_num) = @_;
my ($residue1, $res_start, $residue2, $res_stop, $new_residue);
unless (defined($AminoAcidChange_string)) {
return ($residue1, $res_start, $residue2, $res_stop, $new_residue);
}
unless ($AminoAcidChange_string =~ m/^p\./) {
return ($residue1, $res_start, $residue2, $res_stop, $new_residue);
}
#__FORMULATE ERROR STRING JUST IN CASE
my $string = "'$AminoAcidChange_string' is not a valid AminoAcidChange";
$string .= " on line $line_num" if defined $line_num;
#__VALIDATE
$AminoAcidChange_string =~ s/^p\.//x;
if ($AminoAcidChange_string =~ /^ (\D+) (\d+) _ (\D+) (\d+) (.*) $/x ) {
($residue1, $res_start, $residue2, $res_stop, $new_residue) =
($1, $2, $3, $4, $5);
} elsif ($AminoAcidChange_string =~ /^ (\D+) (\d+) (\D+) (.*) $/x ) {
($residue1, $res_start, $residue2, $new_residue) =
($1, $2, $3, $4);
$res_stop = $res_start;
} elsif ($AminoAcidChange_string =~ /^ (\d+) (.*) $/x ) {
($res_start, $residue2, $res_stop, $new_residue) =
($1, $2);
$residue1 = '*';
$res_stop = $res_start;
$new_residue = $residue2;
} elsif ($AminoAcidChange_string =~ /^ (\D+) (\d+) $/x ) {
($residue1, $res_start) =
($1, $2);
$new_residue = ' ';
$residue2 = ' ';
$res_stop = $res_start;
}
if (defined($new_residue)) {
$new_residue =~ s/^ > //x;
}
$new_residue ||= '';
return ($residue1, $res_start, $residue2, $res_stop, $new_residue);
}
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Galaxy.pm 000444 000765 000024 3564 12013522176 25173 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::Galaxy;
use strict;
use warnings;
use Genome;
use File::Basename;
our $VERSION = $Genome::Model::Tools::Music::VERSION;
class Genome::Model::Tools::Music::Galaxy {
is => "Genome::Model::Tools::Music::Play",
has_input => [
output_bundle => {
is => 'Text',
doc => 'Location where Galaxy would like the bundle of Music outputs to be saved',
},
],
has => [
output_dir => {
is_input => 0,
is_output => 0,
is_optional => 1,
default => '',
},
],
};
sub execute {
my $self = shift;
my $output_dir = Genome::Sys->create_temp_directory();
$self->output_dir($output_dir);
$self->bam_list($self->create_new_bam_list);
$self->SUPER::_execute_body(@_);
my $tar_path = $self->output_bundle;
my $cmd = "tar -cf $tar_path -C $output_dir .";
my $rv = Genome::Sys->shellcmd(
cmd => $cmd,
);
return 1;
}
sub create_new_bam_list {
my $self = shift;
my $original_bam_file = $self->bam_list;
my $output_dir = Genome::Sys->create_temp_directory();
my $new_bam_list = "$output_dir/bam_list";
my @bams = Genome::Sys->read_file($original_bam_file);
my $new_bam_fh = Genome::Sys->open_file_for_writing($new_bam_list);
for (@bams) {
chomp;
my ($sample, $first_bam, $second_bam) = split("\t", $_);
my $first_new_path = $self->index_and_link_bam_file($output_dir, $first_bam);
my $second_new_path = $self->index_and_link_bam_file($output_dir, $second_bam);
$new_bam_fh->print("$sample\t$first_new_path\t$second_new_path\n");
}
$new_bam_fh->close();
return $new_bam_list;
}
sub index_and_link_bam_file {
my $self = shift;
my $output_dir = shift;
my $bam = shift;
my $filename = $output_dir . "/" . basename($bam);
Genome::Sys->create_symlink($bam, $filename);
Genome::Sys->shellcmd( cmd => "samtools index $filename");
return $filename;
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/MutationRelation.pm 000444 000765 000024 17416 12013522176 27265 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::MutationRelation;
use warnings;
use strict;
use Carp;
use Genome;
use IO::File;
use POSIX qw( WIFEXITED );
our $VERSION = $Genome::Model::Tools::Music::VERSION;
class Genome::Model::Tools::Music::MutationRelation {
is => 'Command::V2',
has_input => [
bam_list => { is => 'Text', doc => "Tab delimited list of BAM files [sample_name, normal_bam, tumor_bam] (See Description)" },
maf_file => { is => 'Text', doc => "List of mutations in MAF format" },
mutation_matrix_file => { is => 'Text', doc => "Optionally store the sample-vs-gene matrix used during calculations.", is_optional => 1 },
output_file => { is => 'Text', doc => "Results of mutation-relation tool", is_output => 1 },
permutations => { is => 'Number', doc => "Number of permutations used to determine P-values", is_optional => 1, default => 100 },
gene_list => { is => 'Text', doc => "List of genes to test, typically SMGs. If unspecified, all genes in MAF are tested.", is_optional => 1 },
skip_non_coding => { is => 'Boolean', doc => "Skip non-coding mutations from the provided MAF file", is_optional => 1, default => 1 },
skip_silent => { is => 'Boolean', doc => "Skip silent mutations from the provided MAF file", is_optional => 1, default => 1 },
],
doc => "Identify relationships of mutation concurrency or mutual exclusivity in genes across cases.",
};
sub help_synopsis {
return <bam_list;
my $maf_file = $self->maf_file;
my $output_file = $self->output_file;
my $gene_list = $self->gene_list;
my $permutations = $self->permutations;
my @all_sample_names; # names of all the samples, no matter if it's mutated or not
my @genes_to_test; # the genes which will be tested for relationships
my $skip_non_coding = $self->skip_non_coding;
my $skip_silent = $self->skip_silent;
# Parse out the names of the samples which should match the names in the MAF file
my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!\n";
my $line_count = 0;
while( my $line = $sampleFh->getline )
{
$line_count++;
next if ( $line =~ m/^#/ );
chomp( $line );
my ( $sample ) = split( /\t/, $line );
if ($sample) {
push( @all_sample_names, $sample );
} else {
warn("could not parse sample name from line $line_count of --bam_list");
}
}
$sampleFh->close;
# If user-specified, parse out the names of the genes that we are limiting our tests to
if( defined $gene_list )
{
my $geneFh = IO::File->new( $gene_list ) or die "Couldn't open $gene_list. $!\n";
while( my $line = $geneFh->getline )
{
next if ( $line =~ m/^#/ );
chomp( $line );
my ( $gene ) = split( /\t/, $line );
push( @genes_to_test, $gene );
}
$geneFh->close;
}
# Create sample-gene matrix
my $matrix_file = $self->create_sample_gene_matrix($maf_file, \@all_sample_names, \@genes_to_test, $skip_non_coding, $skip_silent);
# Perform mutation-relation test using R
my $R_cmd = "R --slave --args < " . __FILE__ . ".R $matrix_file $permutations $output_file";
print "$R_cmd\n";
WIFEXITED(system $R_cmd) or croak "Couldn't run: $R_cmd ($?)";
return(1);
}
sub create_sample_gene_matrix {
my $self = shift;
my ( $maf_file, $all_sample_names_ref, $genes_to_test_ref, $skip_non_coding, $skip_silent ) = @_;
my @all_sample_names = @{$all_sample_names_ref};
my @genes_to_test = @{$genes_to_test_ref};
# Create hash of mutations from the MAF file
my %mutations;
my %all_genes;
# Parse the MAF file
my $mafFh = IO::File->new( $maf_file ) or die "Couldn't open $maf_file. $!\n";
while( my $line = $mafFh->getline )
{
next if( $line =~ m/^(#|Hugo_Symbol)/ );
chomp $line;
my @cols = split( /\t/, $line );
my ( $gene, $mutation_class, $sample ) = ( $cols[0], $cols[8], $cols[15] );
# If the mutation classification is odd, quit with error
if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ )
{
print STDERR "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\n";
print STDERR "Please use TCGA MAF Specification v2.3.\n";
return undef;
}
# If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
if(( $skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) ||
( $skip_silent && $mutation_class =~ m/^Silent$/ ))
{
print "Skipping $mutation_class mutation in gene $gene.\n";
next;
}
$all_genes{$gene}++;
$mutations{$sample}{$gene}++;
}
$mafFh->close;
# If the user specified a gene list, then check for genes that are not in the MAF
if( scalar( @genes_to_test ) > 0 )
{
for( my $i = 0; $i < scalar( @genes_to_test ); ++$i )
{
unless( defined $all_genes{$genes_to_test[$i]} )
{
print "Skipping ", $genes_to_test[$i], " from specified gene-list since it was not found in the MAF file\n";
splice( @genes_to_test, $i, 1 );
}
}
}
else
{
@genes_to_test = sort keys %all_genes;
}
# Write the input matrix to a file for use by the R code
my $matrix_file;
unless( $matrix_file = $self->mutation_matrix_file ) {
$matrix_file = Genome::Sys->create_temp_file_path();
}
my $matrix_fh = new IO::File $matrix_file,"w";
# Print input matrix file header
my $header = join("\t","Sample",@genes_to_test);
$matrix_fh->print("$header\n");
# Print mutation relation input matrix
for my $sample (sort @all_sample_names) {
$matrix_fh->print($sample);
for my $gene (@genes_to_test) {
if (exists $mutations{$sample}{$gene}) {
$matrix_fh->print("\t1");
}
else {
$matrix_fh->print("\t0");
}
}
$matrix_fh->print("\n");
}
return $matrix_file;
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/MutationRelation.pm.R 000444 000765 000024 6060 12013522176 27436 0 ustar 00nnutter staff 000000 000000 #get command line arguments
mutation_file = as.character(commandArgs()[4]);
permutations = as.numeric(commandArgs()[5]);
output_file = as.character(commandArgs()[6]);
# FUNCTION prepare mutation matrix data for permutation
seq.vjoint=function(x)
{
tt=NULL;
for (i in 1:length(x))
{
tt=c(tt,rep(names(x[i]),x[i]));
}
tt;
}
# END seq.vjoint
# FUNCTION randomly permute the mutation matrix, keeping mutation-number/sample unchanged
matrix.sample.v1=function(x,keep.freq=T)
{
colSums(x)->pool;
rowSums(x)->freq;
freq=freq[order(freq,decreasing=T)];
tt=NULL;
for (i in c(1:length(freq)))
{
pooli=seq.vjoint(pool);
xi=NULL;
for (j in 1:freq[i])
{
if (length(pooli)>1) sample(pooli,1)->temp;
if (length(pooli)==1) pooli->temp;
xi=c(xi,temp);
pooli=pooli[pooli!=temp];
}
tt=rbind(tt,cbind(i,xi));
pool[xi]=pool[xi]-1;
pool=pool[pool>0];
}
tt=table(tt[,1],tt[,2]);
tt[tt>1]=1;
tt;
}
# END matrix.sample.v1
# FUNCTION calculate the probability of having no correlation between any two genes
prob.concur=function(x)
{
x>0->x;
cols=colnames(x);
cols=cols[order(cols)];
n=length(cols);
tt=NULL;
for (i in 1:(n-1))
{
for (j in (i+1):n)
{
vi=cols[i];
vj=cols[j];
sum(x[,vi] & x[,vj])->nand;
sum(x[,vi] != x[,vj])->nexc;
ni=sum(x[,vi]);
nj=sum(x[,vj]);
temp=cbind(vi,vj,ni,nj,nand,nexc);
tt=rbind(tt,temp);
}
}
tt=as.data.frame(tt,stringsAsFactors=F);
tt$nand=as.numeric(tt[,"nand"]);
tt$nexc=as.numeric(tt[,"nexc"]);
tt;
}
# END prob.concur
# FUNCTION performing mutation correlation
mut_cor_permu_test=function(x=NULL,n.permu=100,seed=NULL,mut.file=NULL,out.file=NULL,out.rdata=NULL)
{
if (!is.null(mut.file)) read.table(mut.file,header=T)->x
x=x[,-1];x[x>0]=1
x=x[rowSums(x)>0,colSums(x)>0]
prob.concur(x)->pc0
if (!is.null(seed)) set.seed(seed)
pp=0;en=0
for (i in 1:n.permu)
{
doit=1;
while(doit==1)
{
xi=matrix.sample.v1(x);
if (max(xi)==1) doit=0;
}
prob.concur(xi)->pci;
en=en+pci[,c("nand","nexc")];
as.numeric(pci$nand>=pc0$nand)->pand;
as.numeric(pci$nexc>=pc0$nexc)->pexc;
pp=pp+cbind(pand,pexc);
}
en=en/n.permu;pp=pp/n.permu;
pci[,c("nand","nexc")]=en;
pci=cbind(pci,pp);
tt=merge(pc0,pci,by.x=c("vi","vj"),by.y=c("vi","vj"));
if (!is.null(out.rdata)) save(tt,file=out.rdata,compress=T);
names(tt)= c("Gene1","Gene2","CntGene1","CntGene2","AndCnt","XorCnt","Perm_CntGene1","Perm_CntGene2","Perm_AndCnt","Perm_XorCnt","Pvalue_And","Pvalue_Xor")
if (!is.null(out.file)) write.table(tt,file=out.file,quote=F,sep="\t",row.names=F);
invisible (tt);
}
#END mut_cor_permu_test
#run test using mut_cor_permu_test
mut_cor_permu_test(mut.file=mutation_file,n.permu=permutations,out.file=output_file);
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/PathScan.pm 000444 000765 000024 46620 12013522176 25467 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::PathScan;
use warnings;
use strict;
use Genome;
use Genome::Model::Tools::Music::PathScan::PopulationPathScan;
use IO::File;
our $VERSION = $Genome::Model::Tools::Music::VERSION;
class Genome::Model::Tools::Music::PathScan {
is => 'Command::V2',
has_input => [
gene_covg_dir => { is => 'Text', doc => "Directory containing per-gene coverage files (Created using music bmr calc-covg)" },
bam_list => { is => 'Text', doc => "Tab delimited list of BAM files [sample_name, normal_bam, tumor_bam] (See Description)" },
pathway_file => { is => 'Text', doc => "Tab-delimited file of pathway information (See Description)" },
maf_file => { is => 'Text', doc => "List of mutations using TCGA MAF specifications v2.3" },
output_file => { is => 'Text', doc => "Output file that will list the significant pathways and their p-values", is_output => 1 },
bmr => { is => 'Number', doc => "Background mutation rate in the targeted regions", is_optional => 1, default => 1.0E-6 },
genes_to_ignore => { is => 'Text', doc => "Comma-delimited list of genes whose mutations should be ignored", is_optional => 1 },
min_mut_genes_per_path => { is => 'Number', doc => "Pathways with fewer mutated genes than this, will be ignored", is_optional => 1, default => 1 },
skip_non_coding => { is => 'Boolean', doc => "Skip non-coding mutations from the provided MAF file", is_optional => 1, default => 1 },
skip_silent => { is => 'Boolean', doc => "Skip silent mutations from the provided MAF file", is_optional => 1, default => 1 },
],
doc => "Find signifcantly mutated pathways in a cohort given a list of somatic mutations.",
};
sub help_synopsis {
return <gene_covg_dir;
my $bam_list = $self->bam_list;
my $pathway_file = $self->pathway_file;
my $maf_file = $self->maf_file;
my $output_file = $self->output_file;
my $bgd_mut_rate = $self->bmr;
my $genes_to_ignore = $self->genes_to_ignore;
my $min_mut_genes_per_path = $self->min_mut_genes_per_path;
my $skip_non_coding = $self->skip_non_coding;
my $skip_silent = $self->skip_silent;
# Check on all the input data before starting work
print STDERR "MAF file not found or is empty: $maf_file\n" unless( -s $maf_file );
print STDERR "Directory with gene coverages not found: $covg_dir\n" unless( -e $covg_dir );
print STDERR "List of samples not found or is empty: $bam_list\n" unless( -s $bam_list );
print STDERR "Pathway info file not found or is empty: $pathway_file\n" unless( -s $pathway_file );
return undef unless( -s $maf_file && -e $covg_dir && -s $bam_list && -s $pathway_file );
# Build a hash to quickly lookup the genes whose mutations should be ignored
my %ignored_genes = ();
if( defined $genes_to_ignore )
{
%ignored_genes = map { $_ => 1 } split( /,/, $genes_to_ignore );
}
# PathScan uses a helluva lot of hashes - all your RAM are belong to it
my %sample_gene_hash; # sample => array of genes (based on maf)
my %gene_path_hash; # gene => array of pathways (based on path_file)
my %path_hash; # pathway => all the information about the pathways in the database
my %sample_path_hash; # sample => pathways (based on %sample_gene_hash and %gene_path_hash)
my %path_sample_hits_hash; # path => sample => hits,mutated_genes
my %gene_sample_cov_hash; # gene => sample => coverage
my @all_sample_names; # names of all the samples, no matter if it's mutated or not
my %id_gene_hash; # entrez id => gene (based on first two columns in MAF)
# Parse out the names of the samples which should match the names in the MAF file
my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!\n";
while( my $line = $sampleFh->getline )
{
next if ( $line =~ m/^#/ );
chomp( $line );
my ( $sample ) = split( /\t/, $line );
push( @all_sample_names, $sample );
}
$sampleFh->close;
# Read coverage data calculated by the Music::Bmr::CalcCovg
$covg_dir =~ s/(\/)+$//; # Remove trailing forward slashes if any
read_CoverageFiles( $covg_dir, \@all_sample_names, \%gene_sample_cov_hash );
#build gene => average_coverage hash for population test
my %gene_cov_hash;
foreach my $gene ( keys %gene_sample_cov_hash )
{
my $total_cov = 0;
my $sample_num = scalar( @all_sample_names );
$total_cov += $gene_sample_cov_hash{$gene}{$_} foreach( @all_sample_names );
$gene_cov_hash{$gene} = int( $total_cov / $sample_num );
}
#build %sample_gene_hash based on maf
my $maf_fh = IO::File->new( $maf_file );
while( my $line = $maf_fh->getline )
{
next if( $line =~ m/^(#|Hugo_Symbol)/ );
chomp( $line );
my @cols = split( /\t/, $line );
my ( $gene, $entrez_id, $mutation_class, $tumor_sample ) = ( $cols[0], $cols[1], $cols[8], $cols[15] );
# If the mutation classification is odd, quit with error
if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|Silent|In_Frame_Ins|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ )
{
print STDERR "Unrecognized Variant_Classification $mutation_class in MAF file.\n";
print STDERR "Please use TCGA MAF Specification v2.3.\n";
return undef;
}
# If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
if(( $skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) ||
( $skip_silent && $mutation_class =~ m/^Silent$/ ))
{
print STDERR "Skipping $mutation_class mutation in gene $gene.\n";
next;
}
# Check that the user followed instructions and named each sample correctly
unless( grep( /^$tumor_sample$/, @all_sample_names ))
{
print STDERR "Sample $tumor_sample in MAF file does not match any in $bam_list\n";
return undef;
}
next if( defined $ignored_genes{$gene} ); # Ignore variants in genes that user wants ignored
$id_gene_hash{$entrez_id} = $gene unless( $entrez_id eq '' or $entrez_id == 0 or $entrez_id !~ m/^\d+$/ );
push( @{$sample_gene_hash{$tumor_sample}}, $gene ) unless( grep /^$gene$/, @{$sample_gene_hash{$tumor_sample}} );
}
$maf_fh->close;
my $path_fh = IO::File->new( $pathway_file );
while( my $line = $path_fh->getline )
{
chomp( $line );
next if( $line =~ /^(#|ID)/ ); #Skip headers
my ( $path_id, $name, $class, $gene_line, $diseases, $drugs, $description ) = split( /\t/, $line );
my @genes = split( /\|/, $gene_line ); #Each gene is in the format "EntrezID:GeneSymbol"
$diseases =~ s/\|/, /g; #Change the separators to commas
$drugs =~ s/\|/, /g; #Change the separators to commas
$path_hash{$path_id}{name} = $name unless( $name eq '' );
$path_hash{$path_id}{class} = $class unless( $class eq '' );
$path_hash{$path_id}{diseases} = $diseases unless( $diseases eq '' );
$path_hash{$path_id}{drugs} = $drugs unless( $drugs eq '' );
$path_hash{$path_id}{description} = $description unless( $description eq '' );
@{$path_hash{$path_id}{gene}} = ();
foreach my $gene ( @genes )
{
my ( $entrez_id, $gene_symbol ) = split( /:/, $gene );
unless( $entrez_id eq '' or $entrez_id == 0 or $entrez_id !~ m/^\d+$/ )
{
# Use the gene name from the MAF file if the entrez ID matches
$gene_symbol = $id_gene_hash{$entrez_id} if( defined $id_gene_hash{$entrez_id} );
}
push( @{$gene_path_hash{$gene_symbol}}, $path_id ) unless( grep /^$path_id$/, @{$gene_path_hash{$gene_symbol}} );
unless( grep /^$gene_symbol$/, @{$path_hash{$path_id}{gene}} )
{
push( @{$path_hash{$path_id}{gene}}, $gene_symbol );
}
}
}
$path_fh->close;
#build a sample => pathway hash
foreach my $sample ( keys %sample_gene_hash )
{
foreach my $gene ( @{$sample_gene_hash{$sample}} )
{
if( defined $gene_path_hash{$gene} )
{
foreach my $pathway ( @{$gene_path_hash{$gene}} )
{
push( @{$sample_path_hash{$sample}}, $pathway ) unless( grep /^$pathway$/, @{$sample_path_hash{$sample}} );
}
}
}
}
#build path_sample_hits_hash, for population test
foreach my $sample ( keys %sample_path_hash )
{
foreach my $path ( @{$sample_path_hash{$sample}} )
{
my $hits = 0;
my @mutated_genes = (); #Mutated genes in this sample belonging to this pathway
my @mutated_genes_in_sample = @{$sample_gene_hash{$sample}};
foreach my $gene ( @{$path_hash{$path}{gene}} )
{
if( grep /^$gene$/, @mutated_genes_in_sample ) #if this gene is mutated in this sample (in maf)
{
$hits++;
push( @mutated_genes, $gene );
}
}
if( $hits > 0 )
{
$path_sample_hits_hash{$path}{$sample}{hits} = $hits;
$path_sample_hits_hash{$path}{$sample}{mutated_genes} = \@mutated_genes;
}
}
}
#Calculation of p value
my %data; #For printing
my @pvals;
foreach my $path ( sort keys %path_hash )
{
my @pathway_genes = @{$path_hash{$path}{gene}};
my @gene_sizes = ();
foreach my $gene ( @pathway_genes )
{
if( defined $gene_cov_hash{$gene} )
{
my $avg_cov = int( $gene_cov_hash{$gene} );
push( @gene_sizes, $avg_cov ) if( $avg_cov > 3 );
}
}
#If this pathway doesn't have any gene coverage, skip it
next unless( scalar( @gene_sizes ) > 0 );
my @num_hits_per_sample; #store hits info for each patient
my @mutated_samples = sort keys %{$path_sample_hits_hash{$path}};
foreach my $sample ( @all_sample_names )
{
my $hits = 0;
#if this sample has mutation
if( grep /^$sample$/, @mutated_samples )
{
$hits = $path_sample_hits_hash{$path}{$sample}{hits};
}
push( @num_hits_per_sample, $hits );
}
#If this pathway doesn't have any mutated genes in any samples, skip it
next unless( scalar( @num_hits_per_sample ) > 0 );
my $hits_ref = \@num_hits_per_sample;
########### MCW ADDED
# FIND MAX NUMBER OF HITS IN A SAMPLE
my $max_hits = 0;
foreach my $hits_in_sample ( @num_hits_per_sample )
{
$max_hits = $hits_in_sample if( $hits_in_sample > $max_hits );
}
########### MCW ADDED
my $pop_obj = Genome::Model::Tools::Music::PathScan::PopulationPathScan->new( \@gene_sizes );
if( scalar( @gene_sizes ) >= 3 )
{
########### MCW ADDED
if( $max_hits > 15 )
{
$pop_obj->assign( 5 );
}
else
{
$pop_obj->assign( 3 );
}
########### MCW ADDED
#$pop_obj->assign(3);
}
elsif( @gene_sizes == 2 )
{
$pop_obj->assign( 2 );
}
else
{
$pop_obj->assign( 1 );
}
$pop_obj->preprocess( $bgd_mut_rate, $hits_ref ); #mwendl's new fix
my $pval = $pop_obj->population_pval_approx($hits_ref);
$data{$pval}{$path}{samples} = \@mutated_samples;
$data{$pval}{$path}{hits} = $hits_ref;
push( @pvals, $pval ); # For calculation of FDR
}
# Calculate False Discovery Rates (Benjamini-Hochberg FDR) for the p-values
my $pval_cnt = scalar( @pvals );
my %fdr_hash;
for( my $i = 0; $i < $pval_cnt; $i++ )
{
my $fdr = $pvals[$i] * $pval_cnt / ( $pval_cnt - $i );
$fdr = 1 if $fdr > 1;
$fdr_hash{$pvals[$i]} = $fdr;
}
# Print two output files, one more detailed than the other
my $out_fh = IO::File->new( $output_file, ">" );
my $out_detailed_fh = IO::File->new( "$output_file\_detailed", ">" );
$out_fh->print( "Pathway\tName\tClass\tSamples_Affected\tTotal_Variations\tp-value\tFDR\n" );
foreach my $pval ( sort { $a <=> $b } keys %data )
{
foreach my $path ( sort keys %{$data{$pval}} )
{
# Skip this pathway if it has fewer affected genes than the user wants
my %mutated_gene_hash;
my @samples = @{$data{$pval}{$path}{samples}};
foreach my $sample ( @samples )
{
foreach my $gene ( @{$path_sample_hits_hash{$path}{$sample}{mutated_genes}} )
{
$mutated_gene_hash{$gene}++;
}
}
next unless ( scalar( keys %mutated_gene_hash ) >= $min_mut_genes_per_path );
# Print detailed output to a separate output file
$out_detailed_fh->print( "Pathway: $path\n" );
$out_detailed_fh->print( "Name: ", $path_hash{$path}{name}, "\n" ) if( defined $path_hash{$path}{name} );
$out_detailed_fh->print( "Class: ", $path_hash{$path}{class}, "\n" ) if( defined $path_hash{$path}{class} );
$out_detailed_fh->print( "Diseases: ", $path_hash{$path}{diseases}, "\n" ) if( defined $path_hash{$path}{diseases} );
$out_detailed_fh->print( "Drugs: ", $path_hash{$path}{drugs}, "\n" ) if( defined $path_hash{$path}{drugs} );
$out_detailed_fh->print( "P-value: $pval\n", "FDR: ", $fdr_hash{$pval}, "\n" );
$out_detailed_fh->print( "Description: ", $path_hash{$path}{description}, "\n" );
my @hits = @{$data{$pval}{$path}{hits}};
foreach my $sample ( @samples )
{
my @mutated_genes = @{$path_sample_hits_hash{$path}{$sample}{mutated_genes}};
$out_detailed_fh->print( "$sample:" );
$out_detailed_fh->print( join ",", @mutated_genes );
$out_detailed_fh->print( "\n" );
}
my ( $mutSampleCnt, $totalMutGenes ) = ( 0, 0 );
$out_detailed_fh->print( "Samples with mutations (#hits): " );
for( my $i = 0; $i < scalar( @all_sample_names ); ++$i )
{
if( $hits[$i] > 0 )
{
$out_detailed_fh->print( "$all_sample_names[$i]($hits[$i]) " );
$mutSampleCnt++;
$totalMutGenes += $hits[$i];
}
}
$out_detailed_fh->print( "\n\n" );
# Print tabulated output to the main output file
my ( $path_name, $path_class ) = ( "-", "-" );
$path_name = $path_hash{$path}{name} if( defined $path_hash{$path}{name} );
$path_class = $path_hash{$path}{class} if( defined $path_hash{$path}{class} );
$out_fh->print( "$path\t$path_name\t$path_class\t$mutSampleCnt\t$totalMutGenes\t",
"$pval\t", $fdr_hash{$pval}, "\n" );
}
}
$out_detailed_fh->close;
$out_fh->close;
return 1;
}
# Reads files for each sample which are formatted as tab-separated lines each showing the number of
# bases with sufficient coverage in a gene.
sub read_CoverageFiles
{
my ( $covg_dir, $all_samples_ref, $gene_sample_cov_hash_ref ) = ( $_[0], $_[1], $_[2] );
# Read per-gene covered base counts for each sample
foreach my $sample ( @{$all_samples_ref} )
{
# If the file doesn't exist, quit with error. The Music::Bmr::CalcCovg step is incomplete
unless( -s "$covg_dir/$sample.covg" )
{
print STDERR "Couldn't find $sample.covg in $covg_dir. (music bmr calc-covg possibly incomplete)\n";
exit 1;
}
my $covgFh = IO::File->new( "$covg_dir/$sample.covg" );
while( my $line = $covgFh->getline )
{
next if( $line =~ m/^#/ );
my ( $gene, undef, $covd_bases ) = split( /\t/, $line );
$gene_sample_cov_hash_ref->{$gene}{$sample} = $covd_bases;
}
$covgFh->close;
}
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Pfam.pm 000444 000765 000024 10615 12013522176 24644 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::Pfam;
use warnings;
use strict;
use Genome;
use IO::File;
use IPC::Cmd qw/can_run/;
our $VERSION = $Genome::Model::Tools::Music::VERSION;
class Genome::Model::Tools::Music::Pfam {
is => 'Genome::Model::Tools::Music::Base',
has_input => [
maf_file => {
is => 'Text', is_input => 1,
doc => "List of mutations using TCGA MAF specification v2.3",
},
output_file => {
is => 'Text', is_output => 1,
doc => "Output MAF file with an extra column that reports Pfam annotation domains",
},
reference_build => {
is => 'Text', default => 'Build37',
doc => "Options are 'Build36' or 'Build37'. This parameter ensures appropriate annotation of domains",
valid_values => ['Build36', 'Build37'],
},
],
doc => "Add Pfam annotation to a MAF file",
};
sub help_synopsis {
return <maf_file;
my $reference_build = $self->reference_build;
my $output_file = $self->output_file;
#parse the MAF file and output a new MAF with Pfam info appended
my $maf_fh = IO::File->new( $maf_file ) or die "Couldn't open MAF file!\n";
my $out_fh = IO::File->new( $output_file, ">" )or die "Couldn't open $output_file for writing!\n";
while( my $line = $maf_fh->getline ) {
chomp $line;
my @cols = split( /\t/, $line );
my ( $chr, $start, $stop ) = @cols[4..6];
#print out any headers directly to the output file
if( $line =~ m/^(#|Hugo_Symbol)/ ) {
$out_fh->print( $line );
$out_fh->print(( $line =~ m/^Hugo_Symbol/ ) ? "\tPfam_Annotation_Domains\n" : "\n" );
next;
}
#construct a tabix command
my $db_path = Genome::Sys->dbpath( 'pfam', 'latest' ) or die "Cannot find the pfam db path.";
my $tabix = can_run( 'tabix' ) or die "Cannot find the tabix command. It can be obtained from http://sourceforge.net/projects/samtools/files/tabix";
my $tabix_cmd = "$tabix";
if( $reference_build eq 'Build36' ) {
$tabix_cmd .= " $db_path/pfam.annotation.build36.gz $chr:$start-$stop - |";
}
elsif( $reference_build eq 'Build37' ) {
$tabix_cmd .= " $db_path/pfam.annotation.build37.gz $chr:$start-$stop - |";
}
else {
die "Please specify either 'Build36' or 'Build37' for the --reference-build parameter.";
}
#run tabix command
my %domains;
open( TABIX, $tabix_cmd ) or die "Cannot run 'tabix'. Please check it is in your PATH. It can be installed from the samtools project. $!";
while( my $tabline = ) {
chomp $tabline;
my ( undef, undef, undef, $csv_domains ) = split( /\t/, $tabline );
my @domains = split( /,/, $csv_domains );
for my $domain ( @domains ) {
$domains{$domain}++;
}
}
close(TABIX);
#print output to new file
my $all_domains = join( ",", sort keys %domains );
my $output_line = "$line\t";
unless( $all_domains eq "" ) {
$output_line .= "$all_domains\n";
}
else {
$output_line .= "NA\n";
}
$out_fh->print( $output_line );
}
return( 1 );
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Play.pm 000444 000765 000024 26306 12013522176 24672 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::Play;
use strict;
use warnings;
use Genome;
our $VERSION = $Genome::Model::Tools::Music::VERSION;
class Genome::Model::Tools::Music::Play {
is => 'Command::V2',
has_input => [
bam_list => {
is => 'Text',
doc => 'Tab delimited list of BAM files [sample_name normal_bam tumor_bam]'
},
roi_file => {
is => 'Text',
doc => 'Tab delimited list of ROIs [chr start stop gene_name]'
},
reference_sequence => {
is => 'Text',
doc => 'Path to reference sequence in FASTA format'
},
output_dir => {
is => 'Text', is_output => 1,
doc => 'Directory where output files and subdirectories will be written',
},
maf_file => {
is => 'Text',
doc => 'List of mutations using TCGA MAF specifications v2.3'
},
pathway_file => {
is => 'Text',
doc => 'Tab-delimited file of pathway information',
},
],
has_optional_input => [
numeric_clinical_data_file => {
is => 'Text',
doc => 'Table of samples (y) vs. numeric clinical data category (x)',
},
categorical_clinical_data_file => {
is => 'Text',
doc => 'Table of samples (y) vs. categorical clinical data category (x)',
},
numerical_data_test_method => {
is => 'Text', default => 'cor',
doc => "Either 'cor' for Pearson Correlation or 'wilcox' for the Wilcoxon Rank-Sum Test for numerical clinical data.",
},
glm_model_file => {
is => 'Text',
doc => 'File outlining the type of model, response variable, covariants, etc. for the GLM analysis. (See DESCRIPTION).',
},
glm_clinical_data_file => {
is => 'Text',
doc => 'Clinical traits, mutational profiles, other mixed clinical data (See DESCRIPTION).',
},
use_maf_in_glm => {
is => 'Boolean', default => 0,
doc => 'Set this flag to use the variant matrix created from the MAF file as variant input to GLM analysis.',
},
omimaa_dir => {
is => 'Path',
doc => 'omim amino acid mutation database folder',
default => Genome::Sys->dbpath('omim', 'latest'),
},
cosmic_dir => {
is => 'Path',
doc => 'cosmic amino acid mutation database folder',
default => Genome::Sys->dbpath('cosmic', 'latest'),
},
verbose => {
is => 'Boolean', default => 1,
doc => 'turn on to display larger working output',
},
clinical_correlation_matrix_file => {
is => 'Text',
doc => 'Optionally store the sample-vs-gene matrix used internally during calculations.',
},
mutation_matrix_file => {
is => 'Text',
doc => 'Optionally store the sample-vs-gene matrix used during calculations.',
},
permutations => {
is => 'Number',
doc => 'Number of permutations used to determine P-values',
},
normal_min_depth => {
is => 'Integer',
doc => 'The minimum read depth to consider a Normal BAM base as covered',
},
tumor_min_depth => {
is => 'Integer',
doc => 'The minimum read depth to consider a Tumor BAM base as covered',
},
min_mapq => {
is => 'Integer',
doc => 'The minimum mapping quality of reads to consider towards read depth counts',
},
show_skipped => {
is => 'Boolean', default => 0,
doc => 'Report each skipped mutation, not just how many',
},
genes_to_ignore => {
is => 'Text',
doc => 'Comma-delimited list of genes to ignore for background mutation rates',
},
bmr => {
is => 'Number',
doc => 'Background mutation rate in the targeted regions',
},
max_proximity => {
is => 'Text',
doc => 'Maximum AA distance between 2 mutations',
},
bmr_modifier_file => {
is => 'Text',
doc => 'Tab delimited list of values per gene that modify BMR before testing [gene_name bmr_modifier]',
},
skip_low_mr_genes => {
is => 'Boolean', default => 1,
doc => "Skip testing genes with MRs lower than the background MR"
},
max_fdr => {
is => 'Number', default => 0.20,
doc => 'The maximum allowed false discovery rate for a gene to be considered an SMG',
},
genetic_data_type => {
is => 'Text',
doc => 'Data in matrix file must be either "gene" or "variant" type data',
},
wu_annotation_headers => {
is => 'Boolean',
doc => 'Use this to default to wustl annotation format headers',
},
bmr_groups => {
is => 'Integer', default => 1,
doc => 'Number of clusters of samples with comparable BMRs',
},
separate_truncations => {
is => 'Boolean', default => 0,
doc => 'Group truncational mutations as a separate category',
},
merge_concurrent_muts => {
is => 'Boolean', default => 0,
doc => 'Multiple mutations of a gene in the same sample are treated as 1',
},
skip_non_coding => {
is => 'Boolean', default => 1,
doc => 'Skip non-coding mutations from the provided MAF file',
},
skip_silent => {
is => 'Boolean', default => 1,
doc => 'Skip silent mutations from the provided MAF file',
},
min_mut_genes_per_path => {
is => 'Integer', default => 1,
doc => 'Pathways with fewer mutated genes than this will be ignored',
},
processors => {
is => 'Integer', default => 1,
doc => "Number of processors to use in SMG (requires 'foreach' and 'doMC' R packages)",
},
aa_range => {
is => 'Integer', default => 2,
doc => "Set how close a 'near' match is when searching for amino acid near hits",
},
nuc_range => {
is => 'Integer', default => 5,
doc => "Set how close a 'near' match is when searching for nucleotide position near hits",
},
reference_build => {
is => 'Text', default => 'Build37',
doc => 'Put either "Build36" or "Build37"',
},
show_known_hits => {
is => 'Boolean', default => 1,
doc => "When a finding is novel, show known AA in that gene",
},
],
has_calculated_optional => [
gene_covg_dir => {
calculate_from => ['output_dir'],
calculate => q{ $output_dir . '/gene_covgs'; },
},
gene_mr_file => {
calculate_from => ['output_dir'],
calculate => q{ $output_dir . '/gene_mrs'; },
},
gene_list => {
is => 'Text',
doc => 'List of genes to test in B(1), typically SMGs. (Uses output from running B(1).)',
calculate_from => ['output_dir'],
calculate => q{ $output_dir . '/smg'; },
},
input_clinical_correlation_matrix_file => {
is => 'Text', is_optional => 1,
doc => "Instead of calculating this from the MAF, input the sample-vs-gene matrix used internally during calculations.",
},
],
has_constant => [
cmd_list_file => { #If a workflow version of this tool is written, these parameters might be more useful
is => 'Text', default_value => undef, is_optional => 1,
},
cmd_prefix => {
is => 'Text', default_value => undef, is_optional => 1,
},
],
doc => 'Run the full suite of MuSiC tools sequentially.',
};
sub help_synopsis {
return <(1).";
}
sub _doc_authors {
return " Thomas B. Mooney, M.S.";
}
sub _doc_see_also {
return <(1),
B(1),
B(1),
B(1),
B(1),
B(1),
B(1),
B(1)
EOS
}
sub execute {
my $self = shift;
my @no_dependencies = ('Proximity', 'ClinicalCorrelation', 'CosmicOmim', 'Pfam');
my @bmr = ('Bmr::CalcCovg', 'Bmr::CalcBmr');
my @depend_on_bmr = ('PathScan', 'Smg');
my @depend_on_smg = ('MutationRelation');
for my $command_name (@no_dependencies, @bmr, @depend_on_bmr, @depend_on_smg) {
my $command = $self->_create_command($command_name)
or return;
$self->_run_command($command)
or return;
}
return 1;
}
sub _create_command {
my $self = shift;
my $command_name = shift;
my $command_module = join('::', 'Genome::Model::Tools::Music', $command_name);
my $command_meta = $command_module->__meta__;
my %params;
for my $property ($command_meta->_legacy_properties()) {
next unless exists $property->{is_input} and $property->{is_input};
my $property_name = $property->property_name;
if($property_name eq 'output_file') {
$params{$property_name} = $self->output_dir . '/' . $command_module->command_name_brief;
} elsif(!$property->is_optional or defined $self->$property_name) {
$params{$property_name} = $self->$property_name;
}
}
my $command = $command_module->create(%params);
unless($command) {
$self->error_message('Failed to create command for ' . $command_name);
return;
}
return $command;
}
sub _run_command {
my $self = shift;
my $command = shift;
my $command_name = $command->command_name;
$self->status_message('Running ' . $command_name . '...');
my $rv = eval { $command->execute() };
if($@) {
my $error = $@;
$self->error_message('Error running ' . $command_name . ': ' . $error);
return;
} elsif(not $rv) {
$self->error_message('Command ' . $command_name . ' did not return a true value.');
return;
} else {
$self->status_message('Completed ' . $command_name . '.');
return 1;
}
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Plot.pm 000444 000765 000024 2120 12013522176 24647 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::Plot;
use warnings;
use strict;
use Genome;
our $VERSION = $Genome::Model::Tools::Music::VERSION;
class Genome::Model::Tools::Music::Plot {
is => ['Command::Tree'],
doc => "Generate relevant plots and visualizations for MuSiC."
};
sub _doc_copyright_years {
(2010,2012);
}
sub _doc_license {
my $self = shift;
my (@y) = $self->_doc_copyright_years;
return <(1),
B(1)
EOS
}
sub _doc_manual_body {
return shift->help_detail;
}
sub help_detail {
return "These tools are part of the MuSiC suite.\n";
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Proximity.pm 000444 000765 000024 31500 12013522176 25761 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::Proximity;
use warnings;
use strict;
use IO::File;
our $VERSION = $Genome::Model::Tools::Music::VERSION;
class Genome::Model::Tools::Music::Proximity {
is => 'Command::V2',
has_input => [
maf_file => { is => 'Text', doc => "List of mutations using TCGA MAF specifications v2.3" },
output_dir => { is => 'Text', doc => "Directory where output files will be written" },
max_proximity => { is => 'Text', doc => "Maximum allowed AA distance between 2 mutations", is_optional => 1, default => 7 },
skip_non_coding => { is => 'Boolean', doc => "Skip non-coding mutations from the provided MAF file", is_optional => 1, default => 1 },
skip_silent => { is => 'Boolean', doc => "Skip silent mutations from the provided MAF file", is_optional => 1, default => 1 },
],
has_output => [
output_file => {is => 'Text', doc => "TODO"},
],
doc => "Perform a proximity analysis on a list of mutations."
};
sub help_detail {
return <maf_file;
my $output_dir = $self->output_dir;
my $max_proximity = $self->max_proximity;
my $skip_non_coding = $self->skip_non_coding;
my $skip_silent = $self->skip_silent;
$output_dir =~ s/(\/)+$//; # Remove trailing forward slashes if any
# Check on all the input data before starting work
print STDERR "MAF file not found or is empty: $maf_file\n" unless( -s $maf_file );
print STDERR "Output directory not found: $output_dir\n" unless( -e $output_dir );
return undef unless( -s $maf_file && -e $output_dir );
# Output of this script will be written to this location in the output directory
my $out_file = "$output_dir/proximity_report";
$self->output_file($out_file);
# Parse the header row in the MAF file
my $maf_fh = IO::File->new( $maf_file ) or die "Couldn't open $maf_file. $!";
my $maf_header = $maf_fh->getline;
$maf_header = $maf_fh->getline while( $maf_header =~ /^#/ ); # Skip commented lines
chomp( $maf_header );
unless( $maf_header =~ /^Hugo_Symbol/ )
{
print STDERR "Could not find column headers in $maf_file\n";
return undef;
}
# Check whether the required additional MAF columns were included in the MAF
unless( $maf_header =~ m/c_position/ and $maf_header =~ m/amino_acid_change/ and $maf_header =~ m/transcript_name/ )
{
print STDERR "Could not find required additional columns in $maf_file\n";
return undef;
}
# Find the indexes of all the MAF columns
my $idx = 0;
my %col_idx = map {($_, $idx++)} split( /\t/, $maf_header );
# A hash to store statuses, and a hash to store variants and AA positions
my %status;
my $status = \%status;
my %aa_mutations;
# Load relevant data from MAF into hash
while( my $line = $maf_fh->getline )
{
chomp $line;
my @cols = split( /\t/, $line );
# Fetch data from the generic MAF columns
my ( $gene, $chr, $start, $stop, $mutation_class, $mutation_type, $ref_allele, $var_allele, $var2, $sample ) =
( $cols[0], $cols[4], $cols[5], $cols[6], $cols[8], $cols[9], $cols[10], $cols[11], $cols[12], $cols[15] );
$var_allele = $var2 if ( $var_allele eq $ref_allele ); # Different centers interpret the 2 variant columns differently
# Fetch data from the required additional MAF columns
my ( $c_position, $aa_change, $transcript ) = ( $cols[$col_idx{c_position}], $cols[$col_idx{amino_acid_change}], $cols[$col_idx{transcript_name}] );
# Create a key to uniquely identify each variant
my $variant_key = join( "\t", $gene, $chr, $start, $stop, $ref_allele, $var_allele, $sample );
#check that the mutation class is acceptable
if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ )
{
print STDERR "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\n";
print STDERR "Please use TCGA MAF Specification v2.3.\n";
return undef;
}
# If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
if(( $skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) ||
( $skip_silent && $mutation_class =~ m/^Silent$/ ))
{
print "Skipping $mutation_class mutation in gene $gene.\n";
$status{synonymous_mutations_skipped}++;
next;
}
# Determine amino acid position and load into hash
my @mutated_aa_positions = ();
@mutated_aa_positions = $self->get_amino_acid_pos( $variant_key, $mutation_class, $c_position, $aa_change, $status );
# Record data in hash if mutated aa position found
if( scalar( @mutated_aa_positions ) > 0 )
{
push( @{$aa_mutations{$transcript}{$variant_key}{mut_AAs}}, @mutated_aa_positions );
}
}
$maf_fh->close;
# Evaluate proximity of mutated amino acids for each transcript
for my $transcript ( keys %aa_mutations )
{
# For each variant hitting that transcript
for my $variant ( keys %{$aa_mutations{$transcript}} )
{
# Initialize the search
my @affected_amino_acids = @{$aa_mutations{$transcript}{$variant}{mut_AAs}};
my $mutations_within_proximity = 0; # Variable for summing # of mutations within proximity
my $min_proximity = $max_proximity + 1; # Current minimum proximity
# For each OTHER variant hitting the transcript
for my $other_variant ( keys %{$aa_mutations{$transcript}} )
{
# Ignore the current mutation
next if $variant eq $other_variant;
# Get affected amino acids from OTHER variant
my @other_affected_amino_acids = @{$aa_mutations{$transcript}{$other_variant}{mut_AAs}};
# Compare distances between amino acids
my $found_close_one = 0;
for my $other_variant_aa ( @other_affected_amino_acids )
{
for my $variant_aa ( @affected_amino_acids )
{
my $distance = abs($other_variant_aa - $variant_aa);
# If distance is within range
if( $distance <= $max_proximity )
{
$found_close_one++;
$min_proximity = $distance if $distance < $min_proximity;
}
}
}
# Note that this variant is within proximity if applicable
$mutations_within_proximity++ if $found_close_one;
}
# Now, save results in hash if there are any
if ($mutations_within_proximity)
{
$aa_mutations{$transcript}{$variant}{muts_within_range} = $mutations_within_proximity;
$aa_mutations{$transcript}{$variant}{min_proximity} = $min_proximity;
}
}
}
# Print results
my $out_fh = IO::File->new( $out_file, ">" ) or die "Couldn't open $out_file. $!";
$out_fh->print( "Mutations_Within_Proximity\tNearest_Mutation\tGene\tTranscript\tAffected_Amino_Acid(s)\tChr\tStart\tStop\tRef_Allele\tVar_Allele\tSample\n" );
for my $transcript ( keys %aa_mutations )
{
for my $variant ( keys %{$aa_mutations{$transcript}} )
{
if( exists $aa_mutations{$transcript}{$variant}{muts_within_range} )
{
my ( $gene, $chr, $start, $stop, $ref_allele, $var_allele, $sample ) = split( /\t/, $variant );
my $affected_amino_acids = join( ",", sort @{$aa_mutations{$transcript}{$variant}{mut_AAs}} );
my $line = join( "\t", $aa_mutations{$transcript}{$variant}{muts_within_range},
$aa_mutations{$transcript}{$variant}{min_proximity}, $gene, $transcript,
$affected_amino_acids, $chr, $start, $stop, $ref_allele, $var_allele, $sample );
$out_fh->print( "$line\n" );
}
}
}
$out_fh->close();
return 1;
}
################################################################################
=head2 get_amino_acid_pos
This subroutine deducts the amino acid position within the transcript using the c_position and amino_acid_position columns in the MAF.
=cut
################################################################################
sub get_amino_acid_pos {
# Parse arguments
my $self = shift;
my ( $variant_key, $mut_class, $c_position, $aa_change, $status ) = @_;
# Initialize variables
my $tx_start = my $tx_stop = 0;
my $aa_position_start = my $aa_position_stop = 0;
my $inferred_aa_start = my $inferred_aa_stop = 0;
my $aa_pos = my $inferred_aa_pos = 0;
# Amino acid position determination
if( $aa_change && $aa_change ne "NULL" && substr( $aa_change, 0, 1 ) ne "e" )
{
$aa_pos = $aa_change;
$aa_pos =~ s/[^0-9]//g;
}
# Parse out c_position if applicable ##
if( $c_position && $c_position ne "NULL" )
{
# If multiple results, parse both ##
if( $c_position =~ '_' && !( $mut_class =~ 'splice' ))
{
($tx_start, $tx_stop) = split( /\_/, $c_position );
$tx_start =~ s/[^0-9]//g;
$tx_stop =~ s/[^0-9]//g;
if( $tx_stop < $tx_start )
{
$inferred_aa_start = $tx_stop / 3;
$inferred_aa_start = sprintf( "%d", $inferred_aa_start ) + 1 if( $tx_stop % 3 ) ;
$inferred_aa_stop = $tx_start / 3;
$inferred_aa_stop = sprintf( "%d", $inferred_aa_stop ) + 1 if( $tx_start % 3 );
}
else
{
$inferred_aa_start = $tx_start / 3;
$inferred_aa_start = sprintf( "%d", $inferred_aa_start ) + 1 if( $tx_start % 3 );
$inferred_aa_stop = $tx_stop / 3;
$inferred_aa_stop = sprintf( "%d", $inferred_aa_stop ) + 1 if( $tx_stop % 3 );
}
}
else
{
my ( $tx_pos ) = split( /[\+\-\_]/, $c_position );
$tx_pos =~ s/[^0-9]//g;
$tx_start = $tx_stop = $tx_pos;
if($tx_pos)
{
$inferred_aa_pos = $tx_pos / 3;
$inferred_aa_pos = sprintf( "%d", $inferred_aa_pos ) + 1 if( $tx_pos % 3 );
$inferred_aa_start = $inferred_aa_stop = $inferred_aa_pos;
}
}
}
# If we inferred aa start stop, proceed with it ##
if( $inferred_aa_start && $inferred_aa_stop )
{
$aa_position_start = $inferred_aa_start;
$aa_position_stop = $inferred_aa_stop;
$status->{aa_position_inferred}++;
}
# Otherwise if we inferred aa position ##
elsif( $aa_pos )
{
$aa_position_start = $aa_pos;
$aa_position_stop = $aa_pos;
$status->{c_position_not_available}++;
}
# Otherwise we were unable to infer the info ##
else
{
$status->{aa_position_not_found}++;
$self->status_message( "Amino acid position not found for variant: $variant_key" );
return;
}
# Proceed if we have aa_position_start and stop ##
my %mutated_aa_positions;
if( $aa_position_start && $aa_position_stop )
{
for( my $this_aa_pos = $aa_position_start; $this_aa_pos <= $aa_position_stop; $this_aa_pos++ )
{
$mutated_aa_positions{$this_aa_pos}++;
}
}
my @mutated_aa_positions = keys %mutated_aa_positions;
return @mutated_aa_positions;
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Smg.pm 000444 000765 000024 24223 12013522176 24507 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::Smg;
use warnings;
use strict;
use Genome;
use IO::File;
use Carp;
use POSIX qw( WIFEXITED );
our $VERSION = $Genome::Model::Tools::Music::VERSION;
class Genome::Model::Tools::Music::Smg {
is => 'Command::V2',
has_input => [
gene_mr_file => { is => 'Text', doc => "File with per-gene mutation rates (Created using \"music bmr calc-bmr\")" },
output_file => { is => 'Text', is_output => 1, doc => "Output file that will list significantly mutated genes and their p-values" },
],
has_optional_input => [
max_fdr => { is => 'Number', default => 0.20, doc => "The maximum allowed false discovery rate for a gene to be considered an SMG" },
skip_low_mr_genes => { is => 'Boolean', default => 1, doc => "Skip testing genes with MRs lower than the background MR" },
bmr_modifier_file => { is => 'Text', doc => "Tab delimited multipliers per gene that modify BMR before testing [gene_name bmr_modifier]" },
processors => { is => 'Integer', default => 1, doc => "Number of processors to use (requires 'foreach' and 'doMC' R packages)" },
],
doc => "Identify significantly mutated genes."
};
sub help_synopsis {
return <gene_mr_file;
my $output_file = $self->output_file;
my $output_file_detailed = $output_file . "_detailed";
my $max_fdr = $self->max_fdr;
my $skip_low_mr_genes = $self->skip_low_mr_genes;
my $bmr_modifier_file = $self->bmr_modifier_file;
my $processors = $self->processors;
# Check on all the input data before starting work
print STDERR "Gene mutation rate file not found or is empty: $gene_mr_file\n" unless( -s $gene_mr_file );
print STDERR "BMR modifier file not found or is empty: $bmr_modifier_file\n" unless( !defined $bmr_modifier_file || -s $bmr_modifier_file );
return undef unless( -s $gene_mr_file && ( !defined $bmr_modifier_file || -s $bmr_modifier_file ));
# If BMR modifiers were provided, then load them, and create another gene_mr_file with modified BMRs
if( defined $bmr_modifier_file ) {
my $inBmrModFh = IO::File->new( $bmr_modifier_file ) or die "Couldn't open $bmr_modifier_file. $!\n";
my %bmr_modifier = ();
while( my $line = $inBmrModFh->getline ) {
next if( $line =~ m/^#/ );
chomp( $line );
my ( $gene, $modifier ) = split( /\t/, $line );
( $modifier > 0 ) or die "$modifier is an invalid bmr-modifier. Please fix values in $bmr_modifier_file.\n";
$bmr_modifier{$gene} = $modifier;
}
$inBmrModFh->close;
my $new_gene_mr_file = Genome::Sys->create_temp_file_path;
( $new_gene_mr_file ) or die "Couldn't create a temp file. $!";
my $inMrFh = IO::File->new( $gene_mr_file ) or die "Couldn't open $gene_mr_file. $!\n";
my $outMrFh = IO::File->new( $new_gene_mr_file, ">" ) or die "Couldn't open $new_gene_mr_file. $!\n";
while( my $line = $inMrFh->getline ) {
if( $line =~ m/^#/ ) {
$outMrFh->print( $line );
next;
}
chomp( $line );
my ( $gene, $type, $covd_bps, $mut_cnt, $bmr ) = split( /\t/, $line );
$bmr = $bmr * $bmr_modifier{$gene} if( defined $bmr_modifier{$gene} );
$outMrFh->print( "$gene\t$type\t$covd_bps\t$mut_cnt\t$bmr\n" );
}
$outMrFh->close;
$inMrFh->close;
$gene_mr_file = $new_gene_mr_file;
}
# Collect per-gene mutation rates for reporting in results later
my ( %gene_muts, %gene_bps, %mut_classes_hash );
my $inMrFh = IO::File->new( $gene_mr_file ) or die "Couldn't open $gene_mr_file. $!\n";
while( my $line = $inMrFh->getline ) {
next if( $line =~ m/^#/ );
my ( $gene, $type, $covd_bps, $mut_cnt, undef ) = split( /\t/, $line );
# Warn user about cases where there could be fewer covered bps than mutations detected
( $mut_cnt <= $covd_bps ) or warn "More $type seen in $gene than there are bps with sufficient coverage!\n";
if( $type eq "Overall" or $type eq "Indels" or $type eq "Truncations" ) {
$gene_muts{$gene}{$type} = $mut_cnt;
$gene_bps{$gene} = $covd_bps;
$mut_classes_hash{$type} = 1 unless( $type eq "Overall" );
}
elsif( $type =~ m/(Transitions|Transversions)$/ ) {
$gene_muts{$gene}{SNVs} += $mut_cnt;
$mut_classes_hash{SNVs} = 1;
}
else {
die "Unrecognized mutation class in gene-mr-file. $!\n";
}
}
$inMrFh->close;
my @mut_classes = sort keys %mut_classes_hash;
# Create a temporary intermediate file to hold the p-values
my $pval_file = Genome::Sys->create_temp_file_path;
( $pval_file ) or die "Couldn't create a temp file. $!";
# Call R for Fisher combined test, Likelihood ratio test, and convolution test on each gene
my $smg_cmd = "R --slave --args < " . __FILE__ . ".R $gene_mr_file $pval_file smg_test $processors $skip_low_mr_genes";
WIFEXITED( system $smg_cmd ) or croak "Couldn't run: $smg_cmd ($?)";
# Call R for calculating FDR on the p-values calculated in the SMG test
my $fdr_cmd = "R --slave --args < " . __FILE__ . ".R $pval_file $output_file_detailed calc_fdr $processors $skip_low_mr_genes";
WIFEXITED( system $fdr_cmd ) or croak "Couldn't run: $fdr_cmd ($?)";
# Parse the R output to identify the SMGs (significant by at least 2 of 3 tests)
my $smgFh = IO::File->new( $output_file_detailed ) or die "Couldn't open $output_file_detailed. $!\n";
my ( @newLines, @smgLines );
my $header = "#Gene\t" . join( "\t", @mut_classes );
$header .= "\tTot Muts\tCovd Bps\tMuts pMbp\tP-value FCPT\tP-value LRT\tP-value CT\tFDR FCPT\tFDR LRT\tFDR CT\n";
while( my $line = $smgFh->getline ) {
chomp( $line );
if( $line =~ m/^Gene\tp.fisher\tp.lr\tp.convol\tfdr.fisher\tfdr.lr\tfdr.convol$/ ) {
push( @newLines, $header );
push( @smgLines, $header );
}
else {
my ( $gene, @pq_vals ) = split( /\t/, $line );
my ( $p_fcpt, $p_lrt, $p_ct, $q_fcpt, $q_lrt, $q_ct ) = @pq_vals;
my @mut_cnts;
foreach( @mut_classes ) {
# If a mutation count is a fraction, round down the digits after the decimal point
push( @mut_cnts, (( $gene_muts{$gene}{$_} =~ m/\./ ) ? sprintf( "%.2f", $gene_muts{$gene}{$_} ) : $gene_muts{$gene}{$_} ));
}
my $mut_per_mbp = ( $gene_bps{$gene} ? sprintf( "%.2f", ( $gene_muts{$gene}{Overall} / $gene_bps{$gene} * 1000000 )) : 0 );
push( @newLines, join( "\t", $gene, @mut_cnts, $gene_muts{$gene}{Overall}, $gene_bps{$gene}, $mut_per_mbp, @pq_vals ) . "\n" );
# If the FDR of at least two of these tests is less than the maximum allowed, we consider it an SMG
if(( $q_fcpt <= $max_fdr && $q_lrt <= $max_fdr ) || ( $q_fcpt <= $max_fdr && $q_ct <= $max_fdr ) ||
( $q_lrt <= $max_fdr && $q_ct <= $max_fdr )) {
push( @smgLines, join( "\t", $gene, @mut_cnts, $gene_muts{$gene}{Overall}, $gene_bps{$gene}, $mut_per_mbp, @pq_vals ) . "\n" );
}
}
}
$smgFh->close;
# Add per-gene SNV and Indel counts to the detailed R output, and make the header friendlier
my $outDetFh = IO::File->new( $output_file_detailed, ">" ) or die "Couldn't open $output_file_detailed. $!\n";
$outDetFh->print( @newLines );
$outDetFh->close;
# Do the same for only the genes that we consider SMGs
my $outFh = IO::File->new( $output_file, ">" ) or die "Couldn't open $output_file. $!\n";
$outFh->print( @smgLines );
$outFh->close;
return 1;
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Smg.pm.R 000444 000765 000024 15614 12013522176 24713 0 ustar 00nnutter staff 000000 000000 #############################################
### Functions for testing significance of ###
### per-gene categorized mutation rates ###
#############################################
# Fetch command line arguments
args = commandArgs();
input_file = as.character(args[4]);
output_file = as.character(args[5]);
run_type = as.character(args[6]);
processors = as.numeric(args[7]);
skip_low_mr_genes = as.numeric(args[8]);
# See if we have the necessary packages installed to run in parallel
is.installed <- function( mypkg ) is.element( mypkg, installed.packages()[,1] );
parallel = FALSE;
if( processors > 1 & is.installed( 'doMC' ) & is.installed( 'foreach' )) {
parallel = TRUE;
}
gethist <- function( xmax, n, p, ptype = "positive_log" ) {
dbinom( 0:xmax, n, p ) -> ps;
ps = ps[ps > 0];
lastp = 1 - sum( ps );
if( lastp > 0 ) ps = c( ps, lastp );
if( ptype == "positive_log" ) ps = -log( ps );
return( ps );
}
binit <- function( x, hmax, bin, dropbin = T ) {
bs = as.integer( x / bin );
bs[bs > hmax/bin] = hmax / bin;
bs[is.na( bs )] = hmax / bin;
tapply( exp(-x), as.factor( bs ), sum ) -> bs;
bs = bs[bs>0];
bs = -log( bs );
if( dropbin ) bs = as.numeric( bs );
return( bs );
}
convolute_b <- function( a, b ) {
tt = NULL;
for( j in b ) { tt = c( tt, ( a + j )); }
return( tt );
}
mut_class_test <- function( x, xmax = 100, hmax = 25, bin = 0.001 ) {
x = as.data.frame( x );
colnames( x ) = c( "Class", "n", "x", "e" );
x$p = NA; x$lh0 = NA; x$lh1 = NA;
tot_muts = x[( x$Class == "Overall" ),]$x;
tot_bps = x[( x$Class == "Overall" ),]$n;
overall_bmr = x[( x$Class == "Overall" ),]$e;
# Remove the row containing overall MR and BMR because we don't want it to be a tested category
x = x[( x$Class != "Overall" ),];
# If user wants to skip testing genes with low MRs, measure the relevant MRs of this gene
gene_mr = 0; indel_mr = 0; indel_bmr = 0; trunc_mr = 0; trunc_bmr = 0;
if( skip_low_mr_genes == 1 ) {
if( tot_bps > 0 ) { gene_mr = tot_muts / tot_bps; }
if( x[( x$Class == "Indels" ),]$n > 0 ) { indel_mr = x[( x$Class == "Indels" ),]$x / x[( x$Class == "Indels" ),]$n; }
indel_bmr = x[( x$Class == "Indels" ),]$e;
if( nrow( x[( x$Class == "Truncations" ),] ) > 0 ) {
if( x[( x$Class == "Truncations" ),]$n > 0 ) { trunc_mr = x[( x$Class == "Truncations" ),]$x / x[( x$Class == "Truncations" ),]$n; }
trunc_bmr = x[( x$Class == "Truncations" ),]$e;
}
}
# Set pvals of 1 for genes with zero mutations, zero covered bps, or zero overall BMR
if( tot_muts <= 0 | tot_bps <= 0 | overall_bmr <= 0 ) {
p.fisher = 1; p.lr = 1; p.convol = 1; qc = 1;
}
# If user wants to skip testing genes with low MRs, give them pvals of 1
else if( skip_low_mr_genes == 1 & gene_mr < overall_bmr & indel_mr <= indel_bmr & trunc_mr <= trunc_bmr ) {
p.fisher = 1; p.lr = 1; p.convol = 1; qc = 1;
}
else {
# Skip testing mutation categories that have zero BMR, or if this gene has #muts >= #covd bps
x = x[( x$n > 0 & x$n > x$x & x$e > 0 ),];
rounded_mut_cnts = round(x$x);
for( i in 1:nrow(x) ) {
x$p[i] = binom.test( rounded_mut_cnts[i], x$n[i], x$e[i], alternative = "greater" )$p.value;
x$lh0[i] = dbinom( rounded_mut_cnts[i], x$n[i], x$e[i], log = T );
x$lh1[i] = dbinom( rounded_mut_cnts[i], x$n[i], x$x[i] / x$n[i], log = T );
ni = x$n[i]; ei = x$e[i];
gethist( xmax, ni, ei, ptype = "positive_log" ) -> bi;
binit( bi, hmax, bin ) -> bi;
if( i == 1 ) { hist0 = bi; }
if( i > 1 & i < nrow(x) ) { hist0 = convolute_b( hist0, bi ); binit( hist0, hmax, bin ) -> hist0; }
if( i == nrow(x)) { hist0 = convolute_b( hist0, bi ); }
}
# Fisher combined p-value
q = ( -2 ) * sum( log( x$p ));
df = 2 * length( x$p );
p.fisher = 1 - pchisq( q, df );
# Likelihood ratio test
q = 2 * ( sum( x$lh1 ) - sum( x$lh0 ));
df = sum( x$lh1 != 0 );
if( df > 0 ) p.lr = 1 - pchisq( q, df );
if( df == 0 ) p.lr = 1;
# Convolution test
bx = -sum( x[,"lh0"] );
p.convol = sum( exp( -hist0[hist0>=bx] ));
qc = sum( exp( -hist0 ));
}
# Return results
rst = list( x = cbind( x, tot_muts, p.fisher, p.lr, p.convol, qc ));
return( rst );
}
dotest <- function( idx, mut, zgenes ) {
step = round( length( zgenes ) / processors );
start = step * ( idx - 1 ) + 1;
stop = step * idx;
if( idx == processors ) { stop = length( zgenes ); }
tt = NULL;
for( Gene in zgenes[start:stop] ) {
mutgi = mut[mut$Gene==Gene,];
mut_class_test( mutgi[,2:5], hmax = 25, bin = 0.001 ) -> z;
tt = rbind( tt, cbind( Gene, unique( z$x[,(9:11)] )));
}
return( tt );
}
combineresults <- function( a, b ) {
return( rbind( a, b ));
}
smg_test <- function( gene_mr_file, pval_file ) {
read.delim( gene_mr_file ) -> mut;
colnames( mut ) = c( "Gene", "Class", "Bases", "Mutations", "BMR" );
mut$BMR = as.numeric( as.character( mut$BMR ));
tt = NULL;
# Run in parallel if we have the needed packages, or fall back to the old way
if( parallel ) {
library( 'doMC' );
library( 'foreach' );
registerDoMC();
cat( "Parallel backend installed - splitting across", processors, "cores\n" );
options( cores = processors );
mcoptions <- list( preschedule = TRUE );
zgenes = unique( as.character( mut$Gene ));
tt = foreach( idx = 1:processors, .combine="combineresults", .options.multicore = mcoptions ) %dopar% {
dotest( idx, mut, zgenes );
}
write.table( tt, file = pval_file, quote = FALSE, row.names = F, sep = "\t" );
}
else {
for( Gene in unique( as.character( mut$Gene ))) {
mutgi = mut[mut$Gene==Gene,];
mut_class_test( mutgi[,2:5], hmax = 25, bin = 0.001 ) -> z;
tt = rbind( tt, cbind( Gene, unique( z$x[,(9:11)] )));
}
write.table( tt, file = pval_file, quote = FALSE, row.names = F, sep = "\t" );
}
}
smg_fdr <- function( pval_file, fdr_file ) {
read.table( pval_file, header = T, sep = "\t" ) -> x;
#Calculate FDR measure and write FDR output
p.adjust( x[,2], method="BH" ) -> fdr.fisher;
p.adjust( x[,3], method="BH" ) -> fdr.lr;
p.adjust( x[,4], method="BH" ) -> fdr.convol;
x = cbind( x, fdr.fisher, fdr.lr, fdr.convol );
#Rank SMGs starting with lowest convolution test FDR, and then by Likelihood Ratio FDR
x = x[order( fdr.convol, fdr.lr ),];
write.table( x, file = fdr_file, quote = FALSE, row.names = F, sep = "\t" );
}
# Figure out which function needs to be invoked and call it
if( run_type == "smg_test" ) { smg_test( input_file, output_file ); }
if( run_type == "calc_fdr" ) { smg_fdr( input_file, output_file ); }
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Survival.pm 000444 000765 000024 45444 12013522176 25604 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::Survival;
use warnings;
use strict;
use Carp;
use Genome;
use IO::File;
use POSIX qw( WIFEXITED );
our $VERSION = $Genome::Model::Tools::Music::VERSION;
class Genome::Model::Tools::Music::Survival {
is => 'Genome::Model::Tools::Music::Base',
has_input => [
bam_list => {
is => 'Text',
doc => "List of sample names to be included in the analysis. (See Description)",
},
maf_file => {
is => 'Text', is_optional => 1,
doc => "List of mutations in MAF format",
},
output_dir => {
is => 'Text', is_output => 1,
doc => "Directory where output files will be written",
},
genetic_data_type => {
is => 'Text', is_optional => 1, default => "gene",
doc => "Correlate clinical data to \"gene\" or \"variant\" level data",
},
numeric_clinical_data_file => {
is => 'Text', is_optional => 1,
doc => "Table of samples (y) vs. numeric clinical data category (x)",
},
categorical_clinical_data_file => {
is => 'Text', is_optional => 1,
doc => "Table of samples (y) vs. categorical clinical data category (x)",
},
glm_clinical_data_file => {
is => 'Text', is_optional => 1,
doc => "Clinical traits, mutational profiles, other mixed clinical data (See DESCRIPTION).",
},
phenotypes_to_include => {
is => 'Text', is_optional => 1,
doc => "Include only these genes and/or phenotypes in the anlaysis. (COMMA-DELIMITED)",
},
legend_placement => {
is => 'Text', is_optional => 1, default => 'bottomleft',
doc => "Choose one of 'bottomleft', 'topleft', 'topright', or 'bottomright'.",
},
skip_non_coding => {
is => 'Boolean', is_optional => 1, default => 1,
doc => "Skip non-coding mutations from the provided MAF file",
},
skip_silent => {
is => 'Boolean', is_optional => 1, default => 1,
doc => "Skip silent mutations from the provided MAF file",
},
],
doc => "Create survival plots and P-values for clinical and mutational phenotypes.",
};
sub help_synopsis {
return <bam_list;
my $genetic_data_type = $self->genetic_data_type;
my $legend_placement = $self->legend_placement;
# handle phenotype inclusions
my @phenotypes_to_include;
my @clinical_phenotypes_to_include;
my @mutated_genes_to_include;
if ($self->phenotypes_to_include) { @phenotypes_to_include = split /,/,$self->phenotypes_to_include; }
# check genetic data type
unless ($genetic_data_type =~ /^gene|variant$/i) {
$self->error_message("Please enter either \"gene\" or \"variant\" for the --genetic-data-type parameter.");
return;
}
# load clinical data and analysis types
my %clinical_data;
if ($self->numeric_clinical_data_file) {
$clinical_data{'numeric'} = $self->numeric_clinical_data_file;
}
if ($self->categorical_clinical_data_file) {
$clinical_data{'categ'} = $self->categorical_clinical_data_file;
}
if ($self->glm_clinical_data_file) {
$clinical_data{'glm'} = $self->glm_clinical_data_file;
}
# create array of all sample names possibly included from clinical data and MAF
my @all_sample_names; # names of all the samples, no matter if they are mutated or not
my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!\n";
while( my $line = $sampleFh->getline )
{
next if ( $line =~ m/^#/ );
chomp( $line );
my ( $sample ) = split( /\t/, $line );
push( @all_sample_names, $sample );
}
$sampleFh->close;
# loop through clinical data files and assemble survival data hash (vital_status and days_to_last_followup required);
my %survival_data;
my $vital_status_flag = 0;
my $days_to_last_follow_flag = 0;
for my $clin_file (keys %clinical_data) {
#check filehandle
my $clin_fh = new IO::File $clinical_data{$clin_file},"r";
unless ($clin_fh) {
$self->error_message("Failed to open $clinical_data{$clin_file} for reading: $!");
return;
}
#initiate variables to hold column info
my %phenotypes_to_print;
my $vital_status_col = 0;
my $days_to_last_follow_col = 0;
#parse header and record column locations for needed data
my $header = $clin_fh->getline;
my @header_fields = split /\t/,$header;
for (my $i = 1; $i <= $#header_fields; $i++) { #sample ID should be in first column of file
my $field = $header_fields[$i];
if ($field =~ /vital_status|vitalstatus/i) { $vital_status_col = $i; $vital_status_flag++; }
if ($field =~ /days_to_last_(follow_up|followup)|daystolastfollowup/i) { $days_to_last_follow_col = $i; $days_to_last_follow_flag++; }
if (scalar grep { /^$field$/i } @phenotypes_to_include) { $phenotypes_to_print{$field} = $i; }
}
#read through clinical data file and store needed data in a hash
while (my $line = $clin_fh->getline) {
chomp $line;
my @fields = split /\t/,$line;
my $sample = $fields[0];
unless (scalar grep { m/^$sample$/ } @all_sample_names) {
$self->status_message("Skipping sample $sample. (Sample is not in --bam-list).");
next;
}
if ($vital_status_col) {
my $vital_status;
if ($fields[$vital_status_col] =~ /^(0|living)$/i) { $vital_status = 0; }
elsif ($fields[$vital_status_col] =~ /^(1|deceased)$/i) { $vital_status = 1; }
else { $vital_status = "NA"; }
$survival_data{$sample}{'vital_status'} = $vital_status;
}
if ($days_to_last_follow_col) { $survival_data{$sample}{'days'} = $fields[$days_to_last_follow_col]; }
for my $pheno (keys %phenotypes_to_print) { $survival_data{$sample}{$pheno} = $fields[$phenotypes_to_print{$pheno}]; }
}
$clin_fh->close;
# record phenotypes included from clinical data
push @clinical_phenotypes_to_include, keys %phenotypes_to_print;
}
# check for necessary header fields
unless ($vital_status_flag) {
$self->error_message('Clinical data does not seem to contain a column labeled "vital_status".');
return;
}
unless ($days_to_last_follow_flag) {
$self->error_message('Clnical data does not seem to contain a column labeled "days_to_last_followup".');
return;
}
# create temporary files for R command
my $survival_data_file = Genome::Sys->create_temp_file_path();
my $mutation_matrix = Genome::Sys->create_temp_file_path();
# print survival data (temp file)
my $surv_fh = new IO::File $survival_data_file,"w" or die "Couldn't open survival data filehandle.";
print $surv_fh join("\t","Sample","Days_To_Last_Followup","Vital_Status");
if (@clinical_phenotypes_to_include) { print $surv_fh "\t" . join("\t",@clinical_phenotypes_to_include); }
print $surv_fh "\n";
for my $sample (keys %survival_data) {
unless (exists $survival_data{$sample}{'days'}) { $survival_data{$sample}{'days'} = "NA"; }
unless (exists $survival_data{$sample}{'vital_status'}) { $survival_data{$sample}{'vital_status'} = "NA"; }
print $surv_fh join("\t",$sample,$survival_data{$sample}{'days'},$survival_data{$sample}{'vital_status'});
for my $pheno (@clinical_phenotypes_to_include) {
unless (exists $survival_data{$sample}{$pheno}) { $survival_data{$sample}{$pheno} = "NA"; }
print $surv_fh "\t" . $survival_data{$sample}{$pheno};
}
print $surv_fh "\n";
}
$surv_fh->close;
# find if any of the "phenotypes_to_include" are genes, and if so, limit the MAF mutation matrix to those genes
my %clinical_pheno_to_include;
@clinical_pheno_to_include{@clinical_phenotypes_to_include} = ();
for my $item (@phenotypes_to_include) {
push @mutated_genes_to_include,$item unless exists $clinical_pheno_to_include{$item};
}
my $mutated_genes_to_include = \@mutated_genes_to_include;
# create mutation matrix file
if( $genetic_data_type =~ /^gene$/i ) {
$self->create_sample_gene_matrix_gene( $mutation_matrix, $mutated_genes_to_include, @all_sample_names );
}
elsif( $genetic_data_type =~ /^variant$/i ) {
$self->create_sample_gene_matrix_variant( $mutation_matrix, $mutated_genes_to_include, @all_sample_names );
}
else {
$self->error_message( "Please enter either \"gene\" or \"variant\" for the --genetic-data-type parameter." );
return;
}
# check and prepare output directory
my $output_dir = $self->output_dir . "/";
unless (-e $output_dir) {
$self->status_message("Creating output directory: $output_dir...");
unless(mkdir $output_dir) {
$self->error_message("Failed to create output directory: $!");
return;
}
}
# set up R command
my $R_cmd = "R --slave --args < " . __FILE__ . ".R " . join( " ", $survival_data_file, $mutation_matrix, $legend_placement, $output_dir );
print "R_cmd:\n$R_cmd\n";
#run R command
WIFEXITED( system $R_cmd ) or croak "Couldn't run: $R_cmd ($?)";
return(1);
}
sub create_sample_gene_matrix_gene {
my ( $self, $mutation_matrix, $mutated_genes_to_include, @all_sample_names ) = @_;
#create hash of mutations from the MAF file
my ( %mutations, %all_genes );
#parse the MAF file and fill up the mutation status hashes
my $maf_fh = IO::File->new( $self->maf_file ) or die "Couldn't open MAF file!\n";
while( my $line = $maf_fh->getline ) {
next if( $line =~ m/^(#|Hugo_Symbol)/ );
chomp $line;
my @cols = split( /\t/, $line );
my ( $gene, $mutation_class, $sample ) = @cols[0,8,15];
#check that the mutation class is valid
if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) {
$self->error_message( "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\nPlease use TCGA MAF v2.3.\n" );
return;
}
#check to see if this gene is on the list (if there is a list at all)
if( defined @{$mutated_genes_to_include} ) {
next unless( scalar grep { m/^$gene$/ } @{$mutated_genes_to_include} );
}
# If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
if(( $self->skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) ||
( $self->skip_silent && $mutation_class =~ m/^Silent$/ )) {
print "Skipping $mutation_class mutation in gene $gene.\n";
next;
}
$all_genes{$gene}++;
$mutations{$sample}{$gene}++;
}
$maf_fh->close;
#sort @all_genes for consistency in header and loops
my @all_genes = sort keys %all_genes;
#write the input matrix for R code to a temp file
my $matrix_fh = new IO::File $mutation_matrix,"w" or die "Failed to create matrix file $mutation_matrix!: $!";
#print input matrix file header
my $header = join( "\t", "Sample", @all_genes );
$matrix_fh->print( "$header\n" );
#print mutation relation input matrix
for my $sample ( sort @all_sample_names ) {
$matrix_fh->print( $sample );
for my $gene ( @all_genes ) {
if( exists $mutations{$sample}{$gene} ) {
$matrix_fh->print( "\t$mutations{$sample}{$gene}" );
}
else {
$matrix_fh->print( "\t0" );
}
}
$matrix_fh->print( "\n" );
}
}
sub create_sample_gene_matrix_variant {
my ( $self, $mutation_matrix, $mutated_genes_to_include, @all_sample_names ) = @_;
#create hash of mutations from the MAF file
my ( %variants_hash, %all_variants );
#parse the MAF file and fill up the mutation status hashes
my $maf_fh = IO::File->new( $self->maf_file ) or die "Couldn't open MAF file!\n";
while( my $line = $maf_fh->getline ) {
next if( $line =~ m/^(#|Hugo_Symbol)/ );
chomp $line;
my @cols = split( /\t/, $line );
my ( $gene, $chr, $start, $stop, $mutation_class, $mutation_type, $ref, $var1, $var2, $sample ) = @cols[0,4..6,8..12,15];
#check that the mutation class is valid
if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) {
$self->error_message( "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\nPlease use TCGA MAF v2.3.\n" );
return;
}
#check to see if this gene is on the list (if there is a list at all)
if( defined @{$mutated_genes_to_include} ) {
next unless (scalar grep { m/^$gene$/ } @{$mutated_genes_to_include});
}
# If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
if(( $self->skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) ||
( $self->skip_silent && $mutation_class =~ m/^Silent$/ )) {
print "Skipping $mutation_class mutation in gene $gene.\n";
next;
}
my $var;
my $variant_name;
if( $ref eq $var1 ) {
$var = $var2;
$variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
$variants_hash{$sample}{$variant_name}++;
$all_variants{$variant_name}++;
}
elsif( $ref eq $var2 ) {
$var = $var1;
$variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
$variants_hash{$sample}{$variant_name}++;
$all_variants{$variant_name}++;
}
elsif( $ref ne $var1 && $ref ne $var2 ) {
$var = $var1;
$variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
$variants_hash{$sample}{$variant_name}++;
$all_variants{$variant_name}++;
$var = $var2;
$variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
$variants_hash{$sample}{$variant_name}++;
$all_variants{$variant_name}++;
}
}
$maf_fh->close;
#sort variants for consistency
my @variant_names = sort keys %all_variants;
#write the input matrix for R code to a file
my $matrix_fh = new IO::File $mutation_matrix,"w" or die "Failed to create matrix file $mutation_matrix!: $!";
#print input matrix file header
my $header = join("\t","Sample",@variant_names);
$matrix_fh->print("$header\n");
#print mutation relation input matrix
for my $sample (sort @all_sample_names) {
$matrix_fh->print($sample);
for my $variant (@variant_names) {
if (exists $variants_hash{$sample}{$variant}) {
$matrix_fh->print("\t$variants_hash{$sample}{$variant}");
}
else {
$matrix_fh->print("\t0");
}
}
$matrix_fh->print("\n");
}
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Survival.pm.R 000444 000765 000024 5451 12013522176 25756 0 ustar 00nnutter staff 000000 000000 ### Survival analysis for mutation data ###
### original location of code: /gscuser/qzhang/gstat/survival/survival.R
### example input file: /gscuser/qzhang/gstat/survival/tcga.tsv
### Run it on command line like below
### for example, R --no-save --args < survival.R vital_status.input mut_matrix.input legend.placement output_dir &
### clinical data /vital status input file, first three columns are sample_ID, survival_time, vital_status (0=living, 1=deceased)
######################## read input arguments
clinical.survival.data=commandArgs()[4];
mut.data=commandArgs()[5];
legend.placement=commandArgs()[6];
out.dir=commandArgs()[7];
######################## read and prepare data
vitals = read.table(clinical.survival.data,header=T);
mut_matrix = read.table(mut.data,header=T);
x = merge(vitals,mut_matrix,by.x=1,by.y=1);
write.table(x,file=paste(out.dir,"survival_analysis_data_matrix.csv",sep="/"),quote=F,append=F,row.names=F,sep="\t")
colnames(x)[-c(1:3)]->phenos
if (class(x[,phenos])=="integer" & length(unique(x[,phenos]))<6) x[,phenos] [x[,phenos]>1]=1
######################### survival analysis
library(survival)
logr=NULL
for (phenotype in phenos)
{
#clean data
loopdata <- x;
loopdata <- loopdata[!is.na(loopdata[,phenotype]),];
loopdata <- loopdata[!is.na(loopdata[,3]),];
loopdata <- loopdata[!is.na(loopdata[,2]),];
status=loopdata[,3];
time=loopdata[,2];
x1=loopdata[,phenotype];
base.class = as.vector(sort(unique(x1)))[1];
coxph(Surv(time, status) ~ x1, loopdata) -> co;
summary(co)->co; co$conf->cox; co$logtest[3]->p; co$coef[5]->indv.p;
rownames(cox) = sub("x1","",rownames(cox));
if (length(rownames(cox))==1 && rownames(cox)[1]=="") { rownames(cox)[1] = "1"; }
logr=rbind(logr,cbind(base.class,rownames(cox),phenotype,cox,indv.p,p))
mfit.by <- survfit(Surv(time, status == 1) ~ x1, data = loopdata)
## file name for plot
bitmap(file=paste(out.dir,"/",phenotype,"_survival_plot.png",sep=""))
## create survival plot
plot(mfit.by,lty=1:10,ylab="Survival Probability",xlab="Time",col=c(1:10))
if (dim(table(x1))>1) {
title(paste(phenotype,", P=",signif(p,3),sep=""));
} else {
title(paste(phenotype));
}
legend(x=legend.placement, legend=names(table(x1)), lty = 1:10, col=c(1:10))
dev.off()
}
########################## calculate fdr
logr=logr[,-5];
if (length(phenos) < 2) { logr=(t(logr)); }
fdr=p.adjust(as.numeric(logr[,"p"]),"fdr")
logr=cbind(logr,fdr)
######################### print output
colnames(logr)[1:9]=c("base.class","comparison.class","phenotype","hazard.ratio","lower.95","upper.95","2-class-p-value","p-value","fdr")
logr=logr[order(logr[,"p-value"]),]
write.table(logr,file=paste(out.dir,"survival_analysis_test_results.csv",sep="/"),quote=F,append=F,row.names=F,sep="\t")
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Bmr 000755 000765 000024 0 12013522176 23763 5 ustar 00nnutter staff 000000 000000 Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Bmr/Base.pm 000444 000765 000024 1151 12013522176 25326 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::Bmr::Base;
use strict;
use warnings;
use Genome;
our $VERSION = $Genome::Model::Tools::Music::VERSION;
class Genome::Model::Tools::Music::Bmr::Base {
is => ['Genome::Model::Tools::Music::Base'],
is_abstract => 1,
attributes_have => [
file_format => {
is => 'Text',
is_optional => 1,
}
],
doc => "Mutational Significance In Cancer (BMR Calculations)"
};
sub _doc_authors {
return " Cyriac Kandoth, Ph.D.";
}
sub _doc_see_also {
return <(1),
B(1),
B(1)
EOS
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Bmr/CalcBmr.pm 000444 000765 000024 75103 12013522176 26007 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::Bmr::CalcBmr;
use warnings;
use strict;
use Genome;
use IO::File;
use Bit::Vector;
use List::Util qw( min sum );
our $VERSION = $Genome::Model::Tools::Music::VERSION;
# These constants let us use space-efficient arrays instead of hashes, while keeping the code fairly readable
use constant { AT_Transitions => 0, AT_Transversions => 1, CG_Transitions => 2, CG_Transversions => 3,
CpG_Transitions => 4, CpG_Transversions => 5, Indels => 6, Truncations => 7, Overall => 8,
covd_bases => 0, mutations => 1, bmr => 2 };
class Genome::Model::Tools::Music::Bmr::CalcBmr {
is => 'Genome::Model::Tools::Music::Bmr::Base',
has_input => [
roi_file => { is => 'Text', doc => "Tab delimited list of ROIs [chr start stop gene_name] (See DESCRIPTION)" },
reference_sequence => { is => 'Text', doc => "Path to reference sequence in FASTA format" },
bam_list => { is => 'Text', doc => "Tab delimited list of BAM files [sample_name normal_bam tumor_bam] (See DESCRIPTION)" },
output_dir => { is => 'Text', doc => "Directory where output files will be written (Use the same one used with calc-covg)" },
maf_file => { is => 'Text', doc => "List of mutations using TCGA MAF specification v2.3" },
bmr_groups => { is => 'Integer', doc => "Number of clusters of samples with comparable BMRs (See DESCRIPTION)", is_optional => 1, default => 1 },
show_skipped => { is => 'Boolean', doc => "Report each skipped mutation, not just how many", is_optional => 1, default => 0 },
separate_truncations => { is => 'Boolean', doc => "Group truncational mutations as a separate category", is_optional => 1, default => 0 },
merge_concurrent_muts => { is => 'Boolean', doc => "Multiple mutations of a gene in the same sample are treated as 1", is_optional => 1, default => 0 },
genes_to_ignore => { is => 'Text', doc => "Comma-delimited list of genes to ignore for background mutation rates", is_optional => 1 },
skip_non_coding => { is => 'Boolean', doc => "Skip non-coding mutations from the provided MAF file", is_optional => 1, default => 1 },
skip_silent => { is => 'Boolean', doc => "Skip silent mutations from the provided MAF file", is_optional => 1, default => 1 },
],
has_output => [
bmr_output => { is => 'Number', doc => "TODO" },
gene_mr_file => { is => 'Text', doc => "TODO" },
],
doc => "Calculates mutation rates given per-gene coverage (from \"music bmr calc-covg\"), and a mutation list",
};
sub help_synopsis {
return <(1),
B(1),
B(1)
EOS
}
sub execute {
my $self = shift;
my $roi_file = $self->roi_file;
my $ref_seq = $self->reference_sequence;
my $bam_list = $self->bam_list;
my $output_dir = $self->output_dir;
my $maf_file = $self->maf_file;
my $show_skipped = $self->show_skipped;
my $bmr_groups = $self->bmr_groups;
my $separate_truncations = $self->separate_truncations;
my $merge_concurrent_muts = $self->merge_concurrent_muts;
my $genes_to_ignore = $self->genes_to_ignore;
my $skip_non_coding = $self->skip_non_coding;
my $skip_silent = $self->skip_silent;
# Check on all the input data before starting work
print STDERR "ROI file not found or is empty: $roi_file\n" unless( -s $roi_file );
print STDERR "Reference sequence file not found: $ref_seq\n" unless( -e $ref_seq );
print STDERR "List of BAMs not found or is empty: $bam_list\n" unless( -s $bam_list );
print STDERR "Output directory not found: $output_dir\n" unless( -e $output_dir );
print STDERR "MAF file not found or is empty: $maf_file\n" unless( -s $maf_file );
return undef unless( -s $roi_file && -e $ref_seq && -s $bam_list && -e $output_dir && -s $maf_file );
# Check on the files we expect to find within the provided output directory
$output_dir =~ s/(\/)+$//; # Remove trailing forward slashes if any
my $gene_covg_dir = "$output_dir/gene_covgs"; # Should contain per-gene coverage files per sample
my $total_covgs_file = "$output_dir/total_covgs"; # Should contain overall coverages per sample
print STDERR "Directory with per-gene coverages not found: $gene_covg_dir\n" unless( -e $gene_covg_dir );
print STDERR "Total coverages file not found or is empty: $total_covgs_file\n" unless( -s $total_covgs_file );
return undef unless( -e $gene_covg_dir && -s $total_covgs_file );
# Outputs of this script will be written to these locations in the output directory
my $overall_bmr_file = "$output_dir/overall_bmrs";
my $gene_mr_file = "$output_dir/gene_mrs";
$self->gene_mr_file( $gene_mr_file );
# Build a hash to quickly lookup the genes to be ignored for overall BMRs
my %ignored_genes = ();
if( defined $genes_to_ignore ) {
%ignored_genes = map { $_ => 1 } split( /,/, $genes_to_ignore );
}
# Parse out the names of the samples which should match the names of the coverage files needed
my ( @all_sample_names, %sample_idx );
my $idx = 0;
my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!";
while( my $line = $sampleFh->getline ) {
next if ( $line =~ m/^#/ );
chomp( $line );
my ( $sample ) = split( /\t/, $line );
push( @all_sample_names, $sample );
$sample_idx{$sample} = $idx++;
}
$sampleFh->close;
# If the reference sequence FASTA file hasn't been indexed, do it
my $ref_seq_idx = "$ref_seq.fai";
Genome::Sys->shellcmd( cmd => "samtools faidx $ref_seq" ) unless( -e $ref_seq_idx );
# Parse gene names and ROIs. Mutations outside these ROIs will be skipped
my ( @all_gene_names, %gene_idx );
$idx = 0;
my $roi_bitmask = $self->create_empty_genome_bitmask( $ref_seq_idx );
my $roiFh = IO::File->new( $roi_file ) or die "Couldn't open $roi_file. $!";
while( my $line = $roiFh->getline ) {
next if( $line =~ m/^#/ );
chomp $line;
my ( $chr, $start, $stop, $gene ) = split( /\t/, $line );
if( !$roi_bitmask->{$chr} or $start > $roi_bitmask->{$chr}->Size ) {
print STDERR "Skipping invalid ROI bitmask $chr:$start-$stop\n";
next;
}
$roi_bitmask->{$chr}->Interval_Fill( $start, $stop );
unless( defined $gene_idx{$gene} ) {
push( @all_gene_names, $gene );
$gene_idx{$gene} = $idx++;
}
}
$roiFh->close;
# These are the various categories that each mutation will be classified into
my @mut_classes = ( AT_Transitions, AT_Transversions, CG_Transitions, CG_Transversions, CpG_Transitions, CpG_Transversions, Indels );
push( @mut_classes, Truncations ) if( $separate_truncations );
# Save the actual class names for reporting purposes, because the elements above are really just numerical constants
my @mut_class_names = qw( AT_Transitions AT_Transversions CG_Transitions CG_Transversions CpG_Transitions CpG_Transversions Indels );
push( @mut_class_names, 'Truncations' ) if( $separate_truncations );
my @sample_mr; # Stores per sample covg and mutation information
foreach my $sample ( @all_sample_names ) {
$sample_mr[$sample_idx{$sample}][$_][mutations] = 0 foreach( @mut_classes );
$sample_mr[$sample_idx{$sample}][$_][covd_bases] = 0 foreach( @mut_classes );
}
# Load the covered base-counts per sample from the output of "music bmr calc-covg"
print STDERR "Loading per-sample coverages stored in $total_covgs_file\n";
my $sample_cnt_in_total_covgs_file = 0;
my $totCovgFh = IO::File->new( $total_covgs_file ) or die "Couldn't open $total_covgs_file. $!";
while( my $line = $totCovgFh->getline ) {
next unless( $line =~ m/^\S+\t\d+\t\d+\t\d+\t\d+$/ and $line !~ m/^#/ );
chomp( $line );
++$sample_cnt_in_total_covgs_file;
my ( $sample, $covd_bases, $covd_at_bases, $covd_cg_bases, $covd_cpg_bases ) = split( /\t/, $line );
$sample_mr[$sample_idx{$sample}][AT_Transitions][covd_bases] = $covd_at_bases;
$sample_mr[$sample_idx{$sample}][AT_Transversions][covd_bases] = $covd_at_bases;
$sample_mr[$sample_idx{$sample}][CG_Transitions][covd_bases] = $covd_cg_bases;
$sample_mr[$sample_idx{$sample}][CG_Transversions][covd_bases] = $covd_cg_bases;
$sample_mr[$sample_idx{$sample}][CpG_Transitions][covd_bases] = $covd_cpg_bases;
$sample_mr[$sample_idx{$sample}][CpG_Transversions][covd_bases] = $covd_cpg_bases;
$sample_mr[$sample_idx{$sample}][Indels][covd_bases] = $covd_bases;
$sample_mr[$sample_idx{$sample}][Truncations][covd_bases] = $covd_bases if( $separate_truncations );
}
$totCovgFh->close;
unless( $sample_cnt_in_total_covgs_file == scalar( @all_sample_names )) {
print STDERR "Mismatching number of samples in $total_covgs_file and $bam_list\n";
return undef;
}
my @gene_mr; # Stores per gene covg and mutation information
foreach my $gene ( @all_gene_names ) {
foreach my $sample ( @all_sample_names ) {
$gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$_][mutations] = 0 foreach( @mut_classes );
$gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$_][covd_bases] = 0 foreach( @mut_classes );
}
}
# Sum up the per-gene covered base-counts across samples from the output of "music bmr calc-covg"
print STDERR "Loading per-gene coverage files stored under $gene_covg_dir/\n";
foreach my $sample ( @all_sample_names ) {
my $sample_covg_file = "$gene_covg_dir/$sample.covg";
my $sampleCovgFh = IO::File->new( $sample_covg_file ) or die "Couldn't open $sample_covg_file. $!";
while( my $line = $sampleCovgFh->getline ) {
next unless( $line =~ m/^\S+\t\d+\t\d+\t\d+\t\d+\t\d+$/ and $line !~ m/^#/ );
chomp( $line );
my ( $gene, undef, $covd_bases, $covd_at_bases, $covd_cg_bases, $covd_cpg_bases ) = split( /\t/, $line );
$gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][AT_Transitions][covd_bases] += $covd_at_bases;
$gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][AT_Transversions][covd_bases] += $covd_at_bases;
$gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][CG_Transitions][covd_bases] += $covd_cg_bases;
$gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][CG_Transversions][covd_bases] += $covd_cg_bases;
$gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][CpG_Transitions][covd_bases] += $covd_cpg_bases;
$gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][CpG_Transversions][covd_bases] += $covd_cpg_bases;
$gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][Indels][covd_bases] += $covd_bases;
$gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][Truncations][covd_bases] += $covd_bases if( $separate_truncations );
}
$sampleCovgFh->close;
}
# Run "joinx ref-stats" to classify SNVs as being at AT, CG, or CpG sites in the reference
print STDERR "Running 'joinx ref-stats' to read reference FASTA and identify SNVs at AT, CG, CpG sites\n";
my $maf_bed = Genome::Sys->create_temp_file_path();
my $mafBedFh = IO::File->new( $maf_bed, ">" ) or die "Temporary file could not be created. $!";
my $mafFh = IO::File->new( $maf_file ) or die "Couldn't open $maf_file. $!";
while( my $line = $mafFh->getline ) {
next if( $line =~ m/^(#|Hugo_Symbol)/ );
chomp $line;
my @cols = split( /\t/, $line );
my ( $chr, $start, $stop, $mutation_type, $ref, $var1, $var2 ) = @cols[4..6,9..12];
if( $mutation_type =~ m/^(SNP|DNP|ONP|TNP)$/ ) {
$mafBedFh->print( "$chr\t" . ( $start - 2 ) . "\t" . ( $start + 1 ) . "\n" );
}
}
$mafFh->close;
$mafBedFh->close;
my $refstats_file = Genome::Sys->create_temp_file_path();
Genome::Sys->shellcmd( cmd => "joinx ref-stats --ref-bases --bed $maf_bed --fasta $ref_seq --output $refstats_file" );
# Parse through the ref-stats output and load it into hashes for quick lookup later
my ( %ref_base, %cpg_site );
my $refStatsFh = IO::File->new( $refstats_file ) or die "Couldn't open $refstats_file. $!";
while( my $line = $refStatsFh->getline ) {
next if( $line =~ m/^#/ );
chomp $line;
my ( $chr, undef, $pos, undef, undef, undef, $ref ) = split( /\t/, $line );
my $locus = "$chr\t" . ( $pos - 1 );
$ref_base{$locus} = substr( $ref, 1, 1 );
$cpg_site{$locus} = 1 if( $ref =~ m/CG/ );
}
$refStatsFh->close;
# Create a hash to help classify SNVs
my %classify;
$classify{$_} = AT_Transitions foreach( qw( AG TC ));
$classify{$_} = AT_Transversions foreach( qw( AC AT TA TG ));
$classify{$_} = CG_Transitions foreach( qw( CT GA ));
$classify{$_} = CG_Transversions foreach( qw( CA CG GC GT ));
# Parse through the MAF file and categorize each somatic mutation
print STDERR "Parsing MAF file to classify mutations\n";
my %skip_cnts;
$mafFh = IO::File->new( $maf_file ) or die "Couldn't open $maf_file. $!";
while( my $line = $mafFh->getline ) {
next if( $line =~ m/^(#|Hugo_Symbol)/ );
chomp $line;
my @cols = split( /\t/, $line );
my ( $gene, $chr, $start, $stop, $mutation_class, $mutation_type, $ref, $var1, $var2, $sample ) = @cols[0,4..6,8..12,15];
# Skip mutations in samples that are not in the provided bam list
unless( defined $sample_idx{$sample} ) {
$skip_cnts{"belong to unrecognized samples"}++;
print STDERR "Skipping unrecognized sample ($sample not in BAM list): $gene, $chr:$start-$stop\n" if( $show_skipped );
next;
}
# If the mutation classification is odd, quit with error
if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) {
print STDERR "Unrecognized Variant_Classification \"$mutation_class\" in MAF file: $gene, $chr:$start-$stop\n";
print STDERR "Please use TCGA MAF Specification v2.3.\n";
return undef;
}
# If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
if(( $skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) ||
( $skip_silent && $mutation_class =~ m/^Silent$/ )) {
$skip_cnts{"are classified as $mutation_class"}++;
print STDERR "Skipping $mutation_class mutation: $gene, $chr:$start-$stop\n" if( $show_skipped );
next;
}
# If the mutation type is odd, quit with error
if( $mutation_type !~ m/^(SNP|DNP|TNP|ONP|INS|DEL|Consolidated)$/ ) {
print STDERR "Unrecognized Variant_Type \"$mutation_type\" in MAF file: $gene, $chr:$start-$stop\n";
print STDERR "Please use TCGA MAF Specification v2.3.\n";
return undef;
}
# Skip mutations that were consolidated into others (E.g. SNP consolidated into a TNP)
if( $mutation_type =~ m/^Consolidated$/ ) {
$skip_cnts{"are consolidated into another"}++;
print STDERR "Skipping consolidated mutation: $gene, $chr:$start-$stop\n" if( $show_skipped );
next;
}
# Skip mutations that fall completely outside any of the provided regions of interest
if( $self->count_bits( $roi_bitmask->{$chr}, $start, $stop ) == 0 ) {
$skip_cnts{"are outside any ROIs"}++;
print STDERR "Skipping mutation that falls outside ROIs: $gene, $chr:$start-$stop\n" if( $show_skipped );
next;
}
# Skip mutations whose gene names don't match any of those in the ROI list
unless( defined $gene_idx{$gene} ) {
$skip_cnts{"have unrecognized gene names"}++;
print STDERR "Skipping unrecognized gene name (not in ROI file): $gene, $chr:$start-$stop\n" if( $show_skipped );
next;
}
my $class = '';
# Check if the mutation is the truncating type, if the user wanted a separate category of those
if( $separate_truncations && $mutation_class =~ m/^(Nonsense_Mutation|Splice_Site|Frame_Shift_Del|Frame_Shift_Ins)/ ) {
$class = Truncations;
}
# Classify the mutation as AT/CG/CpG Transition, AT/CG/CpG Transversion
elsif( $mutation_type =~ m/^(SNP|DNP|ONP|TNP)$/ ) {
# ::TBD:: For DNPs and TNPs, we use only the first base for mutation classification
$ref = substr( $ref, 0, 1 );
$var1 = substr( $var1, 0, 1 );
$var2 = substr( $var2, 0, 1 );
# If the alleles are anything but A, C, G, or T then quit with error
if( $ref !~ m/[ACGT]/ || $var1 !~ m/[ACGT]/ || $var2 !~ m/[ACGT]/ ) {
print STDERR "Unrecognized allele in column Reference_Allele, Tumor_Seq_Allele1, or Tumor_Seq_Allele2: $gene, $chr:$start-$stop\n";
print STDERR "Please use TCGA MAF Specification v2.3.\n";
return undef;
}
# Use the classify hash to find whether this SNV is an AT/CG Transition/Transversion
$class = $classify{ "$ref$var1" } if( defined $classify{ "$ref$var1" } );
$class = $classify{ "$ref$var2" } if( defined $classify{ "$ref$var2" } );
# Check if the ref base in the MAF matched what we fetched from the ref-seq
my $locus = "$chr\t$start";
if( defined $ref_base{$locus} && $ref_base{$locus} ne $ref ) {
print STDERR "Reference allele $ref for $gene variant at $chr:$start-$stop is " . $ref_base{$locus} . " in the FASTA. Using it anyway.\n";
}
# Check if a C or G reference allele belongs to a CpG pair in the refseq
if(( $ref eq 'C' || $ref eq 'G' ) && defined $cpg_site{$locus} ) {
$class = (( $class == CG_Transitions ) ? CpG_Transitions : CpG_Transversions );
}
}
# Classify it as an indel (excludes splice-site and frame-shift if user wanted truncations separately)
elsif( $mutation_type =~ m/^(INS|DEL)$/ ) {
$class = Indels;
}
# The user's gene exclusion list affects only the overall BMR calculations
$sample_mr[$sample_idx{$sample}][$class][mutations]++ unless( defined $ignored_genes{$gene} );
$gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][mutations]++;
}
$mafFh->close;
# Display statistics related to parsing the MAF
print STDERR "Finished Parsing the MAF file to classify mutations\n";
foreach my $skip_type ( sort {$skip_cnts{$b} <=> $skip_cnts{$a}} keys %skip_cnts ) {
print STDERR "Skipped " . $skip_cnts{$skip_type} . " mutation(s) that $skip_type\n" if( defined $skip_cnts{$skip_type} );
}
# If the user wants, merge together concurrent mutations of a gene in the same sample
if( $merge_concurrent_muts ) {
foreach my $sample ( @all_sample_names ) {
foreach my $gene ( @all_gene_names ) {
next unless( defined $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}] );
my $num_muts = 0;
$num_muts += $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$_][mutations] foreach( @mut_classes );
if( $num_muts > 1 ) {
foreach my $class ( @mut_classes ) {
my $muts_in_class = $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][mutations]; # Num of muts of gene in this class
$sample_mr[$sample_idx{$sample}][$class][mutations] -= $muts_in_class; # Take it out of the sample total
$muts_in_class /= $num_muts; # Turn it into a fraction of the total number of muts in this gene
$gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][mutations] = $muts_in_class; # Use the fraction as the num muts of gene in this class
$sample_mr[$sample_idx{$sample}][$class][mutations] += $muts_in_class; # Add the same fraction to the sample total
}
}
}
}
}
# Calculate per-sample BMRs, and also subtract out covered bases in genes the user wants ignored
foreach my $sample ( @all_sample_names ) {
my $tot_muts = 0;
foreach my $class ( @mut_classes ) {
# Subtract the covered bases in this class that belong to the genes to be ignored
# ::TBD:: Some of these bases may also belong to another gene (on the other strand maybe?), and those should not be subtracted
foreach my $ignored_gene ( keys %ignored_genes ) {
$sample_mr[$sample_idx{$sample}][$class][covd_bases] -= $gene_mr[$sample_idx{$sample}][$gene_idx{$ignored_gene}][$class][covd_bases] if( defined $gene_mr[$sample_idx{$sample}][$gene_idx{$ignored_gene}] );
}
$tot_muts += $sample_mr[$sample_idx{$sample}][$class][mutations];
}
$sample_mr[$sample_idx{$sample}][Overall][bmr] = $tot_muts / $sample_mr[$sample_idx{$sample}][Indels][covd_bases];
}
# Cluster samples into bmr-groups using k-means clustering
my @sample_bmrs = map { $sample_mr[$sample_idx{$_}][Overall][bmr] } @all_sample_names;
my @bmr_clusters = k_means( $bmr_groups, \@sample_bmrs );
# Calculate overall BMRs for each cluster of samples, and print them to file
my %cluster_bmr; # Stores per cluster categorized BMR
my $totBmrFh = IO::File->new( $overall_bmr_file, ">" ) or die "Couldn't open $overall_bmr_file. $!";
$totBmrFh->print( "#User-specified genes skipped in these calculations: $genes_to_ignore\n" ) if( defined $genes_to_ignore );
my ( $covered_bases_sum, $mutations_sum ) = ( 0, 0 );
for( my $i = 0; $i < scalar( @bmr_clusters ); ++$i ) {
my @samples_in_cluster = map { $all_sample_names[$_] } @{$bmr_clusters[$i]};
unless( $bmr_groups == 1 ) {
$totBmrFh->print( "#BMR sub-group ", $i + 1, " (", scalar( @{$bmr_clusters[$i]} ), " samples)\n" );
$totBmrFh->print( "#Samples: ", join( ",", @samples_in_cluster ), "\n" );
}
$totBmrFh->print( "#Mutation_Class\tCovered_Bases\tMutations\tOverall_BMR\n" );
my ( $tot_covd_bases, $tot_muts ) = ( 0, 0 );
foreach my $class ( @mut_classes ) {
my ( $covd_bases, $mutations ) = ( 0, 0 );
foreach my $sample ( @samples_in_cluster ) {
$covd_bases += $sample_mr[$sample_idx{$sample}][$class][covd_bases];
$mutations += $sample_mr[$sample_idx{$sample}][$class][mutations];
}
$tot_covd_bases = $covd_bases if( $class == Indels ); # Save this to calculate overall BMR below
# Calculate overall BMR for this mutation class and print it to file
$cluster_bmr{$i}[$class][bmr] = ( $covd_bases == 0 ? 0 : ( $mutations / $covd_bases ));
$totBmrFh->print( join( "\t", $mut_class_names[$class], $covd_bases, $mutations, $cluster_bmr{$i}[$class][bmr] ), "\n" );
$tot_muts += $mutations;
}
$totBmrFh->print( join( "\t", "Overall_BMR", $tot_covd_bases, $tot_muts, $tot_muts / $tot_covd_bases ), "\n\n" );
$covered_bases_sum += $tot_covd_bases;
$mutations_sum += $tot_muts;
}
$totBmrFh->close;
$self->bmr_output( $mutations_sum / $covered_bases_sum );
# Print out a file containing per-gene mutation counts and covered bases for use by "music smg"
my $geneBmrFh = IO::File->new( $gene_mr_file, ">" ) or die "Couldn't open $gene_mr_file. $!";
$geneBmrFh->print( "#Gene\tMutation_Class\tCovered_Bases\tMutations\tBMR\n" );
foreach my $gene ( sort @all_gene_names ) {
my ( $tot_covd_bases, $tot_muts ) = ( 0, 0 );
for( my $i = 0; $i < scalar( @bmr_clusters ); ++$i ) {
my @samples_in_cluster = map { $all_sample_names[$_] } @{$bmr_clusters[$i]};
foreach my $class ( @mut_classes ) {
my ( $covd_bases, $mutations ) = ( 0, 0 );
foreach my $sample( @samples_in_cluster ) {
if( defined $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}] ) {
$covd_bases += $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][covd_bases];
$mutations += $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][mutations];
}
}
my $rename_class = $mut_class_names[$class];
$rename_class = ( $rename_class . "_SubGroup" . ( $i + 1 )) if( $bmr_groups > 1 );
$geneBmrFh->print( join( "\t", $gene, $rename_class, $covd_bases, $mutations, $cluster_bmr{$i}[$class][bmr] ), "\n" );
$tot_muts += $mutations;
$tot_covd_bases += $covd_bases if( $class == Indels );
}
}
$geneBmrFh->print( join( "\t", $gene, "Overall", $tot_covd_bases, $tot_muts, $self->bmr_output ), "\n" );
}
$geneBmrFh->close;
return 1;
}
# Creates an empty whole genome bitmask based on the given reference sequence index
sub create_empty_genome_bitmask {
my ( $self, $ref_seq_idx_file ) = @_;
my %genome;
my $refFh = IO::File->new( $ref_seq_idx_file ) or die "Couldn't open $ref_seq_idx_file. $!";
while( my $line = $refFh->getline ) {
my ( $chr, $length ) = split( /\t/, $line );
$genome{$chr} = Bit::Vector->new( $length + 1 ); # Adding a base for 1-based coordinates
}
$refFh->close;
return \%genome;
}
# Counts the number of bits that are set in the given region of a Bit:Vector
sub count_bits {
my ( $self, $vector, $start, $stop ) = @_;
my $count = 0;
for my $pos ( $start..$stop ) {
++$count if( $vector->bit_test( $pos ));
}
return $count;
}
# Given a list of numerical values, returns k clusters based on k-means clustering
sub k_means {
my ( $k, $list_ref ) = @_;
my @vals = @{$list_ref};
my $num_vals = scalar( @vals );
# Start with the first k values as the centroids
my @centroids = @vals[0..($k-1)];
my @prev_centroids = map { 0 } @centroids;
my @groups = ();
my $diff_means = 1; # Arbitrary non-zero value
# Repeat until the difference between these centroids and the previous ones, converges to zero
while( $diff_means > 0 ) {
@groups = ();
# Group values into clusters based on closest centroid
for( my $i = 0; $i < $num_vals; ++$i ) {
my @distances = map { abs( $vals[$i] - $_ ) } @centroids;
my $closest = min( @distances );
for( my $j = 0; $j < $k; ++$j ) {
if( $distances[$j] == $closest ) { push( @{$groups[$j]}, $i ); last; }
}
}
# Calculate means to be the new centroids, and the sum of differences
$diff_means = 0;
for( my $i = 0; $i < $k; ++$i ) {
$centroids[$i] = sum( map {$vals[$_]} @{$groups[$i]} );
$centroids[$i] /= scalar( @{$groups[$i]} );
$diff_means += abs( $centroids[$i] - $prev_centroids[$i] );
}
# Save the current centroids for comparisons with those in the next iteration
@prev_centroids = @centroids;
}
return @groups;
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Bmr/CalcCovg.pm 000444 000765 000024 33574 12013522176 26173 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::Bmr::CalcCovg;
use warnings;
use strict;
use IO::File;
our $VERSION = $Genome::Model::Tools::Music::VERSION;
class Genome::Model::Tools::Music::Bmr::CalcCovg {
is => 'Genome::Model::Tools::Music::Bmr::Base',
has_input => [
roi_file => { is => 'Text', doc => "Tab delimited list of ROIs [chr start stop gene_name] (See Description)" },
reference_sequence => { is => 'Text', doc => "Path to reference sequence in FASTA format" },
bam_list => { is => 'Text', doc => "Tab delimited list of BAM files [sample_name normal_bam tumor_bam] (See Description)" },
output_dir => { is => 'Text', doc => "Directory where output files and subdirectories will be written", is_output => 1},
cmd_list_file => { is => 'Text', doc => "A file to write calcRoiCovg commands to (See Description)", is_optional => 1 },
cmd_prefix => { is => 'Text', doc => "A command that submits a job to your cluster (See Description)", is_optional => 1 },
normal_min_depth => { is => 'Integer', doc => "The minimum read depth to consider a Normal BAM base as covered", is_optional => 1},
tumor_min_depth => { is => 'Integer', doc => "The minimum read depth to consider a Tumor BAM base as covered", is_optional => 1},
min_mapq => { is => 'Integer', doc => "The minimum mapping quality of reads to consider towards read depth counts", is_optional => 1},
],
has_output => [
gene_covg_dir => { is => 'Text', doc => "Directory where per-sample gene coverage files are located"},
],
doc => "Uses calcRoiCovg.c to count covered bases per-gene for each given tumor-normal pair of BAMs."
};
sub help_synopsis {
return <(1),
B(1),
B(1)
EOS
}
sub execute {
my $self = shift;
my $roi_file = $self->roi_file;
my $ref_seq = $self->reference_sequence;
my $bam_list = $self->bam_list;
my $output_dir = $self->output_dir;
my $cmd_list_file = $self->cmd_list_file;
my $cmd_prefix = $self->cmd_prefix;
my $normal_min_depth = $self->normal_min_depth;
my $tumor_min_depth = $self->tumor_min_depth;
my $min_mapq = $self->min_mapq;
my $optional_params = "";
if ($normal_min_depth) {
$optional_params .= " --normal-min-depth $normal_min_depth";
}
if ($tumor_min_depth) {
$optional_params .= " --tumor-min-depth $tumor_min_depth";
}
if ($min_mapq) {
$optional_params .= " --min-mapq $min_mapq";
}
# Check on all the input data before starting work
print STDERR "ROI file not found or is empty: $roi_file\n" unless( -s $roi_file );
print STDERR "Reference sequence file not found: $ref_seq\n" unless( -e $ref_seq );
print STDERR "List of BAMs not found or is empty: $bam_list\n" unless( -s $bam_list );
print STDERR "Output directory not found: $output_dir\n" unless( -e $output_dir );
return undef unless( -s $roi_file && -e $ref_seq && -s $bam_list && -e $output_dir );
# Outputs of this script will be written to these locations in the output directory
$output_dir =~ s/(\/)+$//; # Remove trailing forward slashes if any
my $roi_covg_dir = "$output_dir/roi_covgs"; # Stores output from calcRoiCovg per sample
my $gene_covg_dir = "$output_dir/gene_covgs"; # Stores per-gene coverages per sample
my $tot_covg_file = "$output_dir/total_covgs"; # Stores total coverages per sample
$self->gene_covg_dir($gene_covg_dir);
# Check whether the annotated regions of interest are clumped together by chromosome
my $roiFh = IO::File->new( $roi_file ) or die "ROI file could not be opened. $!\n";
my @chroms = ( "" );
while( my $line = $roiFh->getline ) # Emulate Unix's uniq command on the chromosome column
{
my ( $chrom ) = ( $line =~ m/^(\S+)/ );
push( @chroms, $chrom ) if( $chrom ne $chroms[-1] );
}
$roiFh->close;
my %chroms = map { $_ => 1 } @chroms; # Get the actual number of unique chromosomes
if( scalar( @chroms ) != scalar( keys %chroms ))
{
print STDERR "ROIs from the same chromosome must be listed adjacent to each other in file. ";
print STDERR "If in UNIX, try:\nsort -k 1,1 $roi_file\n";
return undef;
}
# If the reference sequence FASTA file hasn't been indexed, do it
my $ref_seq_idx = "$ref_seq.fai";
system( "samtools faidx $ref_seq" ) unless( -e $ref_seq_idx );
# Create the output directories unless they already exist
mkdir $roi_covg_dir unless( -e $roi_covg_dir );
mkdir $gene_covg_dir unless( -e $gene_covg_dir );
my ( $cmdFh, $totCovgFh );
if( defined $cmd_list_file )
{
$cmdFh = IO::File->new( $cmd_list_file, ">" );
print "Creating a list of parallelizable jobs at $cmd_list_file.\n";
print "After successfully running all the jobs in $cmd_list_file,\n",
"be sure to run this script a second time (without defining the cmd-list-file argument) to merge results in roi_covgs.\n";
}
else
{
$totCovgFh = IO::File->new( $tot_covg_file, ">" );
$totCovgFh->print( "#Sample\tCovered_Bases\tAT_Bases_Covered\tCG_Bases_Covered\tCpG_Bases_Covered\n" );
}
# Parse through each pair of BAM files provided and run calcRoiCovg as necessary
my $bamFh = IO::File->new( $bam_list );
while( my $line = $bamFh->getline )
{
next if( $line =~ m/^#/ );
chomp( $line );
my ( $sample, $normal_bam, $tumor_bam ) = split( /\t/, $line );
$normal_bam = '' unless( defined $normal_bam );
$tumor_bam = '' unless( defined $tumor_bam );
print STDERR "Normal BAM for $sample not found: \"$normal_bam\"\n" unless( -e $normal_bam );
print STDERR "Tumor BAM for $sample not found: \"$tumor_bam\"\n" unless( -e $tumor_bam );
next unless( -e $normal_bam && -e $tumor_bam );
# Construct the command that calculates coverage per ROI
my $calcRoiCovg_cmd = "\'gmt music bmr calc-covg-helper --normal-tumor-bam-pair \"$line\" --roi-file \"$roi_file\" ".
"--reference-sequence \"$ref_seq\" --output-file \"$roi_covg_dir/$sample.covg\"$optional_params\'";
# If user only wants the calcRoiCovg commands, write them to file and skip running calcRoiCovg
if( defined $cmd_list_file )
{
$calcRoiCovg_cmd = $cmd_prefix . " $calcRoiCovg_cmd" if( defined $cmd_prefix );
$cmdFh->print( "$calcRoiCovg_cmd\n" );
next;
}
# If the calcRoiCovg output was already generated, then don't rerun it
if( -s "$roi_covg_dir/$sample.covg" )
{
print "$sample.covg found in $roi_covg_dir. Skipping re-calculation.\n";
}
# Run the calcRoiCovg command on this tumor-normal pair. This could take a while
else {
my %params = (
normal_tumor_bam_pair => $line,
roi_file => $roi_file,
reference_sequence => $ref_seq,
output_file => $roi_covg_dir."/".$sample.".covg",
);
if ($normal_min_depth) {
$params{"normal_min_depth"} = $normal_min_depth;
}
if ($tumor_min_depth) {
$params{"tumor_min_depth"} = $tumor_min_depth;
}
if ($min_mapq) {
$params{"min_mapq"} = $min_mapq;
}
my $cmd = Genome::Model::Tools::Music::Bmr::CalcCovgHelper->create(%params);
my $rv = $cmd->execute;
if(!$rv)
{
print STDERR "Failed to execute: $calcRoiCovg_cmd\n";
next;
}
else
{
print "$sample.covg generated and stored to $roi_covg_dir.\n";
}
}
# Read the calcRoiCovg output and count covered bases per gene
my %geneCovg = ();
my ( $tot_covd, $tot_at_covd, $tot_cg_covg, $tot_cpg_covd );
my $roiCovgFh = IO::File->new( "$roi_covg_dir/$sample.covg" );
while( my $line = $roiCovgFh->getline )
{
chomp( $line );
if( $line =~ m/^#NonOverlappingTotals/ )
{
( undef, undef, undef, $tot_covd, $tot_at_covd, $tot_cg_covg, $tot_cpg_covd ) = split( /\t/, $line );
}
elsif( $line !~ m/^#/ )
{
my ( $gene, undef, $length, $covd, $at_covd, $cg_covd, $cpg_covd ) = split( /\t/, $line );
$geneCovg{$gene}{len} += $length;
$geneCovg{$gene}{covd_len} += $covd;
$geneCovg{$gene}{at} += $at_covd;
$geneCovg{$gene}{cg} += $cg_covd;
$geneCovg{$gene}{cpg} += $cpg_covd;
}
}
$roiCovgFh->close;
# Write the per-gene coverages to a file named after this sample_name
my $geneCovgFh = IO::File->new( "$gene_covg_dir/$sample.covg", ">" );
$geneCovgFh->print( "#Gene\tLength\tCovered\tAT_covd\tCG_covd\tCpG_covd\n" );
foreach my $gene ( sort keys %geneCovg )
{
$geneCovgFh->print( join( "\t", $gene, $geneCovg{$gene}{len}, $geneCovg{$gene}{covd_len},
$geneCovg{$gene}{at}, $geneCovg{$gene}{cg}, $geneCovg{$gene}{cpg} ), "\n" );
}
$geneCovgFh->close;
# Write total coverages for this sample to a file
$totCovgFh->print( "$sample\t$tot_covd\t$tot_at_covd\t$tot_cg_covg\t$tot_cpg_covd\n" );
}
$bamFh->close;
$cmdFh->close if( defined $cmd_list_file );
$totCovgFh->close unless( defined $cmd_list_file );
return 1;
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Bmr/CalcCovgHelper.pm 000444 000765 000024 16041 12013522176 27321 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::Bmr::CalcCovgHelper;
use warnings;
use strict;
use IO::File;
our $VERSION = $Genome::Model::Tools::Music::VERSION;
class Genome::Model::Tools::Music::Bmr::CalcCovgHelper {
is => 'Genome::Model::Tools::Music::Bmr::Base',
has_input => [
roi_file => { is => 'Text', doc => "Tab delimited list of ROIs [chr start stop gene_name] (See Description)" },
reference_sequence => { is => 'Text', doc => "Path to reference sequence in FASTA format" },
normal_tumor_bam_pair => { is => 'Text', doc => "Tab delimited line with sample name, path to normal bam file, and path to tumor bam file (See Description)" },
output_file => { is => 'Text', doc => "Output file path. Specify either output-file or output-directory.", is_optional => 1},
output_dir => { is => 'Text', doc => "Output directory path. Specify either output-file or output-directory", is_optional => 1},
normal_min_depth => { is => 'Integer', doc => "The minimum read depth to consider a Normal BAM base as covered", is_optional => 1, default => 6},
tumor_min_depth => { is => 'Integer', doc => "The minimum read depth to consider a Tumor BAM base as covered", is_optional => 1, default => 8},
min_mapq => { is => 'Integer', doc => "The minimum mapping quality of reads to consider towards read depth counts", is_optional => 1, default => 20},
],
has_calculated_optional => [
sample_name => {
calculate_from => ['normal_tumor_bam_pair'],
calculate => q {my @bams = split /\t/, $normal_tumor_bam_pair; return $bams[0];},
},
final_output_file => {
is_output => 1,
calculate_from => ['output_file','output_dir','sample_name'],
calculate => q {if ($output_file) {return $output_file;} elsif ($output_dir){return $output_dir."/".$sample_name;} else {die "Either output-file or output-dir must be specified."}},
},
normal_bam => {
calculate_from => ['normal_tumor_bam_pair'],
calculate => q {my @bams = split /\t/, $normal_tumor_bam_pair; return $bams[1];},
},
tumor_bam => {
calculate_from => ['normal_tumor_bam_pair'],
calculate => q {my @bams = split /\t/, $normal_tumor_bam_pair; return $bams[2];},
},
],
doc => "Uses calcRoiCovg.c to count covered bases per-gene for a tumor-normal pair of BAMs."
};
sub help_synopsis {
return <(1),
B(1),
B(1)
EOS
}
sub execute {
my $self = shift;
my $roi_file = $self->roi_file;
my $ref_seq = $self->reference_sequence;
my $tumor_bam = $self->tumor_bam;
my $normal_bam = $self->normal_bam;
my $output_file = $self->final_output_file;
my $normal_min_depth = $self->normal_min_depth;
my $tumor_min_depth = $self->tumor_min_depth;
my $min_mapq = $self->min_mapq;
# Check on all the input data before starting work
print STDERR "ROI file not found or is empty: $roi_file\n" unless( -s $roi_file );
print STDERR "Reference sequence file not found: $ref_seq\n" unless( -e $ref_seq );
print STDERR "Normal BAM file not found or is empty: $normal_bam\n" unless( -s $normal_bam );
print STDERR "Tumor BAM file not found or is empty: $tumor_bam\n" unless( -s $tumor_bam );
return undef unless( -s $roi_file && -e $ref_seq && -s $normal_bam && -s $tumor_bam );
# Check whether the annotated regions of interest are clumped together by chromosome
my $roiFh = IO::File->new( $roi_file ) or die "ROI file could not be opened. $!\n";
my @chroms = ( "" );
while( my $line = $roiFh->getline ) # Emulate Unix's uniq command on the chromosome column
{
my ( $chrom ) = ( $line =~ m/^(\S+)/ );
push( @chroms, $chrom ) if( $chrom ne $chroms[-1] );
}
$roiFh->close;
my %chroms = map { $_ => 1 } @chroms; # Get the actual number of unique chromosomes
if( scalar( @chroms ) != scalar( keys %chroms ))
{
print STDERR "ROIs from the same chromosome must be listed adjacent to each other in file. ";
print STDERR "If in UNIX, try:\nsort -k 1,1 $roi_file\n";
return undef;
}
# If the reference sequence FASTA file hasn't been indexed, do it
my $ref_seq_idx = "$ref_seq.fai";
system( "samtools faidx $ref_seq" ) unless( -e $ref_seq_idx );
$normal_bam = '' unless( defined $normal_bam );
$tumor_bam = '' unless( defined $tumor_bam );
print STDERR "Normal BAM not found: \"$normal_bam\"\n" unless( -e $normal_bam );
print STDERR "Tumor BAM not found: \"$tumor_bam\"\n" unless( -e $tumor_bam );
next unless( -e $normal_bam && -e $tumor_bam );
# Construct the command that calculates coverage per ROI
my $calcRoiCovg_cmd = "calcRoiCovg $normal_bam $tumor_bam $roi_file $ref_seq $output_file $normal_min_depth $tumor_min_depth $min_mapq";
# If the calcRoiCovg output was already generated, then don't rerun it
if( -s $output_file )
{
print "Output file $output_file found. Skipping re-calculation.\n";
}
# Run the calcRoiCovg command on this tumor-normal pair. This could take a while
elsif( system( "$calcRoiCovg_cmd" ) != 0 )
{
print STDERR "Failed to execute: $calcRoiCovg_cmd\n";
return;
}
else
{
print "$output_file generated and stored.\n";
return 1;
}
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Bmr/CalcWigCovg.pm 000444 000765 000024 23222 12013522176 26627 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::Bmr::CalcWigCovg;
use warnings;
use strict;
use Genome;
use IO::File;
our $VERSION = $Genome::Model::Tools::Music::VERSION;
class Genome::Model::Tools::Music::Bmr::CalcWigCovg {
is => 'Genome::Model::Tools::Music::Bmr::Base',
has_input => [
roi_file => { is => 'Text', doc => "Tab-delimited list of ROIs [chr start stop gene_name] (See Description)" },
reference_sequence => { is => 'Text', doc => "Path to reference sequence in FASTA format" },
wig_list => { is => 'Text', doc => "Tab-delimited list of WIG files [sample_name wig_file] (See Description)" },
output_dir => { is => 'Text', doc => "Directory where output files and subdirectories will be written", is_output => 1},
],
doc => "Count covered bases per-gene for each given wiggle track format file."
};
sub help_synopsis {
return <(1),
B(1),
B(1)
EOS
}
sub execute {
my $self = shift;
my $roi_file = $self->roi_file;
my $ref_seq = $self->reference_sequence;
my $wig_list = $self->wig_list;
my $output_dir = $self->output_dir;
# Check on all the input data before starting work
print STDERR "ROI file not found or is empty: $roi_file\n" unless( -s $roi_file );
print STDERR "Reference sequence file not found: $ref_seq\n" unless( -e $ref_seq );
print STDERR "List of WIGs not found or is empty: $wig_list\n" unless( -s $wig_list );
print STDERR "Output directory not found: $output_dir\n" unless( -e $output_dir );
return undef unless( -s $roi_file && -e $ref_seq && -s $wig_list && -e $output_dir );
# Outputs of this script will be written to these locations in the output directory
$output_dir =~ s/(\/)+$//; # Remove trailing forward slashes if any
my $roi_covg_dir = "$output_dir/roi_covgs"; # Stores output from calcRoiCovg per sample
my $gene_covg_dir = "$output_dir/gene_covgs"; # Stores per-gene coverages per sample
my $tot_covg_file = "$output_dir/total_covgs"; # Stores total coverages per sample
# If the reference sequence FASTA file hasn't been indexed, do it
my $ref_seq_idx = "$ref_seq.fai";
unless( -e $ref_seq_idx ) {
print "Reference fasta index not found. Creating one at: $ref_seq.fai\n";
system( "samtools faidx $ref_seq" ) or die "Failed to run samtools! $!\n";
}
# Create a temporary 0-based ROI BED-file that we can use with joinx, and also measure gene lengths
my %geneLen = ();
my $roi_bed = Genome::Sys->create_temp_file_path();
my $roiBedFh = IO::File->new( $roi_bed, ">" ) or die "Temporary ROI BED file could not be created. $!\n";
my $roiFh = IO::File->new( $roi_file ) or die "ROI file could not be opened. $!\n";
while( my $line = $roiFh->getline ) {
chomp( $line );
my ( $chr, $start, $stop, $gene ) = split( /\t/, $line );
--$start;
unless( $start >= 0 && $start < $stop ) {
print STDERR "Invalid ROI: $line\nPlease use 1-based loci and ensure that start <= stop\n";
return undef;
}
$geneLen{$gene} += ( $stop - $start );
$roiBedFh->print( "$chr\t$start\t$stop\t$gene\n" );
}
$roiFh->close;
$roiBedFh->close;
# Also create a merged BED file where overlapping ROIs are joined together into contiguous regions
# ::TODO:: Use joinx instead of mergeBed, because we'd rather add an in-house dependency
my $merged_roi_bed = Genome::Sys->create_temp_file_path();
system( "mergeBed -i $roi_bed | joinx sort -s - -o $merged_roi_bed" );# or die "Failed to run mergeBed or joinx!\n$roi_bed\n$merged_roi_bed\n $!\n";
# Create the output directories unless they already exist
mkdir $roi_covg_dir unless( -e $roi_covg_dir );
mkdir $gene_covg_dir unless( -e $gene_covg_dir );
# This is a file that will report the overall non-overlapping coverages per WIG
my $totCovgFh = IO::File->new( $tot_covg_file, ">" );
$totCovgFh->print( "#Sample\tCovered_Bases\tAT_Bases_Covered\tCG_Bases_Covered\tCpG_Bases_Covered\n" );
# Parse through each pair of WIG files provided and run calcRoiCovg as necessary
my $wigFh = IO::File->new( $wig_list );
while( my $line = $wigFh->getline ) {
next if( $line =~ m/^#/ );
chomp( $line );
my ( $sample, $wig_file ) = split( /\t/, $line );
$wig_file = '' unless( defined $wig_file );
print STDERR "Wiggle track format file for $sample not found: \"$wig_file\"\n" unless( -e $wig_file );
next unless( -e $wig_file );
# Use joinx to parse the WIG file and return per-ROI coverages of AT, CG (non-CpG), and CpG
system( "joinx wig2bed -Zc $wig_file | joinx sort -s | joinx intersect -F \"I A3\" $roi_bed - | joinx ref-stats - $ref_seq | cut -f 1-7 > $roi_covg_dir/$sample.covg" );# or die "Failed to run joinx to calculate per-gene coverages in $sample! $!\n";
# Read the joinx formatted coverage file and count covered bases per gene
my %geneCovg = ();
my $roiCovgFh = IO::File->new( "$roi_covg_dir/$sample.covg" );
while( my $line = $roiCovgFh->getline ) {
chomp( $line );
if( $line !~ m/^#/ ) {
my ( undef, undef, undef, $gene, $at_covd, $cg_covd, $cpg_covd ) = split( /\t/, $line );
$geneCovg{$gene}{covd} += ( $at_covd + $cg_covd + $cpg_covd );
$geneCovg{$gene}{at} += $at_covd;
$geneCovg{$gene}{cg} += $cg_covd;
$geneCovg{$gene}{cpg} += $cpg_covd;
}
}
$roiCovgFh->close;
# Write the per-gene coverages to a file named after this sample_name
my $geneCovgFh = IO::File->new( "$gene_covg_dir/$sample.covg", ">" );
$geneCovgFh->print( "#Gene\tLength\tCovered\tAT_covd\tCG_covd\tCpG_covd\n" );
foreach my $gene ( sort keys %geneLen ) {
if( defined $geneCovg{$gene} ) {
$geneCovgFh->print( join( "\t", $gene, $geneLen{$gene}, $geneCovg{$gene}{covd},
$geneCovg{$gene}{at}, $geneCovg{$gene}{cg}, $geneCovg{$gene}{cpg} ), "\n" );
}
else {
$geneCovgFh->print( "$gene\t" . $geneLen{$gene} . "\t0\t0\t0\t0\n" );
}
}
$geneCovgFh->close;
# Measure coverage stats on the merged ROI file, so that bps across the genome are not counted twice
my $merged_roi_bed_covg = Genome::Sys->create_temp_file_path();
system( "joinx wig2bed -Zc $wig_file | joinx sort -s | joinx intersect $merged_roi_bed - | joinx ref-stats - $ref_seq | cut -f 1-6 > $merged_roi_bed_covg" );# or die "Failed to run joinx to calculate overall coverages in $sample! $!\n";
# Read the joinx formatted coverage file and sum up the coverage stats per region
my ( $tot_covd, $tot_at_covd, $tot_cg_covg, $tot_cpg_covd );
my $totRoiCovgFh = IO::File->new( $merged_roi_bed_covg );
while( my $line = $totRoiCovgFh->getline ) {
chomp( $line );
if( $line !~ m/^#/ ) {
my ( $chr, $start, $stop, $at_covd, $cg_covd, $cpg_covd ) = split( /\t/, $line );
$tot_covd += ( $at_covd + $cg_covd + $cpg_covd );
$tot_at_covd += $at_covd;
$tot_cg_covg += $cg_covd;
$tot_cpg_covd += $cpg_covd;
}
}
$totRoiCovgFh->close;
$totCovgFh->print( "$sample\t$tot_covd\t$tot_at_covd\t$tot_cg_covg\t$tot_cpg_covd\n" );
}
$wigFh->close;
$totCovgFh->close;
return 1;
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/PathScan 000755 000765 000024 0 12013522176 24744 5 ustar 00nnutter staff 000000 000000 Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/PathScan/CombinePvals.pm 000444 000765 000024 107435 12013522176 30073 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::PathScan::CombinePvals;
#__STANDARD PERL PACKAGES
use strict;
use Carp;
use Statistics::Distributions;
#__CONSTANT OF PI -- NEEDED IN RAMANUJAN APPROX FOR POISSON PROBABILITY MASSES
# (SEE "PATHSCAN TEST" NOTES PP 29-31)
# use constant PI => 4*atan2 1, 1;
# use constant LOG_PI_OVER_2 => log (PI) / 2;
################################################################################
## ##
## I N T R O D U C T O R Y P O D D O C U M E N T A T I O N ##
## ##
################################################################################
=head1 NAME
CombinePvals - combining probabilities from independent tests of
significance into a single aggregate figure
=head1 SYNOPSIS
use CombinePvals;
my $obj = CombinePvals->new ($reference_to_list_of_pvals);
my $pval = $obj->method_name;
my $pval = $obj->method_name (@arguments);
=head1 DESCRIPTION
There are a variety of circumstances under which one might have a number
of different kinds of tests and/or separate instances of the same kind
of test for one particular null hypothesis, where each of these tests returns a
p-value.
The problem is how to properly condense this list of
probabilities into a single value so as to be able to make
a statistical inference, e.g. whether to reject the null
hypothesis.
This problem was examined heavily starting about the 1930s, during
which time numerous mathematical contintencies were treated, e.g. dependence
vs. independence of tests, optimality, inter-test weighting, computational
efficiency, continuous vs. discrete tests and combinations thereof,
etc.
There is quite a large mathematical literature on
this topic (see L"REFERENCES"> below) and any one
particular situation might incur some of the above
subtleties.
This package concentrates on some of the more straightforward
scenarios, furnishing various methods for combining
p-vals.
The main consideration will usually be the trade-off between
the exactness of the p-value (according to strict frequentist
modeling) and the computational efficiency, or even its actual
feasibility.
Tests should be chosen with this factor in
mind.
Note also that this scenario of combining p-values (many
tests of a single hypothesis) is fundamentally different
from that where a given hypothesis is tested multiple
times.
The latter instance usually calls for some method of multiple testing
correction.
=head1 REFERENCES
Here is an abbreviated list of the substantive works on the topic of combining
probabilities.
=over
=item *
Birnbaum, A. (1954)
I,
Journal of the American Statistical Association B<49>(267), 559-574.
=item *
David, F. N. and Johnson, N. L. (1950)
I,
Biometrika B<37>(1/2), 42-49.
=item *
Fisher, R. A. (1958)
I, 13-th Ed. Revised,
Hafner Publishing Co., New York.
=item *
Lancaster, H. O. (1949)
I,
Biometrika B<36>(3/4), 370-382.
=item *
Littell, R. C. and Folks, J. L. (1971)
I,
Journal of the American Statistical Association B<66>(336), 802-806.
=item *
Pearson, E. S. (1938)
I,
Biometrika B<30>(12), 134-148.
=item *
Pearson, E. S. (1950)
I,
Biometrika B<37>(3/4), 383-398.
=item *
Pearson, K. (1933)
I
Biometrika B<25>(3/4), 379-410.
=item *
Van Valen, L. (1964)
I,
Nature B<201>(4919), 642.
=item *
Wallis, W. A. (1942)
I,
Econometrica B<10>(3/4), 229-248.
=item *
Zelen, M. and Joel, L. S. (1959)
I,
Annals of Mathematical Statistics B<30>(4), 885-895.
=back
=head1 AUTHOR
Michael C. Wendl
S
Copyright (C) 2009 Washington University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
=head1 GENERAL REMARKS ON METHODS
The available methods are listed
below.
Each of computational techniques assumes that tests, as
well as their associated p-values, are independent of one
another and none considers any form of differential
weighting.
=cut
################################################################################
## ##
## P R O G R A M M E R N O T E S ##
## ##
################################################################################
#
# The obj schematic resembles:
#
# $obj = {
#
# #__PROBABILITY VALUES FOR THE INDIVIDUAL TESTS
# pvals => [0.103, 0.078, 0.03, 0.2,...],
#
# #__PRODUCT OF THE INDIVIDUAL PROBABILITY VALUES
# big_q => 0.103 * 0.078 * 0.03 * 0.2 * ...,
#
# #__THE ACTUAL NUMBER OF TESTS (SAVED FOR CONVENIENCE)
# num_tests = integer,
#
# #__BINOMIAL COEFFICIENTS
#
# this will only be defined when passing multiple lists of genes, i.e.
# for the approximate "binning" solution - we only define the symmetric
# half of pascal's triangle
#
# binom_coeffs => [[1], [1], [1,2], [1,3], [1,4,6], [1,5,10], ....],
#
# };
################################################################################
## ##
## P U B L I C M E T H O D S ##
## ##
################################################################################
=head1 CONSTRUCTOR METHODS
These methods return an object in the CombinePvals
class.
=cut
################################
# BEGIN: CONSTRUCTOR METHODS #
################################
# ===
# NEW create a new object
# === ~~~~~~~~~~~~~~~~~~~
=head2 new
This is the usual object constructor, which takes a mandatory,
but otherwise un-ordered (reference to a) list of the p-values
obtained by a set of independent
tests.
my $obj = CombinePvals->new ([0.103, 0.078, 0.03, 0.2,...]);
The method checks to make sure that all elements are actual
p-values, i.e. they are real numbers and they have values bounded by 0 and
1.
=cut
sub new {
my $class = shift;
my ($pvals) = @_;
#__OBJECT TEMPLATE
my $self = {};
#__PROCESS PVALS IF THEY'RE SPECIFIED
if (defined $pvals && $pvals) {
#__MAKE SURE THIS IS A LIST
croak "argument must be list reference" unless ref $pvals eq "ARRAY";
#__SAVE LIST
$self->{'pvals'} = $pvals;
#__PROCESS THE INPUT
my ($big_q, $num_tests) = (1, 0);
foreach my $pval (@{$pvals}) {
#__MAKE SURE THIS IS A PVAL
croak "'$pval' is not a p-val" unless &is_a_pval ($pval);
#__COUNT NUMBER OF TESTS
$num_tests++;
#__SAVE THE PRODUCT --- THIS IS THE DERIVED TEST STATISTIC
$big_q *= $pval;
}
$self->{'big_q'} = $big_q;
$self->{'num_tests'} = $num_tests;
#__OTHERWISE CROAK
} else {
croak "must specify a list of pvals as an argument";
}
#__BLESS INTO CLASS AND RETURN OBJECT
bless $self, $class;
return $self;
}
##############################
# END: CONSTRUCTOR METHODS #
##############################
=head1 EXACT ENUMERATIVE PROCEDURES FOR STRICTLY DISCRETE DISTRIBUTIONS
When all the individual p-vals are derived from tests based on discrete
distributions, the "standard" continuum methods cannot be used in the strictest
sense.
Both Wallis (1942) and Lancaster (1949) discuss the option of
full enumeration, which will only be feasible when there are a
limited number of p-values and their range is not too
large.
Feasibility experiments are suggested, depending
upon the type of hardware and size of
calculation.
=cut
# Again, these methods do some rudimentary checking, but the calling
# program is responsible for making sure all elements are actual
# p-values, i.e. real numbers, have values bounded by 0 and 1,
# etc.
# They are also responsible for making sure all p-values are
# listed in decreasing order of extremity, as illustrated
# below.
####################################################################
# BEGIN: EXACT ENUMERATIVE PROCEDURES FOR DISCRETE DISTRIBUTIONS #
####################################################################
# ====================
# EXACT ENUM ARBITRARY
# ====================
#
# exact enumerative solution for a set of p-values obtained from an
# arbitrary set of not-necessarily-the-same *discrete* distributions
=head2 exact_enum_arbitrary
This routine is designed for combining p-values
from completely arbitrary discrete probability
distributions.
It takes a list-of-lists data structure, each list being the probability
tails I (i.e. as a probability
cummulative density function) associated with each individual
test.
However, the ordering of the lists themselves is not
important.
For instance, Wallis (1942) gives the example of two binomials, a one-tailed
test having tail values of 0.0625, 0.3125, 0.6875, 0.9375, and 1, and a
two-tailed test having tail values 0.125, 0.625, and
1.
We would then call this method using
my $pval = $obj->exact_enum_arbitrary (
[0.0625, 0.3125, 0.6875, 0.9375, 1],
[0.125, 0.625, 1]
);
The internal computational method is relatively
straightforard and described in detail by
Wallis (1942).
Note that this method does "all-by-all" multiplication,
so it is the least efficient, although entirely
exact.
=cut
sub exact_enum_arbitrary {
my $obj = shift;
my (@pvals_lists) = @_;
my $pval = 0;
#__NUMBER OF LISTS SHOULD BE SAME AS NUMBER OF PVALS PASSED TO CONSTRUCTOR
my $num_lists = scalar @pvals_lists;
croak "number of lists passed ($num_lists) not equal to number of tests" .
"in 'new' constructor ($obj->{'num_tests'})"
unless $num_lists == $obj->{'num_tests'};
#__CHECK LIST INPUT
my $list_num = 0;
foreach my $list (@pvals_lists) {
$list_num++;
my $previous_pval = 0;
foreach my $test_pval (@{$list}) {
#__MAKE SURE THIS IS A PVAL
croak "'$test_pval' in distribution $list_num is not a p-val"
unless &is_a_pval ($test_pval);
#__MAKE SURE THIS PVAL IS LARGER THAN PREVIOUS ONE: I.E. THIS IS A C.D.F.
croak "distribution $list_num is not in ascending order (not a CDF)"
unless $test_pval > $previous_pval;
#__RESET
$previous_pval = $test_pval;
}
}
#__COMBINE INDIVIDUAL-TEST-P-VALS INTO A SINGLE P-VAL
$pval = $obj->_recursive_exact_enum_arbitrary ([@pvals_lists], 0, 1, 1);
#__RETURN P-VAL
return $pval;
}
# ====================
# EXACT ENUM IDENTICAL
# ====================
#
# exact enumerative solution for a set of p-values obtained from one,
# or rather, a set of identical *discrete* distributions
=head2 exact_enum_identical
This routine is designed for combining a set of p-values that
all come from a single probability
distribution.
NOT IMPLEMENTED YET
=cut
##################################################################
# END: EXACT ENUMERATIVE PROCEDURES FOR DISCRETE DISTRIBUTIONS #
##################################################################
=head1 TRANSFORMS FOR CONTINUOUS DISTRIBUTIONS
The mathematical literature furnishes several straightforward
options for combining p-vals if all of the distributions
underlying all of the individual tests are
continuous.
=cut
####################################################
# BEGIN: TRANSFORMS FOR CONTINUOUS DISTRIBUTIONS #
####################################################
# ===========================
# FISHER CHI-SQUARE TRANSFORM
# ===========================
#
# Fisher's solution using the chi-square transform, valid strictly for
# continuum distributions, but can be used approximately for discrete
# distributions. Accuracy increases with the support of the distributions.
=head2 fisher_chisq_transform
This routine implements R.A. Fisher's (1958, originally 1932) chi-square
transform method for combining p-vals from continuous
distributions, which is essentially a CPU-efficient approximation of
K. Pearson's log-based result (see e.g. Wallis (1942) pp
232).
Note that the underlying distributions are
not actually relevant, so no arguments are
passed.
my $pval = $obj->fisher_chisq_transform;
This is certainly the fastest and easiest method for combining p-vals,
but its accuracy for discrete distributions will not usually be very
good.
For such cases, an exact or a corrected method are better
choices.
=cut
sub fisher_chisq_transform {
my $obj = shift;
#__GO THROUGH LIST OF INDIVIDUAL-TEST-P-VALS ACCUMULATING FISHER'S LOG
# TRANSFORM TO CHI-SQUARE STATISTIC
# my $chisq = 0;
# foreach my $test_pval (@{$obj->{'pvals'}}) {
#### $chisq += - 2 * log ($test_pval);
# $chisq -= 2 * log ($test_pval);
# }
#__WALLIS (1942) MAKES THIS CLEVER SIMPLIFICATION
my $chisq = -2 * log ($obj->{'big_q'});
#__TRANSFORM: TWICE THE DEGREES OF FREEDOM
my $dof = 2 * $obj->{'num_tests'};
#__NOW GET P-VAL FROM A CHI-SQUARE TEST
my $pval = Statistics::Distributions::chisqrprob ($dof, $chisq);
#__RETURN P-VAL
return $pval;
}
##################################################
# END: TRANSFORMS FOR CONTINUOUS DISTRIBUTIONS #
##################################################
=head1 CORRECTION PROCEDURES FOR DISCRETE DISTRIBUTIONS: LANCASTER'S MODELS
Enumerative procedures quickly become infeasible if the
number of tests and/or the support of each test grow
large.
A number of procedures have been described for
correcting the methodologies designed for continuum
testing, mostly in the context of applying so-called continuity
corrections.
Essentially, these seek to "spread" dicrete data out into a pseudo-continuous
configuration as appropriate as possible, and then apply standard
transforms.
Accuracy varies and should be suitably established in each
case.
The methods in this section are due to H.O. Lancaster (1949), who discussed
two corrections based upon the idea of describing how a chi-square transformed
statistic varies between the points of a discrete
distribution.
Unfortunately, these methods require one to pass some extra information
to the routines, i.e. not only the CDF (the p-val of each test), but the
CDF value associated with the next-most-extreme
statistic.
These two pieces of information are the basis of
interpolating.
For example, if an underlying distribution has the possible tail values of
0.0625, 0.3125, 0.6875, 0.9375, 1 and the test itself has a value of
0.6875, then you would pass I 0.3125 I 0.6875 to the
routine.
I
While there generally will be some extra inconvenience in obtaining
this information, the accuracy is much improved over Fisher's
method.
=cut
# PROGRAMMING NOTE ON LANCASTER'S METHODS
#
# Each method differs substantively by only a few lines of code, so there
# are a lot of extra lines here that are required to offer the user 3
# individually-named methods. This should be fixed when time permits, for
# example, perhaps pass the name of the correction method as an argument too.
################################################################################
# BEGIN: CORRECTION PROCEDURES FOR DISCRETE DISTRIBUTIONS: LANCASTER'S MODELS #
################################################################################
# ==========================================================
# LANCASTER'S MEAN-CONTINUITY-CORRECTED CHI-SQUARE TRANSFORM
# ==========================================================
=head2 lancaster_mean_corrected_transform
This method is based on the mean value of the chi-squared transformed
statistic.
my $pval = $obj->lancaster_mean_corrected_transform (@cdf_pairs);
Its accuracy is good, but the method is not strictly defined if one
of the tests has either the most extreme or second-to-most-extreme
statistic.
=cut
sub lancaster_mean_corrected_transform {
my $obj = shift;
my (@fxm1_and_fx_pvals) = @_;
#__NUMBER OF CDF PAIRS SHOULD BE SAME AS NUMBER OF PVALS PASSED TO CONSTRUCTOR
my $num_lists = scalar @fxm1_and_fx_pvals;
croak "number of pairs passed ($num_lists) not equal to number of tests" .
"in 'new' constructor ($obj->{'num_tests'})"
unless $num_lists == $obj->{'num_tests'};
#__ACCUMULATE LANCASTER'S MEAN CHI-SQUARED STATISTIC
my ($chisq, $list_num) = (0, 0);
foreach my $fxm1_and_fx_pair (@fxm1_and_fx_pvals) {
$list_num++;
my ($fxm1, $fx) = @{$fxm1_and_fx_pair};
#__MAKE SURE BOTH ARE PVALS
croak "'$fxm1' in cdf pair $list_num is not a p-val"
unless &is_a_pval ($fxm1);
croak "'$fx' in cdf pair $list_num is not a p-val"
unless &is_a_pval ($fx);
#__MAKE SURE THEY'RE ORDERED AS EXPECTED
croak "cdf pair $list_num is not in ascending order"
unless $fx > $fxm1;
#__MEAN CORECTION
$chisq += 2 * (1 - ($fx * log($fx) - $fxm1 * log($fxm1))/($fx - $fxm1));
}
#__TRANSFORM: TWICE THE DEGREES OF FREEDOM
my $dof = 2 * $obj->{'num_tests'};
#__NOW GET P-VAL FROM A CHI-SQUARE TEST
my $pval = Statistics::Distributions::chisqrprob ($dof, $chisq);
#__RETURN P-VAL
return $pval;
}
# ============================================================
# LANCASTER'S MEDIAN-CONTINUITY-CORRECTED CHI-SQUARE TRANSFORM
# ============================================================
=head2 lancaster_median_corrected_transform
This method is based on the median value of the chi-squared transformed
statistic.
my $pval = $obj->lancaster_median_corrected_transform (@cdf_pairs);
Its accuracy may sometimes be not quite as good as when using the
average, but the method is strictly defined for I values of the
statistic.
=cut
sub lancaster_median_corrected_transform {
my $obj = shift;
my (@fxm1_and_fx_pvals) = @_;
#__NUMBER OF CDF PAIRS SHOULD BE SAME AS NUMBER OF PVALS PASSED TO CONSTRUCTOR
my $num_lists = scalar @fxm1_and_fx_pvals;
croak "number of pairs passed ($num_lists) not equal to number of tests" .
"in 'new' constructor ($obj->{'num_tests'})"
unless $num_lists == $obj->{'num_tests'};
#__ACCUMULATE LANCASTER'S MEAN CHI-SQUARED STATISTIC
my ($chisq, $list_num) = (0, 0);
foreach my $fxm1_and_fx_pair (@fxm1_and_fx_pvals) {
$list_num++;
my ($fxm1, $fx) = @{$fxm1_and_fx_pair};
#__MAKE SURE BOTH ARE PVALS
croak "'$fxm1' in cdf pair $list_num is not a p-val"
unless &is_a_pval ($fxm1);
croak "'$fx' in cdf pair $list_num is not a p-val"
unless &is_a_pval ($fx);
#__MAKE SURE THEY'RE ORDERED AS EXPECTED
croak "cdf pair $list_num is not in ascending order"
unless $fx > $fxm1;
#__MEDIAN CORRECTION
if ($fxm1) {
$chisq -= 2 * log (($fx + $fxm1)/2);
} else {
$chisq += 2 * (1 - log ($fx));
}
}
#__TRANSFORM: TWICE THE DEGREES OF FREEDOM
my $dof = 2 * $obj->{'num_tests'};
#__NOW GET P-VAL FROM A CHI-SQUARE TEST
my $pval = Statistics::Distributions::chisqrprob ($dof, $chisq);
#__RETURN P-VAL
return $pval;
}
# ===========================================================
# LANCASTER'S MIXED-CONTINUITY-CORRECTED CHI-SQUARE TRANSFORM
# ===========================================================
=head2 lancaster_mixed_corrected_transform
This method is a mixture of both the mean and median
methods.
Specifically, mean correction is used wherever it
is well-defined, otherwise median correction is
used.
my $pval = $obj->lancaster_mixed_corrected_transform (@cdf_pairs);
This will be a good way to handle certain
cases.
=cut
sub lancaster_mixed_corrected_transform {
my $obj = shift;
my (@fxm1_and_fx_pvals) = @_;
#__NUMBER OF CDF PAIRS SHOULD BE SAME AS NUMBER OF PVALS PASSED TO CONSTRUCTOR
my $num_lists = scalar @fxm1_and_fx_pvals;
croak "number of pairs passed ($num_lists) not equal to number of tests" .
"in 'new' constructor ($obj->{'num_tests'})"
unless $num_lists == $obj->{'num_tests'};
#__ACCUMULATE LANCASTER'S MEAN CHI-SQUARED STATISTIC
my ($chisq, $list_num) = (0, 0);
foreach my $fxm1_and_fx_pair (@fxm1_and_fx_pvals) {
$list_num++;
my ($fxm1, $fx) = @{$fxm1_and_fx_pair};
#__MAKE SURE BOTH ARE PVALS
croak "'$fxm1' in cdf pair $list_num is not a p-val"
unless &is_a_pval ($fxm1);
croak "'$fx' in cdf pair $list_num is not a p-val"
unless &is_a_pval ($fx);
#__MAKE SURE THEY'RE ORDERED AS EXPECTED
#
# NOTE: WE ALLOW FOR EQUIVALENCE OF ADJACENT VALUES (WITHIN FLOATING-POINT
# PRECISION) FOR THOSE CASES WHERE THE CDF IS LOCALLY EXTREMELY
# ASYMPTOTIC
croak "cdf pair $list_num is not in ascending order"
if $fx < $fxm1;
# unless $fx > $fxm1;
#__NO CORRECTION NEEDED IF VALS IDENTICAL WITHIN FLOATING-POINT PRECISION
if ($fx == $fxm1) {
$chisq -= 2 * log ($fx); # Fisher
#__ELSE APPLY LANCASTER'S CONTINUITY CORRECTION (MIXED DEPENDING UPON VALS)
} else {
#__USE LANCASTER'S MEAN IF POSSIBLE
if ($fxm1) {
$chisq += 2 * (1 - ($fx * log($fx) - $fxm1 * log($fxm1))/($fx - $fxm1));
#__OTHERWISE USE LANCASTER'S MEDIAN
} else {
$chisq += 2 * (1 - log ($fx));
}
}
}
#__TRANSFORM: TWICE THE DEGREES OF FREEDOM
my $dof = 2 * $obj->{'num_tests'};
#__NOW GET P-VAL FROM A CHI-SQUARE TEST
my $pval = Statistics::Distributions::chisqrprob ($dof, $chisq);
#__RETURN P-VAL
return $pval;
}
###############################################################################
# END: CORRECTION PROCEDURES FOR DISCRETE DISTRIBUTIONS: LANCASTER'S MODELS #
###############################################################################
################################################################################
## ##
## S E M I - P R I V A T E M E T H O D S ##
## ##
## methods that are not ordinarily called externally but can be if needed ##
## because they are cast according to the object-oriented interface ##
## ##
################################################################################
=head2 additional methods
The basic functionality of this package is encompassed in the methods described
above.
However, some lower-level functions can also sometimes be
useful.
=cut
# ======================
# EXACT_ENUM_ARBITRARY_2 2-distrib precursor of exact_enum_arbitrary
# ====================== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
=head3 exact_enum_arbitrary_2
Hard-wired precursor of I for 2
distributions.
Does no pre-checking, but may be useful for
comparing to the output of the general
program.
=cut
sub exact_enum_arbitrary_2 {
my $obj = shift;
my (@pvals_lists) = @_;
my $pval = 0;
#__NUMBER OF LISTS SHOULD BE SAME AS NUMBER OF PVALS PASSED TO CONSTRUCTOR
my $num_lists = scalar @pvals_lists;
croak "number of lists passed ($num_lists) not equal to number of tests" .
"in 'new' constructor ($obj->{'num_tests'})"
unless $num_lists == $obj->{'num_tests'};
croak "you must have exactly 2 tests" unless $obj->{'num_tests'} == 2;
#__TWO-LIST SPECIAL CASE
my $list1 = $pvals_lists[0];
my $list2 = $pvals_lists[1];
#__TRAVERSE LIST 1
for (my $i = 0; $i <= $#{$list1}; $i++) {
#__TAIL VALUE AND THE PROBABILITY OF THIS TAIL VALUE
my $ptail1 = $list1->[$i];
my $probability_of_ptail1;
if ($i > 0) {
$probability_of_ptail1 = $list1->[$i] - $list1->[$i-1];
} else {
$probability_of_ptail1 = $list1->[$i];
}
#__TRAVERSE LIST 2
for (my $j = 0; $j <= $#{$list2}; $j++) {
#__TAIL VALUE AND THE PROBABILITY OF THIS TAIL VALUE
my $ptail2 = $list2->[$j];
my $probability_of_ptail2;
if ($j > 0) {
$probability_of_ptail2 = $list2->[$j] - $list2->[$j-1];
} else {
$probability_of_ptail2 = $list2->[$j];
}
#__PRODUCT OF TAIL VALUES AND THE PROBABILITY OF THIS PRODUCT
my $product_tail_pval = $ptail1 * $ptail2;
my $probability_of_product_tail_pval = $probability_of_ptail1 * $probability_of_ptail2;
#__TALLY TO RESULTANT COMPOUND P-VAL IF SIGNIFICANT
$pval += $probability_of_product_tail_pval
if $product_tail_pval <= $obj->{'big_q'};
}
}
#__RETURN P-VAL
return $pval;
}
# ======================
# EXACT_ENUM_ARBITRARY_3 3-distrib precursor of exact_enum_arbitrary
# ====================== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
=head3 exact_enum_arbitrary_3
Hard-wired precursor of I for 3
distributions.
Does no pre-checking, but may be useful for
comparing to the output of the general
program.
=cut
sub exact_enum_arbitrary_3 {
my $obj = shift;
my (@pvals_lists) = @_;
my $pval = 0;
#__NUMBER OF LISTS SHOULD BE SAME AS NUMBER OF PVALS PASSED TO CONSTRUCTOR
my $num_lists = scalar @pvals_lists;
croak "number of lists passed ($num_lists) not equal to number of tests" .
"in 'new' constructor ($obj->{'num_tests'})"
unless $num_lists == $obj->{'num_tests'};
croak "you must have exactly 3 tests" unless $obj->{'num_tests'} == 3;
#__THREE-LIST SPECIAL CASE
my $list1 = $pvals_lists[0];
my $list2 = $pvals_lists[1];
my $list3 = $pvals_lists[2];
#__TRAVERSE LIST 1
for (my $i = 0; $i <= $#{$list1}; $i++) {
#__TAIL VALUE AND THE PROBABILITY OF THIS TAIL VALUE
my $ptail1 = $list1->[$i];
my $probability_of_ptail1;
if ($i > 0) {
$probability_of_ptail1 = $list1->[$i] - $list1->[$i-1];
} else {
$probability_of_ptail1 = $list1->[$i];
}
#__TRAVERSE LIST 2
for (my $j = 0; $j <= $#{$list2}; $j++) {
#__TAIL VALUE AND THE PROBABILITY OF THIS TAIL VALUE
my $ptail2 = $list2->[$j];
my $probability_of_ptail2;
if ($j > 0) {
$probability_of_ptail2 = $list2->[$j] - $list2->[$j-1];
} else {
$probability_of_ptail2 = $list2->[$j];
}
#__TRAVERSE LIST 3
for (my $k = 0; $k <= $#{$list3}; $k++) {
#__TAIL VALUE AND THE PROBABILITY OF THIS TAIL VALUE
my $ptail3 = $list3->[$k];
my $probability_of_ptail3;
if ($k > 0) {
$probability_of_ptail3 = $list3->[$k] - $list3->[$k-1];
} else {
$probability_of_ptail3 = $list3->[$k];
}
#__PRODUCT OF TAIL VALUES AND THE PROBABILITY OF THIS PRODUCT
my $product_tail_pval = $ptail1 * $ptail2 * $ptail3;
my $probability_of_product_tail_pval = $probability_of_ptail1 * $probability_of_ptail2 * $probability_of_ptail3;
#__TALLY TO RESULTANT COMPOUND P-VAL IF SIGNIFICANT
$pval += $probability_of_product_tail_pval
if $product_tail_pval <= $obj->{'big_q'};
}
}
}
#__RETURN P-VAL
return $pval;
}
# ============
# BINOM COEFFS calculate binomial coefficients
# ============ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
=head3 binom_coeffs
Calculates the binomial coefficients needed
in the binomial (convolution) approximate
solution.
$pmobj->binom_coeffs;
The internal data structure is essentially the
symmetric half of the appropriately-sized Pascal
triangle.
Considerable memory is saved by not storing the full
triangle.
=cut
### This is called automatically, if necessary, before the probability
### calculation, so there is not typically a need to call it
### manually.
sub binom_coeffs {
my $obj = shift;
croak "need to know most populous list first"
unless defined $obj->{'most_populous_list'};
carp "already have binomial coefficients" if defined $obj->{'binom_coeffs'};
#__SET-UP FOR COEFFICIENTS
my $bin_coeffs = [1];
$obj->{'binom_coeffs'} = [];
push (@{$obj->{'binom_coeffs'}}, $bin_coeffs);
#__CALCULATE COEFFICIENTS UP TO THAT REQUIRED BY THE MOST POPULOUS LIST
for (my $i = 1; $i <= $obj->{'most_populous_list'}; $i++) {
$bin_coeffs = &next_bin_coeff_row ($i, $bin_coeffs);
push (@{$obj->{'binom_coeffs'}}, $bin_coeffs);
}
}
################################################################################
## ##
## M E T H O D S M E A N T T O B E P R I V A T E ##
## ##
## methods that cannot be called in a contextually meaningful way from an ##
## external application using the object-oriented interface ##
## ##
################################################################################
# ==========================================================================
# ROUTINE FOR DETERMINING WHETHER A VARIABLE REPRESENTS A LEGITIMATE P-VALUE
# ==========================================================================
sub is_a_pval {
my ($val) = @_;
# print "VAL IS '$val'\n";
#__MUST BE A FLOAT (REGEXP: PERL COOKBOOK CHAP 2.1) & MUST BE BOUNDED BY 0 AND 1
if ($val =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/
&& $val >= 0 && $val <= 1) {
return 1;
##################
# if ($val =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/) {
# print "VAL '$val' IS REAL\n";
# if ($val >= 0) {
# print " VAL '$val' >= 0\n";
# if ($val <= 1) {
# print " VAL '$val' <= 1\n";
# return 1;
# } else {
# print " VAL '$val' NOT <= 1\n";
# # chop $val;
# # if ($val <= 1) {
# # print " VAL '$val' NOW <= 1\n";
# # } else {
# # print " VAL '$val' STILL NOT <= 1\n";
# # }
# return 0;
# }
# } else {
# print " VAL '$val' NOT >= 0\n";
# return 0;
# }
##################
#__ELSE IT IS NOT A PVAL
} else {
# print "VAL '$val' IS NOT REAL\n";
return 0;
}
}
# ================================================================
# RECURSIVE EXACT PVAL CALCULATION FOR ARITRARY TEST DISTRIBUTIONS
# ================================================================
sub _recursive_exact_enum_arbitrary {
my $obj = shift;
my ($list_of_pvals_lists, $prev_recursion_level,
$product_tail_pval, $probability_of_product_tail_pval) = @_;
#__THE CURRENT LIST
my $current_list = $prev_recursion_level;
my $list = $list_of_pvals_lists->[$current_list];
#__THE CURRENT LEVEL OF RECURSION
my $curr_recursion_level = $prev_recursion_level + 1;
my $local_pval = 0;
#__LOOP AT THE CURRENT RECURSION LEVEL
for (my $i = 0; $i <= $#{$list}; $i++) {
#__TAIL VALUE AND THE PROBABILITY OF THIS TAIL VALUE
my $ptail = $list->[$i];
my $probability_of_ptail;
if ($i > 0) {
$probability_of_ptail = $list->[$i] - $list->[$i-1];
} else {
$probability_of_ptail = $list->[$i];
}
#__RECURSE FURTHER IF NECESSARY
if ($curr_recursion_level < $obj->{'num_tests'}) {
#__RECURSE AND ACCUMULATE P-VAL CONTRIBUTIONS
$local_pval +=
$obj->_recursive_exact_enum_arbitrary (
$list_of_pvals_lists,
$curr_recursion_level,
$product_tail_pval * $ptail,
$probability_of_product_tail_pval * $probability_of_ptail
);
#__ELSE WE'RE "AT THE BOTTOM" SO TAKE THE NECESSARY PRODUCT
} else {
#__FINAL PRODUCTS
my $local_product_tail_pval = $product_tail_pval * $ptail;
my $local_probability_of_product_tail_pval =
$probability_of_product_tail_pval * $probability_of_ptail;
#__TALLY IF CONDITION IS SATISFIED
if ($local_product_tail_pval <= $obj->{'big_q'}) {
$local_pval += $local_probability_of_product_tail_pval;
#__ELSE SKIP REST OF THIS DISTRIBUTION CUZ SUCCEEDING VALS ARE ALL LARGER
} else {
last;
}
}
}
#__RETURN RESULT TO THE ANTECEDENT LEVEL
return $local_pval;
}
# ==================
# NEXT BIN COEFF ROW compute 1/2 row i of binomial coefficients given row i-1
# ================== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sub next_bin_coeff_row {
my ($i, $im1_row) = @_;
#__FIRST ELEMENT I,0 IN ALL ROWS DEFINED AS UNITY
my $next_row = [1];
#__IF I > 1 COMPUTE REST OF NEXT (I) ROW USING PASCAL TRIANGLE ON PREV (I-1) ROW
if ($i > 1) {
#__COMPUTE STOPPING POINT BASED ON SYMMETRY
my $i_mirror = $i / 2;
my $i_end = int $i_mirror;
#__FILL IN INTERMEDIATE ELEMENTS I,1 TO I,I_END USING PASCALS TRIANGLE
for (my $iposition = 1; $iposition <= $i_end; $iposition++) {
#__START WITH "LEFT SIDE" VAL OF PREVIOUS ROW
my $element = $im1_row->[$iposition-1];
#__COMPUTE NEXT ELEMENT USING THE PASCAL TRIANGLE METHOD
if ($iposition == $i_mirror) {
$element += $im1_row->[$iposition-1];
} else {
$element += $im1_row->[$iposition];
}
#__SAVE
push (@{$next_row}, $element);
}
}
#__RETURN LIST OF HALF SYMETRIC NEXT ROW OF BINOMIAL COEFFICIENTS
return $next_row;
}
# ========
# BINCOEFF return binomial coefficient using 1/2 symetric stored table
# ======== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# RETURNS C_{$i_top, $k_bottom}
sub bincoeff {
my $obj = shift;
my ($i_top, $k_bottom) = @_;
#__SYMMETRY CUTOFF IS HALF THE VALUE OF THE I'TH ROW
my $i_half = $i_top / 2;
#__IF I,K IS WITHIN THE STORED SYMMETRIC HALF OF TRIANGLE THEN SIMPLY RETURN VAL
if ($k_bottom <= $i_half) {
return $obj->{'binom_coeffs'}->[$i_top]->[$k_bottom];
#__ELSE COMPUTE SYMMETRIC REFLECTION OF NON-STORED COMPONENT AND RETURN THAT VAL
} else {
my $k_reflect = $i_top - $k_bottom;
return $obj->{'binom_coeffs'}->[$i_top]->[$k_reflect];
}
}
################################################################################
## ##
## T R A I L I N G P O D D O C U M E N T A T I O N ##
## ##
################################################################################
################################################################################
## ##
## - E N D - ##
## ##
################################################################################
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/PathScan/PathScan.pm 000444 000765 000024 131613 12013522176 27205 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::PathScan::PathScan;
# DEBUG
# print "USING LOCAL MCW VERSION OF REGULAR +/-\n";
# DEBUG
#__STANDARD PERL PACKAGES
use strict;
use Carp;
#__CONSTANT OF PI -- NEEDED IN RAMANUJAN APPROX FOR POISSON PROBABILITY MASSES
# (SEE "PATH-SCAN TEST" NOTES PP 29-31)
use constant PI => 4*atan2 1, 1;
use constant LOG_PI_OVER_2 => log (PI) / 2;
################################################################################
## ##
## I N T R O D U C T O R Y P O D D O C U M E N T A T I O N ##
## ##
################################################################################
=head1 NAME
PathScan - the Path-Scan significance test for mutations in
groups of putative cancer genes
=head1 SYNOPSIS
use PathScan;
my $pmobj = PathScan->new ($list_of_gene_lengths);
my $pval = $pmobj->path_scan ($actual_hits, $background_mutation_rate);
=head1 DESCRIPTION
This package calculates the so-called path-scan statistic
P-value for sets of putative cancer genes under the null
hypothesis that somatic mutations found in data are the result
of a random process characterized by the background mutation
rate.
This test is applied to, for example, a biologically-relevant group of genes,
say all the genes in a particular pathway, for which somatic mutation data are
available.
A low p-value would imply that the null hypothesis should be
rejected.
In other words, the result suggests that the mutation configuration
in this pathway is probably not the result of a strictly random
process.
=head2 Nature of the Path-Scan Test
This statistic considers individual genes in a "binary" fashion,
i.e. a gene is either mutated (has one or more mutations) or it is not
mutated.
I
This is the "path-scan" aspect of the
test.
Why is such information
discarded?
The somatic background mutation rate is typically
very small compared to the size of the average
gene.
Consequently, the expected number of mutations
in any given gene is very low, much less than one, in
fact.
Under the null hypothesis, most genes will have no
mutations.
Genes with one (or just a few) may be interesting, but when many
genes in a biologically-relevant group (say a pathway) have one (or
just a few) mutations, that could be a sign of some underlying I
process.
In other words, this test is useful in cases where many
genes in a group might each contribute a small component
(i.e. a small fitness advantage) in the context of the disease
process.
What this test is not concerned with (and will I detect)
is the case where a single, specific gene has a non-random
association and it reflects this fact via a large number of
mutations.
Other single-gene tests should presumably flag such
cases.
The path-scan test should, therefore, be thought
of as just one tool within a larger statistical
"toolbox".
=head2 Assumptions in the Test
The main assumption is that a single background
mutation rate applies to the set of genes of
interest.
That is, the rate does not vary among genes, among
chromosomes (if more than one hosts genes of interest),
etc.
=head1 BUGS AND OPPORTUNITIES FOR EXTENSION
Coefficients are recalculated for every individual test, but it
would be good for these to persist between tests, adding more as
necessary (i.e. if a subsequent test involves more genes than the current
one).
=head1 AUTHOR
Michael C. Wendl
S
Copyright (C) 2007, 2008 Washington University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
=head1 METHODS
The available methods are as follows.
=cut
################################################################################
## ##
## P R O G R A M M E R N O T E S ##
## ##
################################################################################
#
# The obj schematic resembles:
#
# $obj = {
#
# #__LIST OF GENE LENGTHS (RAW DATA)
#
# one list-within-a-list implies doing the exact solution, while 2 or more
# lists-within-a-list imply doing the approximate "binning" approach
#
# genes => [
# [3500, 4234, 5609, 4550, 10763, 9879,...],
# [33500, 44234, 35609, 34550, 110763, 49879,...],
# :
# ],
#
#
# #__CONTEXT: 0=EXACT OR >0 FOR APPROXIMATE
#
# this indicator is 0 in the exact context, but in the approximate context
# it gives the number of lists that have been passed
#
# context_approx => 1,
#
#
# #__EXPECTATION
#
# this is the expected number of mutated genes and is only used for the
# asymptotic (Poisson) model --- it will computed whenever the gene sizes
# are passed as a single list (i.e. as when we do the exact solution), but
# it is only used for the Poisson solution
#
# expected_genes => 0.097,
#
#
# #__TOTAL COUNT OF GENES (FOR CONVENIENCE)
#
# this is the total count of all genes, whether in a single list (exact)
# context, or multiple lists (approximate "binning" context)
#
# num_genes => 35,
#
#
# #__AVERAGE GENE LENGTHS AND SIZES OF LISTS
#
# these will only be defined when passing multiple lists of genes, i.e.
# for the approximate "binning" context - each element in 'avg_lengths' and
# 'list_sizes' give, respectively, the average length and the number of
# genes in the corresponding list of gene raw data above
#
# avg_lengths => [5445.454, 48984.39, ...],
# list_sizes => [21, 34, 32, 54, 19, ...],
#
#
# #__TOTAL LENGTH OF ALL GENES CONCATENATED AND SIZE OF MOST POPULOUS LIST
#
# this is the total length of all genes, whether in a single list (exact)
# context, or multiple lists (approximate "binning" context), and the number
# of elements in the most populous list, respectively -- if there is just
# one list, then the latter will be equivalent to the value in 'num_genes'
#
# total_length => 75983,
# most_populous_list = 35,
#
#
# #__LIST OF CORRESPONDING "MODIFIED" BERNOULLI "PROBABILITY OF FAILURE" VALUES
#
# in the exact-solution context, each element is the bernoulli probability
# for the corresponding gene length in the single list-within-a-list (above)
# but in the approximate-"binning"-solution-context, each value is the
# bernoulli probability for the average gene length in the corresponding
# list of genes, e.g. the zero-th value is for the zero-th list, the first
# value for the first list, etc. - the code will interpret the context
# automatically
#
# mpvals => [0.003596, 0.004243, 0.005625, 0.004560, 0.01082, 0.009928,...],
#
#
# #__BACKGROUND MUTATION RATE I.E. PROBABILITY OF A MUTATION IN A GIVEN NUCLEO
# mutation_prob => 0.000001,
#
#
# #__BINOMIAL COEFFICIENTS
#
# this will only be defined when passing multiple lists of genes, i.e.
# for the approximate "binning" solution - we only define the symmetric
# half of pascal's triangle
#
# binom_coeffs => [[1], [1], [1,2], [1,3], [1,4,6], [1,5,10], ....],
#
# };
################################################################################
## ##
## P U B L I C M E T H O D S ##
## ##
################################################################################
# ===
# NEW create a new path-scan object
# === ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
=head2 new
This is the usual object constructor, which
optionally takes all the gene-length data as
input.
If you want to use the exact probability solution or the
asymptotic approximate solution, pass all lengths in a single list
reference
my $pmobj = PathScan->new ([3434, 54565, 6445, ...]);
but if you want to use the convolution approximation method, divide the list of
gene sizes into the desired number of bins and pass each of these as a
reference
my $pmobj = PathScan->new ([3434, 54565], [6445, ...]);
In other words, the way you pass these arguments at partially determines
the context in which you will obtain your P-value for this set of
genes.
The latter choice is typically betetr, as it
gives good accuracy and good computational
efficiency.
Conversely, the exact solution is identically correct, but can be difficult to
compute.
The asymptotic approximation is always computationally
efficient, but not necessarily accurate for small test
sets.
=cut
sub new {
my $class = shift;
my (@list_of_gene_length_lists) = @_;
#__OBJECT TEMPLATE
my $self = {};
#__PROCESS GENE LENGTH DATA IF SPECIFIED
&store_genes ($self, @list_of_gene_length_lists)
if scalar @list_of_gene_length_lists;
#__BLESS INTO CLASS AND RETURN OBJECT
bless $self, $class;
return $self;
}
# ==========
# PATH SCAN path-scan (tail) probability value
# ========== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
=head2 path_scan
This function calculates the path-scan statistic in one of the
appropriate contexts (exact or convolution approximation, as described
above).
It takes the actual number of "hits" you've observed in
the data, i.e. the number of genes that have a mutated
status.
my $pval = $pmobj->path_scan (7);
If you have not yet done the pre-processing with respect to the
background mutation rate (see below), then pre-processing can be
executed implicitly by passing the rate as the second
argument.
my $pval = $pmobj->path_scan (7, 0.000001);
=cut
sub path_scan {
my $obj = shift;
my ($actual_hits, $mutation_prob) = @_;
#__SOME BASIC CHECKS
croak "argument list: '$actual_hits' must be an integer"
unless $actual_hits =~ /^-?\d+$/;
croak "argument list: '$actual_hits' cant be negative"
unless $actual_hits >= 0;
croak "argument list: '$actual_hits' cant exceed number of " .
"genes '$obj->{'num_genes'}'" if $actual_hits > $obj->{'num_genes'};
#__PREPROCESS IF NECESSARY
if ($mutation_prob) {
$obj->preprocess ($mutation_prob);
}
#__CALCULATE BINOMIAL COEFFICIENTS IF NECESSARY
if ($obj->{'context_approx'}) {
$obj->binom_coeffs unless defined $obj->{'binom_coeffs'};
}
#__CALCULATE P-VAL WITH THE MINIMUM OF EFFORT
my $half_of_num_genes = $obj->{'num_genes'} / 2;
my $path_scan_pval = 0;
#__COMPUTE ACTUAL TAIL IF NUM HITS IS IN THIS RANGE
if ($actual_hits > $half_of_num_genes) {
#__COMPUTE THE ACTUAL UPPER TAIL
for (my $k = $actual_hits; $k <= $obj->{'num_genes'}; $k++) {
my $pval;
if ($obj->{'context_approx'}) {
$pval = $obj->p_value_binomial_approx ($k);
} else {
$pval = $obj->p_value_exact ($k);
}
$path_scan_pval += $pval;
}
#__RETURN RESULT
return $path_scan_pval;
#__COMPUTE 1 - COMPLIMENTARY TAIL NUM HITS IS IN THIS RANGE
} else {
#__COMPUTE COMPLIMENTARY (LOWER) TAIL
for (my $k = 0; $k < $actual_hits; $k++) {
my $pval;
if ($obj->{'context_approx'}) {
$pval = $obj->p_value_binomial_approx ($k);
} else {
$pval = $obj->p_value_exact ($k);
}
$path_scan_pval += $pval;
}
#__RETURN 1 - THIS VAL AS THE RESULT
return 1 - $path_scan_pval;
}
}
# =============
# CDF_TRUNCATED truncated cummulative distribution function
# ============= ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# NOTE ON INDEX OF p_value_binomial_approx FUNCTION
#
# we have the identity for the probability masses of
#
# P(0) + P(1) + P(2) + ... + P(M) = 1
#
# where M is the total number of genes, so that the tailed p-values in the
# CDF list, ordered from most extreme to least extreme are
#
# P(K >= 0) = 1
# P(K >= 1) = 1 - P(0) = P(K >= 0) - P(0)
# P(K >= 2) = 1 - P(0) - P(1) = P(K >= 1) - P(1)
# P(K >= 3) = 1 - P(0) - P(1) - P(2) = P(K >= 2) - P(2)
# : : : : : : :
# P(K >= k) = = P(K >= k-1) - P(k-1)
#
# So, the index of p_value_binomial_approx is '$k-1' and each succeeding
# element is UNSHIFTED onto the list to get the desired ordering from most
# extreme to least extreme.
#
# Method "cdf_asymptot" has the same indexing for its probability calling
# function.
=head2 cdf_truncated
This function returns the cummulative distribution in the
context of the convolution approximation I such that
it contains only enough information to process the given number of
hits.
my $pvals_list = $pmobj->cdf_truncated ($hits);
The list is ordered from most extreme to least extreme probability
tail values, i.e. the last value in the list is always
unity.
However, tailed p-values more extreme than that associated with the argument
are not, in fact, calculated, but rather are replaced with the flag
-1.
This saves processing time and also reduces the chances of numerical
overflow for large pathways, as the full CDF must ultimately raise
an "mval" (>1) to a power equal to the number of genes in the
pathway.
The method assumes you have already done the pre-processing with respect to the
background mutation rate.
=cut
sub cdf_truncated {
my $obj = shift;
my ($hits) = @_;
#__MAKE SURE HITS IS AN INTEGER BETWEEN 0 AND THE NUMBER OF GENES
croak "hit number must be integer > 0" unless $hits =~ /^\d+$/ && $hits >= 0;
croak "hit number must be less than total number of genes"
unless $hits <= $obj->{'num_genes'};
#__INCREMENT BY ONE FOR ANY METHODS THAT MAY ALSO NEED THE NEXT MORE EXTREME
# TAILED P-VALUE, E.G. THE LANCASTER CORRECTION IN THE POPULATION CALCULATION,
# UNLESS WE'RE ALREADY AT THE TOTAL NUMBER OF GENES, E.G. FOR A 1-GENE PATHWAY
# LIKE HSA04112
$hits++ unless $hits == $obj->{'num_genes'};
#__MAKE SURE PRE-PROCESSING HAS ALREADY BEEN DONE
croak "cannot call 'cdf_truncated' without first pre-processing"
unless defined $obj->{'mutation_prob'};
#__CALCULATE BINOMIAL COEFFICIENTS IF NECESSARY
if ($obj->{'context_approx'}) {
$obj->binom_coeffs unless defined $obj->{'binom_coeffs'};
}
#__CALCULATE BINOMIAL COEFFICIENTS IF NECESSARY
# if ($obj->{'context_approx'}) {
# $obj->binom_coeffs unless defined $obj->{'binom_coeffs'};
# }
#__CDF INITIALIZED WITH UNITY --- MORE EXTREME VALS WILL BE UNSHIFTED IN FRONT
my $path_scan_pval = 1;
my $cdf = [$path_scan_pval];
#__COMPUTE CDF VALS PROGRESSIVELY MORE EXTREME PUSHING EACH TO FRONT OF LIST
for (my $k = 1; $k <= $obj->{'num_genes'}; $k++) {
#__COMPUTE ACTUAL TAILED P-VALUE IF WE'RE WITHIN TRUNCATION RANGE
if ($k <= $hits) {
$path_scan_pval -= $obj->p_value_binomial_approx ($k-1);
unshift @{$cdf}, $path_scan_pval;
#__OTHERWISE JUST INSERT A FLAG TO FUNCTION AS A PLACEHOLDER FOR OTHER
# METHODS THAT EXPECT THE *FORM* OF THE LIST TO BE A FULL CDF
} else {
unshift @{$cdf}, -1;
}
}
#__RETURN TRUNCATED CDF
return $cdf;
}
# ===
# CDF cummulative distribution function
# === ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
=head2 cdf
This function returns the cummulative distribution in one of the
appropriate contexts (exact or convolution approximation, as described
above).
There are no arguments,
my $pvals_list = $pmobj->cdf;
unless you have not yet done the pre-processing with respect to the
background mutation rate (see below), in which case the pre-processing can be
executed implicitly by passing the rate as the sole
argument.
my $pvals_list = $pmobj->cdf (0.000001);
The list is ordered from most extreme to least extreme probability
tail values, i.e. the last value in the list is always
unity.
=cut
sub cdf {
my $obj = shift;
my ($mutation_prob) = @_;
my $cdf = [];
#__PREPROCESS IF NECESSARY
if ($mutation_prob) {
$obj->preprocess ($mutation_prob);
}
#__CALCULATE BINOMIAL COEFFICIENTS IF NECESSARY
if ($obj->{'context_approx'}) {
$obj->binom_coeffs unless defined $obj->{'binom_coeffs'};
}
#__CALCULATE P-VAL WITH THE MINIMUM OF EFFORT
my $path_scan_pval = 0;
#__COMPUTE CDF STARTING WITH MOST THE EXTREME STATE WORKING TOWARD LEAST EXTREME
for (my $k = $obj->{'num_genes'}; $k >= 0; $k--) {
my $pval;
if ($obj->{'context_approx'}) {
$pval = $obj->p_value_binomial_approx ($k);
} else {
$pval = $obj->p_value_exact ($k);
}
$path_scan_pval += $pval;
push @{$cdf}, $path_scan_pval;
}
return $cdf;
}
# ============
# CDF_ASYMPTOT cummulative distribution function using asymptotic analysis
# ============ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
=head2 cdf_asymptot
This function returns the cummulative distribution based on asymptotic
analysis.
There are no arguments, i.e.
my $pvals_list = $pmobj->cdf_asymptot;
unless you have not yet done the pre-processing with respect to the
background mutation rate (see below), in which case the pre-processing can be
executed implicitly by passing the rate as the sole
argument.
my $pvals_list = $pmobj->cdf_asymptot (0.000001);
The list is ordered from most extreme to least extreme probability
tail values, i.e. the last value in the list is always
unity.
Note that asymptotic analysis gives a function (the Poisson) having infinite
support.
The infinite tail probability for all values past the most extreme
physical case are all bundled into that most extreme
p-value.
=cut
sub cdf_asymptot {
my $obj = shift;
my ($mutation_prob) = @_;
#__CDF INITIALIZED WITH UNITY --- MORE EXTREME VALS WILL BE UNSHIFTED IN FRONT
my $path_scan_pval = 1;
my $cdf = [$path_scan_pval];
#__PREPROCESS IF NECESSARY
if ($mutation_prob) {
$obj->preprocess ($mutation_prob);
}
#__COMPUTE CDF VALS PROGRESSIVELY MORE EXTREME PUSHING EACH TO FRONT OF LIST
# SEE PROGRAMMING NOTES OF CDF_TRUNCATED METHOD THAT EXPLAIN INDEXING OF THE
# PROBABILITY CALL
for (my $k = 1; $k <= $obj->{'num_genes'}; $k++) {
$path_scan_pval -= $obj->p_value_asymptot_approx ($k-1);
unshift @{$cdf}, $path_scan_pval;
}
return $cdf;
}
# ===================
# PATH SCAN ASYMPTOT asymptotic path-scan probability value (CDF)
# =================== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
=head2 path_scan_asymptot
This function calculates the path-scan statistic in the asymptotic (Poisson)
context.
It takes the actual number of "hits" you've observed in
the data, i.e. the number of genes that have a mutated
status.
my $pval = $pmobj->path_scan_asymptot (7);
If you have not yet done the pre-processing with respect to the
background mutation rate (see below), then pre-processing can be
executed implicitly by passing the rate as the second
argument.
my $pval = $pmobj->path_scan_asymptot (7, 0.000001);
You must set up the object, somewhat paradoxically,
I.
(This is a consequence of how data are stored internally within the
object.)
=cut
# The Poisson distribution has infinite support, so to be rigorous, we
# should sum probability masses to a very large number. Strictly speaking,
# we get the same (rigorous) result by simply computing the complement and
# subtracting that from one.
sub path_scan_asymptot {
my $obj = shift;
my ($actual_hits, $mutation_prob) = @_;
#__SOME BASIC CHECKS
croak "argument list: '$actual_hits' must be an integer"
unless $actual_hits =~ /^-?\d+$/;
croak "argument list: '$actual_hits' cant be negative"
unless $actual_hits >= 0;
croak "argument list: '$actual_hits' cant exceed number of " .
"genes '$obj->{'num_genes'}'" if $actual_hits > $obj->{'num_genes'};
#__PREPROCESS IF NECESSARY
if ($mutation_prob) {
$obj->preprocess ($mutation_prob);
}
#__POISSON HAS INFINITE SUPPORT SO COMPUTE COMPLIMENTARY (LOWER) TAIL AS PVAL
my $path_scan_pval = 0;
for (my $k = 0; $k < $actual_hits; $k++) {
$path_scan_pval += $obj->p_value_asymptot_approx ($k);
}
#__RETURN 1 - THIS VAL AS THE RESULT
return 1 - $path_scan_pval;
}
################################################################################
## ##
## S E M I - P R I V A T E M E T H O D S ##
## ##
## methods that are not ordinarily called externally but can be if needed ##
## because they are cast according to the object-oriented interface ##
## ##
################################################################################
=head2 additional methods
The basic functionality of this package is encompassed in the methods described
above.
However, some lower-level functions can also sometimes be
useful.
=cut
# =============
# P VALUE EXACT returns probability of exactly k genes mutated by chance
# ============= ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# number of required recursion levels is based on number of hits = k
=head3 p_value_exact
This function returns the exact value of the
probability I for a specific number of
hits.
$pval_exact = $pmobj->p_value_exact (7);
You must make sure to call this only if you've
configured the object in the exact context (see
above).
=cut
sub p_value_exact {
my $obj = shift;
my ($k) = @_;
#__SOME BASIC CHECKS
croak "argument list: '$k' must be an integer" unless $k =~ /^-?\d+$/;
croak "argument list: '$k' cant be negative" unless $k >= 0;
croak "argument list: '$k' cant exceed number of genes '$obj->{'num_genes'}'"
if $k > $obj->{'num_genes'};
#__COMPUTE THE LEADING TERM
my $leading_term = exp (- $obj->{'mutation_prob'} * $obj->{'total_length'});
#__THIS IS ALSO THE EXACT P-VALUE IN THE SPECIFIC CASE OF K=0
return $leading_term if $k == 0;
#__CALCULATE EXACT P-VALUE FOR K>0
my $total_subpart_of_pval = $obj->_recursive_exact_pval_calculation ($k, 0, 1, 1);
return $leading_term * $total_subpart_of_pval;
}
# =======================
# P VALUE BINOMIAL APPROX returns approx prob of k genes mutated by chance
# ======================= ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# NOTE: we have not implemented the approximate method for the case of just
# a single bin, although a simple solution exists (path-scan test notes pp 16)
# because the assumption is that, if the user just passes a single list, they
# want the exact solution. In other words, we assume that the approximate
# solution is only desired when there are at least 2 lists.
#
# number of required recursion levels is based on the number of lists
# that the data have been binned into
=head3 p_value_binomial_approx
This function returns the convolution approximated
value (i.e. using the binomial binning approximation)
of the probability I for a specific number of
hits.
$pval_exact = $pmobj->p_value_binomial_approx (7);
You must make sure to call this only if you've
configured the object in the approximate binomial context (see
above).
Also, you must explicitly calculate the
necessary binomial coefficients beforehand
(see C).
=cut
sub p_value_binomial_approx {
my $obj = shift;
my ($k) = @_;
#__SOME BASIC CHECKS
croak "argument list: '$k' must be an integer" unless $k =~ /^-?\d+$/;
croak "argument list: '$k' cant be negative" unless $k >= 0;
croak "argument list: '$k' cant exceed number of genes '$obj->{'num_genes'}'"
if $k > $obj->{'num_genes'};
croak "need to first calculate binomial coefficients"
unless defined $obj->{'binom_coeffs'};
#__COMPUTE THE LEADING TERM
my $leading_term = exp (- $obj->{'mutation_prob'} * $obj->{'total_length'});
#__THIS IS ALSO THE EXACT P-VALUE IN THE SPECIFIC CASE OF K=0
return $leading_term if $k == 0;
#__COMPUTE RECURSIVE PORTION OF THE SOLUTION
my $num_lists = $obj->{'context_approx'};
my $total_subpart_of_pval =
$obj->_recursive_binom_pval_calculation ($num_lists - 1, 0, $k);
# THIS CHUNK DOES NOT WORK IN GENERAL BECAUSE A CDF IS NOT GUARENTEED TO START
# FROM K=0
#
#
#__CALCULATE EXACT P-VALUE FOR K>0
# my $prob_mass = $leading_term * $obj->{'binomial_multiplier'} *
# $total_subpart_of_pval;
#__TAKE CARE OF THE POWER TERM FOR NEXT ITERATION (STEP-BY-STEP BUILD-UP)
# -1 MODIFIER RECONCILES MATH LIST NUMBERING WITH PERL LIST NUMBERING
# $obj->{'binomial_multiplier'} *= $obj->{'mpvals'}->[$num_lists-1];
#__RETURN P
# return $prob_mass;
#__RETURN (-1 MODIFIER RECONCILES MATH LIST NUMBERING WITH PERL LIST NUMBERING)
return $leading_term * $obj->{'mpvals'}->[$num_lists-1]**$k *
$total_subpart_of_pval;
}
# =======================
# P VALUE ASYMPTOT APPROX returns poisson approx prob of k genes mutated
# ======================= ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
=head3 p_value_asymptot_approx
This function returns the asymptotic approximated
value (i.e. using the Poisson limit approximation)
of the probability I for a specific number of
hits.
$pval_exact = $pmobj->p_value_asymptot_approx (7);
Somewhat paradoxically, you must make sure to call this only
if you've configured the object in the exact context (see
above).
=cut
sub p_value_asymptot_approx {
my $obj = shift;
my ($k) = @_;
#__SOME BASIC CHECKS
croak "argument list: '$k' must be an integer" unless $k =~ /^-?\d+$/;
croak "argument list: '$k' cant be negative" unless $k >= 0;
croak "argument list: '$k' cant exceed number of genes '$obj->{'num_genes'}'"
if $k > $obj->{'num_genes'};
#__SOME SHORTHAND
my $avg = $obj->{'expected_genes'};
#__ASYMPTOTIC PROBABILITY MASS
my $pmass;
if ($k) {
#__PROBABILITY MASS
my $ramanuj = log($k * (1 + 4*$k*(1 + 2*$k))) / 6;
my $arg_h0 = $k * log ($avg / $k) - $avg + $k;
$arg_h0 -= $ramanuj;
$arg_h0 -= LOG_PI_OVER_2;
$pmass = exp ($arg_h0);
} else {
$pmass = exp (-$avg);
}
#__RETURN PROBABILITY MASS
return $pmass;
}
# ===========
# STORE GENES run some basic consistency checks on raw data then store them
# =========== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# CONFIGURED FOR BOTH EXACT AND APPROXIMATE CONTEXTS
#
# this has to take the whole data struct (list-of-lists) because it can be
# called externally and we don't expect the user to have to do the proper
# looping
=head3 store_genes
Stores the raw gene length
data.
Use this if you did not pass these data to
C before you call any calculation
methods.
Works in the same way as C, described
above.
Specifically, the context is partially determined by whether you
pass a single list (exact context or asymptotic approximation)
$pmobj->store_genes ([3434, 54565, 6445, ...]);
or more than one list (convolution approximate context)
$pmobj->store_genes ([3434, 54565], [6445, ...]);
=cut
sub store_genes {
my $obj = shift;
my (@list_of_gene_length_lists) = @_;
#__INIT
my ($total_length, $most_populous_list) = (0, 0);
$obj->{'num_genes'} = 0;
#__DETERMINE WHETHER USER WANTS TO RUN IN EXACT CONTEXT OR APPROXIMATE CONTEXT
# IF APPROXIMATE THE RECORD THE NUMBER OF LISTS
$obj->{'context_approx'} = 0;
if (scalar @list_of_gene_length_lists > 1) {
$obj->{'context_approx'} = scalar @list_of_gene_length_lists;
}
#__PROCESS THE DATA
foreach my $gene_list (@list_of_gene_length_lists) {
#__MAKE SURE THIS IS A LIST
croak "argument must be list reference" unless ref $gene_list eq "ARRAY";
#__PROCESS EACH GENE IN THIS LIST
my $list_concat_length = 0;
foreach my $length (@{$gene_list}) {
#__MAKE SURE THIS LENGTH IS A POSITIVE INTEGER
croak "argument list: '$length' must be an integer"
unless $length =~ /^-?\d+$/;
croak "argument list: '$length' must be positive" unless $length > 0;
#__TALLY THIS LENGTH TO THE TOTAL LENGTH OF THIS LIST
$list_concat_length += $length;
}
#__SOME ADDITIONAL INFORMATION NEEDED IF IN APPROXIMATE CONTEXT
if ($obj->{'context_approx'}) {
#__RECORD THE AVERAGE LENGTH IN THIS LIST AND THE LIST SIZE
my $num_elements = scalar @{$gene_list};
push @{$obj->{'avg_lengths'}}, $list_concat_length/$num_elements;
push @{$obj->{'list_sizes'}}, $num_elements;
#__DISCERN WHICH SUB-LIST HAS THE MOST MEMBERS
$most_populous_list = $num_elements
if $num_elements > $most_populous_list;
}
#__TALLY THE NUMBER OF GENES IN THIS LIST TO THE GRAND TOTAL
$obj->{'num_genes'} += scalar @{$gene_list};
#__SAVE THIS GENE LIST TO THE OBJECT
push @{$obj->{'genes'}}, $gene_list;
#__TALLY TO OVERALL LENGTH OF ALL THE GENES
$total_length += $list_concat_length;
}
#__RECORD GRAND TOTAL LENGTH OF ALL GENES CONCATENATED TOGETHER
$obj->{'total_length'} = $total_length;
#__RECORD SIZE OF MOST POPULOUS LIST
$obj->{'most_populous_list'} = $most_populous_list;
#
# NOTES
#
# Here, we should probably now make sure to erase
# $obj->{'mutation_prob'} and $obj->{'mpvals'} = $mpvals
# if they exist, because their contents would no longer be valid (i.e. as
# based on an "old" set of gene lengths)
#
}
# ============
# BINOM COEFFS calculate binomial coefficients
# ============ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
=head3 binom_coeffs
Calculates the binomial coefficients needed
in the binomial (convolution) approximate
solution.
$pmobj->binom_coeffs;
The internal data structure is essentially the
symmetric half of the appropriately-sized Pascal
triangle.
Considerable memory is saved by not storing the full
triangle.
=cut
### This is called automatically, if necessary, before the probability
### calculation, so there is not typically a need to call it
### manually.
sub binom_coeffs {
my $obj = shift;
croak "need to know most populous list first"
unless defined $obj->{'most_populous_list'};
carp "already have binomial coefficients" if defined $obj->{'binom_coeffs'};
#__SET-UP FOR COEFFICIENTS
my $bin_coeffs = [1];
$obj->{'binom_coeffs'} = [];
push (@{$obj->{'binom_coeffs'}}, $bin_coeffs);
#__CALCULATE COEFFICIENTS UP TO THAT REQUIRED BY THE MOST POPULOUS LIST
for (my $i = 1; $i <= $obj->{'most_populous_list'}; $i++) {
$bin_coeffs = &next_bin_coeff_row ($i, $bin_coeffs);
push (@{$obj->{'binom_coeffs'}}, $bin_coeffs);
}
}
# ==========
# PREPROCESS calculate list of modified bernoulli probabilities for each gene
# ========== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# CONFIGURED FOR BOTH EXACT AND APPROXIMATE CONTEXTS
=head3 preprocess
Calculates the Bernoulli kernel probabilities for the individual genes
or gene bins
$pmobj_binom->preprocess ($background_mutation_rate);
The data structure can be re-configured to run the test with different
background mutation rates by just re-calling this routine with a different
value
$pmobj_binom->preprocess ($new_background_mutation_rate);
=cut
sub is_float {
my $val = shift;
if ($val =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/) {
return 1;
}
else {
return 0;
}
}
sub preprocess {
my $obj = shift;
my ($mutation_prob) = @_;
#__PRELIMINARY VALIDATION
croak "need background mutation rate" unless $mutation_prob;
croak "background mutation '$mutation_prob' rate must be a p-val"
unless (Genome::Model::Tools::Music::PathScan::PathScan::is_float($mutation_prob) &&
$mutation_prob > 0 && $mutation_prob < 1);
croak "preprocessing: no data" unless defined $obj->{'genes'};
#__JUST RETURN SILENTLY IF WE'RE ABOUT TO COMPUTE WHAT HAS ALREADY BEEN DONE
# carp "data already preprocessed" if defined $obj->{'mpvals'};
#
# NOTE: IF THE CALLER WANTS TO NOW ANALYZE A NEW BACKGROUND MUTATION RATE
# TEST FOR EQUIVALENCE TO THE OLD ONE USING "STRING" CONTEXT
if (defined $obj->{'mpvals'} && $obj->{'mutation_prob'} eq $mutation_prob) {
return;
}
#__CALCULATE THE MODIFIED BERNOULLI PROBABILITY VALUE FOR EACH GENE
my $mpvals = [];
#__CALCULATE IN THIS FASHION IF IN APPROXIMATE CONTEXT
if ($obj->{'context_approx'}) {
foreach my $length (@{$obj->{'avg_lengths'}}) {
my $pval = exp ($mutation_prob * $length) - 1;
if ($pval > 0) {
push @{$mpvals}, $pval;
} else {
croak "unexpected bernoulli pval '$pval' from average gene length '$length'";
}
}
#__ALSO CALCULATE EXPECTED NUMBER OF GENES BEING MUTATED IN CASE WE NEED TO
# REVERT TO ASYMPTOTIC ANALYSIS
my $expected_mutated_genes = 0;
foreach my $gene_list (@{$obj->{'genes'}}) {
foreach my $length (@{$gene_list}) {
$expected_mutated_genes += 1 - exp (- $mutation_prob * $length);
}
}
$obj->{'expected_genes'} = $expected_mutated_genes;
#__ELSE CALCULATE IN THIS FASHION IF IN EXACT CONTEXT OR IF USING THE
# ASYMPTOTIC (POISSON) APPROXIMATION
} else {
my $expected_mutated_genes = 0;
#__THERE IS ONLY 1 LIST OF GENES HERE (INDEX 0 IN THE DATA STRUCTURE)
foreach my $length (@{$obj->{'genes'}->[0]}) {
#__ACTUAL PVAL FOR THIS GENE BEING MUTATED TALLIED TOWARD EXPECTED VALUE
$expected_mutated_genes += 1 - exp (- $mutation_prob * $length);
#__ALGORITHMIC MODEL PVALUE (SEE NOTES)
my $pval = exp ($mutation_prob * $length) - 1;
#__STORE ALGORITHMIC MODELED PVALUE
if ($pval > 0) {
push @{$mpvals}, $pval;
} else {
croak "unexpected bernoulli pval '$pval' from gene length '$length'";
}
}
#__STORE EXPECTED NUMBER OF MUTATED GENES
$obj->{'expected_genes'} = $expected_mutated_genes;
}
#__STORE RESULT
$obj->{'mutation_prob'} = $mutation_prob;
$obj->{'mpvals'} = $mpvals;
}
################################################################################
## ##
## M E T H O D S M E A N T T O B E P R I V A T E ##
## ##
## methods that cannot be called in a contextually meaningful way from an ##
## external application using the object-oriented interface ##
## ##
################################################################################
# ================================
# RECURSIVE EXACT PVAL CALCULATION recursion for handling summation of terms
# ================================ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sub _recursive_exact_pval_calculation {
my $obj = shift;
my ($k, $prev_recursion_level, $start_index, $coeff_for_this_level) = @_;
#__THE CURRENT LEVEL OF RECURSION
my $curr_recursion_level = $prev_recursion_level + 1;
my $local_contribution = 0;
#__LOOP AT THE CURRENT RECURSION LEVEL USING APPROPRIATE START/STOP INDECES
for (my $i = $start_index;
$i <= $obj->{'num_genes'} - $k + $curr_recursion_level;
$i++) {
#__RECURSE FURTHER IF NECESSARY: NOT
if ($curr_recursion_level < $k) {
#__RECURSION: NOTE LAST ARG RESOLVES DIFFERENCE BETWEEN PHYSICAL GENE
# NUMBERING (STARTING AT 1) VS NUMBERING IN PERL LIST (STARTING AT 0)
$local_contribution +=
$obj->_recursive_exact_pval_calculation (
$k,
$curr_recursion_level,
$i+1, # see mcw notes
$coeff_for_this_level * $obj->{'mpvals'}->[$i-1] # resolve number
);
#__ELSE WE'RE "AT THE BOTTOM" SO TAKE THE NECESSARY PRODUCT: NOTE AGAIN
# WE RESOLVE DIFFERENCE BETWEEN PHYSICAL GENE NUMBERING (STARTING AT 1)
# VS NUMBERING IN PERL LIST (STARTING AT 0)
} else {
$local_contribution +=
$coeff_for_this_level * $obj->{'mpvals'}->[$i-1];
}
}
#__RETURN RESULT TO THE ANTECEDENT LEVEL
return $local_contribution;
}
# ================================
# RECURSIVE BINOM PVAL CALCULATION recursion for handling summation of terms
# ================================ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# HERE $obj->{'mpvals'}->[$i-1] IS THE BERNOULLI PROBABILITY FOR THE i-th BIN
# I.E. THE -1 MODIFIER RECONCILES MATH LIST NUMBERING WITH PERL LIST NUMBERING
#
# note that the summation structure in the solution means that binomial
# coefficients that are out-of-bounds can nevertheless be called upon, so
# we must explicitly test when this is the case and skip these (see code below)
sub _recursive_binom_pval_calculation {
my $obj = shift;
my ($recurs_limit, $prev_recursion_level, $k_stop_index) = @_;
#__THE CURRENT LEVEL OF RECURSION
my $curr_recursion_level = $prev_recursion_level + 1;
my $local_contribution = 0;
#__SUBSCRIPT
my $current_list_number = $recurs_limit + 1 - $curr_recursion_level;
#__EFFICIENCY -- WE DO NOT NEED TO COMPUTE THE POWER TERM WITHIN THE LOOP
# THE POWER VALUE CORRESPONDS TO THE LOOP ITERATOR, SO WE CAN INCREMENT
# THESE TOGETHER -- SEE SOLUTION NOTES
my $fraction_coeff = 1;
my $multiplier = $obj->{'mpvals'}->[$current_list_number-1] /
$obj->{'mpvals'}->[$current_list_number];
#__LOOP AT THE CURRENT RECURSION LEVEL USING APPROPRIATE START/STOP INDECES
for (my $k = 0;
$k <= $k_stop_index;
$k++) {
#__RECURSE FURTHER IF NECESSARY: NOT
if ($curr_recursion_level < $recurs_limit) {
#__RECURSION: CURRENT ITERATION IN THIS LOOP IS STOP INDEX FOR NEXT
if (
$obj->{'list_sizes'}->[$current_list_number] >= $k_stop_index - $k
) {
$local_contribution += $fraction_coeff
* &bincoeff ($obj, $obj->{'list_sizes'}->[$current_list_number], $k_stop_index - $k)
* $obj->_recursive_binom_pval_calculation (
$recurs_limit,
$curr_recursion_level,
$k
);
}
#__ELSE WE'RE "AT THE BOTTOM" SO TAKE THE NECESSARY PRODUCT: NOTE AGAIN
# WE RESOLVE DIFFERENCE BETWEEN PHYSICAL GENE NUMBERING (STARTING AT 1)
# VS NUMBERING IN PERL LIST (STARTING AT 0)
} else {
if (
$obj->{'list_sizes'}->[$current_list_number-1] >= $k &&
$obj->{'list_sizes'}->[$current_list_number] >= $k_stop_index - $k
) {
$local_contribution += $fraction_coeff
* &bincoeff ($obj, $obj->{'list_sizes'}->[$current_list_number-1], $k)
* &bincoeff ($obj, $obj->{'list_sizes'}->[$current_list_number], $k_stop_index - $k);
}
}
#__TAKE CARE OF THE POWER TERM FOR NEXT ITERATION (STEP-BY-STEP BUILD-UP)
$fraction_coeff *= $multiplier;
}
#__RETURN RESULT TO THE ANTECEDENT LEVEL
return $local_contribution;
}
# ==================
# NEXT BIN COEFF ROW compute 1/2 row i of binomial coefficients given row i-1
# ================== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sub next_bin_coeff_row {
my ($i, $im1_row) = @_;
#__FIRST ELEMENT I,0 IN ALL ROWS DEFINED AS UNITY
my $next_row = [1];
#__IF I > 1 COMPUTE REST OF NEXT (I) ROW USING PASCAL TRIANGLE ON PREV (I-1) ROW
if ($i > 1) {
#__COMPUTE STOPPING POINT BASED ON SYMMETRY
my $i_mirror = $i / 2;
my $i_end = int $i_mirror;
#__FILL IN INTERMEDIATE ELEMENTS I,1 TO I,I_END USING PASCALS TRIANGLE
for (my $iposition = 1; $iposition <= $i_end; $iposition++) {
#__START WITH "LEFT SIDE" VAL OF PREVIOUS ROW
my $element = $im1_row->[$iposition-1];
#__COMPUTE NEXT ELEMENT USING THE PASCAL TRIANGLE METHOD
if ($iposition == $i_mirror) {
$element += $im1_row->[$iposition-1];
} else {
$element += $im1_row->[$iposition];
}
#__SAVE
push (@{$next_row}, $element);
}
}
#__RETURN LIST OF HALF SYMETRIC NEXT ROW OF BINOMIAL COEFFICIENTS
return $next_row;
}
# ========
# BINCOEFF return binomial coefficient using 1/2 symetric stored table
# ======== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# RETURNS C_{$i_top, $k_bottom}
sub bincoeff {
my $obj = shift;
my ($i_top, $k_bottom) = @_;
#__SYMMETRY CUTOFF IS HALF THE VALUE OF THE I'TH ROW
my $i_half = $i_top / 2;
#__IF I,K IS WITHIN THE STORED SYMMETRIC HALF OF TRIANGLE THEN SIMPLY RETURN VAL
if ($k_bottom <= $i_half) {
return $obj->{'binom_coeffs'}->[$i_top]->[$k_bottom];
#__ELSE COMPUTE SYMMETRIC REFLECTION OF NON-STORED COMPONENT AND RETURN THAT VAL
} else {
my $k_reflect = $i_top - $k_bottom;
return $obj->{'binom_coeffs'}->[$i_top]->[$k_reflect];
}
}
################################################################################
## ##
## T R A I L I N G P O D D O C U M E N T A T I O N ##
## ##
################################################################################
=head1 EXAMPLES
The following examples may be helpful in using this
package.
In each case, assume we have first executed some required preliminary
code.
#__USE THE PACKAGE
use PathScan;
#__SOME DATA FOR AN "EXACT CONTEXT" CALCULATION
my $genes_exact = [
4000, 4000, 4000, 4000, 4000,
15000, 15000, 15000, 15000, 15000,
35000, 35000, 35000, 35000, 35000
];
#__SOME DATA FOR AN "APPROXIMATE CONTEXT" CALCULATION
my @genes_binned = (
[4000, 4000, 4000, 4000, 4000],
[15000, 15000, 15000, 15000, 15000],
[35000, 35000, 35000, 35000, 35000]
);
=head2 simple path-scan test
Here, we compare the values returned by both the exact
and approximate algorithms over the whole domain of
possible hits for a case where the answers should be
identical.
#__SET BACKGROUND MUTATION RATE
my $rho = 0.00002;
#__CONFIGURE OBJECTS IN "EXACT" AND "APPROXIMATE" CONTEXTS
my $pmobj_exact = PathScan->new ($genes_exact);
$pmobj_exact->preprocess ($rho);
my $pmobj_binom = PathScan->new (@genes_binned);
$pmobj_binom->preprocess ($rho);
#__CALCULATE AND TALLY THE MAXIMUM DIFFERENCE
my $maxdiff = 0;
for (my $i = 0; $i <= scalar @{$genes_exact}; $i++) {
my $pm_pval_exact = $pmobj_exact->path_scan($i);
my $pm_pval_binom = $pmobj_binom->path_scan($i);
my $diff = abs ($pm_pval_exact - $pm_pval_binom);
$maxdiff = $diff if $diff > $maxdiff;
print "$i hits: $pm_pval_exact $pm_pval_binom $diff\n";
}
print "MAXIMUM DIFFERENCE IS $maxdiff\n";
=head2 testing at different background rates
This example shows how to run the test for a fixed number of hits, say 7 in this
case, for various different background mutation rates.
#__CONFIGURE OBJECT
my $pmobj_binom = PathScan->new (@genes_binned);
#__CALCULATE
for (my $rho = 0.00001; $rho <= 0.0001; $rho += 0.00001) {
my $pm_pval_binom = $pmobj_binom->path_scan(7, $rho);
print "7 hits at background $rho : P = $pm_pval_binom\n";
}
Note that we did not run C explicitly,
but rather let the C method call it implicitly
for each new value of the background mutation
rate.
=head2 computing asymptotic approximate solution
The asymptotic (Poisson) approximate probabiltiy value is straightforward to
compute.
#__SET BACKGROUND MUTATION RATE
my $rho = 0.00002;
#__CONFIGURE OBJECT
my $pmobj_poisson = PathScan->new ($genes_exact);
$pmobj_poisson->preprocess ($rho);
#__P-VALUE FOR 7 OBSERVED MUTATED GENES
my $pm_pval_poisson = $pmobj_exact->path_scan_asymptot (7);
=head2 accessing individual probability masses
The probability masses for specific numbers of hits can also be
calculated.
#__SET BACKGROUND MUTATION RATE
my $rho = 0.00002;
#__CONFIGURE OBJECT
my $pmobj_exact = PathScan->new ($genes_exact);
$pmobj_exact->preprocess ($rho);
#__CALCULATE MASSES
my $total_prob = 0;
for (my $i = 0; $i <= scalar @{$genes_exact}; $i++) {
my $pval_exact = $pmobj_exact->p_value_exact ($i);
$total_prob += $pval_exact;
print "$i hits : probability mass = $pval_exact\n";
}
print "total probability = $total_prob\n";
=cut
################################################################################
## ##
## - E N D - ##
## ##
################################################################################
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/PathScan/PopulationPathScan.pm 000444 000765 000024 55250 12013522176 31242 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::PathScan::PopulationPathScan;
#__STANDARD PERL PACKAGES
use strict;
use warnings;
use Carp;
use Genome::Model::Tools::Music::PathScan::CombinePvals;
use Genome::Model::Tools::Music::PathScan::PathScan;
# DEBUG
# print "USING LOCAL MCW VERSION OF POPULATION PATHSCAN\n";
# DEBUG
# DEBUG -- PLEASE REMOVE
# use lib "/home/mwendl/work/perl_modules";
# use PostData;
################################################################################
## ##
## I N T R O D U C T O R Y P O D D O C U M E N T A T I O N ##
## ##
################################################################################
=head1 NAME
PopulationPathScan - apply PathScan test to populations rather
than just single individuals
=head1 SYNOPSIS
use PopulationPathScan;
my $obj = PopulationPathScan->new ($ref_to_list_of_gene_lengths);
$obj->assign ($number_of_compartments);
$obj->preprocess ($background_mutation_rate);
$pval = $obj->population_pval_approx ($ref_to_list_of_hits_per_sample);
$pval = $obj->population_pval_exact ($ref_to_list_of_hits_per_sample);
=head1 DESCRIPTION
The C package is implemented strictly
as a test of a set of genes, e.g. a pathway, for a I
individual.
Specifically, knowing the gene lengths in the pathway, the number
of genes that have at least one mutation, and the estimated background
mutation rate, one can test the null hypothesis that these observed
mutations are well-explained simply by the mechanism of random background
mutation.
However, it will often be the case that data for a pathway will be available
for many individuals, meaning that we now have many tests of the given (single)
hypothesis.
(This should not be confused with the scenario of multiple hypothesis
testing.)
The set of values contains much more information than a single value,
suggesting that significance must be judged on the basis of the collective
result.
For example, while no single p-value by itself may exceed the chosen statistical
threshold, the overall set of probabilities may still give the impression of
significance.
Properly combining such numbers is a necessary, but not entirely trivial
task.
This package basically serves as a high-level interface to first
perform individual tests using the methods of C,
and then to properly combine the resulting p-values using the methods of
C.
=head1 AUTHOR
Michael C. Wendl
S
Copyright (C) 2009 Washington University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
=head1 METHODS
The available methods are listed
below.
=cut
################################################################################
## ##
## P R O G R A M M E R N O T E S ##
## ##
################################################################################
#
# The obj schematic resembles:
#
# $obj = {
#
# #__GENE LENGTHS IN THE POPULATION PATHSCAN TEST
# gene_lengths => [474, 1038, 285, ...],
#
# #__THE ACTUAL NUMBER OF GENES IN TEST (SAVED FOR CONVENIENCE)
# num_genes = 15,
#
# #__ARGUMENT LIST FOR PATHSCAN COMPUTATION (PathScan) STRUCTURE
# IS DETERMINED BY THE WAY THE "assign" METHOD IS CALLED
# path_scan_arg_list = [],
# path_scan_arg_list = [ [], [], [] ],
#
# #__ASSIGN LEVEL (ESSENTIALLY THE ARGUMENT OF THE 'ASSIGN' METHOD)
# assign_level = 1,
#
# #__CUMMULATIVE DISTRIBUTION FOR THIS SET OF GENES ORDERED MOST EXTREME
# TO LEAST EXTREME --- COULD BE EITHER THE "COMPLETE" CDF, I.E. THE ENTIRE
# DISTRIBUTION
# cdf = [0.003, 0.0234, 0.1001, 0.23, 0.4, 0.8, 0.9, 0.94, 0.97, 0.99, 1],
#
# #__OR COULD BE A TRUNCATED LIST WITH JUST ENOUGH VALUES TO DO A CALCULATION
# I.E. WHERE THE MORE EXTREME TAILED PROBABILITY VALUES ARE OMITTED, BEING
# REPLACED BY A SIMPLE PLACEHOLDER FLAG -1
# cdf = [-1, -1, -1, -1, -1, -1, -1, 0.94, 0.97, 0.99, 1],
#
# #__MAXIMUM NUMBER OF MUTATED GENES TAKEN OVER ALL SAMPLES -- SEE PREPROCESS
# max_hits = 5,
# };
################################################################################
## ##
## P U B L I C M E T H O D S ##
## ##
################################################################################
# ===
# NEW create a new population path-scan object
# === ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
=head2 new
The object constructor takes a mandatory, but otherwise un-ordered reference
to a list of gene lengths comprising the biological group (e.g. a pathway)
whose mutation significance is to be analyzed using the PathScan
paradigm.
my $obj = PopulationPathScan->new ([474, 1038, 285, ...]);
The method checks to make sure that all elements
are legitimate lengths, i.e. integers exceeding
3.
=cut
sub new {
my $class = shift;
my ($gene_lengths) = @_;
#__OBJECT TEMPLATE
my $self = {};
#__PROCESS GENE LENGTHS IF THEY'RE SPECIFIED
if (defined $gene_lengths && $gene_lengths) {
#__MAKE SURE THIS IS A LIST
croak "argument must be list reference"
unless ref $gene_lengths eq "ARRAY";
#__SAVE LIST
$self->{'gene_lengths'} = $gene_lengths;
$self->{'num_genes'} = scalar @{$gene_lengths};
#__VALIDATE THE INPUT
foreach my $gene_length (@{$gene_lengths}) {
#__MAKE SURE THIS IS A LEGITIMATE LENGTH
croak "'$gene_length' is not a gene length" unless
$gene_length =~ /^\d+$/ && $gene_length >= 3;
}
#__OTHERWISE CROAK
} else {
croak "must specify a list of gene lengths as an argument";
}
#__BLESS INTO CLASS AND RETURN OBJECT
bless $self, $class;
return $self;
}
# ======
# ASSIGN assign the manner in which genes will be internally organized
# ====== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
=head2 assign
This method assigns the manner in which genes will be internally
organized for passing to the PathScan calculation
component.
The main consideration here is how the list may be compartmentalized for greater
computational efficiency, though at some loss of accuracy, for the PathScan
calculation.
If the gene list is long, exact calculation is generally
infeasible.
The method takes a single argument representing the number of compartments
(or sub-lists) the lengths will be divided into, e.g. 1 represents a
single list, i.e. exact computation, 2 indicates two lists, 3 three lists,
etc.
$obj->assign (3);
The values are then organized internally such that the smallest
genes are grouped together, then the slightly larger ones, and so
forth.
Generally, 3 or 4 lists give reasonable balance
between accuracy and computation (Wendl et al., in
progress).
=cut
# THIS HAS NOT BEEN IMPLEMENTED YET
#
# The method can also be called without an argument
#
# $obj->assign;
#
# in which case the gene lengths will put into a number of
# compartments such that each one has a maximum of 10
# values.
sub assign {
my $obj = shift;
my ($assign_level) = @_;
#__ORDER THE LIST OF VALIDATED GENE LENGTHS ACCORDING TO INCREASING SIZE
@{$obj->{'gene_lengths'}} = sort _numerical_ @{$obj->{'gene_lengths'}};
sub _numerical_ {$a <=> $b}
#__ASSIGN TO A SPECIFIC NUMBER OF COMPARTMENTS IF SPECIFIED
if (defined $assign_level && $assign_level) {
$obj->{'assign_level'} = $assign_level;
#__QUICK-PROCESSING IF THERE'S NO COMPARTMENTALIZATION
if ($assign_level == 1) {
$obj->{'path_scan_arg_list'} = $obj->{'gene_lengths'};
return;
}
#__REMAINDER AFTER DIVIDING GENE LIST INTO AN INTEGER-NUMBER OF COMPARTMENTS
my $remain = $obj->{'num_genes'} % $assign_level;
#__LENGTH OF ALL COMPARTMENTS (EXCEPT LAST ONE IF THERE'S A REMAINDER)
my $list_length = ($obj->{'num_genes'} - $remain) / $assign_level;
#__BUILD-UP THE COMPARTMENTALIZED ARGUMENT LIST
my ($list_number, $gene_number, $compartment) = (1, 1, []);
foreach my $gene_length (@{$obj->{'gene_lengths'}}) {
push @{$compartment}, $gene_length;
$gene_number++;
if ($gene_number > $list_length) {
$list_number++;
push @{$obj->{'path_scan_arg_list'}}, $compartment;
($gene_number, $compartment) = (1, []);
$list_length += $remain if $list_number == $assign_level;
}
}
#__ELSE ASSIGN SUCH THAT EACH COMPARTMENT HAS A MAXIMUM SIZE
} else {
croak "illegal assignment level";
}
}
# ==========
# PREPROCESS set-up PathScan calculation and compute CDF
# ========== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
=head2 preprocess
This method pre-processes the population-level calculation, specifically,
it sets up and executes the PathScan module to obtain the CDF associated
with the given gene set and background mutation
rate.
It takes the latter as an
argument.
$obj->preprocess (0.0000027);
Executing this method will take various amounts of CPU time,
depending upon the level of accuracy and the number of genes in the
calculation.
The method optionally takes the list of the number of mutated genes
in the group for each sample as a second argument, if this information is
known at this point
$obj->preprocess (0.0000027, [4, 5, 7, 3, 0, ...]);
and it is usually better to use this form because the
internals will compute only a truncated CDF that is just
sufficient to process this list, rather than computing the full
CDF.
Not only is speed improved, but this helps avoid overflow errors for large
pathways.
=cut
sub preprocess {
my $obj = shift;
my ($mutation_prob, $list_of_hits) = @_;
my $max_hits = 0;
#__PRELIMINARY VALIDATION OF ARGUMENT
croak "need background mutation rate" unless $mutation_prob;
croak "background mutation '$mutation_prob' rate must be a p-val"
unless is_a_pval ($mutation_prob);
#__INVOKE NEW PATHSCAN OBJECT USING PRE-COMPUTED ARGUMENT LIST FOR EXACT SOLN
my $pm_obj;
if ($obj->{'assign_level'} == 1) {
$pm_obj = Genome::Model::Tools::Music::PathScan::PathScan->new ($obj->{'path_scan_arg_list'});
#__OR FOR APPROXIMATE SOLUTION
} elsif ($obj->{'assign_level'} > 1) {
$pm_obj = Genome::Model::Tools::Music::PathScan::PathScan->new (@{$obj->{'path_scan_arg_list'}});
#__ALSO FIND THE MAX NUMBER OF MUTATED GENES AMONG ALL SAMPLES IF GIVEN HITS
if (defined $list_of_hits && $list_of_hits) {
#__MAKE SURE THIS IS A LIST
croak "argument must be list reference"
unless ref $list_of_hits eq "ARRAY";
#__HARD-SET MAX HITS TO 1 IN CASE 0 SAMPLES HAVE HITS & TRIGGERS TRUNC CDF
$max_hits = 1;
#__FIND MAXIMUM NUMBER OF HITS
foreach my $hits (@{$list_of_hits}) {
#__MAKE SURE THIS IS A LEGITIMATE HIT NUMBER
croak "'$hits' is not a hit number" unless
$hits =~ /^\d+$/ && $hits >= 0 && $hits <= $obj->{'num_genes'};
#__RECORD MAXIMUM
$max_hits = $hits if $hits > $max_hits;
}
#__SAVE MAX HITS TO OBJECT
$obj->{'max_hits'} = $max_hits;
}
#__ELSE WE CANT PROCESS
} else {
croak "I dont understand the 'assign' level you used previously";
}
#__STANDARD PREPROCESSING FOR PathScan OBJECT
$pm_obj->preprocess ($mutation_prob);
#__COMPUTE AND STORE CDF -- EITHER FULL OR TRUNCATED
if ($max_hits) {
$obj->{'cdf'} = $pm_obj->cdf_truncated ($max_hits);
} else {
$obj->{'cdf'} = $pm_obj->cdf;
}
# $obj->{'cdf'} = $pm_obj->cdf_asymptot;
#__WEIRD HEURISTIC: MAKE SURE LAST VALUE IN CDF LIST IS ALWAYS IDENTICALLY UNITY
#
# We have seen sometimes that the last value appears to be unity, i.e. it
# prints as such, but the 'is_a_pval' rejects it either on the real number
# regexp, or the <= 1 condition. Here is an actual croak:
#
# VAL '1' IS NOT REAL
# '1' in distribution 1 is not a p-val at Statistics/PopulationPathScan.pm line 395
#
# Please track down this problem when you have a chance, but this practical
# fix seems to work acceptably for the moment.
$obj->{'cdf'}->[$#{$obj->{'cdf'}}] = 1;
}
# =====================
# POPULATION_PVAL_EXACT tail prob for the population using exact enumeration
# ===================== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# PROGRAMMING NOTES:
#
# 1. remember that the cdf list returned from PathScan->cdf
# is ordered from most etreme (lowest p-value, highest number of hits) to
# least extreme (highest p-value = 1, lowest number of hits = 0). Therefore,
# the correct p-value corresponding to the actual number of hits cannot
# be naively looked-up in the list according to order, but must rather be
# looked up according to the *reverse order*. For example, for the usual
# binomial (0.5 + 0.5)^4 (see e.g. Wallis (1942) pp 244), we have
#
# cdf = [0.0625, 0.3125, 0.6875, 0.9375, 1]
# position = [0, 1, 2, 3, 4]
# meaning = [all 4 hit, at least 3 hit, at least 2, at least 1, at least 0]
#
# therefore, the actual "hit" pvalue is in position
#
# $obj->{'num_genes'} - $hits
#
# 2. CombinePvals does not yet have a method that exploits
# scenarios, such as this one, where each individual p-val comes from the
# _same_ distribution. Currently, we must call "exact_enum_arbitrary",
# which does a full enumeration. Change methods here if the CombinePvals
# class ever gets such a method.
=head2 population_pval_exact
This method performs the population-level calculation using exact
enumeration.
It takes the list of the number of mutated genes in the
group for each sample, e.g. each patient's whole genome
sequence, for example
patient 1: 4 genes in the pathway are mutated
patient 2: 5 genes in the pathway are mutated
patient 3: 7 genes in the pathway are mutated
patient 4: 3 genes in the pathway are mutated
patient 5: 0 genes in the pathway are mutated
: : : : : : : : :
which is invoked as
$pval = $obj->population_pval_exact ([4, 5, 7, 3, 0, ...]);
Most scenarios will not actually be able to make use of this method
because enumeration of all possible cases is rarely computationally
feasible.
This method will mostly be useful for examining small test
cases.
=cut
sub population_pval_exact {
my $obj = shift;
my ($list_of_hits) = @_;
#__PROCESS HITS IF THEY'RE SPECIFIED
if (defined $list_of_hits && $list_of_hits) {
#__MAKE SURE THIS IS A LIST
croak "argument must be list reference"
unless ref $list_of_hits eq "ARRAY";
#__WE NEED 2 LISTS FOR EXACT METHOD
my ($default_arg_list, $cdf_list) = ([], []);
#__VALIDATE AND PROCESS THE INPUT INTO ARGUMENT LISTS
foreach my $hits (@{$list_of_hits}) {
#__MAKE SURE THIS IS A LEGITIMATE HIT NUMBER
croak "'$hits' is not a hit number" unless
$hits =~ /^\d+$/ && $hits >= 0 && $hits <= $obj->{'num_genes'};
#__TAIL PVAL FOR THIS HIT NUMBER (SEE PROGRAMMING NOTE ABOVE)
my $pval_x = $obj->{'cdf'}->[$obj->{'num_genes'} - $hits];
#__STORE IN DEFAULT CombinePvals ARG LIST
push @{$default_arg_list}, $pval_x;
#__CDFS GO IN SPECIAL ARG LIST FOR EXACT ENUMERATION
push @{$cdf_list}, $obj->{'cdf'};
}
#__INVOKE NEW COMBINE_PVALS OBJECT USING PRE-COMPUTED ARGUMENT LIST
my $combine_obj = Genome::Model::Tools::Music::PathScan::CombinePvals->new ($default_arg_list);
#__COMPUTE OVERALL "GROUP" P-VALUE BASED ON EXACT ENUMERATION
###### DEBUG
# print "from PopulationPathScan --- args for new\n";
# &PostData ($default_arg_list);
# print "from PopulationPathScan --- args for exact_enum_arbitrary\n";
# &PostData ($cdf_list);
###### DEBUG
my $pval = $combine_obj->exact_enum_arbitrary (@{$cdf_list});
return $pval;
#__OTHERWISE CROAK
} else {
croak "must specify a list of number of genes mutated for the sample set";
}
}
# ======================
# POPULATION_PVAL_APPROX tail prob for the population using Lancaster approx
# ====================== ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# PROGRAMMING NOTE:
#
# remember that the cdf list returned from PathScan->cdf
# is ordered from most etreme (lowest p-value, highest number of hits) to
# least extreme (highest p-value = 1, lowest number of hits = 0). Therefore,
# the correct p-value corresponding to the actual number of hits cannot
# be naively looked-up in the list according to order, but must rather be
# looked up according to the *reverse order*. For example, for the usual
# binomial (0.5 + 0.5)^4 (see e.g. Wallis (1942) pp 244), we have
#
# cdf = [0.0625, 0.3125, 0.6875, 0.9375, 1]
# position = [0, 1, 2, 3, 4]
# meaning = [all 4 hit, at least 3 hit, at least 2, at least 1, at least 0]
#
# therefore, the actual "hit" pvalue is in position
#
# $obj->{'num_genes'} - $hits
#
# and the next-most-extreme (lower p-value) is in position
#
# $obj->{'num_genes'} - $hits - 1
#
# for using Lancaster's correction methods
=head2 population_pval_approx
This method performs the population-level
calculation using Lancaster's approximate transform
correction.
It takes, as a mandatory argument, the list of the number of mutated
genes in the group for each sample, e.g. each patient's whole genome
sequence.
$pval = $obj->population_pval_approx ([4, 5, 7, 3, 0, ...]);
You must pass the list of hits, even if you already
passed this list earlier to the pre-processing
method.
Most cases will use this method because exact combination
of individual probability values is rarely computationally
feasible.
Note that Lancaster's method typically gives much better (more
accurate) results than Fisher's "standard" chi-square
transform.
=over
=item *
Fisher, R. A. (1958)
I, 13-th Ed. Revised,
Hafner Publishing Co., New York.
=item *
Lancaster, H. O. (1949)
I,
Biometrika B<36>(3/4), 370-382.
=back
=cut
sub population_pval_approx {
my $obj = shift;
my ($list_of_hits) = @_;
#__PROCESS HITS IF THEY'RE SPECIFIED
if (defined $list_of_hits && $list_of_hits) {
#__MAKE SURE THIS IS A LIST
croak "argument must be list reference"
unless ref $list_of_hits eq "ARRAY";
#__WE NEED 2 LISTS FOR LANCASTER'S METHOD
my ($default_arg_list, $lancaster_list) = ([], []);
#__VALIDATE AND PROCESS THE INPUT INTO ARGUMENT LISTS
# DEBUG
#print "processing list of hits\n";
# DEBUG
foreach my $hits (@{$list_of_hits}) {
#__MAKE SURE THIS IS A LEGITIMATE HIT NUMBER
croak "'$hits' is not a hit number" unless
$hits =~ /^\d+$/ && $hits >= 0 && $hits <= $obj->{'num_genes'};
#__TAIL PVALS FOR THIS HIT NUMBER (SEE PROGRAMMING NOTE ABOVE)
my $pval_x = $obj->{'cdf'}->[$obj->{'num_genes'} - $hits];
# my $pval_x_m_1 = $obj->{'cdf'}->[$obj->{'num_genes'} - $hits - 1];
my $pval_x_m_1;
my $x_m_1_index = $obj->{'num_genes'} - $hits - 1;
if ($x_m_1_index >= 0) {
$pval_x_m_1 = $obj->{'cdf'}->[$x_m_1_index];
} else {
$pval_x_m_1 = 0; # dont allow this to inadvertently loop to list end
}
# DEBUG
# print " hit number = $hits\n";
# print " number of genes = $obj->{'num_genes'}\n";
# print " pval_x = $pval_x\n";
# print " pval_x_m_1 = $pval_x_m_1\n";
# DEBUG
#__STORE IN DEFAULT CombinePvals ARG LIST
# (THIS IS ACUTALLY JUST A FORMALITY IF USING LANCASTERS METHOD)
push @{$default_arg_list}, $pval_x;
#__STORE IN SPECIAL ARG LIST FOR LANCASTERS METHOD: P(X-1) THEN P(X)
push @{$lancaster_list}, [$pval_x_m_1, $pval_x];
}
#__INVOKE NEW COMBINE_PVALS OBJECT USING PRE-COMPUTED ARGUMENT LIST
my $combine_obj = Genome::Model::Tools::Music::PathScan::CombinePvals->new ($default_arg_list);
#__COMPUTE OVERALL "GROUP" P-VALUE BASED ON LANCASTERS TRANSFORM CORRECTION
my $pval = $combine_obj->lancaster_mixed_corrected_transform (@{$lancaster_list});
return $pval;
#__OTHERWISE CROAK
} else {
croak "must specify a list of number of genes mutated for the sample set";
}
}
################################################################################
## ##
## P R I V A T E M E T H O D S ##
## ##
################################################################################
# ==========================================================================
# ROUTINE FOR DETERMINING WHETHER A VARIABLE REPRESENTS A LEGITIMATE P-VALUE
# ==========================================================================
sub is_a_pval {
my ($val) = @_;
$DB::single = 1;
#__MUST BE A FLOAT (REGEXP: PERL COOKBOOK CHAP 2.1) & MUST BE BOUNDED BY 0 AND 1
if (Genome::Model::Tools::Music::PathScan::PathScan::is_float($val)
&& $val >= 0 && $val <= 1) {
return 1;
#__ELSE IT IS NOT A PVAL
} else {
return 0;
}
}
################################################################################
## ##
## T R A I L I N G P O D D O C U M E N T A T I O N ##
## ##
################################################################################
################################################################################
## ##
## - E N D - ##
## ##
################################################################################
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Plot 000755 000765 000024 0 12013522176 24161 5 ustar 00nnutter staff 000000 000000 Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Plot/MutationRelation.pm 000444 000765 000024 10003 12013522176 30164 0 ustar 00nnutter staff 000000 000000 package Genome::Model::Tools::Music::Plot::MutationRelation;
use warnings;
use strict;
use Genome;
use IO::File;
use List::Util qw( sum );
use Carp;
use POSIX qw( WIFEXITED );
our $VERSION = $Genome::Model::Tools::Music::VERSION;
class Genome::Model::Tools::Music::Plot::MutationRelation {
is => 'Command::V2',
has_input => [
input_matrix => { is => 'String', doc => "A gene/sample matrix generated by the mutation-relation tool" },
output_pdf => { is => 'String', doc => "An output pdf file to draw the plot to" },
],
has_optional_input => [
skip_zero_mut_samples => { is => 'Boolean', doc => "Don't plot samples that have no mutations on any gene being plotted", default => 1 },
genes_to_plot => { is => 'String', doc => "Comma-separated list of genes to plot (example: DNMT3A,NPM1)" },
plot_genes_in_order_listed => { is => 'Boolean', doc => "Plot the genes in the order that they're listed. Default is to plot them in descending order by number of mutations", default => 0 },
],
doc => "Makes plots for results from the mutation-relation tool",
};
sub help_detail {
return <input_matrix;
my $output_pdf = $self->output_pdf;
my $skip_zero_mut_samples = $self->skip_zero_mut_samples;
my $genes_to_plot = $self->genes_to_plot;
my $plot_genes_in_order_listed = $self->plot_genes_in_order_listed;
# Load the gene names from the matrix
my $headFh = IO::File->new( $input_matrix );
my @cols = split( /\t/, $headFh->getline );
# Create a hash of the gene names, which associates each with an R-friendly name
my %genes_in_matrix = map{chomp; my $t = $_; s/\W/_/g; ( $t, $_ )} splice( @cols, 1 );
$headFh->close;
# Discard any user-specified genes that are not in the input matrix
# being sure to preserve the order of the genes in the input
if( defined $genes_to_plot ) {
my %genes = map{($_,1)} split( /,/, $genes_to_plot );
foreach my $gene ( keys %genes ) {
unless( defined $genes_in_matrix{$gene} ) {
warn "Skipping gene $gene which is not seen in the provided input-matrix\n";
delete $genes{$gene};
}
}
#preserve the order of genes in the input
my @glist = split(",",$genes_to_plot);
my @finalList;
foreach my $g (@glist){
if(defined($genes{$g})){
push(@finalList, $g)
}
}
$genes_to_plot = join(",",@finalList);
}
# If user didn't specify genes-to-plot, then plot every gene in the input-matrix
else {
$genes_to_plot = join( ",", values %genes_in_matrix );
}
# Create a temporary mutation matrix with R-friendly gene names
my $tmp_matrix = Genome::Sys->create_temp_file_path;
( $tmp_matrix ) or die "Couldn't create a temp file. $!";
my $inFh = IO::File->new( $input_matrix );
my $outFh = IO::File->new( $tmp_matrix, ">" );
while( my $line = $inFh->getline ) {
chomp( $line );
my @cols = split( /\t/, $line );
if( $line =~ m/^Sample\t/ ) { # Handle the header line
$outFh->print( join( "\t", $cols[0], map{$genes_in_matrix{$_}}@cols[1..$#cols] ) . "\n" );
}
elsif( !$skip_zero_mut_samples || sum( @cols[1..$#cols] ) > 0 ) {
$outFh->print( join( "\t", @cols ) . "\n" );
}
}
$outFh->close;
$inFh->close;
# Call R to create a plot for the user-specified genes
my $plot_cmd = "R --slave --args < " . __FILE__ . ".R $tmp_matrix $genes_to_plot $output_pdf $plot_genes_in_order_listed";
WIFEXITED( system $plot_cmd ) or croak "Couldn't run: $plot_cmd ($?)";
return 1;
}
1;
Genome-Model-Tools-Music-0.04/lib/Genome/Model/Tools/Music/Plot/MutationRelation.pm.R 000444 000765 000024 7424 12013522176 30361 0 ustar 00nnutter staff 000000 000000 # Fetch command line arguments
args = commandArgs();
input_matrix = as.character(args[4]);
genes_to_plot = as.character(args[5]);
output_pdf = as.character(args[6]);
preserveGeneOrder = as.numeric(as.character(args[7]));
sort.data.frame <- function( x, by ) {
if(by[[1]] != "~")
stop("Argument 'by' must be a one-sided formula.")
## Make the formula into character and remove spaces
formc <- as.character(by[2])
formc <- gsub(" ", "", formc)
## If the first character is not + or -, add +
if(!is.element(substring(formc, 1, 1), c("+", "-")))
formc <- paste("+", formc, sep = "")
## Extract the variables from the formula
vars <- unlist(strsplit(formc, "[\\+\\-]"))
vars <- vars[vars != ""] # Remove any extra "" terms
## Build a list of arguments to pass to "order" function
calllist <- list()
pos <- 1 # Position of + or -
for(i in 1:length(vars)){
varsign <- substring(formc, pos, pos)
pos <- pos + 1 + nchar(vars[i])
if(is.factor(x[, vars[i]])){
if(varsign == "-") {
calllist[[i]] <- -rank(x[, vars[i]])
} else {
calllist[[i]] <- rank(x[, vars[i]])
}
} else {
if(varsign == "-") {
calllist[[i]] <- -x[, vars[i]]
} else {
calllist[[i]] <- x[,vars[i]]
}
}
}
return( x[do.call( "order", calllist ), ] )
}
plotMutRel <- function( infile, genes, outfile, preserveGeneOrder=FALSE) {
##------------------
a = read.table(infile,row.names=1,header=T)
gene_list = unlist(strsplit(genes, split=","))
print(genes)
df=a[,gene_list]
numSamp=length(df[,1])
numGenes=length(df)
if(numGenes < 1){
return("Error: genes to plot not found in matrix")
}
##adjustments to plot and text sizes for different numbers of samples
samptext=0.3
genetext=0.75
pdfwidth=numSamp/10
offset=2
if(numSamp < 50){
samptext=0.3
#genetext=0.85
pdfwidth=numSamp/5
offset=1.25
}
if(numSamp < 35){
samptext=0.4
}
if(pdfwidth < 3) { pdfwidth = 3 }
# if(numSamp >=100) { genetext=1.1 }
# if(numSamp >=200) { genetext=1.1 }
pdfheight=3+(0.25*numGenes-1)
##-------
#sort the data using the number of mutations in each, desc
sortdf = data.frame(g=gene_list[1],s=sum(df[,gene_list[1]]))
for(i in 2:length(gene_list)){
sortdf = rbind(sortdf, data.frame(g=gene_list[i],s=sum(df[,gene_list[i]])))
}
if(!(preserveGeneOrder)){
gene_list = as.vector(sort.data.frame(sortdf,~-s)$g)
}
print(preserveGeneOrder)
print(gene_list)
for(i in rev(gene_list)){
df = sort.data.frame(df,c("~",paste("-",i,sep="")))
}
#output pdf here
pdf(outfile,height=pdfheight,width=pdfwidth)
par(xpd=T)
plot(-100,-100, xlim=c(0,numSamp), ylim=c(-numGenes,0), axes=F, xlab="", ylab="")
hspace=0.10
vspace=0.05
##plot grey rects
for(i in -(1:numGenes)){
rect( (1:numSamp)+hspace,
rep(i,numSamp)+(1-vspace),
(1:numSamp)+(1-hspace),
rep(i,numSamp)+vspace,
col="grey80",
border=F );
}
##plot color rects
for(i in -(1:numGenes)){
pos=which(df[,gene_list[-i]]==1)
rect(pos+hspace,
rep(i,length(pos))+(1-vspace),
pos+(1-hspace),
rep(i,length(pos))+vspace,
col="darkgreen",
border=F )
}
##gene labels
for(i in -(1:numGenes)){
text(1,i+0.5,gene_list[-i],pos=2, cex=genetext)
}
##sample labels
for(i in 1:numSamp){
text(i+offset,-numGenes,row.names(df)[i],srt=90,pos=2,cex=samptext)
}
dev.off()
}
plotMutRel( input_matrix, genes_to_plot, output_pdf, preserveGeneOrder );
Genome-Model-Tools-Music-0.04/t 000755 000765 000024 0 12013522176 16306 5 ustar 00nnutter staff 000000 000000 Genome-Model-Tools-Music-0.04/t/Play.t 000444 000765 000024 2173 12013522176 17540 0 ustar 00nnutter staff 000000 000000 #!/usr/bin/env genome-perl
use strict;
use warnings;
use above "Genome";
use Test::More tests => 12;
BEGIN {
#It's important this be loaded first before the sub is redefined.
use_ok('Genome::Model::Tools::Music::Play');
};
#don't run all the commands while testing the glue that binds them--instead just record that we tried
no warnings qw(redefine);
sub Genome::Model::Tools::Music::Play::_run_command {
my $self = shift;
my $command = shift;
pass('Would have run ' . $command->command_name);
return 1;
}
use warnings qw(redefine);
#fake parameters--since commands not being run (per above)
my $play_cmd = Genome::Model::Tools::Music::Play->create(
bam_list => 'bam.list',
roi_file => 'roi.file',
reference_sequence => 'reference.sequence',
output_dir => Genome::Sys->create_temp_directory,
maf_file => 'maf.file',
genetic_data_type => 'gene',
pathway_file => 'pathway.file',
);
isa_ok($play_cmd, 'Genome::Model::Tools::Music::Play', 'created play comand');
my $rv = eval { $play_cmd->execute; };
if($@) {
diag('Error executing: ' . $@);
}
ok($rv, 'Play executed successfully.');