pax_global_header00006660000000000000000000000064134712422440014515gustar00rootroot0000000000000052 comment=fbba1bb0d0451285ac1b2839fd78336dda63c619 geneimpacts-0.3.7/000077500000000000000000000000001347124224400140235ustar00rootroot00000000000000geneimpacts-0.3.7/.gitignore000066400000000000000000000001161347124224400160110ustar00rootroot00000000000000*.pyc .installed.cfg bin develop-eggs *.egg-info tmp build dist geneimpacts-0.3.7/.travis.yml000066400000000000000000000001631347124224400161340ustar00rootroot00000000000000language: python python: - "2.7" - "3.4" - "3.5" script: python setup.py test install: - pip install -e . geneimpacts-0.3.7/CHANGES.md000066400000000000000000000002041347124224400154110ustar00rootroot000000000000000.3.3 ----- + handle unknown stuff better. + add `start_retained_variant` from VEP. 0.3.0 ----- + add support for bcftools BCSQ geneimpacts-0.3.7/LICENSE000066400000000000000000000021131347124224400150250ustar00rootroot00000000000000The MIT License (MIT) Copyright (c) 2015 Brent Pedersen - Bioinformatics Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. geneimpacts-0.3.7/MANIFEST.in000066400000000000000000000000761347124224400155640ustar00rootroot00000000000000include README.md include LICENSE include geneimpacts/tests/* geneimpacts-0.3.7/README.md000066400000000000000000000036401347124224400153050ustar00rootroot00000000000000Given multiple snpEff or VEP or BCFTools consequence annotations for a single variant, get an orderable python object for each annotation. [![Build Status](https://travis-ci.org/brentp/geneimpacts.svg?branch=master)](https://travis-ci.org/brentp/geneimpacts) This is to provide a consistent interface to different variant annotations such as from [snpEff ANN field](http://snpeff.sourceforge.net/) and the [VEP CSQ field](http://www.ensembl.org/info/docs/tools/vep/index.html). and the [BCFTools consequence field](http://biorxiv.org/content/early/2016/12/01/090811) This will be used in [gemini](http://gemini.rtfd.org/) but should also be of general utility. Design ====== There is an effect base-class and then a sub-class for `snpEff`, one for `VEP`, and one for `BCFT` `Effect` objects are orderable (via \_\_le\_\_ ) and should have an \_\_eq\_\_ method so that we can use [functools.total_ordering](https://docs.python.org/2/library/functools.html#functools.total_ordering) to provide the other comparison operators. Given 2 effects objects, `a` and `b`: `a < b == True` iff the *severity* of `b` is greater than `a`. We will have a classmethod: `Effect.top_severity([eff1, ... effn]) that will return the single highest serverity if that exists or a list of the ties for highest Rules for severity: =================== Given 2 annotations, *a* and *b* *a* is more severe than *b* if: 1. *b* is a pseudogene and *a* is not 2. *a* is coding and *b* is not 3. *a* has higher severity than *b* ( see below) 4. polyphen, then sift 5. ??? transcript length? (we dont have access to this). severity -------- Severity is based on the [impacts from VEP](http://uswest.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick) and the [impacts from snpEff](http://snpeff.sourceforge.net/VCFannotationformat_v1.0.pdf). We reduce from the 4 categories HIGH, MEDIUM, LOW, MODIFIER to 3 by renaming MEDIUM to MED and renaming MODIFIER to LOW. geneimpacts-0.3.7/geneimpacts/000077500000000000000000000000001347124224400163225ustar00rootroot00000000000000geneimpacts-0.3.7/geneimpacts/__init__.py000066400000000000000000000001201347124224400204240ustar00rootroot00000000000000from .effect import Effect, SnpEff, VEP, OldSnpEff, BCFT __version__ = "0.3.7" geneimpacts-0.3.7/geneimpacts/effect.py000066400000000000000000000574041347124224400201420ustar00rootroot00000000000000from __future__ import print_function import sys from functools import total_ordering import re import itertools as it try: izip = it.izip except AttributeError: izip = zip basestring = str old_snpeff_effect_so = {'CDS': 'coding_sequence_variant', 'CODON_CHANGE': 'coding_sequence_variant', 'CODON_CHANGE_PLUS_CODON_DELETION': 'disruptive_inframe_deletion', 'CODON_CHANGE_PLUS_CODON_INSERTION': 'disruptive_inframe_insertion', 'CODON_DELETION': 'inframe_deletion', 'CODON_INSERTION': 'inframe_insertion', 'DOWNSTREAM': 'downstream_gene_variant', 'EXON': 'exon_variant', 'EXON_DELETED': 'exon_loss_variant', 'FRAME_SHIFT': 'frameshift_variant', 'GENE': 'gene_variant', 'INTERGENIC': 'intergenic_variant', 'INTERGENIC_REGION': 'intergenic_region', 'INTERGENIC_CONSERVED': 'conserved_intergenic_variant', 'INTRAGENIC': 'intragenic_variant', 'INTRON': 'intron_variant', 'INTRON_CONSERVED': 'conserved_intron_variant', 'NON_SYNONYMOUS_CODING': 'missense_variant', 'RARE_AMINO_ACID': 'rare_amino_acid_variant', 'SPLICE_SITE_ACCEPTOR': 'splice_acceptor_variant', 'SPLICE_SITE_DONOR': 'splice_donor_variant', 'SPLICE_SITE_REGION': 'splice_region_variant', #'START_GAINED': '5_prime_UTR_premature_start_codon_gain_variant', 'START_GAINED': '5_prime_UTR_premature_start_codon_variant', 'START_LOST': 'start_lost', 'STOP_GAINED': 'stop_gained', 'STOP_LOST': 'stop_lost', 'SYNONYMOUS_CODING': 'synonymous_variant', 'SYNONYMOUS_START': 'start_retained_variant', 'SYNONYMOUS_STOP': 'stop_retained_variant', 'TRANSCRIPT': 'transcript_variant', 'UPSTREAM': 'upstream_gene_variant', 'UTR_3_DELETED': '3_prime_UTR_truncation_+_exon_loss_variant', 'UTR_3_PRIME': '3_prime_UTR_variant', 'UTR_5_DELETED': '5_prime_UTR_truncation_+_exon_loss_variant', 'UTR_5_PRIME': '5_prime_UTR_variant', 'NON_SYNONYMOUS_START': 'initiator_codon_variant', 'NONE': 'None', 'CHROMOSOME_LARGE_DELETION': 'chromosomal_deletion'} old_snpeff_lookup = {'CDS': 'LOW', 'CHROMOSOME_LARGE_DELETION': 'HIGH', 'CODON_CHANGE': 'MED', 'CODON_CHANGE_PLUS_CODON_DELETION': 'MED', 'CODON_CHANGE_PLUS_CODON_INSERTION': 'MED', 'CODON_DELETION': 'MED', 'CODON_INSERTION': 'MED', 'DOWNSTREAM': 'LOW', 'EXON': 'LOW', 'EXON_DELETED': 'HIGH', 'FRAME_SHIFT': 'HIGH', 'GENE': 'LOW', 'INTERGENIC': 'LOW', 'INTERGENIC_CONSERVED': 'LOW', 'INTRAGENIC': 'LOW', 'INTRON': 'LOW', 'INTRON_CONSERVED': 'LOW', 'NONE': 'LOW', 'NON_SYNONYMOUS_CODING': 'MED', 'NON_SYNONYMOUS_START': 'HIGH', 'RARE_AMINO_ACID': 'HIGH', 'SPLICE_SITE_ACCEPTOR': 'HIGH', 'SPLICE_SITE_DONOR': 'HIGH', 'SPLICE_SITE_REGION': 'MED', 'START_GAINED': 'LOW', 'START_LOST': 'HIGH', 'STOP_GAINED': 'HIGH', 'STOP_LOST': 'HIGH', 'SYNONYMOUS_CODING': 'LOW', 'SYNONYMOUS_START': 'LOW', 'SYNONYMOUS_STOP': 'LOW', 'TRANSCRIPT': 'LOW', 'UPSTREAM': 'LOW', 'UTR_3_DELETED': 'MED', 'UTR_3_PRIME': 'LOW', 'UTR_5_DELETED': 'MED', 'UTR_5_PRIME': 'LOW'} # http://uswest.ensembl.org/info/genome/variation/predicted_data.html#consequences IMPACT_SEVERITY = [ ('chromosome_number_variation', 'HIGH'), # snpEff ('transcript_ablation', 'HIGH'), # VEP ('exon_loss_variant', 'HIGH'), # snpEff ('exon_loss', 'HIGH'), # snpEff ('rare_amino_acid_variant', 'HIGH'), ('protein_protein_contact', 'HIGH'), # snpEff ('structural_interaction_variant', 'HIGH'), #snpEff ('feature_fusion', 'HIGH'), #snpEff ('bidirectional_gene_fusion', 'HIGH'), #snpEff ('gene_fusion', 'HIGH'), #snpEff ('feature_ablation', 'HIGH'), #snpEff, structural varint ('splice_acceptor_variant', 'HIGH'), # VEP ('splice_donor_variant', 'HIGH'), # VEP ('start_retained_variant', 'HIGH'), # new VEP ('stop_gained', 'HIGH'), # VEP ('frameshift_variant', 'HIGH'), # VEP ('stop_lost', 'HIGH'), # VEP ('start_lost', 'HIGH'), # VEP ('transcript_amplification', 'HIGH'), # VEP ('disruptive_inframe_deletion', 'MED'), #snpEff ('conservative_inframe_deletion', 'MED'), #snpEff ('disruptive_inframe_insertion', 'MED'), #snpEff ('conservative_inframe_insertion', 'MED'), #snpEff ('duplication', 'MED'), # snpEff, structural variant ('inversion', 'MED'), # snpEff, structural variant ('exon_region', 'MED'), # snpEff, structural variant ('inframe_insertion', 'MED'), # VEP ('inframe_deletion', 'MED'), # VEP ('missense_variant', 'MED'), # VEP ('protein_altering_variant', 'MED'), # VEP ('initiator_codon_variant', 'MED'), # snpEff ('regulatory_region_ablation', 'MED'), # VEP ('5_prime_UTR_truncation', 'MED'), # found in snpEff ('splice_region_variant', 'MED'), # VEP changed to have medium priority ('3_prime_UTR_truncation', 'LOW'), # found in snpEff ('non_canonical_start_codon', 'LOW'), # found in snpEff ('synonymous_variant', 'LOW'), # VEP ('coding_sequence_variant', 'LOW'), # VEP ('incomplete_terminal_codon_variant', 'LOW'), # VEP ('stop_retained_variant', 'LOW'), # VEP ('mature_miRNA_variant', 'LOW'), # VEP ('5_prime_UTR_premature_start_codon_variant', 'LOW'), # snpEff ('5_prime_UTR_premature_start_codon_gain_variant', 'LOW'), #snpEff ('5_prime_UTR_variant', 'LOW'), # VEP ('3_prime_UTR_variant', 'LOW'), # VEP ('non_coding_transcript_exon_variant', 'LOW'), # VEP ('conserved_intron_variant', 'LOW'), # snpEff ('intron_variant', 'LOW'), # VEP ('exon_variant', 'LOW'), # snpEff ('gene_variant', 'LOW'), # snpEff ('NMD_transcript_variant', 'LOW'), # VEP ('non_coding_transcript_variant', 'LOW'), # VEP ('upstream_gene_variant', 'LOW'), # VEP ('downstream_gene_variant', 'LOW'), # VEP ('TFBS_ablation', 'LOW'), # VEP ('TFBS_amplification', 'LOW'), # VEP ('TF_binding_site_variant', 'LOW'), # VEP ('regulatory_region_amplification', 'LOW'), # VEP ('feature_elongation', 'LOW'), # VEP ('miRNA', 'LOW'), # snpEff ('transcript_variant', 'LOW'), # snpEff ('start_retained', 'LOW'), # snpEff ('regulatory_region_variant', 'LOW'), # VEP ('feature_truncation', 'LOW'), # VEP ('non_coding_exon_variant', 'LOW'), ('nc_transcript_variant', 'LOW'), ('conserved_intergenic_variant', 'LOW'), # snpEff ('intergenic_variant', 'LOW'), # VEP ('intergenic_region', 'LOW'), # snpEff ('intragenic_variant', 'LOW'), # snpEff ('non_coding_transcript_exon_variant', 'LOW'), # snpEff ('non_coding_transcript_variant', 'LOW'), # snpEff ('transcript', 'LOW'), # ? snpEff older ('sequence_feature', 'LOW'), # snpEff older ('non_coding', 'LOW'), # BCSQ ('?', 'UNKNOWN'), # some VEP annotations have '?' ('', 'UNKNOWN'), # some VEP annotations have '' ('UNKNOWN', 'UNKNOWN'), # some snpEFF annotations have 'unknown' ] # bcftools doesn't add _variant on the end. for (csq, imp) in list(IMPACT_SEVERITY[::-1]): if csq.endswith('_variant'): for i, (a, b) in enumerate(IMPACT_SEVERITY): if (a, b) == (csq, imp): IMPACT_SEVERITY.insert(i, (csq[:-8].lower(), imp)) break IMPACT_SEVERITY_ORDER = dict((x[0], i) for i, x in enumerate(IMPACT_SEVERITY[::-1])) IMPACT_SEVERITY = dict(IMPACT_SEVERITY) EXONIC_IMPACTS = set(["stop_gained", "exon_variant", "stop_lost", "frameshift_variant", "initiator_codon_variant", "inframe_deletion", "inframe_insertion", "missense_variant", "protein_altering_variant", "incomplete_terminal_codon_variant", "stop_retained_variant", "5_prime_UTR_premature_start_codon_variant", "synonymous_variant", "coding_sequence_variant", "5_prime_UTR_variant", "3_prime_UTR_variant", "transcript_ablation", "transcript_amplification", "feature_elongation", "feature_truncation"]) for im in list(EXONIC_IMPACTS): if im.endswith("_variant"): EXONIC_IMPACTS.add(im[:-8]) EXONIC_IMPACTS = frozenset(EXONIC_IMPACTS) def snpeff_aa_length(self): try: v = self.effects['AA.pos / AA.length'] if v.strip(): return int(v.split("/")[1].strip()) except: try: return int(self.effects['Amino_Acid_length']) except: return None def vep_aa_length(self): if not 'Protein_position' in self.effects: return None try: return int(self.effects['Protein_position']) except ValueError: try: return self.effects['Protein_position'] except KeyError: return None def vep_polyphen_pred(self): try: return self.effects['PolyPhen'].split('(')[0] except (KeyError, IndexError): return None def vep_polyphen_score(self): try: return float(self.effects['PolyPhen'].split('(')[1][:-1]) except (KeyError, IndexError): return None def vep_sift_score(self): try: return float(self.effects['SIFT'].split("(")[1][:-1]) except (IndexError, KeyError): return None def vep_sift_pred(self): try: return self.effects['SIFT'].split("(")[0] except (IndexError, KeyError): return None snpeff_lookup = { 'transcript': ['Feature_ID', 'Transcript_ID', 'Transcript'], 'gene': 'Gene_Name', 'exon': ['Rank', 'Exon', 'Exon_Rank'], 'codon_change': ['HGVS.c', 'Codon_Change'], 'aa_change': ['HGVS.p', 'Amino_Acid_Change', 'Amino_Acid_change'], 'aa_length': snpeff_aa_length, 'biotype': ['Transcript_BioType', 'Gene_BioType'], 'alt': 'Allele', } bcft_lookup = {} vep_lookup = { 'transcript': 'Feature', 'gene': ['SYMBOL', 'HGNC', 'Gene'], 'ensembl_gene_id': 'Gene', 'exon': 'EXON', 'codon_change': 'Codons', 'aa_change': 'Amino_acids', 'aa_length': vep_aa_length, 'biotype': 'BIOTYPE', 'polyphen_pred': vep_polyphen_pred, 'polyphen_score': vep_polyphen_score, 'sift_pred': vep_sift_pred, 'sift_score': vep_sift_score, 'alt': 'ALLELE', } # lookup here instead of returning ''. defaults = {'gene': None} @total_ordering class Effect(object): _top_consequence = None lookup = None def __init__(self, key, effect_dict, keys, prioritize_canonical): raise NotImplemented @classmethod def new(self, key, effect_dict, keys): lookup = {"CSQ": VEP, "ANN": SnpEff, "EFF": OldSnpEff, "BCSQ": BCFT} assert key in lookup return lookup[key](effect_dict, keys) @property def is_exonic(self): return self.top_consequence in EXONIC_IMPACTS def unused(self): return [] @property def top_consequence(self): # sort by order and return the top if self._top_consequence is None: self._top_consequence = sorted([(IMPACT_SEVERITY_ORDER.get(c, 0), c) for c in self.consequences], reverse=True)[0][1] return self._top_consequence @property def so(self): return self.top_consequence @property def is_coding(self): return self.biotype == "protein_coding" and self.is_exonic and ("_UTR_" not in self.top_consequence) @property def is_splicing(self): return "splice" in self.top_consequence @property def is_lof(self): return self.biotype == "protein_coding" and self.impact_severity == "HIGH" def __le__(self, other): # we sort so that the effects with the highest impacts come last # (highest) and so, we: # + return true if self has lower impact than other. # + return false if self has higher impact than other. self_has_lower_impact = True self_has_higher_impact = False if self.prioritize_canonical: scanon, ocanon = self.is_canonical, other.is_canonical if scanon and not ocanon: return self_has_higher_impact elif ocanon and not scanon: return self_has_lower_impact spg = self.is_pseudogene opg = other.is_pseudogene if spg and not opg: return self_has_lower_impact elif opg and not spg: return self_has_higher_impact sc, oc = self.coding, other.coding if sc and not oc: # other is not coding. is is splicing? # if other is splicing, we have lower impact. if not (self.is_splicing or other.is_splicing): return self_has_higher_impact elif oc and not sc: # self. is not coding. is it splicing? # if self is splicing it has higher impact if not (self.is_splicing or other.is_splicing): return self_has_lower_impact if self.severity != other.severity: return self.severity <= other.severity if self.biotype == "protein_coding" and not other.biotype == "protein_coding": return False elif other.biotype == "protein_coding" and not self.biotype == "protein_coding": return True if self.biotype == "processed_transcript" and not other.biotype == "processed_transcript": return False elif other.biotype == "processed_transcript" and not self.biotype == "processed_transcript": return True # sift higher == more damaing if (self.sift_value or 10000) < (other.sift_value or 10000): return True # polyphen, lower == more damaging if (self.polyphen_value or -10000) > (other.polyphen_value or -10000): return True return max(IMPACT_SEVERITY_ORDER.get(c, 0) for c in self.consequences) <= \ max(IMPACT_SEVERITY_ORDER.get(co, 0) for co in other.consequences) @classmethod def top_severity(cls, effects): for i, e in enumerate(effects): if isinstance(e, basestring): effects[i] = cls(e) if len(effects) == 0: return None if len(effects) == 1: return effects[0] effects = sorted(effects) if effects[-1] > effects[-2]: return effects[-1] ret = [effects[-1], effects[-2]] for i in range(-3, -(len(effects) - 1), -1): if effects[-1] > effects[i]: break ret.append(effects[i]) return ret def __getitem__(self, key): return self.effects[key] def __eq__(self, other): if not isinstance(other, Effect): return False return self.effect_string == other.effect_string def __str__(self): return repr(self) def __repr__(self): return "%s(%s-%s, %s)" % (self.__class__.__name__, self.gene, self.consequence, self.impact_severity) @property def effect_severity(self): return self.impact_severity @property def lof(self): return self.biotype == "protein_coding" and self.impact_severity == "HIGH" @property def severity(self, lookup={'HIGH': 3, 'MED': 2, 'LOW': 1, 'UNKNOWN': 0}, sev=IMPACT_SEVERITY): # higher is more severe. used for ordering. try: v = max(lookup[sev[csq]] for csq in self.consequences) except KeyError: v = 0 if v == 0: excl = [] for i, c in [(i, c) for i, c in enumerate(self.consequences) if not c in sev]: sys.stderr.write("WARNING: unknown severity for '%s' with effect '%s'\n" % (self.effect_string, c)) sys.stderr.write("Please report this on github with the effect-string above\n") excl.append(i) if len(excl) == len(self.consequences): v = 1 else: v = max(lookup[sev[csq]] for i, csq in enumerate(self.consequences) if not i in excl) return max(v, 1) @property def impact_severity(self): return ['xxx', 'LOW', 'MED', 'HIGH'][self.severity] @property def consequence(self): return self.top_consequence @property def is_pseudogene(self): #bool return self.biotype is not None and 'pseudogene' in self.biotype def __getattr__(self, k): v = self.lookup.get(k) if v is None: return v if isinstance(v, basestring): ret = self.effects.get(v) # if we didnt get value, there may be a column # specific value stored in defaults so we look import # up. if not ret and ret is not False: return defaults.get(k, '') return ret elif isinstance(v, list): for key in v: try: return self.effects[key] except KeyError: continue return defaults.get(k, '') return v(self) class BCFT(Effect): __slots__ = ('effect_string', 'effects', 'biotype', 'gene', 'transcript', 'aa_change', 'dna_change') keys = "consequence,gene,transcript,biotype,strand,amino_acid_change,dna_change".split(",") lookup = bcft_lookup def __init__(self, effect_string, keys=None, prioritize_canonical=False): if keys is not None: self.keys = keys self.effect_string = effect_string self.effects = dict(izip(self.keys, (x.strip().replace(' ', '_') for x in effect_string.split("|")))) self.biotype = self.effects.get('biotype', None) self.transcript = self.effects.get('transcript', None) self.gene = self.effects.get('gene', None) self.aa_change = self.effects.get('amino_acid_change', None) self.consequences = self.effects[self.keys[0]].split('&') def unused(self, used=frozenset("csq|gene|transcript|biotype|strand|aa_change|dna_change".lower().split("|"))): """Return fields that were in the VCF but weren't utilized as part of the standard fields supported here.""" return [k for k in self.keys if not k.lower() in used] @property def exonic(self): return self.biotype == "protein_coding" and any(csq in EXONIC_IMPACTS for csq in self.consequences) @property def coding(self): # what about start/stop_gained? return self.exonic and any(csq[1:] != "_prime_utr" for csq in self.consequences) class VEP(Effect): __slots__ = ('effect_string', 'effects', 'biotype') keys = "Consequence|Codons|Amino_acids|Gene|SYMBOL|Feature|EXON|PolyPhen|SIFT|Protein_position|BIOTYPE|CANONICAL".split("|") lookup = vep_lookup def __init__(self, effect_string, keys=None, checks=True, prioritize_canonical=False): if checks: assert not "," in effect_string assert not "=" in effect_string self.effect_string = effect_string if keys is not None: self.keys = keys self.effect_string = effect_string self.effects = dict(izip(self.keys, (x.strip() for x in effect_string.split("|")))) self.biotype = self.effects.get('BIOTYPE', None) self.prioritize_canonical = prioritize_canonical @property def consequences(self, _cache={}): try: # this is a bottleneck so we keep a cache return _cache[self.effects['Consequence']] except KeyError: res = _cache[self.effects['Consequence']] = list(it.chain.from_iterable(x.split("+") for x in self.effects['Consequence'].split('&'))) return res def unused(self, used=frozenset("Consequence|Codons|Amino_acids|Gene|SYMBOL|Feature|EXON|PolyPhen|SIFT|Protein_position|BIOTYPE|CANONICAL".lower().split("|"))): """Return fields that were in the VCF but weren't utilized as part of the standard fields supported here.""" return [k for k in self.keys if not k.lower() in used] @property def coding(self): # what about start/stop_gained? return self.exonic and any(csq[1:] != "_prime_UTR_variant" for csq in self.consequences) @property def exonic(self): return self.biotype == "protein_coding" and any(csq in EXONIC_IMPACTS for csq in self.consequences) @property def is_canonical(self): return self.effects.get("CANONICAL", "") != "" class SnpEff(Effect): lookup = snpeff_lookup __slots__ = ('effects', 'effect_string', 'biotype') keys = [x.strip() for x in 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'.split("|")] def __init__(self, effect_string, keys=None, prioritize_canonical=False): assert not "," in effect_string assert not "=" == effect_string[3] self.effect_string = effect_string if keys is not None: self.keys = keys self.effects = dict(izip(self.keys, (x.strip() for x in effect_string.split("|", len(self.keys))))) self.biotype = self.effects['Transcript_BioType'] @property def consequences(self): return list(it.chain.from_iterable(x.split("+") for x in self.effects['Annotation'].split('&'))) @property def coding(self): # TODO: check start_gained and utr return self.exonic and not "utr" in self.consequence and not "start_gained" in self.consequence @property def exonic(self): csqs = self.consequence if isinstance(csqs, basestring): csqs = [csqs] return any(csq in EXONIC_IMPACTS for csq in csqs) and self.effects['Transcript_BioType'] == 'protein_coding' class OldSnpEff(SnpEff): keys = [x.strip() for x in "Effect | Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon | ERRORS | WARNINGS".split("|")] def __init__(self, effect_string, keys=None, _patt=re.compile("\||\("), prioritize_canonical=False): assert not "," in effect_string assert not "=" in effect_string effect_string = effect_string.rstrip(")") self.effect_string = effect_string if keys is not None: self.keys = keys self.effects = dict(izip(self.keys, (x.strip() for x in _patt.split(effect_string)))) @property def consequence(self): if '&' in self.effects['Effect']: return self.effects['Effect'].split('&') return self.effects['Effect'] @property def consequences(self): try: return [old_snpeff_effect_so.get(c, old_snpeff_effect_so[c.upper()]) for c in it.chain.from_iterable(x.split("+") for x in self.effects['Effect'].split('&'))] except KeyError: return list(it.chain.from_iterable(x.split("+") for x in self.effects['Effect'].split('&'))) @property def severity(self, lookup={'HIGH': 3, 'MED': 2, 'LOW': 1}): # higher is more severe. used for ordering. try: return max(lookup[old_snpeff_lookup[csq]] for csq in self.consequences) except KeyError: try: #in between sevs = [IMPACT_SEVERITY.get(csq, "LOW") for csq in self.consequences] return max(lookup[s] for s in sevs) except KeyError: return Effect.severity.fget(self) @property def is_lof(self): return self.biotype == "protein_coding" and self.impact_severity == "HIGH" geneimpacts-0.3.7/geneimpacts/tests/000077500000000000000000000000001347124224400174645ustar00rootroot00000000000000geneimpacts-0.3.7/geneimpacts/tests/__init__.py000066400000000000000000000000021347124224400215650ustar00rootroot00000000000000# geneimpacts-0.3.7/geneimpacts/tests/bcfts.txt.gz000066400000000000000000000150341347124224400217500ustar00rootroot00000000000000@Xbcfts.txt][o丱~9l !L̞ruԘb3.f&OQ틤Iyle,ՅEHqjD2#no}z?5b5!!"n7噻 LO|@P,959 Bss(Z t@HL>ǔ+oKDt˧?:}Fv;}0.9e<A&cJJ|do\H=w89\DB I08"8 MKתM000?aV0Q@RLӞ1O>i۟ Z,MBzrFcJcI?TL}YGϿ}_DMif]u[WE᧬u,r6U]o+SjSFk mrE]֖NMKc7emUiZtkޛZ]J﫪njWWۢ,mU awkFe[69Xe5ƺz k0p1bqźvvkS=n *P ֔W۔4Wn}֘]mmniv宺:2n2(@6,ZWm]M[;zpM߹jAV+ V%ع)AIotӘ` h iF5ڮrF}_llaƬ_= SS; VtSWm~w\Xk05ß%k[]Сu[ju -D9u2lasaȥ e,Cd"! e`\x E:X.#5Fw|eIX<_}X)Xiod\(hb2S.kˇϟ?/~O7| 7ف1N*8;"Tyfs34@&R+9}{^,tQș}q>W}?Q%{!dCKB̲}EdNR 52FB#$wAPA("A&4Q$QԹ%,+~=bUP}:,34q5clsfmD!$қD'3&mN:"dB">ȌaNQgN>3aB?>|9NSBbՏXdX f8ǴV2C(^|7[ODzw &"o;8%bE4YΊx) G>$P7JRD H; 8OCM3B\p%"@/& Xx(BOJji₽}ICH, s3GQ (^P(O%` ?$ ((Q¡ Y8飼j0cS02mLFH D0(*Gz->Œ h(xPaħJ6PġLZ+O* 9CI>s#ʿ+QVRġLY+4gְatG,Ed,Ĥ9zpx J21B0ơP,3A Eh,Ĥ9Bv*@)aSX)S$2u=-("h b!LG a jWW!"bhk3$'@IN ,|eS8BR_`h o%A 7 `PF9Og!(B=o>ĝ+u'nvq6 DZ]l 2X) (X<)0WE`h4Y2~O"#weM3  Gě$sC| dcE\0AB.I39:PXjQ*? oF@L GDPSEK2AOMmݰS.1IG:d/V"q3//]כ'spjbax,^xr5|XrzZh>! 0VD1Es )8=}n"6֊ >!쬱,r(ZZP!HI]ys'dʡl4zM0v@e@E-` o}Iރ$iWI uNQ(ǻ-} XrV%WȬgȧ^WVČgnowzTC:[OhDـ;$&X ^ ҥL*4Gϋxhza&;T?`pt#HScH0XF!b.:%1B0F@wPos{J2HZ.:`03ǿPģU yR[j[L]GZ5؀gbv- 4 +W[Rɗ or(*gB/Й>(f7 {INMr'w"jU;)t'Y;gU|;y!9ZeVeS7rtF4;߻ֈd9(AQ UdNPa $4."%h I>xqU`ɯhT>A'ѿQ2c%O!9Fï h?;3tP5nBzy߿$*c S]2eb}0?W~%ڮVM7T\eW\Uwvrl YmY *˦fm6;ݮZkCM[z/Ҕw]ۺh*)Zk]SuVՕ-6EtQlM]uojJk ̛tm;[kK\e5FJS5:g\rij Gel]W9$擦DI^txI6xl׫muFu]_ʬ kj&f6\NeiUݷUa[W5]9~ۃq] YָhݮAj[Zߢ;]5 hkZ`ӖԶڪ(-jJ{/ڙUmu P73@_<$޸1ٛ2ѫ‹{uԹ8Cӓ,ѲuďKL{SMN ^bĘ"mlg Yt#M'^]x- 9cTuTR֧CCg Ee6>q3|0'&.B\OSa}QdjKP,;.ޜO{.aR3#JnC>[gD1J~K l<9& ӵw(ɏ^0(,`jA$)!|Y儌\,bQ]/K6> N=E >8Q|%TdqS+h,K?X?GDۀ)ls-:&Nx)Ȓ{;;"7HDSJ{l ?NX)cJXqtittMa^#F "|_e!Dv8@$#E)TSI5"{$Ng=I)qGDz{ **$tԙ @Ys1NǠݱcE3_ E}7a/"0:н wώБya/asy"Y;yFET[F[Ƹ|QQB"r/yħ#uLᓟV8+~@;^a3E\GA0bhQ&$Mw+QF@QX h)B=ء)RBDR!$`λ䎼{鐨bu^Ӊtq&p .334Kys4tCJiXNsp@vXɅw mMl9 {^x'f_Tt?D$VĞ[JB b 4!hIdFL6q֑q&q(OCx4}b1o{'V-''pFouív;HƎ NQtώ12|DcR{ e~&yCQD#3NA YX0(q"hڟk49'ds|sth|vH4/ KJcbƝxNQvDE ˁGAY6JB*4cRI ޅ+6==|c[Y[b8Tm7eJV}U稜u[ۥJݵVZw {y'B>jZ}avSJ6ѵs{Un꽳+_iMB~w}J)wq $_Ib;]Ov)p`f8&Zc1.tʺ0[ǫ&꫶&Y` xu1XT$"Wae[pwB-8~xUbfa>dN;PV 3Qe^T@Ы>²Au+84mY!BvˤM=*BIm>Քr'Duu%8G!ڬ&B6-G,I=]6z48VD<~d2kϬͤP&W`ȣ&JQՏqKo^o"c.wĻd'&9=^,I:1dR˂!Bߎ(IKBI( %$PJBI( %$-޵ӌ|\`k_zqLp ˂!B,tC#8< |A_|z$U?>ip{35M].0ֹ!`'朴K&p }ݼ6[]2)WzDJ=Q($)~9U}Me;p7[FPZ7,A }xdCTȪ>AINR;uhRweͨ8L î!BJԥ pۮfc_p J5C) P_~P5 Fg>;R*GtyB2 1ܶ4q6M1dR(])mG!uIQ xaF'RMBP,0geneimpacts-0.3.7/geneimpacts/tests/snpeff-anns.txt.gz000066400000000000000000000154221347124224400230660ustar00rootroot00000000000000Usnpeff-anns.txt=kSJPpgJ*bXnow 2 H,#VwLO;ngiu\M_d^꽽8?yG魨Ӌr:㥂.)mp沚ͪuS=y7_UYVi?xSOV_{v tnoF0vuW(<'` On_f41V?|ė%#'7j<\V?lD"L>z|Y5/>mpkv!aKc{L4(W7=M)}GP! HE+ծ(WYGL^-&Eϒ{aq 6BX"6x X}\\^V-²vtoE}zs7[g#u +J3ͫbyKX>oߝWSॆ3hX[ ARa>Ui_q80.u ̖QZo$RH2Rq.oN-G8%gxR] (0n[N 'ً#l`}WU'K^LdoTݛ̘4ή{]Ȕ?<H(2•VH_1ǰ{y ߟL w%Ɓ ptzpVptt))))uȩ?fhD8^ =$||]C+i"""^-ZE|<>;={p15ZGpA_żڞ-z(,Q\zpQDL<C`Zw0_ FҘZ̯'B νO$pZ!ͧf܂{mV9" p`J|ũyqN@;߃~홖]<= i9[qu^%JHʼn_qM\ [A+V!\%5D=G?TN?5B@a+,TF.$&(Aƛ.i*Q\vR)eVS*X65Ưn16FZ{N0|})?/N3T}[p} bJrtb4LY^Z+( h9(ۡ†YNP[{*W, 2tRkX3P0K TB I>ڙ>H~XXA.|^%DKfd߇ò5i+X.ApN!Oʅןf;ܐp S'k/M27,Y=t1 唟@h`"$?A)ŗ7%2C-4x MFT/ Yp"}"c*8<;^IzsE *Q 0x,|_;m0.T8P^Q++*+ &eR 3rvA%3`38Pa]Uf_">p1y렫'@o!$o^,eKSBAZ&VB+<&1&g[l­5h/#.?m Z)#Iq6KPݫ0! h]bcuUL/buMy"(t~_Zp>P{ת t[7a,¹ *)G)$3!Q%cEGGGG٤\"M47%{!on;Z@xh$#,WJE7R)gKLVy+=9䒣Ë3#3+*Q? %6tۣ=Ief<@; jz*3QQzvq;9᪣cN! T 0964xKUTn^kgD:"~bڲ=m1Ft4°A MѨY-ub0[Q`ԝnvnryAnD8B,mIUdXr{ĆҐM,.lY!`fiP׋k)9\U $N'RHHHH"QZb/[Jt< jC[VV\υ4 Ec^'yDuunlG PIp!8!N14xˎ[=JmV,L"Ch7ХZ"!{rA;&qGUF(sg"B3 /Z;V.fl_` l5ZQ0ʖ;6 uҦqlbdI!LE!V)4*NQhg()5>IO#zc{# G6oMNp2nQڤA/weK^beтM)Qm:Ƅ Rwaf;t;;\.`/*qBܲ稅pH# =CQ#p8vu@.DpakF?XQF-vth4N% OA)B)q3ultt>lKgB)TwT㣹+5XZ;HU$m30Tji6%C6W1uŵ\J+υ d'|" |}| \lZXb.C"Cܘ]UrYͱ:i(4-=kTM$J666[M>h -FH۲=ꄷg\9 ʼ)wj,Lh7-m.sZ䡯ݐҠ)0js@Rg$X4 '‰ ;{q!Gn ƥFJ($u| Jt}o'` ]lG21xbqjDֳwpT]UW)\"z6<.oH㫪%;-&]`V)%hCb) :RkVWP/o [EU JeΒմ̉[#﬩D,ƾe)9,XzT)1#U &8̕aovF$ .#DF`x+%K$ }oWg墙PȀ3nwuթSi J20Lt cNSEjcbzwKLR[5Fjj]E']mS^᫆(2 !Cx^q} \2ї:~ݍOR+d+c_‘U|&=cAs# o]Քĕ> Ϩ于kPE_5ۛO̴)sl<#ت}^4Fg9C; v!NU3!XVl:cz$s>p+R$ x :]?07X}Sc߽__2X(8T ˻nBs<x9=-0\N^-ت_IG#⹣G#e j F cRgS(~Iy&,~'(Yo7ُ߹YZV97Hj/T0X-*__7}1|OĪ%qұB@j7JTsMDrH$p@8~v]?,ͬ:pՑRAFj+Rc (wQeYQ1( Ӫt!mWmT, vs)cʘ?V OA;o;С 9 a$1!K~:rI ql#ƈ1bIYA[G3 ;A1rfټO Nv"{38]&`#&eT N, &E`_TЀY-5NviTR!J|:VH|^QdcCћDgeneimpacts-0.3.7/geneimpacts/tests/test_impacts.py000066400000000000000000000462461347124224400225510ustar00rootroot00000000000000import sys import os import gzip from geneimpacts import SnpEff, VEP, Effect, OldSnpEff, BCFT HERE = os.path.dirname(__file__) def test_bug(): e = sorted([VEP('missense_variant|tTt/tGt|F/C|ENSG00000186092|OR4F5|ENST00000335137|1/1|possibly_damaging(0.568)|deleterious(0)|113/305|protein_coding'), VEP("splice_region_variant&non_coding_exon_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000456328|2/3||||processed_transcript")]) assert e[-1].so == 'missense_variant', e[-1].so def test_snpeff(): ann = SnpEff("C|splice_donor_variant&splice_region_variant&splice_region_variant&intron_variant|HIGH|DDX11L1|ENSG00000223972|transcript|ENST00000518655|transcribed_unprocessed_pseudogene|3/3|n.734+2_734+3delAG||||||") assert ann.gene == "DDX11L1" assert ann.transcript == "ENST00000518655" assert ann.biotype == "transcribed_unprocessed_pseudogene", ann.biotype assert ann.consequences == 'splice_donor_variant&splice_region_variant&splice_region_variant&intron_variant'.split('&') assert ann.severity == 3 assert ann.impact_severity == "HIGH" assert ann.aa_change == "" assert ann.exon == '3/3', ann.exon assert not ann.coding assert ann.is_pseudogene def test_unused(): extra = ['YYY'] keys = VEP.keys + extra ann = VEP('missense_variant|tTt/tGt|F/C|ENSG00000186092|OR4F5|ENST00000335137|1/1|possibly_damaging(0.568)|deleterious(0)|113/305|protein_coding|xval|yval', keys=keys) assert ann.unused() == extra, ann.unused() assert ann.effects['YYY'] == 'yval' def test_vep(): ann = VEP('missense_variant|tTt/tGt|F/C|ENSG00000186092|OR4F5|ENST00000335137|1/1|possibly_damaging(0.568)|deleterious(0)|113/305|protein_coding|') assert ann.gene == 'OR4F5' assert ann.transcript == 'ENST00000335137' assert ann.aa_change == "F/C", ann.aa_change assert ann.consequences == ['missense_variant'] assert ann.coding assert ann.biotype == "protein_coding" assert ann.severity == 2 assert ann.impact_severity == "MED", ann.impact_severity assert not ann.is_pseudogene assert ann.polyphen_score == 0.568, ann.polyphen assert ann.polyphen_pred == "possibly_damaging", ann.polyphen assert ann.sift_score == 0.0, ann.sift assert ann.sift_pred == "deleterious", ann.sift assert not ann.canonical def test_vep_canonical(): ann = VEP('missense_variant|tTt/tGt|F/C|ENSG00000186092|OR4F5|ENST00000335137|1/1|possibly_damaging(0.568)|deleterious(0)|113/305|protein_coding|*', prioritize_canonical=True) assert ann.gene == 'OR4F5' assert ann.transcript == 'ENST00000335137' assert ann.aa_change == "F/C", ann.aa_change assert ann.consequences == ['missense_variant'] assert ann.coding assert ann.biotype == "protein_coding" assert ann.severity == 2 assert ann.impact_severity == "MED", ann.impact_severity assert not ann.is_pseudogene assert ann.polyphen_score == 0.568, ann.polyphen assert ann.polyphen_pred == "possibly_damaging", ann.polyphen assert ann.sift_score == 0.0, ann.sift assert ann.sift_pred == "deleterious", ann.sift assert ann.is_canonical def test_bcfts(): f = os.path.join(HERE, "bcfts.txt.gz") with gzip.open(f, "rt") as fh: for csq in (BCFT(l.rstrip()) for l in fh): assert csq.severity in (1, 2, 3) assert csq.is_pseudogene in (True, False) assert csq.coding in (True, False), (csq.coding, csq) assert csq.is_exonic in (True, False) def test_veps(): f = os.path.join(HERE, "vep-csqs.txt.gz") with gzip.open(f, "rt") as veps: for csq in (VEP(l.strip()) for l in veps): assert csq.severity in (1, 2, 3) assert csq.is_pseudogene in (True, False) assert csq.coding in (True, False) assert isinstance(csq.polyphen_value, float) or csq.polyphen_value is None csq.gene assert isinstance(csq.sift_value, float) or csq.sift_value is None def test_snpeffs(): f = os.path.join(HERE, "snpeff-anns.txt.gz") with gzip.open(f, "rt") as anns: for csq in (SnpEff(l.strip()) for l in anns): assert csq.severity in (1, 2, 3) assert csq.is_pseudogene in (True, False) assert csq.coding in (True, False) assert csq.polyphen_value is None EFFECTS = [VEP("upstream_gene_variant|||ENSG00000223972|DDX11L1|ENST00000456328|||||processed_transcript"), VEP("downstream_gene_variant|||ENSG00000227232|WASH7P|ENST00000488147|||||unprocessed_pseudogene"), VEP("non_coding_exon_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000456328|2/3||||processed_transcript"), VEP("non_coding_exon_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000456328|2/3||||processed_transcript"), VEP("splice_region_variant&non_coding_exon_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000456328|2/3||||processed_transcript"), VEP("splice_region_variant&non_coding_exon_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000456328|2/3||||processed_transcript"), VEP("splice_region_variant&non_coding_exon_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000456328|2/3||||processed_transcript"), VEP("intron_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000450305|||||transcribed_unprocessed_pseudogene"), VEP("intron_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000450305|||||transcribed_unprocessed_pseudogene"), VEP('missense_variant|tTt/tGt|F/C|ENSG00000186092|OR4F5|ENST00000335137|1/1|possibly_damaging(0.568)|deleterious(0)|113/305|protein_coding'), VEP("non_coding_exon_variant&nc_transcript_variant&feature_elongation|||ENSG00000223972|DDX11L1|ENST00000456328|3/3||||processed_transcript"), ] def test_order(): effects = sorted(EFFECTS) assert effects[-1].impact_severity == "MED" assert effects[0].impact_severity == "LOW" def test_canonical_order(): effects = EFFECTS[:] effects.append(VEP("intron_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000450305|||||transcribed_unprocessed_pseudogene|*", prioritize_canonical=True)) effects = sorted(effects) assert effects[-1].is_canonical assert effects[0].impact_severity == "LOW" assert not effects[0].is_canonical def test_o2(): keys = [x.strip() for x in "Effect | Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon | ERRORS | WARNINGS".split("|")] effects = [OldSnpEff(v, keys) for v in "DOWNSTREAM(MODIFIER|||||RP5-902P8.10|processed_transcript|NON_CODING|ENST00000434139|),DOWNSTREAM(MODIFIER|||||RP5-902P8.10|processed_transcript|NON_CODING|ENST00000453732|),INTRON(MODIFIER||||138|SCNN1D|protein_coding|CODING|ENST00000470022|3),INTRON(MODIFIER||||638|SCNN1D|protein_coding|CODING|ENST00000338555|3),INTRON(MODIFIER||||638|SCNN1D|protein_coding|CODING|ENST00000400928|2),INTRON(MODIFIER||||669|SCNN1D|protein_coding|CODING|ENST00000379110|6),INTRON(MODIFIER||||704|SCNN1D|protein_coding|CODING|ENST00000325425|2),INTRON(MODIFIER||||802|SCNN1D|protein_coding|CODING|ENST00000379116|5),INTRON(MODIFIER|||||SCNN1D|nonsense_mediated_decay|CODING|ENST00000379101|5),INTRON(MODIFIER|||||SCNN1D|processed_transcript|CODING|ENST00000467651|3)".split(",")] effects = sorted(effects) assert effects[-1].gene == "SCNN1D", effects[-1].gene effects = sorted([OldSnpEff(v, keys) for v in "DOWNSTREAM(MODIFIER||||85|FAM138A|protein_coding|CODING|ENST00000417324|),DOWNSTREAM(MODIFIER|||||FAM138A|processed_transcript|CODING|ENST00000461467|),DOWNSTREAM(MODIFIER|||||MIR1302-10|miRNA|NON_CODING|ENST00000408384|),EXON(MODIFIER|||||MIR1302-10|antisense|NON_CODING|ENST00000469289|1),INTRON(MODIFIER|||||MIR1302-10|antisense|NON_CODING|ENST00000473358|1),UPSTREAM(MODIFIER|||||WASH7P|unprocessed_pseudogene|NON_CODING|ENST00000423562|),UPSTREAM(MODIFIER|||||WASH7P|unprocessed_pseudogene|NON_CODING|ENST00000430492|),UPSTREAM(MODIFIER|||||WASH7P|unprocessed_pseudogene|NON_CODING|ENST00000438504|),UPSTREAM(MODIFIER|||||WASH7P|unprocessed_pseudogene|NON_CODING|ENST00000488147|),UPSTREAM(MODIFIER|||||WASH7P|unprocessed_pseudogene|NON_CODING|ENST00000538476|)".split(",")]) s = "\n".join(e.effect_string for e in effects[::-1]) # reversed so that most significant is first assert s == """\ DOWNSTREAM(MODIFIER||||85|FAM138A|protein_coding|CODING|ENST00000417324| DOWNSTREAM(MODIFIER|||||FAM138A|processed_transcript|CODING|ENST00000461467| INTRON(MODIFIER|||||MIR1302-10|antisense|NON_CODING|ENST00000473358|1 EXON(MODIFIER|||||MIR1302-10|antisense|NON_CODING|ENST00000469289|1 DOWNSTREAM(MODIFIER|||||MIR1302-10|miRNA|NON_CODING|ENST00000408384| UPSTREAM(MODIFIER|||||WASH7P|unprocessed_pseudogene|NON_CODING|ENST00000423562| UPSTREAM(MODIFIER|||||WASH7P|unprocessed_pseudogene|NON_CODING|ENST00000430492| UPSTREAM(MODIFIER|||||WASH7P|unprocessed_pseudogene|NON_CODING|ENST00000438504| UPSTREAM(MODIFIER|||||WASH7P|unprocessed_pseudogene|NON_CODING|ENST00000488147| UPSTREAM(MODIFIER|||||WASH7P|unprocessed_pseudogene|NON_CODING|ENST00000538476|""" def test_highest(): effects = sorted(EFFECTS) top = Effect.top_severity(effects) assert top.impact_severity == "MED" assert top.so == "missense_variant" #assert top[0]. effects.append(effects[-1]) top = Effect.top_severity(effects) assert isinstance(top, list) assert top[0].impact_severity == "MED" def test_splice(): e = VEP('splice_acceptor_variant&intron_variant&feature_truncation|||ENSG00000221978|CCNL2|ENST00000408918||||-/226|protein_coding|1') assert (e.is_coding, e.is_exonic, e.is_splicing) == (False, False, True) e = VEP('intron_variant&feature_elongation|||ENSG00000187634|SAMD11|ENST00000341065||||-/589|protein_coding|1') assert (e.is_coding, e.is_exonic, e.is_splicing) == (False, False, False) def test_eff_splice(): keys = [x.strip() for x in "Effect | Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon | ERRORS | WARNINGS".split("|")] e = OldSnpEff("SPLICE_SITE_REGION+SYNONYMOUS_CODING(LOW|SILENT|acG/acA|T245|1134|ANKS1A|protein_coding|CODING|ENST00000360359|5|A)", keys) assert e.aa_change == "T245" # note that we choose splice_site_region over synonymous coding assert e.is_splicing, e.is_splicing assert not e.is_coding e = OldSnpEff("intergenic_region(MODIFIER|||n.null_nulldelAAGGAAGG|||||||A", keys) assert e.consequences != [] def test_regr(): keys = [x.strip() for x in 'Effect | Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank | Genotype_Number | ERRORS | WARNINGS'.split("|")] v = OldSnpEff('SPLICE_SITE_REGION+SYNONYMOUS_CODING(LOW|SILENT|acG/acA|T245|1134|ANKS1A|protein_coding|CODING|ENST00000360359|5|A)', keys) assert v.consequences == ['splice_region_variant', 'synonymous_variant'], v.consequences assert v.severity == 2, v.severity assert v.aa_change == 'T245' v = OldSnpEff('UPSTREAM(MODIFIER||2771|||PSMB1|processed_transcript|CODING|ENST00000462957||C)', keys) assert v.consequences == ['upstream_gene_variant'], v.consequences assert v.severity == 1, v.severity v = OldSnpEff('NEXT_PROT[maturation_peptide](LOW||||241|PSMB1|protein_coding|CODING|||C)', keys) assert v.consequences == ['NEXT_PROT[maturation_peptide]'], v.consequences assert v.severity == 1, v.severity assert v <= v def test_aa_change(): eff = OldSnpEff('NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Agc/Ggc|S418G|696|C1orf170|protein_coding|CODING|ENST00000433179|3|C)') assert eff.aa_change == 'S418G' ann = SnpEff('C|missense_variant|MODERATE|C1orf170|ENSG00000187642|transcript|ENST00000433179|protein_coding|3/5|c.1252A>G|p.Ser418Gly|1252/3064|1252/2091|418/696||') assert ann.aa_change == 'p.Ser418Gly' def test_old(): keys = [x.strip() for x in 'Effect | Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank | Genotype_Number | ERRORS | WARNINGS'.split("|")] v = OldSnpEff('SPLICE_SITE_REGION+SYNONYMOUS_CODING(LOW|SILENT|acG/acA|T245|1134|ANKS1A|protein_coding|CODING|ENST00000360359|5|A)', keys) assert v.so == "splice_region_variant", v.so v = OldSnpEff('SYNONYMOUS_CODING+SPLICE_SITE_REGION(LOW|SILENT|acG/acA|T245|1134|ANKS1A|protein_coding|CODING|ENST00000360359|5|A)', keys) assert v.so == "splice_region_variant", v.so assert v.aa_length == 1134, v.aa_length assert v.exon == "5", v.exon assert v.codon_change == "acG/acA", v.codon_change assert v.transcript == "ENST00000360359", v.transcript def test_old2(): keys = [x.strip() for x in 'Effect | Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank | Genotype_Number | ERRORS | WARNINGS'.split("|")] v = OldSnpEff('SPLICE_SITE_REGION+NON_SYNONYMOUS_CODING(LOW|SILENT|acG/acA|T245|1134|ANKS1A|protein_coding|CODING|ENST00000360359|5|A)', keys) assert v.so == "missense_variant", v.so def test_weird_vep(): keys = "Consequence|Codons|Amino_acids|Gene|SYMBOL|Feature|EXON|PolyPhen|SIFT|Protein_position|BIOTYPE|CANONICAL|CCDS|RadialSVM_score|RadialSVM_pred|LR_score|LR_pred|CADD_raw|CADD_phred|Reliability_index".split("|") csqs = ["?|||117581|TWIST2|NM_001271893.1|1/1||||protein_coding|YES||||||||,non_coding_transcript_exon_variant&non_coding_transcript_variant|||117581|TWIST2|NM_001271893.1_dupl8|1/1||||mRNA|||||||||", "non_coding_transcript_exon_variant&non_coding_transcript_variant|||117581|TWIST2|NM_001271893.1_dupl8|1/1||||mRNA|||||||||,?|||117581|TWIST2|NM_001271893.1|1/1||||protein_coding|YES||||||||", "?|||115286|SLC25A26|NM_173471.3|1/1||||protein_coding|YES||||||||", "|||ENSG00000138190|EXOC6|ENST00000260762||||-/804|protein_coding,|||ENSG00000138190|EXOC6|ENST00000371547||||-/820|protein_coding,|||ENSG00000138190|EXOC6|ENST00000443748||||-/701|protein_coding,NMD_transcript_variant|||ENSG00000138190|EXOC6|ENST00000495132||||-/404|nonsense_mediated_decay,|||ENSG00000138190|EXOC6|ENST00000371552||||-/799|protein_coding", "|||ENSG00000013503|POLR3B|ENST00000539066||||-/1075|protein_coding,nc_transcript_variant|||ENSG00000013503|POLR3B|ENST00000549195|||||processed_transcript,|||ENSG00000013503|POLR3B|ENST00000549569||||-/170|protein_coding,|||ENSG00000013503|POLR3B|ENST00000228347||||-/1133|", "|||ENSG00000147202|DIAPH2|ENST00000373054||||-/1097|protein_coding,|||ENSG00000147202|DIAPH2|ENST00000355827||||-/1096|protein_coding,|||ENSG00000147202|DIAPH2|ENST00000324765||||-/1101|protein_coding,|||ENSG00000147202|DIAPH2|ENST00000373049||||-/1096|protein_coding,|||ENSG00000147202|DIAPH2|ENST00000373061||||-/1101|protein_coding", ] import sys for cs in csqs: for c in cs.split(","): v = VEP(c, keys) assert v.impact_severity in ('LOW', 'MEDIUM', 'HIGH') def test_empty_snpeff(): keys = [x.strip() for x in 'Effect | Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank | Genotype_Number | ERRORS | WARNINGS'.split("|")] eff = "(MODIFIER||||||||||A|ERROR_CHROMOSOME_NOT_FOUND)" v = OldSnpEff(eff, keys) assert v.impact_severity == "LOW", v.impact_severity def test_protein_contact(): ann = SnpEff('C|protein_protein_contact|HIGH|C1orf170|ENSG00000187642|transcript|ENST00000433179|protein_coding|3/5|c.1252A>G|p.Ser418Gly|1252/3064|1252/2091|418/696||') assert ann.impact_severity == "HIGH" def test_gemini_issue812(): ann = VEP('protein_altering_variant|caGCAGCAGCAGCAGCAACAGCAG/caA|QQQQQQQQ/Q|ENSG00000204842|ATXN2|ENST00000608853|1/25|||14-21/1153|protein_coding|', keys="Consequence|Codons|Amino_acids|Gene|SYMBOL|Feature|EXON|PolyPhen|SIFT|Protein_position|BIOTYPE|CANONICAL".split("|")) assert ann.is_coding def test_bug_vcf2db_21(): ann = VEP('synonymous_variant|tcA/tcG|S|ENSG00000186092|OR4F5|ENST00000335137|1/1|||60/305|protein_coding||Low_complexity_(Seg):seg&Transmembrane_helices:TMhelix&Prints_domain:PR00237&Superfamily_domains:SSF81321&Gene3D:1.20.1070.10&hmmpanther:PTHR26451&hmmpanther:PTHR26451:SF72&PROSITE_profiles:PS50262||||ENST00000335137.3:c.180A>G|ENST00000335137.3:c.180A>G(p.%3D)|||-0.817044|0.039', keys="Consequence|Codons|Amino_acids|Gene|SYMBOL|Feature|EXON|PolyPhen|SIFT|Protein_position|BIOTYPE|CANONICAL|DOMAINS|CLIN_SIG".split("|")) assert ann.codon_change == "tcA/tcG", ann.codon_change def test_32(): keys = "Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|REFSEQ_MATCH|SOURCE|GIVEN_REF|USED_REF|GENE_PHENO|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|AA_AF|EA_AF|gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|MAX_AF|MAX_AF_POPS|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|MaxEntScan_alt|MaxEntScan_diff|MaxEntScan_ref|SpliceRegion".split("|") s = "-|frameshift_variant&start_lost&start_retained_variant|HIGH|HRNR|ENSG00000197915|Transcript|ENST00000368801|protein_coding|2/3||ENST00000368801.2:c.1del|ENSP00000357791.2:p.Met1?|77/9623|1/8553|1/2850|M/X|Atg/tg|rs34061715&COSM111478||-1||deletion|HGNC|HGNC:20846|YES|1|P1|CCDS30859.1|ENSP00000357791|Q86YZ3||UPI00001D7CAD||Ensembl|T|T||||||0.874|0.7337|0.8818|0.9544|0.9592|0.8875|||0.9028|0.7227|0.8276|0.9554|0.9063|0.9541|0.9411|0.9142|0.9069|0.9592|EUR||0&1|0&1|||||||||,-|intron_variant&non_coding_transcript_variant|MODIFIER|FLG-AS1|ENSG00000237975|Transcript|ENST00000420707|antisense_RNA||1/8|ENST00000420707.5:n.159-25632del|||||||rs34061715&COSM111478||1||deletion|HGNC|HGNC:27913||5||||||||Ensembl|T|T|||||10|0.874|0.7337|0.8818|0.9544|0.9592|0.8875|||0.9028|0.7227|0.8276|0.9554|0.9063|0.9541|0.9411|0.9142|0.9069|0.9592|EUR||0&1|0&1|||||||||,-|intron_variant&non_coding_transcript_variant|MODIFIER|FLG-AS1|ENSG00000237975|Transcript|ENST00000593011|antisense_RNA||1/3|ENST00000593011.5:n.296+54843del|||||||rs34061715&COSM111478||1||deletion|HGNC|HGNC:27913||4||||||||Ensembl|T|T|||||10|0.874|0.7337|0.8818|0.9544|0.9592|0.8875|||0.9028|0.7227|0.8276|0.9554|0.9063|0.9541|0.9411|0.9142|0.9069|0.9592|EUR||0&1|0&1|||||||||,-|frameshift_variant&start_lost&start_retained_variant|HIGH|HRNR|388697|Transcript|NM_001009931.2|protein_coding|2/3||NM_001009931.2:c.1del|NP_001009931.1:p.Met1?|80/9632|1/8553|1/2850|M/X|Atg/tg|rs34061715&COSM111478||-1||deletion|EntrezGene|HGNC:20846|YES||||NP_001009931.1||||rseq_mrna_match|RefSeq|T|T||||||0.874|0.7337|0.8818|0.9544|0.9592|0.8875|||0.9028|0.7227|0.8276|0.9554|0.9063|0.9541|0.9411|0.9142|0.9069|0.9592|EUR||0&1|0&1|||||||||".split(",") for e in s: eff = VEP(e, keys=keys) if not "intron" in e.lower(): assert eff.impact_severity == "HIGH", (eff.impact_severity, e) geneimpacts-0.3.7/geneimpacts/tests/vep-csqs.txt.gz000066400000000000000000000141141347124224400224060ustar00rootroot000000000000004Zvep-csqs.txt[sȶL=B@͋KCm8V>S?t [FBcM3đOR{mv3I.lrf2iA(u.8e`xr{;~$gjO-0K0:R"*3ݟk{f׳,zqgc`T^>d1O.o&ԕId:nv# <_FRߏF'Ifr9q=K<%| zLv#$5ZcHrp؛"o'v*%bDR=k#j$j6r (~=y%g".aI*9Pq`Ɯx:vNvqu{llѽDrC!{a0DB悲t5v557dq(68_?\%9;}cbnjhHmv2Mw@o]f&?oWOQ%AcAz}&a?h vkv8gݢrbʗG?pT_wrEypZ Hp2W~Lgc;)YXJdu+)eJ6X^ʻtI&0rٷs|l|FVĪ!UIJq(aΛ @ғzҗHcA{OFoOaDWEqDQ`_q99E0i:Qw0:Nۃ3\;,T^9sr@ZDn\%iUQf'<{^!q8n*~wϋђDUk%8qM/;Dy.x(ݗ2`._Fe>EM E~t!~>Ik>YB;62B9]x._)k#cAC]1Bt}6B0QjRz+ZXg=Į ǿd [hah!QЍya@J.BTCv: P  Z.&KJv9EzwX!c(lpCSI\. !JyMEzeıY(QR\uK'/t=,rT dfP= Mf_ }yvcyzt"cAC$CʛzxL2 lYFaqnw-ċ )Ľ+AEP0Ø0__{4wO1$ I H_XcW@):9p}̈X]M6k ٺz9TûܝQO?GbXA7 NYOo_FdA ۽A;2wIѣJ^VdEc6kg;gAs&荦*mT]vČV[&I;,<'mm;:0JpaE}ZyNv JHayxO+k Ry4υyK('[SPId>oqK\h';Ūj֎Ij!F ( uIАcku{iy vzD-xg]dsN3S m1cSlZŕy76R m[3؊@’5鱞eàMY fF?O/E 5?p{ Q YjGւV+y;e) ʁW^n.kx .QF/!ܬWI= >DPp%B2DqFoqgfGsDwt^##Jn#Y/:A*] GmH/ :^4S7وX 4lj5i4Bb=DsʫsBdwʌ&a~3MqfRBӞWLjyTz[=H,ڦټf6>SwAX<~4ı=]XGzV\|fAe[oQڍ )6 g'=f ZsETz`KLE!YJ2eY:q+fU >|n"Db^3 l`-] Ik?OaզV58/g'͍aӯ5iRٲS$uE"׳4Qpy7*e8M6>qW&9yΜʼ?aum]=S>L\8JNלJn5E;S^uݲ3' 1YݺyhO_q>=S0 b|!ӽ`!|Z!+GI+ I|a+Q$)w^=8Bjo80' "y7E ߶A鰐*VS=KB0K"w4k&6]4)1++!F5CĤ`z4u %)eL-ۏQĿ{`Mm9`S4h ψG՛R1IK@. o pET1F6켋bW(&x/e}FB"D{ +\>V:v+>o+A_r?_SӽR:G`׏UsBB(b"D<'qmcPih /{u{El<}pnC&9e[MS3-0 RF. a#<06v[UEA*#8J7 .nTuk bJ$֟F  2vzؗ7XuERE>.e$*pqlݲA$އ93\}O֪8ƥ4v2iA"ÓB++Ms-=>:t Kb|s$QpRn )7h PnV\]t\KnݨKX)saR0/„K]VN'lh;ZڒZw7BlZ{ItU+AٍQek \)mB]U+! ce&%Y q THYvh:˞yE"N0@IX]"R$]b{0%SAEa;%Sfx;fg6 .|tEjQbF3M&-[AaG>ݏ@~Q“!yt8l''*[wXv0k w̄M͂6Miteʘ: IR%}((ۧ\Gq3բe!洅xf^Og뫟e6Βz $a F 8; { UXafkWI35SG{< Y26*7#tAn:0%lq_*nD bln!bEG]# Ҵ?z&n !k/TҼTOu$<6ʢ2RppQc%1b"LQ/XZdًǪd1H0W`<A|y񙎿 ?}? (@:==O q.R%ڪj`;gftoPv)1J$! Fup`wNY 'g#%Z5/_WwM;H^T(gÄhEbZqqԃ hsϯfksBJWlBj1jHWQD<(y pQ0]v 0Osgyz~ڞI$,=pslu,jK}uS8i'k {'kJ@I__UF87Bgeneimpacts-0.3.7/setup.py000066400000000000000000000022771347124224400155450ustar00rootroot00000000000000import os from setuptools import setup def get_version(): """Get the version info from the mpld3 package without importing it""" import ast with open(os.path.join("geneimpacts", "__init__.py"), "r") as init_file: module = ast.parse(init_file.read()) version = (ast.literal_eval(node.value) for node in ast.walk(module) if isinstance(node, ast.Assign) and node.targets[0].id == "__version__") try: return next(version) except StopIteration: raise ValueError("version could not be located") setup(version=get_version(), name='geneimpacts', description="normalize effects from variant annotation tools (snpEff, VEP)", packages=['geneimpacts', 'geneimpacts.tests'], long_description=open('README.md').read(), author="Brent Pedersen", author_email="bpederse@gmail.com", zip_safe=False, test_suite='nose.collector', include_package_data=True, tests_require=['nose'], classifiers=[ 'Development Status :: 4 - Beta', 'Intended Audience :: Science/Research', 'License :: OSI Approved :: MIT License', 'Topic :: Scientific/Engineering :: Bio-Informatics' ])