pax_global_header00006660000000000000000000000064130200457330014507gustar00rootroot0000000000000052 comment=d3218a8c5437f61eeb44b7e0bb27a8856074ce8f marcelm-sqt-d3218a8c5437/000077500000000000000000000000001302004573300147565ustar00rootroot00000000000000marcelm-sqt-d3218a8c5437/.gitattributes000066400000000000000000000000351302004573300176470ustar00rootroot00000000000000sqt/_version.py export-subst marcelm-sqt-d3218a8c5437/.gitignore000066400000000000000000000002161302004573300167450ustar00rootroot00000000000000*~ *.pyc *.pyo build .pydevproject .project tmp dist .tox sqt.egg-info/ .settings/ sqt/*.so sqt/_*.c *.bak venv/ .ipynb_checkpoints/ MANIFEST marcelm-sqt-d3218a8c5437/MANIFEST.in000066400000000000000000000001301302004573300165060ustar00rootroot00000000000000include sqt/_*.pyx include sqt/_helpers.c include versioneer.py include sqt/_version.py marcelm-sqt-d3218a8c5437/README.md000066400000000000000000000104321302004573300162350ustar00rootroot00000000000000sqt - SeQuencing Tools ====================== *sqt* is a collection of useful command-line tools for working with high-throughput sequencing data. Each *sqt* command is a seperate binary program or script with the prefix `sqt-`. This architecture allows each command to be implemented in any programming language. Simple, one-off scripts can be written in a high-level scripting language such as Python or Perl and later, when performance turns out to be critical, be converted to fast binaries written in a compiled language such as C or C++. Many *sqt* subcommands are currently implemented in Python. For them, a Python package is available with functions for reading and writing FASTA/FASTQ files, computing alignments, quality trimming, etc. We welcome submission of new tools! Since on a technical level, there is almost no defined API, except that subcommands simply need to be callable binaries, all subcommands must observe some guidelines in order to offer a consistent interface. 
For example, the exit code must be zero on success and each tool must offer a `--help` command-line parameter. Project homepage ---------------- License ------- (This is the so-called MIT or X11 license.) * Copyright (c) 2009-2014 Marcel Martin * Copyright (c) 2010,2011 Tobias Marschall * Copyright (c) 2011 Sven Rahmann * Copyright (c) 2012-2013 Johannes Köster Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Dependencies ------------ - Python - [cutadapt](https://github.com/marcelm/cutadapt) - Pysam Preliminary Guidelines ---------------------- All tools (subcommands) must: * accept a `--help` parameter that displays help * give useful exit codes: 0 on success, nonzero on error The sqt python package is planned to: * be compatible with Python 2.6 and higher, including Python 3 * be backwards-compatible with older versions of sqt. Backwards-compatibility is measured by running the tests: If they pass without changes to the tests, everything is ok. 
List of Tools ------------- `sqt-coverage` -- Compute per-reference statistics such as coverage and GC content `sqt-fastqmod` -- FASTQ modifications: shorten, subset, reverse complement, quality trimming. `sqt-fastastats` -- Compute N50, min/max length, GC content etc. of a FASTA file `sqt-qualityguess` -- Guess quality encoding of one or more FASTA files. `sqt-globalalign` -- Compute a global or semiglobal alignment of two strings. `sqt-chars` -- Count length of the first word given on the command line. `sqt-sam-cscq` -- Add the CS and CQ tags to a SAM file with colorspace reads. `sqt-fastamutate` -- Add substitutions and indels to sequences in a FASTA file. `sqt-fastaextract` -- Efficiently extract one or more regions from an indexed FASTA file. `sqt-translate` -- Replace characters in FASTA files (like the 'tr' command). `sqt-sam-fixn` -- Replace all non-ACGT characters within reads in a SAM file. `sqt-sam-insertsize` -- Mean and standard deviation of paired-end insert sizes. `sqt-sam-set-op` -- Set operations (union, intersection, ...) on SAM/BAM files. `sqt-bam-eof` -- Check for the End-Of-File marker in compressed BAM files. `sqt-checkfastqpe` -- Check whether two FASTQ files contain correctly paired paired-end data. 
marcelm-sqt-d3218a8c5437/setup.cfg000066400000000000000000000002261302004573300165770ustar00rootroot00000000000000[versioneer] VCS = git style = pep440 versionfile_source = sqt/_version.py versionfile_build = sqt/_version.py tag_prefix = v parentdir_prefix = sqt- marcelm-sqt-d3218a8c5437/setup.py000066400000000000000000000105121302004573300164670ustar00rootroot00000000000000from glob import glob import os import sys from setuptools import setup, Extension from distutils.version import LooseVersion from distutils.command.sdist import sdist as _sdist from distutils.command.build_ext import build_ext as _build_ext import versioneer MIN_CYTHON_VERSION = '0.17' if sys.version_info < (3, 4): sys.stdout.write("At least Python 3.4 is required.\n") sys.exit(1) def no_cythonize(extensions, **_ignore): """ Change file extensions from .pyx to .c or .cpp. Copied from Cython documentation """ for extension in extensions: sources = [] for sfile in extension.sources: path, ext = os.path.splitext(sfile) if ext in ('.pyx', '.py'): if extension.language == 'c++': ext = '.cpp' else: ext = '.c' sfile = path + ext sources.append(sfile) extension.sources[:] = sources def check_cython_version(): """exit if Cython not found or out of date""" try: from Cython import __version__ as cyversion except ImportError: sys.stdout.write( "ERROR: Cython is not installed. 
Install at least Cython version " + str(MIN_CYTHON_VERSION) + " to continue.\n") sys.exit(1) if LooseVersion(cyversion) < LooseVersion(MIN_CYTHON_VERSION): sys.stdout.write( "ERROR: Your Cython is at version '" + str(cyversion) + "', but at least version " + str(MIN_CYTHON_VERSION) + " is required.\n") sys.exit(1) cmdclass = versioneer.get_cmdclass() versioneer_build_ext = cmdclass.get('build_ext', _build_ext) versioneer_sdist = cmdclass.get('sdist', _sdist) class build_ext(versioneer_build_ext): def run(self): # If we encounter a PKG-INFO file, then this is likely a .tar.gz/.zip # file retrieved from PyPI that already includes the pre-cythonized # extension modules, and then we do not need to run cythonize(). if os.path.exists('PKG-INFO'): no_cythonize(extensions) else: # Otherwise, this is a 'developer copy' of the code, and then the # only sensible thing is to require Cython to be installed. check_cython_version() from Cython.Build import cythonize self.extensions = cythonize(self.extensions) versioneer_build_ext.run(self) class sdist(versioneer_sdist): def run(self): # Make sure the compiled Cython files in the distribution are up-to-date from Cython.Build import cythonize check_cython_version() cythonize(extensions) versioneer_sdist.run(self) cmdclass['build_ext'] = build_ext cmdclass['sdist'] = sdist extensions = [ Extension('sqt._helpers', sources=['sqt/_helpers.pyx']), ] setup( name = 'sqt', version = versioneer.get_version(), author = 'Marcel Martin', author_email = 'marcel.martin@scilifelab.se', url = 'https://bitbucket.org/marcelm/sqt', description = 'Command-line tools for the analysis of high-throughput sequencing data', license = 'MIT', cmdclass = cmdclass, packages = [ 'sqt', 'sqt.io', 'sqt.commands' ], entry_points = {'console_scripts': [ 'sqt = sqt.__main__:main', #'sqt-addadapt = sqt.commands.addadapt:main', #'sqt-bam2fastq = sqt.commands.bam2fastq:main', #'sqt-bamstats = sqt.commands.bamstats:main', #'sqt-checkfastqpe = 
sqt.commands.checkfastqpe:main', #'sqt-checkvcfref = sqt.commands.checkvcfref:main', #'sqt-compare-sequences = sqt.commands.compare_sequences:main', #'sqt-coverage = sqt.commands.coverage:main', #'sqt-fastaextract = sqt.commands.fastaextract:main', #'sqt-fastamutate = sqt.commands.fastamutate:main', #'sqt-fastastats = sqt.commands.fastastats:main', #'sqt-fastxmod = sqt.commands.fastxmod:main', #'sqt-fixbam64 = sqt.commands.fixbam64:main', #'sqt-globalalign = sqt.commands.globalalign:main', #'sqt-histogram = sqt.commands.histogram:main', #'sqt-qualityguess = sqt.commands.qualityguess:main', #'sqt-readcov = sqt.commands.readcov:main', #'sqt-samfixn = sqt.commands.samfixn:main', #'sqt-simreads = sqt.commands.simreads:main', #'sqt-translate = sqt.commands.translate:main', ]}, install_requires = [ 'pysam!=0.9.0', 'cutadapt', 'matplotlib', 'seaborn', 'xopen', ], ext_modules = extensions, test_suite = 'nose.collector', classifiers = [ "Development Status :: 4 - Beta", #Development Status :: 5 - Production/Stable "Environment :: Console", "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", "Natural Language :: English", "Programming Language :: Python :: 3", "Topic :: Scientific/Engineering :: Bio-Informatics" ] ) marcelm-sqt-d3218a8c5437/sqt/000077500000000000000000000000001302004573300155655ustar00rootroot00000000000000marcelm-sqt-d3218a8c5437/sqt/__init__.py000066400000000000000000000005501302004573300176760ustar00rootroot00000000000000from ._version import get_versions __version__ = get_versions()['version'] del get_versions from .args import HelpfulArgumentParser from .io.fasta import ( SequenceReader, FastaReader, FastqReader, FastaWriter, FastqWriter, IndexedFasta, guess_quality_base ) from .io.gtf import GtfReader from .cigar import Cigar # TODO Deprecated from xopen import xopen marcelm-sqt-d3218a8c5437/sqt/__main__.py000066400000000000000000000025711302004573300176640ustar00rootroot00000000000000#!/usr/bin/env python3 """ SeQuencing 
Tools -- command-line tools for working with sequencing data """ __author__ = "Marcel Martin" import logging import importlib from . import HelpfulArgumentParser from . import __version__ from .commands import fastxmod logger = logging.getLogger(__name__) # List of all subcommands. A module of the given name must exist and define # add_arguments() and main() functions. Documentation is taken from the first # line of the module’s docstring. COMMANDS = [ 'align', 'bam2fastq', 'fastxmod', 'qgramfreq', 'chars', 'fastagrep', 'readcov', 'randomseq', 'samsetop', 'bameof', 'readlenhisto', 'cutvect', ] def main(): logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') parser = HelpfulArgumentParser(description=__doc__, prog='sqt') parser.add_argument('--version', action='version', version='%(prog)s ' + __version__) subparsers = parser.add_subparsers() for command_name in COMMANDS: module = importlib.import_module('.commands.' + command_name, 'sqt') subparser = subparsers.add_parser(command_name, help=module.__doc__.split('\n')[1], description=module.__doc__) subparser.set_defaults(func=module.main) module.add_arguments(subparser) args = parser.parse_args() if not hasattr(args, 'func'): parser.error("Please provide a command") else: args.func(args) if __name__ == '__main__': main() marcelm-sqt-d3218a8c5437/sqt/_codons.py000066400000000000000000000015271302004573300175700ustar00rootroot00000000000000 GENETIC_CODE = { 'AAA': 'K', 'AAC': 'N', 'AAG': 'K', 'AAT': 'N', 'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T', 'AGA': 'R', 'AGC': 'S', 'AGG': 'R', 'AGT': 'S', 'ATA': 'I', 'ATC': 'I', 'ATG': 'M', 'ATT': 'I', 'CAA': 'Q', 'CAC': 'H', 'CAG': 'Q', 'CAT': 'H', 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P', 'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R', 'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L', 'GAA': 'E', 'GAC': 'D', 'GAG': 'E', 'GAT': 'D', 'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A', 'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G', 'GTA': 'V', 'GTC': 
'V', 'GTG': 'V', 'GTT': 'V', # 'TAA': stop 'TAC': 'Y', # 'TAG': stop, 'TAT': 'Y', 'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S', # 'TGA': stop 'TGC': 'C', 'TGG': 'W', 'TGT': 'C', 'TTA': 'L', 'TTC': 'F', 'TTG': 'L', 'TTT': 'F' } marcelm-sqt-d3218a8c5437/sqt/_helpers.pyx000066400000000000000000000262101302004573300201310ustar00rootroot00000000000000# kate: syntax Python; from cpython.mem cimport PyMem_Malloc, PyMem_Free from collections import Counter from cython.view cimport array as cvarray import cython from ._codons import GENETIC_CODE DEF START_WITHIN_SEQ1 = 1 DEF START_WITHIN_SEQ2 = 2 DEF STOP_WITHIN_SEQ1 = 4 DEF STOP_WITHIN_SEQ2 = 8 DEF SEMIGLOBAL = 15 DEF ALLOW_WILDCARD_SEQ1 = 1 DEF ALLOW_WILDCARD_SEQ2 = 2 DEF INSERTION_COST = 1 DEF DELETION_COST = 1 DEF MATCH_COST = 0 DEF MISMATCH_COST = 1 DEF WILDCARD_CHAR = b'N' # structure for a DP matrix entry ctypedef struct ScoreEntry: int score int backtrace # insertion means: inserted into seq1 (does not appear in seq2) DEF GAPCHAR = b'\0' @cython.boundscheck(False) def edit_distance(s, t, int maxdiff=-1): """ Return the edit distance between the strings s and t. The edit distance is the sum of the numbers of insertions, deletions, and mismatches that is minimally necessary to transform one string into the other. If maxdiff is not -1, then a banded alignment is performed. In that case, the true edit distance is returned if and only if it is maxdiff or less. Otherwise, a value is returned that is guaranteed to be greater than maxdiff, but which is not necessarily the true edit distance. 
""" cdef int m = len(s) # index: i cdef int n = len(t) # index: j cdef int e = maxdiff cdef int i, j, start, stop, c, prev, smallest cdef bint match cdef bytes s_bytes, t_bytes cdef char* sv cdef char* tv # Return early if string lengths are too different if e != -1 and abs(m - n) > e: return abs(m - n) s_bytes = s.encode() if isinstance(s, unicode) else s t_bytes = t.encode() if isinstance(t, unicode) else t sv = s_bytes tv = t_bytes # Skip identical prefixes while m > 0 and n > 0 and sv[0] == tv[0]: sv += 1 tv += 1 m -= 1 n -= 1 # Skip identical suffixes while m > 0 and n > 0 and sv[m-1] == tv[n-1]: m -= 1 n -= 1 cdef int[:] costs = cvarray(shape=(m+1,), itemsize=sizeof(int), format="i") if e == -1: # Regular (unbanded) global alignment with nogil: for i in range(m + 1): costs[i] = i # compute columns of the alignment matrix (using unit costs) prev = 0 for j in range(1, n+1): prev = costs[0] costs[0] += 1 for i in range(1, m+1): match = sv[i-1] == tv[j-1] c = min( prev + 1 - match, costs[i] + 1, costs[i-1] + 1) prev = costs[i] costs[i] = c else: # Banded alignment with nogil: for i in range(m + 1): costs[i] = i smallest = 0 for j in range(1, n + 1): stop = min(j + e + 1, m + 1) if j <= e: prev = costs[0] costs[0] += 1 smallest = costs[0] start = 1 else: start = j - e prev = costs[start - 1] smallest = maxdiff + 1 for i in range(start, stop): match = sv[i-1] == tv[j-1] c = min( prev + 1 - match, costs[i] + 1, costs[i-1] + 1) prev = costs[i] costs[i] = c smallest = min(smallest, c) if smallest > maxdiff: break if smallest > maxdiff: return smallest return costs[m] #@cython.boundscheck(False) def globalalign(char* s1, char* s2, int flags=0, int match=1, int mismatch=-2, int insertion=-2, int deletion=-2): """ Compute an optimal global or semiglobal alignment between strings s1 and s2. An alignment is optimal if it has maximal score. The optimal score is not returned. Instead, the number of errors is computed and returned. Return ... 
-> (r1, r2, start1, stop1, start2, stop2, errors) TODO This is a direct translation of the C code and should be re-written to make it more readable. (Use Cython's memoryview for the matrix, avoid pointer-like access to p1 and p2.) FIXME THE REMAINDER OF THIS DOCSTRING Return a tuple (row1, row2, start1, stop1, start2, stop2, errors) where row1 and row2 are strings of the same length containing the alignment (an INDEL is marked by a null byte ('\\0'). start1 is the position within row1 at which the part of s1, that is aligned, starts. start2 is the position within row1 at which the part of s1, that is aligned, ends. The same holds for start2, stop2. It is always the case that at least one of start1 and start2 is zero. It is always the case that either stop1==len(row1) or stop2==len(row2) or both (note that len(row1)==len(row2)). This is a property of semiglobal alignments. errors is the number of errors in the alignment. For example, globalalign("SISSI", "MISSISSIPPI") returns: row1 = [ 0, 0, 0, 'S', 'I', 'S', 'S', 'I', 0, 0, 0] row2 = [ 'M', 'I', 'S', 'S', 'I', 'S', 'S', 'I', 'P', 'P', 'I'] start1, stop1 = 0, 5 start2, stop2 = 3, 8 errors = 0 This corresponds to the following alignment: SISSI ||||| MISSISSIPPI """ cdef int m = len(s1) cdef int n = len(s2) # DP Matrix: # s2 (j) # ----------> n # | # s1 (i) | # | # V # m # direction constants for backtrace table cdef int LEFT = 1, UP = 2, DIAG = 3 # the DP matrix is stored column-major cdef ScoreEntry[:,:] columns = cvarray(shape=(m+1, n+1), itemsize=sizeof(ScoreEntry), format="ii") cdef int i, j, bt, score, tmp # initialize first column for i in range(m + 1): columns[i, 0].score = 0 if (flags & START_WITHIN_SEQ1) else i * deletion columns[i, 0].backtrace = UP # initialize first row for j in range(n + 1): columns[0, j].score = 0 if (flags & START_WITHIN_SEQ2) else j * insertion columns[0, j].backtrace = LEFT # fill the entire DP matrix # outer loop goes over columns for j in range(1, n+1): for i in range(1, m+1): bt 
= DIAG score = columns[i-1,j-1].score + (match if (s1[i-1] == s2[j-1]) else mismatch) tmp = columns[i-1,j].score + insertion if tmp > score: bt = UP score = tmp tmp = columns[i,j-1].score + deletion if tmp > score: bt = LEFT score = tmp columns[i,j].score = score columns[i,j].backtrace = bt # initialize best score and its position to the bottomright cell cdef int best_i = m # also: s1stop cdef int best_j = n # also: s2stop cdef int best = columns[m,n].score if flags & STOP_WITHIN_SEQ2: # search also in last row for j in range(n + 1): if columns[m,j].score >= best: best = columns[m,j].score best_i = m best_j = j cdef ScoreEntry* last_column if flags & STOP_WITHIN_SEQ1: # search also in last column #last_column = &(columns[0,n]) for i in range(m + 1): if columns[i,n].score >= best: best_i = i best_j = n best = columns[i,n].score # trace back cdef char* alignment1 = PyMem_Malloc((m+n+4)*sizeof(char)) if not alignment1: raise MemoryError() cdef char* alignment2 = PyMem_Malloc((m+n+4)*sizeof(char)) if not alignment2: PyMem_Free(alignment2) raise MemoryError() cdef char* p1 = alignment1 cdef char* p2 = alignment2 i = m j = n # first, walk from the lower right corner to the # position where we found the maximum score cdef int errors = 0 cdef int gaps_are_errors # if gaps are currently errors, this is 1, otherwise it's 0 gaps_are_errors = 0 if (flags & STOP_WITHIN_SEQ2) else 1 if i == best_i: # we are in the last row while j > best_j: p1[0] = GAPCHAR j -= 1 p2[0] = s2[j] p1 += 1 p2 += 1 errors += gaps_are_errors else: # we are in the last column gaps_are_errors = 0 if (flags & STOP_WITHIN_SEQ1) else 1 while i > best_i: i -= 1 p1[0] = s1[i] p2[0] = GAPCHAR p1 += 1 p2 += 1 errors += gaps_are_errors assert i == best_i and j == best_j # the actual backtracing # The alignments are constructed in reverse # and this is undone afterwards. 
cdef int direction while i > 0 and j > 0: direction = columns[i,j].backtrace if direction == DIAG: i -= 1 j -= 1 if s1[i] != s2[j]: errors += 1 p1[0] = s1[i] p2[0] = s2[j] p1 += 1 p2 += 1 elif direction == LEFT: errors += 1 p1[0] = GAPCHAR j -= 1 p2[0] = s2[j] p1 += 1 p2 += 1 elif direction == UP: i -= 1 p1[0] = s1[i] p2[0] = GAPCHAR errors += 1 p1 += 1 p2 += 1 else: assert False, 'DP table corrupt' cdef int start1 = i if (flags & START_WITHIN_SEQ1) else 0 cdef int start2 = j if (flags & START_WITHIN_SEQ2) else 0 errors += (i - start1) + (j - start2) while j > 0: p1[0] = GAPCHAR j -= 1 p2[0] = s2[j] p1 += 1 p2 += 1 while i > 0: i -= 1 p1[0] = s1[i] p2[0] = GAPCHAR p1 += 1 p2 += 1 assert i == 0 and j == 0 align1 = alignment1[:(p1-alignment1)] align2 = alignment2[:(p2-alignment2)] align1 = align1[::-1] align2 = align2[::-1] PyMem_Free(alignment1) PyMem_Free(alignment2) return (align1, align2, start1, best_i, start2, best_j, errors) def byte_frequencies(bytes s): """Faster replacement for collections.Counter(s) for the case when s is a bytes object. Speed advantage depends on the length of the bytes object. When the length is less than 10, speedup is at least 2x. For length 100, speedup is 14x. For length 1000, speedup is approx 100x. """ cdef int[256] frequencies cdef int i cdef unsigned char c for i in range(256): frequencies[i] = 0 for c in s: frequencies[c] += 1 counter = Counter() for i in range(256): if frequencies[i] > 0: counter[i] = frequencies[i] return counter def expected_errors(str qualities, int base=33): cdef int i, q cdef bytes quals = qualities.encode() cdef char* cq = quals cdef double e = 0.0 for i in range(len(qualities)): q = cq[i] - base e += 10 ** (-q / 10) return e def hamming_distance(unicode s, unicode t): """ Compute hamming distance between two strings. The two strings must have the same length. Return the number of differences between the strings. 
""" cdef Py_ssize_t m = len(s) cdef Py_ssize_t n = len(t) if m != n: raise IndexError("sequences must have the same length") cdef Py_ssize_t e = 0 cdef Py_ssize_t i for i in range(m): if s[i] != t[i]: e += 1 return e # Lookup table that maps nucleotides to their 2-bit representation # and everything else to 255. cdef bytearray _nt_trans = bytearray([255]*256) for frm, to in zip(b'ACGTacgt', b'\x00\x01\x02\x03\x00\x01\x02\x03'): _nt_trans[frm] = to # Lookup table that maps 6-bit encoded codons to amino acids def _make_codon_array(stop_aa='*'): triples = bytearray([ord(stop_aa)]*64) for codon, aa in GENETIC_CODE.items(): b = codon.encode().translate(_nt_trans) index = b[0] * 16 + b[1] * 4 + b[2] triples[index] = ord(aa) return triples cdef bytearray _codon_array = _make_codon_array() def nt_to_aa(s: str): """ Translate a sequence of nucleotides to a sequence of amino acids, using the genetic code. >>> nt_to_aa('AAA') 'K' >>> nt_to_aa('AAATGATGG) 'K*W' """ cdef int i = 0 cdef int j = 0 cdef int v = 0 cdef unsigned char nt0, nt1, nt2 cdef char* nt_trans_ptr = _nt_trans cdef char* codon_array_ptr = _codon_array cdef bytes s_bytes = s.encode() cdef char* b = s_bytes cdef bytearray result = bytearray([0]*((len(s)+2)//3)) cdef char* c = result cdef int n = len(b) for i in range(0, n-2, 3): v = 0 nt0 = nt_trans_ptr[b[i]] nt1 = nt_trans_ptr[b[i+1]] nt2 = nt_trans_ptr[b[i+2]] if nt0 > 3 or nt1 > 3 or nt2 > 3: raise ValueError("Encountered non-nucleotide character in codon {!r}".format(s[i:i+3])) v = nt0 * 16 + nt1 * 4 + nt2 c[j] = codon_array_ptr[v] j += 1 if i < n: c[j] = '*' return result.decode() marcelm-sqt-d3218a8c5437/sqt/_version.py000066400000000000000000000406021302004573300177650ustar00rootroot00000000000000 # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). 
Distribution tarballs (built by setup.py sdist) and build # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. # This file is released into the public domain. Generated by # versioneer-0.16 (https://github.com/warner/python-versioneer) """Git implementation of _version.py.""" import errno import os import re import subprocess import sys def get_keywords(): """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). git_refnames = " (tag: v0.8.0)" git_full = "d3218a8c5437f61eeb44b7e0bb27a8856074ce8f" keywords = {"refnames": git_refnames, "full": git_full} return keywords class VersioneerConfig: """Container for Versioneer configuration parameters.""" def get_config(): """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py cfg = VersioneerConfig() cfg.VCS = "git" cfg.style = "pep440" cfg.tag_prefix = "v" cfg.parentdir_prefix = "sqt-" cfg.versionfile_source = "sqt/_version.py" cfg.verbose = False return cfg class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" LONG_VERSION_PY = {} HANDLERS = {} def register_vcs_handler(vcs, method): # decorator """Decorator to mark a method as the handler for a particular VCS.""" def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f return decorate def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): """Call the given command(s).""" assert isinstance(commands, list) p = None for c in commands: try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git p = 
subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None)) break except EnvironmentError: e = sys.exc_info()[1] if e.errno == errno.ENOENT: continue if verbose: print("unable to run %s" % dispcmd) print(e) return None else: if verbose: print("unable to find command, tried %s" % (commands,)) return None stdout = p.communicate()[0].strip() if sys.version_info[0] >= 3: stdout = stdout.decode() if p.returncode != 0: if verbose: print("unable to run %s (error)" % dispcmd) return None return stdout def versions_from_parentdir(parentdir_prefix, root, verbose): """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both the project name and a version string. """ dirname = os.path.basename(root) if not dirname.startswith(parentdir_prefix): if verbose: print("guessing rootdir is '%s', but '%s' doesn't start with " "prefix '%s'" % (root, dirname, parentdir_prefix)) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") return {"version": dirname[len(parentdir_prefix):], "full-revisionid": None, "dirty": False, "error": None} @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs): """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. 
keywords = {} try: f = open(versionfile_abs, "r") for line in f.readlines(): if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["refnames"] = mo.group(1) if line.strip().startswith("git_full ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["full"] = mo.group(1) f.close() except EnvironmentError: pass return keywords @register_vcs_handler("git", "keywords") def git_versions_from_keywords(keywords, tag_prefix, verbose): """Get version information from git keywords.""" if not keywords: raise NotThisMethod("no keywords at all, weird") refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") refs = set([r.strip() for r in refnames.strip("()").split(",")]) # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d # expansion behaves like git log --decorate=short and strips out the # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". tags = set([r for r in refs if re.search(r'\d', r)]) if verbose: print("discarding '%s', no digits" % ",".join(refs-tags)) if verbose: print("likely tags: %s" % ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix):] if verbose: print("picking %s" % r) return {"version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") return {"version": "0+unknown", "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags"} @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* expanded, and _version.py hasn't already been rewritten with a short version string, meaning we're inside a checked out source tree. """ if not os.path.exists(os.path.join(root, ".git")): if verbose: print("no .git in %s" % root) raise NotThisMethod("no .git directory") GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out = run_command(GITS, ["describe", "--tags", "--dirty", "--always", "--long", "--match", "%s*" % tag_prefix], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. 
git_describe = describe_out # look for -dirty suffix dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: git_describe = git_describe[:git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? pieces["error"] = ("unable to parse git-describe output: '%s'" % describe_out) return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" % (full_tag, tag_prefix)) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) # commit: short hex revision ID pieces["short"] = mo.group(3) else: # HEX: no tags pieces["closest-tag"] = None count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits return pieces def plus_or_dot(pieces): """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" def render_pep440(pieces): """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty Exceptions: 1: no tags. git_describe was just HEX. 
def render_pep440_old(pieces):
    """TAG[.postDISTANCE[.dev0]] .

    The ".dev0" means dirty.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    tag = pieces["closest-tag"]
    if tag and not (pieces["distance"] or pieces["dirty"]):
        # clean build exactly on a tag: the tag is the whole version
        return tag
    # exception #1 (no tag) falls back to the pseudo-tag "0"
    prefix = tag if tag else "0"
    version = prefix + ".post%d" % pieces["distance"]
    if pieces["dirty"]:
        version += ".dev0"
    return version
def render_git_describe_long(pieces):
    """TAG-DISTANCE-gHEX[-dirty].

    Like 'git describe --tags --dirty --always --long'.
    The distance/hash is unconditional.

    Exceptions:
    1: no tags. HEX[-dirty]  (note: no 'g' prefix)
    """
    tag = pieces["closest-tag"]
    if tag:
        parts = [tag, "-%d-g%s" % (pieces["distance"], pieces["short"])]
    else:
        # exception #1: only the bare short hash is available
        parts = [pieces["short"]]
    if pieces["dirty"]:
        parts.append("-dirty")
    return "".join(parts)
cfg = get_config() verbose = cfg.verbose try: return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass try: root = os.path.realpath(__file__) # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. for i in cfg.versionfile_source.split('/'): root = os.path.dirname(root) except NameError: return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to find root of source tree"} try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) return render(pieces, cfg.style) except NotThisMethod: pass try: if cfg.parentdir_prefix: return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) except NotThisMethod: pass return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to compute version"} marcelm-sqt-d3218a8c5437/sqt/align.py000066400000000000000000000136661302004573300172450ustar00rootroot00000000000000import subprocess from collections import Counter from io import StringIO from .io.fasta import FastaReader from .utils import available_cpu_count from sqt._helpers import globalalign, hamming_distance class GlobalAlignment: """A global alignment between two strings""" def __init__(self, s, t, semiglobal=False): if type(s) is str: s = s.encode('ascii') if type(t) is str: t = t.encode('ascii') flags = 15 if semiglobal else 0 # TODO this constant shouldn't be here # globalalign uses scores. Set match to 0 to get the same result as with # edit distance unless we compute semiglobal alignments, where edit # distance does not work. 
def edit_distance(s, t, maxdiff=-1):
    """
    Return the edit distance between the strings s and t.

    The edit distance is the minimal total number of insertions, deletions
    and mismatches needed to transform one string into the other (every
    operation costs 1).

    If maxdiff is not -1, only a band of width maxdiff around the main
    diagonal is computed. The true edit distance is then returned if and
    only if it is maxdiff or less; otherwise the returned value is only
    guaranteed to be greater than maxdiff.
    """
    len_s = len(s)
    len_t = len(t)
    banded = maxdiff != -1
    length_gap = abs(len_s - len_t)
    if banded and length_gap > maxdiff:
        # the length difference alone already exceeds the allowed errors
        return length_gap

    # Only one column of the dynamic programming table is kept;
    # column[row] is the cost of aligning s[:row] to the current prefix of t.
    column = list(range(len_s + 1))

    for col in range(1, len_t + 1):
        t_char = t[col - 1]
        if banded and col > maxdiff:
            # The band no longer touches row 0: start below it and take the
            # diagonal predecessor from the previous column.
            first_row = col - maxdiff
            diagonal = column[first_row - 1]
        else:
            first_row = 1
            diagonal = column[0]
            column[0] += 1
        if banded:
            end_row = min(len_s + 1, col + maxdiff + 1)
        else:
            end_row = len_s + 1
        for row in range(first_row, end_row):
            substitution = diagonal + int(s[row - 1] != t_char)
            # remember the old value: it is the diagonal of the next row
            diagonal = column[row]
            column[row] = min(substitution, diagonal + 1, column[row - 1] + 1)
    return column[-1]
def consensus(aligned, threshold=0.7, ambiguous='N', keep_gaps=False):
    """
    Compute a consensus from multialign() output.

    Idea taken from BioPython's SummaryInfo.dumb_consensus function.

    aligned -- a dict mapping names to sequences or a list of sequences
    threshold -- minimum fraction of the sequences that must share the most
        common character of a column for that character to be used
    ambiguous -- character written for columns in which no character reaches
        the threshold
    keep_gaps -- whether the returned sequence contains gaps (-)

    Columns are formed with zip(), so the consensus is as long as the
    shortest input sequence. An empty input gives an empty string.
    """
    n = len(aligned)
    if hasattr(aligned, 'values'):
        sequences = aligned.values()
    else:
        sequences = aligned
    result = []
    for column in zip(*sequences):
        char, freq = Counter(column).most_common(1)[0]
        if freq / n < threshold:
            result.append(ambiguous)
        elif keep_gaps or char != '-':
            # a winning gap is dropped from the output unless keep_gaps is set
            result.append(char)
    return ''.join(result)
""" # dark foreground colors BLACK = "\x1b[0;30m" RED = "\x1b[0;31m" GREEN = "\x1b[0;32m" BROWN = "\x1b[0;33m" BLUE = "\x1b[0;34m" PURPLE = "\x1b[0;35m" CYAN = "\x1b[0;36m" GRAY = "\x1b[0;37m" # light foreground colors DARKGRAY = "\x1b[1;30m" LIGHTRED = "\x1b[1;31m" LIGHTGREEN = "\x1b[1;32m" YELLOW = "\x1b[1;33m" LIGHTBLUE = "\x1b[1;34m" LIGHTPURPLE = "\x1b[1;35m" LIGHTCYAN = "\x1b[1;36m" WHITE = "\x1b[1;37m" # dark background colors BACKGROUND_BLACK = "\x1b[0;40m" BACKGROUND_BLUE = "\x1b[0;44m" BACKGROUND_GREEN = "\x1b[0;42m" BACKGROUND_CYAN = "\x1b[0;46m" BACKGROUND_RED = "\x1b[0;41m" BACKGROUND_PURPLE = "\x1b[0;45m" BACKGROUND_BROWN = "\x1b[0;43m" BACKGROUND_GRAY = "\x1b[0;47m" # light background colors BACKGROUND_DARKGRAY = "\x1b[1;40m" BACKGROUND_LIGHTBLUE = "\x1b[1;44m" BACKGROUND_LIGHTGREEN = "\x1b[1;42m" BACKGROUND_LIGHTCYAN = "\x1b[1;46m" BACKGROUND_LIGHTRED = "\x1b[1;41m" BACKGROUND_LIGHTPURPLE = "\x1b[1;45m" BACKGROUND_YELLOW = "\x1b[1;43m" BACKGROUND_WHITE = "\x1b[1;47m" # other codes RESET = "\x1b[0m" BOLD_ON = "\x1b[1m" ITALICS_ON = "\x1b[3m" UNDERLINE_ON = "\x1b[4m" INVERSE_ON = "\x1b[7m" STRIKETHROUGH_ON = "\x1b[9m" BOLD_OFF = "\x1b[22m" ITALICS_OFF = "\x1b[23m" UNDERLINE_OFF = "\x1b[24m" INVERSE_OFF = "\x1b[27m" STRIKETHROUGH_OFF = "\x1b[29m" def colored(s, color): """ Enclose string s in ANSI escape codes such that s appears in the given color when printed on a terminal. 
class HelpfulArgumentParser(ArgumentParser):
    """An ArgumentParser that prints full help on errors."""

    def __init__(self, *args, **kwargs):
        # Descriptions are shown as written unless the caller picks another
        # formatter explicitly.
        kwargs.setdefault('formatter_class', RawDescriptionHelpFormatter)
        super().__init__(*args, **kwargs)

    def error(self, message):
        """Print the full help text to stderr, then exit with status 2."""
        self.print_help(sys.stderr)
        self.exit(2, '%(prog)s: error: %(message)s\n' % {
            'prog': self.prog,
            'message': message,
        })
def alignment_iter(read, ref, cigar, gap='-'):
    """
    Walk along an alignment and yield one triple
    (read_char, reference_char, cigar_char) per alignment column.

    For an 'M' operation, cigar_char is '=' when both characters are equal
    and 'X' otherwise. For insertions ('I') the reference character is the
    gap character, for deletions ('D') the read character is. Soft-clipped
    read characters ('S') are consumed but nothing is yielded for them.
    Any other operation raises a ValueError.

    read -- an iterable representing the read
    ref -- an iterable representing the reference sequence
    cigar -- a list of (operator, length) pairs
    gap -- character used on the side of the alignment that has a gap
    """
    read_chars = iter(read)
    ref_chars = iter(ref)
    for op in decoded_ops(cigar):
        if op == 'M':
            read_char = next(read_chars)
            ref_char = next(ref_chars)
            if read_char == ref_char:
                yield (read_char, ref_char, '=')
            else:
                yield (read_char, ref_char, 'X')
        elif op == 'I':
            yield (next(read_chars), gap, 'I')
        elif op == 'D':
            yield (gap, next(ref_chars), 'D')
        elif op == 'S':
            # skip the clipped read character
            next(read_chars)
        else:
            raise ValueError("CIGAR operator {} not supported".format(op))
    # both sequences must have been used up completely
    _assert_at_end(read_chars)
    _assert_at_end(ref_chars)
class Cigar:
    """
    Representation of an alignment in the form of a CIGAR string.

    The operations are kept in the .cigar attribute as a list of
    (operator, length) pairs, where operator is the numeric code of the
    operation (its index in DECODE).

    TODO
    - Rename .cigar attribute to .ops/.opslist/.operations?
    - What should len(Cigar(...)) return?
      Length of .cigar attribute?
      - Length of alignment on reference
    - What should __iter__ do? Should it be .elements()?
    """

    def __init__(self, cigar=''):
        """
        cigar -- either a string such as '3M2I2M', a list of
            (operator, length) tuples, or another Cigar object.

        A list is stored as given (it is not copied); for a Cigar object,
        the new instance refers to the same list of operations.
        """
        if isinstance(cigar, str):
            self.cigar = self.parse(cigar)
        elif isinstance(cigar, Cigar):
            self.cigar = cigar.cigar
        else:
            self.cigar = cigar

    def __eq__(self, other):
        # Anything that has a .cigar attribute is comparable; for all other
        # objects, let Python fall back to its default comparison instead of
        # failing with an AttributeError.
        try:
            other_ops = other.cigar
        except AttributeError:
            return NotImplemented
        return self.cigar == other_ops

    def __ne__(self, other):
        try:
            other_ops = other.cigar
        except AttributeError:
            return NotImplemented
        return self.cigar != other_ops

    def __getitem__(self, key):
        # NOTE(review): for an integer key, the new Cigar wraps a single
        # (operator, length) pair instead of a list of pairs -- confirm
        # whether callers rely on that before changing it.
        return Cigar(self.cigar[key])

    def _as_string(self, join_by=''):
        """
        Format the CIGAR string. join_by is an optional separator.

        >>> Cigar('3M2S')._as_string(join_by=' ')
        '3M 2S'
        """
        return join_by.join(
            '{}{}'.format(length, DECODE[op]) for op, length in self.cigar)

    def __format__(self, format_spec):
        """
        Support format(cigar, '') and format(cigar, ' '). The format
        specification is used as the separator between the elements.
        """
        if format_spec in ('', ' '):
            return self._as_string(join_by=format_spec)
        else:
            raise ValueError(
                "Format specification '{}' not supported".format(format_spec))

    def __str__(self):
        return self._as_string()

    def __repr__(self):
        return "Cigar('{}')".format(str(self))

    @staticmethod
    def parse(cigar_string):
        """
        Parse a CIGAR string and return a list of (operator, length) pairs.
        Spaces are ignored. A ValueError is raised if an operator is not
        preceded by a number, if the string ends with a number or if it
        contains an unknown character.

        >>> Cigar.parse("3S17M8D4M9I3H")
        [(4, 3), (0, 17), (2, 8), (0, 4), (1, 9), (5, 3)]
        >>> Cigar.parse("3S 17M")
        [(4, 3), (0, 17)]
        """
        cigar = []
        n = ''  # This is a string to which digits are appended
        for c in cigar_string:
            if c.isdigit():
                n += c
            elif c in _ENCODE:
                if n == '':
                    raise ValueError("CIGAR string should start with a number.")
                cigar.append((_ENCODE[c], int(n)))
                n = ''
            elif c == ' ':
                continue
            else:
                raise ValueError(
                    'Character "{}" unexpected in CIGAR string.'.format(c))
        if n != '':
            raise ValueError("Unexpected end of CIGAR string.")
        return cigar

    def __add__(self, other):
        """
        Return the concatenation of this CIGAR and another one. If this
        CIGAR ends with the operation that the other one starts with, the
        two elements are merged.

        >>> Cigar('2S1M') + Cigar('3M4S')
        Cigar('2S4M4S')
        """
        if len(other.cigar) == 0:
            return Cigar(self.cigar)
        if len(self.cigar) == 0:
            return Cigar(other.cigar)
        self_last = self.cigar[-1]
        other_first = other.cigar[0]
        # same operation?
        if self_last[0] == other_first[0]:
            return Cigar(
                self.cigar[:-1] +
                [(self_last[0], self_last[1] + other_first[1])] +
                other.cigar[1:])
        return Cigar(self.cigar + other.cigar)

    def split_at_element(self, i, consumed):
        """
        Split a CIGAR. i is the index to the element and consumed is the
        index within the element. Return the pair (left, right). An element
        of length zero is never created: if consumed is zero or equal to the
        length of the element, the element ends up on one side only.

        >>> c = Cigar("3M 1D 6M 2I 4M")
        >>> c.split_at_element(2, 5)
        (Cigar('3M1D5M'), Cigar('1M2I4M'))
        """
        middle_op, middle_length = self.cigar[i]
        assert consumed <= middle_length
        if consumed > 0:
            left = self.cigar[:i] + [(middle_op, consumed)]
        else:
            left = self.cigar[:i]
        if consumed < middle_length:
            right = [(middle_op, middle_length - consumed)] + self.cigar[i+1:]
        else:
            right = self.cigar[i+1:]
        return Cigar(left), Cigar(right)

    def elements(self, numbers=False):
        """
        Yield all operations one by one. If numbers is set to True, the
        operations are returned numerically.

        >>> ''.join(Cigar("3S2I3M").elements())
        'SSSIIMMM'
        >>> list(Cigar("3S2I3M").elements(numbers=True))
        [4, 4, 4, 1, 1, 0, 0, 0]
        """
        if numbers:
            return chain.from_iterable(
                repeat(op, length) for (op, length) in self.cigar)
        else:
            return chain.from_iterable(
                repeat(DECODE[op], length) for (op, length) in self.cigar)

    def alignment_length(self):
        """
        Return the number of bases of the read that are used in the alignment.
        This counts all matches, mismatches and insertions. Clipped bases
        are not counted.
        """
        return sum(length for op, length in self.cigar if op in (M, I, EQ, X))

    def query_length(self, count_clipped='soft'):
        """
        Return the length of the query sequence deduced from the length of
        the operations used in this CIGAR string. Matches, mismatches and
        insertions are counted (M, I, =, X operators).

        The count_clipped parameter determines whether hard- or soft-clipped
        bases are counted (H and S operators). It can be set to None, 'soft'
        or 'hard'; any other value raises a ValueError.

        - None: Do not count any clipped bases. Only the bases actually
          aligned to the reference are counted.
        - 'soft': Count also soft-clipped bases. The returned length should
          be identical to the length of the SEQ field in a SAM file that
          follows the specification. From the spec: 'sum of lengths of the
          M/I/S/=/X operations shall equal the length of SEQ'.
        - 'hard': Count both soft- and hard-clipped bases. The returned
          length is the same as the length of the original read.
        """
        if count_clipped is None:
            ops_to_count = (M, I, EQ, X)
        elif count_clipped == 'soft':
            ops_to_count = (M, I, EQ, X, S)
        elif count_clipped == 'hard':
            ops_to_count = (M, I, EQ, X, S, H)
        else:
            raise ValueError(
                "count_clipped must be either None, 'soft' or 'hard'")
        return sum(length for op, length in self.cigar if op in ops_to_count)

    def reference_length(self):
        """
        Return the length of the reference sequence deduced from the length
        of the operations used in the CIGAR string. This counts the
        M, D, N, X, = operations.
        """
        return sum(length for op, length in self.cigar
                   if op in (M, D, EQ, X, N))

    def _clipping_length(self, where, op):
        """
        Return the length of the first or last element if its operator
        is op, and 0 otherwise (also for an empty CIGAR).

        where == 0: beginning
        where == -1: end
        """
        if not self.cigar:
            return 0
        elem = self.cigar[where]
        if elem[0] == op:
            return elem[1]
        return 0

    @staticmethod
    def _clipping_length_both(elements):
        """
        Return the total length of the leading run of hard- and soft-clipping
        elements in the given list of (operator, length) pairs.
        """
        if not elements:
            return 0
        n = 0
        for op, length in elements:
            if op == H or op == S:
                n += length
            else:
                break
        return n

    @property
    def soft_clipping_left(self):
        """
        Return length of soft-clipping in the beginning of the alignment.
        If the alignment is hard-clipped, return 0.
        """
        return self._clipping_length(0, S)

    @property
    def soft_clipping_right(self):
        """
        Return length of soft-clipping in the end of the alignment.
        If the alignment is hard-clipped, return 0.
        """
        return self._clipping_length(-1, S)

    @property
    def hard_clipping_left(self):
        """
        Return length of hard-clipping in the beginning of the alignment.
        """
        return self._clipping_length(0, H)

    @property
    def hard_clipping_right(self):
        """
        Return length of hard-clipping in the end of the alignment.
        """
        return self._clipping_length(-1, H)

    @property
    def clipping_left(self):
        """
        Return the length of soft- and hard-clipping in the beginning of the
        alignment
        """
        return self._clipping_length_both(self.cigar)

    @property
    def clipping_right(self):
        """
        Return the length of soft- and hard-clipping in the end of the
        alignment
        """
        return self._clipping_length_both(self.cigar[::-1])
def encode(s):
    """
    Given a sequence of nucleotides, convert them to
    color space. Only uppercase characters are allowed.

    The result starts with the first nucleotide, followed by one color for
    each pair of adjacent characters. s may be a str or a bytes object; the
    result has the same type. An empty sequence is returned unchanged.

    >>> encode("ACGGTC")
    'A13012'
    """
    if not s:
        return s
    colors = [ENCODE[s[i:i+2]] for i in range(len(s) - 1)]
    # Slices (s[:1], s[:0]) are used instead of s[0] and '' so that bytes
    # input works: indexing a bytes object gives an int, which cannot be
    # concatenated with the colors.
    return s[:1] + s[:0].join(colors)
def main():
    """
    Command-line entry point: read the FASTA file, insert the (optionally
    mutated) adapter into a random subset of the reads and print all reads
    in FASTA format to standard output.
    """
    parser = HelpfulArgumentParser(description=__doc__)
    arg = parser.add_argument
    arg("--seed", type=int, default=None,
        help="seed for random number generator")
    arg("--erate", type=float, default=None,
        help="error rate for simulated errors, no indels (default: no errors)")
    arg("--adapter", "-a", default='GCCTAACTTCTTAGACTGCCTTAAGGACGT',
        help="Adapter sequence")
    # type=float is required: without it, a value given on the command line
    # stays a str and the comparison with random() below raises a TypeError.
    arg("--probability", "-p", type=float, default=0.5,
        help="Fraction of reads (approximate) that should contain the adapter")
    arg("fasta", help="Input FASTA file. Use '-' for standard input.")
    args = parser.parse_args()
    if args.seed is not None:
        seed(args.seed)
    adapter = args.adapter
    adapter_prob = args.probability
    with FastaReader(args.fasta) as reader:
        for read in reader:
            # random() is drawn first for every read, so the output for a
            # given seed does not change. Empty reads are written out
            # unmodified because randint(0, -1) would raise a ValueError.
            if random() < adapter_prob and len(read) > 0:
                if args.erate is not None:
                    a = mutate_sequence(adapter, args.erate, indels=False)
                else:
                    a = adapter
                read_length = len(read)
                pos = randint(0, read_length - 1)
                seq = read.sequence
                seq = seq[:pos] + a + seq[pos:]
                # cut back to the original length
                read.sequence = seq[:read_length]
                # the annotated position is 1-based
                read.name = read.name + ' adapterpos={}'.format(pos+1)
            print('>{}\n{}'.format(read.name, read.sequence))
def print_numbers(sequence1, sequence2, alignment, overlap):
    """
    Print a summary of an alignment, one "label value" line per number,
    preceded by an empty line.

    sequence1, sequence2 -- the aligned sequences (only their lengths are used)
    alignment -- object with an errors attribute; start1, stop1, start2 and
        stop2 are read if overlap is true; matches is printed if it exists
    overlap -- whether to print the lengths of the overlapping parts
    """
    def rows():
        # Lazily evaluated, so each value is computed right before it is
        # printed.
        yield 'Length of sequence 1:', len(sequence1)
        yield 'Length of sequence 2:', len(sequence2)
        yield 'Errors in alignment:', alignment.errors
        if overlap:
            yield ('Length of overlap in sequence 1:',
                   alignment.stop1 - alignment.start1)
            yield ('Length of overlap in sequence 2:',
                   alignment.stop2 - alignment.start2)
        if hasattr(alignment, 'matches'):
            yield 'Matches:', alignment.matches

    print()
    for label, value in rows():
        print(label, value)
def extract_read(aligned_read, aligned_segment, samfile):
    """
    Find a segment of the read that is not hard clipped.

    If aligned_segment itself has no hard clipping, it is returned. Otherwise,
    for each supplementary alignment in aligned_read, the segments that
    samfile.fetch() returns for its position are searched for one that has
    the same query name and is not hard clipped. Return None if there is no
    such segment.
    """
    def lacks_hard_clipping(segment):
        ops = Cigar(segment.cigar)
        return ops.hard_clipping_left == 0 and ops.hard_clipping_right == 0

    if lacks_hard_clipping(aligned_segment):
        return aligned_segment

    for supplementary in aligned_read:
        reference = supplementary.reference_name
        position = supplementary.pos
        candidates = samfile.fetch(
            reference, position, position+1, multiple_iterators=True)
        for candidate in candidates:
            if candidate.query_name != aligned_segment.query_name:
                continue
            if lacks_hard_clipping(candidate):
                return candidate
    return None
def collect_regions(region_specifications, bedpaths):
    """
    Yield the regions described by BED files and by region specifications.

    Regions parsed from each file in bedpaths come first (in the given
    order), followed by one Region per entry of region_specifications.

    Special case: if neither specifications nor bedpaths are given,
    a single None is yielded.
    """
    if not (region_specifications or bedpaths):
        yield None
        return
    for bed_path in bedpaths:
        with open(bed_path) as bed_file:
            yield from parse_bed(bed_file)
    for specification in region_specifications:
        yield Region(specification)
def bam_eof_is_ok(f):
    """
    Check whether BAM file f contains the 'magic' end-of-file marker.

    Adapted from samtools function bgzf_check_EOF (in bgzf.c).

    f must be a seekable file object opened in binary mode. Return False
    if seeking to the marker position fails with IOError; otherwise return
    whether the bytes read from that position equal the marker.
    """
    eof_marker = b"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"
    marker_length = len(eof_marker)  # 28 bytes
    try:
        # whence=2: the offset is relative to the end of the file
        f.seek(-marker_length, 2)
    except IOError:
        return False
    trailer = f.read(marker_length)
    return trailer == eof_marker
def print_subalignment_histogram(number_alignments):
    """
    Print a histogram of the number of subalignments per read.

    number_alignments -- a mapping from number of subalignments to the
        number of reads with that many subalignments (e.g. a Counter)

    One row is printed for each number up to 10, in ascending order.
    All larger numbers are summed up in a single '>10' row, which is
    omitted if its count is zero.
    """
    print('Histogram of number of subalignments')
    rest = 0
    # Sort explicitly: a Counter iterates in insertion order, which would
    # otherwise print the rows in the order the values were first seen.
    for number, count in sorted(number_alignments.items()):
        if number <= 10:
            print(' {:2} {:9}'.format(number, count))
        else:
            rest += count
    if rest > 0:
        print('>10 {:9}'.format(rest))
    print()
def print_cigar_usage(counter):
    """
    Print how often each CIGAR operator occurs, absolute and as a fraction.

    counter -- maps the numeric CIGAR operator code (the index into
        'MIDNSHPX=') to the summed length of that operation. It is indexed
        with codes that may be absent, so it should behave like a Counter.

    If nothing was counted at all, every fraction is reported as zero
    instead of failing with a ZeroDivisionError.
    """
    header("CIGAR operator usage")
    total_ops = sum(counter.values())
    ops = 'MIDNSHPX='
    for op_i, op in enumerate(ops):
        count = counter[op_i]
        fraction = count / total_ops if total_ops else 0.0
        print("{:2} {:14,d} ({:7.2%})".format(op, count, fraction))
    print()
def main():
    """
    Parse the command line, read the SAM/BAM file and print the report.

    Every record counts towards the "all entries" totals. Unmapped records
    and records below the minimum mapping quality are skipped afterwards.
    CIGAR operations are counted for all remaining records, but an
    AlignedRead is only created for records that are neither secondary
    nor supplementary.
    """
    parser = HelpfulArgumentParser(description=__doc__)
    parser.add_argument('--quality', '-q', type=int, default=0,
        help='Minimum mapping quality (default: %(default)s)')
    parser.add_argument('--minimum-reference-length', metavar='N', type=int, default=0,
        help='For reference usage statistics, ignore references shorter than N.')
    parser.add_argument('--limit', metavar='N', type=int, default=None,
        help='Process only the first N entries in the input file.')
    parser.add_argument('--cover', metavar='FILE', default=None,
        help='Print report about "read coverage" (which sections are aligned) to FILE')
    parser.add_argument('--minimum-cover-fraction', metavar='FRACTION', type=float, default=0.01,
        help='Alignment must cover at least FRACTION of the read to appear in the cover report. (%(default)s)')
    parser.add_argument("bam", metavar="SAM/BAM", help="Name of a SAM or BAM file")
    args = parser.parse_args()

    # Count how often each CIGAR operator occurs
    cigar_counter = Counter()
    n_records = 0
    unmapped = 0
    unmapped_bases = 0
    aligned_reads = []
    with Samfile(args.bam) as sf:
        for record in islice(sf, 0, args.limit):
            n_records += 1
            if record.is_unmapped:
                unmapped += 1
                unmapped_bases += len(record.seq)
                continue
            if record.mapq < args.quality:
                continue
            assert record.cigar is not None
            for op_i, length in record.cigar:
                cigar_counter[op_i] += length
            if not record.is_secondary and not record.is_supplementary:
                aligned_read = AlignedRead(record, sf.getrname(record.tid))
                aligned_reads.append(aligned_read)

        reflengths = sf.lengths
        nreferences = sf.nreferences
        assert nreferences == len(reflengths)
        reference_lengths = dict(zip(sf.references, sf.lengths))

    total_aligned_bases = 0
    reference_hits = defaultdict(int)
    for aligned_read in aligned_reads:
        total_aligned_bases += aligned_read.aligned_bases()
        for alignment in aligned_read.alignments:
            reference_hits[alignment.reference_name] += 1

    if args.cover is not None:
        with open(args.cover, 'wt') as cover:
            for aligned_read in aligned_reads:
                print_coverage_report(aligned_read, cover, args.minimum_cover_fraction)

    header('All entries in input file')
    print('Total entries: {:10,d}'.format(n_records))
    print('Unmapped: {:10,d}'.format(unmapped))
    print('Unmapped bases: {:10,d}'.format(unmapped_bases))
    print()

    if not aligned_reads:
        # The reports below divide by the number of reads and bases and
        # would fail with a ZeroDivisionError on an empty list.
        print('No aligned reads passed the filters; no further statistics available.')
        return

    print_basics(aligned_reads, total_aligned_bases)
    print_subalignment_stats(aligned_reads, len(aligned_reads))  # TODO is len(aligned_reads) actually the number of unique reads?
    print_cigar_usage(cigar_counter)
    print_reference_usage(reference_lengths, reference_hits,
        minimum_reference_length=args.minimum_reference_length)
""" import sys from itertools import zip_longest from sqt.io.fasta import FastqReader from sqt import HelpfulArgumentParser __author__ = "Marcel Martin" def check(fastq1, fastq2, limit=None): """ Raise ValueError if given FASTQ files seem to be paired incorrectly. """ with FastqReader(fastq1) as fastq1: with FastqReader(fastq2) as fastq2: n = 1 for record1, record2 in zip_longest(fastq1, fastq2): if record1 is None: raise ValueError("File 2 is longer than file 1") if record2 is None: raise ValueError("File 1 is longer than file 2") name1 = record1.name.split(' ', 1)[0] if name1.endswith("/1"): name1 = name1[:-2] name2 = record2.name.split(' ', 1)[0] if name2.endswith("/2"): name2 = name2[:-2] if name1 != name2: raise ValueError("Incorrect names at record no. {}: '{}' != '{}'".format(n, name1, name2)) if n == limit: return n += 1 def main(): parser = HelpfulArgumentParser(description=__doc__) parser.add_argument("-n", "--limit", type=int, default=1000, help="Only check the first N records. Set to zero to check the entire file (default: %(default)s)") parser.add_argument("--quiet", "-q", action='store_true', default=False, help="Don't print anything, just set the exit code.") parser.add_argument('fastq1', metavar='FASTQ1', help='File with first read of paired-end reads') parser.add_argument('fastq2', metavar='FASTQ2', help='File with second read of paired-end reads') args = parser.parse_args() if not args.quiet: def message(*args, **kwargs): print(*args, **kwargs) else: def message(*args, **kwargs): pass try: check(args.fastq1, args.fastq2, args.limit) except ValueError as e: message(e) sys.exit(1) message("OK") if __name__ == '__main__': main() marcelm-sqt-d3218a8c5437/sqt/commands/checkvcfref.py000066400000000000000000000023551302004573300222160ustar00rootroot00000000000000#!/usr/bin/env python3 """ Read in a file with variants (typically VCF) and check if the given reference allele matches what is actually in the reference sequence at the given position. 
""" import sys import csv from sqt import HelpfulArgumentParser, IndexedFasta __author__ = "Marcel Martin" def main(): parser = HelpfulArgumentParser(description=__doc__) arg = parser.add_argument arg("reference", help="Reference file in FASTA format. An associated .fai index file must exist.") arg("variantfile", help="Input (VCF) file with variants.") args = parser.parse_args() reference = IndexedFasta(args.reference) with open(args.variantfile) as f: reader = csv.reader([row for row in f if row[0] != '#'], delimiter='\t') n = 0 for row in reader: n += 1 chrom = row[0] if chrom.startswith('chr'): chrom = chrom[3:] pos = int(row[1]) - 1 ref = row[3] actual = reference.get(chrom)[pos:pos+len(ref)].decode() if not actual == ref: print('Problem with record no.', n, 'found:') print('CHROM={} POS={} REF={}'.format(chrom, pos+1, ref)) print('Reference sequence in FASTA file is:', actual) sys.exit(1) else: print((n, 'records checked, everything ok.')) sys.exit(0) if __name__ == '__main__': main() marcelm-sqt-d3218a8c5437/sqt/commands/compare_sequences.py000066400000000000000000000113101302004573300234350ustar00rootroot00000000000000#!/usr/bin/env python3 """ Print a comparison matrix for sequences from a FASTA or text file. All pairs of sequences are compared using the given comparison measure. By default, this is edit distance, the minimum number of substitutions, insertions, and deletions to transform one sequence into the other. For the resulting matrix, a header row and column is printed; this can be disabled by the options below. The given list of sequences can be automatically extended by their reverse complements, appending a suffix like _RC to each sequence's name. 
""" __author__ = "Sven Rahmann" import sys from sqt import HelpfulArgumentParser from sqt.io.fasta import SequenceReader from sqt.align import edit_distance from sqt.dna import reverse_complement as revcomp def get_argument_parser(): parser = HelpfulArgumentParser(description=__doc__) parser.add_argument("file", help="file with sequences to compare") parser.add_argument("-c", "--compareby", default="editdistance", choices=("none", "identity", "editdistance"), help="comparison function [default: editdistance]") parser.add_argument("-s", "--show", default="matrix", choices=("none", "matrix", "min", "max"), help="output type (matrix or nearest neighbors)") parser.add_argument("-r", "--revcomp", nargs="?", const="_RC", metavar="SUFFIX", help="compare reverse complements, too; append _RC or argument to each name") parser.add_argument("-f", "--format", choices=("text","fasta","fastq"), help="explicitly specify file format if not FASTA") parser.add_argument("-R", "--norowheaders", action="store_true", help="do not print initial names in each row") parser.add_argument("-C", "--nocolumnheaders", action="store_true", help="do not print first row with column headers") parser.add_argument("-H", "--noheaders", action="store_true", help="both --norowheaders and --nocolumnheaders") parser.add_argument("-F", "--featurename", default="feature", metavar="NAME", help="use the given argument instead of 'feature' for features") parser.add_argument("-S", "--separator", default="\t", metavar="SEPARATOR", help="field separator [default: TAB]") return parser comparison_function = dict( none = lambda s,t: None, identity = lambda s,t: int(s==t), editdistance = edit_distance ) def process_file(fname, fformat): """return list of tuples (name, seq) from file """ if fformat=="text": with open(fname, "rt") as f: return [("seq_"+str(i), line.strip()) for line in f.readlines()] # now it's fasta or fastq seqs = [] for record in SequenceReader(fname): name, _, _ = record.name.partition(" ") seq = 
def main():
    """
    Read the sequences, optionally add reverse complements, and show the result.

    The output function is selected by --show and receives the list of
    (name, sequence) tuples together with the parsed arguments.
    """
    parser = get_argument_parser()
    args = parser.parse_args()
    if args.noheaders:
        # -H is documented as "both --norowheaders and --nocolumnheaders",
        # and the output functions only look at those two flags.
        args.norowheaders = True
        args.nocolumnheaders = True
    sequences = process_file(args.file, args.format)
    if args.revcomp:
        # args.revcomp holds the suffix to append to each name
        suffix = args.revcomp
        sequences.extend([(name + suffix, revcomp(seq)) for (name, seq) in sequences])
    show = show_functions[args.show]
    show(sequences, args)
The output is a tab-separated table with the following columns: chromosome: reference/scaffold/chromosome name gc: GC content (number of found G, C bases divided by the number of found A, C, G, T bases) length: length of the reference non_acgt: number of characters that are not one of A, C, G, T bases: total number of bases aligned to the reference, not counting those aligning to bases that are neither A, C, G nor T avg_cov: average coverage = bases / (length - non_acgt) median_cov: median coverage (as above, coverage over non-ACGT bases is not counted) TODO: - optionally print summary over all reference sequences """ __author__ = 'Marcel Martin' import sys from collections import Counter, namedtuple from pysam import Samfile from sqt import HelpfulArgumentParser from sqt._helpers import byte_frequencies from sqt.math import frequency_median from sqt.io.fasta import IndexedFasta Info = namedtuple('Info', 'length acgt_length bases acgt_bases median_coverage acgt_median_coverage gc') def collect_info(samfile, fasta, name, mask): sequence = fasta.get(name)[:] length = len(sequence) freqs = byte_frequencies(sequence) assert length == samfile.lengths[samfile.gettid(name)] acgt_length = sum(freqs[c] for c in b'ACGTacgt') gc = sum(freqs[c] for c in b'GCgc') # Get coverage acgt_bases = 0 bases = 0 coverages = Counter() acgt_coverages = Counter() for column in samfile.pileup(name, stepper='all', mask=mask): n = column.n if sequence[column.pos] in b'ACGTacgt': acgt_bases += n acgt_coverages[n] += 1 bases += n coverages[n] += 1 # Compute medians assert coverages[0] == 0 assert acgt_coverages[0] == 0 coverages[0] = length - sum(coverages.values()) acgt_coverages[0] = acgt_length - sum(acgt_coverages.values()) median_coverage = frequency_median(coverages) acgt_median_coverage = frequency_median(acgt_coverages) return Info( length=length, acgt_length=acgt_length, bases=bases, acgt_bases=acgt_bases, median_coverage=median_coverage, acgt_median_coverage=acgt_median_coverage, 
def main():
    """
    Print a tab-separated table with one row of statistics per reference.

    The references are taken from the BAM file. GC content and average
    coverage are relative to the number of A/C/G/T characters in the
    reference. If a reference has none, both are reported as 'nan'
    instead of failing with a ZeroDivisionError.
    """
    parser = HelpfulArgumentParser(description=__doc__)
    parser.add_argument('--mask', default='0x504',
        help="Exclude reads in which flags have one of the bits in MASK set. "
            "Default is 0x504, that is, exclude unmapped reads (0x4), "
            "secondary alignments (0x100) and PCR/optical duplicates (0x400).")
    parser.add_argument("fasta", metavar="FASTA", help="path to reference FASTA file")
    parser.add_argument("bam", metavar="BAM", help="path to BAM file")
    args = parser.parse_args()
    try:
        # base=0 lets int() accept prefixed notations such as 0x504
        args.mask = int(args.mask, base=0)
    except ValueError:
        parser.error('Could not interpret mask value "{}"'.format(args.mask))

    print('chromosome', 'gc', 'length', 'non_acgt', 'bases', 'avg_cov', 'median_cov', sep='\t')
    with Samfile(args.bam) as samfile, IndexedFasta(args.fasta) as fasta:
        for name in samfile.references:
            info = collect_info(samfile, fasta, name, args.mask)
            if info.acgt_length > 0:
                gc_content = info.gc / info.acgt_length
                average_coverage = info.acgt_bases / info.acgt_length
            else:
                gc_content = average_coverage = float('nan')
            print(
                name,
                "{:.4f}".format(gc_content),
                info.length,
                info.length - info.acgt_length,
                info.acgt_bases,
                "{:.3f}".format(average_coverage),
                info.acgt_median_coverage,
                sep='\t',
            )
def main(args=None):
    """
    Remove the vector sequence from each read and write the rest as FASTA.

    args -- parsed arguments with the attributes 'vector' and 'reads'.
        If None, the command line is parsed.

    For each read, the first and last 500 bp of the vector are searched
    for, first in the read and then in its reverse complement. A read
    found to be circular is written as one trimmed sequence; otherwise
    the parts before and after the vector are written as two sequences.
    Progress messages go to standard error.
    """
    if args is None:
        # This name is not among the module-level imports, so import it here.
        from .. import HelpfulArgumentParser
        parser = HelpfulArgumentParser(description=__doc__)
        add_arguments(parser)
        args = parser.parse_args()
    vectors = list(FastaReader(args.vector))
    assert len(vectors) == 1, "Only FASTA files with exactly one vector sequence currently supported"
    vector = vectors[0].sequence.upper()
    flags = align.START_WITHIN_SEQ2 | align.STOP_WITHIN_SEQ2
    max_error_rate = 0.1
    prefix = vector[:500]
    suffix = vector[-500:]

    writer = FastaWriter(sys.stdout, line_length=80)
    for read in SequenceReader(args.reads):
        print('Working on', len(read.sequence), 'bp read', repr(read.name), file=sys.stderr)
        seq = read.sequence.upper()
        uncirc = uncircularize(seq)
        if len(uncirc) < len(seq):
            print(' uncircularized length:', len(uncirc), file=sys.stderr)
            circular = True
            seq = uncirc
        else:
            print(' not circular', file=sys.stderr)
            circular = False

        # The vector is either within the sequence:
        # ---XXXXXX---
        # or (due to circularity) a suffix and prefix:
        # XX------XXXX
        # So by appending the sequence to itself, we make certain that a full
        # copy of the vector sequence appears:
        # XX------XXXXXX---...
        if circular:
            s = seq + seq  # TODO 500 would be enough as long as we're only checking prefix and suffix
        else:
            s = seq

        # NOTE(review): a read for which no vector match is found in either
        # orientation is not written at all -- confirm that this is intended.
        for revcomp in False, True:
            if revcomp:
                s = reverse_complement(s)
            # Search for prefix of the vector
            result = align.locate(prefix, s, max_error_rate=max_error_rate, flags=flags)
            if result is None:
                continue
            vstart, vstop, start, _, _, errors = result
            prefix_erate = errors / (vstop - vstart)
            assert prefix_erate <= max_error_rate
            print(' Prefix match found with {:.2%} errors'.format(prefix_erate), file=sys.stderr)

            # Search for suffix of the vector, but not before the prefix
            result = align.locate(suffix, s[start:], max_error_rate=max_error_rate, flags=flags)
            if result is None:
                # As for the prefix, there may be no match at all.
                print(' Suffix match not found.', file=sys.stderr)
                continue
            vstart, vstop, _, stop, _, errors = result
            suffix_erate = errors / (vstop - vstart)
            if suffix_erate > max_error_rate:
                print(' Suffix error rate too large.', file=sys.stderr)
                continue
            print(' Suffix match found with {:.2%} errors'.format(suffix_erate), file=sys.stderr)

            # The stop coordinate is relative to s[start:], so the position
            # right after the vector within s is start + stop.
            vector_end = start + stop
            if circular:
                # We can output a single sequence.
                seq = s[vector_end:start+len(seq)]
                print(' Read trimmed. Length changed from', len(read.sequence), 'to', len(seq), file=sys.stderr)
                writer.write(read.name, seq)
            else:
                # Need to output two separate sequences here.
                seq1 = s[0:start]
                seq2 = s[vector_end:]
                print(' Read split into two. Length changed from', len(read.sequence), 'to', len(seq1), '+', len(seq2), file=sys.stderr)
                # Numbering is switched on purpose since seq1 actually
                # follows seq2 on the reference.
                writer.write(read.name + '-1', seq2)
                writer.write(read.name + '-2', seq1)
            break
def main():
    """
    Extract the requested regions from a FASTA file and print them as FASTA.

    An indexed reader is used if possible. If the index is missing, a
    non-indexed reader is used instead, unless the file is larger than
    1 GiB, in which case an error is printed and the exit code is 1.
    """
    # Compare version tuples: comparing sys.version as a string orders
    # lexicographically (e.g. '10.0' < '2.7').
    if sys.version_info < (2, 7):
        print("Sorry, Python version >= 2.7 required!", file=sys.stderr)
        sys.exit(1)
    parser = HelpfulArgumentParser(description=__doc__)
    parser.add_argument("--width", "-w", type=int, default=80,
        help="Characters per line in output FASTA (default: %(default)s). "
            "Set to 0 to disallow line breaks entirely.")
    parser.add_argument("fasta", metavar="FASTA", help="The FASTA file")
    parser.add_argument("region", metavar="REGION", nargs='+')
    args = parser.parse_args()
    if args.width == 0:
        # A width of zero is passed to the writer as None
        args.width = None
    try:
        fasta = IndexedFasta(args.fasta)
    except FastaIndexMissing:
        if os.path.getsize(args.fasta) > 1024 ** 3:  # 1 GiB
            print("ERROR: The file is very large and no index exists, "
                "please create an index with 'samtools faidx'.", file=sys.stderr)
            sys.exit(1)
        fasta = NonIndexedFasta(args.fasta)

    writer = FastaWriter(sys.stdout, line_length=args.width)
    regions = [Region(s) for s in args.region]
    for region in regions:
        sequence = fasta[region.reference][region.start:region.stop]
        if region.is_reverse_complement:
            sequence = reverse_complement(sequence)
        if sys.version_info[0] >= 3:
            sequence = sequence.decode()
        writer.write(str(region), sequence)
def iupac_to_regex(iupac):
    """
    Convert a IUPAC string with wildcards to a regular expression.

    The input is converted to upper case. A, C, G and T are copied
    unchanged; every wildcard character becomes a character class listing
    the nucleotides it stands for.

    Raise ValueError for any other character.
    """
    expansions = {
        "R": "AG", "Y": "CT", "S": "CG", "W": "AT", "K": "GT", "M": "AC",
        "B": "CGT", "D": "AGT", "H": "ACT", "V": "ACG",
        "N": "ACGT", "X": "ACGT",
    }
    pieces = []
    for symbol in iupac.upper():
        if symbol in "ACGT":
            pieces.append(symbol)
            continue
        alternatives = expansions.get(symbol)
        if alternatives is None:
            raise ValueError("don't know how to handle character %s" % symbol)
        pieces.append("[" + alternatives + "]")
    return "".join(pieces)
import HelpfulArgumentParser, SequenceReader, IndexedFasta from ..dna import n_intervals, intervals_complement from ..math import n50, frequency_median, frequency_n50 __author__ = "Marcel Martin" def byte_frequencies(s): return Counter(s) try: from sqt._helpers import byte_frequencies except: pass def print_statistics(lengths, contig_lengths, shortest=None, longest=None, genome_size=None, character_frequencies=None): """ lengths -- a dictionary that maps length to frequency (Counter object) """ n = sum(lengths.values()) print('No. of sequences: {:11,}'.format(n)) if n == 0: return total = sum(length * count for length, count in lengths.items()) print('Total length: {:13,}'.format(total)) min_length, max_length = min(lengths), max(lengths) if shortest: print('Minimum length: {:13,} in entry "{}"'.format(min_length, shortest)) else: print('Minimum length: {:13,}'.format(min_length)) if longest: print('Maximum length: {:13,} in entry "{}"'.format(max_length, longest)) else: print('Maximum length: {:13,}'.format(max_length)) print('Average length: {:16.2f}'.format(total / n)) print('Median length: {:13,}'.format(frequency_median(lengths))) print('Scaffold N50: {:13,}'.format(frequency_n50(lengths))) if genome_size: print('Scaffold NG50: {:13,}'.format(frequency_n50(lengths, genome_size=genome_size))) if contig_lengths: lengths = contig_lengths min_length, max_length = min(lengths), max(lengths) print() n_contigs = sum(lengths.values()) print('Number of contigs: {:13,}'.format(n_contigs)) total_c = sum(length * count for length, count in lengths.items()) print('Total contig length: {:13,}'.format(total_c)) print('Minimum contig length: {:13,}'.format(min_length)) print('Maximum contig length: {:13,}'.format(max_length)) print('Average contig length: {:16.2f}'.format(total_c / n_contigs)) print('Median contig length: {:13,}'.format(frequency_median(lengths))) print('Contig N50: {:13,}'.format(frequency_n50(lengths))) if character_frequencies: print() print("Character 
distribution ( ):") assert total == sum(character_frequencies.values()) acgt = 0 gc = 0 for upper, lower in (b'Aa', b'Cc', b'Gg', b'Tt'): freq = character_frequencies[upper] + character_frequencies[lower] if upper in b'GC': gc += freq print(chr(upper), ' {:14,} {:6.1%}'.format(freq, freq / total)) acgt += freq other = sum(character_frequencies.values()) - acgt print('other {:14,} {:6.2%}'.format(other, other / total)) print('ACGT {:14,} {:6.2%}'.format(acgt, acgt / total)) print('GC {:14,} {:6.2%} (of ACGT)'.format(gc, gc / (total - other))) def filter_short_intervals(intervals, minimum_length): for start, stop in intervals: if stop - start >= minimum_length: yield (start, stop) def fasta_fastq_iter(path): with SequenceReader(path, 'rb') as reader: for record in reader: seq = record.sequence #.upper() yield (record.name, len(seq), seq) def indexed_fasta_iter(path): with IndexedFasta(path) as f: for index_entry in f.index.values(): yield (index_entry.name, index_entry.length, None) def stats(path, tolerable_gapsize, detailed): """ Determine scaffold lengths, contig lengths and character frequencies. Return a tuple (scaffold_lengths, contig_lengths, character_frequencies). 
""" scaffold_lengths = Counter() contig_lengths = Counter() nucleotides = Counter() # nucleotide frequencies shortest = None longest = None min_length = float('+inf') max_length = -1 if not detailed and os.path.exists(path + '.fai'): it = indexed_fasta_iter(path) else: it = fasta_fastq_iter(path) for (name, length, sequence) in it: scaffold_lengths[length] += 1 if length < min_length: min_length = length shortest = name if length > max_length: max_length = length longest = name if detailed and sequence is not None: nucleotides += byte_frequencies(sequence) intervals = intervals_complement( filter_short_intervals(n_intervals(sequence, ord(b'N')), tolerable_gapsize), length) for start, stop in intervals: contig_lengths[stop - start] += 1 return scaffold_lengths, contig_lengths, shortest, longest, nucleotides def get_argument_parser(): parser = HelpfulArgumentParser(description=__doc__) add = parser.add_argument add('--detailed', '-d', default=False, action='store_true', help='Print information about the sequences themselves, ' 'such as the character distribution and contig N50.') add('--genome-size', '-g', type=int, default=None, help='Estimated genome size. 
If given, also NG50 in addition to N50 is computed.') add('--tolerable-gapsize', '-t', type=int, default=10, help='A stretch of at most this many "N"s is not counted as a gap ' 'separating contigs.') add('fastaq', nargs='+', metavar='FASTA/FASTQ', help='Input FASTA or FASTQ file(s) (may be gzipped).') return parser def main(): parser = get_argument_parser() args = parser.parse_args() overall_frequencies = Counter() overall_lengths = Counter() if not args.detailed: character_frequencies = None contig_lengths = None for path in args.fastaq: print("## File:", path) scaffold_lengths, contig_lengths, shortest, longest, character_frequencies = \ stats(path, args.tolerable_gapsize, detailed=args.detailed) overall_frequencies += character_frequencies print_statistics(scaffold_lengths, contig_lengths, shortest, longest, args.genome_size, character_frequencies) overall_lengths += scaffold_lengths if len(args.fastaq) > 1: print("## Summary of", len(args.fastaq), "files") print_statistics(overall_lengths, None, None, None, args.genome_size, overall_frequencies if args.detailed else None) if __name__ == '__main__': main() marcelm-sqt-d3218a8c5437/sqt/commands/fastxmod.py000066400000000000000000000222421302004573300215670ustar00rootroot00000000000000#!/usr/bin/env python3 """ Modify FASTA and FASTQ files by picking subsets and modifying individual entries. Possible modifications: - Pick a subset of records (given by name). With lots of names, this is faster than 'grep -A 3 --no-group-separator -f readnames.txt file.fastq' magic, which may be used with FASTQ files. If the record name ends in '/1' or '/2', these two charecter are ignored when comparing to the names in the file. - Trim low-quality ends - Trim reads to a given length - Discard reads shorter than a given length - Discard reads in which the expected number of errors exceeds a threshold - Discard reads that contain characters other than those in a given set. 
- Reverse-complement each read - Make sequence characters upper- or lowercase - Convert from FASTA to FASTQ by assigning a fixed quality value to all bases - Convert from FASTQ to FASTA by dropping all quality values - Make read names unique - Pick only the first N sequences. Modifications are done in the order in which they are listed above. The result is written to standard output. The algorithm for quality trimming is the same as the one used by BWA: - Subtract the cutoff value from all qualities. - Compute partial sums from all indices to the end of the sequence. - Trim sequence at the index at which the sum is minimal. """ import sys import errno from collections import defaultdict from itertools import islice import random from sqt import HelpfulArgumentParser, SequenceReader, FastaWriter, FastqWriter from sqt.dna import reverse_complement, mutate from sqt.qualtrim import quality_trim_index as trim_index, expected_errors __author__ = "Marcel Martin" def add_arguments(parser): arg = parser.add_argument arg('--names', metavar='FILE', default=None, help='Keep only records whose name occurs in FILE (one per line)') arg('--not-names', metavar='FILE', default=None, help='Discard records whose name occurs in FILE (one per line)') arg("-q", "--cutoff", type=int, default=None, help="Quality cutoff. Only when input format is FASTQ") arg('--substitute', type=float, default=0, metavar='PROB', help='Randomly substitute bases at probability PROB. 
Default: %(default)s') arg("--length", "-l", type=int, default=None, help="Shorten records to LENGTH (default: do not shorten)") arg('-m', '--minimum-length', type=int, default=0, metavar='LENGTH', help='Discard reads shorter than LENGTH') arg('--max-errors', type=float, default=None, metavar='ERRORS', help='Discard reads whose expected number of errors (computed ' 'from quality values) exceeds ERRORS.') arg('--allowed-characters', default=None, metavar='CHARS', help='Discard ' 'reads that contain characters other than those in CHARS. CHARS is ' 'case-sensitive. Example: -c ACGTacgt.') arg('--reverse-complement', '-r', action='store_true', default=False, help='Reverse-complement each sequence') group = parser.add_mutually_exclusive_group() group.add_argument('--upper', dest='character_case', action='store_const', default=None, const='upper', help='Convert sequence characters to uppercase') group.add_argument('--lower', dest='character_case', action='store_const', default=None, const='lower', help='Convert sequence characters to lowercase') arg('--constant-quality', '-c', metavar='QUALITY', type=int, default=None, help='Set all quality values to QUALITY. Use this to convert from ' 'FASTA to FASTQ.') arg('--fasta', default=False, action='store_true', help='Always output FASTA (drop qualities if input is FASTQ)') arg('--unique-names', action='store_true', default=False, help="Make record names unique by appending _1, _2 etc. when necessary") arg('--limit', '-n', type=int, metavar='N', default=None, help="Pick only the first N sequences (default: all)") arg("--width", "-w", type=int, default=80, help="Characters per line in output FASTA (default: %(default)s). " "Set to 0 to disallow line breaks entirely. This is ignored for FASTQ files.") arg('--seed', type=int, default=None, help='Set random seed for reproducible runs. Relevant when --substitution-rate is used.' 
'(default: use different seed each run)') arg('path', metavar='FASTA/FASTQ', help='input FASTA or FASTQ file') class ReadPicker: def __init__(self, file_with_names, keep=True): """ keep -- If True, reads occurring in the file are kept. If False, they are discarded. """ read_names = [] with open(file_with_names) as f: read_names = f.read().split('\n') self.read_names = { rn for rn in read_names if rn != '' } self.keep = keep def __call__(self, read): rname = read.name.split(' ', maxsplit=1)[0] if rname.endswith('/1'): rname = rname[:-2] elif rname.endswith('/2'): rname = rname[:-2] if rname in self.read_names: return read if self.keep else None else: return None if self.keep else read class QualityTrimmer: def __init__(self, cutoff): self.cutoff = cutoff def __call__(self, read): index = trim_index(read.qualities, self.cutoff) return read[:index] class Mutater: def __init__(self, substitution_rate): self.substitution_rate = substitution_rate def __call__(self, read): read.sequence = mutate(read.sequence, substitution_rate=self.substitution_rate, indel_rate=0) return read class Shortener: def __init__(self, length): self.length = length def __call__(self, read): return read[:self.length] class MinimumLengthFilter: def __init__(self, length): self.minimum_length = length def __call__(self, read): if len(read) < self.minimum_length: return None else: return read class MaxExpectedErrorFilter: """ Discard reads whose expected number of errors, according to the quality values, exceeds the given threshold. The idea comes from usearch's -fastq_maxee parameter (http://drive5.com/usearch/). """ def __init__(self, max_errors): self.max_errors = max_errors def __call__(self, read): if expected_errors(read.qualities) > self.max_errors: return None else: return read class AllowedCharacterFilter: """ Discard reads that contain characters other than those in the given set. 
""" def __init__(self, allowed_characters): self.allowed = set(allowed_characters) def __call__(self, read): if set(read.sequence) <= self.allowed: return read else: return None def reverse_complementer(read): read.sequence = reverse_complement(read.sequence) if read.qualities: read.qualities = read.qualities[::-1] return read def lower_caser(read): read.sequence = read.sequence.lower() return read def upper_caser(read): read.sequence = read.sequence.upper() return read class QualitySetter: def __init__(self, value): self.quality_char = chr(33 + value) def __call__(self, read): read.qualities = self.quality_char * len(read) return read def quality_dropper(read): read.qualities = None return read class UniqueNamer: def __init__(self): # Counter for occurrences of a name self.names = defaultdict(int) def __call__(self, read): if ' ' in read.name: name, description = read.name.split(' ', maxsplit=1) else: name = read.name description = None self.names[name] += 1 if self.names[name] == 1: # Read not previously seen return read name = '{}_{}'.format(name, self.names[name] - 1) read.name = name if description is not None: read.name += ' ' + description return read def main(args=None): if args is None: parser = HelpfulArgumentParser(description=__doc__) add_arguments(parser) args = parser.parse_args() if args.width == 0: args.width = None if args.seed is not None: random.seed(args.seed) modifiers = [] if args.names: modifiers.append(ReadPicker(args.names, keep=True)) if args.not_names: modifiers.append(ReadPicker(args.not_names, keep=False)) if args.cutoff is not None: modifiers.append(QualityTrimmer(args.cutoff)) if args.substitute > 0: modifiers.append(Mutater(args.substitute)) if args.length: modifiers.append(Shortener(args.length)) if args.minimum_length != 0: modifiers.append(MinimumLengthFilter(args.minimum_length)) if args.max_errors is not None: modifiers.append(MaxExpectedErrorFilter(args.max_errors)) if args.allowed_characters is not None: 
modifiers.append(AllowedCharacterFilter(args.allowed_characters)) if args.reverse_complement: modifiers.append(reverse_complementer) if args.character_case == 'lower': modifiers.append(lower_caser) if args.character_case == 'upper': modifiers.append(upper_caser) if args.constant_quality is not None: modifiers.append(QualitySetter(args.constant_quality)) if args.fasta: modifiers.append(quality_dropper) if args.unique_names: modifiers.append(UniqueNamer()) with SequenceReader(args.path) as fr: format = fr.format outformat = format if args.constant_quality is not None: outformat = 'fastq' if args.fasta: outformat = 'fasta' if outformat == 'fastq': writer = FastqWriter(sys.stdout) else: writer = FastaWriter(sys.stdout, line_length=args.width) try: for record in islice(fr, 0, args.limit): for modifier in modifiers: record = modifier(record) if record is None: break else: # only executed if loop did not terminate via break writer.write(record) except IOError as e: if e.errno != errno.EPIPE: raise if __name__ == '__main__': main() marcelm-sqt-d3218a8c5437/sqt/commands/fixbam64.py000066400000000000000000000015471302004573300213670ustar00rootroot00000000000000#!/usr/bin/env python3 """ Repair a BAM file that (incorrectly) contains phred64-encoded qualities. BAM files must encode their quality values in phred33. Output is written in SAM format to standard output. 
""" from sqt import HelpfulArgumentParser from pysam import Samfile __author__ = "Marcel Martin" PHRED_64_TO_33_TRANS = bytes.maketrans(bytes(range(64, 64+100)), bytes(range(33, 33+100))) def get_argument_parser(): parser = HelpfulArgumentParser(description=__doc__) add = parser.add_argument add('bam', help='Input BAM file') return parser def main(): parser = get_argument_parser() args = parser.parse_args() with Samfile(args.bam) as infile: with Samfile('-', 'wh', template=infile) as outfile: for record in infile: record.qual = record.qual.translate(PHRED_64_TO_33_TRANS) outfile.write(record) if __name__ == '__main__': main() marcelm-sqt-d3218a8c5437/sqt/commands/histogram.py000066400000000000000000000025671302004573300217470ustar00rootroot00000000000000#!/usr/bin/env python3 """ """ import sys from random import randint, random, seed import matplotlib as mpl mpl.use('pdf') # enable matplotpib over an ssh connection without X import matplotlib.pyplot as plt import numpy as np import pandas as pd from sqt import HelpfulArgumentParser from sqt.io.fasta import IndexedFasta __author__ = "Marcel Martin" def get_argument_parser(): parser = HelpfulArgumentParser(description=__doc__) add = parser.add_argument add("--left", type=float, default=None) add("--right", type=float, default=None) add("--title", help="Plot title (default: Histogram of )") add("--bins", type=int, default=40, help="number of bins (default: %(default)s") add("infile", help="use '-' for standard input") add("image", nargs='?', help="name of PDF or SVG file") return parser def main(): parser = get_argument_parser() args = parser.parse_args() if args.infile == '-': path = sys.stdin.buffer title = 'Histogram' else: path = args.infile title = 'Histogram of ' + args.infile if args.title: title = args.title data = pd.read_csv(path, dtype=float).values # delimiter=None is default fig, ax = plt.subplots() ax.set_title(title) plt.hist(data, bins=args.bins, rwidth=0.8) ax.set_xlim(left=args.left, 
right=args.right) if args.image is not None: plt.savefig(args.image) else: plt.show() if __name__ == '__main__': main() marcelm-sqt-d3218a8c5437/sqt/commands/mutate.py000066400000000000000000000023151302004573300212400ustar00rootroot00000000000000#!/usr/bin/env python3 """ Mutate nucleotides in a FASTA or FASTQ file The modified sequences are written to standard output. """ import sys import random from sqt import HelpfulArgumentParser from sqt.io.fasta import FastaReader, FastaWriter from sqt.dna import mutate __author__ = "Marcel Martin" def main(): parser = HelpfulArgumentParser(description=__doc__) parser.add_argument("--rate", type=float, default=0.03, help="Substitution rate (default: %(default)s)") parser.add_argument("--indel-rate", type=float, default=0.0005, help="Indel rate (default: %(default)s)") parser.add_argument("--seed", type=int, default=None, help="Set random seed for reproducible runs (default: use different seed each run)") parser.add_argument("fasta", metavar='FASTA-file', help="Input FASTA file") args = parser.parse_args() if args.seed is not None: random.seed(args.seed) fasta_output = FastaWriter(sys.stdout, line_length=80) for record in FastaReader(args.fasta): mutated, n_sub, n_indel = mutate(record.sequence, rate=args.rate, alphabet='ACGT', indel_rate=args.indel_rate, counts=True) fasta_output.write(record.name + '-sub{}-indel{}'.format(n_sub, n_indel), mutated) if __name__ == '__main__': main() marcelm-sqt-d3218a8c5437/sqt/commands/qgramfreq.py000066400000000000000000000020371302004573300217270ustar00rootroot00000000000000#!/usr/bin/env python3 """ Print q-gram (also called k-mer) frequencies in a FASTA or FASTQ file. The result is a list of q-grams and their counts, sorted by counts from least to most frequent. 
""" import sys from collections import Counter from sqt import HelpfulArgumentParser from sqt import SequenceReader __author__ = "Marcel Martin" def q_grams(s, q): """yield all q-grams in s""" for i in range(len(s) - q): yield s[i:i+q] def add_arguments(parser): arg = parser.add_argument arg("-q", default=4, help="length of the q-grams (also called k-mers) (default: %(default)s)") arg("path", metavar='FASTA/FASTQ', help="input FASTA or FASTQ file") def main(args=None): if args is None: parser = HelpfulArgumentParser(description=__doc__) add_arguments(parser) args = parser.parse_args() counts = Counter() with SequenceReader(args.path) as reader: for record in reader: counts.update(q_grams(record.sequence, args.q)) for elem, count in counts.most_common()[::-1]: print(elem, count) if __name__ == '__main__': main() marcelm-sqt-d3218a8c5437/sqt/commands/qualityguess.py000066400000000000000000000027541302004573300225070ustar00rootroot00000000000000#!/usr/bin/env python3 """ Guess quality encoding of one or more FASTA files. 
""" import sys import os import subprocess from collections import Counter from sqt.io.fasta import FastqReader, guess_quality_base from sqt import HelpfulArgumentParser __author__ = "Marcel Martin" def get_argument_parser(): parser = HelpfulArgumentParser(description=__doc__) add = parser.add_argument add('--verbose', '-v', default=False, action='store_true', help='Print histogram of found characters') add('--limit', '-n', default=10000, type=int, help='Inspect the first LIMIT records in the FASTQ file (default: %(default)s)') add('fastq', nargs='+', metavar='FASTQ', help='Input FASTQ files (may be gzipped).') return parser def main(): parser = get_argument_parser() args = parser.parse_args() for path in args.fastq: if args.verbose: print('## File:', path) else: print(path, end='') freqs, guess = guess_quality_base(path) if args.verbose: print() print('character ASCII frequency') for c in sorted(freqs): print("{} {:3} {:7}".format(chr(c), c, freqs[c])) print() else: print(' is ', end='') guess = { 33: 'phred33', 64: 'phred64', None: 'unknown'}[guess] if args.verbose: print("Quality value range assuming phred33: {}..{}".format(min(freqs) - 33, max(freqs) - 33)) print("Quality value range assuming phred64: {}..{}".format(min(freqs) - 64, max(freqs) - 64)) print("This is probably", guess) else: print(guess) if __name__ == '__main__': main() marcelm-sqt-d3218a8c5437/sqt/commands/randomseq.py000066400000000000000000000014231302004573300217310ustar00rootroot00000000000000#!/usr/bin/env python3 """ Generate random sequences in FASTA format """ import sys from random import choice, randint from sqt import HelpfulArgumentParser __author__ = "Marcel Martin" def add_arguments(parser): arg = parser.add_argument arg("--minimum-length", "-m", type=int, default=20) arg("--maximum-length", "-M", type=int, default=50) arg("n", type=int, help="Number of sequences to generate") def main(args=None): if args is None: parser = HelpfulArgumentParser(description=__doc__) 
add_arguments(parser) args = parser.parse_args() ALPHABET = 'ACGT' for i in range(args.n): l = randint(args.minimum_length, args.maximum_length) seq = ''.join(choice('ACGT') for _ in range(l)) print(">seq{0}\n{1}".format(i+1, seq)) if __name__ == '__main__': main() marcelm-sqt-d3218a8c5437/sqt/commands/readcov.py000066400000000000000000000034461302004573300213720ustar00rootroot00000000000000#!/usr/bin/env python3 """ Print a report for individual reads in a SAM/BAM file. """ __author__ = 'Marcel Martin' import sys from collections import Counter, namedtuple, defaultdict from itertools import islice from contextlib import ExitStack from pysam import Samfile from sqt import HelpfulArgumentParser, Cigar, cigar from sqt.region import Region from .bamstats import AlignedRead, print_coverage_report def add_arguments(parser): arg = parser.add_argument arg('--minimum-length', '-m', type=int, default=1, help='Minimum read length. Ignore reads that are shorter. Default: %(default)s') arg('--quality', '-q', type=int, default=0, help='Minimum mapping quality (default: %(default)s') arg('--minimum-cover-fraction', metavar='FRACTION', type=float, default=0.01, help='Alignment must cover at least FRACTION of the read to appear in the cover report. 
(%(default)s)') arg("bam", metavar="SAM/BAM", help="Name of a SAM or BAM file") arg("region", help="Region") def main(args=None): if args is None: parser = HelpfulArgumentParser(description=__doc__) add_arguments(parser) args = parser.parse_args() region = Region(args.region) n_records = 0 seen_reads = set() with Samfile(args.bam) as sf: for record in sf.fetch(region.reference, region.start, region.stop): if record.query_length < args.minimum_length: continue n_records += 1 if record.is_unmapped: unmapped += 1 unmapped_bases += len(record.seq) continue if record.mapq < args.quality: continue assert record.cigar is not None if not record.query_name in seen_reads: aligned_read = AlignedRead(record, sf.getrname(record.tid)) print_coverage_report(aligned_read, minimum_cover_fraction=args.minimum_cover_fraction) seen_reads.add(record.query_name) if __name__ == '__main__': main() marcelm-sqt-d3218a8c5437/sqt/commands/readlenhisto.py000066400000000000000000000074711302004573300224320ustar00rootroot00000000000000#!/usr/bin/env python3 """ Print and optionally plot a read length histogram of one or more FASTA or FASTQ files. If more than one file is given, a total is also printed. 
""" import sys from collections import Counter from sqt import SequenceReader, HelpfulArgumentParser # Make potential import failures happen before we read in files import matplotlib as mpl mpl.use('Agg') # enable matplotlib over an ssh connection without X import matplotlib.pyplot as plt import numpy as np import warnings __author__ = "Marcel Martin" warnings.filterwarnings('ignore', 'axes.color_cycle is deprecated and replaced with axes.prop_cycle') warnings.filterwarnings('ignore', 'The `IPython.html` package') warnings.filterwarnings('ignore', 'The `IPython.kernel` package') warnings.filterwarnings('ignore', 'IPython.utils.traitlets has moved') import seaborn as sns def add_arguments(parser): arg = parser.add_argument arg('--zero', default=False, action='store_true', help='Print also rows with a count of zero') arg('--plot', default=None, help='Plot to this file (.pdf or .png). ' 'If multiple sequence files given, plot only total.') arg('--bins', default=50, type=int, help='Number of bins in the plot. ' 'Default: %(default)s') arg('--maxy', default=None, type=float, help='Maximum y in plot') arg('--left', default=0, type=float, help='Minimum x in plot') arg('--outliers', default=False, action='store_true', help='In the plot, summarize outliers greater than the 99.9 percentile ' 'in a red bar.') arg('--title', default='Read length histogram of {}', help="Plot title. {} is replaced with the input file name. " "Default: '%(default)s'") arg('seqfiles', nargs='+', metavar='FASTA/FASTQ', help='Input FASTA/FASTQ file(s) (may be gzipped).') def length_histogram(path): """Return a list of lengths """ lengths = [] with SequenceReader(path) as reader: for record in reader: lengths.append(len(record.sequence)) return lengths def plot_histogram(lengths, path, title, max_y=None, min_x=0, bins=50, outliers=False): """ Plot histogram of lengths to path If outliers is True, then the lengths greater than the 99.9 percentile are marked separately with bar colored in red. 
""" lengths = np.array(lengths) if outliers: histomax = int(np.percentile(lengths, 99.9) * 1.01) else: histomax = int(max(lengths, default=100)) larger = sum(lengths > histomax) fig = plt.figure(figsize=(20/2.54, 10/2.54)) ax = fig.gca() ax.set_xlabel('Read length') ax.set_ylabel('Frequency') ax.set_title(title) _, borders, _ = ax.hist(lengths, bins=bins, range=(min_x, histomax)) if outliers: w = borders[1] - borders[0] ax.bar([histomax], [larger], width=w, color='red') ax.set_xlim(min_x, histomax + 1.5 * w) if max_y is not None: ax.set_ylim(0, max_y) fig.set_tight_layout(True) fig.savefig(path) def main(args=None): if args is None: parser = HelpfulArgumentParser(description=__doc__) add_arguments(parser) args = parser.parse_args() overall_lengths = [] for path in args.seqfiles: print("## File:", path) print("length", "frequency", sep='\t') lengths = length_histogram(path) freqs = Counter(lengths) for length in range(0, max(freqs, default=100) + 1): freq = freqs[length] if args.zero or freq > 0: print(length, freq, sep='\t') overall_lengths.extend(lengths) if len(args.seqfiles) > 1: print("## Total") print("length", "frequency", sep='\t') freqs = Counter(overall_lengths) for length in range(0, max(freqs) + 1): freq = freqs[length] if args.zero or freq > 0: print(length, freq, sep='\t') title = args.title.format('{} input files'.format(len(args.seqfiles))) else: title = args.title.format(args.seqfiles[0]) if args.plot: plot_histogram(overall_lengths, args.plot, title, args.maxy, min_x=args.left, bins=args.bins, outliers=args.outliers) if __name__ == '__main__': main() marcelm-sqt-d3218a8c5437/sqt/commands/samfixn.py000066400000000000000000000015151302004573300214070ustar00rootroot00000000000000#!/usr/bin/env python3 """ Read a SAM file from standard input, replace all characters in all reads that are not one of {a, c, g, t, n, A, C, G, T, N} with the character 'N'. Write the modified SAM file to standard output. This is approx. 
8 times faster than an equivalent awk line using the gsub() function. """ import sys from os.path import join, dirname, realpath, isfile from sqt import HelpfulArgumentParser def main(): parser = HelpfulArgumentParser(usage=__doc__) args = parser.parse_args() tab = [ord('N')] * 256 for c in b'ACGTNacgtn': tab[c] = c trans = bytes(tab) for line in sys.stdin.buffer: if line.startswith(b'@'): sys.stdout.buffer.write(line) else: fields = line.split(b'\t') fields[9] = fields[9].translate(trans) sys.stdout.buffer.write(b'\t'.join(fields)) if __name__ == '__main__': main() marcelm-sqt-d3218a8c5437/sqt/commands/samsetop.py000066400000000000000000000114341302004573300215760ustar00rootroot00000000000000#!/usr/bin/env python3 """ Perform set operation on two SAM/BAM files. The output BAM file will have the same header as file A. WARNING: Implementation is neither very fast nor memory efficient. Possible operations: union: Output union of A and B, abort with error if different lines for same read are encountered. intersection: Output intersection of A and B, abort with error if different lines for same read are encountered. setminus: Outputs all read in A that are not in B. symdiff: Output all reads in A or B but not in both. """ from sqt import HelpfulArgumentParser import sys from pysam import Samfile __author__ = "Tobias Marschall" def add_arguments(parser): arg = parser.add_argument arg("-s", action="store_true", dest="sam_output", default=False, help="Output SAM file instead of BAM file") arg("-U", action="store_true", dest="exclude_unmapped_A", default=False, help="Exclude unmapped reads from file A") arg("-V", action="store_true", dest="exclude_unmapped_B", default=False, help="Exclude unmapped reads from file B") arg("-r", action="store_true", dest="remove_name_suffix", default=False, help="Remove trailing \"/*\" from read names. 
Useful if one mapper appends \"/1\" and another does not.") arg('bampath1', help='First BAM or SAM file') arg('operation', choices=('union','intersection','setminus','symdiff')) arg('bampath2', help='Second BAM or SAM file') arg('outputpath', nargs='?', help='Output BAM or SAM file. If omitted, only print the number of reads ' 'that would be written.') def SamOrBam(name, mode='r'): if name.endswith('.bam'): mode += 'b' return Samfile(name, mode) def remove_suffix(s): i = s.rfind('/') if i == -1: return s else: return s[:i] def nop(s): return s def dict_of_reads(reads, exclude_unmapped, rename): d = dict() for read in reads: if exclude_unmapped and read.is_unmapped: continue name = rename(read.qname) if d.has_key(name): raise Exception("Duplicate read in input file (%s)"%name) d[name] = read return d def union(A, B,outfile, exclude_unmapped_A, exclude_unmapped_B, rename): readsB = dict_of_reads(B, exclude_unmapped_B, rename) readnamesA = set() for read in A: if exclude_unmapped_A and read.is_unmapped: continue name = rename(read.qname) if name in readnamesA: raise Error("Duplicate read in input file (%s)"%name) readnamesA.add(name) if readsB.has_key(name): if read.compare(readsB[name]) != 0: print('Content mismatch for read %s:'%name, file=sys.stderr) print('File A:',read, file=sys.stderr) print('File B:',readsB[name], file=sys.stderr) sys.exit(1) readsB.pop(name) outfile.write(read) for read in readsB.itervalues(): outfile.write(read) def intersection(A,B,outfile,exclude_unmapped_A,exclude_unmapped_B,rename): readsB = dict_of_reads(B, exclude_unmapped_B, rename) readnamesA = set() for read in A: if exclude_unmapped_A and read.is_unmapped: continue name = rename(read.qname) if name in readnamesA: raise Error("Duplicate read in input file (%s)"%name) readnamesA.add(name) if readsB.has_key(name): if read.compare(readsB[name])!=0: print('Content mismatch for read %s:'%name, file=sys.stderr) print('File A:',read, file=sys.stderr) print('File B:',readsB[name], 
file=sys.stderr) sys.exit(1) outfile.write(read) def setminus(A,B,outfile,exclude_unmapped_A,exclude_unmapped_B,rename): if exclude_unmapped_B: readnamesB = set((rename(read.qname) for read in B if not read.is_unmapped)) else: readnamesB = set((rename(read.qname) for read in B)) for read in A: if exclude_unmapped_A and read.is_unmapped: continue if not rename(read.qname) in readnamesB: outfile.write(read) def symdiff(A,B,outfile,exclude_unmapped_A,exclude_unmapped_B,rename): if exclude_unmapped_B: readsB = dict(((rename(read.qname),read) for read in B if not read.is_unmapped)) else: readsB = dict(((rename(read.qname),read) for read in B)) for read in A: if exclude_unmapped_A and read.is_unmapped: continue name = rename(read.qname) if readsB.has_key(name): readsB.pop(name) else: outfile.write(read) for read in readsB.itervalues(): outfile.write(read) class Counter: count = 0 def write(self, x): self.count += 1 def main(args=None): if args is None: parser = HelpfulArgumentParser(description=__doc__) add_arguments(parser) args = parser.parse_args() operation = args.operation A = SamOrBam(args.bampath1) B = SamOrBam(args.bampath2) if args.outputpath: outfile = Samfile(args[3], 'wh' if args.sam_output else 'wb', template=A) else: outfile = Counter() globals()[operation](A, B, outfile, args.exclude_unmapped_A, args.exclude_unmapped_B, remove_suffix if args.remove_name_suffix else nop) if args.outputpath is None: print(outfile.count) if __name__ == '__main__': main() marcelm-sqt-d3218a8c5437/sqt/commands/simreads.py000066400000000000000000000037221302004573300215530ustar00rootroot00000000000000#!/usr/bin/env python3 """ Simulate reads that optionally contain adapter sequences. 
def main():
    """
    Sample reads from one chromosome of an indexed FASTA file and print
    them in FASTA format to standard output.

    Windows that contain an 'N' are skipped and re-drawn. With --adapter,
    the adapter is inserted at a random position into approximately the
    given fraction of reads; the read is then cut back to the read length
    and the 1-based insert position is recorded in the header as
    'adapterpos=...'.
    """
    parser = HelpfulArgumentParser(description=__doc__)
    arg = parser.add_argument
    arg("--length", "-l", type=int, default=100)
    arg("--seed", type=int, default=None, help="seed for random number generator")
    arg("--adapter", "-a", default=None, help="Add an adapter")
    # type=float: without it, a value given on the command line stays a
    # string and cannot be compared to random() below.
    arg("--probability", "-p", type=float, default=0.5,
        help="Fraction of reads (approximate) that should contain the adapter")
    arg("n", type=int, help="Number of reads")
    arg("fasta", help="FASTA file from which reads are sampled.")
    arg("chromosome", help="chromosome that is picked from the FASTA file")
    args = parser.parse_args()
    if args.seed is not None:
        seed(args.seed)
    adapter = args.adapter
    adapter_prob = args.probability
    length = args.length
    if length < 1:
        parser.error("--length must be at least 1")
    with IndexedFasta(args.fasta) as fasta:
        chrom = fasta[args.chromosome]
        # Last start position at which a full-length read still fits
        # (previously hard-coded to len(chrom) - 100, ignoring --length).
        max_start = len(chrom) - length
        if max_start < 0:
            parser.error("chromosome {} is shorter than the read length {}".format(
                args.chromosome, length))
        i = 0
        while i < args.n:
            start = randint(0, max_start)
            seq = chrom[start:start+length]
            if b'N' in seq:
                # Draw again; i is not advanced
                continue
            i += 1
            seq = seq.decode('ascii')
            if adapter is not None and random() < adapter_prob:
                pos = randint(0, length-1)
                seq = seq[:pos] + adapter + seq[pos:]
                seq = seq[:length]
                extra = ' adapterpos={}'.format(pos+1)
            else:
                extra = ''
            print('>r{} {}:{}-{}{}'.format(i, args.chromosome, start+1, start+length, extra))
            print(seq)
def main():
    """
    Parse the command line, then copy all records from the input file to
    standard output while translating the characters of each sequence.
    Only the sequence is changed; everything else is written back as read.
    """
    parser = HelpfulArgumentParser(description=__doc__)
    parser.add_argument("--format", choices=('fasta', 'fastq'), default='fasta')
    parser.add_argument("fromchars", metavar="FROM-CHARACTERS")
    parser.add_argument("tochars", metavar="TO-CHARACTERS")
    parser.add_argument("file", nargs='?', default='-')
    options = parser.parse_args()

    table = bytes.maketrans(
        options.fromchars.encode('ascii'),
        options.tochars.encode('ascii'))

    # Pick matching writer/reader classes once; the copy loop is the same
    # for both formats.
    if options.format == 'fasta':
        writer_class, reader_class = FastaWriter, FastaReader
    else:
        writer_class, reader_class = FastqWriter, FastqReader

    writer = writer_class(sys.stdout)
    with reader_class(options.file) as reader:
        for record in reader:
            record.sequence = record.sequence.translate(table)
            writer.write(record)
# Maps a one-letter amino acid code to a regular expression that matches
# the codons encoding it. 'X' matches any codon.
#
# Every pattern that contains an alternation must be wrapped in a group:
# the patterns are concatenated to build a regex for a whole amino acid
# sequence, and a bare '|' would split the concatenated expression.
AMINO_ACID_REGEXES = dict(
    A='GC[ACGT]',
    C='TG[CT]',
    D='GA[CT]',
    E='GA[AG]',
    F='TT[CT]',
    G='GG[ACGT]',
    H='CA[CT]',
    I='AT[ACT]',
    K='AA[AG]',
    L='(CT[ACGT]|TT[AG])',
    M='ATG',
    N='AA[CT]',
    P='CC[ACGT]',
    Q='CA[AG]',
    R='(AG[AG]|CG[ACGT])',
    S='(AG[CT]|TC[ACGT])',
    T='AC[ACGT]',
    V='GT[ACGT]',
    W='TGG',
    Y='TA[CT]',
    X='[ACGT][ACGT][ACGT]'
)
def n_intervals(sequence, N='N'):
    """
    Given a sequence, yield all intervals containing only N characters as
    tuples (start, stop). The sequence is converted to upper case first, so
    lower-case characters are found as well.

    sequence -- a str, bytes or bytearray object
    N -- the character to search for. For bytes/bytearray sequences, it may
        be given as a one-character str, a length-one bytes object or as an
        integer such as ord(b'N').

    >>> list(n_intervals('ACGTnNAC'))
    [(4, 6)]
    >>> list(n_intervals(b'ACGTnNAC', N=ord(b'N')))
    [(4, 6)]
    >>> list(n_intervals(b'ACGTnNAC'))
    [(4, 6)]
    """
    if isinstance(sequence, (bytes, bytearray)) and not isinstance(N, int):
        # Indexing a bytes object gives integers, so the comparison in the
        # inner loop below needs an integer as well.
        N = ord(N)
    sequence = sequence.upper()
    length = len(sequence)
    start = sequence.find(N)
    while start >= 0:
        stop = start + 1
        # Extend the interval to the end of the current run
        while stop < length and sequence[stop] == N:
            stop += 1
        yield (start, stop)
        start = sequence.find(N, stop)
class IntervalNode:
    """
    Node in the IntervalTree.

    interval -- the interval stored in this node (needs an .end attribute)
    obj -- an arbitrary object associated with the interval
    tree -- the tree this node belongs to

    maxend is the largest interval end within the subtree rooted at this
    node; left and right are the child nodes (or None).
    """
    def __init__(self, interval, obj, tree):
        self.interval = interval
        self.obj = obj
        self.right = None
        self.left = None
        self.maxend = self.interval.end
        self.height = 0
        self.tree = tree

    def recalc_maxend(self):
        """
        Recompute maxend from the own interval and the maxend values of the
        direct children. The children's values must already be up to date.
        """
        self.maxend = self.interval.end
        for child in (self.left, self.right):
            if child is not None:
                self.maxend = max(child.maxend, self.maxend)

    def __repr__(self):
        return str(self.interval)
""" node = self._convert_args(*args, obj = obj, node = True) self._insert(self.root, node) # balance the tree to avoid degeneration self._rotate() def find(self, *args): """ Find all intervals in the tree that intersect with the given interval. *args can be a start and endpoint given as ints, or a single object with the attributes start and end. """ for interval in self._find(self.root, self._convert_args(*args)): yield interval def _insert(self, root, node): if not root: self.root = node return maxend = None if node.interval.start < root.interval.start: if root.left: maxend = self._insert(root.left, node) else: root.left = node else: if root.right: maxend = self._insert(root.right, node) else: root.right = node root.height += 1 if not maxend: maxend = node.interval.end root.maxend = max(maxend, root.maxend) return root.maxend def _find(self, root, interval): if not root: return if interval.start > root.maxend: return if root.interval.is_overlap(interval): yield root.interval if interval.start < root.interval.start: for i in self._find(root.left, interval): yield i elif not interval.end < root.interval.start: for i in self._find(root.right, interval): yield i def _rotate(self): if self.root.right and self.root.left: heightdiff = self.root.right.height - self.root.left.height if heightdiff > 1: # perform a left rotation p = self.root q = self.root.right self.root = q p.right = q.left q.left = p q.height += 1 p.height -= 1 p.recalc_maxend() q.recalc_maxend() elif heightdiff < -1: # perform a right rotation p = self.root q = self.root.left self.root = q p.left = q.right q.right = p q.height += 1 p.height -= 1 p.recalc_maxend() q.recalc_maxend() def _inorder(self, root): if not root: return for i in self._inorder(root.left): yield i yield root for i in self._inorder(root.right): yield i def __iter__(self): for i in self._inorder(self.root): yield i def __repr__(self): return ", ".join(map(str, self)) def _convert_args(self, *args, obj = None, node = False): """ Convert 
class FormatError(Exception):
    """
    Raised when the input is not a valid BAM file.

    This must derive from Exception: a plain class can neither be raised
    nor be instantiated with a message.
    """
    pass
self.flags = flag_nc >> 16 n = calcsize(align) # Query name query_name_length = bin_mq_nl & 0xFF self.query_name = data[n: n + query_name_length - 1].decode('ascii') # NULL-terminated if self.query_name == '*': self.query_name = None offset = n + query_name_length # CIGAR n_cigar_op = flag_nc & 0xFFFF cigar_codes = unpack_from('<{}I'.format(n_cigar_op), data, offset=offset) if not cigar_codes: self.cigar = None else: self.cigar = Cigar((code & 0xF, code >> 4) for code in cigar_codes) offset += 4 * n_cigar_op # Sequence encoded_seq = data[offset:offset + (l_seq + 1) // 2] sequence = bytes(chain(*[(v>>4, v&0xF) for v in encoded_seq])) sequence = sequence.translate(SEQ_TRANS).decode('ascii') if l_seq & 1 == 1: # if odd sequence = sequence[:l_seq] self.sequence = sequence offset += (l_seq + 1) // 2 # Qualities qualities = data[offset:offset + l_seq] if l_seq == 0 or qualities[0] == 255: qualities = None self.qualities = qualities offset += l_seq self.tags = self._parse_tags(data, offset) def _parse_tags(self, data, offset): tags = [] while offset < len(data): assert offset <= len(data) - 4 tag_name = data[offset:offset+2].decode('ascii') tag_type = chr(data[offset+2]) assert tag_type in 'AcCsSiIfZHB' if tag_type == 'A': tag_value = chr(data[offset+3]) offset += 4 elif tag_type == 'Z': i = data.index(0, offset+3) tag_value = data[offset+3:i].decode('ascii') offset = i + 1 elif tag_type == 'H': raise NotImplementedError("Tag type H not implemented") elif tag_type == 'B': array_type = chr(data[offset+3]) assert array_type in 'cCsSiIf' length = unpack_from('= 0: return self.references[id].name else: return None @property def reference_name(self): return self._id_to_reference_name(self.reference_id) @property def next_reference_name(self): return self._id_to_reference_name(self.next_reference_id) def __str__(self): """Return SAM-formatted representation of this record""" refname = self.reference_name if self.next_reference_id == self.reference_id and self.reference_id >= 0: 
class FastaWriter:
    """
    Write FASTA-formatted sequences to a file-like object.

    If the writer was given a file name, it opens the file itself and
    closes it in close() or when used as a context manager. A file-like
    object passed in by the caller (such as sys.stdout) is left open, which
    matches the behavior of FastqWriter.
    """
    _close_on_exit = False

    def __init__(self, file, line_length=80):
        """
        file -- a file name or a file-like object opened for writing text

        If line_length is not None, the lines will be wrapped after
        line_length characters. A line_length of 0 also disables wrapping.
        """
        self.line_length = line_length if line_length != 0 else None
        if isinstance(file, str):
            file = xopen(file, "w")
            self._close_on_exit = True
        self._file = file

    def write(self, name_or_seq, sequence=None):
        """Write an entry to the the FASTA file.

        If only one parameter (name_or_seq) is given, it must have
        attributes .name and .sequence, which are then used.
        Otherwise, the first parameter must be the name and the second
        the sequence.

        The effect is that you can write this:
        fw.write("name", "ACCAT")
        or
        fw.write(Sequence("name", "ACCAT"))
        """
        if sequence is None:
            name = name_or_seq.name
            sequence = name_or_seq.sequence
        else:
            name = name_or_seq
        sequence = force_str(sequence)
        if self.line_length is not None:
            print('>{}'.format(name), file=self._file)
            for i in range(0, len(sequence), self.line_length):
                print(sequence[i:i+self.line_length], file=self._file)
        else:
            print('>{}'.format(name), sequence, file=self._file, sep='\n')

    def close(self):
        """Close the underlying file, but only if this writer opened it."""
        if self._close_on_exit:
            self._file.close()

    def __enter__(self):
        if self._file.closed:
            raise ValueError("I/O operation on closed file")
        return self

    def __exit__(self, *args):
        self.close()
The effect is that you can write this: fq.write("name", "ACCAT", "#!!&B") or fq.write(Sequence("name", "ACCAT", "#!!&B")) """ if sequence is None: name = name_or_seq.name sequence = name_or_seq.sequence qualities = name_or_seq.qualities else: name = name_or_seq if self.twoheaders: two = name else: two = '' print("@{0}\n{1}\n+{2}\n{3}".format(name, force_str(sequence), two, force_str(qualities)), file=self._file) def close(self): if self._close_on_exit: self._file.close() def __enter__(self): if self._file.closed: raise ValueError("I/O operation on closed file") return self def __exit__(self, *args): self.close() IndexEntry = namedtuple("IndexEntry", "name length offset nucleotides_per_line bytes_per_line") def _read_fasta_index(path): """ Return a dictionary that maps sequence names to IndexEntry tuples. The sequence name is the string before the first space character in the FASTA comment header. The offsets are converted to Python 0-based coordinates! """ index = OrderedDict() try: fai = open(path) except FileNotFoundError as e: raise FastaIndexMissing("Could not find the FASTA index file named '{}'.".format(path)) with fai as f: for line in f: name, length, offset, nucleotides_per_line, bytes_per_line = line.split('\t') indexed_name = name.split(' ', maxsplit=1)[0] length = int(length) offset = int(offset) nucleotides_per_line = int(nucleotides_per_line) bytes_per_line = int(bytes_per_line) entry = IndexEntry(name, length, offset, nucleotides_per_line, bytes_per_line) index[indexed_name] = entry return index class IndexedSequence: """A single sequence in an indexed FASTA file""" def __init__(self, mapped, indexentry): self.mapped = mapped self.indexentry = indexentry def __getitem__(self, key): if type(key) is int: key = slice(key, key+1) assert key.step is None return self.read(key.start, key.stop) def __len__(self): """Return length of sequence""" return self.indexentry.length def read(self, start=0, stop=None): """ DEPRECATED, use slice notation via 
class IndexedFasta:
    """
    Efficient access to FASTA files that have been indexed by
    'samtools faidx'. The index is read from the file named path + '.fai'.

    Example:

    with IndexedFasta(path) as fasta:
        print(fasta['chr1'][50:100])

    In Python 3, the returned sequence is of type bytes.
    """
    def __init__(self, path):
        self.index = _read_fasta_index(path + '.fai')
        # Binary mode: the file content is only ever accessed as bytes
        # through the memory map.
        self._file = open(path, 'rb')
        try:
            # 0: entire file. access=ACCESS_READ is available on all
            # platforms, in contrast to flags=MAP_PRIVATE.
            self.mapped = mmap.mmap(self._file.fileno(), 0, access=mmap.ACCESS_READ)
        except Exception:
            # Do not leak the file handle if mapping fails (for example
            # for an empty file).
            self._file.close()
            raise

    def length(self, name):
        """
        DEPRECATED, use len(indexedfasta[name]) instead

        Return length of sequence 'name'.
        """
        return self.index[name].length

    def get(self, name):
        """
        DEPRECATED, use indexing with indexed_fasta[name] instead.

        Retrieve an entry of the FASTA file as an IndexedSequence object.

        This is a light-weight operation: The actual work happens when you
        access the returned object.
        """
        return IndexedSequence(self.mapped, self.index[name])

    def __getitem__(self, name):
        """
        Retrieve an entry of the FASTA file as an IndexedSequence object.

        This is a light-weight operation: The actual work happens when you
        access the returned object.

        Raise TypeError if name is not a str and KeyError if there is no
        sequence with that name in the index.
        """
        if not isinstance(name, str):
            raise TypeError("The key must be a str.")
        return IndexedSequence(self.mapped, self.index[name])

    def __len__(self):
        """Return the number of sequences in the index"""
        return len(self.index)

    def __contains__(self, name):
        return name in self.index

    def close(self):
        """Close the memory map and the underlying file"""
        self.mapped.close()
        self._file.close()

    def __enter__(self):
        if self._file.closed:
            raise ValueError("I/O operation on closed file")
        return self

    def __exit__(self, *args):
        self.close()
class Sequence:
    """
    A named sequence with optional qualities.

    qualities is a string and it contains the qualities encoded as
    ascii(qual+33). It is None if there are no quality values.
    """

    def __init__(self, name, sequence, qualities=None):
        """Set qualities to None if there are no quality values"""
        self.name = name
        self.sequence = sequence
        self.qualities = qualities

    def __getitem__(self, key):
        """
        Slicing: return a new object of the same class with the same name
        in which sequence and (if present) qualities are indexed by key.
        """
        return self.__class__(
            self.name,
            self.sequence[key],
            self.qualities[key] if self.qualities is not None else None)

    def __repr__(self):
        qstr = ''
        if self.qualities is not None:
            # No quote character in front of the comma: {1!r} below
            # already produces a properly quoted sequence.
            qstr = ', qualities={0!r}'.format(_shorten(self.qualities))
        return 'Sequence(name={0!r}, sequence={1!r}{2})'.format(
            _shorten(self.name), _shorten(self.sequence), qstr)

    def __len__(self):
        return len(self.sequence)

    def __eq__(self, other):
        try:
            return self.name == other.name and \
                self.sequence == other.sequence and \
                self.qualities == other.qualities
        except AttributeError:
            # 'other' is not sequence-like. Let Python fall back to its
            # default comparison instead of failing.
            return NotImplemented

    def __ne__(self, other):
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Consistent with __eq__, which compares the same three attributes
        return hash((self.name, self.sequence, self.qualities))
line is a single string (newline will be appended if not included) """ if not line.endswith('\n'): line += '\n' self.first_line = line self._file = file def __iter__(self): yield self.first_line for line in self._file: yield line def close(self): self._file.close() @property def closed(self): return self._file.closed def SequenceReader(file, mode='rt', colorspace=False, fileformat=None, case='upper'): """ Reader for FASTA and FASTQ files that autodetects the file format. Returns either an instance of FastaReader or of FastqReader, depending on file type. The autodetection can be skipped by setting fileformat to the string 'fasta' or 'fastq' The case parameter is passed on to FastaReader (not FastqReader). The colorspace parameter is passed on to FastqReader (not FastaReader). file is a filename or a file-like object. If file is a filename, then .gz files are supported. If the file name is available, the file type is detected by looking at the file name. If the file name is not available (for example, reading from standard input), then the file is read and the file type determined from the content. 
""" fasta_kwargs = dict(mode=mode, case=case) fastq_kwargs = dict(mode=mode, colorspace=colorspace) if fileformat is not None: fileformat = fileformat.lower() if fileformat == 'fasta': return FastaReader(file, **fasta_kwargs) elif fileformat == 'fastq': return FastqReader(file, **fastq_kwargs) else: raise UnknownFileType("File format {0} is unknown (expected 'fasta' or 'fastq').".format(fileformat)) name = None if file == "-": file = sys.stdin elif isinstance(file, str): name = file elif hasattr(file, "name"): name = file.name if name is not None: if name.endswith('.gz'): name = name[:-3] name, ext = splitext(name) ext = ext.lower() if ext in ['.fasta', '.fa', '.fna', '.csfasta', '.csfa']: return FastaReader(file, **fasta_kwargs) elif ext in ['.fastq', '.fq'] or (ext == '.txt' and name.endswith('_sequence')): return FastqReader(file, **fastq_kwargs) else: raise UnknownFileType("Could not determine whether this is FASTA or FASTQ: file name extension {0} not recognized".format(ext)) # No name available. # Assume that 'file' is an open file # and autodetect its type by reading from it. for line in file: if line.startswith('#'): # Skip comment lines (needed for csfasta) continue if line.startswith('>'): return FastaReader(FileWithPrependedLine(file, line), **fasta_kwargs) if line.startswith('@'): return FastqReader(FileWithPrependedLine(file, line), **fastq_kwargs) raise UnknownFileType("File is neither FASTQ nor FASTA.") class FastaReader(object): """ Reader for FASTA files. """ def __init__(self, file, mode='rt', wholefile=False, keep_linebreaks=False, binary=None, case='upper'): """ file is a filename or a file-like object. If file is a filename, then .gz files are supported. If wholefile is True, then it is ok to read the entire file into memory. This is faster when there are many newlines in the file, but may obviously need a lot of memory. keep_linebreaks -- whether to keep the newline characters in the sequence case -- one of 'upper', 'lower', 'keep'. 
How to case of the sequence. binary -- DEPRECATED, use mode='rb' instead. In Python 3, when this is set, the returned Sequence objects will contain bytes objects for the sequence. The names will still be strings. If file was given as a file-like object, it must have been opened in binary mode. mode -- When set to 'rt', returned Sequence objects will contain bytes objects for the sequence (relevant in Python 3 only). """ if binary is not None: mode = 'rb' if binary else 'rt' if mode not in ('rt', 'rb'): raise ValueError("mode must be either 'rt' or 'rb'") if isinstance(file, str): file = xopen(file, mode) self._file = file self.binary = binary self.wholefile = wholefile self.keep_linebreaks = keep_linebreaks self.mode = mode self.format = 'fasta' if case not in ('keep', 'upper', 'lower'): raise ValueError("case must be 'keep', 'upper' or 'lower'") self.case = case assert not (wholefile and keep_linebreaks), "not supported" def __iter__(self): """ Yield Sequence objects. The qualities attribute is always None. """ return self._wholefile_iter() if self.wholefile else self._streaming_iter() def _streaming_iter(self): """ Read entry from the file (single entry at a time). 
def _wholefile_iter(self):
    """
    This reads in the entire file at once, but is faster than the
    streaming code when there are lots of newlines.

    Yield one Sequence per entry. Nothing is yielded for an empty file,
    and an entry that consists of a header line only gets an empty
    sequence.

    The idea comes from the TAMO package (http://fraenkel.mit.edu/TAMO/),
    module TAMO.seq.Fasta by David Benjamin Gordon.
    """
    wholefile = self._file.read()
    # All constants must have the same type as the file content (str in
    # mode 'rt', bytes in mode 'rb'), including the empty replacement
    # string used to remove line breaks.
    if self.mode == 'rt':
        cr = '\r'
        linebreak = '\n'
        delim = '\n>'
        greater = '>'
        empty = ''
        to_str = lambda s: s
        if self.case == 'keep':
            seq_transform = lambda s: s
        elif self.case == 'upper':
            seq_transform = lambda s: s.upper()
        elif self.case == 'lower':
            seq_transform = lambda s: s.lower()
    else:
        cr = b'\r'
        linebreak = b'\n'
        delim = b'\n>'
        greater = b'>'
        empty = b''
        to_str = lambda s: s.decode()
        if self.case == 'keep':
            seq_transform = lambda s: bytes(s)
        elif self.case == 'upper':
            seq_transform = lambda s: bytes(s.upper())
        elif self.case == 'lower':
            seq_transform = lambda s: bytes(s.lower())
    assert cr not in wholefile, "Sorry, currently don't know how to deal with files that contain \\r linebreaks"
    assert len(wholefile) == 0 or wholefile[0:1] == greater, "FASTA file must start with '>'"
    if len(wholefile) == 0:
        return
    parts = wholefile.split(delim)
    # first part has '>' in front
    parts[0] = parts[0][1:]
    for part in parts:
        # The first line is the header, everything after it the sequence.
        # partition() also copes with an entry that has no sequence lines.
        header, _, body = part.partition(linebreak)
        name = to_str(header)
        sequence = seq_transform(body.replace(linebreak, empty))
        yield Sequence(name=name, sequence=sequence, qualities=None)
""" if mode not in ('rt', 'rb'): raise ValueError("mode must be either 'rt' or 'rb'") if isinstance(file, str): file = xopen(file, mode) self._file = file self.colorspace = colorspace self.twoheaders = False self.mode = mode self.format = 'fastq' def __iter__(self): """ Yield Sequence objects. """ if self.mode == 'rt': AT = '@' PLUS = '+' STRIP = '\n\r' name_transform = lambda name: name else: AT = b'@' PLUS = b'+' STRIP = b'\n\r' name_transform = lambda name: name.decode('utf-8') lengthdiff = 1 if self.colorspace else 0 for i, line in enumerate(self._file): if i % 4 == 0: if not line.startswith(AT): raise FormatError("at line {0}, expected a line starting with '@'".format(i+1)) name = name_transform(line.strip()[1:]) elif i % 4 == 1: sequence = line.strip() elif i % 4 == 2: line = line.strip() if not line.startswith(PLUS): raise FormatError("at line {0}, expected a line starting with '+'".format(i+1)) if len(line) > 1: self.twoheaders = True if not name_transform(line[1:]) == name: raise FormatError( "At line {0}: Two sequence descriptions are given in " "the FASTQ file, but they don't match " "('{1}' != '{2}')".format(i+1, name, line.rstrip()[1:])) elif i % 4 == 3: qualities = line.rstrip(STRIP) if len(qualities) + lengthdiff != len(sequence): raise ValueError("Length of quality sequence and length of read do not match (%d+%d!=%d)" % (len(qualities), lengthdiff, len(sequence))) yield Sequence(name, sequence, qualities) def __enter__(self): if self._file is None: raise ValueError("I/O operation on closed FastqReader") return self def __exit__(self, *args): self._file.close() class FastaQualReader(object): """ Reader for reads that are stored in .(CS)FASTA and .QUAL files. """ def __init__(self, fastafile, qualfile, colorspace=False): """ fastafile and qualfile are filenames file-like objects. If file is a filename, then .gz files are supported. colorspace -- Usually (when this is False), there must be n characters in the sequence and n quality values. 
When this is True, there must be n+1 characters in the sequence and n quality values. """ self.fastareader = FastaReader(fastafile) self.qualreader = FastaReader(qualfile, keep_linebreaks=True) self.colorspace = colorspace def __iter__(self): """ Yield Sequence objects. """ lengthdiff = 1 if self.colorspace else 0 for fastaread, qualread in zip(self.fastareader, self.qualreader): qualities = _quality_to_ascii(list(map(int, qualread.sequence.split()))) assert fastaread.name == qualread.name if len(qualities) + lengthdiff != len(fastaread.sequence): raise ValueError("Length of quality sequence and length of read do not match (%d+%d!=%d)" % ( len(qualities), lengthdiff, len(fastaread.sequence))) yield Sequence(fastaread.name, fastaread.sequence, qualities) def __enter__(self): if self.fastafile is None: raise ValueError("I/O operation on closed FastaQualReader") return self def __exit__(self, *args): self.fastareader.close() self.qualreader.close() def byte_frequencies(s): return Counter(s) def guess_quality_base(path, limit=10000): """ Guess quality encoding (Phred33/64) of a FASTQ file. Return a tuple (freqs, guess) where freqs are the character frequencies encountered in the first limit records and guess is one of 33, 64, None, indicating which quality encoding this file probably has (None indicates unknown). """ freqs = Counter() with FastqReader(path, mode='rb') as fqr: for record in islice(fqr, 0, limit): freqs += byte_frequencies(record.qualities) if min(freqs) - 64 < -10: guess = 33 elif max(freqs) - 33 > 60: guess = 64 else: guess = None return freqs, guess FastqInfo = namedtuple('FastqInfo', ['instrument', 'run', 'flowcell', 'lane', 'barcode']) def fastq_header(path): """ Inspect a FASTQ file and return a FastqInfo object. If a particular piece of information is unknown, the corresponding attribute of FastqInfo is set to None. 
This will try to auto-detect different types of Illumina headers: @HWI-ST552_0:4:1101:1179:1939#0/1 @HWI_ST139:8:1:1202:1874#GATCAG/1 @HWI-ST344:204:D14G8ACXX:8:1101:1638:2116 1:N:0:CGATGT @MISEQ:56:000000000-A4YM7:1:1101:15071:2257 1:N:0:CTTGTA @FCD20MKACXX:8:1101:1215:2155#TCGTAAGC/1 The format of a FASTQ header starting with CASAVA 1.8 is: :::::: ::: """ try: # line = path.readline() except: if path.endswith('.gz'): openfunc = gzip.open else: openfunc = open with openfunc(path, 'rt') as f: line = f.readline() line = line.rstrip() assert line.startswith('@') line = line[1:] # ignore comment field for now header = line.split(' ', maxsplit=1) read_name = header[0] comment = header[1] if len(header) == 2 else None fields = read_name.split(':') if len(fields) == 7: # probably new CASAVA 1.8 format instrument = fields[0] run_id = int(fields[1]) flowcell = fields[2] lane = int(fields[3]) barcode = None if comment: comment_fields = comment.split(':') if len(comment_fields) == 4: barcode = comment_fields[3] elif len(fields) == 5: if '#' in fields[4]: f = fields[4].split('#', maxsplit=1)[1] barcode = f.split('/', maxsplit=1)[0] else: barcode = None run_id = None lane = int(fields[1]) if fields[0].endswith('XX'): instrument, flowcell = None, fields[0] else: instrument, flowcell = fields[0], None else: raise ValueError("FASTQ header format not recognized") if barcode and not re.match('[acgtnACGTN]+$', barcode): barcode = None return FastqInfo(instrument=instrument, run=run_id, flowcell=flowcell, lane=lane, barcode=barcode) try: from .._helpers import byte_frequencies except: pass marcelm-sqt-d3218a8c5437/sqt/io/gtf.py000066400000000000000000000034321302004573300173300ustar00rootroot00000000000000""" Minimalistic GTF parsing. """ from collections import namedtuple from xopen import xopen GtfRecord = namedtuple('GtfRecord', ['chrom', 'source', 'feature', 'start', 'stop', 'score', 'strand', 'frame', 'attributes' ]) # this parser is very simplistic. 
class SFFFile():
    """
    Reader for 454 .sff (flowgram) files.

    The file header is parsed when the object is created. The reads are
    parsed lazily while iterating over the object (or over reads()); the
    file is re-opened for each iteration.
    """
    _MAGIC = 0x2E736666  # the bytes b'.sff' read as a big-endian unsigned int

    def __init__(self, filename):
        """Open the file named filename and parse its header."""
        self.filename = filename
        with open(filename, mode="rb") as f:
            self._set_info_from_header(f)

    def _process_padding(self, f, p):
        """
        Read p padding bytes from f and check that all of them are zero.

        Raises FormatError if p is not in the range 0..7 or if the bytes
        that were read are not exactly p zero bytes.
        """
        if not (0 <= p < 8):
            raise FormatError("Padding mismatch, padding=" + str(p))
        padder = f.read(p)
        if padder.count(b'\0') != p:
            raise FormatError("Padding seems to contain data")

    def _fread(self, f, fmt):
        """
        Read as many bytes from f as the struct format fmt describes and
        return the unpacked values as a tuple.

        Raises FormatError if fewer bytes than required are available.
        """
        b = calcsize(fmt)
        data = f.read(b)
        if len(data) < b:
            raise FormatError("chunk for " + fmt + " too short: " + str(len(data)) + "/" + str(b))
        return unpack(fmt, data)

    def _set_info_from_header(self, f):
        """
        Read the sff header from f and store its information in self.

        Raises FormatError if the header is malformed or describes an
        unsupported variant of the format.
        """
        _FIXEDLEN = 31
        # Read file header (constant part), length 31 bytes, 9 fields
        # big endian encoding         >
        # magic_number              I  == _MAGIC
        # version                   4B == (0,0,0,1)
        # index_offset              Q
        # index_length              I
        # number_of_reads           I
        # header_length             H  divisible by 8
        # key_length                H
        # number_of_flows_per_read  H
        # flowgram_format_code      B  == 1
        headerformat = '>I4BQIIHHHB'
        assert calcsize(headerformat) == _FIXEDLEN
        (magic_number, ver0, ver1, ver2, ver3, index_offset, index_length,
            number_of_reads, header_length, key_length,
            number_of_flows_per_read,
            flowgram_format_code) = self._fread(f, headerformat)
        if magic_number != SFFFile._MAGIC:
            # format() must be applied to the message, not to the exception
            raise FormatError("Magic number is {} instead of {}".format(magic_number, SFFFile._MAGIC))
        if (ver0, ver1, ver2, ver3) != (0, 0, 0, 1):
            raise FormatError("Unsupported .sff version ({}.{}.{}.{})".format(ver0, ver1, ver2, ver3))
        if (index_offset != 0) ^ (index_length != 0):
            raise FormatError("Index offset is {}, but length is {}".format(index_offset, index_length))
        # NOTE: index offset and length are deliberately not required to be
        # divisible by 8; that check was disabled in this reader.
        if header_length % 8 != 0:
            raise FormatError("Header length must be divisible by 8, but is {}".format(header_length))
        if flowgram_format_code != 1:
            raise FormatError("Flowgram format code {} not supported".format(flowgram_format_code))

        # Read variable part of header:
        # flow_chars     {number_of_flows_per_read}s
        # key_sequence   {key_length}s
        flow_chars = f.read(number_of_flows_per_read)
        key_sequence = f.read(key_length)
        if len(flow_chars) != number_of_flows_per_read or len(key_sequence) != key_length:
            raise FormatError("Header is truncated")
        # padding        *B
        padding = header_length - number_of_flows_per_read - key_length - _FIXEDLEN
        self._process_padding(f, padding)

        # set attributes:
        self.magic_number = magic_number
        self.version = (ver0, ver1, ver2, ver3)
        self.has_index = (index_offset != 0) and (index_length != 0)
        self.index_offset = index_offset
        self.index_length = index_length
        self.number_of_reads = number_of_reads
        self.header_length = header_length
        self.key_sequence = key_sequence  # its length is the key_length field
        self.number_of_flows_per_read = number_of_flows_per_read
        self.flow_chars = flow_chars
        self.flowgram_format_code = flowgram_format_code

    def reads(self):
        """
        Generator function that yields each read in this .sff file as a
        Read object.

        If the index block is located between the header and a read or
        between two reads, it is skipped.
        """
        with open(self.filename, mode="rb") as f:
            f.seek(self.header_length)
            checkindex = self.has_index
            for r in range(self.number_of_reads):
                fpos = f.tell()
                if checkindex and fpos == self.index_offset:
                    # The index sits right here: jump over it and continue
                    # at the next 8-byte boundary, where records start.
                    fpos += self.index_length
                    fpos += (-fpos) % 8
                    f.seek(fpos)
                    checkindex = False
                assert fpos % 8 == 0, "file position {} not divisible by 8".format(fpos)
                yield self._next_read(f, r)

    def __iter__(self):
        return self.reads()

    def _next_read(self, f, readindex):
        """
        Read the next read record from the open .sff file f and return it
        as a Read object. readindex is stored as the index of the read.
        """
        # Read fixed part of read header, 7 fields
        # read_header_length     H
        # name_length            H
        # seq_len                I
        # clip_qual_left         H
        # clip_qual_right        H
        # clip_adapter_left      H
        # clip_adapter_right     H
        header_fmt = ">HHIHHHH"
        (read_header_length, name_length, seq_len,
            clip_qual_left, clip_qual_right,
            clip_adapter_left, clip_adapter_right) = self._fread(f, header_fmt)
        # check format: 16 is the size of the fixed part read above
        expected = ((16 + name_length + 7) // 8) * 8
        if read_header_length != expected:
            raise FormatError("read header length should be 16 + name length, rounded up mod 8")
        # name                   c * name_length
        # padding                B * [to fill]
        name_fmt = ">" + str(name_length) + "s"
        (namebytes,) = self._fread(f, name_fmt)
        name = namebytes.decode()
        padding = read_header_length - (16 + name_length)
        self._process_padding(f, padding)
        # flowgram_values        H * nflows [type may differ with future formats]
        # flow_index_per_base    B * seq_len
        # bases                  c * seq_len
        # quality_scores         B * seq_len
        # padding                B * [to fill]
        datatypes = ".H"  # struct type code, indexed by flowgram format code
        flow_fmt = ">" + str(self.number_of_flows_per_read) + datatypes[self.flowgram_format_code]
        byte_fmt = ">" + str(seq_len) + "B"
        char_fmt = ">" + str(seq_len) + "s"
        flowgram_values = self._fread(f, flow_fmt)
        flow_index_per_base = self._fread(f, byte_fmt)
        if seq_len > 0:
            # convert to 0-based index
            flow_index_per_base = (flow_index_per_base[0] - 1,) + flow_index_per_base[1:]
        (bases,) = self._fread(f, char_fmt)
        quality_scores = self._fread(f, byte_fmt)
        datalen = sum(map(calcsize, (flow_fmt, byte_fmt, char_fmt, byte_fmt)))
        padding = (-datalen) % 8
        self._process_padding(f, padding)
        return Read(readindex, name, self,  # self is the current sff file
            clip_qual_left, clip_qual_right,
            clip_adapter_left, clip_adapter_right,
            flowgram_values, bases, flow_index_per_base, quality_scores)
""" def __init__(self, flowchars, intensities): if len(flowchars) != len(intensities): raise ValueError("lengths of flowchars and intensities must be equal") self.flowchars = flowchars self.intensities = intensities def __iter__(self): return (Flow(*flow) for flow in zip(self.flowchars, self.intensities)) def __len__(self): return len(self.flowchars) def __str__(self): return ' '.join(str(flow) for flow in self) class Read(Flowgram): """ Represent a flowgram read by the following attributes: - index: int, 0-based running number of the read in the sff file - name: bytes, a unique id for the read - sff: SFFFile, from which the read was obtained - flowvalues: tuple of ints - flowchars: bytes, usually b'TACG' * n for some n - qual: tuple of ints, quality values (10log10-representation) - clip: 4-tuple with clipping information - key: bytes, the key sequence (usually b'TCAG') - bases: bytes, the 454-converted sequence representation of the flowvalues - flow_index_per_base: tuple of ints """ def __init__(self, index, name, sff_origin, clip_qual_left, clip_qual_right, clip_adapter_left, clip_adapter_right, flowgram_values, bases, flow_index_per_base, quality_scores): self.index = index self.name = name self.sff = sff_origin self.clip = dict(ql=clip_qual_left, qr=clip_qual_right, al=clip_adapter_left, ar=clip_adapter_right) self.flowchars = sff_origin.flow_chars self.flowvalues = flowgram_values self.intensities = [ v / 100.0 for v in flowgram_values ] self.key = sff_origin.key_sequence self.bases = bases self.flow_index_per_base = flow_index_per_base self.qual = quality_scores self.insert_start = max(1, max(self.clip['ql'], self.clip['al'])) - 1 self.insert_stop = min(len(bases) if self.clip['qr'] == 0 else self.clip['qr'], len(bases) if self.clip['ar'] == 0 else self.clip['ar']) class ClippedRead(Flowgram): """ A read in which both the bases and the flowgram (flowchars and intensities) are clipped to the insert region. 
def frequency_median(frequencies):
    """
    Given a dictionary of frequencies, return the median.

    frequencies maps each value to the number of times it occurs. If the
    total no. of values is even, the left of both middle values is
    returned.

    Raises ValueError if frequencies is empty, since no median exists.
    """
    if not frequencies:
        raise ValueError("cannot compute the median of an empty frequency table")
    # 1-based rank of the (left) middle element in the sorted values
    middle = (1 + sum(frequencies.values())) // 2
    seen = 0  # partial sum of counts
    for value in sorted(frequencies):
        seen += frequencies[value]
        if seen >= middle:
            return value
    # Only reachable with inconsistent counts (e.g. negative frequencies).
    raise ValueError("frequencies do not describe a valid distribution")
def n50(lengths, genome_size=None):
    """
    Compute the N50 (or NG50) of a list of lengths.

    Without genome_size, sum(lengths) is used as reference size and the
    result is the N50. With genome_size, the result is the NG50.

    The lengths are visited from largest to smallest; the first length at
    which the accumulated total reaches half of the reference size is
    returned. None is returned when that never happens, that is, when
    genome_size is greater than 2 * sum(lengths).
    """
    reference_size = sum(lengths) if genome_size is None else genome_size
    threshold = reference_size * 0.5
    accumulated = 0
    for current in sorted(lengths, reverse=True):
        accumulated += current
        if accumulated >= threshold:
            return current
    return None
""" from collections import namedtuple from .cigar import Cigar class SupplementaryAlignment(namedtuple("SupplementaryAlignment", ['reference_name', 'pos', 'strand', 'cigar', 'mapping_quality', 'edit_distance'])): @property def query_start(self): return self.cigar.clipping_left @property def query_end(self): return self.cigar.clipping_left + self.cigar.query_length(count_clipped=None) @property def query_length(self): return self.cigar.query_length(count_clipped=None) @property def reference_end(self): return self.pos + self.cigar.reference_length() def parse_supplementary(sa): """ Parse supplementary alignments given by the SA:Z: tag in a SAM record. Return a list of SupplementaryAlignment objects. """ fields = sa.split(';') assert fields[-1] == '' # sa must end with a ';' alignments = [] for field in fields[:-1]: ref, pos, strand, cig, mapq, edit_dist = field.split(',') pos = int(pos) - 1 cig = Cigar(cig) # All information in the BAM file is relative to the reference. # Since we are interested in information relative to the read, # the CIGAR string needs to be reversed when the strand is '-'. if strand == '-': cig = cig[::-1] mapq = int(mapq) edit_dist = int(edit_dist) assert strand in '+-' a = SupplementaryAlignment(ref, pos, strand, cig, mapq, edit_dist) alignments.append(a) return alignments class AlignedRead: """ An AlignedRead describes all alignments of a read that appears in a SAM/BAM file. This collects all the supplementary alignments for the read in one place and gives a read-centric view of the alignments, instead of a reference-centric one. """ def __init__(self, read, reference_name): """ read -- an AlignedSegment. The SA tag of the read is parsed. 
""" #assert not read.is_secondary, "Read should not be secondary" #assert not read.is_supplementary, "Read should not be supplementary" self.alignments = self._extract_supplementary_alignments(read, reference_name) self.query_name = read.query_name self._primary_read = read # TODO not necessarily correct since assertions were removed above self._length = self.alignments[0].cigar.query_length(count_clipped='hard') #assert self._length == Cigar(read.cigar).query_length(count_clipped='soft'), "Shouldn't be hard-clipped" def __len__(self): """ Number of bases in this read. """ return self._length def __iter__(self): yield from self.alignments def aligned_bases(self): """ Return how many bases are aligned, considering all supplementary alignments. Overlapping supplementary alignments are allowed: Each base is counted at most once! """ events = [] for alignment in self.alignments: events.append((alignment.query_start, 'start', None)) events.append((alignment.query_end, 'stop', alignment)) depth = 0 # number of observed 'start' events bases = 0 # number of covered bases last_qstart = None for qpos, what, alignment in sorted(events, key=lambda x: x[0]): if what == 'start': if depth == 0: last_qstart = qpos depth += 1 elif what == 'stop': depth -= 1 if depth == 0: # interval (last_qstart, qpos) was covered bases += qpos - last_qstart return bases def _extract_supplementary_alignments(self, aligned_segment, reference_name): """ Given a single entry in a BAM file (an AlignedSegment) that potentially has supplementary alignments specified by the SA tag, return a list of SupplementaryAlignment objects. The list includes at least the aligned segment itself. 
""" tags = dict(aligned_segment.tags) cig = Cigar(aligned_segment.cigar) if aligned_segment.is_reverse: strand = '-' cig = cig[::-1] else: strand = '+' alignments = [SupplementaryAlignment(reference_name, aligned_segment.pos, strand, cig, aligned_segment.mapq, tags['NM'])] if 'SA' in tags: alignments.extend(parse_supplementary(tags['SA'])) if __debug__: l = alignments[0].cigar.query_length(count_clipped='hard') for alignment in alignments[1:]: assert alignment.cigar.query_length(count_clipped='hard') == l return alignments marcelm-sqt-d3218a8c5437/sqt/region.py000066400000000000000000000047341302004573300174320ustar00rootroot00000000000000""" Model an interval on a reference. """ class Region: def __init__(self, specification, start=None, stop=None, reverse_complement=False): """ specification -- description of the region as a string, such as "chr14:22-111" If start is given, the specification is considered to be a reference and the parameters start, stop, reverse_complement are used directly. """ if start is None: self.reference, self.start, self.stop, self.is_reverse_complement = self._parse_region(specification) else: self.reference = specification self.start = start self.stop = stop self.is_reverse_complement = reverse_complement @staticmethod def _parse_region(s): """ Parse a string like "name:begin-end" or "name:begin..end". The returned tuple is (name, start, stop, revcomp). start is begin-1, stop is equal to end. That is, this function converts from 1-based intervals to pythonic open intervals! The string may be prefixed with "rc:", in which case revcomp is set to True. If 'end' is an empty string (as in "chrx:1-"), then stop is set to None. If no range is given, as in "chrx:27", then stop is set to start+1. If only 'name' is given (or "rc:name"), start is set to 0 and stop to None. Commas within the numbers (thousands separators) are ignored. 
""" revcomp = False if s.startswith('rc:'): revcomp = True s = s[3:] fields = s.rsplit(':', 1) if len(fields) == 1: region = (fields[0], 0, None, revcomp) else: if '..' in fields[1]: sep = '..' else: sep = '-' coords = fields[1].split(sep, maxsplit=1) start = int(coords[0].replace(',', '')) if len(coords) == 1: stop = start else: stop = int(coords[1].replace(',', '')) if coords[1] != '' else None assert 0 < start and (stop is None or start <= stop) region = (fields[0], start-1, stop, revcomp) return region def __str__(self): """ """ prefix = 'rc:' if self.is_reverse_complement else '' if self.start == 0 and self.stop is None: return prefix + self.reference if self.start + 1 == self.stop: return "{}{}:{}".format(prefix, self.reference, self.start+1) stop = '' if self.stop is None else self.stop return "{}{}:{}-{}".format(prefix, self.reference, self.start+1, stop) def __repr__(self): return "Region({!r})".format(str(self)) def __eq__(self, other): return (self.reference == other.reference and self.start == other.start and self.stop == other.stop and self.is_reverse_complement == other.is_reverse_complement) marcelm-sqt-d3218a8c5437/sqt/utils.py000066400000000000000000000011361302004573300173000ustar00rootroot00000000000000import os import re def available_cpu_count(): """ Number of available virtual or physical CPUs on this system. 
Adapted from http://stackoverflow.com/a/1006301/715090 """ # cpuset may restrict the number of available processors cpus = 1 try: m = re.search(r'(?m)^Cpus_allowed:\s*(.*)$', open('/proc/self/status').read()) if m: res = bin(int(m.group(1).replace(',', ''), 16)).count('1') if res > 0: cpus = res except IOError: pass try: import multiprocessing cpus = min(cpus, multiprocessing.cpu_count()) except (ImportError, NotImplementedError): pass return cpus marcelm-sqt-d3218a8c5437/tests/000077500000000000000000000000001302004573300161205ustar00rootroot00000000000000marcelm-sqt-d3218a8c5437/tests/__init__.py000066400000000000000000000000001302004573300202170ustar00rootroot00000000000000marcelm-sqt-d3218a8c5437/tests/indexed.fasta000066400000000000000000000005051302004573300205600ustar00rootroot00000000000000>a comment GAGTTTTATC GCTTCCATGA CGCAGAAGTT AACACTTTCG GATATTTCTG ATGAGT >b comment CGAAAAATTATCTT GATAAAGCAGGAAT TACTACTGCTTGTT TACGAATTAAA >c comment TCGAAGTGGACTGCTGGCGGAAAATGAGAAA ATTCGACCTATCCTTGCGCAGCTCGAGAAGC TCTTACTTTGCGACCTTTCGCCATCAACTAA CGATTCTGTCAAAAACTGACGCGTTGGATGA G >d comment G A G A A >e comment GT GG CT T marcelm-sqt-d3218a8c5437/tests/indexed.fasta.fai000066400000000000000000000001041302004573300213110ustar00rootroot00000000000000a 56 11 10 11 b 53 84 14 15 c 125 152 31 32 d 5 293 1 2 e 7 314 2 3 marcelm-sqt-d3218a8c5437/tests/mirnas.fa000066400000000000000000002154741302004573300177360ustar00rootroot00000000000000>hsa-miR-590-5p GAGCTTATTCATAAAAGTGCAG >hsa-miR-19b-2-5p AGTTTTGCAGGTTTGCATTTCA >hsa-miR-361-3p TCCCCCAGGTGTGATTCTGATTT >hsa-miR-2467-3p AGCAGAGGCAGAGAGGCTCAGG >hsa-miR-4675 GGGGCTGTGATTGACCAGCAGG >hsa-miR-3663-3p TGAGCACCACACAGGCCGGGCGC >hsa-miR-608 AGGGGTGGTGTTGGGACAGCTCCGT >hsa-miR-3118 TGTGACTGCATTATGAAAATTCT >hsa-miR-202-3p AGAGGTATAGGGCATGGGAA >hsa-miR-3193 TCCTGCGTAGGATCTGAGGAGT >hsa-miR-346 TGTCTGCCCGCATGCCTGCCTCT >hsa-miR-34a-5p TGGCAGTGTCTTAGCTGGTTGT >hsa-miR-647 GTGGCTGCACTCACTTCCTTC >hsa-miR-5584-3p TAGTTCTTCCCTTTGCCCAATT 
>hsa-miR-548aw GTGCAAAAGTCATCACGGTT >hsa-miR-148b-3p TCAGTGCATCACAGAACTTTGT >hsa-miR-5680 GAGAAATGCTGGACTAATCTGC >hsa-miR-4745-3p TGGCCCGGCGACGTCTCACGGTC >hsa-miR-1276 TAAAGAGCCCTGTGGAGACA >hsa-miR-17-3p ACTGCAGTGAAGGCACTTGTAG >hsa-miR-1228-5p GTGGGCGGGGGCAGGTGTGTG >hsa-miR-518a-5p___hsa-miR-527 CTGCAAAGGGAAGCCCTTTC >hsa-miR-4754 ATGCGGACCTGGGTTAGCGGAGT >hsa-miR-1273e TTGCTTGAACCCAGGAAGTGGA >hsa-miR-4733-5p AATCCCAATGCTAGACCCGGTG >hsa-miR-4313 AGCCCCCTGGCCCCAAACCC >hsa-miR-4749-3p CGCCCCTCCTGCCCCCACAG >hsa-miR-1273d GAACCCATGAGGTTGAGGCTGCAGT >hsa-miR-374b-5p ATATAATACAACCTGCTAAGTG >hsa-miR-610 TGAGCTAAATGTGTGCTGGGA >hsa-miR-4729 TCATTTATCTGTTGGGAAGCTA >hsa-miR-455-3p GCAGTCCATGGGCATATACAC >hsa-miR-4662a-3p AAAGATAGACAATTGGCTAAAT >hsa-miR-301a-3p CAGTGCAATAGTATTGTCAAAGC >hsa-miR-542-5p TCGGGGATCATCATGTCACGAGA >hsa-miR-20b-3p ACTGTAGTATGGGCACTTCCAG >hsa-miR-515-5p TTCTCCAAAAGAAAGCACTTTCTG >hsa-miR-4514 ACAGGCAGGATTGGGGAA >hsa-miR-3074-3p GATATCAGCTCAGTAGGCACCG >hsa-miR-548a-3p CAAAACTGGCAATTACTTTTGC >hsa-miR-632 GTGTCTGCTTCCTGTGGGA >hsa-miR-488-3p TTGAAAGGCTATTTCTTGGTC >hsa-miR-3942-5p AAGCAATACTGTTACCTGAAAT >hsa-miR-3684 TTAGACCTAGTACACGTCCTT >hsa-miR-619 GACCTGGACATGTTTGTGCCCAGT >hsa-miR-377-5p AGAGGTTGCCCTTGGTGAATTC >hsa-miR-151a-3p CTAGACTGAAGCTCCTTGAGG >hsa-miR-551b-5p GAAATCAAGCGTGGGTGAGACC >hsa-miR-154-3p AATCATACACGGTTGACCTATT >hsa-miR-193b-5p CGGGGTTTTGAGGGCGAGATGA >hsa-miR-4263 ATTCTAAGTGCCTTGGCC >hsa-miR-345-5p GCTGACTCCTAGTCCAGGGCTC >hsa-miR-5684 AACTCTAGCCTGAGCAACAG >hsa-miR-3064-3p TTGCCACACTGCAACACCTTACA >hsa-miR-1470 GCCCTCCGCCCGTGCACCCCG >hsa-miR-337-3p CTCCTATATGATGCCTTTCTTC >hsa-miR-3121-3p TAAATAGAGTAGGCAAAGGACA >hsa-miR-155-3p CTCCTACATATTAGCATTAACA >hsa-miR-3685 TTTCCTACCCTACCTGAAGACT >hsa-miR-330-3p GCAAAGCACACGGCCTGCAGAGA >hsa-miR-4700-5p TCTGGGGATGAGGACAGTGTGT >hsa-miR-141-3p TAACACTGTCTGGTAAAGATGG >hsa-miR-106a-5p AAAAGTGCTTACAGTGCAGGTAG >hsa-miR-1277-3p TACGTAGATATATATGTATTTT >hsa-miR-3545-5p TAGTGGTCCTAAACATTTCACA >hsa-miR-4312 
GGCCTTGTTCCTGTCCCCA >hsa-miR-5701 TTATTGTCACGTTCTGATT >hsa-miR-148b-5p AAGTTCTGTTATACACTCAGGC >hsa-miR-183-3p GTGAATTACCGAAGGGCCATAA >hsa-miR-633 CTAATAGTATCTACCACAATAAA >hsa-miR-661 TGCCTGGGTCTCTGGCCTGCGCGT >hsa-miR-4453 GAGCTTGGTCTGTAGCGGTT >hsa-miR-4789-3p CACACATAGCAGGTGTATATA >hsa-miR-642b-3p AGACACATTTGGAGAGGGACCC >hsa-miR-3607-5p GCATGTGATGAAGCAAATCAGT >hsa-miR-3673 ATGGAATGTATATACGGAATA >hsa-miR-3916 AAGAGGAAGAAATGGCTGGTTCTCAG >hsa-miR-218-1-3p ATGGTTCCGTCAAGCACCATGG >hsa-miR-4708-3p AGCAAGGCGGCATCTCTCTGAT >hsa-miR-1247-3p CCCCGGGAACGTCGAGACTGGAGC >hsa-miR-367-5p ACTGTTGCTAATATGCAACTCT >hsa-miR-3171 AGATGTATGGAATCTGTATATATC >hsa-miR-4669 TGTGTCCGGGAAGTGGAGGAGG >hsa-miR-3680-5p GACTCACTCACAGGATTGTGCA >hsa-miR-519a-3p AAAGTGCATCCTTTTAGAGTGT >hsa-miR-1204 TCGTGGCCTGGTCTCCATTAT >hsa-miR-769-5p TGAGACCTCTGGGTTCTGAGCT >hsa-miR-25-5p AGGCGGAGACTTGGGCAATTG >hsa-miR-4753-3p TTCTCTTTCTTTAGCCTTGTGT >hsa-miR-548u CAAAGACTGCAATTACTTTTGCG >hsa-miR-595 GAAGTGTGCCGTGGTGTGTCT >hsa-let-7e-5p TGAGGTAGGAGGTTGTATAGTT >hsa-miR-224-3p AAAATGGTGCCCTAGTGACTACA >hsa-miR-548au-3p TGGCAGTTACTTTTGCACCAG >hsa-miR-3652 CGGCTGGAGGTGTGAGGA >hsa-miR-643 ACTTGTATGCTAGCTCAGGTAG >hsa-miR-4786-5p TGAGACCAGGACTGGATGCACC >hsa-miR-4638-5p ACTCGGCTGCGGTGGACAAGT >hsa-miR-500b AATCCTTGCTACCTGGGT >hsa-miR-34a-3p CAATCAGCAAGTATACTGCCCT >hsa-miR-3925-5p AAGAGAACTGAAAGTGGAGCCT >hsa-miR-452-5p AACTGTTTGCAGAGGAAACTGA >hsa-miR-548j AAAAGTAATTGCGGTCTTTGGT >hsa-miR-4260 CTTGGGGCATGGAGTCCCA >hsa-miR-5588-3p AAGTCCCACTAATGCCAGC >hsa-miR-545-3p TCAGCAAACATTTATTGTGTGC >hsa-miR-552 AACAGGTGACTGGTTAGACAA >hsa-miR-4296 ATGTGGGCTCAGGCTCA >hsa-miR-1256 AGGCATTGACTTCTCACTAGCT >hsa-miR-3147 GGTTGGGCAGTGAGGAGGGTGTGA >hsa-miR-29c-5p TGACCGATTTCTCCTGGTGTTC >hsa-miR-4693-3p TGAGAGTGGAATTCACAGTATTT >hsa-miR-5090 CCGGGGCAGATTGGTGTAGGGTG >hsa-miR-4707-3p AGCCCGCCCCAGCCGAGGTTCT >hsa-miR-4670-5p AAGCGACCATGATGTAACTTCA >hsa-miR-200b-5p CATCTTACTGGGCAGCATTGGA >hsa-miR-3189-5p TGCCCCATCTGTGCCCTGGGTAGGA >hsa-miR-4775 
TTAATTTTTTGTTTCGGTCACT >hsa-miR-3689b-3p___hsa-miR-3689c CTGGGAGGTGTGATATTGTGGT >hsa-miR-190a TGATATGTTTGATATATTAGGT >hsa-miR-5004-3p CTTGGATTTTCCTGGGCCTCAG >hsa-miR-4632 TGCCGCCCTCTCGCTGCTCTAG >hsa-miR-3687 CCCGGACAGGCGTTCGTGCGACGT >hsa-miR-4749-5p TGCGGGGACAGGCCAGGGCATC >hsa-miR-372 AAAGTGCTGCGACATTTGAGCGT >hsa-miR-483-3p TCACTCCTCTCCTCCCGTCTT >hsa-miR-2114-3p CGAGCCTCAAGCAAGGGACTT >hsa-miR-23b-5p TGGGTTCCTGGCATGCTGATTT >hsa-miR-3184-5p TGAGGGGCCTCAGACCGAGCTTTT >hsa-miR-431-3p CAGGTCGTCTTGCAGGGCTTCT >hsa-miR-569 AGTTAATGAATCCTGGAAAGT >hsa-miR-320e AAAGCTGGGTTGAGAAGG >hsa-miR-423-5p TGAGGGGCAGAGAGCGAGACTTT >hsa-miR-425-3p ATCGGGAATGTCGTGTCCGCCC >hsa-miR-26a-2-3p CCTATTCTTGATTACTTGTTTC >hsa-miR-711 GGGACCCAGGGAGAGACGTAAG >hsa-miR-541-5p AAAGGATTCTGCTGTCGGTCCCACT >hsa-miR-4520b-3p TTTGGACAGAAAACACGCAGGT >hsa-miR-3155b CCAGGCTCTGCAGTGGGA >hsa-miR-4753-5p CAAGGCCAAAGGAAGAGAACAG >hsa-miR-5683 TACAGATGCAGATTCTCTGACTTC >hsa-miR-4739 AAGGGAGGAGGAGCGGAGGGGCCCT >hsa-miR-4490 TCTGGTAAGAGATTTGGGCATA >hsa-miR-4678 AAGGTATTGTTCAGACTTATGA >hsa-miR-1288 TGGACTGCCCTGATCTGGAGA >hsa-miR-133b TTTGGTCCCCTTCAACCAGCTA >hsa-miR-133a TTTGGTCCCCTTCAACCAGCTG >hsa-miR-5007-3p ATCATATGAACCAAACTCTAAT >hsa-miR-572 GTCCGCTCGGCGGTGGCCCA >hsa-miR-4793-5p ACATCCTGCTCCACAGGGCAGAGG >hsa-miR-548as-3p TAAAACCCACAATTATGTTTGT >hsa-miR-19a-5p AGTTTTGCATAGTTGCACTACA >hsa-miR-5587-5p ATGGTCACCTCCGGGACT >hsa-miR-105-5p TCAAATGCTCAGACTCCTGTGGT >hsa-miR-4482-3p TTTCTATTTCTCAGTGGGGCTC >hsa-miR-1228-3p TCACACCTGCCTCGCCCCCC >hsa-miR-1185-2-3p ATATACAGGGGGAGACTCTCAT >hsa-miR-3670 AGAGCTCACAGCTGTCCTTCTCTA >hsa-miR-4715-3p GTGCCACCTTAACTGCAGCCAAT >hsa-miR-639 ATCGCTGCGGTTGCGAGCGCTGT >hsa-miR-5682 GTAGCACCTTGCAGGATAAGGT >hsa-miR-302a-3p TAAGTGCTTCCATGTTTTGGTGA >hsa-miR-4776-5p GTGGACCAGGATGGCAAGGGCT >hsa-miR-302b-5p ACTTTAACATGGAAGTGCTTTC >hsa-miR-4487 AGAGCTGGCTGAAGGGCAG >hsa-miR-1539 TCCTGCGCGTCCCAGATGCCC >hsa-miR-126-5p CATTATTACTTTTGGTACGCG >hsa-miR-522-3p AAAATGGTTCCCTTTAGAGTGT >hsa-miR-4774-3p 
ATTGCCTAACATGTGCCAGAA >hsa-miR-590-3p TAATTTTATGTATAAGCTAGT >hsa-miR-3688-3p TATGGAAAGACTTTGCCACTCT >hsa-miR-4727-3p ATAGTGGGAAGCTGGCAGATTC >hsa-miR-4726-3p ACCCAGGTTCCCTCTGGCCGCA >hsa-miR-646 AAGCAGCTGCCTCTGAGGC >hsa-miR-149-5p TCTGGCTCCGTGTCTTCACTCCC >hsa-miR-3152-3p TGTGTTAGAATAGGGGCAATAA >hsa-miR-1280 TCCCACCGCTGCCACCC >hsa-miR-194-5p TGTAACAGCAACTCCATGTGGA >hsa-miR-146a-5p TGAGAACTGAATTCCATGGGTT >hsa-miR-3651 CATAGCCCGGTCGCTGGTACATGA >hsa-miR-4697-3p TGTCAGTGACTCCTGCCCCTTGGT >hsa-miR-4731-3p CACACAAGTGGCCCCCAACACT >hsa-miR-429 TAATACTGTCTGGTAAAACCGT >hsa-miR-4707-5p GCCCCGGCGCGGGCGGGTTCTGG >hsa-miR-191-3p GCTGCGCTTGGATTTCGTCCCC >hsa-miR-5697 TCAAGTAGTTTCATGATAAAGG >hsa-miR-1178 TTGCTCACTGTTCTTCCCTAG >hsa-miR-3154 CAGAAGGGGAGTTGGGAGCAGA >hsa-miR-4677-5p TTGTTCTTTGGTCTTTCAGCCA >hsa-miR-3545-3p TTGAACTGTTAAGAACCACTGGA >hsa-miR-4526 GCTGACAGCAGGGCTGGCCGCT >hsa-miR-1255b-5p CGGATGAGCAAAGAAAGTGGTT >hsa-miR-624-3p CACAAGGTATTGGTATTACCT >hsa-miR-4787-5p GCGGGGGTGGCGGCGGCATCCC >hsa-miR-3976 TATAGAGAGCAGGAAGATTAATGT >hsa-miR-3908 GAGCAATGTAGGTAGACTGTTT >hsa-miR-4515 AGGACTGGACTCCCGGCAGCCC >hsa-miR-4772-5p TGATCAGGCAAAATTGCAGACT >hsa-miR-3654 GACTGGACAAGCTGAGGAA >hsa-miR-5006-3p TTTCCCTTTCCATCCTGGCAG >hsa-miR-4656 TGGGCTGAGGGCAGGAGGCCTGT >hsa-miR-3610 GAATCGGAAAGGAGGCGCCG >hsa-miR-767-3p TCTGCTCATACCCCATGGTTTCT >hsa-miR-130b-3p CAGTGCAATGATGAAAGGGCAT >hsa-miR-3194-3p AGCTCTGCTGCTCACTGGCAGT >hsa-miR-3660 ACTGACAGGAGAGCATTTTGA >hsa-miR-3187-5p CCTGGGCAGCGTGTGGCTGAAGG >hsa-miR-1247-5p ACCCGTCCCGTTCGTCCCCGGA >hsa-miR-4720-5p CCTGGCATATTTGGTATAACTT >hsa-miR-182-5p TTTGGCAATGGTAGAACTCACACT >hsa-miR-3653 CTAAGAAGTTGACTGAAG >hsa-miR-199b-5p CCCAGTGTTTAGACTATCTGTTC >hsa-miR-3189-3p CCCTTGGGTCTGATGGGGTAG >hsa-miR-5588-5p ACTGGCATTAGTGGGACTTTT >hsa-miR-4800-3p CATCCGTCCGTCTGTCCAC >hsa-miR-4497 CTCCGGGACGGCTGGGC >hsa-miR-2277-3p TGACAGCGCCCTGCCTGGCTC >hsa-miR-4694-5p AGGTGTTATCCTATCCATTTGC >hsa-miR-181b-3p CTCACTGAACAATGAATGCAA >hsa-miR-484 TCAGGCTCAGTCCCCTCCCGAT >hsa-miR-4659a-5p 
CTGCCATGTCTAAGAAGAAAAC >hsa-miR-188-5p CATCCCTTGCATGGTGGAGGG >hsa-miR-4689 TTGAGGAGACATGGTGGGGGCC >hsa-miR-3177-5p TGTGTACACACGTGCCAGGCGCT >hsa-miR-3124-3p ACTTTCCTCACTCCCGTGAAGT >hsa-miR-1269b CTGGACTGAGCCATGCTACTGG >hsa-miR-1292 TGGGAACGGGTTCCGGCAGACGCTG >hsa-miR-761 GCAGCAGGGTGAAACTGACACA >hsa-miR-550a-5p AGTGCCTGAGGGAGTAAGAGCCC >hsa-miR-199a-3p___hsa-miR-199b-3p ACAGTAGTCTGCACATTGGTTA >hsa-miR-4314 CTCTGGGAAATGGGACAG >hsa-miR-3130-3p GCTGCACCGGAGACTGGGTAA >hsa-miR-30e-5p TGTAAACATCCTTGACTGGAAG >hsa-miR-451a AAACCGTTACCATTACTGAGTT >hsa-miR-4538 GAGCTTGGATGAGCTGGGCTGA >hsa-miR-211-3p GCAGGGACAGCAAAGGGGTGC >hsa-miR-875-5p TATACCTCAGTTTTATCAGGTG >hsa-miR-887 GTGAACGGGCGCCATCCCGAGG >hsa-miR-4327 GGCTTGCATGGGGGACTGG >hsa-miR-2115-3p CATCAGAATTCATGGAGGCTAG >hsa-miR-3126-5p TGAGGGACAGATGCCAGAAGCA >hsa-miR-4428 CAAGGAGACGGGAACATGGAGC >hsa-miR-3186-3p TCACGCGGAGAGATGGCTTTG >hsa-miR-4685-3p TCTCCCTTCCTGCCCTGGCTAG >hsa-miR-539-5p GGAGAAATTATCCTTGGTGTGT >hsa-miR-1184 CCTGCAGCGACTTGATGGCTTCC >hsa-miR-1277-5p AAATATATATATATATGTACGTAT >hsa-miR-1197 TAGGACACATGGTCTACTTCT >hsa-miR-9-5p TCTTTGGTTATCTAGCTGTATGA >hsa-miR-4269 GCAGGCACAGACAGCCCTGGC >hsa-miR-449b-5p AGGCAGTGTATTGTTAGCTGGC >hsa-miR-4289 GCATTGTGCAGGGCTATCA >hsa-miR-4447 GGTGGGGGCTGTTGTTT >hsa-miR-4701-3p ATGGGTGATGGGTGTGGTGT >hsa-miR-33b-5p GTGCATTGCTGTTGCATTGC >hsa-miR-640 ATGATCCAGGAACCTGCCTCT >hsa-miR-10b-3p ACAGATTCGATTCTAGGGGAAT >hsa-miR-874 CTGCCCTGGCCCGAGGGACCGA >hsa-miR-3686 ATCTGTAAGAGAAAGTAAATGA >hsa-miR-606 AAACTACTGAAAATCAAAGAT >hsa-miR-1295a TTAGGCCGCAGATCTGGGTGA >hsa-miR-19b-1-5p AGTTTTGCAGGTTTGCATCCAGC >hsa-miR-4259 CAGTTGGGTCTAGGGGTCAGGA >hsa-miR-523-3p GAACGCGCTTCCCTATAGAGGGT >hsa-miR-616-3p AGTCATTGGAGGGTTTGAGCAG >hsa-miR-34c-5p AGGCAGTGTAGTTAGCTGATTGC >hsa-miR-5692a CAAATAATACCACAGTGGGTGT >hsa-miR-143-3p TGAGATGAAGCACTGTAGCTC >hsa-miR-4769-3p TCTGCCATCCTCCCTCCCCTAC >hsa-miR-379-3p TATGTAACATGGTCCACTAACT >hsa-miR-548t-3p___hsa-miR-548aa AAAAACCACAATTACTTTTGCACCA >hsa-miR-4752 TTGTGGATCTCAAGGATGTGCT 
>hsa-miR-4279 CTCTCCTCCCGGCTTC >hsa-miR-764 GCAGGTGCTCACTTGTCCTCCT >hsa-miR-4649-3p TCTGAGGCCTGCCTCTCCCCA >hsa-miR-4438 CACAGGCTTAGAAAAGACAGT >hsa-miR-498 TTTCAAGCCAGGGGGCGTTTTTC >hsa-miR-4633-5p ATATGCCTGGCTAGCTCCTC >hsa-miR-1281 TCGCCTCCTCCTCTCCC >hsa-miR-23a-3p ATCACATTGCCAGGGATTTCC >hsa-miR-3920 ACTGATTATCTTAACTCTCTGA >hsa-miR-379-5p TGGTAGACTATGGAACGTAGG >hsa-miR-1225-5p GTGGGTACGGCCCAGTGGGGGG >hsa-miR-510 TACTCAGGAGAGTGGCAATCAC >hsa-miR-300 TATACAAGGGCAGACTCTCTCT >hsa-miR-3168 GAGTTCTACAGTCAGAC >hsa-miR-3616-3p CGAGGGCATTTCATGATGCAGGC >hsa-miR-382-3p AATCATTCACGGACAACACTT >hsa-miR-4320 GGGATTCTGTAGCTTCCT >hsa-miR-3682-5p CTACTTCTACCTGTGTTATCAT >hsa-miR-377-3p ATCACACAAAGGCAACTTTTGT >hsa-miR-576-5p ATTCTAATTTCTCCACGTCTTT >hsa-miR-302f TAATTGCTTCCATGTTT >hsa-miR-4281 GGGTCCCGGGGAGGGGGG >hsa-miR-3176 ACTGGCCTGGGACTACCGG >hsa-miR-675-5p TGGTGCGGAGAGGGCCCACAGTG >hsa-miR-548y AAAAGTAATCACTGTTTTTGCC >hsa-miR-653 GTGTTGAAACAATCTCTACTG >hsa-miR-4324 CCCTGAGACCCTAACCTTAA >hsa-miR-548an AAAAGGCATTGTGGTTTTTG >hsa-miR-557 GTTTGCACGGGTGGGCCTTGTCT >hsa-miR-24-1-5p TGCCTACTGAGCTGATATCAGT >hsa-miR-3913-5p TTTGGGACTGATCTTGATGTCT >hsa-miR-3120-3p CACAGCAAGTGTAGACAGGCA >hsa-miR-1306-5p CCACCTCCCCTGCAAACGTCCA >hsa-miR-3180-5p CTTCCAGACGCTCCGCCCCACGTCG >hsa-miR-1260a ATCCCACCTCTGCCACCA >hsa-miR-5580-3p CACATATGAAGTGAGCCAGCAC >hsa-miR-4534 GGATGGAGGAGGGGTCT >hsa-miR-4424 AGAGTTAACTCAAAATGGACTA >hsa-miR-204-3p GCTGGGAAGGCAAAGGGACGT >hsa-miR-206 TGGAATGTAAGGAAGTGTGTGG >hsa-miR-518e-3p AAAGCGCTTCCCTTCAGAGTG >hsa-miR-562 AAAGTAGCTGTACCATTTGC >hsa-miR-374a-3p CTTATCAGATTGTATTGTAATT >hsa-miR-542-3p TGTGACAGATTGATAACTGAAA >hsa-miR-668 TGTCACTCGGCTCGGCCCACTAC >hsa-miR-96-5p TTTGGCACTAGCACATTTTTGCT >hsa-miR-615-5p GGGGGTCCCCGGTGCTCGGATC >hsa-miR-3659 TGAGTGTTGTCTACGAGGGCA >hsa-miR-1183 CACTGTAGGTGATGGTGAGAGTGGGCA >hsa-miR-3622b-3p TCACCTGAGCTCCCGTGCCTG >hsa-miR-198 GGTCCAGAGGGGAGATAGGTTC >hsa-miR-520d-3p AAAGTGCTTCTCTTTGGTGGGT >hsa-miR-4319 TCCCTGAGCAAAGCCAC >hsa-miR-4755-3p 
AGCCAGGCTCTGAAGGGAAAGT >hsa-miR-4762-5p CCAAATCTTGATCAGAAGCCT >hsa-miR-4642 ATGGCATCGTCCCCTGGTGGCT >hsa-miR-1270 CTGGAGATATGGAAGAGCTGTGT >hsa-miR-1 TGGAATGTAAAGAAGTATGTAT >hsa-miR-574-5p TGAGTGTGTGTGTGTGAGTGTGT >hsa-miR-3646 AAAATGAAATGAGCCCAGCCCA >hsa-miR-5008-5p TGAGGCCCTTGGGGCACAGTGG >hsa-miR-585 TGGGCGTATCTGTATGCTA >hsa-miR-4430 AGGCTGGAGTGAGCGGAG >hsa-miR-3911 TGTGTGGATCCTGGAGGAGGCA >hsa-miR-500a-3p ATGCACCTGGGCAAGGATTCTG >hsa-miR-188-3p CTCCCACATGCAGGGTTTGCA >hsa-miR-378h ACTGGACTTGGTGTCAGATGG >hsa-let-7a-2-3p CTGTACAGCCTCCTAGCTTTCC >hsa-miR-4695-3p TGATCTCACCGCTGCCTCCTTC >hsa-miR-3677-5p CAGTGGCCAGAGCCCTGCAGTG >hsa-miR-3173-3p AAAGGAGGAAATAGGCAGGCCA >hsa-miR-4434 AGGAGAAGTAAAGTAGAA >hsa-miR-2116-5p GGTTCTTAGCATAGGAGGTCT >hsa-miR-21-5p TAGCTTATCAGACTGATGTTGA >hsa-miR-3122 GTTGGGACAAGAGGACGGTCTT >hsa-miR-335-5p TCAAGAGCAATAACGAAAAATGT >hsa-miR-2277-5p AGCGCGGGCTGAGCGCTGCCAGTC >hsa-miR-614 GAACGCCTGTTCTTGCCAGGTGG >hsa-miR-4711-3p CGTGTCTTCTGGCTTGAT >hsa-miR-1271-3p AGTGCCTGCTATGTGCCAGGCA >hsa-miR-518b CAAAGCGCTCCCCTTTAGAGGT >hsa-miR-4666b TTGCATGTCAGATTGTAATTCCC >hsa-miR-3202 TGGAAGGGAGAAGAGCTTTAAT >hsa-miR-1972 TCAGGCCAGGCACAGTGGCTCA >hsa-miR-216b AAATCTCTGCAGGCAAATGTGA >hsa-miR-891b TGCAACTTACCTGAGTCATTGA >hsa-miR-4465 CTCAAGTAGTCTGACCAGGGGA >hsa-miR-4315 CCGCTTTCTGAGCTGGAC >hsa-miR-20a-3p ACTGCATTATGAGCACTTAAAG >hsa-miR-4794 TCTGGCTATCTCACGAGACTGT >hsa-miR-626 AGCTGTCTGAAAATGTCTT >hsa-miR-3140-5p ACCTGAATTACCAAAAGCTTT >hsa-miR-324-5p CGCATCCCCTAGGGCATTGGTGT >hsa-miR-342-5p AGGGGTGCTATCTGTGATTGA >hsa-miR-1324 CCAGACAGAATTCTATGCACTTTC >hsa-miR-4445-3p CACGGCAAAAGAAACAATCCA >hsa-miR-4254 GCCTGGAGCTACTCCACCATCTC >hsa-miR-4723-5p TGGGGGAGCCATGAGATAAGAGCA >hsa-miR-3679-5p TGAGGATATGGCAGGGAAGGGGA >hsa-miR-3174 TAGTGAGTTAGAGATGCAGAGCC >hsa-miR-4645-5p ACCAGGCAAGAAATATTGT >hsa-miR-3678-3p CTGCAGAGTTTGTACGGACCGG >hsa-miR-1227 CGTGCCACCCTTTTCCCCAG >hsa-miR-1343 CTCCTGGGGCCCGCACTCTCGC >hsa-miR-568 ATGTATAAATGTATACACAC >hsa-miR-4450 TGGGGATTTGGAGAAGTGGTGA >hsa-miR-589-3p 
TCAGAACAAATGCCGGTTCCCAGA >hsa-miR-4495 AATGTAAACAGGCTTTTTGCT >hsa-miR-5590-3p AATAAAGTTCATGTATGGCAA >hsa-miR-125a-3p ACAGGTGAGGTTCTTGGGAGCC >hsa-miR-339-3p TGAGCGCCTCGACGACAGAGCCG >hsa-miR-99a-3p CAAGCTCGCTTCTATGGGTCTG >hsa-miR-135b-3p ATGTAGGGCTAAAAGCCATGGG >hsa-miR-185-3p AGGGGCTGGCTTTCCTCTGGTC >hsa-miR-4433-3p ACAGGAGTGGGGGTGGGACAT >hsa-miR-424-5p CAGCAGCAATTCATGTTTTGAA >hsa-miR-3613-3p ACAAAAAAAAAAGCCCAACCCTTC >hsa-miR-939 TGGGGAGCTGAGGCTCTGGGGGTG >hsa-miR-212-3p TAACAGTCTCCAGTCACGGCC >hsa-let-7g-3p CTGTACAGGCCACTGCCTTGC >hsa-miR-3691-5p AGTGGATGATGGAGACTCGGTAC >hsa-miR-4460 ATAGTGGTTGTGAATTTACCTT >hsa-miR-4671-5p ACCGAAGACTGTGCGCTAATCT >hsa-miR-5006-5p TTGCCAGGGCAGGAGGTGGAA >hsa-miR-548ac CAAAAACCGGCAATTACTTTTG >hsa-miR-5699 TCCTGTCTTTCCTTGTTGGAGC >hsa-miR-485-3p GTCATACACGGCTCTCCTCTCT >hsa-miR-4796-3p TAAAGTGGCAGAGTATAGACAC >hsa-miR-4431 GCGACTCTGAAAACTAGAAGGT >hsa-miR-450a-5p TTTTGCGATGTGTTCCTAATAT >hsa-miR-600 ACTTACAGACAAGAGCCTTGCTC >hsa-miR-611 GCGAGGACCCCTCGGGGTCTGAC >hsa-miR-148a-3p TCAGTGCACTACAGAACTTTGT >hsa-miR-4666a-5p ATACATGTCAGATTGTATGCC >hsa-miR-3669 ACGGAATATGTATACGGAATATA >hsa-miR-664-5p ACTGGCTAGGGAAAATGATTGGAT >hsa-miR-4507 CTGGGTTGGGCTGGGCTGGG >hsa-miR-660-3p ACCTCCTGTGTGCATGGATTA >hsa-miR-4684-3p TGTTGCAAGTCGGTGGAGACGT >hsa-miR-891a TGCAACGAACCTGAGCCACTGA >hsa-miR-92a-2-5p GGGTGGGGATTTGTTGCATTAC >hsa-miR-556-5p GATGAGCTCATTGTAATATGAG >hsa-miR-520b AAAGTGCTTCCTTTTAGAGGG >hsa-miR-1237 TCCTTCTGCTCCGTCCCCCAG >hsa-miR-211-5p TTCCCTTTGTCATCCTTCGCCT >hsa-miR-3155a CCAGGCTCTGCAGTGGGAACT >hsa-miR-892a CACTGTGTCCTTTCTGCGTAG >hsa-miR-4661-5p AACTAGCTCTGTGGATCCTGAC >hsa-miR-3692-3p GTTCCACACTGACACTGCAGAAGT >hsa-miR-196b-5p TAGGTAGTTTCCTGTTGTTGGG >hsa-miR-4486 GCTGGGCGAGGCTGGCA >hsa-miR-221-5p ACCTGGCATACAATGTAGATTT >hsa-miR-553 AAAACGGTGAGATTTTGTTTT >hsa-miR-490-5p CCATGGATCTCCAGGTGGGT >hsa-miR-1911-5p TGAGTACCGCCATGTCTGTTGGG >hsa-miR-1298 TTCATTCGGCTGTCCAGATGTA >hsa-miR-4442 GCCGGACAAGAGGGAGG >hsa-miR-516b-5p ATCTGGAGGTAAGAAGCACTTT >hsa-miR-3667-5p 
AAAGACCCATTGAGGAGAAGGT >hsa-miR-802 CAGTAACAAAGATTCATCCTTGT >hsa-miR-4655-5p CACCGGGGATGGCAGAGGGTCG >hsa-miR-365a-3p___hsa-miR-365b-3p TAATGCCCCTAAAAATCCTTAT >hsa-miR-507 TTTTGCACCTTTTGGAGTGAA >hsa-miR-1245b-3p TCAGATGATCTAAAGGCCTATA >hsa-miR-3618 TGTCTACATTAATGAAAAGAGC >hsa-miR-3162-3p TCCCTACCCCTCCACTCCCCA >hsa-miR-4284 GGGCTCACATCACCCCAT >hsa-miR-5197-5p CAATGGCACAAACTCATTCTTGA >hsa-miR-4762-3p CTTCTGATCAAGATTTGTGGTG >hsa-miR-4680-5p AGAACTCTTGCAGTCTTAGATGT >hsa-miR-369-3p AATAATACATGGTTGATCTTT >hsa-miR-27b-3p TTCACAGTGGCTAAGTTCTGC >hsa-miR-200a-3p TAACACTGTCTGGTAACGATGT >hsa-miR-519c-5p___hsa-miR-519b-5p___hsa-miR-523-5p___hsa-miR-518e-5p___hsa-miR-522-5p___hsa-miR-519a-5p CTCTAGAGGGAAGCGCTTTCTG >hsa-miR-361-5p TTATCAGAATCTCCAGGGGTAC >hsa-miR-3191-3p TGGGGACGTAGCTGGCCAGACAG >hsa-miR-493-3p TGAAGGTCTACTGTGTGCCAGG >hsa-miR-4519 CAGCAGTGCGCAGGGCTG >hsa-miR-3923 AACTAGTAATGTTGGATTAGGG >hsa-miR-2964a-5p AGATGTCCAGCCACAATTCTCG >hsa-miR-3622b-5p AGGCATGGGAGGTCAGGTGA >hsa-miR-5581-5p AGCCTTCCAGGAGAAATGGAGA >hsa-miR-4662b AAAGATGGACAATTGGCTAAAT >hsa-miR-4253 AGGGCATGTCCAGGGGGT >hsa-miR-4462 TGACACGGAGGGTGGCTTGGGAA >hsa-miR-4262 GACATTCAGACTACCTG >hsa-miR-5186 AGAGATTGGTAGAAATCAGGT >hsa-miR-1273f GGAGATGGAGGTTGCAGTG >hsa-miR-199a-5p CCCAGTGTTCAGACTACCTGTTC >hsa-miR-4300 TGGGAGCTGGACTACTTC >hsa-miR-4674 CTGGGCTCGGGACGCGCGGCT >hsa-miR-5088 CAGGGCTCAGGGATTGGATGGAG >hsa-miR-4293 CAGCCTGACAGGAACAG >hsa-miR-96-3p AATCATGTGCAGTGCCAATATG >hsa-miR-548e AAAAACTGAGACTACTTTTGCA >hsa-miR-4258 CCCCGCCACCGCCTTGG >hsa-miR-4533 TGGAAGGAGGTTGCCGGACGCT >hsa-miR-3145-3p AGATATTTTGAGTGTTTGGAATTG >hsa-miR-3160-5p GGCTTTCTAGTCTCAGCTCTCC >hsa-miR-4501 TATGTGACCTCGGATGAATCA >hsa-miR-581 TCTTGTGTTCTCTAGATCAGT >hsa-miR-5195-5p AACCCCTAAGGCAACTGGATGG >hsa-miR-642b-5p GGTTCCCTCTCCAAATGTGTCT >hsa-miR-4664-3p CTTCCGGTCTGTGAGCCCCGTC >hsa-miR-3178 GGGGCGCGGCCGGATCG >hsa-miR-29b-1-5p GCTGGTTTCATATGGTGGTTTAGA >hsa-miR-296-5p AGGGCCCCCCCTCAATCCTGT >hsa-miR-495 AAACAAACATGGTGCACTTCTT >hsa-miR-4483 
GGGGTGGTCTGTTGTTG >hsa-miR-24-3p TGGCTCAGTTCAGCAGGAACAG >hsa-miR-4747-3p AAGGCCCGGGCTTTCCTCCCAG >hsa-miR-4774-5p TCTGGTATGTAGTAGGTAATAA >hsa-miR-28-3p CACTAGATTGTGAGCTCCTGGA >hsa-miR-4520a-3p TTGGACAGAAAACACGCAGGAA >hsa-let-7c TGAGGTAGTAGGTTGTATGGTT >hsa-miR-4429 AAAAGCTGGGCTGAGAGGCG >hsa-miR-635 ACTTGGGCACTGAAACAATGTCC >hsa-miR-4498 TGGGCTGGCAGGGCAAGTGCTG >hsa-miR-671-3p TCCGGTTCTCAGGGCTCCACC >hsa-miR-487a AATCATACAGGGACATCCAGTT >hsa-miR-4999-5p TGCTGTATTGTCAGGTAGTGA >hsa-let-7a-5p TGAGGTAGTAGGTTGTATAGTT >hsa-miR-2861 GGGGCCTGGCGGTGGGCGG >hsa-miR-362-3p AACACACCTATTCAAGGATTCA >hsa-miR-3922-3p TCTGGCCTTGACTTGACTCTTT >hsa-miR-3620 TCACCCTGCATCCCGCACCCAG >hsa-miR-4703-3p TGTAGTTGTATTGTATTGCCAC >hsa-miR-3681-5p TAGTGGATGATGCACTCTGTGC >hsa-miR-3671 ATCAAATAAGGACTAGTCTGCA >hsa-miR-3143 ATAACATTGTAAAGCGCTTCTTTCG >hsa-miR-4761-5p ACAAGGTGTGCATGCCTGACC >hsa-miR-4252 GGCCACTGAGTCAGCACCA >hsa-miR-382-5p GAAGTTGTTCGTGGTGGATTCG >hsa-miR-4713-5p TTCTCCCACTACCAGGCTCCCA >hsa-miR-4523 GACCGAGAGGGCCTCGGCTGT >hsa-miR-4715-5p AAGTTGGCTGCAGTTAAGGTGG >hsa-miR-3675-3p CATCTCTAAGGAACTCCCCCAA >hsa-miR-1307-5p TCGACCGGACCTCGACCGGCT >hsa-miR-636 TGTGCTTGCTCGTCCCGCCCGCA >hsa-miR-3130-5p TACCCAGTCTCCGGTGCAGCC >hsa-miR-4648 TGTGGGACTGCAAATGGGAG >hsa-miR-18b-3p TGCCCTAAATGCCCCTTCTGGC >hsa-miR-4696 TGCAAGACGGATACTGTCATCT >hsa-miR-566 GGGCGCCTGTGATCCCAAC >hsa-miR-3978 GTGGAAAGCATGCATCCAGGGTGT >hsa-miR-616-5p ACTCAAAACCCTTCAGTGACTT >hsa-miR-5692c AATAATATCACAGTAGGTGTAC >hsa-miR-3177-3p TGCACGGCACTGGGGACACGT >hsa-miR-1193 GGGATGGTAGACCGGTGACGTGC >hsa-miR-214-3p ACAGCAGGCACAGACAGGCAGT >hsa-miR-630 AGTATTCTGTACCAGGGAAGGT >hsa-miR-1271-5p CTTGGCACCTAGCAAGCACTCA >hsa-miR-1912 TACCCAGAGCATGCAGTGTGAA >hsa-miR-5690 TCAGCTACTACCTCTATTAGG >hsa-miR-623 ATCCCTTGCAGGGGCTGTTGGGT >hsa-miR-3913-3p AGACATCAAGATCAGTCCCAAA >hsa-miR-1269a CTGGACTGAGCCGTGCTACTGG >hsa-miR-3159 TAGGATTACAAGTGTCGGCCAC >hsa-miR-3664-5p AACTCTGTCTTCACTCATGAGT >hsa-miR-3622a-5p CAGGCACGGGAGCTCAGGTGAG >hsa-miR-3914 AAGGAACCAGAAAATGAGAAGT 
>hsa-miR-3944-5p TGTGCAGCAGGCCAACCGAGA >hsa-miR-4756-3p CCAGAGATGGTTGCCTTCCTAT >hsa-miR-195-5p TAGCAGCACAGAAATATTGGC >hsa-miR-942 TCTTCTCTGTTTTGGCCATGTG >hsa-miR-4777-5p TTCTAGATGAGAGATATATATA >hsa-miR-935 CCAGTTACCGCTTCCGCTACCGC >hsa-miR-141-5p CATCTTCCAGTACAGTGTTGGA >hsa-miR-3675-5p TATGGGGCTTCTGTAGAGATTTC >hsa-miR-4802-5p TATGGAGGTTCTAGACCATGTT >hsa-miR-4435 ATGGCCAGAGCTCACACAGAGG >hsa-miR-654-3p TATGTCTGCTGACCATCACCTT >hsa-miR-376c AACATAGAGGAAATTCCACGT >hsa-miR-4273 GTGTTCTCTGATGGACAG >hsa-miR-4474-5p TTAGTCTCATGATCAGACACA >hsa-miR-602 GACACGGGCGACAGCTGCGGCCC >hsa-let-7d-3p CTATACGACCTGCTGCCTTTCT >hsa-miR-101-5p CAGTTATCACAGTGCTGATGCT >hsa-miR-3198 GTGGAGTCCTGGGGAATGGAGA >hsa-miR-4758-3p TGCCCCACCTGCTGACCACCCTC >hsa-miR-363-3p AATTGCACGGTATCCATCTGTA >hsa-miR-3943 TAGCCCCCAGGCTTCACTTGGCG >hsa-miR-124-5p CGTGTTCACAGCGGACCTTGAT >hsa-miR-3977 GTGCTTCATCGTAATTAACCTTA >hsa-miR-4670-3p TGAAGTTACATCATGGTCGCTT >hsa-miR-1273a GGGCGACAAAGCAAGACTCTTTCTT >hsa-miR-4730 CTGGCGGAGCCCATTCCATGCCA >hsa-miR-30c-5p TGTAAACATCCTACACTCTCAGC >hsa-miR-150-3p CTGGTACAGGCCTGGGGGACAG >hsa-miR-410 AATATAACACAGATGGCCTGT >hsa-miR-4444 CTCGAGTTGGAAGAGGCG >hsa-miR-3197 GGAGGCGCAGGCTCGGAAAGGCG >hsa-miR-20a-5p TAAAGTGCTTATAGTGCAGGTAG >hsa-miR-3169 TAGGACTGTGCTTGGCACATAG >hsa-miR-4690-3p GCAGCCCAGCTGAGGCCTCTG >hsa-miR-370 GCCTGCTGGGGTGGAACCTGGT >hsa-miR-4536-5p TGTGGTAGATATATGCACGAT >hsa-miR-1255a AGGATGAGCAAAGAAAGTAGATT >hsa-miR-708-5p AAGGAGCTTACAATCTAGCTGGG >hsa-miR-2355-3p ATTGTCCTTGCTGTTTGGAGAT >hsa-miR-539-3p ATCATACAAGGACAATTTCTTT >hsa-miR-4709-5p ACAACAGTGACTTGCTCTCCAA >hsa-miR-223-5p CGTGTATTTGACAAGCTGAGTT >hsa-miR-3975 TGAGGCTAATGCACTACTTCAC >hsa-miR-22-5p AGTTCTTCAGTGGCAAGCTTTA >hsa-miR-331-5p CTAGGTATGGTCCCAGGGATCC >hsa-miR-1249 ACGCCCTTCCCCCCCTTCTTCA >hsa-miR-579 TTCATTTGGTATAAACCGCGATT >hsa-miR-3157-3p CTGCCCTAGTCTAGCTGAAGCT >hsa-miR-19a-3p TGTGCAAATCTATGCAAAACTGA >hsa-miR-4318 CACTGTGGGTACATGCT >hsa-miR-1295b-3p AATAGGCCACGGATCTGGGCAA >hsa-miR-877-5p GTAGAGGAGATGGCGCAGGG 
>hsa-miR-526a___hsa-miR-520c-5p___hsa-miR-518d-5p CTCTAGAGGGAAGCACTTTCTG >hsa-miR-613 AGGAATGTTCCTTCTTTGCC >hsa-miR-1224-5p GTGAGGACTCGGGAGGTGG >hsa-miR-544b ACCTGAGGTTGTGCATTTCTAA >hsa-miR-4712-3p AATGAGAGACCTGTACTGTAT >hsa-miR-3117-3p ATAGGACTCATATAGTGCCAG >hsa-miR-5008-3p CCTGTGCTCCCAGGGCCTCGC >hsa-miR-4668-3p GAAAATCCTTTTTGTTTTTCCAG >hsa-miR-1205 TCTGCAGGGTTTGCTTTGAG >hsa-miR-5010-3p TTTTGTGTCTCCCATTCCCCAG >hsa-miR-323b-3p CCCAATACACGGTCGACCTCTT >hsa-miR-4270 TCAGGGAGTCAGGGGAGGGC >hsa-miR-4634 CGGCGCGACCGGCCCGGGG >hsa-miR-16-1-3p CCAGTATTAACTGTGCTGCTGA >hsa-miR-1538 CGGCCCGGGCTGCTGCTGTTCCT >hsa-miR-518c-5p TCTCTGGAGGGAAGCACTTTCTG >hsa-miR-412 ACTTCACCTGGTCCACTAGCCGT >hsa-miR-526b-3p GAAAGTGCTTCCTTTTAGAGGC >hsa-miR-4682 TCTGAGTTCCTGGAGCCTGGTCT >hsa-miR-5585-5p TGAAGTACCAGCTACTCGAGAG >hsa-miR-548ao-5p AGAAGTAACTACGGTTTTTGCA >hsa-miR-4474-3p TTGTGGCTGGTCATGAGGCTAA >hsa-miR-4502 GCTGATGATGATGGTGCTGAAG >hsa-miR-1915-5p ACCTTGCCTTGCTGCCCGGGCC >hsa-miR-3163 TATAAAATGAGGGCAGTAAGAC >hsa-miR-28-5p AAGGAGCTCACAGTCTATTGAG >hsa-miR-24-2-5p TGCCTACTGAGCTGAAACACAG >hsa-miR-920 GGGGAGCTGTGGAAGCAGTA >hsa-miR-4714-5p AACTCTGACCCCTTAGGTTGAT >hsa-miR-363-5p CGGGTGGATCACGATGCAATTT >hsa-miR-339-5p TCCCTGTCCTCCAGGAGCTCACG >hsa-miR-888-5p TACTCAAAAAGCTGTCAGTCA >hsa-miR-548t-5p CAAAAGTGATCGTGGTTTTTG >hsa-miR-203 GTGAAATGTTTAGGACCACTAG >hsa-miR-411-3p TATGTAACACGGTCCACTAACC >hsa-miR-98 TGAGGTAGTAAGTTGTATTGTT >hsa-miR-320d AAAAGCTGGGTTGAGAGGA >hsa-miR-601 TGGTCTAGGATTGTTGGAGGAG >hsa-miR-3164 TGTGACTTTAAGGGAAATGGCG >hsa-miR-548as-5p AAAAGTAATTGCGGGTTTTGCC >hsa-miR-645 TCTAGGCTGGTACTGCTGA >hsa-miR-4782-3p TGATTGTCTTCATATCTAGAAC >hsa-miR-302e TAAGTGCTTCCATGCTT >hsa-miR-2114-5p TAGTCCCTTCCTTGAAGCGGTC >hsa-miR-193b-3p AACTGGCCCTCAAAGTCCCGCT >hsa-miR-4329 CCTGAGACCCTAGTTCCAC >hsa-miR-4650-5p TCAGGCCTCTTTCTACCTT >hsa-miR-4732-3p GCCCTGACCTGTCCTGTTCTG >hsa-miR-1587 TTGGGCTGGGCTGGGTTGGG >hsa-miR-4803 TAACATAATAGTGTGGATTGA >hsa-miR-7-5p TGGAAGACTAGTGATTTTGTTGT >hsa-miR-4659b-5p TTGCCATGTCTAAGAAGAA 
>hsa-miR-3142 AAGGCCTTTCTGAACCTTCAGA >hsa-miR-3974 AAAGGTCATTGTAAGGTTAATGC >hsa-miR-2964a-3p AGAATTGCGTTTGGACAATCAGT >hsa-let-7i-3p CTGCGCAAGCTACTGCCTTGCT >hsa-miR-298 AGCAGAAGCAGGGAGGTTCTCCCA >hsa-miR-18b-5p TAAGGTGCATCTAGTGCAGTTAG >hsa-miR-563 AGGTTGACATACGTTTCCC >hsa-miR-4482-5p AACCCAGTGGGCTATGGAAATG >hsa-miR-4722-3p ACCTGCCAGCACCTCCCTGCAG >hsa-miR-140-5p CAGTGGTTTTACCCTATGGTAG >hsa-miR-5591-5p TGGGAGCTAAGCTATGGGTAT >hsa-miR-548k AAAAGTACTTGCGGATTTTGCT >hsa-miR-4470 TGGCAAACGTGGAAGCCGAGA >hsa-miR-4522 TGACTCTGCCTGTAGGCCGGT >hsa-miR-5193 TCCTCCTCTACCTCATCCCAGT >hsa-miR-195-3p CCAATATTGGCTGTGCTGCTCC >hsa-miR-541-3p TGGTGGGCACAGAATCTGGACT >hsa-miR-29a-5p ACTGATTTCTTTTGGTGTTCAG >hsa-miR-5003-3p TACTTTTCTAGGTTGTTGGGG >hsa-miR-3960 GGCGGCGGCGGAGGCGGGGG >hsa-miR-5187-3p ACTGAATCCTCTTTTCCTCAG >hsa-miR-425-5p AATGACACGATCACTCCCGTTGA >hsa-miR-4271 GGGGGAAGAAAAGGTGGGG >hsa-miR-301b CAGTGCAATGATATTGTCAAAGC >hsa-miR-1286 TGCAGGACCAAGATGAGCCCT >hsa-miR-4676-3p CACTGTTTCACCACTGGCTCTT >hsa-miR-3672 ATGAGACTCATGTAAAACATCTT >hsa-miR-4676-5p GAGCCAGTGGTGAGACAGTGA >hsa-miR-4479 CGCGCGGCCGTGCTCGGAGCAG >hsa-miR-4492 GGGGCTGGGCGCGCGCC >hsa-miR-3192 TCTGGGAGGTTGTAGCAGTGGAA >hsa-miR-142-5p CATAAAGTAGAAAGCACTACT >hsa-miR-548g-3p AAAACTGTAATTACTTTTGTAC >hsa-miR-4662a-5p TTAGCCAATTGTCCATCTTTAG >hsa-miR-4650-3p AGGTAGAATGAGGCCTGACAT >hsa-miR-4640-3p CACCCCCTGTTTCCTGGCCCAC >hsa-miR-488-5p CCCAGATAATGGCACTCTCAA >hsa-miR-5687 TTAGAACGTTTTAGGGTCAAAT >hsa-miR-450a-3p ATTGGGGACATTTTGCATTCAT >hsa-miR-3180-3p TGGGGCGGAGCTTCCGGAGGCC >hsa-miR-4423-5p AGTTGCCTTTTTGTTCCCATGC >hsa-miR-204-5p TTCCCTTTGTCATCCTATGCCT >hsa-miR-4654 TGTGGGATCTGGAGGCATCTGG >hsa-miR-422a ACTGGACTTAGGGTCAGAAGGC >hsa-miR-4746-3p AGCGGTGCTCCTGCGGGCCGA >hsa-miR-1263 ATGGTACCCTGGCATACTGAGT >hsa-miR-103a-2-5p AGCTTCTTTACAGTGCTGCCTTG >hsa-miR-95 TTCAACGGGTATTTATTGAGCA >hsa-miR-4672 TTACACAGCTGGACAGAGGCA >hsa-miR-1248 ACCTTCTTGTATAAGCACTGTGCTAAA >hsa-miR-1909-3p CGCAGGGGCCGGGTGCTCACCG >hsa-miR-5686 TATCGTATCGTATTGTATTGT >hsa-miR-4478 
GAGGCTGAGCTGAGGAG >hsa-miR-4423-3p ATAGGCACCAAAAAGCAACAA >hsa-miR-4267 TCCAGCTCGGTGGCAC >hsa-miR-3181 ATCGGGCCCTCGGCGCCGG >hsa-miR-3917 GCTCGGACTGAGCAGGTGGG >hsa-miR-4765 TGAGTGATTGATAGCTATGTTC >hsa-miR-502-3p AATGCACCTGGGCAAGGATTCA >hsa-miR-125b-5p TCCCTGAGACCCTAACTTGTGA >hsa-miR-1245b-5p TAGGCCTTTAGATCACTTAAA >hsa-miR-3682-3p TGATGATACAGGTGGAGGTAG >hsa-miR-23c ATCACATTGCCAGTGATTACCC >hsa-miR-3140-3p AGCTTTTGGGAATTCAGGTAGT >hsa-miR-4719 TCACAAATCTATAATATGCAGG >hsa-miR-499a-5p TTAAGACTTGCAGTGATGTTT >hsa-miR-4659b-3p TTTCTTCTTAGACATGGCAGCT >hsa-miR-3146 CATGCTAGGATAGAAAGAATGG >hsa-miR-513c-5p TTCTCAAGGAGGTGTCGTTTAT >hsa-miR-4742-3p TCTGTATTCTCCTTTGCCTGCAG >hsa-miR-5587-3p GCCCCGGGCAGTGTGATCATC >hsa-miR-362-5p AATCCTTGGAACCTAGGTGTGAGT >hsa-miR-4796-5p TGTCTATACTCTGTCACTTTAC >hsa-miR-31-5p AGGCAAGATGCTGGCATAGCT >hsa-miR-593-3p TGTCTCTGCTGGGGTTTCT >hsa-miR-548m CAAAGGTATTTGTGGTTTTTG >hsa-miR-4529-5p AGGCCATCAGCAGTCCAATGAA >hsa-miR-29c-3p TAGCACCATTTGAAATCGGTTA >hsa-miR-550a-3p TGTCTTACTCCCTCAGGCACAT >hsa-miR-126-3p TCGTACCGTGAGTAATAATGCG >hsa-miR-4432 AAAGACTCTGCAAGATGCCT >hsa-miR-578 CTTCTTGTGCTCTAGGATTGT >hsa-miR-4436b-3p CAGGGCAGGAAGAAGTGGACAA >hsa-miR-519e-3p AAGTGCCTCCTTTTAGAGTGTT >hsa-miR-4744 TCTAAAGACTAGACTTCGCTATG >hsa-miR-431-5p TGTCTTGCAGGCCGTCATGCA >hsa-miR-2682-3p CGCCTCTTCAGCGCTGTCTTCC >hsa-miR-5581-3p TTCCATGCCTCCTAGAAGTTCC >hsa-miR-26b-3p CCTGTTCTCCATTACTTGGCTC >hsa-miR-4473 CTAGTGCTCTCCGTTACAAGTA >hsa-miR-3940-3p CAGCCCGGATCCCAGCCCACTT >hsa-miR-1827 TGAGGCAGTAGATTGAAT >hsa-miR-548q GCTGGTGCAAAAGTAATGGCGG >hsa-miR-4532 CCCCGGGGAGCCCGGCG >hsa-miR-4524b-3p GAGACAGGTTCATGCTGCTA >hsa-miR-3614-5p CCACTTGGATCTGAAGGCTGCCC >hsa-miR-1469 CTCGGCGCGGGGCGCGGGCTCC >hsa-miR-3668 AATGTAGAGATTGATCAAAAT >hsa-miR-4778-5p AATTCTGTAAAGGAAGAAGAGG >hsa-miR-587 TTTCCATAGGTGATGAGTCAC >hsa-miR-373-3p GAAGTGCTTCGATTTTGGGGTGT >hsa-miR-224-5p CAAGTCACTAGTGGTTCCGTT >hsa-miR-499b-3p AACATCACTGCAAGTCTTAACA >hsa-miR-4268 GGCTCCTCCTCTCAGGATGTG >hsa-miR-1537 AAAACCGTCTAGTTACAGTTGT 
>hsa-miR-5685 ACAGCCCAGCAGTTATCACGGG >hsa-miR-3141 GAGGGCGGGTGGAGGAGGA >hsa-miR-3972 CTGCCAGCCCCGTTCCAGGGCA >hsa-miR-4653-5p TCTCTGAGCAAGGCTTAACACC >hsa-miR-19b-3p TGTGCAAATCCATGCAAAACTGA >hsa-miR-181d AACATTCATTGTTGTCGGTGGGT >hsa-miR-4740-5p AGGACTGATCCTCTCGGGCAGG >hsa-miR-3121-5p TCCTTTGCCTATTCTATTTAAG >hsa-miR-205-5p TCCTTCATTCCACCGGAGTCTG >hsa-miR-4255 CAGTGTTCAGAGATGGA >hsa-miR-1234 TCGGCCTGACCACCCACCCCAC >hsa-miR-30a-5p TGTAAACATCCTCGACTGGAAG >hsa-miR-4677-3p TCTGTGAGACCAAAGAACTACT >hsa-miR-15a-5p TAGCAGCACATAATGGTTTGTG >hsa-miR-1275 GTGGGGGAGAGGCTGTC >hsa-miR-3912 TAACGCATAATATGGACATGT >hsa-miR-4782-5p TTCTGGATATGAAGACAATCAA >hsa-miR-34c-3p AATCACTAACCACACGGCCAGG >hsa-miR-15b-5p TAGCAGCACATCATGGTTTACA >hsa-miR-4517 AAATATGATGAAACTCACAGCTGAG >hsa-miR-3135b GGCTGGAGCGAGTGCAGTGGTG >hsa-miR-3689d GGGAGGTGTGATCTCACACTCG >hsa-miR-4779 TAGGAGGGAATAGTAAAAGCAG >hsa-miR-660-5p TACCCATTGCATATCGGAGTTG >hsa-miR-501-5p AATCCTTTGTCCCTGGGTGAGA >hsa-miR-4493 AGAAGGCCTTTCCATCTCTGT >hsa-miR-4789-5p GTATACACCTGATATGTGTATG >hsa-miR-1231 GTGTCTGGGCGGACAGCTGC >hsa-miR-518c-3p CAAAGCGCTTCTCTTTAGAGTGT >hsa-miR-4653-3p TGGAGTTAAGGGTTGCTTGGAGA >hsa-miR-103a-3p AGCAGCATTGTACAGGGCTATGA >hsa-miR-17-5p CAAAGTGCTTACAGTGCAGGTAG >hsa-miR-215 ATGACCTATGAATTGACAGAC >hsa-miR-1203 CCCGGAGCCAGGATGCAGCTC >hsa-miR-5096 GTTTCACCATGTTGGTCAGGC >hsa-miR-5188 AATCGGACCCATTTAAACCGGAG >hsa-miR-3150b-3p TGAGGAGATCGTCGAGGTTGG >hsa-miR-3151 GGTGGGGCAATGGGATCAGGT >hsa-miR-873-3p GGAGACTGATGAGTTCCCGGGA >hsa-miR-548c-5p___hsa-miR-548o-5p___hsa-miR-548am-5p AAAAGTAATTGCGGTTTTTGCC >hsa-miR-1200 CTCCTGAGCCATTCTGAGCCTC >hsa-miR-3195 CGCGCCGGGCCCGGGTT >hsa-miR-593-5p AGGCACCAGCCAGGCATTGCTCAGC >hsa-miR-30b-3p CTGGGAGGTGGATGTTTACTTC >hsa-miR-1258 AGTTAGGATTAGGTCGTGGAA >hsa-miR-15b-3p CGAATCATTATTTGCTGCTCTA >hsa-miR-4766-3p ATAGCAATTGCTCTTTTGGAA >hsa-miR-885-5p TCCATTACACTACCCTGCCTCT >hsa-miR-192-3p CTGCCAATTCCATAGGTCACAG >hsa-miR-520g ACAAAGTGCTTCCCTTTAGAGTGT >hsa-miR-5092 AATCCACGCTGAGCTTGGCATC >hsa-miR-625-3p 
GACTATAGAACTTTCCCCCTCA >hsa-miR-5191 AGGATAGGAAGAATGAAGTGCT >hsa-miR-451b TAGCAAGAGAACCATTACCATT >hsa-miR-556-3p ATATTACCATTAGCTCATCTTT >hsa-miR-4310 GCAGCATTCATGTCCC >hsa-miR-296-3p GAGGGTTGGGTGGAGGCTCTCC >hsa-miR-5708 ATGAGCGACTGTGCCTGACC >hsa-miR-4639-5p TTGCTAAGTAGGCTGAGATTGA >hsa-miR-489 GTGACATCACATATACGGCAGC >hsa-miR-3150a-3p CTGGGGAGATCCTCGAGGTTGG >hsa-miR-93-3p ACTGCTGAGCTAGCACTTCCCG >hsa-miR-493-5p TTGTACATGGTAGGCTTTCATT >hsa-miR-573 CTGAAGTGATGTGTAACTGATCAG >hsa-miR-4703-5p TAGCAATACAGTACAAATATAGT >hsa-miR-4694-3p CAAATGGACAGGATAACACCT >hsa-miR-3170 CTGGGGTTCTGAGACAGACAGT >hsa-miR-1323 TCAAAACTGAGGGGCATTTTCT >hsa-miR-129-5p CTTTTTGCGGTCTGGGCTTGC >hsa-miR-548ar-3p TAAAACTGCAGTTATTTTTGC >hsa-miR-4251 CCTGAGAAAAGGGCCAA >hsa-miR-33b-3p CAGTGCCTCGGCAGTGCAGCCC >hsa-miR-497-5p CAGCAGCACACTGTGGTTTGT >hsa-miR-4297 TGCCTTCCTGTCTGTG >hsa-miR-3934 TCAGGTGTGGAAACTGAGGCAG >hsa-miR-1468 CTCCGTTTGCCTGTTTCGCTG >hsa-miR-1265 CAGGATGTGGTCAAGTGTTGTT >hsa-miR-146b-3p TGCCCTGTGGACTCAGTTCTGG >hsa-miR-5583-5p AAACTAATATACCCATATTCTG >hsa-miR-183-5p TATGGCACTGGTAGAATTCACT >hsa-miR-4699-3p AATTTACTCTGCAATCTTCTCC >hsa-miR-4309 CTGGAGTCTAGGATTCCA >hsa-miR-1243 AACTGGATCAATTATAGGAGTG >hsa-miR-571 TGAGTTGGCCATCTGAGTGAG >hsa-miR-487b AATCGTACAGGGTCATCCACTT >hsa-miR-3158-3p AAGGGCTTCCTCTCTGCAGGAC >hsa-miR-877-3p TCCTCTTCTCCCTCCTCCCAG >hsa-miR-3128 TCTGGCAAGTAAAAAACTCTCAT >hsa-miR-187-5p GGCTACAACACAGGACCCGGGC >hsa-miR-4735-3p AAAGGTGCTCAAATTAGACAT >hsa-miR-4463 GAGACTGGGGTGGGGCC >hsa-miR-27b-5p AGAGCTTAGCTGATTGGTGAAC >hsa-miR-125a-5p TCCCTGAGACCCTTTAACCTGTGA >hsa-miR-4665-3p CTCGGCCGCGGCGCGTAGCCCCCGCC >hsa-miR-4641 TGCCCATGCCATACTTTTGCCTCA >hsa-miR-5583-3p GAATATGGGTATATTAGTTTGG >hsa-miR-3136-3p TGGCCCAACCTATTCAGTTAGT >hsa-miR-584-5p TTATGGTTTGCCTGGGACTGAG >hsa-miR-4698 TCAAAATGTAGAGGAAGACCCCA >hsa-miR-374c-5p ATAATACAACCTGCTAAGTGCT >hsa-miR-3605-3p CCTCCGTGTTACCTGTCCTCTAG >hsa-miR-3621 CGCGGGTCGGGGTCTGCAGG >hsa-miR-555 AGGGTAAGCTGAACCTCTGAT >hsa-miR-1321 CAGGGAGGTGAATGTGAT 
>hsa-miR-1224-3p CCCCACCTCCTCTCTCCTCAG >hsa-miR-4513 AGACTGACGGCTGGAGGCCCAT >hsa-miR-4477a CTATTAAGGACATTTGTGATTC >hsa-miR-599 GTTGTGTCAGTTTATCAAAC >hsa-miR-3157-5p TTCAGCCAGGCTAGTGCAGTCT >hsa-miR-26a-5p TTCAAGTAATCCAGGATAGGCT >hsa-miR-132-5p ACCGTGGCTTTCGATTGTTACT >hsa-miR-1257 AGTGAATGATGGGTTCTGACC >hsa-miR-3615 TCTCTCGGCTCCTCGCGGCTC >hsa-miR-770-5p TCCAGTACCACGTGTCAGGGCCA >hsa-miR-34b-3p CAATCACTAACTCCACTGCCAT >hsa-miR-3663-5p GCTGGTCTGCGTGGTGCTCGG >hsa-miR-1185-1-3p ATATACAGGGGGAGACTCTTAT >hsa-miR-4747-5p AGGGAAGGAGGCTTGGTCTTAG >hsa-miR-302d-5p ACTTTAACATGGAGGCACTTGC >hsa-miR-187-3p TCGTGTCTTGTGTTGCAGCCGG >hsa-miR-583 CAAAGAGGAAGGTCCCATTAC >hsa-miR-191-5p CAACGGAATCCCAAAAGCAGCTG >hsa-miR-3136-5p CTGACTGAATAGGTAGGGTCATT >hsa-miR-1272 GATGATGATGGCAGCAAATTCTGAAA >hsa-miR-514a-3p ATTGACACTTCTGTGAGTAGA >hsa-miR-151a-5p TCGAGGAGCTCACAGTCTAGT >hsa-miR-517c-3p ATCGTGCATCCTTTTAGAGTGT >hsa-miR-548ad GAAAACGACAATGACTTTTGCA >hsa-miR-3612 AGGAGGCATCTTGAGAAATGGA >hsa-miR-4764-5p TGGATGTGGAAGGAGTTATCT >hsa-miR-4777-3p ATACCTCATCTAGAATGCTGTA >hsa-miR-3667-3p ACCTTCCTCTCCATGGGTCTTT >hsa-miR-132-3p TAACAGTCTACAGCCATGGTCG >hsa-miR-139-5p TCTACAGTGCACGTGTCTCCAG >hsa-miR-325 CCTAGTAGGTGTCCAGTAAGTGT >hsa-miR-4452 TTGAATTCTTGGCCTTAAGTGAT >hsa-miR-4717-3p ACACATGGGTGGCTGTGGCCT >hsa-miR-1225-3p TGAGCCCCTGTGCCGCCCCCAG >hsa-miR-1290 TGGATTTTTGGATCAGGGA >hsa-miR-517a-3p___hsa-miR-517b-3p ATCGTGCATCCCTTTAGAGTGT >hsa-miR-551a GCGACCCACTCTTGGTTTCCA >hsa-miR-3648 AGCCGCGGGGATCGCCGAGGG >hsa-miR-3156-3p CTCCCACTTCCAGATCTTTCT >hsa-miR-3945 AGGGCATAGGAGAGGGTTGATAT >hsa-miR-27a-3p TTCACAGTGGCTAAGTTCCGC >hsa-miR-486-5p TCCTGTACTGAGCTGCCCCGAG >hsa-miR-659-5p AGGACCTTCCCTGAACCAAGGA >hsa-miR-4697-5p AGGGGGCGCAGTCACTGACGTG >hsa-miR-92b-5p AGGGACGGGACGCGGTGCAGTG >hsa-miR-5007-5p TAGAGTCTGGCTGATATGGTTT >hsa-miR-4704-5p GACACTAGGCATGTGAGTGATT >hsa-miR-548h-3p___hsa-miR-548z CAAAAACCGCAATTACTTTTGCA >hsa-miR-4446-3p CAGGGCTGGCAGTGACATGGGT >hsa-miR-1914-3p GGAGGGGTCCCGCACTGGGAGG >hsa-miR-4705 
TCAATCACTTGGTAATTGCTGT >hsa-miR-218-5p TTGTGCTTGATCTAACCATGT >hsa-miR-384 ATTCCTAGAAATTGTTCATA >hsa-miR-4537 TGAGCCGAGCTGAGCTTAGCTG >hsa-miR-5582-3p TAAAACTTTAAGTGTGCCTAGG >hsa-miR-548g-5p___hsa-miR-548x-5p___hsa-miR-548aj-5p TGCAAAAGTAATTGCAGTTTTTG >hsa-miR-659-3p CTTGGTTCAGGGAGGGTCCCCA >hsa-miR-324-3p ACTGCCCCAGGTGCTGCTGG >hsa-miR-4724-3p GTACCTTCTGGTTCAGCTAGT >hsa-miR-4640-5p TGGGCCAGGGAGCAGCTGGTGGG >hsa-miR-378e ACTGGACTTGGAGTCAGGA >hsa-miR-323a-5p AGGTGGTCCGTGGCGCGTTCGC >hsa-miR-27a-5p AGGGCTTAGCTGCTTGTGAGCA >hsa-miR-4290 TGCCCTCCTTTCTTCCCTC >hsa-miR-3656 GGCGGGTGCGGGGGTGG >hsa-miR-4500 TGAGGTAGTAGTTTCTT >hsa-miR-519b-3p AAAGTGCATCCTTTTAGAGGTT >hsa-miR-4261 AGGAAACAGGGACCCA >hsa-miR-4711-5p TGCATCAGGCCAGAAGACATGAG >hsa-miR-302d-3p TAAGTGCTTCCATGTTTGAGTGT >hsa-miR-517-5p CCTCTAGATGGAAGCACTGTCT >hsa-miR-320a AAAAGCTGGGTTGAGAGGGCGA >hsa-miR-450b-5p TTTTGCAATATGTTCCTGAATA >hsa-miR-4725-5p AGACCCTGCAGCCTTCCCACC >hsa-miR-607 GTTCAAATCCAGATCTATAAC >hsa-miR-138-1-3p GCTACTTCACAACACCAGGGCC >hsa-miR-378d ACTGGACTTGGAGTCAGAAA >hsa-miR-505-3p CGTCAACACTTGCTGGTTTCCT >hsa-miR-378a-3p ACTGGACTTGGAGTCAGAAGG >hsa-miR-1291 TGGCCCTGACTGAAGACCAGCAGT >hsa-miR-30b-5p TGTAAACATCCTACACTCAGCT >hsa-miR-3160-3p AGAGCTGAGACTAGAAAGCCCA >hsa-miR-758 TTTGTGACCTGGTCCACTAACC >hsa-miR-3689a-5p___hsa-miR-3689b-5p___hsa-miR-3689e TGTGATATCATGGTTCCTGGGA >hsa-miR-302c-3p TAAGTGCTTCCATGTTTCAGTGG >hsa-miR-147b GTGTGCGGAAATGCTTCTGCTA >hsa-miR-1304-3p TCTCACTGTAGCCTCGAACCCC >hsa-miR-4734 GCTGCGGGCTGCGGTCAGGGCG >hsa-miR-513a-3p TAAATTTCACCTTTCTGAGAAGG >hsa-miR-4419b GAGGCTGAAGGAAGATGG >hsa-miR-513c-3p TAAATTTCACCTTTCTGAGAAGA >hsa-miR-486-3p CGGGGCAGCTCAGTACAGGAT >hsa-miR-127-3p TCGGATCCGTCTGAGCTTGGCT >hsa-miR-4282 TAAAATTTGCATCCAGGA >hsa-miR-449c-3p TTGCTAGTTGCACTCCTCTCTGT >hsa-miR-376b ATCATAGAGGAAAATCCATGTT >hsa-miR-551b-3p GCGACCCATACTTGGTTTCAG >hsa-miR-1267 CCTGTTGAAGTGTAATCCCCA >hsa-miR-23a-5p GGGGTTCCTGGGGATGGGATTT >hsa-miR-4433-5p CGTCCCACCCCCCACTCCTGT >hsa-miR-4504 TGTGACAATAGAGATGAACATG 
>hsa-miR-4540 TTAGTCCTGCCTGTAGGTTTA >hsa-miR-4421 ACCTGTCTGTGGAAAGGAGCTA >hsa-miR-4291 TTCAGCAGGAACAGCT >hsa-miR-2681-5p GTTTTACCACCTCCAGGAGACT >hsa-miR-2116-3p CCTCCCATGCCAAGAACTCCC >hsa-miR-106b-5p TAAAGTGCTGACAGTGCAGAT >hsa-miR-1307-3p ACTCGGCGTGGCGTCGGTCGTG >hsa-miR-3200-3p CACCTTGCGCTACTCAGGTCTG >hsa-miR-876-3p TGGTGGTTTACAAAGTAATTCA >hsa-miR-567 AGTATGTTCTTCCAGGACAGAAC >hsa-miR-4658 GTGAGTGTGGATCCTGGAGGAAT >hsa-miR-181a-3p ACCATCGACCGTTGATTGTACC >hsa-miR-3619-3p GGGACCATCCTGCCTGCTGTGG >hsa-miR-1260b ATCCCACCACTGCCACCAT >hsa-miR-3166 CGCAGACAATGCCTACTGGCCTA >hsa-miR-147a GTGTGTGGAAATGCTTCTGC >hsa-miR-3175 CGGGGAGAGAACGCAGTGACGT >hsa-miR-5196-3p TCATCCTCGTCTCCCTCCCAG >hsa-miR-4679 TCTGTGATAGAGATTCTTTGCT >hsa-miR-4784 TGAGGAGATGCTGGGACTGA >hsa-miR-210 CTGTGCGTGTGACAGCGGCTGA >hsa-miR-3679-3p CTTCCCCCCAGTAATCTTCATC >hsa-miR-326 CCTCTGGGCCCTTCCTCCAG >hsa-miR-200a-5p CATCTTACCGGACAGTGCTGGA >hsa-miR-3188 AGAGGCTTTGTGCGGATACGGGG >hsa-miR-663b GGTGGCCCGGCCGTGCCTGAGG >hsa-miR-4661-3p CAGGATCCACAGAGCTAGTCCA >hsa-miR-1285-5p GATCTCACTTTGTTGCCCAGG >hsa-miR-3591-5p TTTAGTGTGATAATGGCGTTTGA >hsa-miR-5586-5p TATCCAGCTTGTTACTATATGC >hsa-miR-3529-5p AGGTAGACTGGGATTTGTTGTT >hsa-miR-338-3p TCCAGCATCAGTGATTTTGTTG >hsa-miR-4524a-3p TGAGACAGGCTTATGCTGCTAT >hsa-miR-107 AGCAGCATTGTACAGGGCTATCA >hsa-miR-3201 GGGATATGAAGAAAAAT >hsa-miR-641 AAAGACATAGGATAGAGTCACCTC >hsa-miR-548ax AGAAGTAATTGCGGTTTTGCCA >hsa-miR-664-3p TATTCATTTATCCCCAGCCTACA >hsa-miR-5584-5p CAGGGAAATGGGAAGAACTAGA >hsa-miR-4791 TGGATATGATGACTGAAA >hsa-miR-4468 AGAGCAGAAGGATGAGAT >hsa-miR-4748 GAGGTTTGGGGAGGATTTGCT >hsa-miR-4790-3p TGAATGGTAAAGCGATGTCACA >hsa-miR-4467 TGGCGGCGGTAGTTATGGGCTT >hsa-miR-3134 TGATGGATAAAAGACTACATATT >hsa-miR-652-3p AATGGCGCCACTAGGGTTGTG >hsa-miR-92a-3p TATTGCACTTGTCCCGGCCTGT >hsa-miR-3619-5p TCAGCAGGCAGGCTGGTGCAGC >hsa-miR-577 TAGATAAAATATTGGTACCTG >hsa-miR-302a-5p ACTTAAACGTGGATGTACTTGCT >hsa-miR-4792 CGGTGAGCGCTCGCTGGC >hsa-miR-130a-5p TTCACATTGTGCTACTGTCTGC >hsa-miR-4743 
TGGCCGGATGGGACAGGAGGCAT >hsa-miR-3132 TGGGTAGAGAAGGAGCTCAGAGGA >hsa-miR-1825 TCCAGTGCCCTCCTCTCC >hsa-miR-221-3p AGCTACATTGTCTGCTGGGTTTC >hsa-miR-4788 TTACGGACCAGCTAAGGGAGGC >hsa-miR-1238 CTTCCTCGTCTGTCTGCCCC >hsa-miR-4793-3p TCTGCACTGTGAGTTGGCTGGCT >hsa-miR-1305 TTTTCAACTCTAATGGGAGAGA >hsa-miR-5580-5p TGCTGGCTCATTTCATATGTGT >hsa-miR-3126-3p CATCTGGCATCCGTCACACAGA >hsa-miR-496 TGAGTATTACATGGCCAATCTC >hsa-miR-665 ACCAGGAGGCTGAGGCCCCT >hsa-miR-3162-5p TTAGGGAGTAGAAGGGTGGGGAG >hsa-miR-105-3p ACGGATGTTTGAGCATGTGCTA >hsa-miR-4728-5p TGGGAGGGGAGAGGCAGCAAGCA >hsa-miR-1268b CGGGCGTGGTGGTGGGGGTG >hsa-miR-766-5p AGGAGGAATTGGTGCTGGTCTT >hsa-miR-3125 TAGAGGAAGCTGTGGAGAGA >hsa-miR-5003-5p TCACAACAACCTTGCAGGGTAGA >hsa-miR-5589-3p TGCACATGGCAACCTAGCTCCCA >hsa-miR-383 AGATCAGAAGGTGATTGTGGCT >hsa-miR-1207-3p TCAGCTGGCCCTCATTTC >hsa-miR-4668-5p AGGGAAAAAAAAAAGGATTTGTC >hsa-miR-592 TTGTGTCAATATGCGATGATGT >hsa-miR-3123 CAGAGAATTGTTTAATC >hsa-miR-4781-3p AATGTTGGAATCCTCGCTAGAG >hsa-miR-190b TGATATGTTTGATATTGGGTT >hsa-miR-1273g-5p GGTGGTTGAGGCTGCAGTAAGT >hsa-miR-409-3p GAATGTTGCTCGGTGAACCCCT >hsa-miR-512-3p AAGTGCTGTCATAGCTGAGGTC >hsa-miR-622 ACAGTCTGCTGAGGTTGGAGC >hsa-miR-5705 TGTTTCGGGGCTCATGGCCTGTG >hsa-miR-335-3p TTTTTCATTATTGCTCCTGACC >hsa-miR-548x-3p TAAAAACTGCAATTACTTTC >hsa-miR-4651 CGGGGTGGGTGAGGTCGGGC >hsa-miR-30a-3p CTTTCAGTCGGATGTTTGCAGC >hsa-miR-3714 GAAGGCAGCAGTGCTCCCCTGT >hsa-miR-365b-5p AGGGACTTTCAGGGGCAGCTGT >hsa-miR-101-3p TACAGTACTGTGATAACTGAA >hsa-miR-550b-3p TCTTACTCCCTCAGGCACTG >hsa-miR-548b-3p CAAGAACCTCAGTTGCTTTTGT >hsa-miR-5087 GGGTTTGTAGCTTTGCTGGCATG >hsa-miR-136-5p ACTCCATTTGTTTTGATGATGGA >hsa-miR-205-3p GATTTCAGTGGAGTGAAGTTC >hsa-miR-100-5p AACCCGTAGATCCGAACTTGTG >hsa-miR-5002-3p TGACTGCCTCACTGACCACTT >hsa-miR-4449 CGTCCCGGGGCTGCGCGAGGCA >hsa-miR-548at-3p CAAAACCGCAGTAACTTTTGT >hsa-miR-4798-3p AACTCACGAAGTATACCGAAGT >hsa-miR-4770 TGAGATGACACTGTAGCT >hsa-miR-3115 ATATGGGTTTACTAGTTGGT >hsa-miR-524-5p CTACAAAGGGAAGCACTTTCTC >hsa-miR-323b-5p 
AGGTTGTCCGTGGTGAGTTCGCA >hsa-miR-4768-3p CCAGGAGATCCAGAGAGAAT >hsa-miR-520a-5p CTCCAGAGGGAAGTACTTTCT >hsa-miR-4311 GAAAGAGAGCTGAGTGTG >hsa-miR-543 AAACATTCGCGGTGCACTTCTT >hsa-miR-5195-3p ATCCAGTTCTCTGAGGGGGCT >hsa-miR-31-3p TGCTATGCCAACATATTGCCAT >hsa-miR-5692b AATAATATCACAGTAGGTGT >hsa-miR-609 AGGGTGTTTCTCTCATCTCT >hsa-miR-202-5p TTCCTATGCATATACTTCTTTG >hsa-miR-4797-3p TCTCAGTAAGTGGCACTCTGT >hsa-miR-629-5p TGGGTTTACGTTGGGAGAACT >hsa-miR-940 AAGGCAGGGCCCCCGCTCCCC >hsa-miR-4528 TCATTATATGTATGATCTGGAC >hsa-miR-589-5p TGAGAACCACGTCTGCTCTGAG >hsa-miR-508-3p TGATTGTAGCCTTTTGGAGTAGA >hsa-miR-519e-5p TTCTCCAAAAGGGAGCACTTTC >hsa-miR-10b-5p TACCCTGTAGAACCGAATTTGTG >hsa-miR-181a-5p AACATTCAACGCTGTCGGTGAGT >hsa-miR-129-2-3p AAGCCCTTACCCCAAAAAGCAT >hsa-miR-1251 ACTCTAGCTGCCAAAGGCGCT >hsa-miR-130a-3p CAGTGCAATGTTAAAAGGGCAT >hsa-miR-548at-5p AAAAGTTATTGCGGTTTTGGCT >hsa-miR-103b TCATAGCCCTGTACAATGCTGCT >hsa-miR-185-5p TGGAGAGAAAGGCAGTTCCTGA >hsa-miR-4637 TACTAACTGCAGATTCAAGTGA >hsa-miR-3676-5p AGGAGATCCTGGGTT >hsa-miR-4716-5p TCCATGTTTCCTTCCCCCTTCT >hsa-miR-760 CGGCTCTGGGTCTGTGGGGA >hsa-miR-4525 GGGGGGATGTGCATGCTGGTT >hsa-miR-4708-5p AGAGATGCCGCCTTGCTCCTT >hsa-miR-5586-3p CAGAGTGACAAGCTGGTTAAAG >hsa-miR-4712-5p TCCAGTACAGGTCTCTCATTTC >hsa-miR-648 AAGTGTGCAGGGCACTGGT >hsa-miR-1226-3p TCACCAGCCCTGTGTTCCCTAG >hsa-miR-5000-5p CAGTTCAGAAGTGTTCCTGAGT >hsa-miR-1914-5p CCCTGTGCCCGGCCCACTTCTG >hsa-miR-374b-3p CTTAGCAGGTTGTATTATCATT >hsa-miR-4536-3p TCGTGCATATATCTACCACAT >hsa-miR-23b-3p ATCACATTGCCAGGGATTACC >hsa-miR-30c-2-3p CTGGGAGAAGGCTGTTTACTCT >hsa-miR-219-2-3p AGAATTGTGGCTGGACATCTGT >hsa-miR-4459 CCAGGAGGCGGAGGAGGTGGAG >hsa-miR-29a-3p TAGCACCATCTGAAATCGGTTA >hsa-let-7f-2-3p CTATACAGTCTACTGTCTTTCC >hsa-miR-1179 AAGCATTCTTTCATTGGTTGG >hsa-miR-217 TACTGCATCAGGAACTGATTGGA >hsa-miR-3158-5p CCTGCAGAGAGGAAGCCCTTC >hsa-miR-433 ATCATGATGGGCTCCTCGGTGT >hsa-miR-3127-3p TCCCCTTCTGCAGGCCTGCTGG >hsa-miR-762 GGGGCTGGGGCCGGGGCCGAGC >hsa-miR-550b-2-5p ATGTGCCTGAGGGAGTAAGACA >hsa-miR-576-3p 
AAGATGTGGAAAAATTGGAATC >hsa-miR-4797-5p GACAGAGTGCCACTTACTGAA >hsa-miR-1306-3p ACGTTGGCTCTGGTGGTG >hsa-miR-497-3p CAAACCACACTGTGGTGTTAGA >hsa-let-7i-5p TGAGGTAGTAGTTTGTGCTGTT >hsa-miR-432-3p CTGGATGGCTCCTCCATGTCT >hsa-miR-4280 GAGTGTAGTTCTGAGCAGAGC >hsa-miR-574-3p CACGCTCATGCACACACCCACA >hsa-miR-302b-3p TAAGTGCTTCCATGTTTTAGTAG >hsa-miR-586 TATGCATTGTATTTTTAGGTCC >hsa-miR-5681b AGGTATTGCCACCCTTTCTAGT >hsa-miR-548v AGCTACAGTTACTTTTGCACCA >hsa-miR-3131 TCGAGGACTGGTGGAAGGGCCTT >hsa-miR-452-3p CTCATCTGCAAAGAAGTAAGTG >hsa-miR-4647 GAAGATGGTGCTGTGCTGAGGAA >hsa-miR-1264 CAAGTCTTATTTGAGCACCTGTT >hsa-miR-544a ATTCTGCATTTTTAGCAAGTTC >hsa-miR-144-5p GGATATCATCATATACTGTAAG >hsa-miR-618 AAACTCTACTTGTCCTTCTGAGT >hsa-miR-3187-3p TTGGCCATGGGGCTGCGCGG >hsa-miR-4304 CCGGCATGTCCAGGGCA >hsa-miR-4508 GCGGGGCTGGGCGCGCG >hsa-miR-4316 GGTGAGGCTAGCTGGTG >hsa-miR-1181 CCGTCGCCGCCACCCGAGCCG >hsa-miR-5093 AGGAAATGAGGCTGGCTAGGAGC >hsa-miR-148a-5p AAAGTTCTGAGACACTCCGACT >hsa-miR-4633-3p AGGAGCTAGCCAGGCATATGCA >hsa-miR-1202 GTGCCAGCTGCAGTGGGGGAG >hsa-miR-3919 GCAGAGAACAAAGGACTCAGT >hsa-miR-5001-5p AGGGCTGGACTCAGCGGCGGAGCT >hsa-miR-2355-5p ATCCCCAGATACAATGGACAA >hsa-miR-1226-5p GTGAGGGCATGCAGGCCTGGATGGGG >hsa-miR-591 AGACCATGGGTTCTCATTGT >hsa-miR-4323 CAGCCCCACAGCCTCAGA >hsa-miR-4683 TGGAGATCCAGTGCTCGCCCGAT >hsa-miR-1233 TGAGCCCTGTCCTCCCGCAG >hsa-miR-3150a-5p CAACCTCGACGATCTCCTCAGC >hsa-miR-371a-3p AAGTGCCGCCATCTTTTGAGTGT >hsa-miR-516b-3p___hsa-miR-516a-3p TGCTTCCTTTCAGAGGGT >hsa-miR-1295b-5p CACCCAGATCTGCGGCCTAAT >hsa-miR-3939 TACGCGCAGACCACAGGATGTC >hsa-miR-3065-5p TCAACAAAATCACTGATGCTGGA >hsa-miR-518d-3p CAAAGCGCTTCCCTTTGGAGC >hsa-miR-4299 GCTGGTGACATGAGAGGC >hsa-miR-514b-3p ATTGACACCTCTGTGAGTGGA >hsa-miR-4800-5p AGTGGACCGAGGAAGGAAGGA >hsa-miR-521 AACGCACTTCCCTTTAGAGTGT >hsa-miR-4701-5p TTGGCCACCACACCTACCCCTT >hsa-miR-520h ACAAAGTGCTTCCCTTTAGAGT >hsa-let-7g-5p TGAGGTAGTAGTTTGTACAGTT >hsa-miR-642a-3p AGACACATTTGGAGAGGGAACC >hsa-miR-628-3p TCTAGTAAGAGTGGCAGTCGA >hsa-miR-92b-3p 
TATTGCACTCGTCCCGGCCTCC >hsa-miR-4652-3p GTTCTGTTAACCCATCCCCTCA >hsa-miR-367-3p AATTGCACTTTAGCAATGGTGA >hsa-miR-4441 ACAGGGAGGAGATTGTA >hsa-miR-200c-3p TAATACTGCCGGGTAATGATGGA >hsa-miR-4638-3p CCTGGACACCGCTCAGCCGGCCG >hsa-miR-3713 GGTATCCGTTTGGGGATGGT >hsa-miR-151b TCGAGGAGCTCACAGTCT >hsa-miR-515-3p GAGTGCCTTCTTTTGGAGCGTT >hsa-let-7b-3p CTATACAACCTACTGCCTTCCC >hsa-miR-4472 GGTGGGGGGTGTTGTTTT >hsa-miR-642a-5p GTCCCTCTCCAAATGTGTCTTG >hsa-miR-617 AGACTTCCCATTTGAAGGTGGC >hsa-miR-1254 AGCCTGGAAGCTGGAGCCTGCAGT >hsa-miR-4643 GACACATGACCATAAATGCTAA >hsa-miR-5702 TGAGTCAGCAACATATCCCATG >hsa-miR-3156-5p AAAGATCTGGAAGTGGGAGACA >hsa-miR-4756-5p CAGGGAGGCGCTCACTCTCTGCT >hsa-miR-4740-3p GCCCGAGAGGATCCGTCCCTGC >hsa-miR-548c-3p CAAAAATCTCAATTACTTTTGC >hsa-miR-943 CTGACTGTTGCCGTCCTCCAG >hsa-miR-1913 TCTGCCCCCTCCGCTGCTGCCA >hsa-miR-503 TAGCAGCGGGAACAGTTCTGCAG >hsa-miR-4274 CAGCAGTCCCTCCCCCTG >hsa-miR-371b-5p ACTCAAAAGATGGCGGCACTTT >hsa-miR-4685-5p CCCAGGGCTTGGAGTGGGGCAAGGTT >hsa-miR-3191-5p CTCTCTGGCCGTCTACCTTCCA >hsa-miR-139-3p GGAGACGCGGCCCTGTTGGAGT >hsa-miR-4680-3p TCTGAATTGTAAGAGTTGTTA >hsa-miR-4723-3p CCCTCTCTGGCTCCTCCCCAAA >hsa-miR-4484 AAAAGGCGGGAGAAGCCCCA >hsa-miR-3915 TTGAGGAAAAGATGGTCTTATT >hsa-miR-3127-5p ATCAGGGCTTGTGGAATGGGAAG >hsa-miR-3148 TGGAAAAAACTGGTGTGTGCTT >hsa-miR-4480 AGCCAAGTGGAAGTTACTTTA >hsa-miR-4725-3p TGGGGAAGGCGTCAGTGTCGGG >hsa-miR-100-3p CAAGCTTGTATCTATAGGTATG >hsa-miR-5000-3p TCAGGACACTTCTGAACTTGGA >hsa-miR-5704 TTAGGCCATCATCCCATTATGC >hsa-miR-2467-5p TGAGGCTCTGTTAGCCTTGGCTC >hsa-miR-4276 CTCAGTGACTCATGTGC >hsa-miR-3680-3p TTTTGCATGACCCTGGGAGTAGG >hsa-miR-4722-5p GGCAGGAGGGCTGTGCCAGGTTG >hsa-miR-548d-3p CAAAAACCACAGTTTCTTTTGC >hsa-miR-4695-5p CAGGAGGCAGTGGGCGAGCAGG >hsa-miR-4802-3p TACATGGATGGAAACCTTCAAGC >hsa-miR-4287 TCTCCCTTGAGGGCACTTT >hsa-miR-4690-5p GAGCAGGCGAGGCTGGGCTGAA >hsa-miR-3129-5p GCAGTAGTGTAGAGATTGGTTT >hsa-miR-671-5p AGGAAGCCCTGGAGGGGCTGGAG >hsa-miR-548f AAAAACTGTAATTACTTTT >hsa-miR-520c-3p AAAGTGCTTCCTTTTAGAGGGT >hsa-miR-532-5p 
CATGCCTTGAGTGTAGGACCGT >hsa-miR-3116 TGCCTGGAACATAGTAGGGACT >hsa-miR-4524b-5p ATAGCAGCATAAGCCTGTCTC >hsa-miR-885-3p AGGCAGCGGGGTGTAGTGGATA >hsa-miR-337-5p GAACGGCTTCATACAGGAGTT >hsa-miR-548h-5p AAAAGTAATCGCGGTTTTTGTC >hsa-miR-376a-5p GTAGATTCTCCTTCTATGAGTA >hsa-miR-4443 TTGGAGGCGTGGGTTTT >hsa-miR-4659a-3p TTTCTTCTTAGACATGGCAACG >hsa-miR-197-5p CGGGTAGAGAGGGCAGTGGGAGG >hsa-miR-3936 TAAGGGGTGTATGGCAGATGCA >hsa-miR-4275 CCAATTACCACTTCTTT >hsa-miR-3922-5p TCAAGGCCAGAGGTCCCACAGCA >hsa-miR-4477b ATTAAGGACATTTGTGATTGAT >hsa-miR-4539 GCTGAACTGGGCTGAGCTGGGC >hsa-miR-124-3p TAAGGCACGCGGTGAATGCC >hsa-miR-3186-5p CAGGCGTCTGTCTACGTGGCTT >hsa-miR-4422 AAAAGCATCAGGAAGTACCCA >hsa-miR-455-5p TATGTGCCTTTGGACTACATCG >hsa-miR-4418 CACTGCAGGACTCAGCAG >hsa-miR-3064-5p TCTGGCTGTTGTGGTGTGCAA >hsa-miR-216a TAATCTCAGCTGGCAACTGTGA >hsa-miR-4799-5p ATCTAAATGCAGCATGCCAGTC >hsa-miR-30e-3p CTTTCAGTCGGATGTTTACAGC >hsa-miR-5590-5p TTGCCATACATAGACTTTATT >hsa-miR-4417 GGTGGGCTTCCCGGAGGG >hsa-miR-2110 TTGGGGAAACGGCCGCTGAGTG >hsa-miR-196a-3p CGGCAACAAGAAACTGCCTGAG >hsa-miR-584-3p TCAGTTCCAGGCCAACCAGGCT >hsa-miR-4652-5p AGGGGACTGGTTAATAGAACTA >hsa-miR-5579-5p TATGGTACTCCTTAAGCTAAC >hsa-miR-548ap-5p AAAAGTAATTGCGGTCTTT >hsa-miR-4801 TACACAAGAAAACCAAGGCTCA >hsa-miR-449c-5p TAGGCAGTGTATTGCTAGCGGCTGT >hsa-miR-137 TTATTGCTTAAGAATACGCGTAG >hsa-miR-548av-3p AAAACTGCAGTTACTTTTGC >hsa-miR-570-3p CGAAAACAGCAATTACCTTTGC >hsa-miR-924 AGAGTCTTGTGATGTCTTGC >hsa-miR-5194 TGAGGGGTTTGGAATGGGATGG >hsa-miR-3144-3p ATATACCTGTTCGGTCTCTTTA >hsa-miR-4531 ATGGAGAAGGCTTCTGA >hsa-miR-143-5p GGTGCAGTGCTGCATCTCTGGT >hsa-miR-4646-3p ATTGTCCCTCTCCCTTCCCAG >hsa-miR-4686 TATCTGCTGGGCTTTCTGGTGTT >hsa-miR-4476 CAGGAAGGATTTAGGGACAGGC >hsa-miR-5589-5p GGCTGGGTGCTCTTGTGCAGT >hsa-miR-1261 ATGGATAAGGCTTTGGCTT >hsa-miR-181b-5p AACATTCATTGCTGTCGGTGGGT >hsa-miR-5189 TCTGGGCACAGGCGGATGGACAGG >hsa-miR-4278 CTAGGGGGTTTGCCCTTG >hsa-miR-3182 GCTTCTGTAGTGTAGTC >hsa-miR-651 TTTAGGATAAGCTTGACTTTTG >hsa-miR-4673 TCCAGGCAGGAGCCGGACTGGA >hsa-miR-200b-3p 
TAATACTGCCTGGTAATGATGA >hsa-miR-548ar-5p AAAAGTAATTGCAGTTTTTGC >hsa-miR-548w AAAAGTAACTGCGGTTTTTGCCT >hsa-miR-4684-5p CTCTCTACTGACTTGCAACATA >hsa-miR-5691 TTGCTCTGAGCTCCGAGAAAGC >hsa-miR-933 TGTGCGCAGGGAGACCTCTCCC >hsa-miR-518a-3p GAAAGCGCTTCCCTTTGCTGGA >hsa-miR-4437 TGGGCTCAGGGTACAAAGGTT >hsa-miR-1255b-2-3p AACCACTTTCTTTGCTCATCCA >hsa-miR-2053 GTGTTAATTAAACCTCTATTTAC >hsa-miR-4714-3p CCAACCTAGGTGGTCAGAGTTG >hsa-miR-4425 TGTTGGGATTCAGCAGGACCAT >hsa-miR-720 TCTCGCTGGGGCCTCCA >hsa-miR-4257 CCAGAGGTGGGGACTGAG >hsa-miR-33a-5p GTGCATTGTAGTTGCATTGCA >hsa-miR-4795-3p ATATTATTAGCCACTTCTGGAT >hsa-miR-4446-5p ATTTCCCTGCCATTCCCTTGGC >hsa-miR-554 GCTAGTCCTGACTCAGCCAGT >hsa-miR-3666 CAGTGCAAGTGTAGATGCCGA >hsa-miR-4456 CCTGGTGGCTTCCTTTT >hsa-miR-4436b-5p GTCCACTTCTGCCTGCCCTGCC >hsa-miR-10a-3p CAAATTCGTATCTAGGGGAATA >hsa-miR-936 ACAGTAGAGGGAGGAATCGCAG >hsa-miR-1244 AAGTAGTTGGTTTGTATGAGATGGTT >hsa-miR-621 GGCTAGCAACAGCGCTTACCT >hsa-miR-153 TTGCATAGTCACAAAAGTGATC >hsa-miR-3683 TGCGACATTGGAAGTAGTATCA >hsa-miR-650 AGGAGGCAGCGCTCTCAGGAC >hsa-miR-320c AAAAGCTGGGTTGAGAGGGT >hsa-miR-4767 CGCGGGCGCTCCTGGCCGCCGCC >hsa-miR-331-3p GCCCCTGGGCCTATCCTAGAA >hsa-miR-3940-5p GTGGGTTGGGGCGGGCTCTG >hsa-miR-338-5p AACAATATCCTGGTGCTGAGTG >hsa-miR-106b-3p CCGCACTGTGGGTACTTGCTGC >hsa-miR-525-5p CTCCAGAGGGATGCACTTTCT >hsa-miR-3690 ACCTGGACCCAGCGTAGACAAAG >hsa-miR-3929 GAGGCTGATGTGAGTAGACCACT >hsa-miR-944 AAATTATTGTACATCGGATGAG >hsa-miR-5572 GTTGGGGTGCAGGGGTCTGCT >hsa-miR-150-5p TCTCCCAACCCTTGTACCAGTG >hsa-miR-3924 ATATGTATATGTGACTGCTACT >hsa-miR-208a ATAAGACGAGCAAAAAGCTTGT >hsa-miR-3665 AGCAGGTGCGGGGCGGCG >hsa-miR-5047 TTGCAGCTGCGGTTGTAAGGT >hsa-miR-4307 AATGTTTTTTCCTGTTTCC >hsa-miR-4303 TTCTGAGCTGAGGACAG >hsa-miR-5591-3p ATACCCATAGCTTAGCTCCCA >hsa-miR-1301 TTGCAGCTGCCTGGGAGTGACTTC >hsa-miR-4671-3p TTAGTGCATAGTCTTTGGTCT >hsa-miR-219-1-3p AGAGTTGAGTCTGGACGTCCCG >hsa-miR-3677-3p CTCGTGGGCTCTGGCCACGGCC >hsa-miR-1908 CGGCGGGGACGGCGATTGGTC >hsa-miR-875-3p CCTGGAAACACTGAGGTTGTG >hsa-miR-548am-3p 
CAAAAACTGCAGTTACTTTTGT >hsa-miR-532-3p CCTCCCACACCCAAGGCTTGCA >hsa-miR-26b-5p TTCAAGTAATTCAGGATAGGT >hsa-miR-3937 ACAGGCGGCTGTAGCAATGGGGG >hsa-miR-299-3p TATGTGGGATGGTAAACCGCTT >hsa-miR-548ah-3p CAAAAACTGCAGTTACTTTTGC >hsa-miR-598 TACGTCATCGTTGTCATCGTCA >hsa-miR-644a AGTGTGGCTTTCTTAGAGC >hsa-miR-29b-3p TAGCACCATTTGAAATCAGTGTT >hsa-miR-520f AAGTGCTTCCTTTTAGAGGGTT >hsa-miR-4288 TTGTCTGCTGAGTTTCC >hsa-miR-4772-3p CCTGCAACTTTGCCTGATCAGA >hsa-miR-1282 TCGTTTGCCTTTTTCTGCTT >hsa-miR-624-5p TAGTACCAGTACCTTGTGTTCA >hsa-miR-454-5p ACCCTATCAATATTGTCTCTGC >hsa-miR-15a-3p CAGGCCATATTGTGCTGCCTCA >hsa-miR-222-5p CTCAGTAGCCAGTGTAGATCCT >hsa-miR-501-3p AATGCACCCGGGCAAGGATTCT >hsa-miR-3678-5p TCCGTACAAACTCTGCTGTG >hsa-miR-509-5p TACTGCAGACAGTGGCAATCA >hsa-miR-4783-5p GGCGCGCCCAGCTCCCGGGCT >hsa-miR-4469 GCTCCCTCTAGGGTCGCTCGGA >hsa-miR-193a-3p AACTGGCCTACAAAGTCCCAGT >hsa-miR-3691-3p ACCAAGTCTGCGTCATCCTCTC >hsa-miR-1304-5p TTTGAGGCTACAGTGAGATGTG >hsa-miR-1299 TTCTGGAATTCTGTGTGAGGGA >hsa-miR-4713-3p TGGGATCCAGACAGTGGGAGAA >hsa-miR-3941 TTACACACAACTGAGGATCATA >hsa-miR-508-5p TACTCCAGAGGGCGTCACTCATG >hsa-miR-3149 TTTGTATGGATATGTGTGTGTAT >hsa-miR-4736 AGGCAGGTTATCTGGGCTG >hsa-miR-4461 GATTGAGACTAGTAGGGCTAGGC >hsa-miR-4665-5p CTGGGGGACGCGTGAGCGCGAGC >hsa-miR-490-3p CAACCTGGAGGACTCCATGCTG >hsa-miR-136-3p CATCATCGTCTCAAATGAGTCT >hsa-miR-214-5p TGCCTGTCTACACTTGCTGTGC >hsa-miR-196a-5p TAGGTAGTTTCATGTTGTTGGG >hsa-miR-4781-5p TAGCGGGGATTCCAATATTGG >hsa-miR-4706 AGCGGGGAGGAAGTGGGCGCTGCTT >hsa-miR-575 GAGCCAGTTGGACAGGAGC >hsa-miR-941 CACCCGGCTGTGTGCACATGTGC >hsa-miR-937 ATCCGCGCTCTGACTCTCTGCC >hsa-miR-889 TTAATATCGGACAACCATTGT >hsa-miR-127-5p CTGAAGCTCAGAGGGCTCTGAT >hsa-miR-657 GGCAGGTTCTCACCCTCTCTAGG >hsa-miR-4687-3p TGGCTGTTGGAGGGGGCAGGC >hsa-miR-3664-3p TCTCAGGAGTAAAGACAGAGTT >hsa-miR-1471 GCCCGCGTGTGGAGCCAGGTGT >hsa-miR-605 TAAATCCCATGGTGCCTTCTCCT >hsa-miR-200c-5p CGTCTTACCCAGCAGTGTTTGG >hsa-miR-4657 AATGTGGAAGTGGTCTGAGGCAT >hsa-miR-5001-3p TTCTGCCTCTGTCCAGGTCCTT >hsa-miR-424-3p 
CAAAACGTGAGGCGCTGCTAT >hsa-miR-380-3p TATGTAATATGGTCCACATCTT >hsa-miR-3649 AGGGACCTGAGTGTCTAAG >hsa-miR-548l AAAAGTATTTGCGGGTTTTGTC >hsa-miR-509-3-5p TACTGCAGACGTGGCAATCATG >hsa-miR-3928 GGAGGAACCTTGGAGCTTCGGC >hsa-miR-3622a-3p TCACCTGACCTCCCATGCCTGT >hsa-miR-4306 TGGAGAGAAAGGCAGTA >hsa-miR-1182 GAGGGTCTTGGGAGGGATGTGAC >hsa-miR-597 TGTGTCACTCGATGACCACTGT >hsa-miR-1284 TCTATACAGACCCTGGCTTTTC >hsa-miR-4272 CATTCAACTAGTGATTGT >hsa-miR-5698 TGGGGGAGTGCAGTGATTGTGG >hsa-miR-4688 TAGGGGCAGCAGAGGACCTGGG >hsa-miR-1229 CTCTCACCACTGCCCTCCCACAG >hsa-miR-1207-5p TGGCAGGGAGGCTGGGAGGGG >hsa-miR-3611 TTGTGAAGAAAGAAATTCTTA >hsa-miR-4804-3p TGCTTAACCTTGCCCTCGAAA >hsa-let-7b-5p TGAGGTAGTAGGTTGTGTGGTT >hsa-miR-1279 TCATATTGCTTCTTTCT >hsa-miR-1180 TTTCCGGCTCGCGTGGGTGTGT >hsa-miR-548ah-5p AAAAGTGATTGCAGTGTTTG >hsa-miR-4521 GCTAAGGAAGTCCTGTGCTCAG >hsa-miR-122-3p AACGCCATTATCACACTAAATA >hsa-miR-186-3p GCCCAAAGGTGAATTTTTTGGG >hsa-miR-371a-5p ACTCAAACTGTGGGGGCACT >hsa-miR-4786-3p TGAAGCCAGCTCTGGTCTGGGC >hsa-miR-1268a CGGGCGTGGTGGTGGGGG >hsa-miR-676-3p CTGTCCTAAGGTTGTTGAGTT >hsa-miR-3676-3p CCGTGTTTCCCCCACGCTTT >hsa-miR-4738-3p TGAAACTGGAGCGCCTGGAGGA >hsa-miR-4256 ATCTGACCTGATGAAGGT >hsa-miR-4655-3p ACCCTCGTCAGGTCCCCGGGG >hsa-miR-9-3p ATAAAGCTAGATAACCGAAAGT >hsa-miR-513b TTCACAAGGAGGTGTCATTTAT >hsa-miR-596 AAGCCTGCCCGGCTCCTCGGG >hsa-miR-3606 TTAGTGAAGGCTATTTTAATT >hsa-miR-194-3p CCAGTGGGGCTGCTGTTATCTG >hsa-miR-4419a TGAGGGAGGAGACTGCA >hsa-miR-548au-5p AAAAGTAATTGCGGTTTTTGC >hsa-miR-3926 TGGCCAAAAAGCAGGCAGAGA >hsa-miR-499a-3p AACATCACAGCAAGTCTGTGCT >hsa-miR-5579-3p TTAGCTTAAGGAGTACCAGATC >hsa-miR-449b-3p CAGCCACAACTACCCTGCCACT >hsa-miR-3199 AGGGACTGCCTTAGGAGAAAGTT >hsa-miR-4745-5p TGAGTGGGGCTCCCGGGACGGCG >hsa-miR-3935 TGTAGATACGAGCACCAGCCAC >hsa-miR-4769-5p GGTGGGATGGAGAGAAGGTATGAG >hsa-miR-4727-5p ATCTGCCAGCTTCCACAGTGG >hsa-miR-365a-5p AGGGACTTTTGGGGGCAGATGTG >hsa-miR-449a TGGCAGTGTATTGTTAGCTGGT >hsa-miR-1910 CCAGTCCTGTGCCTGCCGCCT >hsa-miR-3609 CAAAGTGATGAGTAATACTGGCTG >hsa-miR-744-5p 
TGCGGGGCTAGGGCTAACAGCA >hsa-miR-4445-5p AGATTGTTTCTTTTGCCGTGCA >hsa-miR-4454 GGATCCGAGTCACGGCACCA >hsa-miR-18a-3p ACTGCCCTAAGTGCTCCTTCTGG >hsa-miR-3591-3p AAACACCATTGTCACACTCCAC >hsa-miR-1911-3p CACCAGGCATTGTGGTCTCC >hsa-miR-4798-5p TTCGGTATACTTTGTGAATTGG >hsa-miR-374c-3p CACTTAGCAGGTTGTATTATAT >hsa-miR-654-5p TGGTGGGCCGCAGAACATGTGC >hsa-miR-342-3p TCTCACACAGAAATCGCACCCGT >hsa-miR-323a-3p CACATTACACGGTCGACCTCT >hsa-miR-548o-3p CCAAAACTGCAGTTACTTTTGC >hsa-miR-125b-1-3p ACGGGTTAGGCTCTTGGGAGCT >hsa-miR-4717-5p TAGGCCACAGCCACCCATGTGT >hsa-miR-1289 TGGAGTCCAGGAATCTGCATTTT >hsa-miR-184 TGGACGGAGAACTGATAAGGGT >hsa-miR-7-2-3p CAACAAATCCCAGTCTACCTAA >hsa-miR-545-5p TCAGTAAATGTTTATTAGATGA >hsa-miR-4757-3p CATGACGTCACAGAGGCTTCGC >hsa-miR-3910 AAAGGCATAAAACCAAGACA >hsa-miR-4639-3p TCACTCTCACCTTGCTTTGC >hsa-miR-4516 GGGAGAAGGGTCGGGGC >hsa-miR-4731-5p TGCTGGGGGCCACATGAGTGTG >hsa-miR-4294 GGGAGTCTACAGCAGGG >hsa-miR-514b-5p TTCTCAAGAGGGAGGCAATCAT >hsa-miR-423-3p AGCTCGGTCTGAGGCCCCTCAGT >hsa-miR-5010-5p AGGGGGATGGCAGAGCAAAATT >hsa-miR-1909-5p TGAGTGCCGGTGCCTGCCCTG >hsa-miR-4664-5p TGGGGTGCCCACTCCGCAAGTT >hsa-miR-5011-5p TATATATACAGCCATGCACTC >hsa-miR-934 TGTCTACTACTGGAGACACTGG >hsa-miR-138-2-3p GCTATTTCACGACACCAGGGTT >hsa-miR-3120-5p CCTGTCTGTGCCTGCTGTACA >hsa-miR-378f ACTGGACTTGGAGCCAGAAG >hsa-miR-369-5p AGATCGACCGTGTTATATTCGC >hsa-miR-505-5p GGGAGCCAGGAAGTATTGATGT >hsa-miR-873-5p GCAGGAACTTGTGAGTCTCCT >hsa-miR-938 TGCCCTTAAAGGTGAACCCAGT >hsa-miR-629-3p GTTCTCCCAACGTAAGCCCAGC >hsa-miR-196b-3p TCGACAGCACGACACTGCCTTC >hsa-miR-638 AGGGATCGCGGGCGGGTGGCGGCCT >hsa-miR-144-3p TACAGTATAGATGATGTACT >hsa-miR-4635 TCTTGAAGTCAGAACCCGCAA >hsa-miR-299-5p TGGTTTACCGTCCCACATACAT >hsa-miR-4790-5p ATCGCTTTACCATTCATGTT >hsa-miR-4687-5p CAGCCCTCCTCCCGCACCCAAA >hsa-miR-181a-2-3p ACCACTGACCGTTGACTGTACC >hsa-miR-4733-3p CCACCAGGTCTAGCATTGGGAT >hsa-miR-4426 GAAGATGGACGTACTTT >hsa-miR-3657 TGTGTCCCATTATTGGTGATT >hsa-miR-3921 TCTCTGAGTACCATATGCCTTGT >hsa-miR-193a-5p TGGGTCTTTGCGGGCGAGATGA 
>hsa-miR-125b-2-3p TCACAAGTCAGGCTCTTGGGAC >hsa-miR-548d-5p AAAAGTAATTGTGGTTTTTGCC >hsa-miR-375 TTTGTTCGTTCGGCTCGCGTGA >hsa-miR-4660 TGCAGCTCTGGTGGAAAATGGAG >hsa-miR-3145-5p AACTCCAAACACTCAAAACTCA >hsa-miR-675-3p CTGTATGCCCTCACCGCTCA >hsa-miR-5706 TTCTGGATAACATGCTGAAGCT >hsa-miR-297 ATGTATGTGTGCATGTGCATG >hsa-miR-4489 TGGGGCTAGTGATGCAGGACG >hsa-miR-4308 TCCCTGGAGTTTCTTCTT >hsa-miR-5197-3p AAGAAGAGACTGAGTCATCGAAT >hsa-miR-625-5p AGGGGGAAAGTTCTATAGTCC >hsa-miR-454-3p TAGTGCAATATTGCTTATAGGGT >hsa-miR-4681 AACGGGAATGCAGGCTGTATCT >hsa-miR-4760-5p TTTAGATTGAACATGAAGTTAG >hsa-miR-93-5p CAAAGTGCTGTTCGTGCAGGTAG >hsa-miR-4763-5p CGCCTGCCCAGCCCTCCTGCT >hsa-miR-4646-5p ACTGGGAAGAGGAGCTGAGGGA >hsa-miR-644b-3p TTCATTTGCCTCCCAGCCTACA >hsa-miR-5009-5p TTGGACTTTTTCAGATTTGGGGAT >hsa-miR-4322 CTGTGGGCTCAGCGCGTGGGG >hsa-miR-548aq-3p CAAAAACTGCAATTACTTTTGC >hsa-miR-548s ATGGCCAAAACTGCAGTTATTTT >hsa-miR-4721 TGAGGGCTCCAGGTGACGGTGG >hsa-miR-421 ATCAACAGACATTAATTGGGCGC >hsa-miR-876-5p TGGATTTCTTTGTGAATCACCA >hsa-let-7d-5p AGAGGTAGTAGGTTGCATAGTT >hsa-miR-582-5p TTACAGTTGTTCAACCAGTTACT >hsa-miR-4436a GCAGGACAGGCAGAAGTGGAT >hsa-miR-4778-3p TCTTCTTCCTTTGCAGAGTTGA >hsa-miR-181c-5p AACATTCAACCTGTCGGTGAGT >hsa-miR-559 TAAAGTAAATATGCACCAAAA >hsa-miR-4499 AAGACTGAGAGGAGGGA >hsa-miR-374a-5p TTATAATACAACCTGATAAGTG >hsa-miR-921 CTAGTGAGGGACAGAACCAGGATTC >hsa-miR-3692-5p CCTGCTGGTCAGGAGTGGATACTG >hsa-miR-5695 ACTCCAAGAAGAATCTAGACAG >hsa-miR-30d-5p TGTAAACATCCCCGACTGGAAG >hsa-miR-4716-3p AAGGGGGAAGGAAACATGGAGA >hsa-miR-500a-5p TAATCCTTGCTACCTGGGTGAGA >hsa-miR-4529-3p ATTGGACTGCTGATGGCCCGT >hsa-miR-5694 CAGATCATGGGACTGTCTCAG >hsa-miR-4427 TCTGAATAGAGTCTGAAGAGT >hsa-miR-29b-2-5p CTGGTTTCACATGGTGGCTTAG >hsa-miR-152 TCAGTGCATGACAGAACTTGG >hsa-miR-135b-5p TATGGCTTTTCATTCCTATGTGA >hsa-miR-2054 CTGTAATATAAATTTAATTTATT >hsa-miR-4420 GTCACTGATGTCTGTAGCTGAG >hsa-miR-5571-3p GTCCTAGGAGGCTCCTCTG >hsa-miR-3153 GGGGAAAGCGAGTAGGGACATTT >hsa-miR-218-2-3p CATGGTTCTGTCAAGCACCGCG >hsa-miR-4535 GTGGACCTGGCTGGGAC 
>hsa-miR-5707 ACGTTTGAATGCTGTACAAGGC >hsa-miR-409-5p AGGTTACCCGAGCAACTTTGCAT >hsa-miR-4464 AAGGTTTGGATAGATGCAATA >hsa-miR-99b-5p CACCCGTAGAACCGACCTTGCG >hsa-miR-154-5p TAGGTTATCCGTGTTGCCTTCG >hsa-miR-4511 GAAGAACTGTTGCATTTGCCCT >hsa-miR-5696 CTCATTTAAGTAGTCTGATGCC >hsa-miR-1250 ACGGTGCTGGATGTGGCCTTT >hsa-miR-182-3p TGGTTCTAGACTTGCCAACTA >hsa-miR-2681-3p TATCATGGAGTTGGTAAAGCAC >hsa-miR-1266 CCTCAGGGCTGTAGAACAGGGCT >hsa-miR-922 GCAGCAGAGAATAGGACTACGTC >hsa-miR-3662 GAAAATGATGAGTAGTGACTGATG >hsa-miR-513a-5p TTCACAGGGAGGTGTCAT >hsa-miR-4448 GGCTCCTTGGTCTAGGGGTA >hsa-miR-2682-5p CAGGCAGTGACTGTTCAGACGTC >hsa-miR-3173-5p TGCCCTGCCTGTTTTCTCCTTT >hsa-miR-3918 ACAGGGCCGCAGATGGAGACT >hsa-miR-1252 AGAAGGAAATTGAATTCATTTA >hsa-miR-4512 CAGGGCCTCACTGTATCGCCCA >hsa-miR-4326 TGTTCCTCTGTCTCCCAGAC >hsa-miR-16-5p TAGCAGCACGTAAATATTGGCG >hsa-miR-4302 CCAGTGTGGCTCAGCGAG >hsa-miR-550a-3-5p AGTGCCTGAGGGAGTAAGAG >hsa-miR-4265 CTGTGGGCTCAGCTCTGGG >hsa-miR-4451 TGGTAGAGCTGAGGACA >hsa-miR-516a-5p TTCTCGAGGAAAGAAGCACTTTC >hsa-miR-5681a AGAAAGGGTGGCAATACCTCTT >hsa-miR-1236 CCTCTTCCCCTTGTCTCTCCAG >hsa-miR-3942-3p TTTCAGATAACAGTATTACAT >hsa-miR-627 GTGAGTCTCTAAGAAAAGAGGA >hsa-miR-548av-5p AAAAGTACTTGCGGATTT >hsa-miR-612 GCTGGGCAGGGCTTCTGAGCTCCTT >hsa-miR-376a-3p ATCATAGAGGAAAATCCACGT >hsa-miR-4735-5p CCTAATTTGAACACCTTCGGTA >hsa-miR-520d-5p CTACAAAGGGAAGCCCTTTC >hsa-miR-548a-5p AAAAGTAATTGCGAGTTTTACC >hsa-miR-26a-1-3p CCTATTCTTGGTTACTTGCACG >hsa-miR-3681-3p ACACAGTGCTTCATCCACTACT >hsa-miR-371b-3p AAGTGCCCCCACAGTTTGAGTGC >hsa-miR-3180 TGGGGCGGAGCTTCCGGAG >hsa-miR-3138 TGTGGACAGTGAGGTAGAGGGAGT >hsa-miR-2113 ATTTGTGCTTGGCTCTGTCAC >hsa-miR-3161 CTGATAAGAACAGAGGCCCAGAT >hsa-miR-3144-5p AGGGGACCAAAGAGATATATAG >hsa-miR-548ag AAAGGTAATTGTGGTTTCTGC >hsa-miR-130b-5p ACTCTTTCCCTGTTGCACTAC >hsa-miR-146b-5p TGAGAACTGAATTCCATAGGCT >hsa-miR-4746-5p CCGGTCCCAGGAGAACCTGCAGA >hsa-miR-4491 AATGTGGACTGGTGTGACCAAA >hsa-miR-4663 AGCTGAGCTCCATGGACGTGCAGT >hsa-miR-1303 TTTAGAGACGGGGTCTTGCTCT >hsa-miR-5196-5p 
AGGGAAGGGGACGAGGGTTGGG >hsa-miR-504 AGACCCTGGTCTGCACTCTATC >hsa-miR-3119 TGGCTTTTAACTTTGATGGC >hsa-miR-3973 ACAAAGTACAGCATTAGCCTTAG >hsa-miR-3129-3p AAACTAATCTCTACACTGCTGC >hsa-miR-138-5p AGCTGGTGTTGTGAATCAGGCCG >hsa-miR-4757-5p AGGCCTCTGTGACGTCACGGTGT >hsa-miR-33a-3p CAATGTTTCCACAGTGCATCAC >hsa-miR-5689 AGCATACACCTGTAGTCCTAGA >hsa-miR-4691-5p GTCCTCCAGGCCATGAGCTGCGG >hsa-miR-4277 GCAGTTCTGAGCACAGTACAC >hsa-miR-4795-5p AGAAGTGGCTAATAATATTGA >hsa-miR-4760-3p AAATTCATGTTCAATCTAAACC >hsa-miR-2276 TCTGCAAGTGTCAGAGGCGAGG >hsa-miR-4325 TTGCACTTGTCTCAGTGA >hsa-miR-3135a TGCCTAGGCTGAGACTGCAGTG >hsa-miR-506-3p TAAGGCACCCTTCTGAGTAGA >hsa-miR-378i ACTGGACTAGGAGTCAGAAGG >hsa-miR-432-5p TCTTGGAGTAGGTCATTGGGTGG >hsa-miR-25-3p CATTGCACTTGTCTCGGTCTGA >hsa-miR-302c-5p TTTAACATGGGGGTACCTGCTG >hsa-miR-631 AGACCTGGCCCAGACCTCAGC >hsa-miR-548aq-5p GAAAGTAATTGCTGTTTTTGCC >hsa-miR-3614-3p TAGCCTTCAGATCTTGGTGTTTT >hsa-miR-744-3p CTGTTGCCACTAACCTCAACCT >hsa-miR-378g ACTGGGCTTGGAGTCAGAAG >hsa-miR-4738-5p ACCAGCGCGTTTTCAGTTTCAT >hsa-miR-4999-3p TCACTACCTGACAATACAGT >hsa-miR-4295 CAGTGCAATGTTTTCCTT >hsa-miR-3185 AGAAGAAGGCGGTCGGTCTGCGG >hsa-miR-518f-5p CTCTAGAGGGAAGCACTTTCTC >hsa-miR-558 TGAGCTGCTGTACCAAAAT >hsa-miR-4266 CTAGGAGGCCTTGGCC >hsa-miR-525-3p GAAGGCGCTTCCCTTTAGAGCG >hsa-miR-564 AGGCACGGTGTCAGCAGGC >hsa-miR-4718 AGCTGTACCTGAAACCAAGCA >hsa-miR-519d CAAAGTGCCTCCCTTTAGAGTG >hsa-miR-92a-1-5p AGGTTGGGATCGGTTGCAATGCT >hsa-miR-5700 TAATGCATTAAATTATTGAAGG >hsa-miR-4758-5p GTGAGTGGGAGCCGGTGGGGCTG >hsa-miR-580 TTGAGAATGATGAATCATTAGG >hsa-miR-1262 ATGGGTGAATTTGTAGAAGGAT >hsa-miR-2278 GAGAGCAGTGTGTGTTGCCTGG >hsa-miR-34b-5p TAGGCAGTGTCATTAGCTGATTG >hsa-miR-145-3p GGATTCCTGGAAATACTGTTCT >hsa-miR-128 TCACAGTGAACCGGTCTCTTT >hsa-miR-604 AGGCTGCGGAATTCAGGAC >hsa-miR-1287 TGCTGGATCAGTGGTTCGAGTC >hsa-miR-4530 CCCAGCAGGACGGGAGCG >hsa-miR-378b ACTGGACTTGGAGGCAGAA >hsa-miR-466 ATACACATACACGCAACACACAT >hsa-miR-4710 GGGTGAGGGCAGGTGGTT >hsa-miR-197-3p TTCACCACCTTCTCCACCCAGC >hsa-miR-328 CTGGCCCTCTCTGCCCTTCCGT 
>hsa-miR-3927 CAGGTAGATATTTGATAGGCAT >hsa-miR-5693 GCAGTGGCTCTGAAATGAACTC >hsa-miR-135a-3p TATAGGGATTGGAGCCGTGGCG >hsa-miR-5002-5p AATTTGGTTTCTGAGGCACTTAGT >hsa-miR-708-3p CAACTAGACTGTGAGCTTCTAG >hsa-miR-5571-5p CAATTCTCAAAGGAGCCTCCC >hsa-miR-30c-1-3p CTGGGAGAGGGTTGTTTACTCC >hsa-miR-634 AACCAGCACCCCAACTTTGGAC >hsa-miR-1283 TCTACAAAGGAAAGCGCTTTCT >hsa-miR-4692 TCAGGCAGTGTGGGTATCAGAT >hsa-miR-766-3p ACTCCAGCCCCACAGCCTCAGC >hsa-miR-769-3p CTGGGATCTCCGGGGTCTTGGTT >hsa-miR-4321 TTAGCGGTGGACCGCCCTGCG >hsa-miR-549 TGACAACTATGGATGAGCTCT >hsa-miR-1253 AGAGAAGAAGATCAGCCTGCA >hsa-miR-5100 TTCAGATCCCAGCGGTGCCTCT >hsa-miR-329 AACACACCTGGTTAACCTCTTT >hsa-miR-5192 AGGAGAGTGGATTCCAGGTGGT >hsa-miR-3674 ATTGTAGAACCTAAGATTGGCC >hsa-miR-561-5p ATCAAGGATCTTAAACTTTGCC >hsa-miR-1273g-3p ACCACTGCACTCCAGCCTGAG >hsa-miR-345-3p GCCCTGAACGAGGGGTCTGGAG >hsa-miR-494 TGAAACATACACGGGAAACCTC >hsa-miR-5094 AATCAGTGAATGCCTTGAACCT >hsa-let-7f-1-3p CTATACAATCTATTGCCTTCCC >hsa-miR-4709-3p TTGAAGAGGAGGTGCTCTGTAGC >hsa-miR-32-5p TATTGCACATTACTAAGTTGCA >hsa-miR-548n CAAAAGTAATTGTGGATTTTGT >hsa-miR-3074-5p GTTCCTGCTGAACTGAGCCAG >hsa-miR-192-5p CTGACCTATGAATTGACAGCC >hsa-miR-4691-3p CCAGCCACGGACTGAGAGTGCAT >hsa-miR-212-5p ACCTTGGCTCTAGACTGCTTACT >hsa-miR-2909 GTTAGGGCCAACATCTCTTGG >hsa-miR-208b ATAAGACGAACAAAAGGTTTGT >hsa-miR-3909 TGTCCTCTAGGGCCTGCAGTCT >hsa-miR-4741 CGGGCTGTCCGGAGGGGTCGGCT >hsa-miR-3137 TCTGTAGCCTGGGAGCAATGGGGT >hsa-miR-4457 TCACAAGGTATTGACTGGCGTA >hsa-miR-22-3p AAGCTGCCAGTTGAAGAACTGT >hsa-miR-3133 TAAAGAACTCTTAAAACCCAAT >hsa-miR-5095 TTACAGGCGTGAACCACCGCG >hsa-miR-3529-3p AACAACAAAATCACTAGTCTTCCA >hsa-miR-491-3p CTTATGCAAGATTCCCTTCTAC >hsa-miR-524-3p GAAGGCGCTTCCCTTTGGAGT >hsa-miR-134 TGTGACTGGTTGACCAGAGGGG >hsa-miR-1285-3p TCTGGGCAACAAAGTGAGACCT >hsa-miR-2392 TAGGATGGGGGTGAGAGGTG >hsa-miR-615-3p TCCGAGCCTGGGTCTCCCTCTT >hsa-miR-4506 AAATGGGTGGTCTGAGGCAA >hsa-miR-106a-3p CTGCAATGTAAGCACTTCTTAC >hsa-miR-373-5p ACTCAAAATGGGGGCGCTTTCC >hsa-miR-603 CACACACTGCAATTACTTTTGC >hsa-miR-4726-5p 
AGGGCCAGAGGAGCCTGGAGTGG >hsa-miR-145-5p GTCCAGTTTTCCCAGGAATCCCT >hsa-miR-512-5p CACTCAGCCTTGAGGGCACTTTC >hsa-miR-16-2-3p CCAATATTACTGTGCTGCTTTA >hsa-miR-3689f TGTGATATCGTGCTTCCTGGGA >hsa-miR-4285 GCGGCGAGTCCGACTCAT >hsa-miR-5585-3p CTGAATAGCTGGGACTACAGGT >hsa-miR-759 GCAGAGTGCAAACAATTTTGAC >hsa-miR-4455 AGGGTGTGTGTGTTTTT >hsa-miR-4720-3p TGCTTAAGTTGTACCAAGTAT >hsa-miR-3179 AGAAGGGGTGAAATTTAAACGT >hsa-miR-4481 GGAGTGGGCTGGTGGTT >hsa-miR-4298 CTGGGACAGGAGGAGGAGGCAG >hsa-miR-509-3p TGATTGGTACGTCTGTGGGTAG >hsa-miR-4505 AGGCTGGGCTGGGACGGA >hsa-miR-658 GGCGGAGGGAAGTAGGTCCGTTGGT >hsa-miR-1206 TGTTCATGTAGATGTTTAAGC >hsa-miR-570-5p___hsa-miR-548ai AAAGGTAATTGCAGTTTTTCCC >hsa-miR-548ap-3p AAAAACCACAATTACTTTT >hsa-miR-378a-5p CTCCTGACTCCAGGTCCTGTGT >hsa-miR-5190 CCAGTGACTGAGCTGGAGCCA >hsa-miR-1293 TGGGTGGTCTGGAGATTTGTGC >hsa-miR-1296 TTAGGGCCCTGGCTCCATCTCC >hsa-miR-4458 AGAGGTAGGTGTGGAAGAA >hsa-miR-662 TCCCACGTTGTGGCCCAGCAG >hsa-miR-506-5p TATTCAGGAAGGTGTTACTTAA >hsa-miR-4704-3p TCAGTCACATATCTAGTGTCTA >hsa-miR-3617 AAAGACATAGTTGCAAGATGGG >hsa-miR-3190-3p TGTGGAAGGTAGACGGCCAGAGA >hsa-miR-320b AAAAGCTGGGTTGAGAGGGCAA >hsa-miR-4510 TGAGGGAGTAGGATGTATGGTT >hsa-miR-483-5p AAGACGGGAGGAAAGAAGGGAG >hsa-miR-4328 CCAGTTTTCCCAGGATT >hsa-miR-890 TACTTGGAAAGGCATCAGTTG >hsa-miR-548i AAAAGTAATTGCGGATTTTGCC >hsa-miR-99a-5p AACCCGTAGATCCGATCTTGTG >hsa-miR-4764-3p TTAACTCCTTTCACACCCATGG >hsa-miR-4264 ACTCAGTCATGGTCATT >hsa-miR-4780 ACCCTTGAGCCTGATCCCTAGC >hsa-miR-3065-3p TCAGCACCAGGATATTGTTGGAG >hsa-miR-5091 ACGGAGACGACAAGACTGTGCTG >hsa-miR-3124-5p TTCGCGGGCGAAGGCAAAGTC >hsa-miR-1976 CCTCCTGCCCTCCTTGCTGT >hsa-miR-3194-5p GGCCAGCCACCAGGAGGGCTG >hsa-miR-4488 AGGGGGCGGGCTCCGGCG >hsa-miR-4636 AACTCGTGTTCAAAGCCTTTAG >hsa-miR-99b-3p CAAGCTCGTGTCTGTGGGTCCG >hsa-miR-4751 AGAGGACCCGTAGCTGCTAGAAGG >hsa-miR-4305 CCTAGACACCTCCAGTTC >hsa-miR-3613-5p TGTTGTACTTTTTTTTTTGTTC >hsa-miR-340-5p TTATAAAGCAATGAGACTGATT >hsa-miR-628-5p ATGCTGACATATTTACTAGAGG >hsa-miR-4475 CAAGGGACCAAGCATTCATTAT >hsa-miR-181c-3p 
AACCATCGACCGTTGAGTGGAC >hsa-miR-3658 TTTAAGAAAACACCATGGAGAT >hsa-let-7f-5p TGAGGTAGTAGATTGTATAGTT >hsa-miR-491-5p AGTGGGGAACCCTTCCATGAGG >hsa-miR-3688-5p AGTGGCAAAGTCTTTCCATAT >hsa-miR-32-3p CAATTTAGTGTGTGTGATATTT >hsa-miR-4785 AGAGTCGGCGACGCCGCCAGC >hsa-miR-620 ATGGAGATAGATATAGAAAT >hsa-miR-3944-3p TTCGGGCTGGCCTGCTGCTCCGG >hsa-miR-140-3p TACCACAGGGTAGAACCACGG >hsa-miR-4645-3p AGACAGTAGTTCTTGCCTGGTT >hsa-miR-4787-3p GATGCGCCGCCCACTGCCCCGCGC >hsa-miR-519c-3p AAAGTGCATCTTTTTAGAGGAT >hsa-miR-644b-5p TGGGCTAAGGGAGATGATTGGGTA >hsa-miR-219-5p TGATTGTCCAAACGCAATTCT >hsa-miR-4485 TAACGGCCGCGGTACCCTAA >hsa-miR-5009-3p TCCTAAATCTGAAAGTCCAAAA >hsa-miR-1915-3p CCCCAGGGCGACGCGGCGGG >hsa-miR-4439 GTGACTGATACCTTGGAGGCAT >hsa-miR-767-5p TGCACCATGGTTGTCTGAGCATG >hsa-miR-2115-5p AGCTTCCATGACTCCTGATGGA >hsa-miR-520e AAAGTGCTTCCTTTTTGAGGG >hsa-miR-4768-5p ATTCTCTCTGGATCCCATGGAT >hsa-miR-4737 ATGCGAGGATGCTGACAGTG >hsa-miR-637 ACTGGGGGCTTTCGGGCTCTGCGT >hsa-miR-656 AATATTATACAGTCAACCTCT >hsa-miR-4755-5p TTTCCCTTCAGAGCCTGGCTTT >hsa-miR-518f-3p GAAAGCGCTTCTCTTTAGAGG >hsa-miR-3607-3p ACTGTAAACGCTTTCTGATG >hsa-miR-146a-3p CCTCTGAAATTCAGTTCTTCAG >hsa-miR-5004-5p TGAGGACAGGGCAAATTCACGA >hsa-miR-3925-3p ACTCCAGTTTTAGTTCTCTTG >hsa-miR-548al AACGGCAATGACTTTTGTACCA >hsa-miR-4283 TGGGGCTCAGCGAGTTT >hsa-miR-3184-3p AAAGTCTCGCTCTCTGCCCCTCA >hsa-miR-149-3p AGGGAGGGACGGGGGCTGTGC >hsa-miR-155-5p TTAATGCTAATCGTGATAGGGGT >hsa-miR-3165 AGGTGGATGCAATGTGACCTCA >hsa-miR-548ae CAAAAACTGCAATTACTTTCA >hsa-miR-4317 ACATTGCCAGGGAGTTT >hsa-miR-4804-5p TTGGACGGTAAGGTTAAGCAA >hsa-miR-4466 GGGTGCGGGCCGGCGGGG >hsa-miR-4732-5p TGTAGAGCAGGGAGCAGGAAGCT >hsa-miR-1973 ACCGTGCAAAGGTAGCATA >hsa-let-7a-3p CTATACAATCTACTGTCTTTC >hsa-miR-718 CTTCCGCCCCGCCGGGCGTCG >hsa-miR-526b-5p CTCTTGAGGGAAGCACTTTCTGT >hsa-miR-10a-5p TACCCTGTAGATCCGAATTTGTG >hsa-miR-1278 TAGTACTGTGCATATCATCTAT >hsa-miR-3183 GCCTCTCTCGGAGTCGCTCGGA >hsa-miR-4771 AGCAGACTTGACCTACAATTA >hsa-miR-1245a AAGTGATCTAAAGGCCTACAT >hsa-miR-7-1-3p CAACAAATCACAGTCTGCCATA 
>hsa-miR-3655 GCTTGTCGCTGCGGTGTTGCT >hsa-miR-122-5p TGGAGTGTGACAATGGTGTTTG >hsa-miR-548ao-3p AAAGACCGTGACTACTTTTGCA >hsa-miR-548b-5p AAAAGTAATTGTGGTTTTGGCC >hsa-miR-4761-3p GAGGGCATGCGCACTTTGTCC >hsa-miR-548ab AAAAGTAATTGTGGATTTTGCT >hsa-miR-492 AGGACCTGCGGGACAAGATTCTT >hsa-miR-1185-5p AGAGGATACCCTTTGTATGTT >hsa-miR-514a-5p TACTCTGGAGAGTGACAATCATG >hsa-miR-670 GTCCCTGAGTGTATGTGGTG >hsa-miR-129-1-3p AAGCCCTTACCCCAAAAAGTAT >hsa-miR-5688 TAACAAACACCTGTAAAACAGC >hsa-miR-18a-5p TAAGGTGCATCTAGTGCAGATAG >hsa-miR-4783-3p CCCCGGTGTTGGGGCGCGTCTGC >hsa-miR-676-5p TCTTCAACCTCAGGACTTGCA >hsa-miR-548aj-3p TAAAAACTGCAATTACTTTTA >hsa-miR-4700-3p CACAGGACTGACTCCTCACCCCAGTG >hsa-miR-1294 TGTGAGGTTGGCATTGTTGTCT >hsa-miR-3938 AATTCCCTTGTAGATAACCCGG >hsa-miR-4667-3p TCCCTCCTTCTGTCCCCACAG >hsa-miR-4644 TGGAGAGAGAAAAGAGACAGAAG >hsa-miR-4759 TAGGACTAGATGTTGGAATTA >hsa-miR-4728-3p CATGCTGACCTCCCTCCTGCCCCAG >hsa-miR-588 TTGGCCACAATGGGTTAGAAC >hsa-miR-3907 AGGTGCTCCAGGCTGGCTCACA >hsa-miR-1302 TTGGGACATACTTATGCTAAA >hsa-miR-582-3p TAACTGGTTGAACAACTGAACC >hsa-miR-485-5p AGAGGCTGGCCGTGATGAATTC >hsa-miR-3139 TAGGAGCTCAACAGATGCCTGTT >hsa-miR-548ak AAAAGTAACTGCGGTTTTTGA >hsa-miR-1322 GATGATGCTGCTGATGCTG >hsa-miR-655 ATAATACATGGTTAACCTCTTT >hsa-miR-502-5p ATCCTTGCTATCTGGGTGCTA >hsa-miR-20b-5p CAAAGTGCTCATAGTGCAGGTAG >hsa-miR-3167 AGGATTTCAGAAATACTGGTGT >hsa-miR-301a-5p GCTCTGACTTTATTGCACTACT >hsa-miR-4699-5p AGAAGATTGCAGAGTAAGTTCC >hsa-miR-340-3p TCCGTCTCAGTTACTTTATAGC >hsa-miR-380-5p TGGTTGACCATAGAACATGCGC >hsa-miR-4667-5p ACTGGGGAGCAGAAGGAGAACC >hsa-miR-649 AAACCTGTGTTGTTCAAGAGTC >hsa-miR-21-3p CAACACCAGTCGATGGGCTGT >hsa-miR-3661 TGACCTGGGACTCGGACAGCTG >hsa-miR-378c ACTGGACTTGGAGTCAGAAGAGTGG >hsa-miR-3616-5p ATGAAGTGCACTCATGATATGT >hsa-miR-4518 GCTCAGGGATGATAACTGTGCTGAGA >hsa-miR-3200-5p AATCTGAGAAGGCGCACAAGGT >hsa-miR-450b-3p TTGGGATCATTTTGCATCCATA >hsa-miR-222-3p AGCTACATCTGGCTACTGGGT >hsa-miR-4503 TTTAAGCAGGAAATAGAATTTA >hsa-miR-1297 TTCAAGTAATTCAGGTG >hsa-miR-4286 ACCCCACTCCTGGTACC 
>hsa-miR-1208 TCACTGTTCAGACAGGCGGA >hsa-miR-4440 TGTCGTGGGGCTTGCTGGCTTG >hsa-miR-4292 CCCCTGGGCCGGCCTTGG >hsa-miR-4524a-5p ATAGCAGCATGAACCTGTCTCA >hsa-miR-4763-3p AGGCAGGGGCTGGTGCTGGGCGGG >hsa-miR-2052 TGTTTTGATAACAGTAATGT >hsa-miR-4766-5p TCTGAAAGAGCAGTTGGTGTT >hsa-miR-4724-5p AACTGAACCAGGAGTGAGCTTCG >hsa-miR-4773 CAGAACAGGAGCATAGAAAGGC >hsa-miR-4666a-3p CATACAATCTGACATGTATTT >hsa-miR-4520a-5p___hsa-miR-4520b-5p CCTGCGTGTTTTCTGTCCAA >hsa-miR-2117 TGTTCTCTTTGCCAAGGACAG >hsa-miR-4750 CTCGGGCGGAGGTGGTTGAGTG >hsa-miR-3605-5p TGAGGATGGATAGCAAGGAAGCC >hsa-miR-3117-5p AGACACTATACGAGTCATAT >hsa-miR-4527 TGGTCTGCAAAGAGATGACTGT >hsa-miR-448 TTGCATATGTAGGATGTCCCAT >hsa-let-7e-3p CTATACGGCCTCCTAGCTTTCC >hsa-miR-186-5p CAAAGAATTCTCCTTTTGGGCT >hsa-miR-520a-3p AAAGTGCTTCCCTTTGGACTGT >hsa-miR-135a-5p TATGGCTTTTTATTCCTATGTGA >hsa-miR-5582-5p TAGGCACACTTAAAGTTATAGC >hsa-miR-1273c GGCGACAAAACGAGACCCTGTC >hsa-miR-142-3p TGTAGTGTTTCCTACTTTATGGA >hsa-miR-3190-5p TCTGGCCAGCTACGTCCCCA >hsa-miR-888-3p GACTGACACCTCTTTGGGTGAA >hsa-miR-3150b-5p CAACCTCGAGGATCTCCCCAGC >hsa-miR-4471 TGGGAACTTAGTAGAGGTTTAA >hsa-miR-330-5p TCTCTGGGCCTGTGTCTTAGGC >hsa-miR-4776-3p CTTGCCATCCTGGTCCACTGCAT >hsa-miR-3650 AGGTGTGTCTGTAGAGTCC >hsa-miR-3152-5p ATTGCCTCTGTTCTAACACAAG >hsa-miR-5703 AGGAGAAGTCGGGAAGGT >hsa-miR-223-3p TGTCAGTTTGTCAAATACCCCA >hsa-miR-5011-3p GTGCATGGCTGTATATATAACA >hsa-miR-765 TGGAGGAGAAGGAAGGTGATG >hsa-miR-3196 CGGGGCGGCAGGGGCCTC >hsa-miR-1246 AATGGATTTTTGGAGCAGG >hsa-miR-548p TAGCAAAAACTGCAGTTACTTT >hsa-miR-892b CACTGGCTCCTTTCTGGGTAGA >hsa-miR-411-5p TAGTAGACCGTATAGCGTACG >hsa-miR-499b-5p ACAGACTTGCTGTGATGTTCA >hsa-miR-3689a-3p CTGGGAGGTGTGATATCGTGGT >hsa-miR-652-5p CAACCCTAGGAGAGGGTGCCATTCA >hsa-miR-5089 GTGGGATTTCTGAGTAGCATC >hsa-miR-561-3p CAAAGTTTAAGATCCTTGAAGT >hsa-miR-4742-5p TCAGGCAAAGGGATATTTACAGA >hsa-miR-4496 GAGGAAACTGAAGCTGAGAGGG >hsa-miR-4494 CCAGACTGTGGCTGACCAGAGG >hsa-miR-5187-5p TGGGATGAGGGATTGAAGTGGA >hsa-miR-4509 ACTAAAGGATATAGAAGGTTTT >hsa-miR-381 TATACAAGGGCAAGCTCTCTGT 
>hsa-miR-4330 CCTCAGATCAGAGCCTTGC >hsa-miR-4649-5p TGGGCGAGGGGTGGGCTCTCAGAG >hsa-miR-4693-5p ATACTGTGAATTTCACTGTCACA >hsa-miR-4799-3p ACTGGCATGCTGCATTTATATA >hsa-miR-4301 TCCCACTACTTCACTTGTGA >hsa-miR-663a AGGCGGGGCGCCGCGGGACCGC >hsa-miR-511 GTGTCTTTTGCTCTGCAGTCA >hsa-miR-30d-3p CTTTCAGTCAGATGTTTGCTGC marcelm-sqt-d3218a8c5437/tests/seq.fa000066400000000000000000000064721302004573300172310ustar00rootroot00000000000000>Chr1 CCCTAAACCCTAAACCCTAAACCCTAAACCTCTGAATCCTTAATCCCTAAATCCCTAAATCTTTAAATCCTACATCCAT GAATCCCTAAATACCTAATTCCCTAAACCCGAAACCGGTTTCTCTGGTTGAAAATCATTGTGTATATAATGATAATTTT ATCGTTTTTATGTAATTGCTTATTGTTGTGTGTAGATTTTTTAAAAATATCATTTGAGGTCAATACAAATCCTATTTCT TGTGGTTTTCTTTCCTTCACTTAGCTATGGATGGTTTATCTTCATTTGTTATATTGGATACAAGCTTTGCTACGATCTA CATTTGGGAATGTGAGTCTCTTATTGTAACCTTAGGGTTGGTTTATCTCAAGAATCTTATTAATTGTTTGGACTGTTTA TTGTTTTGCTTCTTTGAAGTAGTTTCTCTTTGCAAAATTCCTCTTTTTTTAGAGTGATTTGGATGATTCAAGACTTCTC GGTACTGCAAAGTTCTTCCGCCTGATTAATTATCCATTTTACCTTTGTCGTAGATATTAGGTAATCTGTAAGTCAACTC ATATACAACTCATAATTTAAAATAAAATTATGATCGACACACGTTTACACATAAAATCTGTAAATCAACTCATATACCC GTTATTCCCACAATCATATGCTTTCTAAAAGCAAAAGTATATGTCAACAATTGGTTATAAATTATTAGAAGTTTTCCAC TTATGACTTAAGAACTTGTGAAGCAGAAAGTGGCAACACCCCCCACCTCCCCCCCCCCCCCCCACCCCCCAAATTGAGA AGTCAATTTTATATAATTTAATCAAATAAATAAGTTTATGGTTAAGAGTTTTTTACTCTCTTTATTTTTCTTTTTCTTT TTGAGACATACTGAAAAAAGTTGTAATTATTAATGATAGTTCTGTGATTCCTCCATGAATCACATCTGCTTGATTTTTC TTTCATAAATTTATAAGTAATACATTCTTATAAAATGGTCAGAGAAACACCAAAGATCCCGAGATTTCTTCTCACTTAC TTTTTTTCTATCTATCTAGATTATATAAATGAGATGTTGAATTAGAGGAACCTTTGATTCAATGATCATAGAAAAATTA GGTAAAGAGTCAGTGTCGTTATGTTATGGAAGATGTGAATGAAGTTTGACTTCTCATTGTATATGAGTAAAATCTTTTC TTACAAGGGAAGTCCCCAATTGGTCAACATGTGAAAGCACGTGTCATGTT >Chr2 CHROMOSOME dumped from ADB: Jun/20/09 14:54; last updated: 2009-02-02 ctcgaccaggacgatgaatgggcgatgaaaatctatcgggttagaggaatggtcgaccgggtccgagaattcgtcgacc aggacgaggagtggtcgaggatttgtcgaccaggagttgaaatcgtcgaccgggtccgagaattcgtcgaccaggacgg cggaaccctcgaccaggacgatgaatgggcgatgaaaatctatcgggttcgaggaatggtcgaccagaagttggaatcg 
TTCCGAGTTTTCTCAGCAGTTCTCGGACAAAAACTGATGAATCGTCGAGGAGAATGAGCTTGCCTTGCGTGGGCTGCCA TTAGTTCTTCGAGGCGTTAGGGTGGCGGCGGTATAAAAGTGTCGGAGTTTTTTCAGCAGTTCTCGGACAAAAATTGCTG AGTGGCCGAGAAGAATGGGCGTGTCATGCGTGGGCTGACATGGATTCTTCGAGGCCTAGGGGTGGCGGTATATAACTTG TTCGCATGATATTACCGAGATGTCCCCACGGGCATCTTTTCACCTCGTCGCCGAAGAGAATGGGCGTGTCATGCATGGG CTGACATGGATTCTCCTAGGCCGTTTGGGTGGCGGTATAGTCGTCCTGCGCACGAAATACCGAGATGTCCCCATGGGCA TCGATTCCACCCGCCTAGGTTGGATGGGCGTGCTTCGTTGGAAAGCATGGATCCGCCTAGGCTGTCCCGAGTGTGAGCG AGGTGTGAGTGTCGCCCATGGGCATCGACACCTTGCGGCTAGGAACTGGCGAGGGATGGTATCCGAGGGATGGTATCGG >Chr3 CHROMOSOME dumped from ADB: Jun/20/09 14:54; last updated: 2009-02-02 CNNNAATCCCTAAAACCATAATCCTAAATCCCTTAATTCCTAAATCCCTAATACTTAGACCCTAATCTTTAGTTCCTAG ACCCTAATCTTTAGTTCCTAGACCCTAAATCCATAATCCTTAATTCCTAAATTCCTAAATCCCTAATACTAAATCTCTA AATCCCTAGCAATTTTCAAGTTTTGCTTGATTGTTGTAGGATGGTCCTTTCTCTTGTTTCTTCTCTGTGTTGTTGAGAT TAGTTTGTTTAGGTTTGATAGCGTTGATTTTGGCCTGCGTTTGGTGACTCATATGGTTTGATTGGAGTTTGTTTCTGGG CTTGGTCTTGAGATTGGTGATTTTTTAGAGGTTAGACTTTTCTCCTTGTCTTTGGCCTTCCTGATCTATTTTGATAGGA TCGCGTTTTGAAGGTTTCTAGTTTGACGCAATTGGACGCAGCAGTCGTCAGTCGTCGTATTCGGAAGTTTGCTCGAGGT TTCCTTAACTCATTTGTCTTGCGTGCCGAGTTGTNNNGCCTTGGTTCTTGGTTTCTGAGAGATCAGCTCTCATCGTGAT AGGAGGAGGTAGATGTTATTTTTGGGTTGGCCACGGACATCCTTAGTAAATCTCAATCGCGGGTGCTTCAGAAGATTTG GTTTACTTGTGCTTTTGGTCTTTTGTTTCTTTTAAGATTTAAGTATGTTGCTTCTATCTCAGTTTTGGGTTTCGACCTC TTGTGTTGGTGGTTTGAAAGCTTTTGTTTTGTATTACTACTGTTTTTAGATTCCCTTCTATAGGGCTTAGCTTCCGGGA AAATTTCGTCTTCTGCCGACTTTGACTCCGGAGACTTTCTTGTCGTCCGCCGACTTTAGTTTCTCTTTTTGAGGATTTA TATTTCACTTCGGTTCCAATCCAAGATCAATAATATAATTTTAAATGATAAAAAAAAATCAAAAATAATTTCGGGTTTG TCTTGTAATCTCTCATGTTTAAACCGGGAAACCAAATATTGGAAATAATATCCGGTTTGTCTTGTAATCTCATGTTTAA ACTGGGTAACCGAATACTGGAAATAATCATCAGAATATATACCTTGCTTTCTTCTTTGTTTTATCTTTGATGATTCATG ATTACAAAGATGGAACAAGTACGAGGAAACCCAAATCTTGCAAGTTCCAACTAATT marcelm-sqt-d3218a8c5437/tests/seq.fa.fai000066400000000000000000000002061302004573300177540ustar00rootroot00000000000000Chr1 1235 6 79 80 Chr2 Chr2 CHROMOSOME dumped from ADB: Jun/20/09 
def nongap_characters(row):
    """
    Strip the gap markers (NUL characters) from an alignment row.

    The row may be a bytes object or a str; the result has the same type.
    """
    try:
        stripped = row.replace(b'\0', b'')
    except TypeError:
        # str.replace() rejects bytes arguments, so retry with str arguments
        stripped = row.replace('\0', '')
    return stripped
def test_reference_to_query_position():
    """
    Check reference_to_query_length() for every reference length of a
    CIGAR that contains clipping, a deletion and an insertion.
    """
    cig = parse('2H 2S 2M 2D 2M 2I 2M 2S 2H')
    # (reference length, expected query length); the comment names the
    # CIGAR prefix that is consumed to reach that reference length
    expectations = [
        (0, 4),   # 2H 2S
        (1, 5),   # 2H 2S 1M
        (2, 6),   # 2H 2S 2M
        (3, 6),   # 2H 2S 2M 1D
        (4, 6),   # 2H 2S 2M 2D
        (5, 7),   # 2H 2S 2M 2D 1M
        (6, 8),   # 2H 2S 2M 2D 2M
        (7, 11),  # 2H 2S 2M 2D 2M 2I 1M
        (8, 12),  # 2H 2S 2M 2D 2M 2I 2M
    ]
    for reference_length, query_length in expectations:
        assert reference_to_query_length(cig, reference_length) == query_length
    # one past the aligned part of the reference
    assert reference_to_query_length(cig, 9) is None
def test_encode():
    """Check encode() on all 16 dinucleotides, on unknown bases and on longer inputs."""
    expectations = [
        # all dinucleotides
        ("AA", "A0"), ("AC", "A1"), ("AG", "A2"), ("AT", "A3"),
        ("CA", "C1"), ("CC", "C0"), ("CG", "C3"), ("CT", "C2"),
        ("GA", "G2"), ("GC", "G3"), ("GG", "G0"), ("GT", "G1"),
        ("TA", "T3"), ("TC", "T2"), ("TG", "T1"), ("TT", "T0"),
        # unknown characters
        ("TN", "T4"), ("NT", "N4"), ("NN", "N4"),
        # longer sequences
        ("ACGGTC", "A13012"),
        ("TTT.N", "T0044"),
        ("TTNT.N", "T04444"),
    ]
    for nucleotides, colors in expectations:
        assert encode(nucleotides) == colors
def test_n_intervals_bytes():
    """
    Check n_intervals() on bytes input, passing N as the integer code of b'N'.

    This function used to be named test_n_intervals as well. The str variant
    defined after it replaced it in the module namespace, so these assertions
    were never executed by the test runner.
    NOTE(review): since this test never ran before, confirm that it passes
    against the current n_intervals() implementation.
    """
    assert list(n_intervals(b'', N=ord(b'N'))) == []
    assert list(n_intervals(b'N', N=ord(b'N'))) == [(0, 1)]
    assert list(n_intervals(b'n', N=ord(b'N'))) == [(0, 1)]
    assert list(n_intervals(b'an', N=ord(b'N'))) == [(1, 2)]
    assert list(n_intervals(b'ACGTNNAC', N=ord(b'N'))) == [(4, 6)]
    assert list(n_intervals(b'NCGTNNACN', N=ord(b'N'))) == [(0, 1), (4, 6), (8, 9)]


def test_n_intervals():
    """Check n_intervals() on str input with the default N character."""
    assert list(n_intervals('')) == []
    assert list(n_intervals('N')) == [(0, 1)]
    assert list(n_intervals('n')) == [(0, 1)]
    assert list(n_intervals('an')) == [(1, 2)]
    assert list(n_intervals('ACGTNNAC')) == [(4, 6)]
    assert list(n_intervals('NCGTNNACN')) == [(0, 1), (4, 6), (8, 9)]
def dpath(path):
    """Return *path* joined onto the directory that contains this test module."""
    tests_dir = os.path.dirname(__file__)
    return os.path.join(tests_dir, path)
def test_fastareader_upper():
    """
    Read seq.fa without a ``case`` argument and check that the sequences
    come back in upper case (Chr2 is stored in lower case in the file).
    """
    with FastaReader(dpath("seq.fa")) as reader:
        seqs = list(reader)
    first = seqs[0]
    assert first.name == 'Chr1'
    assert len(first.sequence) == 1235
    assert first.sequence.startswith('CCCTAAACCCTAAACCCTAAACCCTAAACCTCTGAATCCTTAATC')
    assert seqs[1].sequence.startswith('CTCGACCAGGACGATGAATGGGC')
def test_sequence_reader():
    """SequenceReader should auto-detect FASTA vs FASTQ input."""
    detected = (
        ("seq.fa", 'fasta'),
        ("seq.fastq", 'fastq'),
    )
    for filename, expected_format in detected:
        with SequenceReader(dpath(filename)) as sr:
            assert sr.format == expected_format
def test_all_regions():
    """
    Compare every substring returned by IndexedFasta and NonIndexedFasta
    against the sequences obtained from a plain FastaReader.
    """
    path = dpath("indexed.fasta")
    sequences = dict()
    with FastaReader(path, mode='rb') as fr:
        for record in fr:
            # key by the first word of the header only
            sequences[record.name.split(' ', 1)[0]] = record.sequence

    non_indexed = NonIndexedFasta(path)
    # Use the instance opened by the with statement; previously a second,
    # never-closed IndexedFasta was created inside the block while the
    # managed one went unused.
    with IndexedFasta(path) as indexed:
        for name in sorted(sequences):
            sequence = sequences[name]
            length = len(sequence)
            for start in range(length):
                # stop is exclusive, so it must run up to and including
                # `length` for slices ending at the last base to be covered
                for stop in range(start, length + 1):
                    expected = sequence[start:stop]
                    assert indexed[name][start:stop] == expected
                    assert non_indexed[name][start:stop] == expected
def test_median():
    """Check frequency_median() on {value: count} tables."""
    cases = [
        ({5: 2, 8: 4}, 8),
        ({5: 2, 8: 3}, 8),
        ({5: 1, 19: 2}, 19),
        ({27: 20, 5: 1, 19: 2}, 27),

        # one value
        ({5: 1}, 5),
        ({5: 1000}, 5),

        # five values
        ({5: 0, 8: 5}, 8),
        ({5: 1, 8: 4}, 8),
        ({5: 2, 8: 3}, 8),
        ({5: 3, 8: 2}, 5),
        ({5: 4, 8: 1}, 5),
        ({5: 5, 8: 0}, 5),

        # six values
        ({5: 0, 8: 6}, 8),
        ({5: 1, 8: 5}, 8),
        ({5: 2, 8: 4}, 8),
        ({5: 3, 8: 3}, 5),  # see doc
        ({5: 4, 8: 2}, 5),
        ({5: 5, 8: 1}, 5),
        ({5: 6, 8: 0}, 5),
    ]
    for frequencies, expected in cases:
        assert median(frequencies) == expected
def replacement_table(order, n):
    """
    Return a dict that describes how to replace each base and in which order.

    order -- string of four bases; position i gives the first candidate
             replacement for the i-th base of 'ACGT', further candidates
             follow cyclically
    n     -- number of replacements to keep per base

    For order='CGTA' and n=3 the result is

    {'A': 'CGT', 'G': 'TAC', 'C': 'GTA', 'T': 'ACG'}

    The first entry means: If there is an A in the original sequence,
    replace it by C, then G, then T.

    A base is never listed as its own replacement, even when *order*
    places it at its own position (for example order='ACGT').
    """
    cyclic = order + order  # makes indexing easier: order[i:] wraps around
    replace = dict()
    for i, c in enumerate('ACGT'):
        r = ''
        # Scan from position i itself so that the first candidate is also
        # subject to the "not the base itself" check.
        for d in cyclic[i:]:
            if d != c and d not in r:
                r += d
        replace[c] = r[:n]
    return replace
def argument_parser():
    """
    Build the command-line parser for the GO enrichment tool.

    Positional arguments are the ontology (.obo) and, optionally, the ranked
    gene list (standard input when omitted). Returns the configured
    argparse.ArgumentParser.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("obo", help="The gene ontology in .obo format.")
    # The gene list is read with a tab-delimited csv reader: first column is
    # the gene, the remaining columns are its GO terms.
    parser.add_argument("genelist", nargs="?", default=sys.stdin,
        help="A ranked list of genes. "
        "Each row should contain the gene followed by a tab separated list "
        "of GO terms the gene is contained in.")
    parser.add_argument("--draw-graph", "-g",
        help="Draw the graph of enriched GO terms.")
    parser.add_argument("--max-partition", "-m", type=int, metavar="M",
        help="Consider only the first M partitions when calculating an "
        "enrichment of a GO term against the ranked genelist.")
    # "--forground" is misspelled but kept as-is for backward compatibility.
    parser.add_argument("--fixed-partition", "--forground", type=int, metavar="M",
        help="Consider the first M genes as foreground and the rest as "
        "background when calculating an enrichment of a GO term against "
        "the ranked genelist.")
    parser.add_argument("--max-pvalue", "-p", type=float, metavar="P", default=0.001,
        help="Maximum uncorrected p-value to report an enrichment.")
    parser.add_argument("--usefdr", action="store_true",
        help="Use FDR for p-value cutoff (see above).")
    parser.add_argument("--fast", "-f", action="store_true",
        help="Calculate an upper bound rather than the exact p-value.")
    return parser
def tails(N, B, interm, max_partition, hypergeometric_tail=hypergeometric_tail):
    """
    Yield the hypergeometric tail candidates for the minimum-HGT search.

    N -- number of overall genes
    B -- size of the GO term
    interm -- for gene i in the ranked list, whether it belongs to the GO term
    max_partition -- largest prefix length n to consider

    Yields tuples (HGT(b;N,B,n), n, b), where b is the number of term
    members among the first n genes. The prefix n=1 is always yielded;
    longer prefixes are yielded only where b increases.
    """
    # b must account for the first gene too; starting at 0 regardless of
    # interm[0] made every later tail use a b that was one too small
    # whenever the top-ranked gene belongs to the term.
    b = int(interm[0])
    yield hypergeometric_tail(N, B, 1, b), 1, b  # yield the first element
    for n in range(2, min(len(interm), max_partition) + 1):
        if interm[n-1]:
            # only need to compute the tail if b increases, else hgt will
            # become only bigger and we need the minimum
            b += 1
            yield hypergeometric_tail(N, B, n, b), n, b
class GOTerm:
    """
    One [Term] stanza of a gene ontology .obo file.

    Instances register themselves in a class-level table so that they can be
    looked up by numeric GO id via GOTerm.byid().
    """

    # all parsed terms keyed by numeric id; shared by every instance
    _byid = dict()

    @classmethod
    def byid(cls, id):
        """Return the registered term for a numeric id or a 'GO:0001234' string."""
        if isinstance(id, str):
            return cls._byid[goid(id)]
        return cls._byid[id]

    def __init__(self, obofile):
        """
        Parse the stanza that follows a "[Term]" line.

        obofile -- open file (or other line iterator) positioned directly
                   after the "[Term]" line
        """
        values = self.parse_goterm(obofile)
        self.id = goid(values["id"][0])
        self.name = values["name"][0]
        self.namespace = values["namespace"][0]
        # "def" and "subset" may be absent from a stanza
        self.definition = values["def"][0] if values["def"] else None
        self.subset = values["subset"][0] if values["subset"] else None
        self.is_a = list(map(goid, values["is_a"]))
        self._byid[self.id] = self

    @staticmethod
    def parse_goterm(obofile):
        """
        Read "key: value" lines up to the next empty line.

        Returns a defaultdict(list) mapping each key to the list of its
        values in file order. Everything from a '!' onwards (a trailing
        comment) is dropped from the value. Lines that are not of the form
        "key: value" are skipped.
        """
        values = defaultdict(list)
        # The named groups are required: they are read back by name below.
        regex = re.compile(r"(?P<key>\w+): (?P<value>[^!]+)")
        for l in obofile:
            if l == "\n":
                break
            match = regex.match(l)
            if match is None:
                continue
            values[match.group("key")].append(match.group("value").strip())
        return values

    def __repr__(self):
        return "GO:{:07} {}".format(self.id, self.name)
def __init__(self, id, goterms): self.id = id self.goterms = set(map(goid, goterms)) def interm(self, goterm): return goterm.id in self.goterms ################ process data ################# def calc_fdrs(pvalues, sortedindex, n): """ Calculate FDR with the algorithm of Benjamini-Hochberg as implemented in the R package multtest. From Benjamini-Hochberg, 1995: let k be the largest i for which P_i <= i / n * (q*) then reject all H_i for i = 1,...,k. Thereby, above procedure controls the false discovery rate at q*. In other words, the false discovery rate FDR_i for P_i is P_i * n / i <= FDR_i . """ fdr = np.empty_like(pvalues) if n: fdr[sortedindex[n-1]] = pvalues[sortedindex[n-1]] for i in reversed(range(n-1)): fdr[sortedindex[i]] = min(fdr[sortedindex[i+1]], pvalues[sortedindex[i]] * (n / (i+1)), 1) assert fdr[sortedindex[i]] >= pvalues[sortedindex[i]] return fdr def calc_interm(goterm, genelist): return [goterm.id in gene.goterms for gene in genelist] def is_hit(interm, max_partition = None): if max_partition is None: max_partition = len(interm) return sum(interm[:max_partition]) >= 1 def calc_enrichments(goterms, genelist, max_partition = None, fixed_partition = None, max_pvalue = 0.001, fast = False): N = len(genelist) max_partition = min(len(genelist), N) if max_partition is None else max_partition pvalues = np.ones(len(goterms)) interms = [] params = [] print("test", file=sys.stderr) for i, goterm in enumerate(goterms): interm = calc_interm(goterm, genelist) interms.append(interm) B = sum(interm) if is_hit(interm, max_partition=max_partition): if not fixed_partition is None: # TODO pvalue in this case has to be computed differently n = fixed_partition b = sum(interm[:fixed_partition]) mhgt = hypergeometric_tail(N, B, n, b) else: mhgt, n, b = min_hypergeometric_tail(N, B, interm, max_partition) if mhgt < max_pvalue: # use lower bound of p-value as in Eden et al. Plos Comp. Biol. 
2007 to omit unnecessary computations if fast: pvalues[i] = B * mhgt else: pvalues[i] = pvalue(N, B, mhgt) assert mhgt - pvalues[i] <= 0.0001 assert pvalues[i] - B * mhgt <= 0.0001 else: n, b = 0, 0 # no hit in possible partitions params.append((N, B, n, b)) print(i, "of", len(goterms), "done", file=sys.stderr) return pvalues, interms, params def significant_indices(sortedindex, hits, pvalues, max_pvalue): return set(i for i in islice(sortedindex, hits) if pvalues[i] <= max_pvalue) ################## drawing ################### def collect_terms(goterms, significant): visited = set(significant) queue = list(visited) parents = dict() while queue: goterm = queue.pop(0) parents[goterm] = list(map(GOTerm.byid, goterm.is_a)) for parent in parents[goterm]: if parent not in visited: visited.add(parent) queue.append(parent) return visited, parents def draw_terms(outfile, goterms, pvalues, fdrs, params, significant, usefdr, maxpvalue): stat = pvalues if not usefdr else fdrs with open(outfile, "w") as dot: dot.write("digraph enrichment {\n") dot.write("node [shape=Mrecord,style=filled];") significant = set(goterms[i] for i in significant) visited, parents = collect_terms(goterms, significant) for i, goterm in enumerate(goterms): if goterm not in visited: continue if goterm in significant: pval = "\\npvalue: " if not usefdr else "\\nfdr: " pval += "{:.2e}".format(stat[i]) saturation = 1 - stat[i] / maxpvalue else: saturation = 0 pval = "" dot.write("{}[label=\"{}{}\",fillcolor=\"0.0 {} 1.0\"];\n".format(goterm.id, "\\n".join(textwrap.wrap(str(goterm), 30)), pval, saturation)) for parent in parents[goterm]: dot.write("{} -> {};\n".format(parent.id, goterm.id)) dot.write("}") def main(): parser = argument_parser() args = parser.parse_args() #import yappi #yappi.start() if args.genelist == sys.stdin: genelist = list(parse_genelist(args.genelist)) else: with open(args.genelist) as f: genelist = list(parse_genelist(f)) goterms = list(parse_obo(args.obo)) pvalues, interms, params = 
calc_enrichments(goterms, genelist, max_partition=args.max_partition, fixed_partition=args.fixed_partition, max_pvalue = args.max_pvalue, fast = args.fast) hits = sum(is_hit(interm) for interm in interms) sortedindex = sorted(range(len(pvalues)), key=pvalues.__getitem__) fdrs = calc_fdrs(pvalues, sortedindex, hits) if args.usefdr: significant = significant_indices(sortedindex, hits, fdrs, args.max_pvalue) else: significant = significant_indices(sortedindex, hits, pvalues, args.max_pvalue) if args.draw_graph: draw_terms(args.draw_graph, goterms, pvalues, fdrs, params, significant, args.usefdr, args.max_pvalue) print("goterm\tp-value\tfdr\ttotal num of genes\tgenes in GO term\tnum of genes in partition\tnum of genes in partition and GO term\tgenes") for i in islice(sortedindex, hits): N, B, n, b = params[i] if i in significant: genes = (genelist[j].id for j, isin in islice(enumerate(interms[i]), n) if isin) print("{goterm}\t{pvalue}\t{fdr}\t{params}\t{genes}".format(goterm=goterms[i], pvalue=pvalues[i], fdr=fdrs[i], params="\t".join(map(str, (N,B,n,b))), genes="\t".join(genes))) #with open("profile.txt", "w") as out: # yappi.print_stats(out=out, sort_type=2) def test(): N = 330 B = 30 mhgt = 0.0000001 while mhgt < 1: assert mhgt <= pvalue(N, B, mhgt) <= B*mhgt print(mhgt, pvalue(N, B, mhgt), B*mhgt) mhgt *= 10 if __name__ == "__main__": main() marcelm-sqt-d3218a8c5437/unmaintained/sqt-replace-ids000077500000000000000000000176451302004573300223720ustar00rootroot00000000000000#!/usr/bin/env python """Interpretes longest alphanumeric (plus - and _) strings from stdin as IDs (of given type available in ENSEMBL) and translates those to the given target ID type using ENSEMBL BioMart.""" from __future__ import print_function __author__ = "Johannes Koester" import sys, argparse, csv, re from collections import defaultdict from functools import partial import logging if sys.version_info < (3,0): import httplib as http def readbytes(data): for l in data.read().split("\n"): 
yield l else: import http.client as http def readbytes(data): for l in data: yield str(l, "iso8859-1") #_ID_REGEXP = "[-_\w]+" _ID_DELIMITER = "\t" def id_candidates(id): """ Map id to lower and uppercase versions to ensure proper mapping """ yield id if not id.isupper(): yield id.upper() if not id.islower(): yield id.lower() class TableLookup: def __init__(self, table, keep = False): self._table = table self._idmap = dict() self._queue = set() self._orig_id = dict() self._keep = keep def enqueue_lookup(self, id): """ Add a possible id for lookup """ if not id: return if id.lower() in self._table: self._idmap[id] = self._table[id.lower()] return for _id in id_candidates(id): if not _id in self._idmap: self._queue.add(_id) self._orig_id[_id] = id def map(self, id): """ Map id to the looked up target """ format = _ID_DELIMITER.join if self._keep else lambda t: t[1] for i in id_candidates(id): try: return format((i, self._idmap[i])) except KeyError: pass return id def lookup(self): pass def has_full_queue(self): return False def has_empty_queue(self): return not self._queue class BioMartLookup(TableLookup): _query = """ """ @classmethod def set_server(cls, server): cls._biomart = http.HTTPConnection(server) def __init__(self, dataset, sources, target, overwrite = dict(), all = False, keep = False): TableLookup.__init__(self, overwrite, keep = keep) self._dataset = dataset self._sources = sources self._target = target self._all = all def lookup(self): """ Perform the actual lookup in BioMart for the queued ids """ targetids = defaultdict(set) for source in self._sources: body = "query=" + self._query.format(dataset=self._dataset, source=source, target=self._target, ids=",".join(self._queue)) + "\n" logging.info("querying biomart") self._biomart.request("POST", "/biomart/martservice?", body=body) tsv = self._biomart.getresponse() tsv = csv.reader(readbytes(tsv), delimiter="\t") for i, l in enumerate(tsv): if l and l[0] in self._orig_id and l[1]: sourceid = 
self._orig_id[l[0]] targetids[sourceid].add(l[1]) if self._all: select = lambda sourceid, targetids: _ID_DELIMITER.join(targetids) else: select = self.select_smallest_targetid for sourceid, _targetids in targetids.items(): self._idmap[sourceid] = select(sourceid, _targetids) @staticmethod def select_smallest_targetid(sourceid, targetids): targetid = min((len(t), t) for t in targetids)[1] if len(targetids) > 1: print("Warning: Selecting {} from ambiguous target IDs for {}: {}".format(targetid, sourceid, ";".join(targetids)), file=sys.stderr) return targetid @classmethod def get_datasets(cls): """ Get the list of available datasets """ cls._biomart.request("GET", "/biomart/martservice?type=datasets&mart=ensembl") response = cls._biomart.getresponse() for l in csv.reader(readbytes(response), delimiter="\t"): if len(l) > 1: yield "{}\t({})".format(*l[1:3]) @classmethod def get_sources(cls, dataset): """ Get the list of available sources """ cls._biomart.request("GET", "/biomart/martservice?type=filters&dataset={}".format(dataset)) response = cls._biomart.getresponse() for l in csv.reader(readbytes(response), delimiter="\t"): if len(l) > 5 and l[5] == "id_list": yield "{}\t({})".format(*l[:2]) @classmethod def get_targets(cls, dataset): """ Get the list of available targets """ cls._biomart.request("GET", "/biomart/martservice?type=attributes&dataset={}".format(dataset)) response = cls._biomart.getresponse() for l in csv.reader(readbytes(response), delimiter="\t"): if len(l) > 3 and l[3] == "feature_page": yield "{}\t({})".format(*l[:2]) def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--server", default="www.ensembl.org", help="Local ensembl server (e.g. useast.ensembl.org)") parser.add_argument("--sources", "-s", nargs="+", help="Types of input IDs (e.g. UniProtKB).") parser.add_argument("--target", "-t", help="Type of output IDs (e.g. 
KEGG or NAME).") parser.add_argument("--dataset", "-d", default="hsapiens_gene_ensembl", help="An ENSEMBL dataset (default: hsapiens_gene_ensembl).") parser.add_argument("--listdatasets", "--ld", action="store_true", help="List the possible ENSEMBL datasets.") parser.add_argument("--listsources", "--ls", action="store_true", help="List the possible ID sources.") parser.add_argument("--listtargets", "--lt", action="store_true", help="List the available targets for a given source") parser.add_argument("--table", metavar="FILE", help="A file that contains a tab delimited map of IDs (will overwrite the decision of the lookup if used in combination with source and target).") parser.add_argument("--minlength", metavar="N", default=1, type=int, help="Minimum length for an input ID to be considered for translation.") parser.add_argument("--ignore-lowercase", "-i", action="store_true", help="Ignore lowercase words since they are likely no ids") parser.add_argument("--replace-by-all", "-a", action="store_true", help="Replace an ID by a tab separated list of all found alternative target IDs.") parser.add_argument("--keep-id", "-k", action="store_true", help="Keep the original id prepended and separated by a tab.") parser.add_argument("--id-regexp", "-r", default="[^\(\)\[\]\{\}\s,;]+", help="Python regular expression for ids to replace (default is non-whitespace: \S+).") args = parser.parse_args() logging.basicConfig(format="%(message)s", level=logging.INFO, stream=sys.stderr) BioMartLookup.set_server(args.server) if args.listdatasets: print("Available datasets:") for dataset in BioMartLookup.get_datasets(): print(dataset) elif args.listsources: print("Available sources:") for source in BioMartLookup.get_sources(args.dataset): print(source) elif args.listtargets: print("Available targets:".format(args.listtargets)) for target in BioMartLookup.get_targets(args.dataset): print(target) else: if args.table: table = dict( ((l[0].lower(), l[1]) if len(l) > 1 else (l[0].lower(), 
l[0])) for l in csv.reader(open(args.table), delimiter="\t") if not l[0].startswith("#")) else: table = dict() # Select the lookup method based on the presence of sources and target if args.sources and args.target: db = BioMartLookup(args.dataset, args.sources, args.target, overwrite = table, all = args.replace_by_all, keep = args.keep_id) else: db = TableLookup(table, keep = args.keep_id) id_regexp = re.compile(args.id_regexp) linebuffer = [] def replace(): for l in linebuffer: l = id_regexp.sub(lambda match: db.map(match.group(0)), l) print(l, end="") del linebuffer[:] for l in sys.stdin: ids = id_regexp.findall(l) for id in ids: if len(id) >= args.minlength and (not args.ignore_lowercase or not id.islower()): db.enqueue_lookup(id) linebuffer.append(l) if db.has_full_queue(): db.lookup() replace() if not db.has_empty_queue(): db.lookup() replace() if __name__ == '__main__': main() marcelm-sqt-d3218a8c5437/versioneer.py000066400000000000000000002003231302004573300175110ustar00rootroot00000000000000 # Version: 0.16 """The Versioneer - like a rocketeer, but for versions. The Versioneer ============== * like a rocketeer, but for versions! * https://github.com/warner/python-versioneer * Brian Warner * License: Public Domain * Compatible With: python2.6, 2.7, 3.3, 3.4, 3.5, and pypy * [![Latest Version] (https://pypip.in/version/versioneer/badge.svg?style=flat) ](https://pypi.python.org/pypi/versioneer/) * [![Build Status] (https://travis-ci.org/warner/python-versioneer.png?branch=master) ](https://travis-ci.org/warner/python-versioneer) This is a tool for managing a recorded version number in distutils-based python projects. The goal is to remove the tedious and error-prone "update the embedded version string" step from your release process. Making a new release should be as easy as recording a new tag in your version-control system, and maybe making new tarballs. 
## Quick Install

* `pip install versioneer` to somewhere in your $PATH
* add a `[versioneer]` section to your setup.cfg (see below)
* run `versioneer install` in your source tree, commit the results

## Version Identifiers

Source trees come from a variety of places:

* a version-control system checkout (mostly used by developers)
* a nightly tarball, produced by build automation
* a snapshot tarball, produced by a web-based VCS browser, like github's "tarball from tag" feature
* a release tarball, produced by "setup.py sdist", distributed through PyPI

Within each source tree, the version identifier (either a string or a number, this tool is format-agnostic) can come from a variety of places:

* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows about recent "tags" and an absolute revision-id
* the name of the directory into which the tarball was unpacked
* an expanded VCS keyword ($Id$, etc)
* a `_version.py` created by some earlier build step

For released software, the version identifier is closely related to a VCS tag. Some projects use tag names that include more than just the version string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool needs to strip the tag prefix to extract the version identifier.

For unreleased software (between tags), the version identifier should provide enough information to help developers recreate the same tree, while also giving them an idea of roughly how old the tree is (after version 1.2, before version 1.3). Many VCS systems can report a description that captures this, for example `git describe --tags --dirty --always` reports things like "0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the 0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has uncommitted changes).

The version identifier is used for multiple purposes: * to allow the module to self-identify its version: `myproject.__version__` * to choose a name and prefix for a 'setup.py sdist' tarball ## Theory of Operation Versioneer works by adding a special `_version.py` file into your source tree, where your `__init__.py` can import it. This `_version.py` knows how to dynamically ask the VCS tool for version information at import time. `_version.py` also contains `$Revision$` markers, and the installation process marks `_version.py` to have this marker rewritten with a tag name during the `git archive` command. As a result, generated tarballs will contain enough information to get the proper version. To allow `setup.py` to compute a version too, a `versioneer.py` is added to the top level of your source tree, next to `setup.py` and the `setup.cfg` that configures it. This overrides several distutils/setuptools commands to compute the version when invoked, and changes `setup.py build` and `setup.py sdist` to replace `_version.py` with a small static file that contains just the generated version data. ## Installation First, decide on values for the following configuration variables: * `VCS`: the version control system you use. Currently accepts "git". * `style`: the style of version string to be produced. See "Styles" below for details. Defaults to "pep440", which looks like `TAG[+DISTANCE.gSHORTHASH[.dirty]]`. * `versionfile_source`: A project-relative pathname into which the generated version strings should be written. This is usually a `_version.py` next to your project's main `__init__.py` file, so it can be imported at runtime. If your project uses `src/myproject/__init__.py`, this should be `src/myproject/_version.py`. This file should be checked in to your VCS as usual: the copy created below by `setup.py setup_versioneer` will include code that parses expanded VCS keywords in generated tarballs. 
  The 'build' and 'sdist' commands will replace it with a copy that has just the calculated version string.

  This must be set even if your project does not have any modules (and will therefore never import `_version.py`), since "setup.py sdist"-based trees still need somewhere to record the pre-calculated version strings. Anywhere in the source tree should do. If there is an `__init__.py` next to your `_version.py`, the `setup.py setup_versioneer` command (described below) will append some `__version__`-setting assignments, if they aren't already present.

* `versionfile_build`: Like `versionfile_source`, but relative to the build directory instead of the source directory. These will differ when your setup.py uses 'package_dir='. If you have `package_dir={'myproject': 'src/myproject'}`, then you will probably have `versionfile_build='myproject/_version.py'` and `versionfile_source='src/myproject/_version.py'`.

  If this is set to None, then `setup.py build` will not attempt to rewrite any `_version.py` in the built tree. If your project does not have any libraries (e.g. if it only builds a script), then you should use `versionfile_build = None`. To actually use the computed version string, your `setup.py` will need to override `distutils.command.build_scripts` with a subclass that explicitly inserts a copy of `versioneer.get_version()` into your script file. See `test/demoapp-script-only/setup.py` for an example.

* `tag_prefix`: a string, like 'PROJECTNAME-', which appears at the start of all VCS tags. If your tags look like 'myproject-1.2.0', then you should use tag_prefix='myproject-'. If you use unprefixed tags like '1.2.0', this should be an empty string, using either `tag_prefix=` or `tag_prefix=''`.

* `parentdir_prefix`: an optional string, frequently the same as tag_prefix, which appears at the start of all unpacked tarball filenames. If your tarball unpacks into 'myproject-1.2.0', this should be 'myproject-'.
  To disable this feature, just omit the field from your `setup.cfg`.

This tool provides one script, named `versioneer`. That script has one mode, "install", which writes a copy of `versioneer.py` into the current directory and runs `versioneer.py setup` to finish the installation.

To versioneer-enable your project:

* 1: Modify your `setup.cfg`, adding a section named `[versioneer]` and populating it with the configuration values you decided on earlier (note that the option names are not case-sensitive):

  ````
  [versioneer]
  VCS = git
  style = pep440
  versionfile_source = src/myproject/_version.py
  versionfile_build = myproject/_version.py
  tag_prefix =
  parentdir_prefix = myproject-
  ````

* 2: Run `versioneer install`. This will do the following:

  * copy `versioneer.py` into the top of your source tree
  * create `_version.py` in the right place (`versionfile_source`)
  * modify your `__init__.py` (if one exists next to `_version.py`) to define `__version__` (by calling a function from `_version.py`)
  * modify your `MANIFEST.in` to include both `versioneer.py` and the generated `_version.py` in sdist tarballs

  `versioneer install` will complain about any problems it finds with your `setup.py` or `setup.cfg`. Run it multiple times until you have fixed all the problems.

* 3: add an `import versioneer` to your setup.py, and add the following arguments to the setup() call:

        version=versioneer.get_version(),
        cmdclass=versioneer.get_cmdclass(),

* 4: commit these changes to your VCS. To make sure you won't forget, `versioneer install` will mark everything it touched for addition using `git add`. Don't forget to add `setup.py` and `setup.cfg` too.

## Post-Installation Usage

Once established, all uses of your tree from a VCS checkout should get the current version string. All generated tarballs should include an embedded version string (so users who unpack them will not need a VCS tool installed).

If you distribute your project through PyPI, then the release process should boil down to two steps:

* 1: git tag 1.0
* 2: python setup.py register sdist upload

If you distribute it through github (i.e. users use github to generate tarballs with `git archive`), the process is:

* 1: git tag 1.0
* 2: git push; git push --tags

Versioneer will report "0+untagged.NUMCOMMITS.gHASH" until your tree has at least one tag in its history.

## Version-String Flavors

Code which uses Versioneer can learn about its version string at runtime by importing `_version` from your main `__init__.py` file and running the `get_versions()` function. From the "outside" (e.g. in `setup.py`), you can import the top-level `versioneer.py` and run `get_versions()`.

Both functions return a dictionary with different flavors of version information:

* `['version']`: A condensed version string, rendered using the selected style. This is the most commonly used value for the project's version string. The default "pep440" style yields strings like `0.11`, `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section below for alternative styles.

* `['full-revisionid']`: detailed revision identifier. For Git, this is the full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac".

* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that this is only accurate if run in a VCS checkout; otherwise it is likely to be False or None.

* `['error']`: if the version string could not be computed, this will be set to a string describing the problem, otherwise it will be None. It may be useful to throw an exception in setup.py if this is set, to avoid e.g. creating tarballs with a version string of "unknown".

Some variants are more useful than others. Including `full-revisionid` in a bug report should allow developers to reconstruct the exact code being tested (or indicate the presence of local changes that should be shared with the developers).
`version` is suitable for display in an "about" box or a CLI `--version` output: it can be easily compared against release notes and lists of bugs fixed in various releases.

The installer adds the following text to your `__init__.py` to place a basic version in `YOURPROJECT.__version__`:

    from ._version import get_versions
    __version__ = get_versions()['version']
    del get_versions

## Styles

The setup.cfg `style=` configuration controls how the VCS information is rendered into a version string.

The default style, "pep440", produces a PEP440-compliant string, equal to the un-prefixed tag name for actual releases, and containing an additional "local version" section with more detail for in-between builds. For Git, this is `TAG[+DISTANCE.gHEX[.dirty]]`, using information from `git describe --tags --dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and that this commit is two revisions ("+2") beyond the "0.11" tag. For released software (exactly equal to a known tag), the identifier will only contain the stripped tag, e.g. "0.11".

Other styles are available. See details.md in the Versioneer source tree for descriptions.

## Debugging

Versioneer tries to avoid fatal errors: if something goes wrong, it will tend to return a version of "0+unknown". To investigate the problem, run `setup.py version`, which will run the version-lookup code in a verbose mode, and will display the full contents of `get_versions()` (including the `error` string, which may help identify what went wrong).

## Updating Versioneer

To upgrade your project to a new release of Versioneer, do the following:

* install the new Versioneer (`pip install -U versioneer` or equivalent)
* edit `setup.cfg`, if necessary, to include any new configuration settings indicated by the release notes
* re-run `versioneer install` in your source tree, to replace `SRC/_version.py`
* commit any changed files

### Upgrading to 0.16

Nothing special.

### Upgrading to 0.15

Starting with this version, Versioneer is configured with a `[versioneer]` section in your `setup.cfg` file. Earlier versions required the `setup.py` to set attributes on the `versioneer` module immediately after import. The new version will refuse to run (raising an exception during import) until you have provided the necessary `setup.cfg` section.

In addition, the Versioneer package provides an executable named `versioneer`, and the installation process is driven by running `versioneer install`. In 0.14 and earlier, the executable was named `versioneer-installer` and was run without an argument.

### Upgrading to 0.14

0.14 changes the format of the version string. 0.13 and earlier used hyphen-separated strings like "0.11-2-g1076c97-dirty". 0.14 and beyond use plus-separated "local version" section strings, with dot-separated components, like "0.11+2.g1076c97". PEP440-strict tools did not like the old format, but should be ok with the new one.

### Upgrading from 0.11 to 0.12

Nothing special.

### Upgrading from 0.10 to 0.11

You must add a `versioneer.VCS = "git"` to your `setup.py` before re-running `setup.py setup_versioneer`. This will enable the use of additional version-control systems (SVN, etc) in the future.

## Future Directions

This tool is designed to be easily extended to other version-control systems: all VCS-specific components are in separate directories like src/git/ . The top-level `versioneer.py` script is assembled from these components by running make-versioneer.py .
In the future, make-versioneer.py will take a VCS name as an argument, and will construct a version of `versioneer.py` that is specific to the given VCS. It might also take the configuration arguments that are currently provided manually during installation by editing setup.py . Alternatively, it might go the other direction and include code from all supported VCS systems, reducing the number of intermediate scripts. ## License To make Versioneer easier to embed, all its code is dedicated to the public domain. The `_version.py` that it creates is also in the public domain. Specifically, both are released under the Creative Commons "Public Domain Dedication" license (CC0-1.0), as described in https://creativecommons.org/publicdomain/zero/1.0/ . """ from __future__ import print_function try: import configparser except ImportError: import ConfigParser as configparser import errno import json import os import re import subprocess import sys class VersioneerConfig: """Container for Versioneer configuration parameters.""" def get_root(): """Get the project root directory. We require that all commands are run from the project root, i.e. the directory that contains setup.py, setup.cfg, and versioneer.py . """ root = os.path.realpath(os.path.abspath(os.getcwd())) setup_py = os.path.join(root, "setup.py") versioneer_py = os.path.join(root, "versioneer.py") if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): # allow 'python path/to/setup.py COMMAND' root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) setup_py = os.path.join(root, "setup.py") versioneer_py = os.path.join(root, "versioneer.py") if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): err = ("Versioneer was unable to run the project root directory. 
" "Versioneer requires setup.py to be executed from " "its immediate directory (like 'python setup.py COMMAND'), " "or in a way that lets it use sys.argv[0] to find the root " "(like 'python path/to/setup.py COMMAND').") raise VersioneerBadRootError(err) try: # Certain runtime workflows (setup.py install/develop in a setuptools # tree) execute all dependencies in a single python process, so # "versioneer" may be imported multiple times, and python's shared # module-import table will cache the first one. So we can't use # os.path.dirname(__file__), as that will find whichever # versioneer.py was first imported, even in later projects. me = os.path.realpath(os.path.abspath(__file__)) if os.path.splitext(me)[0] != os.path.splitext(versioneer_py)[0]: print("Warning: build in %s is using versioneer.py from %s" % (os.path.dirname(me), versioneer_py)) except NameError: pass return root def get_config_from_root(root): """Read the project setup.cfg file to determine Versioneer config.""" # This might raise EnvironmentError (if setup.cfg is missing), or # configparser.NoSectionError (if it lacks a [versioneer] section), or # configparser.NoOptionError (if it lacks "VCS="). See the docstring at # the top of versioneer.py for instructions on writing your setup.cfg . 
setup_cfg = os.path.join(root, "setup.cfg") parser = configparser.SafeConfigParser() with open(setup_cfg, "r") as f: parser.readfp(f) VCS = parser.get("versioneer", "VCS") # mandatory def get(parser, name): if parser.has_option("versioneer", name): return parser.get("versioneer", name) return None cfg = VersioneerConfig() cfg.VCS = VCS cfg.style = get(parser, "style") or "" cfg.versionfile_source = get(parser, "versionfile_source") cfg.versionfile_build = get(parser, "versionfile_build") cfg.tag_prefix = get(parser, "tag_prefix") if cfg.tag_prefix in ("''", '""'): cfg.tag_prefix = "" cfg.parentdir_prefix = get(parser, "parentdir_prefix") cfg.verbose = get(parser, "verbose") return cfg class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" # these dictionaries contain VCS-specific tools LONG_VERSION_PY = {} HANDLERS = {} def register_vcs_handler(vcs, method): # decorator """Decorator to mark a method as the handler for a particular VCS.""" def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f return decorate def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): """Call the given command(s).""" assert isinstance(commands, list) p = None for c in commands: try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None)) break except EnvironmentError: e = sys.exc_info()[1] if e.errno == errno.ENOENT: continue if verbose: print("unable to run %s" % dispcmd) print(e) return None else: if verbose: print("unable to find command, tried %s" % (commands,)) return None stdout = p.communicate()[0].strip() if sys.version_info[0] >= 3: stdout = stdout.decode() if p.returncode != 0: if verbose: print("unable to run %s (error)" % dispcmd) return None return stdout 
LONG_VERSION_PY['git'] = ''' # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. # This file is released into the public domain. Generated by # versioneer-0.16 (https://github.com/warner/python-versioneer) """Git implementation of _version.py.""" import errno import os import re import subprocess import sys def get_keywords(): """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" keywords = {"refnames": git_refnames, "full": git_full} return keywords class VersioneerConfig: """Container for Versioneer configuration parameters.""" def get_config(): """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py cfg = VersioneerConfig() cfg.VCS = "git" cfg.style = "%(STYLE)s" cfg.tag_prefix = "%(TAG_PREFIX)s" cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" cfg.verbose = False return cfg class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" LONG_VERSION_PY = {} HANDLERS = {} def register_vcs_handler(vcs, method): # decorator """Decorator to mark a method as the handler for a particular VCS.""" def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f return decorate def run_command(commands, args, cwd=None, 
verbose=False, hide_stderr=False): """Call the given command(s).""" assert isinstance(commands, list) p = None for c in commands: try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None)) break except EnvironmentError: e = sys.exc_info()[1] if e.errno == errno.ENOENT: continue if verbose: print("unable to run %%s" %% dispcmd) print(e) return None else: if verbose: print("unable to find command, tried %%s" %% (commands,)) return None stdout = p.communicate()[0].strip() if sys.version_info[0] >= 3: stdout = stdout.decode() if p.returncode != 0: if verbose: print("unable to run %%s (error)" %% dispcmd) return None return stdout def versions_from_parentdir(parentdir_prefix, root, verbose): """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both the project name and a version string. """ dirname = os.path.basename(root) if not dirname.startswith(parentdir_prefix): if verbose: print("guessing rootdir is '%%s', but '%%s' doesn't start with " "prefix '%%s'" %% (root, dirname, parentdir_prefix)) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") return {"version": dirname[len(parentdir_prefix):], "full-revisionid": None, "dirty": False, "error": None} @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs): """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. 
keywords = {} try: f = open(versionfile_abs, "r") for line in f.readlines(): if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["refnames"] = mo.group(1) if line.strip().startswith("git_full ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["full"] = mo.group(1) f.close() except EnvironmentError: pass return keywords @register_vcs_handler("git", "keywords") def git_versions_from_keywords(keywords, tag_prefix, verbose): """Get version information from git keywords.""" if not keywords: raise NotThisMethod("no keywords at all, weird") refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") refs = set([r.strip() for r in refnames.strip("()").split(",")]) # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %%d # expansion behaves like git log --decorate=short and strips out the # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". tags = set([r for r in refs if re.search(r'\d', r)]) if verbose: print("discarding '%%s', no digits" %% ",".join(refs-tags)) if verbose: print("likely tags: %%s" %% ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix):] if verbose: print("picking %%s" %% r) return {"version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") return {"version": "0+unknown", "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags"} @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* expanded, and _version.py hasn't already been rewritten with a short version string, meaning we're inside a checked out source tree. """ if not os.path.exists(os.path.join(root, ".git")): if verbose: print("no .git in %%s" %% root) raise NotThisMethod("no .git directory") GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out = run_command(GITS, ["describe", "--tags", "--dirty", "--always", "--long", "--match", "%%s*" %% tag_prefix], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. 
git_describe = describe_out # look for -dirty suffix dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: git_describe = git_describe[:git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? pieces["error"] = ("unable to parse git-describe output: '%%s'" %% describe_out) return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: fmt = "tag '%%s' doesn't start with prefix '%%s'" print(fmt %% (full_tag, tag_prefix)) pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" %% (full_tag, tag_prefix)) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) # commit: short hex revision ID pieces["short"] = mo.group(3) else: # HEX: no tags pieces["closest-tag"] = None count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits return pieces def plus_or_dot(pieces): """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" def render_pep440(pieces): """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty Exceptions: 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_pre(pieces): """TAG[.post.devDISTANCE] -- No -dirty. Exceptions: 1: no tags. 0.post.devDISTANCE """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += ".post.dev%%d" %% pieces["distance"] else: # exception #1 rendered = "0.post.dev%%d" %% pieces["distance"] return rendered def render_pep440_post(pieces): """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards (a dirty tree will appear "older" than the corresponding clean one), but you shouldn't be releasing software with -dirty anyways. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%%s" %% pieces["short"] else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += "+g%%s" %% pieces["short"] return rendered def render_pep440_old(pieces): """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. Eexceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" return rendered def render_git_describe(pieces): """TAG[-DISTANCE-gHEX][-dirty]. 
Like 'git describe --tags --dirty --always'. Exceptions: 1: no tags. HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render_git_describe_long(pieces): """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. The distance/hash is unconditional. Exceptions: 1: no tags. HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: return {"version": "unknown", "full-revisionid": pieces.get("long"), "dirty": None, "error": pieces["error"]} if not style or style == "default": style = "pep440" # the default if style == "pep440": rendered = render_pep440(pieces) elif style == "pep440-pre": rendered = render_pep440_pre(pieces) elif style == "pep440-post": rendered = render_pep440_post(pieces) elif style == "pep440-old": rendered = render_pep440_old(pieces) elif style == "git-describe": rendered = render_git_describe(pieces) elif style == "git-describe-long": rendered = render_git_describe_long(pieces) else: raise ValueError("unknown style '%%s'" %% style) return {"version": rendered, "full-revisionid": pieces["long"], "dirty": pieces["dirty"], "error": None} def get_versions(): """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. Some # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which # case we can only use expanded keywords. 
cfg = get_config() verbose = cfg.verbose try: return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass try: root = os.path.realpath(__file__) # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. for i in cfg.versionfile_source.split('/'): root = os.path.dirname(root) except NameError: return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to find root of source tree"} try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) return render(pieces, cfg.style) except NotThisMethod: pass try: if cfg.parentdir_prefix: return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) except NotThisMethod: pass return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to compute version"} ''' @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs): """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. 
@register_vcs_handler("git", "keywords")
def git_versions_from_keywords(keywords, tag_prefix, verbose):
    """Get version information from git keywords.

    ``keywords`` is a dict with the entries "refnames" and "full".
    ``tag_prefix`` is the prefix a ref must carry to be treated as a version
    tag; it is stripped from the reported version.

    Returns a dict with the keys "version", "full-revisionid", "dirty" and
    "error".  Raises NotThisMethod when the keywords are empty, lack the
    "refnames" entry, or were never expanded.
    """
    if not keywords:
        raise NotThisMethod("no keywords at all, weird")
    if "refnames" not in keywords:
        # A partially populated dict (e.g. only "full" was found) cannot be
        # used; signal "try the next method" instead of raising KeyError.
        raise NotThisMethod("no refnames keyword, cannot use keywords")
    refnames = keywords["refnames"].strip()
    if refnames.startswith("$Format"):
        if verbose:
            print("keywords are unexpanded, not using")
        raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
    # The revision id is optional: report None rather than fail if absent.
    full_revision = keywords.get("full")
    if full_revision is not None:
        full_revision = full_revision.strip()
    refs = set(r.strip() for r in refnames.strip("()").split(","))
    # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
    # just "foo-1.0". If we see a "tag: " prefix, prefer those.
    TAG = "tag: "
    tags = set(r[len(TAG):] for r in refs if r.startswith(TAG))
    if not tags:
        # Either we're using git < 1.8.3, or there really are no tags. We use
        # a heuristic: assume all version tags have a digit. The old git %d
        # expansion behaves like git log --decorate=short and strips out the
        # refs/heads/ and refs/tags/ prefixes that would let us distinguish
        # between branches and tags. By ignoring refnames without digits, we
        # filter out many common branch names like "release" and
        # "stabilization", as well as "HEAD" and "master".
        tags = set(r for r in refs if re.search(r'\d', r))
        if verbose:
            print("discarding '%s', no digits" % ",".join(refs-tags))
    if verbose:
        print("likely tags: %s" % ",".join(sorted(tags)))
    for ref in sorted(tags):
        # sorting will prefer e.g. "2.0" over "2.0rc1"
        if ref.startswith(tag_prefix):
            r = ref[len(tag_prefix):]
            if verbose:
                print("picking %s" % r)
            return {"version": r,
                    "full-revisionid": full_revision,
                    "dirty": False, "error": None
                    }
    # no suitable tags, so version is "0+unknown", but full hex is still there
    if verbose:
        print("no suitable tags, using unknown + full revision id")
    return {"version": "0+unknown",
            "full-revisionid": full_revision,
            "dirty": False, "error": "no suitable tags"}
def do_vcs_install(manifest_in, versionfile_source, ipy):
    """Git-specific installation logic for Versioneer.

    For Git, this means creating/changing .gitattributes to mark _version.py
    for export-time keyword substitution.

    ``manifest_in``, ``versionfile_source`` and (when not empty) ``ipy`` are
    paths that get staged with ``git add`` together with this script itself
    and, if it had to be modified, ``.gitattributes``.
    """
    GITS = ["git"]
    if sys.platform == "win32":
        GITS = ["git.cmd", "git.exe"]
    files = [manifest_in, versionfile_source]
    if ipy:
        files.append(ipy)
    try:
        me = __file__
        if me.endswith((".pyc", ".pyo")):
            # stage the source file, not the compiled byte code
            me = os.path.splitext(me)[0] + ".py"
        versioneer_file = os.path.relpath(me)
    except NameError:
        versioneer_file = "versioneer.py"
    files.append(versioneer_file)

    # Read the current attributes; a missing/unreadable file counts as empty.
    # The context manager closes the handle even if reading fails.
    try:
        with open(".gitattributes", "r") as f:
            attributes = f.read()
    except EnvironmentError:
        attributes = ""

    present = False
    for line in attributes.splitlines():
        if line.strip().startswith(versionfile_source):
            if "export-subst" in line.strip().split()[1:]:
                present = True

    if not present:
        with open(".gitattributes", "a") as f:
            if attributes and not attributes.endswith("\n"):
                # The existing file lacks a final newline; without this the
                # new rule would be glued onto the last existing line.
                f.write("\n")
            f.write("%s export-subst\n" % versionfile_source)
        files.append(".gitattributes")
    run_command(GITS, ["add", "--"] + files)
def plus_or_dot(pieces):
    """Return a + if we don't already have one, else return a .

    ``pieces`` is a mapping that may contain a "closest-tag" entry.  The
    entry may be missing or None (no tag was found); both cases are treated
    like an empty tag, so "+" is returned instead of raising TypeError.
    """
    # dict.get() only falls back to its default when the key is absent, so
    # an explicit None value must be normalised separately.
    closest_tag = pieces.get("closest-tag") or ""
    if "+" in closest_tag:
        return "."
    return "+"
def render_pep440_old(pieces):
    """Render ``pieces`` as TAG[.postDISTANCE[.dev0]].

    The ".dev0" suffix marks a dirty tree.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    tag = pieces["closest-tag"]
    distance = pieces["distance"]
    dirty = pieces["dirty"]
    dev_suffix = ".dev0" if dirty else ""

    if not tag:
        # exception #1: nothing to anchor on, start from "0"
        return "0.post%d" % distance + dev_suffix
    if not (distance or dirty):
        # clean build made exactly on the tag
        return tag
    return tag + ".post%d" % distance + dev_suffix
def render(pieces, style):
    """Turn the version ``pieces`` into a version dict using ``style``.

    When ``pieces["error"]`` is set, the result reports the version
    "unknown" together with that error.  An empty style or "default" selects
    "pep440".  Raises ValueError for a style name that is not recognised.
    """
    error = pieces["error"]
    if error:
        return {"version": "unknown",
                "full-revisionid": pieces.get("long"),
                "dirty": None,
                "error": error}

    if not style or style == "default":
        style = "pep440"  # the default

    renderers = (
        ("pep440", render_pep440),
        ("pep440-pre", render_pep440_pre),
        ("pep440-post", render_pep440_post),
        ("pep440-old", render_pep440_old),
        ("git-describe", render_git_describe),
        ("git-describe-long", render_git_describe_long),
    )
    for name, renderer in renderers:
        if style == name:
            rendered = renderer(pieces)
            break
    else:
        raise ValueError("unknown style '%s'" % style)

    return {"version": rendered, "full-revisionid": pieces["long"],
            "dirty": pieces["dirty"], "error": None}
""" if "versioneer" in sys.modules: # see the discussion in cmdclass.py:get_cmdclass() del sys.modules["versioneer"] root = get_root() cfg = get_config_from_root(root) assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS verbose = verbose or cfg.verbose assert cfg.versionfile_source is not None, \ "please set versioneer.versionfile_source" assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) # extract version from first of: _version.py, VCS command (e.g. 'git # describe'), parentdir. This is meant to work for developers using a # source checkout, for users of a tarball created by 'setup.py sdist', # and for users of a tarball/zipball created by 'git archive' or github's # download-from-tag feature or the equivalent in other VCSes. get_keywords_f = handlers.get("get_keywords") from_keywords_f = handlers.get("keywords") if get_keywords_f and from_keywords_f: try: keywords = get_keywords_f(versionfile_abs) ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) if verbose: print("got version from expanded keyword %s" % ver) return ver except NotThisMethod: pass try: ver = versions_from_file(versionfile_abs) if verbose: print("got version from file %s %s" % (versionfile_abs, ver)) return ver except NotThisMethod: pass from_vcs_f = handlers.get("pieces_from_vcs") if from_vcs_f: try: pieces = from_vcs_f(cfg.tag_prefix, root, verbose) ver = render(pieces, cfg.style) if verbose: print("got version from VCS %s" % ver) return ver except NotThisMethod: pass try: if cfg.parentdir_prefix: ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) if verbose: print("got version from parentdir %s" % ver) return ver except NotThisMethod: pass if verbose: print("unable to compute version") return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to compute 
def get_version():
    """Return only the "version" entry of the dict built by get_versions()."""
    versions = get_versions()
    return versions["version"]
# we override different "build_py" commands for both environments if "setuptools" in sys.modules: from setuptools.command.build_py import build_py as _build_py else: from distutils.command.build_py import build_py as _build_py class cmd_build_py(_build_py): def run(self): root = get_root() cfg = get_config_from_root(root) versions = get_versions() _build_py.run(self) # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) cmds["build_py"] = cmd_build_py if "cx_Freeze" in sys.modules: # cx_freeze enabled? from cx_Freeze.dist import build_exe as _build_exe class cmd_build_exe(_build_exe): def run(self): root = get_root() cfg = get_config_from_root(root) versions = get_versions() target_versionfile = cfg.versionfile_source print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) _build_exe.run(self) os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write(LONG % {"DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, }) cmds["build_exe"] = cmd_build_exe del cmds["build_py"] # we override different "sdist" commands for both environments if "setuptools" in sys.modules: from setuptools.command.sdist import sdist as _sdist else: from distutils.command.sdist import sdist as _sdist class cmd_sdist(_sdist): def run(self): versions = get_versions() self._versioneer_generated_versions = versions # unless we update this, the command will keep using the old # version self.distribution.metadata.version = versions["version"] return _sdist.run(self) def make_release_tree(self, base_dir, files): root = get_root() cfg = get_config_from_root(root) 
_sdist.make_release_tree(self, base_dir, files) # now locate _version.py in the new base_dir directory # (remembering that it may be a hardlink) and replace it with an # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, self._versioneer_generated_versions) cmds["sdist"] = cmd_sdist return cmds CONFIG_ERROR = """ setup.cfg is missing the necessary Versioneer configuration. You need a section like: [versioneer] VCS = git style = pep440 versionfile_source = src/myproject/_version.py versionfile_build = myproject/_version.py tag_prefix = parentdir_prefix = myproject- You will also need to edit your setup.py to use the results: import versioneer setup(version=versioneer.get_version(), cmdclass=versioneer.get_cmdclass(), ...) Please read the docstring in ./versioneer.py for configuration instructions, edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. """ SAMPLE_CONFIG = """ # See the docstring in versioneer.py for instructions. Note that you must # re-run 'versioneer.py setup' after changing this section, and commit the # resulting files. 
def do_setup():
    """Install Versioneer into the project (VCS-independent part).

    Steps, in order: load the configuration from setup.cfg, write the long
    _version.py template, append the import snippet to the package's
    __init__.py if it is not there yet, make sure MANIFEST.in includes
    versioneer.py and the version file, and finally hand over to
    do_vcs_install().

    Returns 0 on success and 1 when the configuration could not be loaded.
    """
    project_root = get_root()
    try:
        config = get_config_from_root(project_root)
    except (EnvironmentError, configparser.NoSectionError,
            configparser.NoOptionError) as exc:
        section_missing = isinstance(
            exc, (EnvironmentError, configparser.NoSectionError))
        if section_missing:
            # no usable [versioneer] section: leave a commented template
            print("Adding sample versioneer config to setup.cfg",
                  file=sys.stderr)
            setup_cfg_path = os.path.join(project_root, "setup.cfg")
            with open(setup_cfg_path, "a") as handle:
                handle.write(SAMPLE_CONFIG)
        print(CONFIG_ERROR, file=sys.stderr)
        return 1

    print(" creating %s" % config.versionfile_source)
    with open(config.versionfile_source, "w") as handle:
        template = LONG_VERSION_PY[config.VCS]
        substitutions = {
            "DOLLAR": "$",
            "STYLE": config.style,
            "TAG_PREFIX": config.tag_prefix,
            "PARENTDIR_PREFIX": config.parentdir_prefix,
            "VERSIONFILE_SOURCE": config.versionfile_source,
        }
        handle.write(template % substitutions)

    init_path = os.path.join(os.path.dirname(config.versionfile_source),
                             "__init__.py")
    if not os.path.exists(init_path):
        print(" %s doesn't exist, ok" % init_path)
        init_path = None
    else:
        try:
            with open(init_path, "r") as handle:
                existing = handle.read()
        except EnvironmentError:
            existing = ""
        if INIT_PY_SNIPPET in existing:
            print(" %s unmodified" % init_path)
        else:
            print(" appending to %s" % init_path)
            with open(init_path, "a") as handle:
                handle.write(INIT_PY_SNIPPET)

    # Make sure both the top-level "versioneer.py" and versionfile_source
    # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so
    # they'll be copied into source distributions. Pip won't be able to
    # install the package without this.
    manifest_path = os.path.join(project_root, "MANIFEST.in")
    included = set()
    try:
        with open(manifest_path, "r") as handle:
            for entry in handle:
                if entry.startswith("include "):
                    included.update(entry.split()[1:])
    except EnvironmentError:
        pass
    # That doesn't cover everything MANIFEST.in can do
    # (http://docs.python.org/2/distutils/sourcedist.html#commands), so
    # it might give some false negatives. Appending redundant 'include'
    # lines is safe, though.
    if "versioneer.py" in included:
        print(" 'versioneer.py' already in MANIFEST.in")
    else:
        print(" appending 'versioneer.py' to MANIFEST.in")
        with open(manifest_path, "a") as handle:
            handle.write("include versioneer.py\n")
    if config.versionfile_source in included:
        print(" versionfile_source already in MANIFEST.in")
    else:
        print(" appending versionfile_source ('%s') to MANIFEST.in" %
              config.versionfile_source)
        with open(manifest_path, "a") as handle:
            handle.write("include %s\n" % config.versionfile_source)

    # Make VCS-specific changes. For git, this means creating/changing
    # .gitattributes to mark _version.py for export-time keyword
    # substitution.
    do_vcs_install(manifest_path, config.versionfile_source, init_path)
    return 0
def scan_setup_py():
    """Validate the contents of setup.py against Versioneer's expectations.

    Reads ``setup.py`` from the current working directory and looks, line by
    line, for the three snippets Versioneer needs (``import versioneer``,
    ``versioneer.get_cmdclass()`` and ``versioneer.get_version()``) as well
    as for obsolete ``versioneer.VCS`` / ``versioneer.versionfile_source``
    references.  All checks are plain substring matches, so the result is
    only a heuristic.

    Returns the number of problem categories found (0, 1 or 2); a hint is
    printed to stdout for each one.  A missing or unreadable setup.py is not
    handled here: the error raised by open() propagates to the caller.
    """
    # (substring to look for, tag recorded once it has been seen)
    required = (
        ("import versioneer", "import"),
        ("versioneer.get_cmdclass()", "cmdclass"),
        ("versioneer.get_version()", "get_version"),
    )
    # Old-style configuration that used to be set from within setup.py.
    obsolete = ("versioneer.VCS", "versioneer.versionfile_source")

    found = set()
    setters = False
    with open("setup.py", "r") as f:
        # Iterate the file object directly instead of materialising every
        # line with readlines() first.
        for line in f:
            for needle, tag in required:
                if needle in line:
                    found.add(tag)
            if any(needle in line for needle in obsolete):
                setters = True

    errors = 0
    if len(found) != len(required):
        print("")
        print("Your setup.py appears to be missing some important items")
        print("(but I might be wrong). Please make sure it has something")
        print("roughly like the following:")
        print("")
        print(" import versioneer")
        print(" setup( version=versioneer.get_version(),")
        print(" cmdclass=versioneer.get_cmdclass(), ...)")
        print("")
        errors += 1
    if setters:
        print("You should remove lines like 'versioneer.VCS = ' and")
        print("'versioneer.versionfile_source = ' . This configuration")
        print("now lives in setup.cfg, and should be removed from setup.py")
        print("")
        errors += 1
    return errors


if __name__ == "__main__":
    # Without a command, sys.argv[1] would raise IndexError; show the
    # expected invocation instead.  sys.exit() with a string prints it to
    # stderr and exits with status 1.
    if len(sys.argv) < 2:
        sys.exit("usage: python versioneer.py setup")
    cmd = sys.argv[1]
    if cmd == "setup":
        errors = do_setup()
        errors += scan_setup_py()
        if errors:
            sys.exit(1)