biotools-1.2.12/0000755000076500000240000000000012241266112014360 5ustar atkassenstaff00000000000000biotools-1.2.12/PKG-INFO0000644000076500000240000000200312241266112015450 0ustar atkassenstaff00000000000000Metadata-Version: 1.1 Name: biotools Version: 1.2.12 Summary: A bunch of bioinformatics utilities. Home-page: https://github.com/sonwell/biotools Author: Andrew Kassen Author-email: atkassen@gmail.com License: UNKNOWN Description: Accompanies Bart, Rebecca, *et al.* High-throughput genomic sequencing of Cassava Bacterial Blight strains identifies conserved effectors to target for durable resistance. *PNAS Plus*. Currently depends on `clustalw `_ and `BLAST `_. You can grab the most current code from `github `_ or via git git clone git://github.com/sonwell/biotools.git Keywords: gene prediction,prokaryotes,effectors Platform: UNKNOWN Classifier: Development Status :: 4 - Beta Classifier: Intended Audience :: Science/Research Classifier: Topic :: Scientific/Engineering :: Bio-Informatics Requires: numpy Requires: matplotlib biotools-1.2.12/prok-geneseek0000755000076500000240000000554612002111133017042 0ustar atkassenstaff00000000000000#!/usr/bin/env python # 1. predict # 2. cluster # 3. rename # 4. 
snps import biotools.analysis.run as pr import biotools.analysis.cluster as cw import biotools.analysis.renamer as mv import biotools.analysis.variance as cs import biotools.analysis.options as op import biotools.analysis.loaddata as ld import sys import os if __name__ == "__main__": op.parse(sys.argv[1:]) if len(op.args) < 2: op.help() exit(0) direc = op.DIRECTORY plotter = op.PLOTTER database = op.args[0] names = op.args[1:] sep = os.sep predict = op.predicting cluster = op.clustering rename = op.renaming plot = op.plotting report = op.reporting calculate = op.calculating rename = rename and (cluster or not predict) calculate = calculate and (cluster or not predict) plot = plot and (calculate or not (predict or cluster)) report = report and calculate if not (predict or cluster or rename or calculate or plot or report): print "I don't know what you want, I give up." exit(0) if predict: names = pr.run(database, names) print "Homologous sequences written to " + direc + 'sequences' + sep if cluster: names = cw.run(direc + 'clusters' + sep, names) print "Clustalw files written to " + direc + "clusters" + sep if rename: names = mv.rename(direc + 'clusters' + sep, database, names) if plot and plotter.lower() != 'none': try: cv = __import__(plotter, globals(), locals(), ['plot'], -1) except ImportError: try: if not plotter.endswith('.py'): plotter += '.py' open(plotter, 'r') except: plot = False else: p = plotter.rfind(sep) if p > -1: sys.path.append(plotter[:p]) plotter = plotter[p + len(sep):] try: cv = __import__(plotter, globals(), locals(), ['plot'], -1) except ImportError: plot = False if calculate: gen = cs.var(names) elif plot: gen = (ld.parse(s) for s in names) else: gen = [] for entry in gen: plotdata = entry['plotdata'] metadata = entry['metadata'] if plot: try: cv.plot(plotdata, direc + 'plots' + sep, filename=metadata['filename']) except Exception as e: print metadata['filename'], e if report: try: os.mkdir(direc + 'data' + sep) except: pass fh = open(direc 
+ 'data' + sep + metadata['strain'] + '.py', 'w') fh.write('plotdata = ' + repr(plotdata) + '\n') fh.write('metadata = ' + repr(metadata) + '\n') print "Done" biotools-1.2.12/setup.py0000644000076500000240000000257312241266044016105 0ustar atkassenstaff00000000000000#!/usr/bin/env python from distutils.core import setup setup(name="biotools", version='1.2.12', description="A bunch of bioinformatics utilities.", long_description="""Accompanies Bart, Rebecca, *et al.* High-throughput genomic sequencing of Cassava Bacterial Blight strains identifies conserved effectors to target for durable resistance. *PNAS Plus*. Currently depends on `clustalw `_ and `BLAST `_. You can grab the most current code from `github `_ or via git git clone git://github.com/sonwell/biotools.git""", author="Andrew Kassen", maintainer="Andrew Kassen", author_email="atkassen@gmail.com", maintainer_email="atkassen@gmail.com", url="https://github.com/sonwell/biotools", requires=['numpy','matplotlib'], packages=['biotools', 'biotools.analysis', 'biotools.IO'], package_dir={'biotools': 'src', 'biotools.analysis': 'src/analysis', 'biotools.IO': 'src/IO'}, scripts=['prok-geneseek'], keywords='gene prediction, prokaryotes, effectors', classifiers=[ 'Development Status :: 4 - Beta', 'Intended Audience :: Science/Research', 'Topic :: Scientific/Engineering :: Bio-Informatics' ]) biotools-1.2.12/src/0000755000076500000240000000000012241266112015147 5ustar atkassenstaff00000000000000biotools-1.2.12/src/__init__.py0000644000076500000240000000005511773402744017274 0ustar atkassenstaff00000000000000''' A bunch of bioinformatics utilities. ''' biotools-1.2.12/src/align.py0000644000076500000240000002017612241075232016622 0ustar atkassenstaff00000000000000#!/usr/bin/env python ''' This module is used to align sequences. 
Currently, there is only a single alignment algorithm implementented; it is a hybrid between Needleman-Wunsch and Smith-Waterman and is used to find the subsequence within a larger sequence that best aligns to a reference. ''' from biotools.translate import translate import biotools.analysis.options as options DIAG_MARK, VGAP_MARK, HGAP_MARK = 3, 2, 1 bl = { '*': {'*': 0, 'A': -9, 'C': -9, 'E': -9, 'D': -9, 'G': -9, 'F': -9, 'I': -9, 'H': -9, 'K': -9, 'M': -9, 'L': -9, 'N': -9, 'Q': -9, 'P': -9, 'S': -9, 'R': -9, 'T': -9, 'W': -9, 'V': -9, 'Y': -9, 'X': 0}, 'A': {'*': -9, 'A': 4, 'C': 0, 'E': -1, 'D': -2, 'G': 0, 'F': -2, 'I': -1, 'H': -2, 'K': -1, 'M': -1, 'L': -1, 'N': -1, 'Q': -1, 'P': -1, 'S': 1, 'R': -1, 'T': -1, 'W': -3, 'V': -2, 'Y': -2, 'X': 0}, 'C': {'*': -9, 'A': 0, 'C': 9, 'E': -4, 'D': -3, 'G': -3, 'F': -2, 'I': -1, 'H': -3, 'K': -3, 'M': -1, 'L': -1, 'N': -3, 'Q': -3, 'P': -3, 'S': -1, 'R': -3, 'T': -1, 'W': -2, 'V': -1, 'Y': -2, 'X': 0}, 'E': {'*': -9, 'A': -1, 'C': -4, 'E': 5, 'D': 2, 'G': -2, 'F': -3, 'I': -3, 'H': 0, 'K': 1, 'M': -2, 'L': -3, 'N': 0, 'Q': 2, 'P': -1, 'S': 0, 'R': 0, 'T': 0, 'W': -3, 'V': -3, 'Y': -2, 'X': 0}, 'D': {'*': -9, 'A': -2, 'C': -3, 'E': 2, 'D': 6, 'G': -1, 'F': -3, 'I': -3, 'H': -1, 'K': -1, 'M': -3, 'L': -4, 'N': 1, 'Q': 0, 'P': -1, 'S': 0, 'R': -2, 'T': 1, 'W': -4, 'V': -3, 'Y': -3, 'X': 0}, 'G': {'*': -9, 'A': 0, 'C': -3, 'E': -2, 'D': -1, 'G': 6, 'F': -3, 'I': -4, 'H': -2, 'K': -2, 'M': -3, 'L': -4, 'N': -2, 'Q': -2, 'P': -2, 'S': 0, 'R': -2, 'T': 1, 'W': -2, 'V': 0, 'Y': -3, 'X': 0}, 'F': {'*': -9, 'A': -2, 'C': -2, 'E': -3, 'D': -3, 'G': -3, 'F': 6, 'I': 0, 'H': -1, 'K': -3, 'M': 0, 'L': 0, 'N': -3, 'Q': -3, 'P': -4, 'S': -2, 'R': -3, 'T': -2, 'W': 1, 'V': -1, 'Y': 3, 'X': 0}, 'I': {'*': -9, 'A': -1, 'C': -1, 'E': -3, 'D': -3, 'G': -4, 'F': 0, 'I': 4, 'H': -3, 'K': -3, 'M': 1, 'L': 2, 'N': -3, 'Q': -3, 'P': -3, 'S': -2, 'R': -3, 'T': -2, 'W': -3, 'V': 1, 'Y': -1, 'X': 0}, 'H': {'*': -9, 'A': -2, 'C': -3, 'E': 0, 
'D': 1, 'G': -2, 'F': -1, 'I': -3, 'H': 8, 'K': -1, 'M': -2, 'L': -3, 'N': 1, 'Q': 0, 'P': -2, 'S': -1, 'R': 0, 'T': 0, 'W': -2, 'V': -2, 'Y': 2, 'X': 0}, 'K': {'*': -9, 'A': -1, 'C': -3, 'E': 1, 'D': -1, 'G': -2, 'F': -3, 'I': -3, 'H': -1, 'K': 5, 'M': -1, 'L': -2, 'N': 0, 'Q': 1, 'P': -1, 'S': 0, 'R': 2, 'T': 0, 'W': -3, 'V': -3, 'Y': -2, 'X': 0}, 'M': {'*': -9, 'A': -1, 'C': -1, 'E': -2, 'D': -3, 'G': -3, 'F': 0, 'I': 1, 'H': -2, 'K': -1, 'M': 5, 'L': 2, 'N': -2, 'Q': 0, 'P': -2, 'S': -1, 'R': -1, 'T': -1, 'W': -1, 'V': -2, 'Y': -1, 'X': 0}, 'L': {'*': -9, 'A': -1, 'C': -1, 'E': -3, 'D': -4, 'G': -4, 'F': 0, 'I': 2, 'H': -3, 'K': -2, 'M': 2, 'L': 4, 'N': -3, 'Q': -2, 'P': -3, 'S': -2, 'R': -2, 'T': -2, 'W': -2, 'V': 3, 'Y': -1, 'X': 0}, 'N': {'*': -9, 'A': -2, 'C': -3, 'E': 0, 'D': 1, 'G': 0, 'F': -3, 'I': -3, 'H': -1, 'K': 0, 'M': -2, 'L': -3, 'N': 6, 'Q': 0, 'P': -2, 'S': 1, 'R': 0, 'T': 0, 'W': -4, 'V': -3, 'Y': -2, 'X': 0}, 'Q': {'*': -9, 'A': -1, 'C': -3, 'E': 2, 'D': 0, 'G': -2, 'F': -3, 'I': -3, 'H': 0, 'K': 1, 'M': 0, 'L': -2, 'N': 0, 'Q': 5, 'P': -1, 'S': 0, 'R': 1, 'T': 0, 'W': -2, 'V': -2, 'Y': -1, 'X': 0}, 'P': {'*': -9, 'A': -1, 'C': -3, 'E': -1, 'D': -1, 'G': -2, 'F': -4, 'I': -3, 'H': -2, 'K': -1, 'M': -2, 'L': -3, 'N': -1, 'Q': -1, 'P': 7, 'S': -1, 'R': -2, 'T': 1, 'W': -4, 'V': -2, 'Y': -3, 'X': 0}, 'S': {'*': -9, 'A': 1, 'C': -1, 'E': 0, 'D': 0, 'G': 0, 'F': -2, 'I': -2, 'H': -1, 'K': 0, 'M': -1, 'L': -2, 'N': 1, 'Q': 0, 'P': -1, 'S': 4, 'R': -1, 'T': 1, 'W': -3, 'V': -2, 'Y': -2, 'X': 0}, 'R': {'*': -9, 'A': -1, 'C': -3, 'E': 0, 'D': -2, 'G': -2, 'F': -3, 'I': -3, 'H': 0, 'K': 2, 'M': -1, 'L': -2, 'N': 0, 'Q': 1, 'P': -2, 'S': -1, 'R': 5, 'T': -1, 'W': -3, 'V': -3, 'Y': -2, 'X': 0}, 'T': {'*': -9, 'A': -1, 'C': -1, 'E': 0, 'D': 1, 'G': 1, 'F': -2, 'I': -2, 'H': 0, 'K': 0, 'M': -1, 'L': -2, 'N': 0, 'Q': 0, 'P': 1, 'S': 1, 'R': -1, 'T': 4, 'W': -3, 'V': -2, 'Y': -2, 'X': 0}, 'W': {'*': -9, 'A': -3, 'C': -2, 'E': -3, 'D': -4, 'G': -2, 'F': 1, 
'I': -3, 'H': -2, 'K': -3, 'M': -1, 'L': -2, 'N': -4, 'Q': -2, 'P': -4, 'S': -3, 'R': -3, 'T': -3, 'W': 11, 'V': -3, 'Y': 2, 'X': 0}, 'V': {'*': -9, 'A': 0, 'C': -1, 'E': -2, 'D': -3, 'G': -3, 'F': -1, 'I': 3, 'H': -3, 'K': -2, 'M': 1, 'L': 1, 'N': -3, 'Q': -2, 'P': -2, 'S': -2, 'R': -3, 'T': -2, 'W': -3, 'V': 4, 'Y': -1, 'X': 0}, 'Y': {'*': -9, 'A': -2, 'C': -2, 'E': -2, 'D': -3, 'G': -3, 'F': 3, 'I': -1, 'H': 2, 'K': -2, 'M': -1, 'L': -1, 'N': -2, 'Q': -1, 'P': -3, 'S': -2, 'R': -2, 'T': -2, 'W': 2, 'V': -1, 'Y': 7, 'X': 0}, 'X': {'*': 0, 'A': 0, 'C': 0, 'E': 0, 'D': 0, 'G': 0, 'F': 0, 'I': 0, 'H': 0, 'K': 0, 'M': 0, 'L': 0, 'N': 0, 'Q': 0, 'P': 0, 'S': 0, 'R': 0, 'T': 0, 'W': 0, 'V': 0, 'Y': 0, 'X': 0} } def OptimalCTether(reference, translation, extend=1, create=10): ''' This function will take two sequences: a `reference` sequence and another protein sequence (`translation`; usually, this is an open reading frame that has been translated). Needleman-Wunsch alignment will be performed and the substring of translation with the highest identity that begins with a start codon [default: `['ATG']`] is reported. This function returns a dictionary of relevent information from the alignment; specifically, the alignments itself [keys: `query`, `subject`], the score [key: `score`], the length of the alignment [key: `length`], the length of the substring of translation used [key: `sublength`], the number of identities [key: `identities`], and the number of gaps [key: `gaps`]. 
''' starts = set(translate(s) for s in options.START_CODONS) v, w = reference, translation try: v = v.seq except AttributeError: pass try: w = w.seq except AttributeError: pass if not starts & set(w): raise ValueError("Open reading frame does not contain a start codon.") v, w = v[::-1], w[::-1] lv, lw = len(v), len(w) rv, rw = range(lv + 1), range(lw + 1) gpc = [[create * int(not (i | j)) for i in rw] for j in rv] mat = [[-(i + j) * extend - create * (not (i | j) and w[0] != v[0]) for i in rw] for j in rv] pnt = [[VGAP_MARK if i > j else HGAP_MARK if j > i else DIAG_MARK for i in rw] for j in rv] ids = [[0 for i in rw] for j in rv] optimal = [None, 0, 0] for i in range(lv): for j in range(lw): vals = [[mat[i][j] + bl[v[i]][w[j]], DIAG_MARK], [mat[i + 1][j] - extend - gpc[i + 1][j], VGAP_MARK], [mat[i][j + 1] - extend - gpc[i][j + 1], HGAP_MARK]] mat[i + 1][j + 1], pnt[i + 1][j + 1] = max(vals) gpc[i + 1][j + 1] = create * int(pnt[i + 1][j + 1] == DIAG_MARK) if (optimal[0] is None or mat[i + 1][j + 1] > optimal[0]) and \ abs(lv - i) / float(lv) <= options.LENGTH_ERR and \ w[j] in starts: optimal = [mat[i + 1][j + 1], i + 1, j + 1] i, j = optimal[1], optimal[2] seq, ids = ['', ''], 0 gapcount, length, sublen = 0, 0, 0 methods = { VGAP_MARK: lambda s, i, j, l, g, n: (['-' + s[0], w[j - 1] + s[1]], i, j - 1, l + 1, g + 1, n), DIAG_MARK: lambda s, i, j, l, g, n: ([v[i - 1] + s[0], w[j - 1] + s[1]], i - 1, j - 1, l + 1, g, n + (w[j - 1] == v[i - 1])), HGAP_MARK: lambda s, i, j, l, g, n: ([v[i - 1] + s[0], '-' + s[1]], i - 1, j, l, g + 1, n) } while [i, j] != [0, 0]: length += 1 state = (seq, i, j, sublen, gapcount, ids) seq, i, j, sublen, gapcount, ids = methods[pnt[i][j]](*state) return { 'subject': seq[0][::-1], 'query': seq[1][::-1], 'score': optimal[0], 'gaps': gapcount, 'length': length, 'sublength': sublen, 'identities': ids } biotools-1.2.12/src/analysis/0000755000076500000240000000000012241266112016772 5ustar 
atkassenstaff00000000000000biotools-1.2.12/src/analysis/__init__.py0000644000076500000240000000000011774635423021110 0ustar atkassenstaff00000000000000biotools-1.2.12/src/analysis/cluster.py0000644000076500000240000000676512033603756021053 0ustar atkassenstaff00000000000000#!/usr/bin/env python import biotools.sequence as sequ import biotools.IO as io import biotools.translate as tran import biotools.clustal as clustal import biotools.analysis.options as options try: import Queue as queue except ImportError: import queue import hashlib import subprocess import threading from os import sep, mkdir def run(direc, inputs): ''' Takes a collection of files generated by gene prediction, creates clusters based off of the genes that have homology to those predicted genes, and creates new fasta files in the clusters sub directory under the given directory and separated according to whether they are nucleotide or amino acid sequnces. These new fasta files are then used to create clustalw alignments of the genes if more than 1 sequence exists in the fasta file. 
''' clusters = {} all_ids = set() ids = {} q = queue.Queue() filenames = [] def run_clustal(): while not q.empty(): cid = q.get() dig = hashlib.md5() dig.update(' '.join(cid)) dig = dig.hexdigest() fpre = direc + 'nt' + sep + dig apre = direc + 'aa' + sep + dig fname = fpre + ".fasta" aname = apre + ".fasta" fh = io.open(fname, 'w') ah = io.open(aname, 'w') for ipt in clusters: counter = 0 name = '_'.join(ipt.split(sep)[-1].split('.')[0].split()) for cluster in clusters[ipt]: if cid & cluster[0]: nm = name + '_' + str(counter) seq = cluster[1] curr = sequ.Sequence(nm, seq, defline=', '.join(cid)) tr = tran.translate(curr) tr.name = curr.name fh.write(curr) ah.write(tr) counter += 1 fh.close() ah.close() try: clustal.run(fname, fpre + '.clustalw') clustal.run(aname, apre + '.clustalw') filenames.append(dig + '.fasta') except ValueError: pass q.task_done() if direc: for d in [direc, direc + 'nt' + sep, direc + 'aa' + sep]: try: mkdir(d) except OSError: pass for ipt in inputs: seqs = {} ids[ipt] = set() for seq in io.open(ipt, 'r'): ids[ipt].add(seq.name) all_ids.add(seq.name) if seq.seq not in seqs: seqs[seq.seq] = set() seqs[seq.seq].add(seq.name) clusters[ipt] = [(seqs[k], k) for k in seqs] del seqs sub_ids = [] while all_ids: cid = all_ids.pop() subcluster = (all_ids | set([cid])) & \ set(i for ipt in clusters for cluster in clusters[ipt] for i in cluster[0] if cid in cluster[0]) for ipt in clusters: for cluster in clusters[ipt]: if cid in cluster[0]: subcluster = (subcluster & cluster[0]) | \ (subcluster - ids[ipt]) sub_ids.append(subcluster) all_ids -= subcluster for cid in sub_ids: q.put(cid) threads = [] for i in xrange(options.NUM_PROCESSES - 1): curr = threading.Thread(target=run_clustal) threads.append(curr) curr.start() run_clustal() q.join() return filenames biotools-1.2.12/src/analysis/loaddata.py0000644000076500000240000001062111773402607021127 0ustar atkassenstaff00000000000000''' This is a pretty simple JSON-like parser. 
Specifically, it can load Python-like object, list, and other literals, i.e., the sort of stuff you'd get it you dumped the the string representation of some data into a file. The real difference is that you must specify a variable name, e.g.: ```python my_stuff = { ... } ``` These variable names don't need to be on a newline or anything like that, you should be able to omit any and all whitespace. The result of a successful parse is a dictionary: ```python {'my_stuff': { ... }} ``` This function really only works for `None`, `True`, `False`, numbers, strings, dictionaries, and lists. ''' def parse(ipt): par = {'char': '', 'pos': -1} leng = 0 val = "" ret = {} def advance(test=None): curr = par['char'] if test and test != curr: raise ValueError("Expected %s, saw %s." % (test, curr)) par['pos'] += 1 if par['pos'] < leng: par['char'] = val[par['pos']] else: par['char'] = '' def whitespace(): while par['char'] and ord(par['char']) <= ord(' '): advance() def variable(): whitespace() start = par['pos'] while \ ('0' <= par['char'] <= '9') or \ ('A' <= par['char'] <= 'Z') or \ ('a' <= par['char'] <= 'z'): advance() stop = par['pos'] whitespace() advance('=') return (val[start:stop], value()) def value(): whitespace() if par['char'] == '{': return dictionary() if par['char'] == '[': return array() if par['char'] in ("'", '"'): return string() if '0' <= par['char'] <= '9' or par['char'] == '-': return number() at = par['pos'] if par['char'] == 'T': advance('r') advance('u') advance('e') return True if par['char'] == 'F': advance('a') advance('l') advance('s') advance('e') return False if par['char'] == 'N': advance('o') advance('n') advance('e') return None prefix = val[at:par['pos']] raise ValueError("Unexpected value starting with %s." 
% prefix) def dictionary(): ret = {} advance('{') whitespace() while par['char'] != '}': s = value() advance(':') v = value() ret[s] = v if par['char'] == ',': advance(',') else: break advance('}') whitespace() return ret def array(): ret = [] advance('[') whitespace() while par['char'] != ']': ret.append(value()) if par['char'] == ',': advance(',') else: break advance(']') whitespace() return ret def string(): ret = "" q = par['char'] advance(q) while par['char'] != q: if par['char'] == '\\': advance('\\') if par['char'] == 'n': ret += '\n' elif par['char'] == 't': ret += '\t' else: ret += par['char'] advance() continue ret += par['char'] advance() advance(q) whitespace() return ret def number(): ret = "" if par['char'] == '-': ret += '-' advance('-') while '0' <= par['char'] <= '9': ret += par['char'] advance() if par['char'] == '.': ret += '.' advance('.') while '0' <= par['char'] <= '9': ret += par['char'] advance() if par['char'] in ('e', 'E'): ret += par['char'] advance() if par['char'] == '-': ret += '-' advance('-') while '0' <= par['char'] <= '9': ret += par['char'] advance() whitespace() return float(ret) for line in ipt: val += line leng += len(line) advance() while par['pos'] < leng: k, v = variable() ret[k] = v return ret biotools-1.2.12/src/analysis/options.py0000644000076500000240000001355511774635067021073 0ustar atkassenstaff00000000000000from optparse import OptionParser from threading import Lock from os import sep, makedirs from sys import stderr LENGTH_ERR = 0.2 MIN_IDENTITY = 0.45 MAX_EVALUE = 1e-30 MIN_ORFLEN = 300 NUM_THREADS = 16 NUM_PROCESSES = 2 DIRECTORY = '.' 
+ sep PLOTTER = 'biotools.analysis.plot' START_CODONS = ["ATG"] STOP_CODONS = ["TAG", "TAA", "TGA"] args = tuple() predicting = True clustering = True renaming = True calculating = True reporting = True plotting = True verbose = False lock = Lock() parser = OptionParser(usage="Usage: %prog [options] " + " ") parser.add_option("-S", "--start", action="append", dest="start", default=START_CODONS, type="string", help="define a start codon [default: %s]" % ' '.join("-S " + s for s in START_CODONS)) parser.add_option("-E", "--stop", action="append", dest="stop", default=STOP_CODONS, type="string", help="define a stop codon [default: %s]" % ' '.join("-E " + s for s in STOP_CODONS)) parser.add_option("-j", "--threads", action="store", dest="threads", default=NUM_THREADS, type="int", help="number of threads [default: %default]") parser.add_option("-p", "--processes", action="store", dest="processes", default=NUM_PROCESSES, type="int", help="number of parallel processes to run " + "[default: %default]") parser.add_option("-e", "--evalue", action="store", dest="evalue", default=MAX_EVALUE, type="float", help="maximum e-value [default: %default]") parser.add_option("-I", "--identity", action="store", dest="identity", default=MIN_IDENTITY, type="float", help="minimum percent identity [default: %default]") parser.add_option("-L", "--length", action="store", dest="fraction", default=LENGTH_ERR, type="float", help="allowable relative error in hit length " + "[default: %default]") parser.add_option("-O", "--orflen", action="store", dest="orflen", metavar="bases", default=MIN_ORFLEN, type="int", help="minimum allowable length for ORFs [default: %default]") parser.add_option("-d", "--directory", action="store", dest="directory", default=DIRECTORY, type="string", help="set working directory [default: current]") parser.add_option("-P", "--plotter", action="store", dest="plotter", default=PLOTTER, type="string", help="plotting module [default: %default]") parser.add_option("-v", 
"--verbose", action="store_true", dest="verbose", default=verbose, help="print debug messages [default: False]") parser.add_option("--no-plots", action="store_false", dest="plotting", default=plotting, help="suppress the drawing of plots [default: False]") parser.add_option("--no-predict", action="store_false", dest="predicting", default=predicting, help="don't predict genes, instead treat the input files " + "as predicted genes [default: False]") parser.add_option("--no-cluster", action="store_false", dest="clustering", default=True, help="don't cluster the sequences, " + "instead treat the input files as alignments " + "[default: False]") parser.add_option("--no-rename", action="store_false", dest="renaming", default=True, help="don't rename the fasta and clustal files " + "[default: False]") parser.add_option("--no-reports", action="store_false", dest="reporting", default=True, help="don't generate files for variance data " + "[default: False]") parser.add_option("--no-calculation", action="store_false", dest="calculating", default=True, help="don't calculate sequence variance [default: False]") def debug(msg): if verbose: lock.acquire(True) stderr.write(str(msg) + '\n') lock.release() def parse(pargs): ''' Parses `pargs` and sets global variables to be accessible to other modules. 
These variables are: * `LENGTH_ERR` * `MIN_IDENTITY` * `MAX_EVALUE` * `NUM_THREADS` * `NUM_PROCESSES` * `START_CODONS` * `START_CODONS` * `DIRECTORY` * `PLOTTER` * `args` ''' global \ LENGTH_ERR, MIN_IDENTITY, MAX_EVALUE, MIN_ORFLEN, \ NUM_THREADS, NUM_PROCESSES, START_CODONS, STOP_CODONS, \ DIRECTORY, PLOTTER, args, predicting, clustering, \ renaming, calculating, reporting, plotting, verbose opts, largs = parser.parse_args(pargs) if opts.directory[-1] != sep: opts.directory += sep try: makedirs(opts.directory) except OSError: pass if '*' in opts.start: DNA = 'ATCG' opts.start = set(i + j + k for i in DNA for j in DNA for k in DNA) - \ set(opts.stop) opts.processes = min(opts.processes, len(largs)) if not (0 <= opts.fraction <= 1): raise RuntimeError("Allowable length error must be between 0 and 1.") LENGTH_ERR = opts.fraction MIN_IDENTITY = opts.identity MAX_EVALUE = opts.evalue MIN_ORFLEN = opts.orflen NUM_THREADS = opts.threads NUM_PROCESSES = opts.processes STOP_CODONS = opts.stop START_CODONS = opts.start DIRECTORY = opts.directory PLOTTER = opts.plotter predicting = opts.predicting clustering = opts.clustering renaming = opts.renaming calculating = opts.calculating reporting = opts.reporting plotting = opts.plotting verbose = opts.verbose args = largs def help(): ''' Prints the usage. 
''' parser.print_help() biotools-1.2.12/src/analysis/plot.py0000644000076500000240000000720212033604146020325 0ustar atkassenstaff00000000000000import numpy as np import matplotlib.pyplot as plt import math from os import sep, mkdir def smoothed(unsmoothed, factor): l = len(unsmoothed) r = [math.exp(-(i - l) ** 2 / float(factor)) for i in xrange(1, l)] w = np.array(r + [1] + r[::-1]) c = (w[l - i - 1:2 * l - i - 1] for i in xrange(l)) wm = np.array([x / sum(x) for x in c]) return np.dot(unsmoothed, wm.transpose()) def plot(plotdata, directory, bottom=True, side=True, legend=True, save=True, filename='untitled.pdf', upperbound=0.05, factor=21, fig=None, **kwargs): if fig is None: fig = plt.figure(None, facecolor='w', edgecolor='w') if not directory.endswith(sep): directory += sep try: mkdir(directory) except OSError: pass # plotting data ntvar = plotdata['nt']['var'] aavar = plotdata['aa']['var'] # gene models starts = plotdata['aa']['starts'] ends = plotdata['aa']['ends'] counts = plotdata['aa']['count'] # smooth the data snt = smoothed(ntvar, factor) lnt = len(ntvar) saa = smoothed(aavar, factor) laa = len(aavar) # bounding rectangle bound = [0, laa, -upperbound / 6.0, upperbound] # x-values to align nucleotide & amino acids xnt = np.arange(lnt) / 3.0 + 1 xaa = np.arange(laa) + 1 ax = axes(bottom, side, bound, fig, **kwargs) ntl = draw(xnt, snt, ax, '#0000ff', **kwargs) aal = draw(xaa, saa, ax, '#00ff00', **kwargs) models(starts, ends, counts, bound, ax, **kwargs) report(filename, ntvar, aavar, lnt, laa) if legend: fig.legend((ntl, aal), ('Nucleotide', 'Amino acid'), 'upper right') if save: fig.savefig(directory + filename) def axes(bottom, side, bound, fig, **kwargs): # create the proper sized frame, depending on # how we draw the plot x = 0.09 if side else 0.02 y = 0.09 if bottom else 0.04 xs = [bound[0], bound[1] * 1.06] # construct the axes ax = fig.add_axes([x, y, 0.98 - x, 0.98 - y], xlim=xs, ylim=bound[2:]) ax.minorticks_on() ax.tick_params(axis='x', 
which='minor', length=3) # hide the unwanted axis lines (typically the top & right) # label the wanted axes and draw them for loc, spine in ax.spines.iteritems(): if loc in ['right', 'top']: spine.set_color('none') continue if loc == 'bottom': ax.xaxis.set_ticks_position('bottom') if bottom: ax.set_xlabel("Amino acids") continue if loc == 'left': if side: ax.set_ylabel("Sequence variance") ax.yaxis.set_ticks_position('left') else: spine.set_color('none') ax.tick_params('y', which='both', color='none', labelcolor='none') ax.hlines(ax.get_yticks(), xs[0], xs[1], color='0.75', linestyle='dashed') ax.hlines(0, xs[0], xs[1], color='k', linestyle='solid') return ax def draw(x, y, ax, color, **kwargs): return ax.plot(x, y, color=color, linestyle='solid') def models(starts, ends, counts, bound, ax, **kwargs): lb, l = bound[2], len(starts) scale = bound[1] / max(ends) ys, i = np.arange(1, l + 1) * lb / 3.0, 0 # draw the gene models ax.hlines(ys, starts, ends, colors='k', lw=4, linestyle='solid') for c in counts: ax.text(bound[1] + 10, lb / 3.0 * (i + 1.25), int(c)) i += 1 def report(filename, ntvar, aavar, lnt, laa): print(filename.center(80, '=')) print('Average variance: ') print('\t%f per base pair' % (sum(ntvar) / lnt)) print('\t%f per amino acid' % (sum(aavar) / laa)) biotools-1.2.12/src/analysis/predict.py0000644000076500000240000001663712241261376021022 0ustar atkassenstaff00000000000000#!/usr/bin/env python import biotools.IO as io import biotools.BLAST as BLAST import biotools.analysis.options as options from biotools.sequence import Sequence, annotation as ann from biotools.align import OptimalCTether as align from biotools.translate import translate from biotools.complement import complement try: import Queue as queue except ImportError: import queue import threading from os import sep, mkdir PIPING = True def ORFGenerator(sequ): ''' Scans both strands of the given sequence and yields the longest subsequence that starts with a start codon and contains no stop 
codon other than the final codon. ''' comp = complement(sequ[::-1]) seq = sequ.seq cseq = comp.seq slen = len(sequ) starts = [-1, 0, 1, -1, 0, 1] # locations of start codons in each frame stops = [0, 1, 2, 0, 1, 2] # locations of stop codons in each frame mlen = options.MIN_ORFLEN for i in xrange(slen - 2): fcodon, rcodon = seq[i:i + 3], cseq[i:i + 3] if fcodon in options.STOP_CODONS: if i - mlen >= starts[i % 3 + 3] >= stops[i % 3 + 3]: yield sequ[starts[i % 3 + 3]:i + 3] stops[i % 3 + 3] = i + 3 elif fcodon in options.START_CODONS: if starts[i % 3 + 3] < stops[i % 3 + 3]: starts[i % 3 + 3] = i if rcodon in options.STOP_CODONS: if i - mlen >= starts[i % 3] >= stops[i % 3]: yield comp[starts[i % 3]:i + 3] stops[i % 3] = i + 3 elif rcodon in options.START_CODONS: if starts[i % 3] < stops[i % 3]: starts[i % 3] = i raise StopIteration() class ThreadQueue(queue.Queue): def __init__(self, target): queue.Queue.__init__(self) self.threadcount = 0 self.target = target def put(self, item): options.lock.acquire() queue.Queue.put(self, item) if self.threadcount < options.NUM_THREADS - 1: thread = threading.Thread(target=self.target) thread.start() self.threadcount += 1 options.lock.release() def GeneFromBLAST(db, sequences, pref, names): ''' BLASTs database against sequences, and for those results that pass the length and percent identity requirements, attempt to locate the full gene that corresponds to that BLAST hit. Genes that are found are saved in the subdirectory sequences under the given directory, divided depending on whether the sequnece is amino acid or nucleotide. ''' PIPING = True wd = options.DIRECTORY + 'sequences' + sep for d in [options.DIRECTORY, wd]: try: mkdir(d) except OSError: pass subj = dict((s.name, s) for s in io.open(db, 'r')) options.debug("Database sequences loaded from file %s." % db) try: orfs = dict((s.name, [orf for orf in ORFGenerator(s)]) for s in io.open(sequences, 'r')) options.debug("ORFs loaded from file %s." 
% sequences) except IOError: options.debug("No file \"" + sequences + ",\" skipping.") return def target(): while 1: try: res = qin.get(PIPING, 1) except queue.Empty: if not PIPING: break else: continue qname, sname = res['query']['name'], res['subject']['name'] start, end = res['query']['start'], res['query']['end'] alignments = [] max_match = (options.MIN_IDENTITY, None) if subj[sname].type == 'nucl': subject = translate(subj[sname]) else: subject = subj[sname] while qname: try: o = orfs[qname] break except KeyError: qname = qname[:-1] if not qname: qin.task_done() continue for orf in o: if in_range(orf, start, end, res['frame']): orf = orf[:-3] query = translate(orf) options.debug("Aligning %33s v. %33s." % (qname, sname)) alignment = align(subject.seq, query.seq) alignments.append((orf, sname, alignment)) for orf, refname, aln in alignments: hitlen = aln['sublength'] region = orf[-3 * hitlen:] identity = float(aln['identities']) / aln['length'] if identity >= max_match[0]: max_match = (identity, (region, sname, aln)) if max_match[1]: seq, name, _ = max_match[1] odl = subject.defline.split('[')[0].strip() src = seq.original.name start, end, strand = seq.start, seq.end, seq.step defline = '%s[source=%s] [start=%d] [end=%d] [strand=%d]' % \ (odl + (' ' if odl else ''), src, start, end, strand) new = Sequence(name.strip(), seq.seq, defline=defline, original=seq.original, type=seq.type, start=seq.start, end=seq.end, step=seq.step) qout.put(new) qin.task_done() def in_range(seq, start, end, frame): ss, se = sorted((seq.start, seq.end)) os, oe = sorted((start, end)) frame = int(frame) return (ss < oe and se > os and (se % 3 == oe % 3 or ss % 3 == oe % 3) and ((frame < 0 and seq.step < 0) or (frame > 0 and seq.step > 0))) qout = queue.Queue() qin = ThreadQueue(target) blastopts = { 'evalue': options.MAX_EVALUE, 'num_threads': options.NUM_THREADS } for res in BLAST.run(db, sequences, **blastopts): if float(res['expect']) > options.MAX_EVALUE: continue sbjl = 
def rename(direc, db, files):
    '''
    Interactively rename the per-cluster files produced by cluster.py.

    For each file in `files` (relative to the `nt` subdirectory of
    `direc`), the sequence names on the first entry's definition line are
    looked up in the FASTA database `db` and displayed; the user is then
    prompted for a new basename.  The matching nucleotide and amino-acid
    .fasta/.clustalw files are renamed, appending " (n)" when the chosen
    name is already taken.

    Returns the list of renamed .clustalw file paths.
    '''
    names = []
    seqdb = dict((s.name, s) for s in io.open(db, 'r'))
    nt_dir, aa_dir = direc + 'nt' + sep, direc + 'aa' + sep
    for f in files:
        # BUG FIX: use next(iter(...)) instead of the Python 2-only
        # .next() method so this module runs under Python 3 as well,
        # matching the get_input shim at the top of the file.
        seq = next(iter(io.open(nt_dir + f, 'r')))
        ids = seq.defline.split(', ')
        print("File\033[33;1m", f,
              "\033[0mis described by the following sequences:")
        try:
            # NOTE(review): a single missing id aborts the listing and
            # skips the whole file -- presumably intentional (the cluster
            # is stale), but confirm.
            for id in ids:
                seqdb[id]
                print("* " + seqdb[id].name + ': ' +
                      seqdb[id].defline.split('[')[0])
        except KeyError:
            print("* (none)")
            continue
        pre = get_input("\033[33;1mWhat should we call this file " +
                        "(or hit enter to skip)? \033[0m")
        fpre = f[:f.find('.')]  # basename up to the first dot
        if pre != "":
            count = 0
            while True:
                # Append " (n)" until we find a name that is not taken.
                rpre = pre + ((" (%d)" % count) if count > 0 else "")
                try:
                    fh = open(nt_dir + rpre + ".fasta", 'r')
                    fh.close()
                    count += 1
                    continue
                except IOError:
                    nt_old, nt_new = nt_dir + fpre, nt_dir + rpre
                    aa_old, aa_new = aa_dir + fpre, aa_dir + rpre
                    print("Renaming " + fpre + ".* to " + rpre + ".*")
                    try:
                        mv(nt_old + ".fasta", nt_new + ".fasta")
                        mv(aa_old + ".fasta", aa_new + ".fasta")
                        mv(nt_old + ".clustalw", nt_new + ".clustalw")
                        mv(aa_old + ".clustalw", aa_new + ".clustalw")
                        names.append(nt_new + '.clustalw')
                        names.append(aa_new + '.clustalw')
                    except OSError:
                        # One of the four expected files was missing;
                        # leave whatever was moved and carry on.
                        pass
                    break
    return names
x-ranges so that amino acids # and nucleotides align xnt = np.arange(lnt) * (0.0 + laa) / lnt + 1 xaa = np.arange(laa) + 1 # create the proper sized frame, depending on # how we draw the plot x = 0.09 if side else 0.02 y = 0.09 if bottom else 0.04 ax = fig.add_axes([x, y, 0.98 - x, 0.98 - y], xlim=[0, laa * 1.06], ylim=[lowerbound, upperbound]) ax.minorticks_on() ax.tick_params(axis='x', which='minor', length=3) for loc, spine in ax.spines.iteritems(): if loc in ['right', 'top']: spine.set_color('none') continue if loc == 'bottom': ax.xaxis.set_ticks_position('bottom') if bottom: ax.set_xlabel("Amino acids") continue if loc == 'left': if side: ax.set_ylabel("Sequence variance") ax.yaxis.set_ticks_position('left') else: spine.set_color('none') ax.tick_params('y', which='both', color='none', labelcolor='none') ax.hlines(ax.get_yticks(), 0, laa * 1.06, color='0.75', linestyle='dashed') ax.hlines(0, 0, laa * 1.06, color='k', linestyle='solid') nt_lines = ax.plot(xnt, snt, color='#0000ff', linestyle='solid') aa_lines = ax.plot(xaa, saa, color='#00ff00', linestyle='solid') starts = snpdata['aa']['starts'] ends = snpdata['aa']['ends'] counts = snpdata['aa']['count'] scale = laa / max(ends) ys = (np.arange(len(starts)) + 1) * lowerbound / 3 ax.hlines(ys, starts, ends, colors='k', lw=4, linestyle='solid') for i, c in zip(xrange(len(counts)), counts): ax.text(laa + 10, lowerbound / 3 * (i + 1.25), c) if legend: fig.legend((nt_lines, aa_lines), ('Nucleotide', 'Amino acid'), 'upper right') if save: fig.savefig(directory + filename) print '=============', filename, '=============' print 'Average variance: ' print '\t', sum(snpdata['nt']['var']) / lnt, 'per base pair' print '\t', sum(snpdata['aa']['var']) / laa, 'per amino acid' biotools-1.2.12/src/analysis/run.py0000644000076500000240000000234412241062127020153 0ustar atkassenstaff00000000000000#!/usr/bin/env python import biotools.analysis.predict as genepredict import biotools.analysis.options as options import threading from 
os import sep try: import Queue as queue except ImportError: import queue def run(infile, strains): ''' Run several instances of `genepredict.run` at once. ''' q = queue.Queue() filenames = [] def run_predict(): while 1: try: strainf = q.get(False) except queue.Empty: break strain = strainf.split(sep)[-1] pos = strain.rfind('.') if pos > 1 or (pos == 1 and strain[0] != '.'): strain = strain[:pos] options.debug("Predicting for %s." % strain) try: genepredict.run(infile, strainf, strain, filenames) except RuntimeError: pass q.task_done() for strain in strains: q.put(strain) for i in range(options.NUM_PROCESSES - 1): curr = threading.Thread(target=run_predict) curr.start() run_predict() q.join() return filenames if __name__ == "__main__": import sys try: run(sys.argv[1:]) except IndexError: pass biotools-1.2.12/src/analysis/variance.py0000644000076500000240000000622312002354765021146 0ustar atkassenstaff00000000000000#!/usr/bin/env python import biotools.IO as io import biotools.analysis.options as options from biotools.translate import translate from os import sep def SaySNPs(input): ''' Takes a clustalw alignment and will return a dictionary of data relevent to plotting the sequence variance for the sequences in the given clustalw alignment. These data are: * `var`: the measure of sequence variation, * `starts`: the starting positions for each gene model in amino acids, * `ends`: the ending positions for each gene model in amino acids, and * `count`: the number of sequences with a particular gene model. The values given in `starts`, `ends`, and `counts` are sorted to that the nth element in starts corresponds to the nth value in ends and the nth value in counts. 
''' catalogue = [] lengths = {} for seq in io.open(input, 'r'): key = (seq.start, seq.end) lengths[key] = lengths.get(key, 0) + 1 for (i, c) in zip(xrange(key[1] - key[0] + 1), seq): if i >= len(catalogue): catalogue.append({}) if c != " ": catalogue[i][c] = catalogue[i].get(c, 0) + 1 calc = [] for s in catalogue: tot = float(sum(s.values())) cnt = float(len(s)) calc.append(1.0 - sum((s[c] / tot) ** 2 for c in s)) llist = sorted(list(lengths.keys())) return { 'var': calc, 'starts': [s for s, e in llist], 'ends': [e for s, e in llist], 'count': [lengths[k] for k in llist] } def var(files): ''' Returns plot data and metadata for plotting later on in the pipeline. ''' sort = {} for f in files: seqs = [s for s in io.open(f)] type = set(s.type for s in seqs) if len(type) > 1: type = set(['prot']) fid = (type.pop(), f) seqs = [''.join(s.seq.split('-')).strip() for s in seqs] seqs = [translate(s) if fid[0] == 'nucl' else s for s in seqs] sset = frozenset(seqs) srtr = (len(seqs), sset) sort[srtr] = sort.get(srtr, set()) | set([fid]) couples = [] for partners in sort.values(): trim = lambda x: '.'.join(x.split('.')[:-1]) \ if f.endswith('.clustalw') or \ f.endswith('.clustal') or \ f.endswith('.aln') else x names = ', '.join(set(trim(f.split(sep)[-1]) for type, f in partners)) pair = {} for type, f in partners: if len(pair) == 2: break if type in pair: continue pair[type] = f if 0 < len(pair) < 2: raise TypeError("Unmatched clustal alignment(s): " + ", ".join(f for type, f in partners)) if len(pair) == 0: continue couples.append((pair['nucl'], pair['prot'], names)) for nt, aa, strain in couples: plotdata = { 'nt': SaySNPs(nt), 'aa': SaySNPs(aa) } metadata = {'strain': strain, 'filename': strain + '.pdf'} yield {'plotdata': plotdata, 'metadata': metadata} raise StopIteration biotools-1.2.12/src/annotation.py0000644000076500000240000001034512013524553017701 0ustar atkassenstaff00000000000000''' This module is used to create annotation files (currently, only GFF files). 
class Annotation(object):
    '''
    Represents one feature row of a GFF (version 3) file and helps with
    reading and writing such files.

    Construction takes the nine GFF columns in file order, so a line can
    be parsed with simply ``Annotation(*line.split('\\t'))``.  The
    `name_token` and `gff_token` keyword arguments (defaults suit GFF3
    as produced by phytozome) control which attribute supplies the
    feature name and which character separates attribute keys from
    values.

    Every instance gets an `id` of the form SEQNAME_TYPE[START:END]
    ('X' stands in for a missing sequence name); when the attributes
    carry no name, a fresh "unknown N" name is generated from the
    class-level counter.

    Useful operations:
      * len(annotation)       -> end - start + 1
      * annA == annB          -> same id
      * hash(annotation)      -> usable as dict key / set member
      * str(annotation)       -> one GFF3 line
    '''

    # Class-wide counter used to invent names for nameless features.
    unknowns = 0

    def __init__(self, ref, src, type, start, end, score, strand, phase,
                 attr, name_token='ID', gff_token='='):
        '''
        Build an Annotation from the nine GFF columns (passed in file
        order) plus the optional `name_token` / `gff_token` settings.
        '''
        def parse_attrs(attr, keyvalsep='=', attrsep=';'):
            '''
            Turn the attribute (ninth) column into a dict; repeated keys
            have their values joined with commas.  Falls back to a space
            separator when `keyvalsep` never occurs in the text.
            '''
            if keyvalsep not in attr:
                keyvalsep = ' '
            seplen = len(keyvalsep)
            collected = {}
            for item in (piece.strip()
                         for piece in attr.strip().split(attrsep)):
                cut = item.find(keyvalsep)
                if cut > -1:
                    key, value = item[:cut], item[cut + seplen:]
                    collected[key] = collected.get(key, []) + [value]
            return dict((key, ','.join(vals))
                        for key, vals in collected.items())

        lo, hi = sorted((int(start), int(end)))
        self.strand = strand
        self.type = type
        self.source = src
        self.seq = ref
        self.start = lo
        self.end = hi
        self.attr = parse_attrs(attr, gff_token)
        self.phase = phase
        self.score = score
        self.ntoken = name_token
        self.id = '%s_%s[%d:%d]' % (self.seq or 'X', self.type, lo, hi)
        if name_token in self.attr:
            self.name = self.attr[name_token]
        else:
            Annotation.unknowns += 1
            self.name = "unknown %d" % Annotation.unknowns
        self.parent = None
        self.children = []

    def __len__(self):
        low, high = sorted((self.start, self.end))
        return high - low + 1

    def __hash__(self):
        return hash(self.id)

    def __eq__(self, other):
        # Anything without an id (or with a different one) is unequal.
        return getattr(other, 'id', None) == self.id

    def __str__(self):
        attrs = ';'.join('%s=%s' % (key, self.attr[key])
                         for key in self.attr)
        columns = (self.seq, self.source, self.type, str(self.start),
                   str(self.end), self.score, self.strand,
                   str(self.phase), attrs)
        return '\t'.join(columns)
''' import biotools.IO as io import subprocess from os import sep, getenv, listdir import shutil def run(db, sfile, mega_blast=False, **kwargs): ''' Takes a database and a query and runs the appropriate type of BLAST on them. The database can be an existing BLAST database or a fasta/fastq file. If it is a sequence file, this function will look in the places where BLAST would look for an existing database created from that file and use that instead. If there is no such database, this function will make one for you and then use the newly created database with BLAST. Optional named arguments can currently only be `evalue`, `num_threads`, `gapopen`, or `gapextend`. The correspond to the BLAST options of the same name. ''' cmds = { 'prot': { 'prot': 'blastp', 'nucl': 'tblastn' }, 'nucl': { 'nucl': 'blastn', 'prot': 'blastx' } } seq = io.open(sfile, 'r').next() qtype = seq.type rcloc = '' for loc in (".:~:" + (getenv("NCBI") or "")).split(':'): if loc and loc[-1] == sep: loc += sep try: for line in (l.strip() for l in open(loc + '.ncbirc', 'r')): pos = line.find('=') if pos >= 0 and line[:pos].strip() == "BLASTDB": rcloc = line[pos + 1:].strip() except IOError: pass dbtype = None bdbenv = getenv("BLASTDB") dblocations = (":." 
+ ((':' + bdbenv) if bdbenv else '') + ((':' + rcloc) if rcloc else '')).split(':') for loc in dblocations: if loc and loc[-1] != sep: loc += sep try: open(loc + db + '.pin', 'r') dbtype = 'prot' break except IOError: try: open(loc + db + '.nin', 'r') dbtype = 'nucl' break except IOError: pass if not dbtype: odb = db pos = db.rfind(".") for seq in io.open(db, 'r'): dbtype = seq.type break if not dbtype: raise IOError("Database not found: " + odb) ndb = None sp = db.rfind(sep) if sp > -1: dbdir, db = db[:sp], db[sp + 1:pos] else: dbdir, db = '.', db[:pos] for file in listdir(dbdir): dpos = file.rfind('.') if dpos >= 0 and file[dpos + 1:] == dbtype[0] + 'in': fh = open(dbdir + sep + file, 'r') c = ord(fh.read(12)[-1]) fname = fh.read(c) if fname[0] in ("'", '"'): fname = fname[1:-1] if fname.endswith(odb): ndb = dbdir + sep + file[:dpos] break if not ndb: ndb = '_'.join(db.split()) try: ignore = open('/dev/null', 'w') except IOError: ignore = open('nul', 'w') try: # possible race condition open(ndb, 'r').close() except IOError: subprocess.call(["makeblastdb", "-in", '"%s"' % odb, "-out", ndb, "-dbtype", dbtype], stdout=ignore) try: for suff in ['in', 'hr', 'sq']: name = ndb + '.' + dbtype[0] + suff shutil.move(name, dbdir + sep + name) except shutil.Error: pass db = dbdir + sep + ndb else: db = ndb else: raise IOError("Database not found: " + db) allowed = set(["evalue", "gapopen", "gapextend", "num_threads"]) & \ set(kwargs.keys()) cmd = cmds[qtype][dbtype] pn = ["-db", "-query"] if mega_blast: cmd = "megablast" pn = ["-d", "-i"] allowed = ["e", "a"] args = [cmd, pn[0], db, pn[1], sfile] + [arg for pair in [["-" + k, str(kwargs[k])] for k in allowed] for arg in pair] proc = subprocess.Popen(args, bufsize=1, stdout=subprocess.PIPE) return Result(iter(proc.stdout.readline, '')) class Result(object): ''' A class which take the raw output from BLAST and generates dictionaries from the data from BLAST. 
This data includes the alignment, percent identity, gaps, e-value, score, length of subject, length of query, and start and stop positions for both sequences. This class should be used in a for loop like so: ```python for res in Result(file_or_data): pass ``` The class instance has a single other property, `headers`, which are the lines in BLAST results before the BLAST hits (e.g., citation info, etc.). ''' def __init__(self, file): self.file = file self.headers = [] def __iter__(self): try: ipt = open(self.file, 'r') except (IOError, TypeError): try: ipt = self.file.split('\n') except: ipt = self.file mode = 0 headers = [] curr = None length = 0 def sh(sn, qn, l): qdl = '' space = qn.find(' ') if space > -1: qn, qdl = qn[:space], qn[space + 1:].lstrip() return { 'subject': { 'name': sn.lstrip(), 'defline': '', 'start': None, 'end': None, 'sequence': '' }, 'query': { 'name': qn, 'defline': qdl, 'start': None, 'end': None, 'sequence': '' }, 'length': l } def ra(sh): for res in ('subject', 'query'): sh[res]['start'] = int(sh[res]['start']) sh[res]['end'] = int(sh[res]['end']) sh[res]['length'] = abs(sh[res]['end'] - sh[res]['start'] + 1) return sh def sh_fmt(l): for pairs in (a.strip() for a in l.split(',')): l, r = tuple(a.strip() for a in (pairs.split('=')[:2] if '=' in pairs else pairs.split(':')[:2])) subheaders[l.lower().split('(')[0]] = r for line in ipt: line = line.rstrip('\n').lstrip() if not line: if mode == 4: mode = 5 continue if mode == 0: if line[:6] == 'Query=': mode = 1 qname = line[6:].lstrip() self.headers = headers else: headers.append(line) elif mode == 1: if line[0] == '>': mode = 3 subheaders = sh(line[1:], qname, length) elif line[:7] == 'Length=': length = int(''.join(line[7:].strip().split(','))) mode = 2 elif line[0] == '(' and line.endswith('letters)'): length = int(''.join(line[1:-8].strip().split(','))) mode = 2 elif line[:6] == 'Query=': qname = line[6:].lstrip() else: qname += line elif mode == 2: if line[0] == '>': mode = 3 subheaders 
= sh(line[1:], qname, length) elif line[:6] == 'Query=': qname = line[6:].lstrip() mode = 1 elif mode == 3: if line[:5] == 'Score': snm = subheaders['subject']['name'] defline = '' space = snm.find(' ') if space > -1: snm, defline = snm[:space], snm[space + 1:] subheaders['subject']['name'] = snm subheaders['subject']['defline'] = defline sh_fmt(line) mode = 4 elif line[:7] == 'Length=': pass elif line[0] == '(' and line.endswith('letters)'): pass else: subheaders['subject']['name'] += line elif mode == 4: sh_fmt(line) elif mode == 5: if line[:6] == 'Query=': mode = 1 qname = line[6:].lstrip() yield ra(subheaders) continue elif line[0] == '>': yield ra(subheaders) subheaders = sh(line[1:], qname, length) mode = 3 continue elif line[:5] == 'Score': yield ra(subheaders) subheaders = sh(subheaders['subject']['name'], qname, length) sh_fmt(line) mode = 4 continue elif line[:5] == 'Sbjct': curr = 'subject' elif line[:5] == 'Query': curr = 'query' else: continue _, start, seq, end = line.split() subheaders[curr]['start'] = subheaders[curr]['start'] or start subheaders[curr]['end'] = end subheaders[curr]['sequence'] += seq try: yield ra(subheaders) except UnboundLocalError: pass raise StopIteration() if __name__ == '__main__': import sys if len(sys.argv) > 1: output = open(sys.argv[1]).read() for result in Result(output): print(result) biotools-1.2.12/src/clustal.py0000644000076500000240000000323411774634454017214 0ustar atkassenstaff00000000000000#!/usr/bin/env python import biotools.IO as io import subprocess from os import remove def run(infile, outfile, **kwargs): n = 0 for seq in io.open(infile, 'r'): n += 1 if n > 1: seqtype = seq.type break if n > 1: cmd = "clustalw" try: ignore = open('/dev/null', 'w') except IOError: ignore = open('nul', 'w') if seqtype == 'nucl': defaults = { 'OUTORDER': 'ALIGNED', 'GAPOPEN': '10', 'GAPEXT': '0.1', 'DNAMATRIX': 'IUB' } others = ["-%s=%s" % (arg, kwargs.get(arg, defaults[arg])) for arg in set(name.upper() for name in kwargs) & 
set(defaults.keys())] subprocess.call([cmd, "-INFILE=" + infile, "-ALIGN", "-TYPE=DNA", "-OUTFILE=" + outfile] + others, stdout=ignore) else: defaults = { 'OUTORDER': 'ALIGNED', 'GAPOPEN': '10', 'GAPEXT': '0.1', 'MATRIX': 'BLOSUM' } others = ["-%s=%s" % (arg, kwargs.get(arg, defaults[arg])) for arg in set(name.upper() for name in kwargs) & set(defaults.keys())] subprocess.call([cmd, "-INFILE=" + infile, "-ALIGN", "-TYPE=PROTEIN", "-OUTFILE=" + outfile] + others, stdout=ignore) pos = infile.rfind('.') if pos > -1: prefix = infile[:pos] else: prefix = infile remove(prefix + '.dnd') biotools-1.2.12/src/complement.py0000644000076500000240000000264412033603424017672 0ustar atkassenstaff00000000000000#!/usr/bin/env python from biotools.sequence import isprot _ref = { 'DNA': { 'A': 'T', 'T': 'A', 'a': 't', 't': 'a', 'C': 'G', 'G': 'C', 'c': 'g', 'g': 'c', 'R': 'Y', 'Y': 'R', 'r': 'y', 'y': 'r', ' ': ' ', '-': '-' }, 'RNA': { 'A': 'U', 'U': 'A', 'a': 'u', 'u': 'a', 'C': 'G', 'G': 'C', 'c': 'g', 'g': 'c', 'R': 'Y', 'Y': 'R', 'r': 'y', 'y': 'r', ' ': ' ', '-': '-' } } def complement(s): ''' Creates the complement of a sequence, which can then be reversed by using `seq[::-1]`, if it needs to be reversed. This function accepts either `Sequence`s or strings. 
''' if isprot(s): return s has_u = ('U' in s or 'u' in s) has_t = ('T' in s or 't' in s) if has_u and not has_t: repl = _ref['RNA'] elif has_t and not has_u: repl = _ref['DNA'] else: repl = _ref['DNA'] value = ''.join(repl.get(c, 'N') for c in s) try: return s.__class__("complement(%s)" % s.name, value, original=s.original, start=s.start, end=s.end, step=s.step, qual=s.qual) except (AttributeError, TypeError): return s.__class__(value) if __name__ == '__main__': assert complement('ATCGTAGCTGATCGAT') == 'TAGCATCGACTAGCTA' assert complement('AUCGUAGCUGAUCGAU') == 'UAGCAUCGACUAGCUA' print(complement('AUCgu--cuGAUCGAU')) biotools-1.2.12/src/IO/0000755000076500000240000000000012241266112015456 5ustar atkassenstaff00000000000000biotools-1.2.12/src/IO/__init__.py0000644000076500000240000001122712017447050017575 0ustar atkassenstaff00000000000000''' A module for reading and writing to sequence and annotation files. Currently supported file types are: FASTA, FASTQ, CLUSTAL alignments, and GFF3 files. ''' from biotools.IO import fasta, fastq, gff, clustal from biotools.IO.manager import IOManager try: import __builtin__ except ImportError: import builtins as __builtin__ def get_methods(): methods = {} nil = lambda *x: iter([]) for module in [fasta, fastq, gff, clustal]: modname = module.__name__.split('.')[-1] methods[modname] = {} for method in ['read', 'write', 'rhook', 'whook', 'probe']: methods[modname][method] = module.__dict__.get(method, nil) return methods class IOBase(object): ''' Generic IO class for sequence files. ''' methods = IOManager(get_methods()) def __init__(self, name, mode): ''' Opens file name with mode mode. This function will attempt to guess at the filetype by 1. looking at the file extension and failing that, will 2. read the first few lines to determine the file type. Recoginized file extensions include fa, fsa, fas, fasta, fastq, clustalw, clustal, aln. 
''' self.file = name self.handle = __builtin__.open(name, mode) self.method = self.methods.default self.type = None self.suffixes = { 'fsa': 'fasta', 'fa': 'fasta', 'fs': 'fasta', 'fas': 'fasta', 'fna': 'fasta', 'fasta': 'fasta', 'clustalw': 'clustal', 'clustal': 'clustal', 'aln': 'clustal', 'fastq': 'fastq', 'gff': 'gff', 'gff3': 'gff' } p = name.rfind('.') if p > -1: ext = name[p + 1:] if ext in self.suffixes: try: self.format(self.suffixes[ext]) return except ValueError: pass try: for method in IOBase.methods: try: self.format(method) return except ValueError: pass except IOError: raise def format(self, fmt): ''' Forces a file to be parsed as a particular format. By default, the values for fmt can be any recognized format. ''' if fmt in self.methods: method = self.methods[fmt] ret = method['probe'](__builtin__.open(self.file, 'r')) if ret: self.method = method for key in ret: object.__setattr__(self, key, ret[key]) return self else: raise ValueError("File cannot be parsed as type %s." % fmt) self.method = self.methods.default return self def close(self): ''' Close the file handle. ''' self.handle.close() class Reader(IOBase): ''' A class that wraps IOBase and restricts the ability to write. ''' def __init__(self, filename, mode='r'): IOBase.__init__(self, filename, mode) self.method['rhook'](self.handle) self.iter = self.method['read'](self.handle) def read(self, n=None): ''' If `n` is provided, the next (up to) `n` entries are parsed and returned. Otherwise, all remaining entries are parsed and returned. ''' if n is None: return [s for s in self] return [self.iter.next() for i in xrange(int(n))] def __iter__(self): return self.iter def next(self): ''' Reads a single entry in the file and returns it. ''' try: return self.read(1)[0] except (StopIteration, ValueError, IndexError): raise StopIteration() class Writer(IOBase): ''' A class that wraps IOBase and restricts the ability to read. 
''' def __init__(self, filename, mode='w'): IOBase.__init__(self, filename, mode) self.haswritten = False def write(self, sequence): ''' Writes sequence as the correct format to the file. ''' if not self.haswritten: self.method['whook'](self.handle) self.haswritten = True self.method['write'](self.handle, sequence) def open(filename, mode='r'): ''' Open a file for parsing or creation. Returns either a Reader or Writer object, depending on the open mode. ''' if mode == 'r': return Reader(filename) if mode == 'w': return Writer(filename, mode='w') if mode == 'a': return Writer(filename, mode='a') biotools-1.2.12/src/IO/clustal.py0000644000076500000240000000227212002100576017476 0ustar atkassenstaff00000000000000''' Methods for manipulating clustalw alignment files. ''' from biotools.sequence import Sequence def read(fh): def clean_alignment(x): i = 0 for c in x: if c != "-": break i += 1 j = 0 for c in reversed(x): if c != "-": break j += 1 return (' ' * i + x[(i or None):(-j or None)] + ' ' * j, i, len(x) - j) seqs = {} for line in fh: if line.startswith(' '): continue st = line.strip() if st: bits = st.split() if len(bits) != 2: continue if bits[0] not in seqs: seqs[bits[0]] = '' seqs[bits[0]] += bits[1] for k in seqs: seq, start, end = clean_alignment(seqs[k]) yield Sequence(k, seq, start=start, end=end) raise StopIteration() def probe(fh): for line in fh: st = line.strip() if st: if st.startswith('CLUSTAL'): return {'type': 'clustalw'} return False return {'type': 'clustalw'} def rhook(fh): try: fh.next() except StopIteration: pass biotools-1.2.12/src/IO/fasta.py0000644000076500000240000000332712017447011017133 0ustar atkassenstaff00000000000000''' Functions for manipulating FASTA files. ''' from biotools.sequence import Sequence, chop def read(fh): ''' Read sequences in FASTA format; identifiers (names and definition lines) are on lines that begin with carets ('>') and sequence is on lines that intervene between the carets. 
This function is a generator that yields `Sequence` objects. ''' name, defline, seq = '', '', '' for line in fh: line = line.strip() if not line: continue if line[0] == '>': if name or seq: yield Sequence(name, seq, defline=defline) seq = '' name = line[1:].split()[0] defline = line[1 + len(name):].strip() continue seq += line if name or seq: yield Sequence(name, seq, defline=defline) raise StopIteration() def write(fh, s): ''' Write sequences in FASTA format, i.e., ``` >name defline sequence ... ``` Sequences are wrapped to 70 characters by default. ''' fh.write('>%s %s\n' % (s.name, s.defline) + '\n'.join(chop(s.seq, 70)) + '\n') def probe(fh): ''' Probe a file to determine whether or not it is a FASTA file. That is, the first non-empty line should begin with a caret ('>'). If no caret is found on the first line, then we conclude that it is not a FASTA file and return False, otherwise, we return a dictionary with information relevant to the FASTA file type. ''' for line in fh: st = line.strip() if st: fh.close() if st[0] == '>': return {'type': 'fasta'} return False fh.close() return {'type': 'fasta'} biotools-1.2.12/src/IO/fastq.py0000644000076500000240000000464611774366315017200 0ustar atkassenstaff00000000000000''' Functions for manipulating FASTQ files. ''' from biotools.sequence import Sequence def read(fh): ''' Read sequences in FASTQ format; identifiers are on lines that begin with at symbols ('@'), sequence follows on the next line, then a line that begins and sequence is with a plus sign ('+') and finally the quality scores on the subsequent line. Quality scores are encoded in Phred format, the type of which (either 32 or 64) is determined when the file is probed for opening. The scores are decoded into a list of integers. This function is a generator that yields `Sequence` objects. 
''' while 1: try: line = fh.next().strip() if line[0] == '@': name = line[1:].split()[0] defline = line[1 + len(name):].strip() seq = f.next().strip() fh.next() qual = [ord(c) - self.phred for c in fh.next().strip()] yield Sequence(name, seq, qual=qual, defline=defline) except StopIteration: raise finally: fh.close() def write(fh, s): ''' Write sequences in FASTA format, i.e., ``` @name sequence ... + quality scores ``` ''' fh.write('@%s %s\n%s\n+\n%s\n' % (s.name, s.defline, s.seq, ''.join(q + chr('A') - 1 for q in s.qual)) + '\n') def probe(fh): ''' Probe a file to determine whether or not it is a FASTQ file. That is, the first non-empty line should begin with a caret ('@') and the 3rd line following that first non-empty line should contain no character with ordinal value less than 32. If none of the characters have ordinal value less than 64, then the file is guessed to be encoded in Phred64, otherwise it is encoded in Phred32. This function will return False if the file is not in FASTQ format and will return a dictionary with the phred score and type ('fastq') if the file is FASTQ. ''' for line in fh: st = line.strip() if st: fh.close() if st[0] == '@': fh.next() fh.next() qual = [ord(c) for c in fh.next().strip()] phred = 32 if min(qual) < ord('A') else 64 qual = [q - phred for q in qual] return {'type': 'fastq', 'phred': phred} return False return {'type': 'fastq', 'phread': 64} biotools-1.2.12/src/IO/gff.py0000644000076500000240000000112412013525676016602 0ustar atkassenstaff00000000000000from biotools.annotation import Annotation def read(fh): for line in fh: if line[0] != '#': yield Annotation(*line.split('\t')) raise StopIteration() def write(fh, a): # TODO: improve this... 
fh.write(str(a) + '\n') def probe(fh): for line in fh: line = line.strip() if line: bits = line.split() if bits[0] == '##gff-version': return {'type': 'gff', 'version': float(bits[1])} return False return {'type': 'gff', 'version': 3} def whook(fh): fh.write('##gff-version 3\n') biotools-1.2.12/src/IO/manager.py0000644000076500000240000000505111774617711017462 0ustar atkassenstaff00000000000000''' This module is home to the IOManager class, which manages the various input and output formats (specifically, FASTA, FASTQ, CLUSTAL alignments, and GFF files, currently). ''' class IOManager(object): ''' A class used by the `IOBase` class to manage the various input and output methods for the different file types. Additional file types can be added to the manager by using ```python manager[format] = methods ``` From the above example, `methods` is a dictionary with keys `rhook`, `read`, `whook`, `write`, and `probe`. Each of the values must be callable object: * `rhook` => takes a file handle opened for reading; called before reading of the file has begun, * `whook` => takes a file handle opened for writing; called before writing to the file has begun, * `read` => takes a file handle opened for reading; should be a generator that yields entries, * `write` => takes a file handle opened for writing and a single entry; writes the entry to the file, * `probe` => takes a file handle opened for reading; returns a dictionary of attributes to be applied to the `IOBase` instance. This class behaves similarly to a dictionary, except that the get method will default to the default method (which does nothing) if no truthy second parameter is passed. ''' def __init__(self, methods=None): ''' Instantiates an `IOManager` with methods, where the keys of methods are the formats and the values are dictionaries with `rhook`, `whook`, `read`, `write`, and `probe` callables. 
''' self.methods = methods or {} self.protected = set(self.methods.keys()) nil = lambda *x: iter([]) self.default = { 'rhook': nil, 'read': nil, 'whook': nil, 'write': nil, 'probe': nil } def __contains__(self, key): return key in self.methods def __getitem__(self, key): return self.methods.get(key, self.default) def get(self, key, default=None): ''' Try to get a set of methods via format (e.g., 'fasta') or fall-back to the default methods (which do nothing). ''' try: return self.methods[key] except KeyError: return default or self.default def __setitem__(self, key, value): if key not in self.protected: self.methods[key] = value return value def __iter__(self): return iter(self.methods) biotools-1.2.12/src/IO/phylip.py0000644000076500000240000000277012010562315017341 0ustar atkassenstaff00000000000000''' Functions for manipulating PHYLIP files. ''' from biotools.sequence import Sequence, chop def read(fh): ''' Read sequences in PHYLIP format; This function is a generator that yields `Sequence` objects. 
''' grid, seqs, names = None, [], [] try: line = fh.next().strip() while not line: line = fh.next.strip() except StopIteration: fh.close() raise StopIteration() grid = [int(x) for x in line.strip()] while True: lines = [] try: for i in xrange(grid[0]): line = fh.next().strip() while not line: line = fh.next().strip() lines.append(line) if not names: names = [l.split()[0] for l in lines] try: seqs = [''.join(l.split()[1:]) for l in lines] except IndexError: seqs = [''] * grid[0] else: temp = [''.join(l.split()) for l in lines] seqs = [a + b for a, b in zip(seqs, temp)] except StopIteration: break for name, seq in zip(names, seqs): yield Sequence(name, seq) fh.close() raise StopIteration() def probe(fh): ''' ''' for line in fh: bits = line.split() if len(bits) == 2 and int(bits[0]) > 0 and int(bits[1]) > 0: return {'type': 'phylip'} else: return False return {'type': 'phylip'} biotools-1.2.12/src/sequence.py0000644000076500000240000001513512017457115017344 0ustar atkassenstaff00000000000000from biotools.annotation import Annotation def chop(seq, length=70): ''' Yields a chunk of a sequence of no more than `length` characters, it is meant to be used to print fasta files. ''' while seq: try: piece, seq = seq[:length], seq[length:] except IndexError: piece, seq = seq, '' yield piece raise StopIteration() def isprot(seq, nucleotides='ATUCGNYRatucgnyr- '): ''' Check whether the current sequence is a protein or nucleotide sequence. ''' for c in seq: if c not in nucleotides: return True else: return False class Sequence(object): ''' A wrapper class for sequences. ''' def __init__(self, name, seq, **kwargs): ''' Instantiates a Sequence object with a given sequence. 
Some other useful parameters that the `Sequence` constructor can handle are: * `qual` => the quality scores (an array of integers) of the sequence, * `type` => the type of the sequence, either prot or nucl, * `start` => the starting position of the sequence within a supersequence, * `end` => the ending position of the sequnece within a supersequence, * `step` => the 'step' of the sequence, usually +1 for top-strand sequences, and -1 for bottom-strand sequences, but can handle other values as well, * `original` => the original `Sequence` object from which this one derives, * `defline` => the definition line for this sequnce from a fasta file. If one of these are not given, they will default to the most logical value that can be determined from the other values and sequence (e.g., if `end < start`, then `step` is probably -1). ''' self.name = name self.seq = seq self.qual = kwargs.get('qual', None) if 'type' in kwargs: self.type = kwargs['type'] self.start = kwargs.get('start', 1) self.end = kwargs.get('end', self.start - 1 + len(seq)) self.step = kwargs.get('step', -1 if self.start > self.end else 1) self.original = kwargs.get('original', self) self.defline = kwargs.get('defline', '') def __getattr__(self, attr): if attr == 'type': self.type = 'prot' if isprot(self.seq) else 'nucl' return self.type raise AttributeError('%r object has no attribute %r' % (self.__class__.__name__, attr)) def __getitem__(self, key): ''' sequence[i] or sequence[start:end] or sequence[start:end:step] constructs a new Sequence that is a subsequence as described by the way this function is called. This function will automatically fill in the name, sequence, start, stop, step, original, and type of the subsequence. It also tries to fill in the annotations, but annotations are handled pretty poorly right now, so it's probably best not to worry about those, but it will work if you really want to. 
''' try: start, stop, step = key.indices(len(self.seq)) except AttributeError: start, stop, step = key, key + 1, 1 order = abs(self.step) / self.step r = stop - (stop - start) % step - step seq = ''.join(self.seq[x] for x in xrange(start, stop, step)) qual = self.qual and [self.qual[x] for x in xrange(start, stop, step)] info = (self.name, start, stop, step) self.type return Sequence("subsequence(%s, %d, %d, %d)" % info, seq, qual=qual, original=self.original, type=self.type, start=self.start + start * order, end=self.start + r * order, step=step * self.step) ''' Some other things you can do with a Sequence object: * len(sequence) => gives the length of the sequence. * for character in sequence: => allows you to loop over each character in the sequence. * dictionary[sequence] => allows sequences to be used as keys for dictionaries and allows you to have sequences in sets. This relies on the test seqA == seqB, described next. * seqA == seqB => compare two sequences. The sequences are the same if they have the same sequence AND name. Therefore, two sequences with different names are treated as separate items in a set and separate keys in a dictionary. If you need to match only the sequence, use seqA.seq == seqB.seq. * print sequence => print a fasta / fastq (depending on whether there are any quality scores) representation of the sequence. Sequence objects in any other data structure (e.g., list, dictionary) are printed as (e.g., ). 
If you want to change that, you can do: def __repr__(self): return self.__str__() ''' def upper(self): return Sequence(self.name, self.seq.upper(), type=self.type, qual=self.qual, original=self.original, defline=self.defline, start=self.start, step=self.step, end=self.end) def __iter__(self): for c in self.seq: yield c raise StopIteration() def __len__(self): return len(self.seq) def __hash__(self): return hash(self.seq) def __eq__(self, other): try: return self.seq == other.seq and self.name == other.name except AttributeError: return (self.seq == other) def __str__(self): if self.qual: return '@%s\n%s\n+\n%s' % (self.name, self.seq, ''.join(chr(ord('A') - 1 + q) for q in self.qual)) else: return '>%s %s\n%s' % (self.name, self.defline, '\n'.join(chop(self.seq, 70))) def annotation(seq, source, type, **kwargs): ''' Creates an `Annotation` object for the given sequence from a source (e.g., "phytozome7.0") of a particular type (e.g., "gene"). ''' try: sname = source.name except AttributeError: sname = source start, end = min(seq.start, seq.end), max(seq.start, seq.end) strand = '+' if seq.step == 1 else '-' attrs = ';'.join('%s=%s' % (key, str(kwargs[key])) for key in kwargs) return Annotation(seq.original.name, sname, type, start, end, '.', strand, start % 3, attrs) biotools-1.2.12/src/translate.py0000644000076500000240000000500112033603443017513 0ustar atkassenstaff00000000000000_gencode = { 'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M', 'AUA': 'I', 'AUC': 'I', 'AUU': 'I', 'AUG': 'M', 'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T', 'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACU': 'T', 'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K', 'AAC': 'N', 'AAU': 'N', 'AAA': 'K', 'AAG': 'K', 'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R', 'AGC': 'S', 'AGU': 'S', 'AGA': 'R', 'AGG': 'R', 'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L', 'CUA': 'L', 'CUC': 'L', 'CUG': 'L', 'CUU': 'L', 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P', 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCU': 'P', 'CAC': 
'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q', 'CAC': 'H', 'CAU': 'H', 'CAA': 'Q', 'CAG': 'Q', 'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R', 'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGU': 'R', 'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V', 'GUA': 'V', 'GUC': 'V', 'GUG': 'V', 'GUU': 'V', 'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A', 'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCU': 'A', 'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E', 'GAC': 'D', 'GAU': 'D', 'GAA': 'E', 'GAG': 'E', 'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G', 'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGU': 'G', 'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S', 'UCA': 'S', 'UCC': 'S', 'UCG': 'S', 'UCU': 'S', 'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L', 'UUC': 'F', 'UUU': 'F', 'UUA': 'L', 'UUG': 'L', 'TAC': 'Y', 'TAT': 'Y', 'TAA': '*', 'TAG': '*', 'UAC': 'Y', 'UAU': 'Y', 'UAA': '*', 'UAG': '*', 'TGC': 'C', 'TGT': 'C', 'TGA': '*', 'TGG': 'W', 'UGC': 'C', 'UGU': 'C', 'UGA': '*', 'UGG': 'W'} def translate(sequence): ''' Translate a nucleotide using the standard genetic code. The sequence parameter can be either a string or a `Sequence` object. Stop codons are denoted with an asterisk (*). ''' try: value = ''.join(_gencode.get(sequence.seq[i:i + 3].upper(), 'X') for i in xrange(0, int(len(sequence) / 3) * 3, 3)) return sequence.__class__("translate(%s)" % sequence.name, value, original=sequence.original, type='prot', defline=sequence.defline) except AttributeError: value = ''.join(_gencode.get(sequence[i:i + 3].upper(), 'X') for i in xrange(0, int(len(sequence) / 3) * 3, 3)) return sequence.__class__(value) if __name__ == '__main__': import sys print(translate(sys.argv[1]))