biotools-1.2.12/ 0000755 0000765 0000024 00000000000 12241266112 014360 5 ustar atkassen staff 0000000 0000000 biotools-1.2.12/PKG-INFO 0000644 0000765 0000024 00000002003 12241266112 015450 0 ustar atkassen staff 0000000 0000000 Metadata-Version: 1.1
Name: biotools
Version: 1.2.12
Summary: A bunch of bioinformatics utilities.
Home-page: https://github.com/sonwell/biotools
Author: Andrew Kassen
Author-email: atkassen@gmail.com
License: UNKNOWN
Description: Accompanies Bart, Rebecca, *et al.* High-throughput genomic sequencing of Cassava Bacterial Blight strains identifies conserved effectors to target for durable resistance. *PNAS Plus*.
Currently depends on clustalw and BLAST.
You can grab the most current code from github (https://github.com/sonwell/biotools) or via git
git clone git://github.com/sonwell/biotools.git
Keywords: gene prediction,prokaryotes,effectors
Platform: UNKNOWN
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Science/Research
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
Requires: numpy
Requires: matplotlib
biotools-1.2.12/prok-geneseek 0000755 0000765 0000024 00000005546 12002111133 017042 0 ustar atkassen staff 0000000 0000000 #!/usr/bin/env python
# 1. predict
# 2. cluster
# 3. rename
# 4. snps
import biotools.analysis.run as pr
import biotools.analysis.cluster as cw
import biotools.analysis.renamer as mv
import biotools.analysis.variance as cs
import biotools.analysis.options as op
import biotools.analysis.loaddata as ld
import sys
import os
if __name__ == "__main__":
op.parse(sys.argv[1:])
if len(op.args) < 2:
op.help()
exit(0)
direc = op.DIRECTORY
plotter = op.PLOTTER
database = op.args[0]
names = op.args[1:]
sep = os.sep
predict = op.predicting
cluster = op.clustering
rename = op.renaming
plot = op.plotting
report = op.reporting
calculate = op.calculating
rename = rename and (cluster or not predict)
calculate = calculate and (cluster or not predict)
plot = plot and (calculate or not (predict or cluster))
report = report and calculate
if not (predict or cluster or rename or calculate or plot or report):
print "I don't know what you want, I give up."
exit(0)
if predict:
names = pr.run(database, names)
print "Homologous sequences written to " + direc + 'sequences' + sep
if cluster:
names = cw.run(direc + 'clusters' + sep, names)
print "Clustalw files written to " + direc + "clusters" + sep
if rename:
names = mv.rename(direc + 'clusters' + sep, database, names)
if plot and plotter.lower() != 'none':
try:
cv = __import__(plotter, globals(), locals(), ['plot'], -1)
except ImportError:
try:
if not plotter.endswith('.py'):
plotter += '.py'
open(plotter, 'r')
except:
plot = False
else:
p = plotter.rfind(sep)
if p > -1:
sys.path.append(plotter[:p])
plotter = plotter[p + len(sep):]
try:
cv = __import__(plotter, globals(), locals(), ['plot'], -1)
except ImportError:
plot = False
if calculate:
gen = cs.var(names)
elif plot:
gen = (ld.parse(s) for s in names)
else:
gen = []
for entry in gen:
plotdata = entry['plotdata']
metadata = entry['metadata']
if plot:
try:
cv.plot(plotdata, direc + 'plots' + sep,
filename=metadata['filename'])
except Exception as e:
print metadata['filename'], e
if report:
try:
os.mkdir(direc + 'data' + sep)
except:
pass
fh = open(direc + 'data' + sep + metadata['strain'] + '.py', 'w')
fh.write('plotdata = ' + repr(plotdata) + '\n')
fh.write('metadata = ' + repr(metadata) + '\n')
print "Done"
biotools-1.2.12/setup.py 0000644 0000765 0000024 00000002573 12241266044 016105 0 ustar atkassen staff 0000000 0000000 #!/usr/bin/env python
from distutils.core import setup

# Distribution metadata and package layout for biotools.
setup(
    name="biotools",
    version='1.2.12',
    description="A bunch of bioinformatics utilities.",
    long_description="""Accompanies Bart, Rebecca, *et al.* High-throughput genomic sequencing of Cassava Bacterial Blight strains identifies conserved effectors to target for durable resistance. *PNAS Plus*.
Currently depends on `clustalw `_ and `BLAST `_.
You can grab the most current code from `github `_ or via git
git clone git://github.com/sonwell/biotools.git""",
    author="Andrew Kassen",
    maintainer="Andrew Kassen",
    author_email="atkassen@gmail.com",
    maintainer_email="atkassen@gmail.com",
    url="https://github.com/sonwell/biotools",
    requires=['numpy', 'matplotlib'],
    packages=['biotools', 'biotools.analysis', 'biotools.IO'],
    # Sources live under src/, mapped onto the biotools.* namespace.
    package_dir={
        'biotools': 'src',
        'biotools.analysis': 'src/analysis',
        'biotools.IO': 'src/IO',
    },
    scripts=['prok-geneseek'],
    keywords='gene prediction, prokaryotes, effectors',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Science/Research',
        'Topic :: Scientific/Engineering :: Bio-Informatics',
    ])
biotools-1.2.12/src/ 0000755 0000765 0000024 00000000000 12241266112 015147 5 ustar atkassen staff 0000000 0000000 biotools-1.2.12/src/__init__.py 0000644 0000765 0000024 00000000055 11773402744 017274 0 ustar atkassen staff 0000000 0000000 '''
A bunch of bioinformatics utilities.
'''
biotools-1.2.12/src/align.py 0000644 0000765 0000024 00000020176 12241075232 016622 0 ustar atkassen staff 0000000 0000000 #!/usr/bin/env python
'''
This module is used to align sequences. Currently, there is only a single
alignment algorithm implemented; it is a hybrid between Needleman-Wunsch
and Smith-Waterman and is used to find the subsequence within a larger sequence
that best aligns to a reference.
'''
from biotools.translate import translate
import biotools.analysis.options as options
# Traceback markers for the dynamic-programming matrix: diagonal
# (match/mismatch), vertical gap, horizontal gap.  Chosen so that when
# candidate [score, mark] pairs tie on score, max() favors the diagonal.
DIAG_MARK, VGAP_MARK, HGAP_MARK = 3, 2, 1
# Amino-acid substitution scores (values appear to match BLOSUM62 — verify),
# extended with 'X' (unknown, scores 0 against everything) and '*' (stop,
# heavily penalized against everything except itself and 'X').
bl = {
'*': {'*': 0, 'A': -9, 'C': -9, 'E': -9, 'D': -9, 'G': -9, 'F': -9, 'I': -9,
'H': -9, 'K': -9, 'M': -9, 'L': -9, 'N': -9, 'Q': -9, 'P': -9, 'S': -9,
'R': -9, 'T': -9, 'W': -9, 'V': -9, 'Y': -9, 'X': 0},
'A': {'*': -9, 'A': 4, 'C': 0, 'E': -1, 'D': -2, 'G': 0, 'F': -2, 'I': -1,
'H': -2, 'K': -1, 'M': -1, 'L': -1, 'N': -1, 'Q': -1, 'P': -1, 'S': 1,
'R': -1, 'T': -1, 'W': -3, 'V': -2, 'Y': -2, 'X': 0},
'C': {'*': -9, 'A': 0, 'C': 9, 'E': -4, 'D': -3, 'G': -3, 'F': -2, 'I': -1,
'H': -3, 'K': -3, 'M': -1, 'L': -1, 'N': -3, 'Q': -3, 'P': -3, 'S': -1,
'R': -3, 'T': -1, 'W': -2, 'V': -1, 'Y': -2, 'X': 0},
'E': {'*': -9, 'A': -1, 'C': -4, 'E': 5, 'D': 2, 'G': -2, 'F': -3, 'I': -3,
'H': 0, 'K': 1, 'M': -2, 'L': -3, 'N': 0, 'Q': 2, 'P': -1, 'S': 0,
'R': 0, 'T': 0, 'W': -3, 'V': -3, 'Y': -2, 'X': 0},
'D': {'*': -9, 'A': -2, 'C': -3, 'E': 2, 'D': 6, 'G': -1, 'F': -3, 'I': -3,
'H': -1, 'K': -1, 'M': -3, 'L': -4, 'N': 1, 'Q': 0, 'P': -1, 'S': 0,
'R': -2, 'T': 1, 'W': -4, 'V': -3, 'Y': -3, 'X': 0},
'G': {'*': -9, 'A': 0, 'C': -3, 'E': -2, 'D': -1, 'G': 6, 'F': -3, 'I': -4,
'H': -2, 'K': -2, 'M': -3, 'L': -4, 'N': -2, 'Q': -2, 'P': -2, 'S': 0,
'R': -2, 'T': 1, 'W': -2, 'V': 0, 'Y': -3, 'X': 0},
'F': {'*': -9, 'A': -2, 'C': -2, 'E': -3, 'D': -3, 'G': -3, 'F': 6, 'I': 0,
'H': -1, 'K': -3, 'M': 0, 'L': 0, 'N': -3, 'Q': -3, 'P': -4, 'S': -2,
'R': -3, 'T': -2, 'W': 1, 'V': -1, 'Y': 3, 'X': 0},
'I': {'*': -9, 'A': -1, 'C': -1, 'E': -3, 'D': -3, 'G': -4, 'F': 0, 'I': 4,
'H': -3, 'K': -3, 'M': 1, 'L': 2, 'N': -3, 'Q': -3, 'P': -3, 'S': -2,
'R': -3, 'T': -2, 'W': -3, 'V': 1, 'Y': -1, 'X': 0},
'H': {'*': -9, 'A': -2, 'C': -3, 'E': 0, 'D': 1, 'G': -2, 'F': -1, 'I': -3,
'H': 8, 'K': -1, 'M': -2, 'L': -3, 'N': 1, 'Q': 0, 'P': -2, 'S': -1,
'R': 0, 'T': 0, 'W': -2, 'V': -2, 'Y': 2, 'X': 0},
'K': {'*': -9, 'A': -1, 'C': -3, 'E': 1, 'D': -1, 'G': -2, 'F': -3, 'I': -3,
'H': -1, 'K': 5, 'M': -1, 'L': -2, 'N': 0, 'Q': 1, 'P': -1, 'S': 0,
'R': 2, 'T': 0, 'W': -3, 'V': -3, 'Y': -2, 'X': 0},
'M': {'*': -9, 'A': -1, 'C': -1, 'E': -2, 'D': -3, 'G': -3, 'F': 0, 'I': 1,
'H': -2, 'K': -1, 'M': 5, 'L': 2, 'N': -2, 'Q': 0, 'P': -2, 'S': -1,
'R': -1, 'T': -1, 'W': -1, 'V': -2, 'Y': -1, 'X': 0},
'L': {'*': -9, 'A': -1, 'C': -1, 'E': -3, 'D': -4, 'G': -4, 'F': 0, 'I': 2,
'H': -3, 'K': -2, 'M': 2, 'L': 4, 'N': -3, 'Q': -2, 'P': -3, 'S': -2,
'R': -2, 'T': -2, 'W': -2, 'V': 3, 'Y': -1, 'X': 0},
'N': {'*': -9, 'A': -2, 'C': -3, 'E': 0, 'D': 1, 'G': 0, 'F': -3, 'I': -3,
'H': -1, 'K': 0, 'M': -2, 'L': -3, 'N': 6, 'Q': 0, 'P': -2, 'S': 1,
'R': 0, 'T': 0, 'W': -4, 'V': -3, 'Y': -2, 'X': 0},
'Q': {'*': -9, 'A': -1, 'C': -3, 'E': 2, 'D': 0, 'G': -2, 'F': -3, 'I': -3,
'H': 0, 'K': 1, 'M': 0, 'L': -2, 'N': 0, 'Q': 5, 'P': -1, 'S': 0,
'R': 1, 'T': 0, 'W': -2, 'V': -2, 'Y': -1, 'X': 0},
'P': {'*': -9, 'A': -1, 'C': -3, 'E': -1, 'D': -1, 'G': -2, 'F': -4, 'I': -3,
'H': -2, 'K': -1, 'M': -2, 'L': -3, 'N': -1, 'Q': -1, 'P': 7, 'S': -1,
'R': -2, 'T': 1, 'W': -4, 'V': -2, 'Y': -3, 'X': 0},
'S': {'*': -9, 'A': 1, 'C': -1, 'E': 0, 'D': 0, 'G': 0, 'F': -2, 'I': -2,
'H': -1, 'K': 0, 'M': -1, 'L': -2, 'N': 1, 'Q': 0, 'P': -1, 'S': 4,
'R': -1, 'T': 1, 'W': -3, 'V': -2, 'Y': -2, 'X': 0},
'R': {'*': -9, 'A': -1, 'C': -3, 'E': 0, 'D': -2, 'G': -2, 'F': -3, 'I': -3,
'H': 0, 'K': 2, 'M': -1, 'L': -2, 'N': 0, 'Q': 1, 'P': -2, 'S': -1,
'R': 5, 'T': -1, 'W': -3, 'V': -3, 'Y': -2, 'X': 0},
'T': {'*': -9, 'A': -1, 'C': -1, 'E': 0, 'D': 1, 'G': 1, 'F': -2, 'I': -2,
'H': 0, 'K': 0, 'M': -1, 'L': -2, 'N': 0, 'Q': 0, 'P': 1, 'S': 1,
'R': -1, 'T': 4, 'W': -3, 'V': -2, 'Y': -2, 'X': 0},
'W': {'*': -9, 'A': -3, 'C': -2, 'E': -3, 'D': -4, 'G': -2, 'F': 1, 'I': -3,
'H': -2, 'K': -3, 'M': -1, 'L': -2, 'N': -4, 'Q': -2, 'P': -4, 'S': -3,
'R': -3, 'T': -3, 'W': 11, 'V': -3, 'Y': 2, 'X': 0},
'V': {'*': -9, 'A': 0, 'C': -1, 'E': -2, 'D': -3, 'G': -3, 'F': -1, 'I': 3,
'H': -3, 'K': -2, 'M': 1, 'L': 1, 'N': -3, 'Q': -2, 'P': -2, 'S': -2,
'R': -3, 'T': -2, 'W': -3, 'V': 4, 'Y': -1, 'X': 0},
'Y': {'*': -9, 'A': -2, 'C': -2, 'E': -2, 'D': -3, 'G': -3, 'F': 3, 'I': -1,
'H': 2, 'K': -2, 'M': -1, 'L': -1, 'N': -2, 'Q': -1, 'P': -3, 'S': -2,
'R': -2, 'T': -2, 'W': 2, 'V': -1, 'Y': 7, 'X': 0},
'X': {'*': 0, 'A': 0, 'C': 0, 'E': 0, 'D': 0, 'G': 0, 'F': 0, 'I': 0,
'H': 0, 'K': 0, 'M': 0, 'L': 0, 'N': 0, 'Q': 0, 'P': 0, 'S': 0,
'R': 0, 'T': 0, 'W': 0, 'V': 0, 'Y': 0, 'X': 0}
}
def OptimalCTether(reference, translation, extend=1, create=10):
    '''
    This function will take two sequences: a `reference` sequence and another
    protein sequence (`translation`; usually, this is an open reading frame
    that has been translated). Needleman-Wunsch alignment will be performed
    and the substring of translation with the highest identity that begins
    with a start codon [default: `['ATG']`] is reported.

    This function returns a dictionary of relevant information from the
    alignment; specifically, the alignments itself [keys: `query`, `subject`],
    the score [key: `score`], the length of the alignment [key: `length`], the
    length of the substring of translation used [key: `sublength`], the number
    of identities [key: `identities`], and the number of gaps [key: `gaps`].

    `extend` and `create` are the gap-extension and gap-opening penalties.
    Raises ValueError if `translation` contains no (translated) start codon.
    '''
    # Amino acids corresponding to the configured start codons.
    starts = set(translate(s) for s in options.START_CODONS)
    v, w = reference, translation
    # Accept plain strings or Sequence-like objects exposing .seq.
    try:
        v = v.seq
    except AttributeError:
        pass
    try:
        w = w.seq
    except AttributeError:
        pass
    if not starts & set(w):
        raise ValueError("Open reading frame does not contain a start codon.")

    # Work on the reversed sequences so the anchored end is at index 0.
    v, w = v[::-1], w[::-1]
    lv, lw = len(v), len(w)
    rv, rw = range(lv + 1), range(lw + 1)
    # gpc: gap-open bookkeeping, mat: DP scores, pnt: traceback markers.
    gpc = [[create * int(not (i | j)) for i in rw] for j in rv]
    mat = [[-(i + j) * extend - create * (not (i | j) and w[0] != v[0])
            for i in rw] for j in rv]
    pnt = [[VGAP_MARK if i > j else HGAP_MARK if j > i else DIAG_MARK
            for i in rw] for j in rv]
    # (The original also allocated an unused `ids` matrix here; removed.)
    optimal = [None, 0, 0]  # best [score, row, col] ending on a start residue
    for i in range(lv):
        for j in range(lw):
            # Candidate moves as [score, marker]; on score ties, max()
            # prefers the larger marker, i.e. the diagonal.
            vals = [[mat[i][j] + bl[v[i]][w[j]], DIAG_MARK],
                    [mat[i + 1][j] - extend - gpc[i + 1][j], VGAP_MARK],
                    [mat[i][j + 1] - extend - gpc[i][j + 1], HGAP_MARK]]
            mat[i + 1][j + 1], pnt[i + 1][j + 1] = max(vals)
            gpc[i + 1][j + 1] = create * int(pnt[i + 1][j + 1] == DIAG_MARK)
            # Track the best cell whose query character is a start residue
            # and which covers enough of the reference (LENGTH_ERR bound).
            if (optimal[0] is None or mat[i + 1][j + 1] > optimal[0]) and \
                    abs(lv - i) / float(lv) <= options.LENGTH_ERR and \
                    w[j] in starts:
                optimal = [mat[i + 1][j + 1], i + 1, j + 1]

    # Trace back from the optimal cell, rebuilding both aligned strings and
    # counting gaps, identities and the consumed query length.
    i, j = optimal[1], optimal[2]
    seq, ids = ['', ''], 0
    gapcount, length, sublen = 0, 0, 0
    methods = {
        VGAP_MARK:
            lambda s, i, j, l, g, n:
                (['-' + s[0], w[j - 1] + s[1]], i, j - 1, l + 1, g + 1, n),
        DIAG_MARK:
            lambda s, i, j, l, g, n:
                ([v[i - 1] + s[0], w[j - 1] + s[1]], i - 1, j - 1,
                 l + 1, g, n + (w[j - 1] == v[i - 1])),
        HGAP_MARK:
            lambda s, i, j, l, g, n:
                ([v[i - 1] + s[0], '-' + s[1]], i - 1, j, l, g + 1, n)
    }
    while [i, j] != [0, 0]:
        length += 1
        state = (seq, i, j, sublen, gapcount, ids)
        seq, i, j, sublen, gapcount, ids = methods[pnt[i][j]](*state)
    return {
        'subject': seq[0][::-1],
        'query': seq[1][::-1],
        'score': optimal[0],
        'gaps': gapcount,
        'length': length,
        'sublength': sublen,
        'identities': ids
    }
biotools-1.2.12/src/analysis/ 0000755 0000765 0000024 00000000000 12241266112 016772 5 ustar atkassen staff 0000000 0000000 biotools-1.2.12/src/analysis/__init__.py 0000644 0000765 0000024 00000000000 11774635423 021110 0 ustar atkassen staff 0000000 0000000 biotools-1.2.12/src/analysis/cluster.py 0000644 0000765 0000024 00000006765 12033603756 021053 0 ustar atkassen staff 0000000 0000000 #!/usr/bin/env python
import biotools.sequence as sequ
import biotools.IO as io
import biotools.translate as tran
import biotools.clustal as clustal
import biotools.analysis.options as options
try:
import Queue as queue
except ImportError:
import queue
import hashlib
import subprocess
import threading
from os import sep, mkdir
def run(direc, inputs):
    '''
    Takes a collection of files generated by gene prediction, creates clusters
    based off of the genes that have homology to those predicted genes, and
    creates new fasta files in the clusters sub directory under the given
    directory and separated according to whether they are nucleotide or amino
    acid sequnces. These new fasta files are then used to create clustalw
    alignments of the genes if more than 1 sequence exists in the fasta file.

    Returns the list of cluster fasta file basenames that were aligned.
    '''
    clusters = {}   # input file -> list of (set of seq names, sequence text)
    all_ids = set()  # every sequence name seen across all inputs
    ids = {}         # input file -> set of its sequence names
    q = queue.Queue()
    filenames = []

    def run_clustal():
        # Worker: drain the queue of name-clusters, write the matching
        # nucleotide and translated fasta files, then align both.
        while not q.empty():
            cid = q.get()
            # File name is the md5 of the member names.
            # NOTE(review): cid is a set, so the join order (and therefore
            # the digest) is not deterministic across runs — confirm intent.
            dig = hashlib.md5()
            dig.update(' '.join(cid))
            dig = dig.hexdigest()
            fpre = direc + 'nt' + sep + dig
            apre = direc + 'aa' + sep + dig
            fname = fpre + ".fasta"
            aname = apre + ".fasta"
            fh = io.open(fname, 'w')
            ah = io.open(aname, 'w')
            for ipt in clusters:
                counter = 0
                name = '_'.join(ipt.split(sep)[-1].split('.')[0].split())
                for cluster in clusters[ipt]:
                    if cid & cluster[0]:
                        nm = name + '_' + str(counter)
                        seq = cluster[1]
                        curr = sequ.Sequence(nm, seq, defline=', '.join(cid))
                        tr = tran.translate(curr)
                        tr.name = curr.name
                        fh.write(curr)
                        ah.write(tr)
                        counter += 1
            fh.close()
            ah.close()
            try:
                clustal.run(fname, fpre + '.clustalw')
                clustal.run(aname, apre + '.clustalw')
                filenames.append(dig + '.fasta')
            except ValueError:
                pass
            q.task_done()

    if direc:
        for d in [direc, direc + 'nt' + sep, direc + 'aa' + sep]:
            try:
                mkdir(d)
            except OSError:
                pass

    # Index every input file: group identical sequences, remember which
    # names belong to which file.
    for ipt in inputs:
        seqs = {}
        ids[ipt] = set()
        for seq in io.open(ipt, 'r'):
            ids[ipt].add(seq.name)
            all_ids.add(seq.name)
            if seq.seq not in seqs:
                seqs[seq.seq] = set()
            seqs[seq.seq].add(seq.name)
        clusters[ipt] = [(seqs[k], k) for k in seqs]
        del seqs

    # Partition all_ids into clusters of names that co-occur consistently.
    sub_ids = []
    while all_ids:
        cid = all_ids.pop()
        subcluster = (all_ids | set([cid])) & \
            set(i for ipt in clusters for cluster in clusters[ipt]
                for i in cluster[0] if cid in cluster[0])
        for ipt in clusters:
            for cluster in clusters[ipt]:
                if cid in cluster[0]:
                    subcluster = (subcluster & cluster[0]) | \
                        (subcluster - ids[ipt])
        sub_ids.append(subcluster)
        all_ids -= subcluster
    for cid in sub_ids:
        q.put(cid)

    # Run NUM_PROCESSES - 1 workers in background threads plus one in the
    # main thread.  (range works on both Python 2 and 3; xrange did not.)
    threads = []
    for i in range(options.NUM_PROCESSES - 1):
        curr = threading.Thread(target=run_clustal)
        threads.append(curr)
        curr.start()
    run_clustal()
    q.join()
    return filenames
biotools-1.2.12/src/analysis/loaddata.py 0000644 0000765 0000024 00000010621 11773402607 021127 0 ustar atkassen staff 0000000 0000000 '''
This is a pretty simple JSON-like parser. Specifically, it can load Python-like
object, list, and other literals, i.e., the sort of stuff you'd get if you
dumped the string representation of some data into a file.
The real difference is that you must specify a variable name, e.g.:
```python
my_stuff = { ... }
```
These variable names don't need to be on a newline or anything like that, you
should be able to omit any and all whitespace. The result of a successful
parse is a dictionary:
```python
{'my_stuff': { ... }}
```
This function really only works for `None`, `True`, `False`, numbers, strings,
dictionaries, and lists.
'''
def parse(ipt):
par = {'char': '', 'pos': -1}
leng = 0
val = ""
ret = {}
def advance(test=None):
curr = par['char']
if test and test != curr:
raise ValueError("Expected %s, saw %s." % (test, curr))
par['pos'] += 1
if par['pos'] < leng:
par['char'] = val[par['pos']]
else:
par['char'] = ''
def whitespace():
while par['char'] and ord(par['char']) <= ord(' '):
advance()
def variable():
whitespace()
start = par['pos']
while \
('0' <= par['char'] <= '9') or \
('A' <= par['char'] <= 'Z') or \
('a' <= par['char'] <= 'z'):
advance()
stop = par['pos']
whitespace()
advance('=')
return (val[start:stop], value())
def value():
whitespace()
if par['char'] == '{':
return dictionary()
if par['char'] == '[':
return array()
if par['char'] in ("'", '"'):
return string()
if '0' <= par['char'] <= '9' or par['char'] == '-':
return number()
at = par['pos']
if par['char'] == 'T':
advance('r')
advance('u')
advance('e')
return True
if par['char'] == 'F':
advance('a')
advance('l')
advance('s')
advance('e')
return False
if par['char'] == 'N':
advance('o')
advance('n')
advance('e')
return None
prefix = val[at:par['pos']]
raise ValueError("Unexpected value starting with %s." % prefix)
def dictionary():
ret = {}
advance('{')
whitespace()
while par['char'] != '}':
s = value()
advance(':')
v = value()
ret[s] = v
if par['char'] == ',':
advance(',')
else:
break
advance('}')
whitespace()
return ret
def array():
ret = []
advance('[')
whitespace()
while par['char'] != ']':
ret.append(value())
if par['char'] == ',':
advance(',')
else:
break
advance(']')
whitespace()
return ret
def string():
ret = ""
q = par['char']
advance(q)
while par['char'] != q:
if par['char'] == '\\':
advance('\\')
if par['char'] == 'n':
ret += '\n'
elif par['char'] == 't':
ret += '\t'
else:
ret += par['char']
advance()
continue
ret += par['char']
advance()
advance(q)
whitespace()
return ret
def number():
ret = ""
if par['char'] == '-':
ret += '-'
advance('-')
while '0' <= par['char'] <= '9':
ret += par['char']
advance()
if par['char'] == '.':
ret += '.'
advance('.')
while '0' <= par['char'] <= '9':
ret += par['char']
advance()
if par['char'] in ('e', 'E'):
ret += par['char']
advance()
if par['char'] == '-':
ret += '-'
advance('-')
while '0' <= par['char'] <= '9':
ret += par['char']
advance()
whitespace()
return float(ret)
for line in ipt:
val += line
leng += len(line)
advance()
while par['pos'] < leng:
k, v = variable()
ret[k] = v
return ret
biotools-1.2.12/src/analysis/options.py 0000644 0000765 0000024 00000013555 11774635067 021073 0 ustar atkassen staff 0000000 0000000 from optparse import OptionParser
from threading import Lock
from os import sep, makedirs
from sys import stderr
# Tunable pipeline parameters; parse() below overwrites these from the
# command line.
LENGTH_ERR = 0.2          # allowable relative error in hit length
MIN_IDENTITY = 0.45       # minimum fraction identity for a BLAST hit
MAX_EVALUE = 1e-30        # maximum BLAST e-value
MIN_ORFLEN = 300          # minimum ORF length, in bases
NUM_THREADS = 16
NUM_PROCESSES = 2
DIRECTORY = '.' + sep     # working directory, always sep-terminated
PLOTTER = 'biotools.analysis.plot'
START_CODONS = ["ATG"]
STOP_CODONS = ["TAG", "TAA", "TGA"]
args = tuple()            # positional arguments left after option parsing
# Which pipeline stages are enabled (toggled by the --no-* options).
predicting = True
clustering = True
renaming = True
calculating = True
reporting = True
plotting = True
verbose = False
lock = Lock()             # serializes debug() output across threads
# NOTE(review): the usage string below looks truncated — angle-bracketed
# placeholders (e.g. "<database> <fasta files>") appear to have been lost.
parser = OptionParser(usage="Usage: %prog [options] " +
" ")
parser.add_option("-S", "--start", action="append", dest="start",
default=START_CODONS, type="string",
help="define a start codon [default: %s]" %
' '.join("-S " + s for s in START_CODONS))
parser.add_option("-E", "--stop", action="append", dest="stop",
default=STOP_CODONS, type="string",
help="define a stop codon [default: %s]" %
' '.join("-E " + s for s in STOP_CODONS))
parser.add_option("-j", "--threads", action="store", dest="threads",
default=NUM_THREADS, type="int",
help="number of threads [default: %default]")
parser.add_option("-p", "--processes", action="store", dest="processes",
default=NUM_PROCESSES, type="int",
help="number of parallel processes to run " +
"[default: %default]")
parser.add_option("-e", "--evalue", action="store", dest="evalue",
default=MAX_EVALUE, type="float",
help="maximum e-value [default: %default]")
parser.add_option("-I", "--identity", action="store", dest="identity",
default=MIN_IDENTITY, type="float",
help="minimum percent identity [default: %default]")
parser.add_option("-L", "--length", action="store", dest="fraction",
default=LENGTH_ERR, type="float",
help="allowable relative error in hit length " +
"[default: %default]")
parser.add_option("-O", "--orflen", action="store", dest="orflen",
metavar="bases", default=MIN_ORFLEN, type="int",
help="minimum allowable length for ORFs [default: %default]")
parser.add_option("-d", "--directory", action="store", dest="directory",
default=DIRECTORY, type="string",
help="set working directory [default: current]")
parser.add_option("-P", "--plotter", action="store", dest="plotter",
default=PLOTTER, type="string",
help="plotting module [default: %default]")
parser.add_option("-v", "--verbose", action="store_true", dest="verbose",
default=verbose,
help="print debug messages [default: False]")
parser.add_option("--no-plots", action="store_false", dest="plotting",
default=plotting,
help="suppress the drawing of plots [default: False]")
parser.add_option("--no-predict", action="store_false", dest="predicting",
default=predicting,
help="don't predict genes, instead treat the input files " +
"as predicted genes [default: False]")
parser.add_option("--no-cluster", action="store_false", dest="clustering",
default=True, help="don't cluster the sequences, " +
"instead treat the input files as alignments " +
"[default: False]")
parser.add_option("--no-rename", action="store_false", dest="renaming",
default=True,
help="don't rename the fasta and clustal files " +
"[default: False]")
parser.add_option("--no-reports", action="store_false", dest="reporting",
default=True,
help="don't generate files for variance data " +
"[default: False]")
parser.add_option("--no-calculation", action="store_false",
dest="calculating", default=True,
help="don't calculate sequence variance [default: False]")
def debug(msg):
    '''
    Write `msg` (stringified, newline-terminated) to stderr when the module
    `verbose` flag is set.  The module lock serializes concurrent writers.
    '''
    if verbose:
        # `with` guarantees the lock is released even if the write raises;
        # the previous acquire/release pair could leak the lock.
        with lock:
            stderr.write(str(msg) + '\n')
def parse(pargs):
    '''
    Parses `pargs` and sets global variables to be accessible to other
    modules.

    These variables are:
    * `LENGTH_ERR`
    * `MIN_IDENTITY`
    * `MAX_EVALUE`
    * `MIN_ORFLEN`
    * `NUM_THREADS`
    * `NUM_PROCESSES`
    * `START_CODONS`
    * `STOP_CODONS`
    * `DIRECTORY`
    * `PLOTTER`
    * `args`

    Raises RuntimeError if the -L length-error fraction is outside [0, 1].
    '''
    global \
        LENGTH_ERR, MIN_IDENTITY, MAX_EVALUE, MIN_ORFLEN, \
        NUM_THREADS, NUM_PROCESSES, START_CODONS, STOP_CODONS, \
        DIRECTORY, PLOTTER, args, predicting, clustering, \
        renaming, calculating, reporting, plotting, verbose
    opts, largs = parser.parse_args(pargs)
    # Normalize the working directory to end with the path separator and
    # make sure it exists.
    if opts.directory[-1] != sep:
        opts.directory += sep
    try:
        makedirs(opts.directory)
    except OSError:
        pass
    # '-S *' means: any codon that is not a stop codon may start an ORF.
    if '*' in opts.start:
        DNA = 'ATCG'
        opts.start = set(i + j + k for i in DNA for j in DNA for k in DNA) - \
            set(opts.stop)
    # No point running more worker processes than there are input files.
    opts.processes = min(opts.processes, len(largs))
    if not (0 <= opts.fraction <= 1):
        raise RuntimeError("Allowable length error must be between 0 and 1.")
    LENGTH_ERR = opts.fraction
    MIN_IDENTITY = opts.identity
    MAX_EVALUE = opts.evalue
    MIN_ORFLEN = opts.orflen
    NUM_THREADS = opts.threads
    NUM_PROCESSES = opts.processes
    STOP_CODONS = opts.stop
    START_CODONS = opts.start
    DIRECTORY = opts.directory
    PLOTTER = opts.plotter
    predicting = opts.predicting
    clustering = opts.clustering
    renaming = opts.renaming
    calculating = opts.calculating
    reporting = opts.reporting
    plotting = opts.plotting
    verbose = opts.verbose
    args = largs
def help():
    '''
    Print the usage/help text for this module's command-line options.
    '''
    parser.print_help()
biotools-1.2.12/src/analysis/plot.py 0000644 0000765 0000024 00000007202 12033604146 020325 0 ustar atkassen staff 0000000 0000000 import numpy as np
import matplotlib.pyplot as plt
import math
from os import sep, mkdir
def smoothed(unsmoothed, factor):
    '''
    Smooth the 1-D sequence `unsmoothed` by multiplying it with a matrix of
    row-normalized, Gaussian-shaped weights (width controlled by `factor`)
    and return the result as a numpy array of the same length.
    '''
    n = len(unsmoothed)
    # One symmetric window of un-normalized Gaussian weights, peak 1 at the
    # center, length 2n - 1.  (range works on Python 2 and 3; xrange didn't.)
    tail = [math.exp(-(i - n) ** 2 / float(factor)) for i in range(1, n)]
    window = np.array(tail + [1] + tail[::-1])
    # Slide the window so row i is centered on position i, then normalize
    # each row to sum to 1.
    rows = (window[n - i - 1:2 * n - i - 1] for i in range(n))
    wm = np.array([row / sum(row) for row in rows])
    return np.dot(unsmoothed, wm.transpose())
def plot(plotdata, directory, bottom=True, side=True, legend=True,
         save=True, filename='untitled.pdf', upperbound=0.05, factor=21,
         fig=None, **kwargs):
    '''
    Draw smoothed nucleotide and amino-acid variance curves plus gene models
    from `plotdata`, print a console summary, and (by default) save the
    figure as `directory`/`filename`.

    `bottom`/`side` toggle the x/y axis decorations, `upperbound` sets the
    y-axis limit, `factor` is the smoothing width, and an existing matplotlib
    `fig` may be supplied to draw into.
    '''
    if fig is None:
        fig = plt.figure(None, facecolor='w', edgecolor='w')
    if not directory.endswith(sep):
        directory += sep
    try:
        mkdir(directory)
    except OSError:
        pass
    # Plotting data.
    ntvar = plotdata['nt']['var']
    aavar = plotdata['aa']['var']
    # Gene models.
    starts = plotdata['aa']['starts']
    ends = plotdata['aa']['ends']
    counts = plotdata['aa']['count']
    # Smooth the data.
    snt = smoothed(ntvar, factor)
    lnt = len(ntvar)
    saa = smoothed(aavar, factor)
    laa = len(aavar)
    # Bounding rectangle; the negative lower y bound leaves room for the
    # gene-model bars below the axis.
    bound = [0, laa, -upperbound / 6.0, upperbound]
    # x-values chosen to align nucleotides (3 per codon) with amino acids.
    xnt = np.arange(lnt) / 3.0 + 1
    xaa = np.arange(laa) + 1
    ax = axes(bottom, side, bound, fig, **kwargs)
    ntl = draw(xnt, snt, ax, '#0000ff', **kwargs)
    aal = draw(xaa, saa, ax, '#00ff00', **kwargs)
    models(starts, ends, counts, bound, ax, **kwargs)
    report(filename, ntvar, aavar, lnt, laa)
    if legend:
        fig.legend((ntl, aal), ('Nucleotide', 'Amino acid'), 'upper right')
    if save:
        fig.savefig(directory + filename)
def axes(bottom, side, bound, fig, **kwargs):
    '''
    Create and return an Axes in `fig` sized for the requested decorations:
    `bottom` reserves room for (and labels) the x axis, `side` does the same
    for the y axis.  `bound` is [xmin, xmax, ymin, ymax].
    '''
    # Create the proper sized frame, depending on how we draw the plot.
    x = 0.09 if side else 0.02
    y = 0.09 if bottom else 0.04
    xs = [bound[0], bound[1] * 1.06]
    # Construct the axes.
    ax = fig.add_axes([x, y, 0.98 - x, 0.98 - y], xlim=xs, ylim=bound[2:])
    ax.minorticks_on()
    ax.tick_params(axis='x', which='minor', length=3)
    # Hide the unwanted axis lines (typically the top & right); label and
    # draw the wanted ones.  (.items() works on Python 2 and 3; the original
    # .iteritems() is Python 2 only.)
    for loc, spine in ax.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
            continue
        if loc == 'bottom':
            ax.xaxis.set_ticks_position('bottom')
            if bottom:
                ax.set_xlabel("Amino acids")
            continue
        if loc == 'left':
            if side:
                ax.set_ylabel("Sequence variance")
                ax.yaxis.set_ticks_position('left')
            else:
                spine.set_color('none')
                ax.tick_params('y', which='both', color='none',
                               labelcolor='none')
    # Dashed grid lines at each y tick, plus a solid baseline at y=0.
    ax.hlines(ax.get_yticks(), xs[0], xs[1], color='0.75',
              linestyle='dashed')
    ax.hlines(0, xs[0], xs[1], color='k', linestyle='solid')
    return ax
def draw(x, y, ax, color, **kwargs):
    '''
    Plot `y` against `x` on `ax` as a solid line of the given color and
    return the result of `ax.plot`.
    '''
    return ax.plot(x, y, color=color, linestyle='solid')
def models(starts, ends, counts, bound, ax, **kwargs):
    '''
    Draw the gene models as horizontal black bars stacked below y = 0
    (bound[2] is the negative lower y bound) and annotate each with its
    integer count to the right of the plot area.
    '''
    lb = bound[2]
    # One bar per model.  (The original computed an unused
    # `scale = bound[1] / max(ends)` here; removed as dead code.)
    ys = np.arange(1, len(starts) + 1) * lb / 3.0
    ax.hlines(ys, starts, ends, colors='k', lw=4, linestyle='solid')
    for i, c in enumerate(counts):
        ax.text(bound[1] + 10, lb / 3.0 * (i + 1.25), int(c))
def report(filename, ntvar, aavar, lnt, laa):
    '''
    Print a console summary for one plotted file: a centered banner with the
    file name followed by the average per-base and per-residue variance.
    '''
    banner = filename.center(80, '=')
    print(banner)
    print('Average variance: ')
    print('\t%f per base pair' % (sum(ntvar) / lnt))
    print('\t%f per amino acid' % (sum(aavar) / laa))
biotools-1.2.12/src/analysis/predict.py 0000644 0000765 0000024 00000016637 12241261376 021022 0 ustar atkassen staff 0000000 0000000 #!/usr/bin/env python
import biotools.IO as io
import biotools.BLAST as BLAST
import biotools.analysis.options as options
from biotools.sequence import Sequence, annotation as ann
from biotools.align import OptimalCTether as align
from biotools.translate import translate
from biotools.complement import complement
try:
import Queue as queue
except ImportError:
import queue
import threading
from os import sep, mkdir
PIPING = True
def ORFGenerator(sequ):
    '''
    Scans both strands of the given sequence and yields the longest
    subsequence that starts with a start codon and contains no stop codon
    other than the final codon.  Only ORFs of at least options.MIN_ORFLEN
    bases are yielded.
    '''
    comp = complement(sequ[::-1])
    seq = sequ.seq
    cseq = comp.seq
    slen = len(sequ)
    # Per-frame bookkeeping: indices 0-2 are the reverse-strand frames,
    # 3-5 the forward-strand frames.
    starts = [-1, 0, 1, -1, 0, 1]  # locations of start codons in each frame
    stops = [0, 1, 2, 0, 1, 2]     # locations of stop codons in each frame
    mlen = options.MIN_ORFLEN
    for i in range(slen - 2):
        fcodon, rcodon = seq[i:i + 3], cseq[i:i + 3]
        if fcodon in options.STOP_CODONS:
            # Yield only if an unconsumed start codon leaves an ORF of at
            # least mlen bases before this stop.
            if i - mlen >= starts[i % 3 + 3] >= stops[i % 3 + 3]:
                yield sequ[starts[i % 3 + 3]:i + 3]
            stops[i % 3 + 3] = i + 3
        elif fcodon in options.START_CODONS:
            # Remember only the first start codon after the last stop.
            if starts[i % 3 + 3] < stops[i % 3 + 3]:
                starts[i % 3 + 3] = i
        if rcodon in options.STOP_CODONS:
            if i - mlen >= starts[i % 3] >= stops[i % 3]:
                yield comp[starts[i % 3]:i + 3]
            stops[i % 3] = i + 3
        elif rcodon in options.START_CODONS:
            if starts[i % 3] < stops[i % 3]:
                starts[i % 3] = i
    # Plain return ends the generator.  The original `raise StopIteration()`
    # becomes a RuntimeError under PEP 479 (Python 3.7+).
    return
class ThreadQueue(queue.Queue):
    '''
    A Queue that lazily spawns worker threads: each put() starts a new
    thread running `target` until options.NUM_THREADS - 1 threads exist.
    '''

    def __init__(self, target):
        queue.Queue.__init__(self)
        self.threadcount = 0  # number of worker threads started so far
        self.target = target  # callable each worker thread runs

    def put(self, item):
        # The module lock serializes puts and makes the spawn-up-to-limit
        # check atomic; try/finally guarantees release (the original
        # acquire/release pair could leak the lock if put raised).
        options.lock.acquire()
        try:
            queue.Queue.put(self, item)
            if self.threadcount < options.NUM_THREADS - 1:
                thread = threading.Thread(target=self.target)
                thread.start()
                self.threadcount += 1
        finally:
            options.lock.release()
def GeneFromBLAST(db, sequences, pref, names):
    '''
    BLASTs database against sequences, and for those results that pass the
    length and percent identity requirements, attempt to locate the full gene
    that corresponds to that BLAST hit. Genes that are found are saved in the
    subdirectory sequences under the given directory, divided depending on
    whether the sequnece is amino acid or nucleotide.

    The path of the written fasta file is appended to `names`.
    '''
    # Local flag read by the nested target() workers: True while the BLAST
    # loop below is still feeding qin.
    PIPING = True
    wd = options.DIRECTORY + 'sequences' + sep
    for d in [options.DIRECTORY, wd]:
        try:
            mkdir(d)
        except OSError:
            pass
    subj = dict((s.name, s) for s in io.open(db, 'r'))
    options.debug("Database sequences loaded from file %s." % db)
    try:
        orfs = dict((s.name, [orf for orf in ORFGenerator(s)])
                    for s in io.open(sequences, 'r'))
        options.debug("ORFs loaded from file %s." % sequences)
    except IOError:
        options.debug("No file \"" + sequences + ",\" skipping.")
        return

    def target():
        # Worker: consume BLAST hits from qin, align candidate ORFs against
        # the subject and push the best-matching gene onto qout.
        while 1:
            try:
                res = qin.get(PIPING, 1)
            except queue.Empty:
                if not PIPING:
                    break
                else:
                    continue
            qname, sname = res['query']['name'], res['subject']['name']
            start, end = res['query']['start'], res['query']['end']
            alignments = []
            max_match = (options.MIN_IDENTITY, None)
            if subj[sname].type == 'nucl':
                subject = translate(subj[sname])
            else:
                subject = subj[sname]
            # BLAST may truncate/mangle the query name; strip trailing
            # characters until it matches a loaded ORF key.
            while qname:
                try:
                    o = orfs[qname]
                    break
                except KeyError:
                    qname = qname[:-1]
            if not qname:
                qin.task_done()
                continue
            for orf in o:
                if in_range(orf, start, end, res['frame']):
                    orf = orf[:-3]  # drop the stop codon before translating
                    query = translate(orf)
                    options.debug("Aligning %33s v. %33s." % (qname, sname))
                    alignment = align(subject.seq, query.seq)
                    alignments.append((orf, sname, alignment))
            for orf, refname, aln in alignments:
                hitlen = aln['sublength']
                region = orf[-3 * hitlen:]
                identity = float(aln['identities']) / aln['length']
                if identity >= max_match[0]:
                    max_match = (identity, (region, sname, aln))
            if max_match[1]:
                seq, name, _ = max_match[1]
                odl = subject.defline.split('[')[0].strip()
                src = seq.original.name
                start, end, strand = seq.start, seq.end, seq.step
                defline = '%s[source=%s] [start=%d] [end=%d] [strand=%d]' % \
                    (odl + (' ' if odl else ''), src, start, end, strand)
                new = Sequence(name.strip(), seq.seq, defline=defline,
                               original=seq.original, type=seq.type,
                               start=seq.start, end=seq.end, step=seq.step)
                qout.put(new)
            qin.task_done()

    def in_range(seq, start, end, frame):
        # True when the ORF overlaps the hit region, shares its reading
        # frame (modulo 3) and lies on the same strand as the hit.
        ss, se = sorted((seq.start, seq.end))
        os, oe = sorted((start, end))
        frame = int(frame)
        return (ss < oe and se > os and
                (se % 3 == oe % 3 or ss % 3 == oe % 3) and
                ((frame < 0 and seq.step < 0) or
                 (frame > 0 and seq.step > 0)))

    qout = queue.Queue()
    qin = ThreadQueue(target)
    blastopts = {
        'evalue': options.MAX_EVALUE,
        'num_threads': options.NUM_THREADS
    }
    # Feed hits that pass the e-value, identity and length filters.
    for res in BLAST.run(db, sequences, **blastopts):
        if float(res['expect']) > options.MAX_EVALUE:
            continue
        sbjl = len(subj[res['subject']['name']])
        ident = float(res['identities'].split('(')[1][:-2]) / 100
        lerr = float(res['subject']['length']) / sbjl
        if ident >= options.MIN_IDENTITY:
            if lerr >= (1.0 - options.LENGTH_ERR):
                qin.put(res)
    PIPING = False
    options.debug("BLAST done.")
    target()
    qin.join()
    options.debug("Done Aligning sequences.")

    # Drain qout, de-duplicating by sequence text, and write the fasta file.
    options.debug("Now writing sequences (%d)." % qout.qsize())
    seqs = {}
    nuc_file = io.open(wd + pref + '.fasta', 'w')
    count = 0
    while 1:
        try:
            seq = qout.get(False)
            if seq.seq not in seqs:
                seqs[seq.seq] = set()
            seqs[seq.seq].add(seq)
            nuc_file.write(seq)
            count += 1
            options.debug("Wrote %s (%d)." % (seq.name, count))
        except queue.Empty:
            break
    nuc_file.close()
    options.debug("Done Aligning sequences.")

    # One gff3 gene annotation per distinct sequence, listing homologs.
    gh = io.open(wd + pref + '.gff3', 'w')
    names.append(wd + pref + '.fasta')
    for id in seqs:
        gh.write(ann(seqs[id].copy().pop(), pref, 'gene',
                     homologs=','.join(s.name for s in seqs[id])))
    gh.close()
def run(subject, query, prefix, names):
    '''
    Convenience entry point: instantiating GeneFromBLAST executes the
    whole BLAST-then-align pipeline for one subject/query pair.
    '''
    GeneFromBLAST(subject, query, prefix, names)
if __name__ == '__main__':
    # Manual smoke test: restrict the start codons, then dump every ORF
    # found in the sequences of the file named on the command line.
    options.START_CODONS = ['TTG']
    import sys
    f = io.open(sys.argv[1], 'r')
    for seq in f:
        print(seq.name + ' ' + seq.defline)
        for orf in ORFGenerator(seq):
            print('%d ... %d' % (orf.start, orf.end))
biotools-1.2.12/src/analysis/renamer.py 0000644 0000765 0000024 00000004255 12241027703 021004 0 ustar atkassen staff 0000000 0000000 from __future__ import print_function
from os import sep, rename as mv
import biotools.IO as io
import biotools.analysis.options as options
# Python 2/3 input shim: raw_input was removed in Python 3.
try:
    get_input = raw_input
except NameError:  # was a bare except, which also hid unrelated errors
    get_input = input
def rename(direc, db, files):
    '''
    This isn't really for bioinformatics, this is more for the pipeline, to
    rename the files generated by cluster.py with a little human interaction.

    direc -- output directory containing the 'nt' and 'aa' subdirectories
    db    -- sequence database used to describe each clustered file
    files -- filenames (within direc/nt) to offer for renaming

    Returns the list of renamed .clustalw paths.
    '''
    names = []
    # sequence name -> Sequence for every entry in the source database
    seqdb = dict((s.name, s) for s in io.open(db, 'r'))
    nt_dir, aa_dir = direc + 'nt' + sep, direc + 'aa' + sep
    for f in files:
        # the first sequence's defline lists the ids this cluster covers
        seq = io.open(nt_dir + f, 'r').next()
        ids = seq.defline.split(', ')
        print("File\033[33;1m", f, \
            "\033[0mis described by the following sequences:")
        try:
            for id in ids:
                seqdb[id]  # raises KeyError when the id is unknown
                print("* " + seqdb[id].name + ': ' +
                      seqdb[id].defline.split('[')[0])
        except KeyError:
            # unknown id: report and skip this file entirely
            print("* (none)")
            continue
        pre = get_input("\033[33;1mWhat should we call this file " +
                        "(or hit enter to skip)? \033[0m")
        fpre = f[:f.find('.')]
        if pre != "":
            count = 0
            while True:
                # append " (n)" until an unused target name is found
                rpre = pre + ((" (%d)" % count) if count > 0 else "")
                try:
                    fh = open(nt_dir + rpre + ".fasta", 'r')
                    fh.close()
                    count += 1
                    continue
                except IOError:
                    nt_old, nt_new = nt_dir + fpre, nt_dir + rpre
                    aa_old, aa_new = aa_dir + fpre, aa_dir + rpre
                    print("Renaming " + fpre + ".* to " + rpre + ".*")
                    try:
                        mv(nt_old + ".fasta", nt_new + ".fasta")
                        mv(aa_old + ".fasta", aa_new + ".fasta")
                        mv(nt_old + ".clustalw", nt_new + ".clustalw")
                        mv(aa_old + ".clustalw", aa_new + ".clustalw")
                        names.append(nt_new + '.clustalw')
                        names.append(aa_new + '.clustalw')
                    except OSError:
                        # one of the four files was missing; leave as-is
                        pass
                    break
    return names
biotools-1.2.12/src/analysis/report.py 0000644 0000765 0000024 00000005575 11774635321 020707 0 ustar atkassen staff 0000000 0000000 import matplotlib.pyplot as plt
import biotools.analysis.plot as bap
from os import sep, mkdir
# report areas of high conservation or variation
def report(plotdata, **kwargs):
    # TODO: not implemented -- always returns None; plot() calls this and
    # ignores the result.
    pass
# wraps biotools.analysis.plot.plot()
def plot(plotdata, directory, bottom=True, side=True, legend=True,
         save=True, filename='untitled.pdf', upperbound=0.05, factor=21,
         fig=None, **kwargs):
    '''
    Draw the nucleotide and amino-acid sequence-variance curves for one
    gene, plus bars marking each gene model's extent.

    plotdata  -- {'nt': ..., 'aa': ...} dicts as produced by
                 variance.SaySNPs (keys 'var', 'starts', 'ends', 'count')
    directory -- output directory (created if missing)
    fig       -- optional matplotlib figure; a fresh one is created per
                 call. (The original used `fig=plt.figure(...)` as a
                 default argument, so every call drew into one shared
                 figure created at import time.)

    NOTE(review): the original body referenced `snpdata` (the parameter is
    `plotdata`), an unimported `np`, an undefined `smoothed`, and Python 2
    `iteritems`/`xrange`/print statements; all are repaired below.
    '''
    import numpy as np  # this module never imported numpy

    ranges = report(plotdata, **kwargs)  # currently always None
    try:
        mkdir(directory)
    except OSError:
        pass  # directory already exists
    if fig is None:
        fig = plt.figure(None, facecolor='w', edgecolor='w')
    lowerbound = -upperbound / 6
    # smooth the data (presumably bap.smoothed -- the original called a
    # bare `smoothed` that exists nowhere in this module; TODO confirm)
    snt = bap.smoothed(plotdata['nt']['var'], factor)
    lnt = len(plotdata['nt']['var'])
    saa = bap.smoothed(plotdata['aa']['var'], factor)
    laa = len(plotdata['aa']['var'])
    # generate x-ranges so that amino acids and nucleotides align
    xnt = np.arange(lnt) * (0.0 + laa) / lnt + 1
    xaa = np.arange(laa) + 1
    # create the proper sized frame, depending on how we draw the plot
    x = 0.09 if side else 0.02
    y = 0.09 if bottom else 0.04
    ax = fig.add_axes([x, y, 0.98 - x, 0.98 - y], xlim=[0, laa * 1.06],
                      ylim=[lowerbound, upperbound])
    ax.minorticks_on()
    ax.tick_params(axis='x', which='minor', length=3)
    for loc, spine in ax.spines.items():  # iteritems was Python 2 only
        if loc in ['right', 'top']:
            spine.set_color('none')
            continue
        if loc == 'bottom':
            ax.xaxis.set_ticks_position('bottom')
            if bottom:
                ax.set_xlabel("Amino acids")
            continue
        if loc == 'left':
            if side:
                ax.set_ylabel("Sequence variance")
                ax.yaxis.set_ticks_position('left')
            else:
                spine.set_color('none')
                ax.tick_params('y', which='both', color='none',
                               labelcolor='none')
    ax.hlines(ax.get_yticks(), 0, laa * 1.06, color='0.75',
              linestyle='dashed')
    ax.hlines(0, 0, laa * 1.06, color='k', linestyle='solid')
    nt_lines = ax.plot(xnt, snt, color='#0000ff', linestyle='solid')
    aa_lines = ax.plot(xaa, saa, color='#00ff00', linestyle='solid')
    # bars below the axis: one per gene model, labelled with its count
    starts = plotdata['aa']['starts']
    ends = plotdata['aa']['ends']
    counts = plotdata['aa']['count']
    ys = (np.arange(len(starts)) + 1) * lowerbound / 3
    ax.hlines(ys, starts, ends, colors='k', lw=4, linestyle='solid')
    for i, c in enumerate(counts):
        ax.text(laa + 10, lowerbound / 3 * (i + 1.25), c)
    if legend:
        fig.legend((nt_lines, aa_lines), ('Nucleotide', 'Amino acid'),
                   'upper right')
    if save:
        fig.savefig(directory + filename)
    print('=============', filename, '=============')
    print('Average variance: ')
    print('\t', sum(plotdata['nt']['var']) / lnt, 'per base pair')
    print('\t', sum(plotdata['aa']['var']) / laa, 'per amino acid')
biotools-1.2.12/src/analysis/run.py 0000644 0000765 0000024 00000002344 12241062127 020153 0 ustar atkassen staff 0000000 0000000 #!/usr/bin/env python
import biotools.analysis.predict as genepredict
import biotools.analysis.options as options
import threading
from os import sep
try:
import Queue as queue
except ImportError:
import queue
def run(infile, strains):
    '''
    Run several instances of `genepredict.run` at once.

    infile  -- query file handed to every prediction job
    strains -- iterable of strain file paths; one job is queued per strain

    Returns the (shared) list of output filenames appended to by the
    prediction jobs.
    '''
    q = queue.Queue()
    filenames = []

    def run_predict():
        # worker: drain the queue until it is empty
        while 1:
            try:
                strainf = q.get(False)
            except queue.Empty:
                break
            # strain name = basename without its extension (dotfiles with
            # no other dot keep their name)
            strain = strainf.split(sep)[-1]
            pos = strain.rfind('.')
            if pos > 1 or (pos == 1 and strain[0] != '.'):
                strain = strain[:pos]
            options.debug("Predicting for %s." % strain)
            try:
                genepredict.run(infile, strainf, strain, filenames)
            except RuntimeError:
                pass
            q.task_done()

    for strain in strains:
        q.put(strain)
    # NUM_PROCESSES - 1 helper threads; the main thread also works
    for i in range(options.NUM_PROCESSES - 1):
        curr = threading.Thread(target=run_predict)
        curr.start()
    run_predict()
    q.join()
    return filenames
if __name__ == "__main__":
    import sys
    try:
        # run() takes the query file and then the strain files; the
        # original passed a single list (run(sys.argv[1:])), which raised
        # an uncaught TypeError.
        run(sys.argv[1], sys.argv[2:])
    except IndexError:
        # no arguments supplied
        pass
biotools-1.2.12/src/analysis/variance.py 0000644 0000765 0000024 00000006223 12002354765 021146 0 ustar atkassen staff 0000000 0000000 #!/usr/bin/env python
import biotools.IO as io
import biotools.analysis.options as options
from biotools.translate import translate
from os import sep
def SaySNPs(input):
    '''
    Takes a clustalw alignment and will return a dictionary of data
    relevent to plotting the sequence variance for the sequences in the
    given clustalw alignment. These data are:
    * `var`: the measure of sequence variation,
    * `starts`: the starting positions for each gene model in amino acids,
    * `ends`: the ending positions for each gene model in amino acids, and
    * `count`: the number of sequences with a particular gene model.
    The values given in `starts`, `ends`, and `count` are sorted so that
    the nth element in starts corresponds to the nth value in ends and the
    nth value in count.
    '''
    catalogue = []  # per-column character -> occurrence count
    lengths = {}    # (start, end) gene model -> number of sequences
    for seq in io.open(input, 'r'):
        key = (seq.start, seq.end)
        lengths[key] = lengths.get(key, 0) + 1
        # xrange was Python 2 only
        for (i, c) in zip(range(key[1] - key[0] + 1), seq):
            if i >= len(catalogue):
                catalogue.append({})
            if c != " ":
                catalogue[i][c] = catalogue[i].get(c, 0) + 1
    calc = []
    for s in catalogue:
        # Simpson-style diversity per column: 1 - sum of squared
        # character frequencies (0 = perfectly conserved)
        tot = float(sum(s.values()))
        calc.append(1.0 - sum((s[c] / tot) ** 2 for c in s))
    llist = sorted(lengths.keys())
    return {
        'var': calc,
        'starts': [s for s, e in llist],
        'ends': [e for s, e in llist],
        'count': [lengths[k] for k in llist]
    }
def var(files):
    '''
    Returns plot data and metadata for plotting later on in the pipeline.

    Pairs each nucleotide alignment with its matching protein alignment
    (matched by having the same translated sequence content) and yields
    {'plotdata': ..., 'metadata': ...} dictionaries, one per pair.
    Raises TypeError when an alignment has no partner.
    '''
    def trim(filename):
        # strip a clustal-style extension, if any. (The original lambda
        # tested a stale loop variable `f` instead of its own argument,
        # so the wrong file's extension decided whether to trim.)
        if filename.endswith(('.clustalw', '.clustal', '.aln')):
            return '.'.join(filename.split('.')[:-1])
        return filename

    sort = {}
    for f in files:
        seqs = [s for s in io.open(f)]
        type = set(s.type for s in seqs)
        if len(type) > 1:
            # mixed content: treat the file as protein
            type = set(['prot'])
        fid = (type.pop(), f)
        # compare gap-free, translated content so nt and aa files of the
        # same gene hash to the same key
        seqs = [''.join(s.seq.split('-')).strip() for s in seqs]
        seqs = [translate(s) if fid[0] == 'nucl' else s for s in seqs]
        sset = frozenset(seqs)
        srtr = (len(seqs), sset)
        sort[srtr] = sort.get(srtr, set()) | set([fid])
    couples = []
    for partners in sort.values():
        names = ', '.join(set(trim(f.split(sep)[-1]) for type, f in partners))
        pair = {}
        for type, f in partners:
            if len(pair) == 2:
                break
            if type in pair:
                continue
            pair[type] = f
        if 0 < len(pair) < 2:
            raise TypeError("Unmatched clustal alignment(s): " +
                            ", ".join(f for type, f in partners))
        if len(pair) == 0:
            continue
        couples.append((pair['nucl'], pair['prot'], names))
    for nt, aa, strain in couples:
        plotdata = {
            'nt': SaySNPs(nt),
            'aa': SaySNPs(aa)
        }
        metadata = {'strain': strain, 'filename': strain + '.pdf'}
        yield {'plotdata': plotdata, 'metadata': metadata}
    # generator simply ends here; `raise StopIteration` became a
    # RuntimeError under PEP 479 (Python 3.7+)
biotools-1.2.12/src/annotation.py 0000644 0000765 0000024 00000010345 12013524553 017701 0 ustar atkassen staff 0000000 0000000 '''
This module is used to create annotation files (currently, only GFF files).
The annotations can be used to create a heirarchy among the annotations (e.g.,
genes contain exons, introns, ... etc.).
'''
class Annotation(object):
    '''
    An object to help with reading and writing GFF files.
    '''
    # class-wide counter used to label annotations with no identifier
    unknowns = 0

    def __init__(self, ref, src, type, start, end, score, strand, phase,
                 attr, name_token='ID', gff_token='='):
        '''
        Constructs an `Annotation` object with the necessary values. The
        parameters are passed in the same order as the columns from a GFF
        (version 3) file and the name_token and gff_token parameters are the
        defaults for a gff version 3 file from phytozome. Just write (e.g.)

        ```python
        Annotation(*line.split('\\t')) #(splitting on tabs)
        ```

        and the rest of the work will be done for you. Other sources may
        require changes to `name_tokens` and `gff_token`.

        Instantiating an `Annotation` will generate for it an id of the form
        *SEQNAME*_*TYPE*[START:END], where *SEQNAME* is the name of the
        sequence (column 1) from the GFF file, and type is like 'gene' or
        'CDS'. If no *SEQNAME* is provided, then `X` be used in its place, and
        if no identifier can be found in the attributes, the `Annotation` will
        generate an identifier for itself in the form of `unknown #`.
        '''
        def parse_attrs(attr, keyvalsep='=', attrsep=';'):
            '''
            Creates a dictionary from the atrributes (9th column) of a gff
            file. By default, key-value separator (`keyvalsep`) is `=`, which
            is the separator used in gff version 3.

            In other words, `attr` `"a=b;c=d;"` and `keyvalsep` `=` will
            yield the dictionary `{'a':'b','c':'d'}`. The other separator
            (`attrsep`) separates individual attributes and defaults to ';',
            which is also the norm in GFF files.
            '''
            attributes = {}
            # fall back to space-separated key/value pairs (GFF2 style)
            if keyvalsep not in attr:
                keyvalsep = ' '
            l = len(keyvalsep)
            attrs = [a.strip() for a in attr.strip().split(attrsep)]
            for attribute in attrs:
                pos = attribute.find(keyvalsep)
                if pos > -1:
                    var, val = attribute[:pos], attribute[pos + l:]
                    # repeated keys accumulate, joined by commas below
                    attributes[var] = attributes.get(var, []) + [val]
            for key in attributes:
                attributes[key] = ','.join(attributes[key])
            return attributes

        start, end = int(start), int(end)
        self.strand = strand
        self.type = type
        self.source = src
        self.seq = ref
        # normalize so start <= end regardless of the input order
        self.start = min(start, end)
        self.end = max(end, start)
        self.attr = parse_attrs(attr, gff_token)
        self.phase = phase
        self.score = score
        self.ntoken = name_token
        self.id = ((self.seq or 'X') + '_' + self.type +
                   "[%d:%d]" % (self.start, self.end))
        try:
            self.name = self.attr[name_token]
        except KeyError:
            # no identifier attribute: synthesize a unique name
            Annotation.unknowns += 1
            self.name = "unknown %d" % Annotation.unknowns
        self.parent = None
        self.children = []

    '''
    Some things that you can do to `Annotation` objects:
    * `len(annotation)` => length of the annotation (`end-start+1`)
    * `dictionary[annotation]` => store annotations as keys of a dictionary or
    as elements in a set
    * `annA == annB` => compare two Annotations, they are the same if they have
    the same id.
    * `print annotation` => prints the annotation as a line of a GFF version 3
    file.
    '''
    def __len__(self):
        return max(self.start, self.end) - min(self.end, self.start) + 1

    def __hash__(self):
        # hash on the generated id so equal annotations collide
        return self.id.__hash__()

    def __eq__(self, other):
        try:
            return self.id == other.id
        except AttributeError:
            return False

    def __str__(self):
        # one tab-separated GFF3 record (no trailing newline)
        return '\t'.join((self.seq, self.source,
                          self.type, str(self.start), str(self.end), self.score,
                          self.strand, str(self.phase),
                          ';'.join(k + '=' + self.attr[k] for k in self.attr)))
biotools-1.2.12/src/BLAST.py 0000644 0000765 0000024 00000023654 12241022662 016400 0 ustar atkassen staff 0000000 0000000 #!/usr/bin/env python
'''
A module to manage BLAST databases and interface with the BLAST+ standalone
program available from NCBI.
'''
import biotools.IO as io
import subprocess
from os import sep, getenv, listdir
import shutil
def run(db, sfile, mega_blast=False, **kwargs):
    '''
    Takes a database and a query and runs the appropriate type of BLAST on
    them. The database can be an existing BLAST database or a fasta/fastq
    file. If it is a sequence file, this function will look in the places
    where BLAST would look for an existing database created from that file
    and use that instead. If there is no such database, this function will
    make one for you and then use the newly created database with BLAST.

    Optional named arguments can currently only be `evalue`, `num_threads`,
    `gapopen`, or `gapextend`. They correspond to the BLAST options of the
    same name. Returns a `Result` iterator over the hits.
    '''
    # query type x database type -> BLAST+ program name
    cmds = {
        'prot': {
            'prot': 'blastp',
            'nucl': 'tblastn'
        },
        'nucl': {
            'nucl': 'blastn',
            'prot': 'blastx'
        }
    }
    seq = io.open(sfile, 'r').next()
    qtype = seq.type
    # pick up a BLASTDB path from any .ncbirc along the search path
    rcloc = ''
    for loc in (".:~:" + (getenv("NCBI") or "")).split(':'):
        if loc and loc[-1] != sep:  # was ==, so the separator was never added
            loc += sep
        try:
            for line in (l.strip() for l in open(loc + '.ncbirc', 'r')):
                pos = line.find('=')
                if pos >= 0 and line[:pos].strip() == "BLASTDB":
                    rcloc = line[pos + 1:].strip()
        except IOError:
            pass
    dbtype = None
    bdbenv = getenv("BLASTDB")
    dblocations = (":." + ((':' + bdbenv) if bdbenv else '') +
                   ((':' + rcloc) if rcloc else '')).split(':')
    # 1) is db already a formatted BLAST database? (.pin/.nin index file)
    for loc in dblocations:
        if loc and loc[-1] != sep:
            loc += sep
        try:
            open(loc + db + '.pin', 'r')
            dbtype = 'prot'
            break
        except IOError:
            try:
                open(loc + db + '.nin', 'r')
                dbtype = 'nucl'
                break
            except IOError:
                pass
    if not dbtype:
        # 2) treat db as a sequence file and find or build a database
        odb = db
        pos = db.rfind(".")
        for seq in io.open(db, 'r'):
            dbtype = seq.type
            break
        if not dbtype:
            # unreadable or empty sequence file
            raise IOError("Database not found: " + odb)
        ndb = None
        sp = db.rfind(sep)
        if sp > -1:
            dbdir, db = db[:sp], db[sp + 1:pos]
        else:
            dbdir, db = '.', db[:pos]
        # scan the directory for an index whose embedded source filename
        # matches odb
        for file in listdir(dbdir):
            dpos = file.rfind('.')
            if dpos >= 0 and file[dpos + 1:] == dbtype[0] + 'in':
                # the index stores a length-prefixed source name; read in
                # binary so Python 3 does not attempt to decode raw bytes
                fh = open(dbdir + sep + file, 'rb')
                c = fh.read(12)[-1]
                if not isinstance(c, int):  # Python 2: indexing gives str
                    c = ord(c)
                fname = fh.read(c)
                if not isinstance(fname, str):
                    fname = fname.decode('utf-8', 'replace')
                if fname[0] in ("'", '"'):
                    fname = fname[1:-1]
                if fname.endswith(odb):
                    ndb = dbdir + sep + file[:dpos]
                    break
        if not ndb:
            ndb = '_'.join(db.split())
            try:
                ignore = open('/dev/null', 'w')
            except IOError:
                ignore = open('nul', 'w')  # Windows null device
            try:  # possible race condition
                open(ndb, 'r').close()
            except IOError:
                subprocess.call(["makeblastdb", "-in", '"%s"' % odb,
                                 "-out", ndb, "-dbtype", dbtype],
                                stdout=ignore)
                try:
                    for suff in ['in', 'hr', 'sq']:
                        name = ndb + '.' + dbtype[0] + suff
                        shutil.move(name, dbdir + sep + name)
                except shutil.Error:
                    pass
            db = dbdir + sep + ndb
        else:
            db = ndb
    allowed = set(["evalue", "gapopen", "gapextend", "num_threads"]) & \
        set(kwargs.keys())
    cmd = cmds[qtype][dbtype]
    pn = ["-db", "-query"]
    if mega_blast:
        cmd = "megablast"
        pn = ["-d", "-i"]
        allowed = ["e", "a"]
    args = [cmd, pn[0], db, pn[1], sfile] + [arg for pair in
            [["-" + k, str(kwargs[k])] for k in allowed] for arg in pair]
    # text mode: the '' sentinel below only matches str lines, not bytes
    proc = subprocess.Popen(args, bufsize=1, stdout=subprocess.PIPE,
                            universal_newlines=True)
    return Result(iter(proc.stdout.readline, ''))
class Result(object):
    '''
    A class which takes the raw output from BLAST and generates
    dictionaries from the data from BLAST. This data includes the
    alignment, percent identity, gaps, e-value, score, length of subject,
    length of query, and start and stop positions for both sequences. This
    class should be used in a for loop like so:

    ```python
    for res in Result(file_or_data):
        pass
    ```

    The class instance has a single other property, `headers`, which are
    the lines in BLAST results before the BLAST hits (e.g., citation
    info, etc.); it is populated once iteration reaches the first query.
    '''

    def __init__(self, file):
        # `file` may be a path, the raw report text, or an iterable of
        # lines; __iter__ works out which at iteration time
        self.file = file
        self.headers = []

    def __iter__(self):
        try:
            ipt = open(self.file, 'r')
        except (IOError, TypeError):
            try:
                # raw report text: iterate its lines
                ipt = self.file.split('\n')
            except:
                # already an iterable of lines
                ipt = self.file
        mode = 0
        headers = []
        curr = None
        length = 0

        def sh(sn, qn, l):
            # start a fresh per-hit record; the query name and defline are
            # split on the first space
            qdl = ''
            space = qn.find(' ')
            if space > -1:
                qn, qdl = qn[:space], qn[space + 1:].lstrip()
            return {
                'subject': {
                    'name': sn.lstrip(),
                    'defline': '',
                    'start': None,
                    'end': None,
                    'sequence': ''
                },
                'query': {
                    'name': qn,
                    'defline': qdl,
                    'start': None,
                    'end': None,
                    'sequence': ''
                },
                'length': l
            }

        def ra(sh):
            # finalize a record: coerce endpoints to ints, derive lengths
            for res in ('subject', 'query'):
                sh[res]['start'] = int(sh[res]['start'])
                sh[res]['end'] = int(sh[res]['end'])
                sh[res]['length'] = abs(sh[res]['end'] - sh[res]['start'] + 1)
            return sh

        def sh_fmt(l):
            # parse "Key = value, Key = value" (or "Key: value") pairs
            # into lower-cased keys of the current record
            for pairs in (a.strip() for a in l.split(',')):
                l, r = tuple(a.strip() for a in (pairs.split('=')[:2]
                             if '=' in pairs else pairs.split(':')[:2]))
                subheaders[l.lower().split('(')[0]] = r

        # modes: 0 = preamble, 1/2 = query header, 3 = subject header,
        # 4 = hit statistics, 5 = alignment body
        for line in ipt:
            line = line.rstrip('\n').lstrip()
            if not line:
                if mode == 4:
                    mode = 5
                continue
            if mode == 0:
                if line[:6] == 'Query=':
                    mode = 1
                    qname = line[6:].lstrip()
                    self.headers = headers
                else:
                    headers.append(line)
            elif mode == 1:
                if line[0] == '>':
                    mode = 3
                    subheaders = sh(line[1:], qname, length)
                elif line[:7] == 'Length=':
                    length = int(''.join(line[7:].strip().split(',')))
                    mode = 2
                elif line[0] == '(' and line.endswith('letters)'):
                    # legacy "(N letters)" length format
                    length = int(''.join(line[1:-8].strip().split(',')))
                    mode = 2
                elif line[:6] == 'Query=':
                    qname = line[6:].lstrip()
                else:
                    # long query names wrap across lines
                    qname += line
            elif mode == 2:
                if line[0] == '>':
                    mode = 3
                    subheaders = sh(line[1:], qname, length)
                elif line[:6] == 'Query=':
                    qname = line[6:].lstrip()
                    mode = 1
            elif mode == 3:
                if line[:5] == 'Score':
                    snm = subheaders['subject']['name']
                    defline = ''
                    space = snm.find(' ')
                    if space > -1:
                        snm, defline = snm[:space], snm[space + 1:]
                    subheaders['subject']['name'] = snm
                    subheaders['subject']['defline'] = defline
                    sh_fmt(line)
                    mode = 4
                elif line[:7] == 'Length=':
                    pass
                elif line[0] == '(' and line.endswith('letters)'):
                    pass
                else:
                    # long subject names wrap across lines
                    subheaders['subject']['name'] += line
            elif mode == 4:
                sh_fmt(line)
            elif mode == 5:
                if line[:6] == 'Query=':
                    mode = 1
                    qname = line[6:].lstrip()
                    yield ra(subheaders)
                    continue
                elif line[0] == '>':
                    yield ra(subheaders)
                    subheaders = sh(line[1:], qname, length)
                    mode = 3
                    continue
                elif line[:5] == 'Score':
                    # another HSP for the same subject
                    yield ra(subheaders)
                    subheaders = sh(subheaders['subject']['name'], qname,
                                    length)
                    sh_fmt(line)
                    mode = 4
                    continue
                elif line[:5] == 'Sbjct':
                    curr = 'subject'
                elif line[:5] == 'Query':
                    curr = 'query'
                else:
                    continue
                _, start, seq, end = line.split()
                subheaders[curr]['start'] = subheaders[curr]['start'] or start
                subheaders[curr]['end'] = end
                subheaders[curr]['sequence'] += seq
        try:
            yield ra(subheaders)
        except UnboundLocalError:
            # no hits were parsed at all
            pass
        # PEP 479: simply return; the original `raise StopIteration()`
        # becomes a RuntimeError on Python 3.7+.
if __name__ == '__main__':
    import sys
    # Manual test: parse a saved BLAST report and print each hit record.
    if len(sys.argv) > 1:
        output = open(sys.argv[1]).read()
        for result in Result(output):
            print(result)
biotools-1.2.12/src/clustal.py 0000644 0000765 0000024 00000003234 11774634454 017214 0 ustar atkassen staff 0000000 0000000 #!/usr/bin/env python
import biotools.IO as io
import subprocess
from os import remove
def run(infile, outfile, **kwargs):
    '''
    Align the sequences in `infile` with clustalw, writing the alignment
    to `outfile`. Keyword options (case-insensitive: OUTORDER, GAPOPEN,
    GAPEXT, and DNAMATRIX or MATRIX) override clustalw's defaults; the
    alignment is skipped entirely when infile holds fewer than two
    sequences.
    '''
    n = 0
    seqtype = None
    for seq in io.open(infile, 'r'):
        n += 1
        if n > 1:
            # file type is decided by the second sequence, as before
            seqtype = seq.type
            break
    if n > 1:
        try:
            ignore = open('/dev/null', 'w')
        except IOError:
            ignore = open('nul', 'w')  # Windows null device
        # nucleotide and protein runs differ only in the matrix option
        # name and the -TYPE flag
        if seqtype == 'nucl':
            recognized = {
                'OUTORDER': 'ALIGNED',
                'GAPOPEN': '10',
                'GAPEXT': '0.1',
                'DNAMATRIX': 'IUB'
            }
            typearg = "-TYPE=DNA"
        else:
            recognized = {
                'OUTORDER': 'ALIGNED',
                'GAPOPEN': '10',
                'GAPEXT': '0.1',
                'MATRIX': 'BLOSUM'
            }
            typearg = "-TYPE=PROTEIN"
        # honor caller-supplied values regardless of case (the original
        # looked the uppercased name back up in kwargs, silently using
        # the default unless the caller happened to use uppercase keys)
        upcased = dict((k.upper(), v) for k, v in kwargs.items())
        others = ["-%s=%s" % (k, upcased[k])
                  for k in set(upcased) & set(recognized)]
        subprocess.call(["clustalw", "-INFILE=" + infile, "-ALIGN",
                         typearg, "-OUTFILE=" + outfile] + others,
                        stdout=ignore)
    pos = infile.rfind('.')
    if pos > -1:
        prefix = infile[:pos]
    else:
        prefix = infile
    try:
        # clustalw leaves a guide-tree file next to the input; tidy it up
        remove(prefix + '.dnd')
    except OSError:
        # no tree was produced (e.g. the alignment was skipped), so an
        # unconditional remove would have raised
        pass
biotools-1.2.12/src/complement.py 0000644 0000765 0000024 00000002644 12033603424 017672 0 ustar atkassen staff 0000000 0000000 #!/usr/bin/env python
from biotools.sequence import isprot
# Complement lookup tables, one per nucleic-acid alphabet. The ambiguity
# codes R/Y are swapped; gaps and spaces map to themselves. Characters
# absent from a table become 'N' in complement() below.
_ref = {
    'DNA': {
        'A': 'T', 'T': 'A', 'a': 't', 't': 'a',
        'C': 'G', 'G': 'C', 'c': 'g', 'g': 'c',
        'R': 'Y', 'Y': 'R', 'r': 'y', 'y': 'r',
        ' ': ' ', '-': '-'
    },
    'RNA': {
        'A': 'U', 'U': 'A', 'a': 'u', 'u': 'a',
        'C': 'G', 'G': 'C', 'c': 'g', 'g': 'c',
        'R': 'Y', 'Y': 'R', 'r': 'y', 'y': 'r',
        ' ': ' ', '-': '-'
    }
}
def complement(s):
    '''
    Creates the complement of a sequence, which can then be reversed by
    using `seq[::-1]`, if it needs to be reversed. This function accepts
    either `Sequence`s or strings; proteins are returned unchanged.
    '''
    if isprot(s):
        return s
    has_u = 'U' in s or 'u' in s
    has_t = 'T' in s or 't' in s
    # RNA only when U appears without T; every other case -- including the
    # ambiguous U+T mixture -- falls back to DNA. (The original spelled
    # this as three branches, two of which selected the same table.)
    repl = _ref['RNA'] if (has_u and not has_t) else _ref['DNA']
    value = ''.join(repl.get(c, 'N') for c in s)
    try:
        # Sequence-like input: rebuild with the same coordinate metadata
        return s.__class__("complement(%s)" % s.name, value,
                           original=s.original, start=s.start,
                           end=s.end, step=s.step, qual=s.qual)
    except (AttributeError, TypeError):
        # plain string (or any type without those attributes)
        return s.__class__(value)
if __name__ == '__main__':
    # smoke tests: DNA and RNA complements
    assert complement('ATCGTAGCTGATCGAT') == 'TAGCATCGACTAGCTA'
    assert complement('AUCGUAGCUGAUCGAU') == 'UAGCAUCGACUAGCUA'
    # mixed case and gaps are preserved
    print(complement('AUCgu--cuGAUCGAU'))
biotools-1.2.12/src/IO/ 0000755 0000765 0000024 00000000000 12241266112 015456 5 ustar atkassen staff 0000000 0000000 biotools-1.2.12/src/IO/__init__.py 0000644 0000765 0000024 00000011227 12017447050 017575 0 ustar atkassen staff 0000000 0000000 '''
A module for reading and writing to sequence and annotation files. Currently
supported file types are: FASTA, FASTQ, CLUSTAL alignments, and GFF3 files.
'''
from biotools.IO import fasta, fastq, gff, clustal
from biotools.IO.manager import IOManager
try:
import __builtin__
except ImportError:
import builtins as __builtin__
def get_methods():
    '''
    Build the format -> {hook name -> callable} table consumed by the
    IOManager, substituting a do-nothing iterator factory for any hook a
    format module does not define.
    '''
    hook_names = ('read', 'write', 'rhook', 'whook', 'probe')
    fallback = lambda *ignored: iter([])
    table = {}
    for module in (fasta, fastq, gff, clustal):
        fmt = module.__name__.split('.')[-1]
        table[fmt] = dict((hook, module.__dict__.get(hook, fallback))
                          for hook in hook_names)
    return table
class IOBase(object):
    '''
    Generic IO class for sequence files: detects the on-disk format and
    exposes the matching parser hooks through `self.method`.
    '''
    methods = IOManager(get_methods())

    def __init__(self, name, mode):
        '''
        Opens file name with mode mode. This function will attempt to guess
        at the filetype by 1. looking at the file extension and failing
        that, will 2. read the first few lines to determine the file type.

        Recognized file extensions include fa, fsa, fas, fasta, fastq,
        clustalw, clustal, aln, gff, gff3.
        '''
        self.file = name
        self.handle = __builtin__.open(name, mode)
        self.method = self.methods.default
        self.type = None
        # filename extension -> format name understood by self.format()
        self.suffixes = {
            'fsa': 'fasta',
            'fa': 'fasta',
            'fs': 'fasta',
            'fas': 'fasta',
            'fna': 'fasta',
            'fasta': 'fasta',
            'clustalw': 'clustal',
            'clustal': 'clustal',
            'aln': 'clustal',
            'fastq': 'fastq',
            'gff': 'gff',
            'gff3': 'gff'
        }
        p = name.rfind('.')
        if p > -1:
            ext = name[p + 1:]
            if ext in self.suffixes:
                try:
                    self.format(self.suffixes[ext])
                    return
                except ValueError:
                    # extension lied about the content; fall through to
                    # probing every format
                    pass
        # Unknown or misleading extension: try each registered parser
        # until one accepts the content. (The original wrapped this loop
        # in a no-op ``try: ... except IOError: raise``.)
        for method in IOBase.methods:
            try:
                self.format(method)
                return
            except ValueError:
                pass

    def format(self, fmt):
        '''
        Forces a file to be parsed as a particular format. By default, the
        values for fmt can be any recognized format. Raises ValueError
        when the file content is rejected by that format's probe.
        '''
        if fmt in self.methods:
            method = self.methods[fmt]
            # probe() returns a dict of attributes (e.g. type, phred) to
            # copy onto this instance, or a falsy value on rejection
            ret = method['probe'](__builtin__.open(self.file, 'r'))
            if ret:
                self.method = method
                for key in ret:
                    object.__setattr__(self, key, ret[key])
                return self
            else:
                raise ValueError("File cannot be parsed as type %s." % fmt)
        # unrecognized format name: fall back to the do-nothing methods
        self.method = self.methods.default
        return self

    def close(self):
        '''
        Close the file handle.
        '''
        self.handle.close()
class Reader(IOBase):
    '''
    A class that wraps IOBase and restricts the ability to write.
    '''

    def __init__(self, filename, mode='r'):
        IOBase.__init__(self, filename, mode)
        # let the format run its pre-read hook, then build the entry
        # generator
        self.method['rhook'](self.handle)
        self.iter = self.method['read'](self.handle)

    def read(self, n=None):
        '''
        If `n` is provided, the next (up to) `n` entries are parsed and
        returned. Otherwise, all remaining entries are parsed and returned.
        '''
        if n is None:
            return [s for s in self]
        # next()/range() replace the Python 2-only .next()/xrange()
        return [next(self.iter) for i in range(int(n))]

    def __iter__(self):
        return self.iter

    def next(self):
        '''
        Reads a single entry in the file and returns it.
        '''
        try:
            return self.read(1)[0]
        except (StopIteration, ValueError, IndexError):
            raise StopIteration()

    # Python 3 iterator protocol alias
    __next__ = next
class Writer(IOBase):
    '''
    A class that wraps IOBase and restricts the ability to read.
    '''

    def __init__(self, filename, mode='w'):
        IOBase.__init__(self, filename, mode)
        # the format header (whook) must be emitted exactly once, before
        # the first entry is written
        self.haswritten = False

    def write(self, sequence):
        '''
        Writes sequence as the correct format to the file.
        '''
        if not self.haswritten:
            self.method['whook'](self.handle)
            self.haswritten = True
        self.method['write'](self.handle, sequence)
def open(filename, mode='r'):
    '''
    Open a file for parsing or creation. Returns either a Reader or
    Writer object, depending on the open mode ('r' to read, 'w' or 'a'
    to write/append); any other mode yields None.
    '''
    if mode in ('w', 'a'):
        return Writer(filename, mode=mode)
    if mode == 'r':
        return Reader(filename)
biotools-1.2.12/src/IO/clustal.py 0000644 0000765 0000024 00000002272 12002100576 017476 0 ustar atkassen staff 0000000 0000000 '''
Methods for manipulating clustalw alignment files.
'''
from biotools.sequence import Sequence
def read(fh):
    '''
    Parse a CLUSTAL alignment: each sequence row is "<name> <chunk>";
    chunks belonging to the same name are concatenated. Leading/trailing
    gap runs are converted to spaces and reported as start/end offsets on
    the yielded `Sequence` objects.
    '''
    def clean_alignment(x):
        # count the leading and trailing '-' runs
        i = 0
        for c in x:
            if c != "-":
                break
            i += 1
        j = 0
        for c in reversed(x):
            if c != "-":
                break
            j += 1
        # replace the gap runs with spaces; (i or None)/(-j or None)
        # keeps the slice correct when a run is empty
        return (' ' * i + x[(i or None):(-j or None)] + ' ' * j, i, len(x) - j)

    seqs = {}
    for line in fh:
        if line.startswith(' '):
            # conservation/ruler line, not a sequence row
            continue
        st = line.strip()
        if st:
            bits = st.split()
            if len(bits) != 2:
                continue
            if bits[0] not in seqs:
                seqs[bits[0]] = ''
            seqs[bits[0]] += bits[1]
    for k in seqs:
        seq, start, end = clean_alignment(seqs[k])
        yield Sequence(k, seq, start=start, end=end)
    # PEP 479: simply return; the original raised StopIteration, which is
    # a RuntimeError on Python 3.7+.
def probe(fh):
    '''
    Decide whether fh looks like a CLUSTAL alignment: the first non-empty
    line must start with "CLUSTAL". An entirely empty file is accepted as
    clustalw; anything else yields False.
    '''
    first = next((l.strip() for l in fh if l.strip()), None)
    if first is None or first.startswith('CLUSTAL'):
        return {'type': 'clustalw'}
    return False
def rhook(fh):
    '''
    Read hook: consume the CLUSTAL header line before parsing begins; an
    empty file is tolerated. (The original called the Python 2-only
    `fh.next()`, which raises AttributeError on Python 3 iterators.)
    '''
    try:
        next(fh)
    except StopIteration:
        pass
biotools-1.2.12/src/IO/fasta.py 0000644 0000765 0000024 00000003327 12017447011 017133 0 ustar atkassen staff 0000000 0000000 '''
Functions for manipulating FASTA files.
'''
from biotools.sequence import Sequence, chop
def read(fh):
    '''
    Read sequences in FASTA format; identifiers (names and definition
    lines) are on lines that begin with carets ('>') and sequence is on
    lines that intervene between the carets. This function is a generator
    that yields `Sequence` objects.
    '''
    name, defline, seq = '', '', ''
    for line in fh:
        line = line.strip()
        if not line:
            continue
        if line[0] == '>':
            # flush the previous record, if any
            if name or seq:
                yield Sequence(name, seq, defline=defline)
            seq = ''
            # name = first word after '>'; rest of the line is the defline
            name = line[1:].split()[0]
            defline = line[1 + len(name):].strip()
            continue
        seq += line
    if name or seq:
        yield Sequence(name, seq, defline=defline)
    # PEP 479: simply return; the original raised StopIteration, which is
    # a RuntimeError on Python 3.7+.
def write(fh, s):
    '''
    Write one sequence in FASTA format, i.e.,

    ```
    >name defline
    sequence ...
    ```

    The sequence body is wrapped to 70 characters per line.
    '''
    header = '>%s %s\n' % (s.name, s.defline)
    body = '\n'.join(chop(s.seq, 70))
    fh.write(header + body + '\n')
def probe(fh):
    '''
    Decide whether fh is a FASTA file: the first non-empty line must
    begin with a caret ('>'). Returns a dictionary describing the format
    on success (also for an entirely empty file) and False otherwise; the
    handle is closed in every case.
    '''
    try:
        for line in fh:
            stripped = line.strip()
            if stripped:
                return {'type': 'fasta'} if stripped[0] == '>' else False
        return {'type': 'fasta'}
    finally:
        fh.close()
biotools-1.2.12/src/IO/fastq.py 0000644 0000765 0000024 00000004646 11774366315 017200 0 ustar atkassen staff 0000000 0000000 '''
Functions for manipulating FASTQ files.
'''
from biotools.sequence import Sequence
def read(fh, phred=32):
    '''
    Read sequences in FASTQ format; identifiers are on lines that begin
    with at symbols ('@'), sequence follows on the next line, then a '+'
    separator line, and finally the quality scores. Scores are decoded
    into a list of integers using the given Phred offset (default 32;
    NOTE(review): probe() detects 32 vs 64 but the IO manager calls this
    with only the handle, so the detected value is not forwarded --
    confirm before relying on qual values).

    The original body referenced an undefined `f` and `self.phred`, so it
    crashed with NameError as soon as a record header was seen.
    '''
    while 1:
        try:
            line = next(fh).strip()
        except StopIteration:
            fh.close()
            return
        if not line:
            # skip blank lines instead of crashing on line[0]
            continue
        if line[0] == '@':
            name = line[1:].split()[0]
            defline = line[1 + len(name):].strip()
            try:
                seq = next(fh).strip()
                next(fh)  # '+' separator line
                qual = [ord(c) - phred for c in next(fh).strip()]
            except StopIteration:
                # truncated record at end of file
                fh.close()
                return
            yield Sequence(name, seq, qual=qual, defline=defline)
def write(fh, s):
    '''
    Write one sequence in FASTQ format, i.e.,

    ```
    @name defline
    sequence
    +
    quality scores
    ```

    Quality scores are encoded with an offset of ord('A') - 1 = 64
    (Phred64-style; the original attempted `q + chr('A') - 1`, which is a
    TypeError -- TODO confirm the intended offset against probe()). The
    original also appended a second newline, producing blank lines that
    the FASTQ reader cannot parse.
    '''
    encoded = ''.join(chr(q + ord('A') - 1) for q in s.qual)
    fh.write('@%s %s\n%s\n+\n%s\n' % (s.name, s.defline, s.seq, encoded))
def probe(fh):
    '''
    Probe a file to determine whether or not it is a FASTQ file. The
    first non-empty line must begin with an at sign ('@'); the quality
    line three lines later decides the Phred offset: 64 when every score
    character is 'A' (64+1) or above, 32 otherwise. Returns False when
    the file is not FASTQ, and a dictionary with the type and phred
    offset when it is.

    Fixes over the original: `.next()` (Python 2 only) was called on a
    handle that had already been closed; a truncated record crashed with
    StopIteration; the empty-file return misspelled the key as 'phread'.
    '''
    for line in fh:
        st = line.strip()
        if not st:
            continue
        if st[0] != '@':
            fh.close()
            return False
        try:
            next(fh)                                  # sequence line
            next(fh)                                  # '+' separator
            qual = [ord(c) for c in next(fh).strip()]  # quality line
        except StopIteration:
            fh.close()
            return False
        phred = 32 if qual and min(qual) < ord('A') else 64
        fh.close()
        return {'type': 'fastq', 'phred': phred}
    fh.close()
    return {'type': 'fastq', 'phred': 64}
biotools-1.2.12/src/IO/gff.py 0000644 0000765 0000024 00000001124 12013525676 016602 0 ustar atkassen staff 0000000 0000000 from biotools.annotation import Annotation
def read(fh):
    '''
    Yield an Annotation for every non-comment line of a GFF file. Fields
    are split on tabs; note the final field keeps its trailing newline,
    which Annotation's attribute parser strips. NOTE(review): blank lines
    would raise IndexError here, as in the original -- confirm inputs are
    well-formed.
    '''
    for line in fh:
        if line[0] != '#':
            yield Annotation(*line.split('\t'))
    # PEP 479: simply return; the original raised StopIteration, which is
    # a RuntimeError on Python 3.7+.
def write(fh, a):
    '''
    Serialize one annotation as a single GFF line (str(a) plus newline).
    '''
    # TODO: improve this...
    fh.write('%s\n' % a)
def probe(fh):
    '''
    Decide whether fh is a GFF file: the first non-empty line must be the
    "##gff-version N" pragma, whose version number is returned. An empty
    file defaults to GFF version 3; anything else yields False.
    '''
    for raw in fh:
        stripped = raw.strip()
        if not stripped:
            continue
        fields = stripped.split()
        if fields[0] == '##gff-version':
            return {'type': 'gff', 'version': float(fields[1])}
        return False
    return {'type': 'gff', 'version': 3}
def whook(fh):
    '''
    Write hook: emit the mandatory GFF3 version pragma before records.
    '''
    pragma = '##gff-version 3\n'
    fh.write(pragma)
biotools-1.2.12/src/IO/manager.py 0000644 0000765 0000024 00000005051 11774617711 017462 0 ustar atkassen staff 0000000 0000000 '''
This module is home to the IOManager class, which manages the various input
and output formats (specifically, FASTA, FASTQ, CLUSTAL alignments, and GFF
files, currently).
'''
class IOManager(object):
    '''
    A registry used by `IOBase` to manage the per-format input/output
    method bundles. New file types can be registered with

    ```python
    manager[format] = methods
    ```

    where `methods` is a dictionary with callable values under the keys
    `rhook`, `read`, `whook`, `write`, and `probe`:

    * `rhook` => takes a file handle opened for reading; called before
      reading of the file has begun,
    * `whook` => takes a file handle opened for writing; called before
      writing to the file has begun,
    * `read` => takes a file handle opened for reading; should be a
      generator that yields entries,
    * `write` => takes a file handle opened for writing and a single
      entry; writes the entry to the file,
    * `probe` => takes a file handle opened for reading; returns a
      dictionary of attributes to be applied to the `IOBase` instance.

    Formats present at construction time are protected and cannot be
    replaced later. Lookups for unknown formats fall back to the
    do-nothing `default` bundle.
    '''

    def __init__(self, methods=None):
        '''
        Instantiates an `IOManager` with methods, where the keys of
        methods are the formats and the values are dictionaries with
        `rhook`, `whook`, `read`, `write`, and `probe` callables.
        '''
        self.methods = methods or {}
        self.protected = set(self.methods.keys())
        noop = lambda *ignored: iter([])
        self.default = dict.fromkeys(
            ('rhook', 'read', 'whook', 'write', 'probe'), noop)

    def __contains__(self, fmt):
        return fmt in self.methods

    def __getitem__(self, fmt):
        return self.methods.get(fmt, self.default)

    def get(self, key, default=None):
        '''
        Try to get a set of methods via format (e.g., 'fasta') or
        fall back to `default` (or, failing that, the do-nothing bundle).
        '''
        if key in self.methods:
            return self.methods[key]
        return default or self.default

    def __setitem__(self, key, value):
        # protected (built-in) formats are silently left untouched
        if key not in self.protected:
            self.methods[key] = value
        return value

    def __iter__(self):
        return iter(self.methods)
biotools-1.2.12/src/IO/phylip.py 0000644 0000765 0000024 00000002770 12010562315 017341 0 ustar atkassen staff 0000000 0000000 '''
Functions for manipulating PHYLIP files.
'''
from biotools.sequence import Sequence, chop
def read(fh):
    '''
    Read sequences in PHYLIP format; this function is a generator that
    yields `Sequence` objects.

    The first non-blank line must be the header giving the sequence count
    and length (e.g. "3 120"). In the first block of lines the first
    whitespace-delimited token is the sequence name; subsequent
    (interleaved) blocks contain only sequence data, which is concatenated
    onto the sequences in order. The handle is closed when reading ends.
    '''
    seqs, names = [], []
    try:
        line = next(fh).strip()
        while not line:
            # Skip leading blank lines before the header.
            # (Was `fh.next.strip()` — missing call parentheses — and
            # Python-2-only `fh.next()`.)
            line = next(fh).strip()
    except StopIteration:
        fh.close()
        # `return`, not `raise StopIteration()`: PEP 479 makes the latter
        # a RuntimeError inside generators on Python 3.7+.
        return
    # Header tokens: [number of sequences, sequence length]. The original
    # iterated the *characters* of the line, which crashes on the space.
    grid = [int(x) for x in line.split()]
    while True:
        lines = []
        try:
            for _ in range(grid[0]):
                line = next(fh).strip()
                while not line:
                    line = next(fh).strip()
                lines.append(line)
            if not names:
                # First block: "<name> <sequence...>" per line.
                names = [l.split()[0] for l in lines]
                try:
                    seqs = [''.join(l.split()[1:]) for l in lines]
                except IndexError:
                    seqs = [''] * grid[0]
            else:
                # Continuation block: the whole line is sequence data.
                temp = [''.join(l.split()) for l in lines]
                seqs = [a + b for a, b in zip(seqs, temp)]
        except StopIteration:
            break
    for name, seq in zip(names, seqs):
        yield Sequence(name, seq)
    fh.close()
def probe(fh):
    '''
    Detect whether the open handle `fh` looks like a PHYLIP file by
    examining its first line: exactly two positive integers (the sequence
    count and length) indicate PHYLIP. Returns {'type': 'phylip'} on a
    match (or for an empty file, matching the original behavior) and
    False otherwise.
    '''
    for line in fh:
        bits = line.split()
        try:
            if len(bits) == 2 and int(bits[0]) > 0 and int(bits[1]) > 0:
                return {'type': 'phylip'}
        except ValueError:
            # Non-numeric tokens (e.g. a FASTA defline): not a PHYLIP
            # header. The original let int() raise here, crashing the
            # format-detection pass on any non-numeric file.
            pass
        return False
    return {'type': 'phylip'}
biotools-1.2.12/src/sequence.py 0000644 0000765 0000024 00000015135 12017457115 017344 0 ustar atkassen staff 0000000 0000000 from biotools.annotation import Annotation
def chop(seq, length=70):
    '''
    Yield successive chunks of `seq` of at most `length` characters; meant
    for wrapping sequence text when printing FASTA files.

    Fixes: the trailing `raise StopIteration()` is a RuntimeError inside a
    generator on Python 3.7+ (PEP 479) — a generator ends by returning.
    The old try/except IndexError was dead code: slicing a string never
    raises IndexError.
    '''
    while seq:
        piece, seq = seq[:length], seq[length:]
        yield piece
def isprot(seq, nucleotides='ATUCGNYRatucgnyr- '):
    '''
    Return True if `seq` contains any character outside the nucleotide
    alphabet (i.e., the sequence looks like a protein), False otherwise.
    An empty sequence is reported as nucleotide (False).
    '''
    return any(c not in nucleotides for c in seq)
class Sequence(object):
    '''
    A wrapper class for sequences.
    '''

    def __init__(self, name, seq, **kwargs):
        '''
        Instantiate a Sequence object with a given sequence. Other useful
        parameters that the `Sequence` constructor can handle are:

        * `qual`     => the quality scores (an array of integers) of the
                        sequence,
        * `type`     => the type of the sequence, either 'prot' or 'nucl',
        * `start`    => the starting position of the sequence within a
                        supersequence,
        * `end`      => the ending position of the sequence within a
                        supersequence,
        * `step`     => the 'step' of the sequence, usually +1 for
                        top-strand sequences and -1 for bottom-strand
                        sequences, but other values are accepted,
        * `original` => the original `Sequence` object from which this one
                        derives,
        * `defline`  => the definition line for this sequence from a fasta
                        file.

        If one of these is not given, it defaults to the most logical value
        that can be determined from the other values and the sequence
        (e.g., if `end < start`, then `step` is probably -1).
        '''
        self.name = name
        self.seq = seq
        self.qual = kwargs.get('qual', None)
        # `type` is set lazily by __getattr__ on first access when it is
        # not supplied here.
        if 'type' in kwargs:
            self.type = kwargs['type']
        self.start = kwargs.get('start', 1)
        self.end = kwargs.get('end', self.start - 1 + len(seq))
        self.step = kwargs.get('step', -1 if self.start > self.end else 1)
        self.original = kwargs.get('original', self)
        self.defline = kwargs.get('defline', '')

    def __getattr__(self, attr):
        # Lazily classify the sequence as protein or nucleotide the first
        # time `type` is read; cache the result on the instance.
        if attr == 'type':
            self.type = 'prot' if isprot(self.seq) else 'nucl'
            return self.type
        raise AttributeError('%r object has no attribute %r' %
                             (self.__class__.__name__, attr))

    def __getitem__(self, key):
        '''
        sequence[i] or sequence[start:end] or sequence[start:end:step]
        constructs a new Sequence that is a subsequence as described by the
        way this function is called. This function automatically fills in
        the name, sequence, start, stop, step, original, and type of the
        subsequence. It also tries to fill in the annotations, but
        annotations are handled pretty poorly right now, so it's probably
        best not to worry about those, but it will work if you really want
        to.
        '''
        try:
            start, stop, step = key.indices(len(self.seq))
        except AttributeError:
            # Integer index: treat as a one-element slice.
            start, stop, step = key, key + 1, 1
        # Orientation of this sequence within its supersequence. The
        # original computed `abs(self.step) / self.step`, which yields a
        # float under Python 3 true division and made start/end floats.
        order = 1 if self.step > 0 else -1
        # Supersequence offset of the last character taken by the slice.
        r = stop - (stop - start) % step - step
        # `xrange` replaced with `range` for Python 3 compatibility.
        seq = ''.join(self.seq[x] for x in range(start, stop, step))
        qual = self.qual and [self.qual[x] for x in range(start, stop, step)]
        info = (self.name, start, stop, step)
        # Force lazy type detection before handing `self.type` to the
        # child, so the child never re-classifies.
        self.type
        return Sequence("subsequence(%s, %d, %d, %d)" % info, seq,
                        qual=qual, original=self.original, type=self.type,
                        start=self.start + start * order,
                        end=self.start + r * order,
                        step=step * self.step)

    '''
    Some other things you can do with a Sequence object:
    * len(sequence) => gives the length of the sequence.
    * for character in sequence: => allows you to loop over each character
      in the sequence.
    * dictionary[sequence] => allows sequences to be used as keys for
      dictionaries and allows you to have sequences in sets. This relies on
      the test seqA == seqB, described next.
    * seqA == seqB => compare two sequences. The sequences are the same if
      they have the same sequence AND name. Therefore, two sequences with
      different names are treated as separate items in a set and separate
      keys in a dictionary. If you need to match only the sequence, use
      seqA.seq == seqB.seq.
    * print sequence => print a fasta / fastq (depending on whether there
      are any quality scores) representation of the sequence. Sequence
      objects in any other data structure (e.g., list, dictionary) are
      printed with the default repr. If you want to change that, you can
      do:
          def __repr__(self):
              return self.__str__()
    '''

    def upper(self):
        # Return an upper-cased copy; all other attributes are preserved.
        return Sequence(self.name, self.seq.upper(), type=self.type,
                        qual=self.qual, original=self.original,
                        defline=self.defline, start=self.start,
                        step=self.step, end=self.end)

    def __iter__(self):
        # No trailing `raise StopIteration()`: under PEP 479 (Python 3.7+)
        # that would surface as a RuntimeError.
        for c in self.seq:
            yield c

    def __len__(self):
        return len(self.seq)

    def __hash__(self):
        # Hash on the sequence only; equal objects (same seq AND name)
        # still hash equally, as required.
        return hash(self.seq)

    def __eq__(self, other):
        try:
            return self.seq == other.seq and self.name == other.name
        except AttributeError:
            # `other` is not a Sequence; compare against the raw string.
            return (self.seq == other)

    def __str__(self):
        if self.qual:
            # FASTQ with phred-style characters offset from 'A' - 1.
            return '@%s\n%s\n+\n%s' % (self.name, self.seq,
                                       ''.join(chr(ord('A') - 1 + q)
                                               for q in self.qual))
        else:
            return '>%s %s\n%s' % (self.name, self.defline,
                                   '\n'.join(chop(self.seq, 70)))
def annotation(seq, source, type, **kwargs):
    '''
    Create an `Annotation` object for the given sequence from a source
    (e.g., "phytozome7.0") of a particular type (e.g., "gene"). Extra
    keyword arguments become `key=value` attribute pairs.
    '''
    # `source` may be a Sequence-like object (use its name) or a string.
    try:
        source_name = source.name
    except AttributeError:
        source_name = source
    left = min(seq.start, seq.end)
    right = max(seq.start, seq.end)
    strand = '+' if seq.step == 1 else '-'
    attributes = ';'.join('%s=%s' % (key, str(kwargs[key]))
                          for key in kwargs)
    # NOTE(review): frame is taken as `left % 3` — confirm this matches the
    # convention used elsewhere (GFF frames are usually (start - 1) % 3).
    return Annotation(seq.original.name, source_name, type, left, right,
                      '.', strand, left % 3, attributes)
biotools-1.2.12/src/translate.py 0000644 0000765 0000024 00000005001 12033603443 017513 0 ustar atkassen staff 0000000 0000000 _gencode = {
'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
'AUA': 'I', 'AUC': 'I', 'AUU': 'I', 'AUG': 'M',
'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACU': 'T',
'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
'AAC': 'N', 'AAU': 'N', 'AAA': 'K', 'AAG': 'K',
'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
'AGC': 'S', 'AGU': 'S', 'AGA': 'R', 'AGG': 'R',
'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
'CUA': 'L', 'CUC': 'L', 'CUG': 'L', 'CUU': 'L',
'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCU': 'P',
'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CAC': 'H', 'CAU': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGU': 'R',
'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
'GUA': 'V', 'GUC': 'V', 'GUG': 'V', 'GUU': 'V',
'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCU': 'A',
'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
'GAC': 'D', 'GAU': 'D', 'GAA': 'E', 'GAG': 'E',
'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGU': 'G',
'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
'UCA': 'S', 'UCC': 'S', 'UCG': 'S', 'UCU': 'S',
'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
'UUC': 'F', 'UUU': 'F', 'UUA': 'L', 'UUG': 'L',
'TAC': 'Y', 'TAT': 'Y', 'TAA': '*', 'TAG': '*',
'UAC': 'Y', 'UAU': 'Y', 'UAA': '*', 'UAG': '*',
'TGC': 'C', 'TGT': 'C', 'TGA': '*', 'TGG': 'W',
'UGC': 'C', 'UGU': 'C', 'UGA': '*', 'UGG': 'W'}
def translate(sequence):
'''
Translate a nucleotide using the standard genetic code. The sequence
parameter can be either a string or a `Sequence` object. Stop codons are
denoted with an asterisk (*).
'''
try:
value = ''.join(_gencode.get(sequence.seq[i:i + 3].upper(), 'X')
for i in xrange(0, int(len(sequence) / 3) * 3, 3))
return sequence.__class__("translate(%s)" % sequence.name, value,
original=sequence.original, type='prot',
defline=sequence.defline)
except AttributeError:
value = ''.join(_gencode.get(sequence[i:i + 3].upper(), 'X')
for i in xrange(0, int(len(sequence) / 3) * 3, 3))
return sequence.__class__(value)
if __name__ == '__main__':
    # Command-line usage: python translate.py <nucleotide sequence>
    from sys import argv
    print(translate(argv[1]))