debian/0000755000000000000000000000000012151467646007202 5ustar debian/install0000644000000000000000000000015312035107726010561 0ustar distributions usr/share/conservation-code/ matrix usr/share/conservation-code/ score_conservation usr/bin/ debian/rules0000755000000000000000000000363112063625704010256 0ustar #!/usr/bin/make -f #export DH_VERBOSE=1 PACKAGE:=$(shell dpkg-parsechangelog --format rfc822|sed --posix -n -e 's/^Source: \(.*\)/\1/p;') VERSION:=$(shell dpkg-parsechangelog --format rfc822|sed --posix -n -e 's/^Version: \([0-9.]*\).*/\1/p;') SCRIPTS=score_conservation MANS=debian/score_conservation.1 prefix?=/usr datarootdir:=${prefix}/share docdir:=${datarootdir}/doc/${PACKAGE} pkgdatadir:=${datarootdir}/${PACKAGE} # Policy §4.9 says that the get-orig-source target 'may be invoked in any directory'. So we do not use variables set from dpkg-parsechangelog. .PHONY: get-orig-source get-orig-source: set -e; \ if ! ( which xz >/dev/null ); then \ echo "Could not find 'xz' tool for compression. Please install the package 'xz-utils'." >&2; \ exit 1; \ fi ; \ t=$$(mktemp -d) || exit 1; \ trap "rm -rf -- '$$t'" EXIT; \ ( cd "$$t"; \ wget -O conservation-code_20110309.0.orig.tar.gz http://compbio.cs.princeton.edu/conservation/conservation_code.tar.gz; \ gunzip *.tar.gz; \ tar --owner=root --group=root --mode=a+rX --delete -f *.tar --wildcards '*/._*'; \ xz --best *.tar; \ ); \ mv $$t/*.tar.?z ./ .PHONY: override_dh_auto_build override_dh_auto_build: man scripts %: dh $@ --parallel --with python2 .PHONY: man man: $(MANS) .PHONY: scripts scripts: $(SCRIPTS) $(SCRIPTS) : % : %.py cp -f $< $@ ChangeLog: score_conservation.py # Courtesy of Jakub Wilk: sed -n -e '/# [0-9/]\{8\}/ { s/# // p }' < $< > $@ .PHONY: override_dh_installchangelogs override_dh_installchangelogs: ChangeLog dh_installchangelogs %.1: %.1.pod sed -e 's|__docdir__|$(docdir)|g;s|__pkgdatadir__|$(pkgdatadir)|g;s|__VERSION__|$(VERSION)|g;' "$<" | \ pod2man -c 'User Commands' -r "$(VERSION)" -name $(shell echo "$(basename $@)" | tr '[:lower:]' '[:upper:]') > "$@" .PHONY: override_dh_install override_dh_install: dh_install -X._ .PHONY: override_dh_auto_clean override_dh_auto_clean: rm -f $(MANS) $(SCRIPTS) ChangeLog debian/manpages0000644000000000000000000000003412035275571010710 0ustar debian/score_conservation.1 debian/changelog0000644000000000000000000000101312151467463011044 0ustar conservation-code (20110309.0-3) unstable; urgency=low * Allow parsing of Stockholm format as well. * Python3 print(). * usage() prints on stderr in case of error. * Errors are printed on stderr. * Implemented Eugene V. Lyubimkin's suggestions to d/rules (minor changes). -- Laszlo Kajan Mon, 17 Dec 2012 15:26:04 +0100 conservation-code (20110309.0-1) unstable; urgency=low * Initial release. (Closes: #690058) -- Laszlo Kajan Tue, 09 Oct 2012 17:34:05 +0200 debian/upstream0000644000000000000000000000075712035107726010765 0ustar Name: conservation-code Contact: Tony Capra Homepage: http://compbio.cs.princeton.edu/conservation/ Reference: - Author: John A. Capra and Mona Singh Title: Predicting functionally important residues from sequence conservation Journal: Bioinformatics Volume: 23 Number: 15 Pages: 1875-82 Year: 2007 URL: http://bioinformatics.oxfordjournals.org/content/23/15/1875.full DOI: 10.1093/bioinformatics/btm270 PMID: 17519246 debian/examples0000644000000000000000000000003012035107726010723 0ustar 2plc__hssp-filtered.aln debian/control0000644000000000000000000000326212035527110010570 0ustar Source: conservation-code Section: science Priority: extra Maintainer: Debian Med Packaging Team Uploaders: Laszlo Kajan Build-Depends: debhelper (>= 8.0.0), perl, python (>= 2.6.6-3~) Standards-Version: 3.9.4 Homepage: http://compbio.cs.princeton.edu/conservation/ Vcs-Svn: svn://svn.debian.org/debian-med/trunk/packages/conservation-code/trunk/ Vcs-Browser: http://svn.debian.org/wsvn/debian-med/trunk/packages/conservation-code/trunk/ Package: conservation-code Architecture: all Depends: ${misc:Depends}, ${python:Depends}, python-numpy Enhances: concavity Description: protein sequence conservation scoring tool This package provides score_conservation(1), a tool to score protein sequence conservation. . The following conservation scoring methods are implemented: * sum of pairs * weighted sum of pairs * Shannon entropy * Shannon entropy with property groupings (Mirny and Shakhnovich 1995, Valdar and Thornton 2001) * relative entropy with property groupings (Williamson 1995) * von Neumann entropy (Caffrey et al 2004) * relative entropy (Samudrala and Wang 2006) * Jensen-Shannon divergence (Capra and Singh 2007) . A window-based extension that incorporates the estimated conservation of sequentially adjacent residues into the score for each column is also given. This window approach can be applied to any of the conservation scoring methods. . The program accepts alignments in the CLUSTAL and FASTA formats. . The sequence-specific output can be used as the conservation input for concavity. . Conservation is highly predictive in identifying catalytic sites and residues near bound ligands. debian/compat0000644000000000000000000000000212035107726010367 0ustar 8 debian/README.source0000644000000000000000000000154712035554724011363 0ustar conservation-code for Debian ============================ Repackaged Upstream Source -------------------------- The 'get-orig-source' target is provided to build the upstream tarball. The problem with the upstream tarball is that there is a '._conservation_code' file in its root. This confuses 'dpkg-source -b': it tries to apply the patches at the wrong level. Curiously, nothing else breaks because of this. All '._*' files ('macos-resource-fork-file' as lintian calls them) are removed in 'get-orig-source'. Missing d/watch --------------- Upstream (as of 20121009) does not version the downloadable tarball. There is a change log in the main executable, with dates. The latest date is used to form the version for the repacked upstream. A watch file will be created when upstream introduces version numbers to the name of the downloadable archive. # vim:et:ts=4: debian/docs0000644000000000000000000000000712035107726010041 0ustar README debian/copyright0000644000000000000000000000233612035107726011130 0ustar Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Name: conservation_code Upstream-Contact: John A. Capra Mona Singh Source: http://compbio.cs.princeton.edu/conservation/ Files: debian/* Copyright: 2012 Laszlo Kajan License: GPL-2.0+ Files: * Copyright: Tony Capra 2007 License: GPL-2.0+ License: GPL-2.0+ This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. . This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. . You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. . On Debian systems, the complete text of the GNU General Public License version 2 can be found in "/usr/share/common-licenses/GPL-2". debian/patches/0000755000000000000000000000000012151467643010626 5ustar debian/patches/usage0000644000000000000000000000275612151467463011667 0ustar Author: Laszlo Kajan Description: usage() takes print stream parameter --- a/score_conservation.py +++ b/score_conservation.py @@ -99,7 +99,7 @@ aa_to_index[aa] = i -def usage(): +def usage( __file = sys.stdout ): print( """\nUSAGE:\nscore_conservation [options] alignfile\n\t -alignfile must be in fasta, Stockholm or clustal format.\n\nOPTIONS:\n\t -a\treference sequence. Print scores in reference to a specific sequence (ignoring gaps). Default prints the entire column. [sequence name]\n\t -b\tlambda for window heuristic linear combination. Default=.5 [real in [0,1]]\n @@ -113,7 +113,7 @@ -p\tuse gap penalty. Lower the score of columns that contain gaps. Default=True [True|False]\n\t -s\tconservation estimation method. \n\t\tOptions: shannon_entropy, property_entropy, property_relative_entropy, vn_entropy, relative_entropy, js_divergence, sum_of_pairs. Default=js_divergence\n\t -w\twindow size. Number of residues on either side included in the window. Default=3 [int]\n\t - """ ) + """, file=__file) @@ -740,17 +740,17 @@ # parse options and args -- see usage() if len(sys.argv) < 2: - usage() + usage(__file=sys.stderr) sys.exit(2) try: opts, args = getopt.getopt(sys.argv[1:], "hl:d:m:o:s:w:g:p:b:a:n") except getopt.GetoptError: - usage() + usage(__file=sys.stderr) sys.exit(1) if len(args) < 1: - usage() + usage(__file=sys.stderr) sys.exit(1) for opt, arg in opts: debian/patches/series0000644000000000000000000000015412037047401012030 0ustar examples script_name numpy.numarray default_matrix_path optimize_loop stockholm_format Python3-prints usage debian/patches/optimize_loop0000644000000000000000000000102712035527110013425 0ustar Author: Laszlo Kajan Description: move code outside of loop Forwarded: http://lists.alioth.debian.org/pipermail/debian-med-packaging/2012-October/017448.html --- a/score_conservation.py +++ b/score_conservation.py @@ -136,8 +136,9 @@ aa_num += 1 + freqsum = (sum(seq_weights) + len(amino_acids) * pc_amount) for j in range(len(freq_counts)): - freq_counts[j] = freq_counts[j] / (sum(seq_weights) + len(amino_acids) * pc_amount) + freq_counts[j] = freq_counts[j] / freqsum return freq_counts debian/patches/numpy.numarray0000644000000000000000000000156412035527110013547 0ustar Author: Laszlo Kajan Description: fix import of numarray from numpy.numarray Forwarded: http://lists.alioth.debian.org/pipermail/debian-med-packaging/2012-October/017448.html --- a/score_conservation.py +++ b/score_conservation.py @@ -790,7 +790,7 @@ if arg == 'shannon_entropy': scoring_function = shannon_entropy elif arg == 'property_entropy': scoring_function = property_entropy elif arg == 'property_relative_entropy': scoring_function = property_relative_entropy - elif arg == 'vn_entropy': scoring_function = vn_entropy; from numarray import *; import numarray.linear_algebra as la + elif arg == 'vn_entropy': scoring_function = vn_entropy; from numpy.numarray import *; import numpy.numarray.linear_algebra as la elif arg == 'relative_entropy': scoring_function = relative_entropy elif arg == 'js_divergence': scoring_function = js_divergence debian/patches/Python3-prints0000644000000000000000000001554612151467463013445 0ustar Author: Laszlo Kajan Description: use Python3 style print() --- a/score_conservation.py +++ b/score_conservation.py @@ -83,6 +83,7 @@ # ################################################################################ +from __future__ import print_function import math, sys, getopt import re # numarray imported below @@ -99,7 +100,7 @@ def usage(): - print """\nUSAGE:\nscore_conservation [options] alignfile\n\t -alignfile must be in fasta, Stockholm or clustal format.\n\nOPTIONS:\n\t + print( """\nUSAGE:\nscore_conservation [options] alignfile\n\t -alignfile must be in fasta, Stockholm or clustal format.\n\nOPTIONS:\n\t -a\treference sequence. Print scores in reference to a specific sequence (ignoring gaps). Default prints the entire column. [sequence name]\n\t -b\tlambda for window heuristic linear combination. Default=.5 [real in [0,1]]\n -d\tbackground distribution file, e.g., swissprot.distribution. Default=BLOSUM62 background [filename]\n\t @@ -112,7 +113,7 @@ -p\tuse gap penalty. Lower the score of columns that contain gaps. Default=True [True|False]\n\t -s\tconservation estimation method. \n\t\tOptions: shannon_entropy, property_entropy, property_relative_entropy, vn_entropy, relative_entropy, js_divergence, sum_of_pairs. Default=js_divergence\n\t -w\twindow size. Number of residues on either side included in the window. Default=3 [int]\n\t - """ + """ ) @@ -542,7 +543,7 @@ list_sm.append(row) except IOError, e: - print "Could not load similarity matrix: %s. Using identity matrix..." % sm_file + print( "Could not load similarity matrix: %s. Using identity matrix..." % sm_file, file=sys.stderr ) return identity(20) # if matrix is stored in lower tri form, copy to upper @@ -630,13 +631,13 @@ except IOError, e: - print e, "Using default (BLOSUM62) background." + print( e, "Using default (BLOSUM62) background.", file=sys.stderr ) return [] # use a range to be flexible about round off if .997 > sum(distribution) or sum(distribution) > 1.003: - print "Distribution does not sum to 1. Using default (BLOSUM62) background." - print sum(distribution) + print( "Distribution does not sum to 1. Using default (BLOSUM62) background.", file=sys.stderr ) + print( sum(distribution), file=sys.stderr ) return [] return distribution @@ -775,21 +776,21 @@ try: window_size = int(arg) except ValueError: - print "ERROR: Window size must be an integer. Using window_size 3..." + print( "ERROR: Window size must be an integer. Using window_size 3...", file=sys.stderr ) window_size = 3 elif opt == "-b": try: win_lam = float(arg) if not (0. <= win_lam <= 1.): raise ValueError except ValueError: - print "ERROR: Window lambda must be a real in [0,1]. Using lambda = .5..." + print( "ERROR: Window lambda must be a real in [0,1]. Using lambda = .5...", file=sys.stderr ) win_lam = .5 elif opt == "-g": try: gap_cutoff = float(arg) if not (0. <= gap_cutoff < 1.): raise ValueError except ValueError: - print "ERROR: Gap cutoff must be a real in [0,1). Using a gap cutoff of .3..." + print( "ERROR: Gap cutoff must be a real in [0,1). Using a gap cutoff of .3...", file=sys.stderr ) gap_cutoff = .3 elif opt == '-a': seq_specific_output = arg @@ -804,7 +805,7 @@ elif arg == 'relative_entropy': scoring_function = relative_entropy elif arg == 'js_divergence': scoring_function = js_divergence elif arg == 'sum_of_pairs': scoring_function = sum_of_pairs - else: print "%s is not a valid scoring method. Using %s.\n" % (arg, scoring_function.__name__) + else: print( "%s is not a valid scoring method. Using %s.\n" % (arg, scoring_function.__name__), file=sys.stderr ) align_file = args[0] @@ -821,18 +822,18 @@ if names == []: names, alignment = read_fasta_alignment(align_file) except IOError, e: - print e, "Could not find %s. Exiting..." % align_file + print( e, "Could not find %s. Exiting..." % align_file, file=sys.stderr ) sys.exit(1) if len(alignment) != len(names) or alignment == []: - print "Unable to parse alignment.\n" + print( "Unable to parse alignment.\n", file=sys.stderr ) sys.exit(1) seq_len = len(alignment[0]) for i, seq in enumerate(alignment): if len(seq) != seq_len: - print "ERROR: Sequences of different lengths: %s (%d) != %s (%d).\n" % (names[0], seq_len, names[i], len(seq)) + print( "ERROR: Sequences of different lengths: %s (%d) != %s (%d).\n" % (names[0], seq_len, names[i], len(seq)), file=sys.stderr ) sys.exit(1) @@ -846,7 +847,7 @@ # handle print of output relative to specific sequence ref_seq_num = None if seq_specific_output and seq_specific_output not in names: - print "Sequence %s not found in alignment. Using default output format...\n" % seq_specific_output + print( "Sequence %s not found in alignment. Using default output format...\n" % seq_specific_output, file=sys.stderr ) seq_specific_output = 0 elif seq_specific_output in names: ref_seq_num = names.index(seq_specific_output) @@ -880,15 +881,15 @@ else: outfile.write("# align_column_number\tscore\tcolumn\n") else: - print "# %s -- %s - window_size: %d - background: %s - seq. weighting: %s - gap penalty: %d - normalized: %s" % (align_file, scoring_function.__name__, window_size, background_name, use_seq_weights, use_gap_penalty, normalize_scores) + print( "# %s -- %s - window_size: %d - background: %s - seq. weighting: %s - gap penalty: %d - normalized: %s" % (align_file, scoring_function.__name__, window_size, background_name, use_seq_weights, use_gap_penalty, normalize_scores) ) if seq_specific_output: - print "# reference sequence: %s" % seq_specific_output - print "# align_column_number\tamino acid\tscore\n" + print( "# reference sequence: %s" % seq_specific_output ) + print( "# align_column_number\tamino acid\tscore\n" ) else: - print "# align_column_number\tscore\tcolumn\n" + print( "# align_column_number\tscore\tcolumn\n" ) except IOError, e: - print "Could not open %s for output. Printing results to standard out..." % outfile_name + print( "Could not open %s for output. Printing results to standard out..." % outfile_name, file=sys.stderr ) outfile_name = "" for i, score in enumerate(scores): @@ -896,12 +897,12 @@ cur_aa = get_column(i, alignment)[ref_seq_num] if cur_aa == '-': continue if outfile_name == "": - print "%d\t%s\t%.5f" % (i, cur_aa, score) + print( "%d\t%s\t%.5f" % (i, cur_aa, score) ) else: outfile.write("%d\t%s\t%5f\n" % (i, cur_aa, score)) else: if outfile_name == "": - print "%d\t%.5f\t%s" % (i, score, "".join(get_column(i, alignment))) + print( "%d\t%.5f\t%s" % (i, score, "".join(get_column(i, alignment))) ) else: outfile.write("%d\t%5f\t%s\n" % (i, score, "".join(get_column(i, alignment)))) debian/patches/script_name0000644000000000000000000000232712035527110013044 0ustar Author: Laszlo Kajan Description: change interpreter and remove extension from script Policy §1.4.2 Interpreter Location: 'The preferred specification for the Python interpreter is /usr/bin/python'. Policy §10.4 Scripts: 'the script name should not include an extension'. Forwarded: no --- a/score_conservation.py +++ b/score_conservation.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python ################################################################################ # score_conservation.py - Copyright Tony Capra 2007 - Last Update: 03/09/11 @@ -98,7 +98,7 @@ def usage(): - print """\nUSAGE:\npython score_conservation.py [options] alignfile\n\t -alignfile must be in fasta or clustal format.\n\nOPTIONS:\n\t + print """\nUSAGE:\nscore_conservation [options] alignfile\n\t -alignfile must be in fasta or clustal format.\n\nOPTIONS:\n\t -a\treference sequence. Print scores in reference to a specific sequence (ignoring gaps). Default prints the entire column. [sequence name]\n\t -b\tlambda for window heuristic linear combination. Default=.5 [real in [0,1]]\n -d\tbackground distribution file, e.g., swissprot.distribution. Default=BLOSUM62 background [filename]\n\t debian/patches/default_matrix_path0000644000000000000000000000111612035527110014557 0ustar Author: Laszlo Kajan Description: correct path to packaged default matrix file Forwarded: http://lists.alioth.debian.org/pipermail/debian-med-packaging/2012-October/017448.html --- a/score_conservation.py +++ b/score_conservation.py @@ -718,7 +718,7 @@ window_size = 3 # 0 = no window win_lam = .5 # for window method linear combination outfile_name = "" -s_matrix_file = "matrix/blosum62.bla" +s_matrix_file = "/usr/share/conservation-code/matrix/blosum62.bla" bg_distribution = blosum_background_distr[:] scoring_function = js_divergence use_seq_weights = True debian/patches/examples0000644000000000000000000000523112035110433012347 0ustar Author: Laszlo Kajan Description: example file Forwarded: no --- /dev/null +++ b/2plc__hssp-filtered.aln @@ -0,0 +1,38 @@ +CLUSTAL X (1.81) multiple sequence alignment + + +2plc_ VTTKQWMSALPDTTNLAALSIPGTHDTMSYNGDITWTLTKPLAQTQTMSL +Q9P866 VDYKTWLKDIDNNTRISKLSIPGTHNSAACHTAL------PSVQCQGASV +Q59488 ITTKQWMSALPDNTKLTSLTIPGTHDTMSYKGNISWTLTKSLAQTQKMSL +O07690 INTQRWMTSLPDSVSLSALSIPGTHDTMSYNGYITWQFTRPLAQTQTMTL +PLC_LISMO VTTKQWMSALPDTTNLAALSIPGTHDTMSYNGDITWTLTKPLAQTQTMSL + +2plc_ YQQLEAGIRYIDIRAKDNLNIYHGPIFLNASLSGVLETITQFLKKNPKET +Q9P866 TEQLEHGVRFLDIRVGKDLQVIHGKFPVKLKLKDTLEEVYKFLAHNSSET +Q59488 FQQLEARIRYIDIRAKEDLQIYHGPIYLDASLKGVLETTVNFLKEHPKET +O07690 NEQLNAGIRFFDIRAKEDLNIYHGPIYLNASLEEVLHTFISFLKENPKEV +PLC_LISMO YQQLEAGIRYIDIRAKDNLNIYHGPIFLNASLSGVLETITQFLKKNPKET + +2plc_ IIMRLKDEQNSNDSFDYRIQPLINIYKDYFYTTPRTDTSNKIPTLKDVRG +Q9P866 VIVSIKQEGNSQDEFGKLIWDYVNPNKDRWYLNT------DIPKLGDARG +Q59488 IIMRLKDENNHNDRFDYRIQPLINQYKAFFYTTPKSDSSDKFPTLKELRG +O07690 VIMRLKDENKSENSFDYRIQPLIHKLKSFFYTESAKNTSSKTPTLRKLRG +PLC_LISMO IIMRLKDEQNSNDSFDYRIQPLINIYKDYFYTTPRTDTSNKIPTLKDVRG + +2plc_ KILLLSENHTKKPLVINSRKFGMQFGAPNQVIQDDYNGPSVKTKFKEIVQ +Q9P866 KAILFRRFGVQDEQLKK--QFGFSASSWTYNTTNDDRGQFVVQDFCEVNT +Q59488 KILLLLENGTNKPLTINYSKFGMKFAAENQVIQDNFNGPTIKTKYNEIVQ +O07690 KILLLSDNNTKKSLVINSSRYGMQYDSSEQVIQDDYNGPDVNTKYQEIVQ +PLC_LISMO KILLLSENHTKKPLVINSRKFGMQFGAPNQVIQDDYNGPSVKTKFKEIVQ + +2plc_ TAYQASKADNKLFLNHISATSLTFTPRQYAAALNNKVEQFVLNLTSEKVR +Q9P866 ADDYTSKGDDKVFLNFTSASNFF---DQSCWPQPIAEAMIKGNIQETFHK +Q59488 TAHQASSGENKLYLNHVSATSLTCTPYQYASTLNAKVDQYVTKLTAVGVR +O07690 TAYQASSSENKLFLNYVSATSLTFTPSQYADKLNSKVENFVDNLTANKLN +PLC_LISMO TAYQASKADNKLFLNHISATSLTFTPRQYAAALNNKVEQFVLNLTSEKVR + +2plc_ GLGILIMDFPEKQTIKNIIKNNKF +Q9P866 GVGIIVLDYAETDNWK-------- +Q59488 GLGVFIMDFPPKQTIKSVIKNNKF +O07690 GVGMLIMDFPEKQTIHSIIKNNKF +PLC_LISMO GLGILIMDFPEKQTIKNIIKNNKF debian/patches/stockholm_format0000644000000000000000000000436512037047401014121 0ustar Author: Laszlo Kajan Description: allow parsing of Stockholm format as well Forwarded: no --- a/score_conservation.py +++ b/score_conservation.py @@ -84,6 +84,7 @@ ################################################################################ import math, sys, getopt +import re # numarray imported below PSEUDOCOUNT = .0000001 @@ -98,7 +99,7 @@ def usage(): - print """\nUSAGE:\nscore_conservation [options] alignfile\n\t -alignfile must be in fasta or clustal format.\n\nOPTIONS:\n\t + print """\nUSAGE:\nscore_conservation [options] alignfile\n\t -alignfile must be in fasta, Stockholm or clustal format.\n\nOPTIONS:\n\t -a\treference sequence. Print scores in reference to a specific sequence (ignoring gaps). Default prints the entire column. [sequence name]\n\t -b\tlambda for window heuristic linear combination. Default=.5 [real in [0,1]]\n -d\tbackground distribution file, e.g., swissprot.distribution. Default=BLOSUM62 background [filename]\n\t @@ -679,12 +680,14 @@ return names, alignment def read_clustal_alignment(filename): - """ Read in the alignment stored in the CLUSTAL file, filename. Return + """ Read in the alignment stored in the CLUSTAL or Stockholm file, filename. Return two lists: the names and sequences. """ names = [] alignment = [] + re_stock_markup = re.compile('^#=') + f = open(filename) for line in f: @@ -692,16 +695,21 @@ if len(line) == 0: continue if '*' in line: continue - if 'CLUSTAL' in line: continue + if line[0:7] == 'CLUSTAL': continue + if line[0:11] == '# STOCKHOLM': continue + if line[0:2] == '//': continue + + if re_stock_markup.match(line): continue t = line.split() if len(t) == 2 and t[1][0] in iupac_alphabet: + ali = t[1].upper().replace('B', 'D').replace('Z', 'Q').replace('X', '-').replace('\r', '').replace('.', '-') if t[0] not in names: names.append(t[0]) - alignment.append(t[1].upper().replace('B', 'D').replace('Z', 'Q').replace('X', '-').replace('\r', '')) + alignment.append(ali) else: - alignment[names.index(t[0])] += t[1].upper().replace('B', 'D').replace('Z', 'Q').replace('X','-').replace('\r', '') + alignment[names.index(t[0])] += ali return names, alignment debian/source/0000755000000000000000000000000012151467642010476 5ustar debian/source/format0000644000000000000000000000001412035107726011677 0ustar 3.0 (quilt) debian/score_conservation.1.pod0000644000000000000000000000752012035630446013745 0ustar =pod =head1 NAME score_conservation - score protein sequence conservation =head1 SYNOPSIS score_conservation [options] ALIGNFILE =head1 DESCRIPTION Score protein sequence conservation in B. B must be in FASTA, CLUSTAL or Stockholm format. The following conservation scoring methods are implemented: * sum of pairs * weighted sum of pairs * Shannon entropy * Shannon entropy with property groupings (Mirny and Shakhnovich 1995, Valdar and Thornton 2001) * relative entropy with property groupings (Williamson 1995) * von Neumann entropy (Caffrey et al 2004) * relative entropy (Samudrala and Wang 2006) * Jensen-Shannon divergence (Capra and Singh 2007) A window-based extension that incorporates the estimated conservation of sequentially adjacent residues into the score for each column is also given. This window approach can be applied to any of the conservation scoring methods. With default parameters score_conservation(1) computes the conservation scores for the alignment using the Jensen-Shannon divergence and a window B<-w> of I<3>. The sequence-specific output can be used as the conservation input for concavity(1). Conservation is highly predictive in identifying catalytic sites and residues near bound ligands. =head1 REFERENCES =over =item Capra JA and Singh M. Predicting functionally important residues from sequence conservation. Bioinformatics, 23(15):1875-82, 2007. =back =head1 OPTIONS =over =item -a [NAME] Reference sequence. Print scores in reference to the named sequence (ignoring gaps). Default prints the entire column. =item -b [0-1] Lambda for window heuristic linear combination. Default=I<.5>. Equation: C =item -d [FILE] Background distribution file, e.g. F. Default=built-in BLOSUM62. =item -g [0-1)] Gap cutoff. Do not score columns that contain more than gap cutoff fraction gaps. Default=I<.3>. =item -h Print help. =item -l [true|false] Use sequence weighting. Default=I. =item -m [FILE] Similarity matrix file, e.g. F or .qij. Default=F. Some methods, e.g. I, do not use this. =item -n [true|false] Normalize scores. Print the z-score (over the alignment) of each column raw score. Default=I. =item -o FILE Output file. Default: standard output stream. =item -p [true|false] Use gap penalty. Lower the score of columns that contain gaps, proportionally to the sum weight of the gapped sequences. Default=I. =item -s [METHOD] Conservation estimation method, one of I. Default=I. =item -w [0-INT] Window size. Number of residues on either side included in the window. Default=I<3>. =back =head1 EXAMPLES Note: you may have to copy and uncompress the example data files before running the following examples. =over =item Compute conservation scores for the alignment using the Jensen-Shannon divergence with default settings and print out the scores: score_conservation __docdir__/examples/2plc__hssp-filtered.aln =item Score an alignment using Jensen-Shannon divergence, a window of size 3 (on either side of the residue), and the swissprot background distribution: score_conservation -s js_divergence -w 3 -d \ __pkgdatadir__/distributions/swissprot.distribution \ __docdir__/examples/2plc__hssp-filtered.aln =back =head1 FILES =over =item Distributions F<__pkgdatadir__/distributions> =item Matrices F<__pkgdatadir__/matrix> =back =head1 SEE ALSO =over =item Homepage L =item Publication L =item concavity(1) =back =cut