pax_global_header00006660000000000000000000000064123142175230014512gustar00rootroot0000000000000052 comment=2745b81f15c0a561b6502dd7db418e0743e187d7 pbbarcode-master/000077500000000000000000000000001231421752300142705ustar00rootroot00000000000000pbbarcode-master/Makefile000066400000000000000000000017251231421752300157350ustar00rootroot00000000000000.PHONY: doc doc-clean SHELL = /bin/bash -e all: build install build: python setup.py build --executable="/usr/bin/env python" bdist: python setup.py build --executable="/usr/bin/env python" python setup.py bdist --formats=egg install: python setup.py install develop: python setup.py develop test: find tests -name "*.py" | xargs nosetests find tests/cram -name "*.t" | grep -v consensus.t | xargs cram --verbose clean: doc-clean rm -rf build/;\ find . -name "*.egg-info" | xargs rm -rf;\ find . -name "*.pyc" | xargs rm -rf;\ rm -rf dist/ make -C src/C clean doc-clean: make -C doc clean doc: make -C doc html pip-install: @which pip > /dev/null @pip freeze|grep 'pbtools.barcode=='>/dev/null \ && pip uninstall -y pbtools.barcode \ || true @pip freeze|grep 'pbbarcode=='>/dev/null \ && pip uninstall -y pbbarcode \ || true @pip install --no-index \ --install-option="--install-scripts=$(PREFIX)/bin" \ ./ pbbarcode-master/README.rst000066400000000000000000000021071231421752300157570ustar00rootroot00000000000000Overview of the pbbarcode package ================================= The *pbbarcode* package provides tools for annotating PacBio sequencing reads with barcode information. Typically, *pbbarcode* is called in context of a SMRTPipe workflow as opposed to directly on the command line, however, users are encouraged to utilize the command-line utility directly, as more options are available. 
The *pbbarcode* package provides a multi-command line tool *pbbarcode* which currently has the following sub-commands: * labelZmws * labelAlignments * emitFastqs * consensus The first three sub-commands depend on only *pbcore* and its dependencies, the fourth, *consensus*, depends on the *pbdagcon* package and is considered experimental. For more details on the package, please see docs/index.rst for more information. Installation ============ Typically, the *pbbarcode* package is installed within an installation of SMRTPipe, however, it can be installed by itself using:: make install To test that everything is installed correctly, one should additionally issue a:: make test pbbarcode-master/doc/000077500000000000000000000000001231421752300150355ustar00rootroot00000000000000pbbarcode-master/doc/Makefile000066400000000000000000000127101231421752300164760ustar00rootroot00000000000000# Makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = _build # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
# Every target in this makefile names an action, not a file, so all of them
# (including texinfo and info) are declared phony.
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man texinfo info changes linkcheck doctest gettext

# Print the list of documentation targets this makefile supports.
help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html       to make standalone HTML files"
	@echo "  dirhtml    to make HTML files named index.html in directories"
	@echo "  singlehtml to make a single large HTML file"
	@echo "  pickle     to make pickle files"
	@echo "  json       to make JSON files"
	@echo "  htmlhelp   to make HTML files and a HTML help project"
	@echo "  qthelp     to make HTML files and a qthelp project"
	@echo "  devhelp    to make HTML files and a Devhelp project"
	@echo "  epub       to make an epub"
	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
	@echo "  text       to make text files"
	@echo "  man        to make manual pages"
	@echo "  texinfo    to make Texinfo files"
	@echo "  info       to make Texinfo files and run them through makeinfo"
	@echo "  gettext    to make PO message catalogs"
	@echo "  changes    to make an overview of all changed/added/deprecated items"
	@echo "  linkcheck  to check all external links for integrity"
	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"

# Remove everything under the build directory; the leading '-' keeps make
# going even if rm reports an error.
clean:
	-rm -rf $(BUILDDIR)/*

# Each builder target below runs sphinx-build with the matching -b builder
# and writes its output to a sub-directory of $(BUILDDIR).
html:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

# Each builder target below runs sphinx-build with the matching -b builder
# and writes its output to a sub-directory of $(BUILDDIR).
htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@# The inner quotes are escaped so they are printed rather than
	@# terminating the shell string.
	@echo "Build finished; now you can run \"qcollectiongenerator\" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pbbarcode.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pbbarcode.qhc"

devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@# $$HOME is doubled so make passes a literal $HOME through to the shell.
	@echo "# mkdir -p $$HOME/.local/share/devhelp/pbbarcode"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pbbarcode"
	@echo "# devhelp"

epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."

latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

# Build the LaTeX sources, then run the generated makefile in the latex
# output directory to produce PDFs.
latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

texinfo:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

info: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." gettext: $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." pbbarcode-master/doc/PbbarcodeFunctionalSpecification.rst000066400000000000000000000462031231421752300242010ustar00rootroot00000000000000.. pbbarcode Functional Specification .. ======================================= .. Version Introduction ```````````` This document describes the interface and input/output formats of the ``pbbarcode`` package command line tools. The package provides utilities for annotating individual ZMWs directly from a bas.h5 file, emitting fast[a|q] files for each barcode, labeling alignments stored in a cmp.h5 file, and calling consensus on small amplicons (requires ``pbdagcon``) At the moment, Barcodes can be scored in two different ways: ``symmetric`` and ``paired``. Symmetric mode supports barcode designs with two identical barcodes on both sides of a SMRTbell, e.g., for barcodes (A, B), molecules are labeled as A--A or B--B. The ``paired`` mode supports designs with two distinct barcodes on each side of the molecule, but neither barcode appears without its mate. 
The minimum example is given with the following barcodes: (ALeft, ARight, BLeft, BRight), where the following barcode sets are checked: ALeft--ARight, BLeft--BRight. It is important to highlight that a barcode FASTA file specifies a list of available barcodes to evaluate. Depending on the scoring mode, the barcodes are grouped together in different ways. For instance, in the ``symmetric`` case, the number of possible barcode outcomes is simply the number of barcodes that are supplied to the routine in the FASTA file (see below for usage) plus an additional ``NULL`` barcode indicating that no barcode could be evaluated (denoted by: '--'). Labels like this (A--A) are used in the final outputs. In the ``paired`` mode, the number of possible barcode outcomes is half the number of the sequences in the FASTA file plus the ``NULL`` barcode. The ``NULL`` barcode indicates that no attempt was made to score the molecule or it was filtered out by the user's criteria. The majority of cases when a molecule is not scored are related to not observing any adapters. If a user has executed a "hot-start" run, the user can try the '--scoreFirst' parameter to attempt to label the first adapter's barcode. This increases the yield of the labeling procedure at the expense of some probable false positives. The software is implemented as a standard python package. Barcodes are labeled according to the following high-level logic. For each molecule, all adapters are found. For each adapter, we align (using standard Smith-Waterman alignment) each barcode and its reverse complement to the flanking sequence of the adapter. If two complete flanking sequences are available, we divide by 2, else 1 if only one flanking sequence was available (average score at adapter). This allows the scores across adapters to be on the same scale (chimera detection). Depending on the ``mode``, we then determine which barcode(s) are maximally scoring. 
We store the two maximally scoring barcodes, the sum of their alignment scores across the adapters. The average barcode score then can be given approximately by: total-score/number-of-adapters. At the moment, the alignment parameters are fixed at: .. table:: SW Match Parameters +----------+----------+ |type |score | | | | +----------+----------+ |insertion |-1 | | | | +----------+----------+ |deletion |-1 | | | | +----------+----------+ |missmatch |-2 | | | | +----------+----------+ |match |2 | | | | +----------+----------+ Input and output ```````````````` labelZmws --------- usage: pbbarcode labelZmws [-h] [--outDir OUTDIR] [--outFofn OUTFOFN] [--adapterSidePad ADAPTERSIDEPAD] [--insertSidePad INSERTSIDEPAD] [--scoreMode {symmetric,paired}] [--maxAdapters MAXADAPTERS] [--scoreFirst] [--startTimeCutoff STARTTIMECUTOFF] [--nZmws NZMWS] [--nProcs NPROCS] [--saveExtendedInfo] barcode.fasta input.fofn Creates a barcode.h5 file from base h5 files. positional arguments: barcode.fasta Input barcode fasta file input.fofn Input base fofn optional arguments: -h, --help show this help message and exit --outDir OUTDIR Where to write the newly created barcode.h5 files. (default: /home/UNIXHOME/jbullard/projects/software/bi oinformatics/tools/pbbarcode/doc) --outFofn OUTFOFN Write to outFofn (default: barcode.fofn) --adapterSidePad ADAPTERSIDEPAD Pad with adapterSidePad bases (default: 4) --insertSidePad INSERTSIDEPAD Pad with insertSidePad bases (default: 4) --scoreMode {symmetric,paired} The mode in which the barcodes should be scored. (default: symmetric) --maxAdapters MAXADAPTERS Only score the first maxAdapters (default: 20) --scoreFirst Whether to try to score the leftmost barcode in a trace. (default: False) --startTimeCutoff STARTTIMECUTOFF Reads must start before this value in order to be included when scoreFirst is set. 
(default: 10.0) --nZmws NZMWS Use the first n ZMWs for testing (default: -1) --nProcs NPROCS How many processes to use (default: 8) --saveExtendedInfo Whether to save extended information tothe barcode.h5 files; this information is useful for debugging and chimera detection (default: False) The ``labelZmws`` command takes an input.fofn representing a set of bas.h5 files to operate on. Additionally, it takes a barcode.fasta file. Depending on ``scoreMode``, the FASTA file will be processed in different ways. Specifically, in ``paired`` mode, each two consecutive barcodes in the file are considered a set. The parameters, ``adapterSidePad`` and ``insertSidePad`` represents how many bases should be considered on each side of the putative barcode. These parameters are constrained such that: ``|adapterSidePad| + |insertSidePad| + |barcode| < 65``. Users have the option to specify a different output location for the various outputs. Specifically, for each bas.h5 file in input.fofn, a bc.h5 (barcode hdf5) file is generated. These files are listed in the file ``outFofn`` which is typically just called ``barcode.fofn``. See below for a description of the barcode hdf5 file. labelAlignments --------------- usage: pbbarcode labelAlignments [-h] [--minAvgBarcodeScore MINAVGBARCODESCORE] [--minNumBarcodes MINNUMBARCODES] [--minScoreRatio MINSCORERATIO] barcode.fofn aligned_reads.cmp.h5 Adds information about barcode alignments to a cmp.h5 file from a previous call to "labelZmws". 
positional arguments: barcode.fofn input barcode fofn file aligned_reads.cmp.h5 cmp.h5 file to add barcode labels optional arguments: -h, --help show this help message and exit --minAvgBarcodeScore MINAVGBARCODESCORE ZMW Filter: exclude ZMW if average barcode score is less than this value (default: 0.0) --minNumBarcodes MINNUMBARCODES ZMW Filter: exclude ZMW if number of barcodes observed is less than this value (default: 1) --minScoreRatio MINSCORERATIO ZMW Filter: exclude ZMWs whose best score divided by the 2nd best score is less than this ratio (default: 1.0) The ``labelAlignments`` command takes as input a barcode.fofn computed from a call to ``labelZMWs`` and a cmp.h5 file where the barcode information is written to. See below for a description of the cmp.h5 file additions. emitFastqs ---------- usage: pbbarcode emitFastqs [-h] [--outDir output.dir] [--subreads] [--unlabeledZmws] [--trim TRIM] [--fasta] [--minMaxInsertLength MINMAXINSERTLENGTH] [--hqStartTime HQSTARTTIME] [--minReadScore MINREADSCORE] [--minAvgBarcodeScore MINAVGBARCODESCORE] [--minNumBarcodes MINNUMBARCODES] [--minScoreRatio MINSCORERATIO] input.fofn barcode.fofn Takes a bas.h5 fofn and a barcode.h5 fofn and produces a fast[a|q] file for each barcode. positional arguments: input.fofn input base or CCS fofn file barcode.fofn input barcode.h5 fofn file optional arguments: -h, --help show this help message and exit --outDir output.dir output directory to write fastq files (default: /home/ UNIXHOME/jbullard/projects/software/bioinformatics/too ls/pbbarcode/doc) --subreads whether to produce fastq files for the subreads;the default is to use the CCS reads. This option onlyapplies when input.fofn has both consensus and raw reads,otherwise the read type from input.fofn will be returned. (default: False) --unlabeledZmws whether to emit a fastq file for the unlabeled ZMWs. 
These are the ZMWs where no adapters are found typically (default: False) --trim TRIM trim off barcodes and any excess constant sequence (default: 20) --fasta whether the files produced should be FASTA files asopposed to FASTQ (default: False) --minMaxInsertLength MINMAXINSERTLENGTH ZMW Filter: exclude ZMW if the longest subreadis less than this amount (default: 0) --hqStartTime HQSTARTTIME ZMW Filter: exclude ZMW if start time of HQ regiongreater than this value (seconds) (default: inf) --minReadScore MINREADSCORE ZMW Filter: exclude ZMW if readScore is less thanthis value (default: 0) --minAvgBarcodeScore MINAVGBARCODESCORE ZMW Filter: exclude ZMW if average barcode score is less than this value (default: 0.0) --minNumBarcodes MINNUMBARCODES ZMW Filter: exclude ZMW if number of barcodes observed is less than this value (default: 1) --minScoreRatio MINSCORERATIO ZMW Filter: exclude ZMWs whose best score divided by the 2nd best score is less than this ratio (default: 1.0) The ``emitFastqs`` command takes as input both an input.fofn for the bas.h5 files as well as a barcode.fofn from a call to labelZmws. The optional parameter ``outDir`` dictates where the files will be written. For each detected barcode, a fast[a|q] file will be emitted with all of the reads for that barcode. The ``trim`` parameter dictates how much of the read should be trimmed off. The default parameter for ``trim`` is the length of the barcode (which is stored in the barcode hdf5 files). At the moment, all barcodes in the barcode FASTA file must be the same length, therefore only a constant trim value is supported. In practice, one can aggressively trim in order to ensure that extra bases aren't left on the ends of reads. Finally, the ``subreads`` parameter dictates whether subreads or CCS reads should be returned with the default being the appropriate reads according to the input file type, either CCS or subreads. 
This parameter is only inspected if the input.fofn contains both CCS and subread data; if the input.fofn contains only subread or CCS data then that is returned irrespective of the state of the ``subreads`` parameter and a warning is issued. consensus --------- usage: pbbarcode consensus [-h] [--subsample SUBSAMPLE] [--nZmws NZMWS] [--outDir OUTDIR] [--keepTmpDir] [--ccsFofn CCSFOFN] [--nProcs NPROCS] [--noQuiver] [--minMaxInsertLength MINMAXINSERTLENGTH] [--hqStartTime HQSTARTTIME] [--minReadScore MINREADSCORE] [--minAvgBarcodeScore MINAVGBARCODESCORE] [--minNumBarcodes MINNUMBARCODES] [--minScoreRatio MINSCORERATIO] [--barcode BARCODE [BARCODE ...]] input.fofn barcode.fofn Compute consensus sequences for each barcode. positional arguments: input.fofn input bas.h5 fofn file barcode.fofn input bc.h5 fofn file optional arguments: -h, --help show this help message and exit --subsample SUBSAMPLE Subsample ZMWs (default: 1) --nZmws NZMWS Take n ZMWs (default: -1) --outDir OUTDIR Use this directory to output results (default: .) --keepTmpDir --ccsFofn CCSFOFN Obtain CCS data from ccsFofn instead of input.fofn (default: ) --nProcs NPROCS Use nProcs to execute. (default: 16) --noQuiver --minMaxInsertLength MINMAXINSERTLENGTH ZMW Filter: exclude ZMW if the longest subreadis less than this amount (default: 0) --hqStartTime HQSTARTTIME ZMW Filter: exclude ZMW if start time of HQ regiongreater than this value (seconds) (default: inf) --minReadScore MINREADSCORE ZMW Filter: exclude ZMW if readScore is less thanthis value (default: 0) --minAvgBarcodeScore MINAVGBARCODESCORE ZMW Filter: exclude ZMW if average barcode score is less than this value (default: 0.0) --minNumBarcodes MINNUMBARCODES ZMW Filter: exclude ZMW if number of barcodes observed is less than this value (default: 1) --minScoreRatio MINSCORERATIO ZMW Filter: exclude ZMWs whose best score divided by the 2nd best score is less than this ratio (default: 1.0) --barcode BARCODE [BARCODE ...] 
Use this to extract consensus for just one barcode. (default: None) The ``consensus`` command takes as input both an input.fofn for the bas.h5 files as well as a barcode.fofn from a call to labelZmws. The results are a FASTA file with an entry for each barcode containing the consensus amplicon sequence. This mode utilizes ``Quiver`` and ``pbdagcon`` to compute consensus. In cases where the amplicon is shorter than 2.5k bases, using CCS data is quite helpful. The ``--ccsFofn`` option allows one to pass the CCS files directly. In many cases, both the CCS and raw basecalls are in the same file so you can check by passing the same parameter to input.fofn as to ccsFofn. Dependencies ```````````` The pbbarcode package depends on a standard pbcore installation (https://github.com/PacificBiosciences/pbcore). If one wishes to use the ``consensus`` tool, ``pbdagcon`` needs to be installed (https://github.com/PacificBiosciences/pbdagcon). Barcode HDF5 File ````````````````` The barcode hdf5 file, ``bc.h5``, represents a simple data store for barcode calls and their scores for each ZMW. Generally, a user need not interact with barcode hdf5 files, but can use the results stored in either the resulting cmp.h5 file or fast[a|q] files. 
The barcode hdf5 file contains the following structure: /BarcodeCalls/best - (nZMWs, 6)[32-bit integer] dataset with the following columns: ``holeNumber,nAdapters,barcodeIdx1,barcodeScore1,barcodeIdx2,barcodeScore2`` Additionally, the ``best`` dataset has the following attributes: +-----------+------------------------------------------------------------------------+ |movieName |m120408_042614_richard_c100309392550000001523011508061222_s1_p0 | | | | +-----------+------------------------------------------------------------------------+ |columnNames|holeNumber,nAdapters,barcodeIdx1,barcodeScore1,barcodeIdx2, | | |barcodeScore2 | +-----------+------------------------------------------------------------------------+ |scoreMode |[symmetric|paired] | | | | +-----------+------------------------------------------------------------------------+ |barcodes |'bc_1', 'bc_2', ...., 'bc_N' | | | | +-----------+------------------------------------------------------------------------+ The two barcodeIdx1 and barcodeIdx2 columns are indices into the ``barcodes`` attribute. The ``scoreMode`` is the scoring mode used to align the barcodes. The ``barcodes`` attribute corresponds to the barcode.fasta sequence names. Additionally, in some circumstances, it is useful to retain the entire history of the scoring, i.e., each barcode scored to each adapter across all ZMWs. In order to retain this information, one must call: ``pbbarcode labelZmws --saveExtendedInfo ...`` In this mode, the resultant HDF5 file will have an additional dataset under the BarcodeCalls group, named: ``all``. This dataset has the following format: /BarcodeCalls/all - (nbarcodes * nadapters[zmw_i], 4) \forall i in 1 ... nZMWs ``holeNumber, adapterIdx, barcodeIdx, score`` The ``adapterIdx`` is the index of the adapter along the molecule, i.e., adapterIdx 1 is the first adapter scored. 
Additions to the compare HDF5 (cmp.h5) File ``````````````````````````````````````````` In addition to the barcode hdf5 file, a call to ``labelAlignments`` will annotate a cmp.h5 file. This annotation is stored in ways consistent with the cmp.h5 file format. Specifically, a new group: | /BarcodeInfo/ | ID (nBarcodeLabels + 1, 1)[32-bit integer] | Name (nBarcodeLabels + 1, 1)[variable length string] In addition to the /BarcodeInfo/ group, the key dataset which assigns alignments to barcodes is located at: /AlnInfo/Barcode (nAlignments, 3)[32-bit integer] with the following colums: ``index,count,bestIndex,bestScore,secondBestIndex,secondBestScore`` Here index refers to the index into the ``Name`` vector, score corresponds to the sum of the scores for the barcodes, and finally, count refers to the number of adapters found in the molecule. pbbarcode-master/doc/conf.py000077500000000000000000000172231231421752300163440ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # pbbarcode documentation build configuration file, created by # sphinx-quickstart on Mon Apr 30 18:28:57 2012. # # This file is execfile()d with the current directory set to its containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. import sys, os # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. #sys.path.insert(0, os.path.abspath('.')) # -- General configuration ----------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. #needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
# Sphinx extensions enabled for this documentation build.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.viewcode']

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix of source filenames.
source_suffix = '.rst'

# The encoding of source files.
#source_encoding = 'utf-8-sig'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = u'pbbarcode'
copyright = u'2012, PacBio'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '.1'
# The full version, including alpha/beta/rc tags.
release = '.1'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None

# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']

# The reST default role (used for this markup: `text`) to use for all documents.
#default_role = None

# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True

# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True

# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []


# -- Options for HTML output ---------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'default'

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}

# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []

# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None

# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None

# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None

# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'

# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True

# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}

# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}

# If false, no module index is generated.
#html_domain_indices = True

# If false, no index is generated.
#html_use_index = True

# If true, the index is split into individual pages for each letter.
#html_split_index = False # If true, links to the reST sources are added to the pages. #html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. #html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. #html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. #html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). #html_file_suffix = None # Output file base name for HTML help builder. htmlhelp_basename = 'pbbarcodedoc' # -- Options for LaTeX output -------------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). #'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). #'pointsize': '10pt', # Additional stuff for the LaTeX preamble. #'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ ('index', 'pbbarcode.tex', u'pbbarcode Documentation', u'PacBio', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of # the title page. #latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. #latex_use_parts = False # If true, show page references after internal links. #latex_show_pagerefs = False # If true, show URL addresses after external links. #latex_show_urls = False # Documents to append as an appendix to all manuals. #latex_appendices = [] # If false, no module index is generated. #latex_domain_indices = True # -- Options for manual page output -------------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). 
man_pages = [ ('index', 'pbbarcode', u'pbbarcode Documentation', [u'PacBio'], 1) ] # If true, show URL addresses after external links. #man_show_urls = False # -- Options for Texinfo output ------------------------------------------------ # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ ('index', 'pbbarcode', u'pbbarcode Documentation', u'PacBio', 'pbbarcode', 'One line description of project.', 'Miscellaneous'), ] # Documents to append as an appendix to all manuals. #texinfo_appendices = [] # If false, no module index is generated. #texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. #texinfo_show_urls = 'footnote' pbbarcode-master/doc/index.rst000066400000000000000000000005001231421752300166710ustar00rootroot00000000000000.. pbbarcode documentation master file, created by sphinx-quickstart on Mon Apr 30 18:28:57 2012. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. pbbarcode ========= Contents: .. 
toctree:: :maxdepth: 2 PbbarcodeFunctionalSpecification pbbarcode-master/etc/000077500000000000000000000000001231421752300150435ustar00rootroot00000000000000pbbarcode-master/etc/barcode.fasta000066400000000000000000000001311231421752300174550ustar00rootroot00000000000000>bc3 tatctatcgtatacgc >bc4 atcacactgcatctga >bc5 acgtacgctcgtcata >bc10 tcatgcacgtctcgct pbbarcode-master/etc/barcode_complete.fasta000066400000000000000000000043671231421752300213640ustar00rootroot00000000000000>bc_1 GCGCTCTGTGTGCAGC >bc_2 TCATGAGTCGACACTA >bc_3 TATCTATCGTATACGC >bc_4 ATCACACTGCATCTGA >bc_5 ACGTACGCTCGTCATA >bc_6 TGTGAGTCAGTACGCG >bc_7 AGAGACACGATACTCA >bc_8 CTGCTAGAGTCTACAG >bc_9 AGCACTCGCGTCAGTG >bc_10 TCATGCACGTCTCGCT >bc_11 AGAGCATCTCTGTACT >bc_12 CGCATCGACTACGCTA >bc_13 CGTAGCGTGCTATCAC >bc_14 ATGCTGATGACTGCGA >bc_15 TGCGTGAGCTGTACAT >bc_16 CGATCATCTATAGACA >bc_17 CGACGTATCTGACAGT >bc_18 CACGTCACTAGAGCGA >bc_19 TGTCGCAGCTACTAGT >bc_20 CATACGCTGTGTAGCA >bc_21 AGTCGCATGACTGTGT >bc_22 CAGTACTGCACGATCG >bc_23 GTGCTGAGCATCAGAC >bc_24 CACTGATCGATATGCA >bc_25 TACAGTGTCTGCTGCG >bc_26 TACAGATAGTGTAGCG >bc_27 TCGTAGAGCTCGAGAC >bc_28 GAGCTGCGCACTCGAT >bc_29 GCGATGTCGCTATGTG >bc_30 CGAGAGTCAGCGCATA >bc_31 TCACGATGAGCACGTA >bc_32 GACTGAGATCATGATC >bc_33 ACGACATGATACTGCT >bc_34 ATACAGCACAGATGTG >bc_35 ACAGTCGATATCTCTC >bc_36 GCTCGATCACATGACG >bc_37 GTCGTACACGTGCGAC >bc_38 ACTCATATCTAGAGTG >bc_39 ACTGATCTGTCGCGCT >bc_40 CACTAGCTCTGACTAC >bc_41 GCTGTCATGTACTAGC >bc_42 TATACATACACGCACT >bc_43 TGTGACGACGCGTCTC >bc_44 GACGTGAGCATGCACT >bc_45 CTCGATACGTGTAGCT >bc_46 GTGTCTAGACAGCTGT >bc_47 GATGCATGCGTACGCA >bc_48 TATCAGAGCAGCGATG >bc_49 TCATATGTAGTACTCT >bc_50 GCGATCTATGCACACG >bc_51 TGCAGTCGAGATACAT >bc_52 GACTCTGCGTCGAGTC >bc_53 TACAGCGACGTCATCG >bc_54 GCGCAGACTACGTGTG >bc_55 GTCTCTGCGATACAGC >bc_56 AGTATGAGATAGCTCG >bc_57 GCGACGAGTACTCATG >bc_58 AGTATCACAGTCGCTG >bc_59 ATCATATGATGCGACA >bc_60 AGACGTAGATCACAGC >bc_61 CGTGTCATGCTACTCA >bc_62 TGTGAGACTGCATGTC >bc_63 GCTCAGTGCGCTACTG >bc_64 
ACTATCGCGCACGCAG >bc_65 TGACACTCTGCACGCG >bc_66 CAGACGTGACTGATAT >bc_67 GCACTGTAGTGATCGT >bc_68 CAGTGCGAGACAGTAG >bc_69 AGTAGTGCTACTCGAC >bc_70 ATGCGAGATCTGCTCA >bc_71 TGAGACATACTGAGTG >bc_72 ATGTGCACTAGTGTAC >bc_73 TCAGCTGACGATGTGA >bc_74 ACTGATGCGCACATGT >bc_75 CTACTCTCAGCAGTGA >bc_76 ATCTACATCACGACTC >bc_77 ATATAGTACAGCGTCT >bc_78 GACACGACTAGATCGC >bc_79 TACGAGTCTGTCATAC >bc_80 ACTCAGCTACATAGTG >bc_81 ACGTATCATAGTGAGA >bc_82 GAGTCGTATCGCTCAT >bc_83 GCGATCACGAGTAGAC >bc_84 CTAGACGTACATGTCG >bc_85 TAGCAGTCACTGTGCG >bc_86 GCTCATGCGATAGCTA >bc_87 GCGCAGTCGTCTGTAT >bc_88 ATGAGCTACGTACAGA >bc_89 GTCGCGAGTCTATCAG >bc_90 ACATCGATCTGCACTA >bc_91 AGTATAGCATAGACGC >bc_92 GTGAGAGCGTGACTCT >bc_93 TGTCAGTAGATGACTC >bc_94 TCGTACGAGATCGACA >bc_95 CTACATGTGACTCGAG >bc_96 GCGCTATAGTGCTCGT pbbarcode-master/etc/pacbio_barcodes_paired.fasta000077500000000000000000000051561231421752300225200ustar00rootroot00000000000000>F_1 GGTAGGCGCTCTGTGTGCAGC >R_1 AGAGTACTACATATGAGATGG >F_2 GGTAGTCATGAGTCGACACTA >R_2 CGTGTGCATAGATCGCGATGG >F_3 GGTAGTATCTATCGTATACGC >R_3 ATGTATCTCGACTGCAGATGG >F_4 GGTAGATCACACTGCATCTGA >R_4 GACTCGACGCAGAGTCGATGG >F_5 GGTAGACGTACGCTCGTCATA >R_5 CGATGACGTCGCTGTAGATGG >F_6 GGTAGTGTGAGTCAGTACGCG >R_6 CACACGTAGTCTGCGCGATGG >F_7 GGTAGAGAGACACGATACTCA >R_7 GCTGTATCGCAGAGACGATGG >F_8 GGTAGCTGCTAGAGTCTACAG >R_8 CGAGCTATCTCATACTGATGG >F_9 GGTAGAGCACTCGCGTCAGTG >R_9 CATGAGTACTCGTCGCGATGG >F_10 GGTAGTCATGCACGTCTCGCT >R_10 CAGCGACTGTGATACTGATGG >F_11 GGTAGAGAGCATCTCTGTACT >R_11 TGTCGCATCATATGATGATGG >F_12 GGTAGCGCATCGACTACGCTA >R_12 GCTGTGATCTACGTCTGATGG >F_13 GGTAGCGTAGCGTGCTATCAC >R_13 TGAGTAGCATGACACGGATGG >F_14 GGTAGATGCTGATGACTGCGA >R_14 GACATGCAGTCTCACAGATGG >F_15 GGTAGTGCGTGAGCTGTACAT >R_15 CAGTAGCGCACTGAGCGATGG >F_16 GGTAGCGATCATCTATAGACA >R_16 CTGCGTGCGCGATAGTGATGG >F_17 GGTAGCGACGTATCTGACAGT >R_17 CGCGTGCAGAGTGTCAGATGG >F_18 GGTAGCACGTCACTAGAGCGA >R_18 ATATCAGTCACGTCTGGATGG >F_19 GGTAGTGTCGCAGCTACTAGT >R_19 ACGATCACTACAGTGCGATGG >F_20 GGTAGCATACGCTGTGTAGCA >R_20 
CTACTGTCTCGCACTGGATGG >F_21 GGTAGAGTCGCATGACTGTGT >R_21 GTCGAGTAGCACTACTGATGG >F_22 GGTAGCAGTACTGCACGATCG >R_22 TGAGCAGATCTCGCATGATGG >F_23 GGTAGGTGCTGAGCATCAGAC >R_23 CACTCAGTATGTCTCAGATGG >F_24 GGTAGCACTGATCGATATGCA >R_24 GTACACTAGTGCACATGATGG >F_25 GGTAGTACAGTGTCTGCTGCG >R_25 TCACATCGTCAGCTGAGATGG >F_26 GGTAGTACAGATAGTGTAGCG >R_26 ACATGTGCGCATCAGTGATGG >F_27 GGTAGTCGTAGAGCTCGAGAC >R_27 TCACTGCTGAGAGTAGGATGG >F_28 GGTAGGAGCTGCGCACTCGAT >R_28 GAGTCGTGATGTAGATGATGG >F_29 GGTAGGCGATGTCGCTATGTG >R_29 AGACGCTGTACTATATGATGG >F_30 GGTAGCGAGAGTCAGCGCATA >R_30 GCGATCTAGTCGTGTCGATGG >F_31 GGTAGTCACGATGAGCACGTA >R_31 GTATGACAGACTCGTAGATGG >F_32 GGTAGGACTGAGATCATGATC >R_32 CACTATGTAGCTGAGTGATGG >F_33 GGTAGACGACATGATACTGCT >R_33 TCTCACTATGATACGTGATGG >F_34 GGTAGATACAGCACAGATGTG >R_34 ATGAGCGATACGACTCGATGG >F_35 GGTAGACAGTCGATATCTCTC >R_35 GTCTACTCGTGATCGCGATGG >F_36 GGTAGGCTCGATCACATGACG >R_36 CGACATGTACGTCTAGGATGG >F_37 GGTAGGTCGTACACGTGCGAC >R_37 CGCACAGTGACTGCTAGATGG >F_38 GGTAGACTCATATCTAGAGTG >R_38 TAGCTATCGCATGAGCGATGG >F_39 GGTAGACTGATCTGTCGCGCT >R_39 ATACAGACGACTGCGCGATGG >F_40 GGTAGCACTAGCTCTGACTAC >R_40 TCTGTACGTAGCTCATGATGG >F_41 GGTAGGCTGTCATGTACTAGC >R_41 CTGATAGACTCGCGACGATGG >F_42 GGTAGTATACATACACGCACT >R_42 TAGTGCAGATCGATGTGATGG >F_43 GGTAGTGTGACGACGCGTCTC >R_43 GCGTCTATGCTATACTGATGG >F_44 GGTAGGACGTGAGCATGCACT >R_44 AGAGTCACGCTCTCACGATGG >F_45 GGTAGCTCGATACGTGTAGCT >R_45 GAGTCATCTACTGACAGATGG >F_46 GGTAGGTGTCTAGACAGCTGT >R_46 TGTCGATCTCGTACGAGATGG >F_47 GGTAGGATGCATGCGTACGCA >R_47 CTCGAGTCACATGTAGGATGG >F_48 GGTAGTATCAGAGCAGCGATG >R_48 ACGAGCACTATAGCGCGATGG pbbarcode-master/setup.py000077500000000000000000000020421231421752300160030ustar00rootroot00000000000000from setuptools import setup, Extension, find_packages import os import sys vFile = 'src/python/pbbarcode/_version.py' if os.path.exists(vFile): lines = open(vFile, 'r').read().splitlines() for line in lines: elts = line.split('=') elts = [e.strip() for e in elts] if len(elts) == 2 and elts[0] == 
'__version__': _ReadVersion = elts[1].replace('\'', '').replace('\"', '') break else: _ReadVersion = '0.0.0' setup( name = 'pbbarcode', version=_ReadVersion, author='pbiDevNet', author_email='pbiDevNet@pacificbiosciences.com', license='LICENSE.txt', packages = find_packages('src/python'), package_dir = {'':'src/python'}, ext_modules=[Extension('pbbarcode/sw', ['src/C/sw.c'], extra_compile_args=["-O3","-shared"])], zip_safe = False, entry_points={ 'console_scripts': [ 'pbbarcode = pbbarcode.main:main'] }, install_requires=[ 'pbcore >= 0.6.3', 'numpy >= 1.6.0', 'h5py >= 1.3.0' ] ) pbbarcode-master/src/000077500000000000000000000000001231421752300150575ustar00rootroot00000000000000pbbarcode-master/src/C/000077500000000000000000000000001231421752300152415ustar00rootroot00000000000000pbbarcode-master/src/C/Makefile000066400000000000000000000002531231421752300167010ustar00rootroot00000000000000.PHONY: clean all SHELL = /bin/bash -e all: build/sw.so build/sw.so: sw.c mkdir -p ./build;\ gcc -O4 -DGETPROB -shared -fPIC sw.c -o build/sw.so clean: rm -rf build pbbarcode-master/src/C/sw.c000066400000000000000000000026121231421752300160370ustar00rootroot00000000000000#include #include #include #define M 64 #define N 64 #define MAX(x,y) (((x) > (y)) ? (x) : (y)) int* allocate_dp_mat() { return (int*) calloc(N*M, sizeof(int)); } int compute_align_score(int* dp_mat, char* tSeq, char* qSeq) { int ipenalty = -1; int dpenalty = -1; int match = 2; int mpenalty = -2; int best_score = 0; int iscore = 0; int dscore = 0; int mscore = 0; int i,j; memset(dp_mat, 0, M*N*sizeof(int)); for (i = 1; i < strlen(tSeq) + 1; i++) { for (j = 1; j < strlen(qSeq) + 1; j++) { iscore = dp_mat[i*M + j-1] + ipenalty; dscore = dp_mat[(i-1)*M + j] + dpenalty; mscore = dp_mat[(i-1)*M + j-1] + ((tSeq[i-1] == qSeq[j-1]) ? 
match : mpenalty); dp_mat[i*M + j] = MAX(MAX(0, iscore), MAX(dscore, mscore)); if (dp_mat[i*M + j] >= best_score) best_score = dp_mat[i*M + j]; } } return best_score; } void compute_align_scores(int* scores, int n, int* dp_mat, char* tSeq, char** qSeqs) { int i = 0; for (i; i < n; i++) { scores[i] = compute_align_score(dp_mat, tSeq, qSeqs[i]); } } void print_dp_mat(int* dp_mat, char* tSeq, char* qSeq) { int i,j; for (j = 0; j < strlen(qSeq) + 1; j++) { for (i = 0; i < strlen(tSeq) + 1; i++) { printf("%d ", dp_mat[i*M + j]); } printf("\n"); } } pbbarcode-master/src/python/000077500000000000000000000000001231421752300164005ustar00rootroot00000000000000pbbarcode-master/src/python/pbbarcode/000077500000000000000000000000001231421752300203215ustar00rootroot00000000000000pbbarcode-master/src/python/pbbarcode/BarcodeLabeler.py000077500000000000000000000233741231421752300235350ustar00rootroot00000000000000#################################################################################$$ # Copyright (c) 2011,2012, Pacific Biosciences of California, Inc. # # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * Neither the name of Pacific Biosciences nor the names of its contributors # may be used to endorse or promote products derived from this software # without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #################################################################################$$ import logging from pbcore.io import BasH5Reader, BaxH5Reader from pbcore.io.FastaIO import * import pbbarcode.SWaligner as Aligner import numpy as n from pbcore.io.BarcodeH5Reader import LabeledZmw, \ BARCODE_DELIMITER __RC_MAP__ = dict(zip('ACGTacgt-N','TGCAtgca-N')) class BarcodeScorer(object): def __init__(self, basH5, barcodeFasta, adapterSidePad = 0, insertSidePad = 4, scoreMode = 'symmetric', maxHits = 10, scoreFirst = False, startTimeCutoff = 1): """A BarcodeScorer object scores ZMWs and produces summaries of the scores. Various parameters control the behavior of the object, specifically the padding allows the user to add a little extra on each side of the adapter find for safety. 
The most relevant parameter is the scoreMode which dictates how the barcodes are scored, either paired or symmetric.""" self.basH5 = basH5 self.barcodeFasta = list(barcodeFasta) self.aligner = Aligner.SWaligner() self.barcodeLength = n.unique(map(lambda x : len(x.sequence), self.barcodeFasta)) if len(self.barcodeLength) > 1: raise Exception("Currently, all barcodes must be the same length.") else: self.barcodeLength = int(self.barcodeLength) self.barcodeSeqs = [(barcode.sequence.upper(), self._rc(barcode.sequence.upper())) for barcode in self.barcodeFasta] self.adapterSidePad = adapterSidePad self.insertSidePad = insertSidePad self.maxHits = maxHits if scoreMode not in ['symmetric', 'paired']: raise Exception("scoreMode must either be symmetric or paired") self._scoreMode = scoreMode self.scoreFirst = scoreFirst self.startTimeCutoff = startTimeCutoff self.forwardScorer = self.aligner.makeScorer([x[0] for x in self.barcodeSeqs]) self.reverseScorer = self.aligner.makeScorer([x[1] for x in self.barcodeSeqs]) logging.debug(("Constructed BarcodeScorer with scoreMode: %s," + \ "adapterSidePad: %d, insertSidePad: %d, and scoreFirst: %r") \ % (scoreMode, adapterSidePad, insertSidePad, scoreFirst)) @property def movieName(self): return self.basH5.movieName def makeBCLabel(self, s1, s2): return BARCODE_DELIMITER.join((s1, s2)) @property def barcodeLabels(self): """The barcode labels are function of the barcodeNames and the scoreMode, they represent the user-visible names.""" if self.scoreMode == 'paired': return n.array([self.makeBCLabel(self.barcodeFasta[i].name, self.barcodeFasta[i+1].name) for i in xrange(0, len(self.barcodeSeqs), 2)]) else: return n.array([self.makeBCLabel(x.name, x.name) for x in self.barcodeFasta]) @property def barcodeNames(self): """The barcode names are the FASTA names""" return n.array([x.name for x in self.barcodeFasta]) @property def scoreMode(self): return self._scoreMode def _rc(self, s): return "".join([__RC_MAP__[c] for c in s[::-1]]) def 
_flankingSeqs(self, zmw): def fromRange(rStart, rEnd): try: qSeqLeft = zmw.read(rStart - (self.barcodeLength + self.insertSidePad), rStart + self.adapterSidePad).basecalls() except IndexError: qSeqLeft = None try: qSeqRight = zmw.read(rEnd - self.adapterSidePad, rEnd + self.barcodeLength + self.insertSidePad).basecalls() except IndexError: qSeqRight = None return (qSeqLeft, qSeqRight) adapterRegions = zmw.adapterRegions if len(adapterRegions) > self.maxHits: adapterRegions = adapterRegions[0:self.maxHits] seqs = [fromRange(start, end) for (start, end) in adapterRegions] # We only score the first barcode if we don't find any adapters # *and* the start time is less than the threshold. scoredFirst = False if self.scoreFirst and not len(seqs): s = zmw.zmwMetric('HQRegionStartTime') e = zmw.zmwMetric('HQRegionEndTime') # s has HQ. if s < e and s <= self.startTimeCutoff: l = self.barcodeLength + self.insertSidePad l = l if zmw.hqRegion[1] > l else zmw.hqRegion[1] try: bc = zmw.read(0, l).basecalls() if len(bc) >= self.barcodeLength: seqs.insert(0, (bc, None)) scoredFirst = True except IndexError: pass return (seqs, scoredFirst) def labelZmws(self, holeNumbers): """Return a list of LabeledZmws for input holeNumbers""" def scoreZmw(zmw): adapters, scoredFirst = self._flankingSeqs(zmw) adapterScores = [[]]*len(adapters) barcodeScores = n.zeros(len(self.barcodeSeqs)) for i,adapter in enumerate(adapters): fscores = self.forwardScorer(adapter[0]) rscores = self.reverseScorer(adapter[0]) ffscores = self.forwardScorer(adapter[1]) rrscores = self.reverseScorer(adapter[1]) scored = 2.0 if adapter[0] and adapter[1] else \ 1.0 if adapter[0] or adapter[1] else 0 # An adapter score is the average barcode score for # each barcode -- that way, you can compare across # adapters even if the different adapters have # different numbers of flanking sequence. 
if scored == 0: adapterScores[i] = barcodeScores else: adapterScores[i] = n.maximum((fscores + rrscores)/scored, (rscores + ffscores)/scored) barcodeScores = reduce(lambda x, y: x + y, adapterScores) if adapterScores \ else n.zeros(len(self.barcodeSeqs)) return (zmw.holeNumber, len(adapters), barcodeScores, adapterScores, scoredFirst) # o here is the record immediately above. def chooseSymmetric(o): p = n.argsort(-o[2]) return LabeledZmw(o[0], o[1], p[0], o[2][p[0]], p[1], o[2][p[1]], o[3]) def choosePaired(o): if o[1] == 1: s = n.array([max(o[2][i], o[2][i + 1]) for i in \ xrange(0, len(self.barcodeSeqs), 2)]) p = n.argsort(-s) s = s[p] else: # score the pairs by scoring the two alternate # ways they could have been put on the molecule. A # missed adapter will confuse this computation. scores = o[3] results = n.zeros(len(self.barcodeSeqs)/2) for i in xrange(0, len(self.barcodeSeqs), 2): pths = [0,0] for j in xrange(0, len(scores)): pths[j % 2] += scores[j][i] pths[1 - j % 2] += scores[j][i + 1] results[i/2] = max(pths) p = n.argsort(-results) s = results[p] return LabeledZmw(o[0], o[1], p[0], s[0], p[1], s[1], o[3]) if self.scoreMode == 'symmetric': choose = chooseSymmetric elif self.scoreMode == 'paired': choose = choosePaired else: raise Exception("Unsupported scoring mode in BarcodeLabeler.py") scored = [scoreZmw(self.basH5[zmw]) for zmw in holeNumbers] return [choose(scoreTup) for scoreTup in scored if scoreTup[1]] pbbarcode-master/src/python/pbbarcode/SWaligner.py000077500000000000000000000060321231421752300225720ustar00rootroot00000000000000#################################################################################$$ # Copyright (c) 2011,2012, Pacific Biosciences of California, Inc. # # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * Neither the name of Pacific Biosciences nor the names of its contributors # may be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #################################################################################$$ from ctypes import * import os import numpy import pkg_resources class SWaligner(object): def __init__(self): # setup.py should put sw.so in the following path. 
self.SW_DLL_PATH = os.path.dirname(os.path.abspath(__file__)) + os.path.sep + "sw.so" self._dll = CDLL(self.SW_DLL_PATH) self.dpMat = self._dll.allocate_dp_mat() def score(self, tSeq, qSeq): return self._dll.compute_align_score(self.dpMat, tSeq, qSeq) def makeScorer(self, targets): ScoreType = c_int * len(targets) scores = ScoreType() for i in range(0, len(scores)): scores[i] = 0 TargetType = c_char_p * len(targets) targetSeqs = TargetType() for i in range(0, len(targetSeqs)): targetSeqs[i] = targets[i] targetLen = len(targets) def scorer(query): if not query: return numpy.zeros(len(targets)) self._dll.compute_align_scores(scores, targetLen, self.dpMat, query, targetSeqs) return numpy.array([scores[i] for i in xrange(0, len(scores))]) return scorer pbbarcode-master/src/python/pbbarcode/__init__.py000077500000000000000000000000001231421752300224230ustar00rootroot00000000000000pbbarcode-master/src/python/pbbarcode/_version.py000077500000000000000000000000241231421752300225160ustar00rootroot00000000000000__version__='0.8.0' pbbarcode-master/src/python/pbbarcode/main.py000077500000000000000000001000021231421752300216130ustar00rootroot00000000000000#!/usr/bin/env python #################################################################################$$ # Copyright (c) 2011,2012, Pacific Biosciences of California, Inc. # # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. 
# * Neither the name of Pacific Biosciences nor the names of its contributors # may be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #################################################################################$$ import os import sys import argparse import logging import tempfile import shutil import pkg_resources import re import subprocess import random import shutil from multiprocessing import Pool import h5py as h5 import numpy as n from pbcore.util.ToolRunner import PBMultiToolRunner from pbcore.io import BaxH5Reader, BasH5Reader from pbcore.io import CmpH5Reader, CmpH5Alignment from pbcore.io.BarcodeH5Reader import * from pbcore.io import FastaReader, FastqWriter, FastqRecord, \ FastaWriter, FastaRecord from pbbarcode.BarcodeLabeler import * from pbbarcode._version import __version__ from pbh5tools.CmpH5Utils import copyAttributes # Paths to the Barcode Datasets in the cmp.h5 file. 
BC_ALN_INFO_DS = "AlnInfo/Barcode" BC_INFO_NAME = "BarcodeInfo/Name" BC_INFO_ID = "BarcodeInfo/ID" SCORE_MODES = ['symmetric', 'paired'] BAS_PLS_REGEX = r'\.ba[x|s]\.h5$|\.pl[x|s]\.h5$|\.cc[x|s]\.h5$' BARCODE_EXT = '.bc.h5' BC_REGEX = r'\.bc\.h5' def movieNameFromFile(fn): return re.sub('|'.join((BC_REGEX, BAS_PLS_REGEX)) , '', os.path.basename(fn)) def makeBarcodeH5FromBasH5(basH5): """The workhorse function for creating a barcode H5 file from a base H5 file.""" labeler = BarcodeScorer(basH5, FastaReader(runner.args.barcodeFile), runner.args.adapterSidePad, runner.args.insertSidePad, scoreMode = runner.args.scoreMode, maxHits = runner.args.maxAdapters, scoreFirst = runner.args.scoreFirst, startTimeCutoff = runner.args.startTimeCutoff) if runner.args.nZmws < 0: zmws = basH5.sequencingZmws else: zmws = basH5.sequencingZmws[0:runner.args.nZmws] logging.debug("Labeling %d ZMWs from: %s" % (len(zmws), basH5.filename)) labeledZmws = labeler.labelZmws(zmws) logging.debug("Labeled %d ZMWs" % len(labeledZmws)) outBase = re.sub(BAS_PLS_REGEX, BARCODE_EXT, os.path.basename(basH5.filename)) outFile = '/'.join((runner.args.outDir, outBase)) logging.debug("Writing to: %s" % outFile) writeBarcodeH5(labeledZmws, labeler, outFile, runner.args.saveExtendedInfo) return outFile def mpWrapper(f): return makeBarcodeH5FromBasH5(BasH5Reader(f)) def makeBarcodeFofnFromBasFofn(): inputFofn = runner.args.inputFile inFiles = open(inputFofn).read().splitlines() if not all(map(os.path.exists, inFiles)): raise IOError("All files in input.fofn must exist.") logging.debug("Using %d processes." 
% runner.args.nProcs) if runner.args.nProcs <= 1: newFiles = map(mpWrapper, inFiles) else: pool = Pool(runner.args.nProcs) newFiles = pool.map(mpWrapper, inFiles) oFile = open(runner.args.outFofn, 'w') for nF in newFiles: oFile.write(nF + "\n") oFile.close() def labelAlignments(): logging.info("Labeling alignments using: %s" % runner.args.inputFofn) bcFofn = BarcodeH5Fofn(runner.args.inputFofn) with CmpH5Reader(runner.args.cmpH5) as cmpH5: bcDS = n.zeros((len(cmpH5), 5), dtype = "int32") for (i, aln) in enumerate(cmpH5): bcReader = bcFofn.readerForMovie(aln.movieInfo.Name) try: lZmw = bcReader.labeledZmwFromHoleNumber(aln.HoleNumber) if lZmw.nScored < runner.args.minNumBarcodes or \ lZmw.averageScore < runner.args.minAvgBarcodeScore or \ lZmw.scoreRatio < runner.args.minScoreRatio: lZmw = None except KeyError: lZmw = None if lZmw: bcDS[i,:] = n.array([lZmw.nScored, lZmw.bestIdx, lZmw.bestScore, lZmw.secondBestIdx, lZmw.secondBestScore]) else: # either no barcode was found for this guy or they got # filtered, hence the NULL_BARCODE bcDS[i,:] = n.array([0, len(bcReader.barcodeLabels), 0, len(bcReader.barcodeLabels), 0]) # write to the cmp.h5 file. H5 = h5.File(runner.args.cmpH5, 'r+') if BC_INFO_ID in H5: del H5[BC_INFO_ID] if BC_INFO_NAME in H5: del H5[BC_INFO_NAME] # we use the first one to get the labels, if somehow they # don't have all of the same stuff that will be an issue. 
bcLabels = n.concatenate((bcFofn.barcodeLabels, n.array([BARCODE_DELIMITER]))) H5.create_dataset(BC_INFO_ID, data = n.array(range(0, len(bcLabels))), dtype = 'int32') H5.create_dataset(BC_INFO_NAME, data = bcLabels, dtype = h5.new_vlen(str)) if BC_ALN_INFO_DS in H5: del H5[BC_ALN_INFO_DS] bcDS = H5.create_dataset(BC_ALN_INFO_DS, data = bcDS, dtype = 'int32') bcDS.attrs['ColumnNames'] = n.array(['count', 'index1', 'score1', 'index2', 'score2']) #force BarcodeMode to have numpy dtype for CmpH5Sort 'extra datasets' routine bcDS.attrs['BarcodeMode'] = n.array( bcFofn.scoreMode ) H5.close() def zipFofns(*inFofns): """Take inputFofns and return n tuples of length len(inFofns) where n is the number of entries in each FOFN.""" def readAndSort(inFile): lines = n.array(open(inFile).read().splitlines()) lines = lines[n.array(n.argsort([movieNameFromFile(fofnLine) for fofnLine in lines]))] return lines sortedFofns = [readAndSort(inFofn) for inFofn in inFofns] l = map(len, sortedFofns) if len(n.unique(l)) != 1: raise Exception("Fofns don't match, unequal number of inputs.") else: for i in xrange(0, n.unique(l)): if len(n.unique([movieNameFromFile(sortedFofn[i]) for sortedFofn in sortedFofns])) != 1: raise Exception("Fofn elements don't match, movies differ.") # need to un-arrayify these guys return zip(*map(list, sortedFofns)) def filterZmws(zmwsForBCs): """Apply various filterings passed by the user. 
There are somewhat different semantics for CCS filtering and subread filtering in terms of the raw primary metrics available, e.g., HQRegionStartTime is unavailable for the CCS data and somewhat irrelevant.""" def getHQStart(zmw): try: return zmw.zmwMetric('HQRegionStartTime') except: return 0 def getReadScore(zmw): return zmw.zmwMetric("ReadScore") def molLenGuess(zmw): if zmw.baxH5.hasRawBasecalls: return max(map(len, zmw.subreads)) if zmw.subreads else 0 else: return len(zmw.ccsRead) if zmw.ccsRead else 0 def zmwFilterFx(tup): zmw, lZmw = tup mlGuess = molLenGuess(zmw) if not mlGuess: return False avgScore = lZmw.averageScore numScored = lZmw.nScored scoreRatio = lZmw.scoreRatio hqStart = getHQStart(zmw) readScore = getReadScore(zmw) ## XXX : still need to detect the chimeras if mlGuess < runner.args.minMaxInsertLength or \ hqStart > runner.args.hqStartTime or \ readScore < runner.args.minReadScore or \ avgScore < runner.args.minAvgBarcodeScore or \ numScored < runner.args.minNumBarcodes or \ scoreRatio < runner.args.minScoreRatio: return False else: return True return { k:filter(zmwFilterFx, v) for k,v in zmwsForBCs.items() } def _warnOnce(): var = [] def warnOnce(msg): if not var: logging.warn(msg) var.append(1) return warnOnce warnOnce = _warnOnce() def getFastqRecords(zmw, lZmw = None): if zmw.baxH5.hasRawBasecalls and zmw.baxH5.hasConsensusBasecalls: # Only examine this parameter when passed both. 
if runner.args.subreads: reads = zmw.subreads else: reads = [zmw.ccsRead] elif zmw.baxH5.hasRawBasecalls: if runner.args.subreads: warnOnce("`subreads` argument is ignored when using >= 2.1" + "bas.h5 data as input.") reads = zmw.subreads else: if runner.args.subreads: warnOnce("`subreads` argument is ignored when using >= 2.1" + "ccs.h5 data as input.") reads = [zmw.ccsRead] extra = (" %g %g" % (round(zmw.zmwMetric("ReadScore"), 2), round(lZmw.averageScore, 2))) if lZmw else "" return [FastqRecord(read.readName + extra, read.basecalls(), read.QualityValue()) for read in reads if read] def getFastqs(): zmwsByBarcode = getZmwsForBarcodes() logging.debug("Pre-filter: Average number of ZMWs per barcode: %d" % n.mean([len(zmwsByBarcode[k]) for k in zmwsByBarcode.keys()])) zmwsByBarcode = filterZmws(zmwsByBarcode) logging.debug("Post-filter: Average number of ZMWs per barcode: %d" % n.mean([len(zmwsByBarcode[k]) for k in zmwsByBarcode.keys()])) def getReadData(zmws): recs = [getFastqRecords(zmw,lZmw) for zmw,lZmw in zmws] recs = filter(lambda x : x, recs) return [elt for sublst in recs for elt in sublst] return {k:getReadData(zmws) for k, zmws in zmwsByBarcode.iteritems()} def emitFastqs(): outFiles = getFastqs() outDir = runner.args.outDir fasta = runner.args.fasta if runner.args.unlabeledZmws: outFiles['UNLABELED'] = getUnlabeledZmws() if not os.path.exists(runner.args.outDir): os.makedirs(runner.args.outDir) if fasta: writer = FastaWriter def record(n, s, qv): return FastaRecord(n, s) else: writer = FastqWriter record = FastqRecord l = 'a' if runner.args.fasta else 'q' for k in outFiles.keys(): if outFiles[k]: with writer("%s/%s.fast%s" % (runner.args.outDir, k, l)) as w: for e in outFiles[k]: tlen = len(e.sequence)-runner.args.trim r = record(e.name, e.sequence[runner.args.trim:tlen], e.quality[runner.args.trim:tlen]) if r: w.writeRecord(r) def getUnlabeledZmws(): """Return FASTQ records for ZMWs which do not have a barcode label""" unlabeledZmws = [] for basFile, 
barcodeFile in zipFofns(runner.args.inputFofn, runner.args.barcodeFofn): basH5 = BasH5Reader(basFile) bcH5 = BarcodeH5Reader(barcodeFile) sdiff = basH5.sequencingZmws[~n.in1d(basH5.sequencingZmws, bcH5.labeledZmws.keys())] for hn in sdiff: unlabeledZmws.append(basH5[hn]) return reduce(lambda x,y : x+y, [getFastqRecords(unlabeledZmw) for unlabeledZmw in unlabeledZmws]) def getZmwsForBarcodes(labels = None): """dictionary of pbcore.io.Zmw and LabeledZmw indexed by barcode label""" zmwsForBCs = {} for basFile, barcodeFile in zipFofns(runner.args.inputFofn, runner.args.barcodeFofn): basH5 = BasH5Reader(basFile) bcH5 = BarcodeH5Reader(barcodeFile) allLabs = bcH5.barcodeLabels if labels: allLabs = [x for x in allLabs if x in labels] logging.info("Processing only: %s" % ",".join(allLabs)) for label in allLabs: lZmws = bcH5.labeledZmwsFromBarcodeLabel(label) for lZmw in lZmws: zmw = basH5[lZmw.holeNumber] if not label in zmwsForBCs.keys(): zmwsForBCs[label] = [] zmwsForBCs[label].append((zmw, lZmw)) return zmwsForBCs def gconFunc(tp): # called bcause multiprocess rootDir, barcode = tp bcdir = "/".join((rootDir, barcode)) ## call gcon logging.info("In gconFunc for: %s" % barcode) cmd = "gcon.py r --min_cov 3 %s/subreads.fasta %s/seed_read.fasta -d %s" % \ (bcdir, bcdir, bcdir) subprocess.call(cmd, shell = True) ## check to see if the file is empty r = FastaReader("%s/g_consensus.fa" % bcdir) if not list(r)[0].sequence: return None ## check to see if we are going to run quiver if not runner.args.noQuiver: # setup the blasr / sam / quiver stuff. 
logging.info("Setup regions file, now running blasr through quiver.") cmd = ('blasr %s %s/g_consensus.fa -nproc 1 -sam -regionTable %s/region.fofn -out ' + \ '%s/aligned_reads.sam') % (runner.args.inputFofn, bcdir, bcdir, bcdir) logging.debug(cmd) subprocess.call(cmd, shell = True) cmd = 'samtoh5 %s/aligned_reads.sam %s/g_consensus.fa %s/aligned_reads.cmp.h5' % \ (bcdir, bcdir, bcdir) logging.debug(cmd) subprocess.call(cmd, shell = True) cmd = ('loadPulses %s %s/aligned_reads.cmp.h5 -byread -metrics ' + \ 'QualityValue,InsertionQV,MergeQV,DeletionQV,DeletionTag,SubstitutionTag,' + \ 'SubstitutionQV') % (runner.args.inputFofn, bcdir) logging.debug(cmd) subprocess.call(cmd, shell = True) cmd = 'cmph5tools.py sort --inPlace %s/aligned_reads.cmp.h5' % bcdir logging.debug(cmd) subprocess.call(cmd, shell = True) cmd = ('quiver -vv --algorithm quiver -p P4-C2.AllQVsMergingByChannelModel ' \ '%s/aligned_reads.cmp.h5 --outputFilename %s/q_consensus.fasta ' + \ '--referenceFilename %s/g_consensus.fa') % (bcdir, bcdir, bcdir) logging.debug(cmd) subprocess.call(cmd, shell = True) cFilename = 'q_consensus.fasta' else: cFilename = 'g_consensus.fa' ## append results to output file. 
bcCons = "%s/%s/%s" % (rootDir, barcode, cFilename) if os.path.exists(bcCons): return FastaRecord(barcode, list(FastaReader(bcCons))[0].sequence) else: return None def subsampleReads(e): logging.debug("starting with %d zmws" % len(e)) if runner.args.nZmws > 0: k = runner.args.nZmws if runner.args.nZmws < len(e) else len(e) elif runner.args.subsample < 1: k = int(len(e)*runner.args.subsample) else: k = len(e) i = n.array(random.sample(range(0, len(e)), k), dtype = int) logging.debug("subsampled down to: %d" % len(i)) return [e[j] for j in i] def callConsensus(): def makeReadAndReads(zmwsForBC): ccsData = filter(lambda x:x, [zmw.ccsRead for _,_,zmw in zmwsForBC if zmw]) srData = reduce(lambda x,y : x+y, [zmw.subreads for zmw,_,_ in zmwsForBC if zmw], []) if not srData and not ccsData: return (None,None) def getSeedRead(reads, lq = 80, uq = 90, sLambda = lambda x : -x.zmw.readScore): lens = map(len, reads) candidateRange = (n.percentile(lens, lq), n.percentile(lens, uq)) pfReads = [read for read,l in zip(reads, lens) if l >= candidateRange[0] and l <= candidateRange[1]] pfReads.sort(key = sLambda) return pfReads[0] if len(pfReads) else None if ccsData: ## all CCS reads should be the *same* length for an ## amplicon. Let's take the middle ones seedRead = getSeedRead(ccsData, lq = 30, uq = 70, sLambda = lambda x: -x.zmw.numPasses) if not seedRead: seedRead = getSeedRead(srData) logging.info("Unable to use a CCS read for the seed read.") else: logging.info("Using a CCS read for the seed read.") else: logging.info("Using a raw read for the seed read") seedRead = getSeedRead(srData) return (seedRead, srData) # check to make sure that you have the necessary dependencies, # i.e., hgap script, blasr, etc. 
try: import pbtools.pbdagcon except ImportError: raise ImportError("Unable to find dependency `pbdagcon` - please install.") # retrieve ZMWs by barcode if runner.args.barcode: zmwsForBCs = getZmwsForBarcodes(runner.args.barcode) else: zmwsForBCs = getZmwsForBarcodes() # subsample zmwsForBCs = {k:subsampleReads(v) for k,v in zmwsForBCs.items()} logging.info("unfiltered average zmws per barcode: %g" % n.round(n.mean(map(len, zmwsForBCs.values())))) # filter ZMWs zmwsForBCs = filterZmws(zmwsForBCs) logging.info("filtered average zmws per barcode: %g" % n.round(n.mean(map(len, zmwsForBCs.values())))) # now choose the best subread to seed the assembly if runner.args.ccsFofn: # XXX: This part depends on the filenames of the ccs and input # fofns, this is essentially a workaround to the fact the the # part isn't part of the API ccsReaders = {movieNameFromFile(l):BasH5Reader(l) for l in open(runner.args.ccsFofn).read().splitlines()} # fill in the CCS spot. for k,v in zmwsForBCs.items(): l = [] for zmw,lZmw in v: r = ccsReaders[movieNameFromFile(zmw.baxH5.file.filename)] l.append((zmw,lZmw,r[zmw.holeNumber])) zmwsForBCs[k] = l else: # add none to the CCS spot. zmwsForBCs = {k:[(zmw,lZmw,None) for zmw,lZmw in v] for k,v in zmwsForBCs.iteritems()} readAndReads = { k:makeReadAndReads(v) for k,v in zmwsForBCs.items() } # remove barcodes that don't have a seed read and a set of useable reads. 
readAndReads = { k:v for k,v in readAndReads.items() if v[0] and v[1] } # generate FASTA files outDir = runner.args.outDir for barcode, reads in readAndReads.items(): bcdir = '/'.join((outDir, barcode)) if not os.path.exists(bcdir): os.makedirs(bcdir) # emit the seeds to separte files with FastaWriter("%s/seed_read.fasta" % bcdir) as w: w.writeRecord(FastaRecord(reads[0].readName, reads[0].basecalls())) subreads = reads[1] # emit the subreads to a single file with FastaWriter("%s/subreads.fasta" % bcdir) as w: for r in subreads: w.writeRecord(FastaRecord(r.readName, r.basecalls())) # construct the region file by subsetting the ZMWs that you # are interested in. nfofn = [] for inFof, in zipFofns(runner.args.inputFofn): bh5 = BaxH5Reader(inFof) reg = bh5.file['/PulseData/Regions'] inMovie = filter(lambda z : z.baxH5.movieName == bh5.movieName, subreads) holes = n.in1d(reg[:,0], n.array([a.holeNumber for a in inMovie])) if any(holes): nreg = reg[holes,:] else: nreg = n.empty(shape = (0, reg.shape[1]), dtype = 'int32') fname = "%s/%s.rgn.h5" % (bcdir, movieNameFromFile(inFof)) nfile = h5.File(fname, 'w') ndset = nfile.create_dataset('/PulseData/Regions', data = nreg, maxshape = (None, None)) copyAttributes(reg, ndset) nfile.close() nfofn.append(fname) ofile = open('%s/region.fofn' % bcdir, 'w') ofile.writelines("\n".join(nfofn)) ofile.close() ## call gcon outDirs = [ (outDir, k) for k in readAndReads.keys() ] if runner.args.nProcs == 1: outFasta = filter(lambda z: z, map(gconFunc, outDirs)) else: pool = Pool(runner.args.nProcs) outFasta = filter(lambda z : z, pool.map(gconFunc, outDirs)) ## write the results with FastaWriter('/'.join((outDir, "consensus.fa"))) as w: for r in outFasta: w.writeRecord(r) ## optionally cleanup if not runner.args.keepTmpDir: for barcode, reads in readAndReads.items(): bcdir = '/'.join((outDir, barcode)) shutil.rmtree(bcdir) class Pbbarcode(PBMultiToolRunner): def __init__(self): desc = ['Utilities for labeling and annoting reads with 
barcode information.'] super(Pbbarcode, self).__init__('\n'.join(desc)) subparsers = self.subParsers desc = ['Creates a barcode.h5 file from base h5 files.'] parser_m = subparsers.add_parser('labelZmws', description = "\n".join(desc), help = 'Label zmws with barcode annotation', formatter_class = \ argparse.ArgumentDefaultsHelpFormatter) parser_m.add_argument('--outDir', help = 'Where to write the newly created barcode.h5 files.', default = os.getcwd()) parser_m.add_argument('--outFofn', help = 'Write to outFofn', default = 'barcode.fofn') parser_m.add_argument('--adapterSidePad', help = 'Pad with adapterSidePad bases', default = 4, type = int) parser_m.add_argument('--insertSidePad', help = 'Pad with insertSidePad bases', default = 4, type = int) parser_m.add_argument('--scoreMode', help = 'The mode in which the barcodes should be scored.', choices = SCORE_MODES, default = 'symmetric', type = str) parser_m.add_argument('--maxAdapters', type = int, default = 20, help = 'Only score the first maxAdapters') parser_m.add_argument('--scoreFirst', action = 'store_true', default = False, help = 'Whether to try to score the leftmost barcode in a trace.') parser_m.add_argument('--startTimeCutoff', help = 'Reads must start before this value in order to be ' + \ 'included when scoreFirst is set.', type = float, default = 10.0) parser_m.add_argument('--nZmws', type = int, default = -1, help = 'Use the first n ZMWs for testing') parser_m.add_argument('--nProcs', type = int, default = 8, help = 'How many processes to use') parser_m.add_argument('--saveExtendedInfo', action = 'store_true', default = False,\ help = 'Whether to save extended information to' + \ 'the barcode.h5 files; this information is useful for ' + \ 'debugging and chimera detection') parser_m.add_argument('barcodeFile', metavar = 'barcode.fasta', help = 'Input barcode fasta file') parser_m.add_argument('inputFile', metavar = 'input.fofn', help = 'Input base fofn') def addFilteringOpts(parser, justBarcode = 
False): ## These are independent of the barcode scoring if not justBarcode: parser.add_argument('--minMaxInsertLength', default = 0, type = int, help = "ZMW Filter: exclude ZMW if the longest subread" + \ "is less than this amount") parser.add_argument('--hqStartTime', default = float("inf"), type = float, help = "ZMW Filter: exclude ZMW if start time of HQ region" + \ "greater than this value (seconds)") parser.add_argument('--minReadScore', default = 0, type = float, help = "ZMW Filter: exclude ZMW if readScore is less than" + \ "this value") ## These obviously need the barcode score parser.add_argument('--minAvgBarcodeScore', default = 0.0, type = float, help = "ZMW Filter: exclude ZMW if average barcode score " + \ "is less than this value") parser.add_argument('--minNumBarcodes', default = 1, type = int, help = "ZMW Filter: exclude ZMW if number of barcodes observed " + \ "is less than this value") parser.add_argument('--minScoreRatio', default = 1.0, type = float, help = "ZMW Filter: exclude ZMWs whose best score divided by " + \ "the 2nd best score is less than this ratio") # Not yet implemented # parser.add_argument('--filterChimeras', default = False, action = 'store_true', # help = "ZMW Filter: exclude ZMWs that appear to be chimeric") desc = ['Adds information about barcode alignments to a cmp.h5 file', 'from a previous call to "labelZmws".'] parser_s = subparsers.add_parser('labelAlignments', description = "\n".join(desc), help = "Label reads from a barcode or region h5 file", formatter_class=argparse.ArgumentDefaultsHelpFormatter) addFilteringOpts(parser_s, justBarcode = True) parser_s.add_argument('inputFofn', metavar = 'barcode.fofn', help = 'input barcode fofn file') parser_s.add_argument('cmpH5', metavar = 'aligned_reads.cmp.h5', help = 'cmp.h5 file to add barcode labels') desc = ['Takes a bas.h5 fofn and a barcode.h5 fofn and produces', 'a fast[a|q] file for each barcode.'] parser_s = subparsers.add_parser('emitFastqs', description = 
"\n".join(desc), help = "Write fastq files", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_s.add_argument('--outDir', metavar = 'output.dir', help = 'output directory to write fastq files', default = os.getcwd()) parser_s.add_argument('--subreads', help = 'whether to produce fastq files for the subreads;' + \ 'the default is to use the CCS reads. This option only' + \ 'applies when input.fofn has both consensus and raw reads,' + \ 'otherwise the read type from input.fofn will be returned.', action = 'store_true', default = False) parser_s.add_argument('--unlabeledZmws', help = 'whether to emit a fastq file for the unlabeled ZMWs.' + \ ' These are the ZMWs where no adapters are found typically', action = 'store_true', default = False) parser_s.add_argument('--trim', help = 'trim off barcodes and any excess constant sequence', default = 20, type = int) parser_s.add_argument('--fasta', help = ('whether the files produced should be FASTA files as' + 'opposed to FASTQ'), action = 'store_true', default = False) addFilteringOpts(parser_s) parser_s.add_argument('inputFofn', metavar = 'input.fofn', help = 'input base or CCS fofn file') parser_s.add_argument('barcodeFofn', metavar = 'barcode.fofn', help = 'input barcode.h5 fofn file') desc = ['Compute consensus sequences for each barcode.'] parser_s = subparsers.add_parser('consensus', description = "\n".join(desc), help = "Compute a consensus sequence for each barcode." 
+ \ "This command relies on the presence of pbdagcon", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_s.add_argument('--subsample', default = 1, type = float, help = "Subsample ZMWs") parser_s.add_argument('--nZmws', default = -1, type = int, help = "Take n ZMWs") parser_s.add_argument('--outDir', default = '.', type = str, help = "Use this directory to output results") parser_s.add_argument('--keepTmpDir', action = 'store_true', default = False) parser_s.add_argument('--ccsFofn', default = '', type = str, help = 'Obtain CCS data from ccsFofn instead of input.fofn') parser_s.add_argument('--nProcs', default = 16, type = int, help = 'Use nProcs to execute.') parser_s.add_argument('--noQuiver', action = 'store_true', default = False) addFilteringOpts(parser_s) parser_s.add_argument('inputFofn', metavar = 'input.fofn', help = 'input bas.h5 fofn file') parser_s.add_argument('barcodeFofn', metavar = 'barcode.fofn', help = 'input bc.h5 fofn file') parser_s.add_argument('--barcode', default = None, type = str, nargs = "+", help = "Use this to extract consensus for just one barcode.") def getVersion(self): return __version__ def run(self): logging.debug("Arguments" + str(self.args)) if self.args.subCommand == 'labelZmws': makeBarcodeFofnFromBasFofn() elif self.args.subCommand == 'labelAlignments': labelAlignments() elif self.args.subCommand == 'emitFastqs': emitFastqs() elif self.args.subCommand == 'consensus': callConsensus() else: sys.exit(1) runner = Pbbarcode() def main(): """The entry point for pbbarcode""" sys.exit(runner.start()) #if __name__ == '__main__': # runner = Pbbarcode() # sys.exit(runner.start()) pbbarcode-master/tests/000077500000000000000000000000001231421752300154325ustar00rootroot00000000000000pbbarcode-master/tests/cram/000077500000000000000000000000001231421752300163545ustar00rootroot00000000000000pbbarcode-master/tests/cram/consensus.t.disabled000066400000000000000000000077221231421752300223370ustar00rootroot00000000000000 $ export 
INH5=`python -c "from pbcore import data ; print data.getCmpH5()"` $ export INBH51=`python -c "from pbcore import data ; print data.geBasH5s[0]"` $ export INBH52=`python -c "from pbcore import data ; print data.getBasH5s[1]"` $ export BARCODE_FASTA=$TESTDIR/../../etc/barcode.fasta $ echo $INBH51 > bas.fofn $ echo $INBH52 >> bas.fofn $ pbbarcode labelZmws $BARCODE_FASTA bas.fofn $ pbbarcode consensus bas.fofn barcode.fofn [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46[INFO] [blasr] started. 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. 
[INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] [INFO] 2013-08-02T00:28:462013-08-02T00:28:46 [blasr] started. [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] ended. [INFO] 2013-08-02T00:28:46 [blasr] started. [INFO] 2013-08-02T00:28:46 [blasr] ended. 
pbbarcode-master/tests/cram/sanity.t000066400000000000000000000172561231421752300200630ustar00rootroot00000000000000 $ export INH5=`python -c "from pbcore import data ; print data.getCmpH5()"` $ export INBH51=`python -c "from pbcore import data ; print data.getBasH5s()[0]"` $ export INBH52=`python -c "from pbcore import data ; print data.getBasH5s()[1]"` $ export BARCODE_FASTA=$TESTDIR/../../etc/barcode.fasta $ echo $INBH51 > bas.fofn $ echo $INBH52 >> bas.fofn $ pbbarcode labelZmws $BARCODE_FASTA bas.fofn $ pbbarcode labelZmws --scoreMode paired $BARCODE_FASTA bas.fofn $ pbbarcode labelZmws --scoreMode paired --scoreFirst $BARCODE_FASTA bas.fofn $ pbbarcode labelZmws --scoreMode paired --scoreFirst --adapterSidePad 0 --insertSidePad 0 $BARCODE_FASTA bas.fofn $ pbbarcode emitFastqs --fasta bas.fofn barcode.fofn $ pbbarcode emitFastqs --trim 20 bas.fofn barcode.fofn $ pbbarcode emitFastqs --subreads --trim 20 bas.fofn barcode.fofn $ cp $INH5 ./aligned_reads.cmp.h5 $ chmod 766 ./aligned_reads.cmp.h5 $ pbbarcode labelAlignments barcode.fofn aligned_reads.cmp.h5 Check that same holes get the same barcode (consistent scoring) $ cmph5tools.py stats --what "(Movie,HoleNumber,Barcode,AverageBarcodeScore)" aligned_reads.cmp.h5 | uniq Movie Barcode AverageBarcodeScore HoleNumber m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 13.00 3008 m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 12.50 2001 m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 12.00 4009 m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc3--bc4 12.57 2008 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 14.33 3006 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 12.00 1000 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.00 4004 m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 14.50 1006 
m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 12.00 4006 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 13.33 2006 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.67 3002 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 13.33 2006 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.67 1009 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.67 3002 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.67 1009 m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 13.33 1000 m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 12.33 1007 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.50 9 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 13.00 1004 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.00 2002 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 12.80 2004 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 12.00 4007 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 12.80 2004 m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc3--bc4 12.00 3008 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 14.33 2009 m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 14.50 2007 m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc3--bc4 12.57 2008 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 16.00 1002 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 13.33 1008 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.50 9 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 14.00 2000 m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc3--bc4 11.67 9 
m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 14.00 2000 m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc3--bc4 11.67 9 m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 14.33 8 m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc3--bc4 14.33 2003 pbbarcode-master/tests/test_basic.py000077500000000000000000000005121231421752300201250ustar00rootroot00000000000000import logging import unittest # this is purely for the coverage to not fail when it's generated import pbbarcode log = logging.getLogger(__name__) class TestBasic(unittest.TestCase): def test_01(self): """Place holder so jenkins will generate a coverage report""" self.assertTrue(True)