bx-python-0.8.13/.github/workflows/deploy.yaml
name: Deploy
on: [push, pull_request]
concurrency:
  group: deploy-${{ github.ref }}
  cancel-in-progress: true
jobs:
  build_wheels:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest]
        arch: [auto]
        include:
          - os: ubuntu-latest
            arch: aarch64
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
      - name: Set up QEMU to build non-native architectures
        if: ${{ matrix.arch == 'aarch64' }}
        uses: docker/setup-qemu-action@v1
      - name: Install required Python packages
        run: |
          python -m pip install --upgrade pip setuptools wheel
          python -m pip install 'cibuildwheel>=2.2.0' twine
      - name: Build wheels
        run: python -m cibuildwheel --output-dir dist
        env:
          CIBW_ARCHS: ${{ matrix.arch }}
          # Skip building musllinux wheels for aarch64: each one currently
          # takes more than 2 hours to build. Also skip the PyPy 3.7 wheel
          # for macOS, because numpy doesn't provide a wheel on PyPI and it
          # fails to install.
          CIBW_SKIP: '*-musllinux_aarch64 pp37-macosx_x86_64'
      - name: Check packages
        run: twine check dist/*
      - uses: actions/upload-artifact@v2
        with:
          name: packages
          path: dist/
  build_sdist:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
      - name: Install required Python packages
        run: |
          python -m pip install --upgrade pip setuptools wheel
          python -m pip install build twine
      - name: Build sdist
        run: |
          python -m build --sdist
          python -m venv test_venv
          . test_venv/bin/activate
          python -m pip install dist/*.tar.gz
          # Test with the same command specified for cibuildwheel in pyproject.toml
          python -c 'import bx, bx.align, bx.align.sitemask, bx.align.tools, bx.arrays, bx.bbi, bx.cookbook, bx.intervals, bx.intervals.operations, bx.intseq, bx.misc, bx.motif, bx.motif.io, bx.motif.logo, bx.phylo, bx.pwm, bx.seq, bx.tabular, bx_extras'
      - name: Check packages
        run: twine check dist/*
      - uses: actions/upload-artifact@v2
        with:
          name: packages
          path: dist/
  upload_pypi:
    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') && github.repository_owner == 'bxlab'
    needs: [build_wheels, build_sdist]
    runs-on: ubuntu-latest
    steps:
      - uses: actions/download-artifact@v2
        with:
          name: packages
          path: dist
      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          user: __token__
          password: ${{ secrets.PYPI_API_TOKEN }}
bx-python-0.8.13/.github/workflows/test.yaml
name: Lint and test
on: [push, pull_request]
concurrency:
  group: test-${{ github.ref }}
  cancel-in-progress: true
jobs:
  lint:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.7', '3.10']
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install flake8
        run: pip install flake8 flake8-import-order
      - name: Lint
        run: flake8 .
  test:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: ['3.7', '3.8', '3.9', '3.10']
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install liblzo2-dev
        run: sudo apt-get update && sudo apt-get -y install liblzo2-dev
      - name: Install tox
        run: pip install tox
      - name: Test
        run: tox
bx-python-0.8.13/.gitignore
# Build directory
build
# Python bytecode
*.pyc
# Object files
*.so
*.pyd
# Source files generated by Cython
*.c
*.h
# egg-info for inplace builds
bx_python.egg-info
# IDE project files
*.kpf
# windows shortcuts
*.lnk
# nose egg
nose*.egg
# .eggs directory
.eggs
# Virtualenv
.venv
# Built sdist directory
dist
bx-python-0.8.13/LICENSE
Copyright (c) 2005-2015 The Pennsylvania State University
Copyright (c) 2013-2020 The Johns Hopkins University

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
bx-python-0.8.13/MANIFEST.in
include LICENSE
recursive-include src *.h
recursive-include src *.c
recursive-include lib *.h
recursive-include lib *.c
recursive-include lib *.pyx
bx-python-0.8.13/README.md
[Build status](https://travis-ci.org/bxlab/bx-python)
[Documentation](https://bx-python.readthedocs.io/)

# bx-python

The bx-python project is a Python library and associated set of scripts for rapid implementation of genome-scale analyses. The library contains a variety of useful modules, but its particular strengths are:

* Classes for reading and working with genome-scale multiple local alignments (in MAF, AXT, and LAV formats)
* A generic data structure for indexing on-disk files that contain blocks of data associated with intervals on various sequences (used, for example, to provide random access to individual alignments in huge files; optimized for use over network filesystems)
* Data structures for working with intervals on sequences
* "Binned bitsets", which act just like chromosome-sized bit arrays but lazily allocate regions and store large blocks of all-set or all-unset bits compactly
* An "Intersecter" for performing fast intersection tests that preserve both query and target intervals and associated annotation (see the sketch below)
## Requirements

Building currently requires liblzo2 (e.g. `sudo apt-get install liblzo2-dev` on Debian/Ubuntu).

## Installing

The package can be installed with pip:

```pip install bx-python```

It is also available in [bioconda](https://anaconda.org/bioconda/bx-python) (recommended):

```conda install -c conda-forge -c bioconda bx-python```

It is packaged in [Debian](https://tracker.debian.org/pkg/python-bx) and [Ubuntu](https://packages.ubuntu.com/python3-bx):

```sudo apt install python3-bx```

Or it can be built from a checkout of the repository:

```python setup.py install```
bx-python-0.8.13/doc/Makefile
# Makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS  =
SPHINXBUILD = sphinx-build
PAPER       =

# Internal variables.
PAPEROPT_a4     = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS   = -d docbuild/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source

.PHONY: help clean html apidoc web pickle json htmlhelp latex changes linkcheck

help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html      to make standalone HTML files"
	@echo "  apidoc    to run epydoc"
	@echo "  pickle    to make pickle files"
	@echo "  json      to make JSON files"
	@echo "  htmlhelp  to make HTML files and an HTML help project"
	@echo "  latex     to make LaTeX files; you can set PAPER=a4 or PAPER=letter"
	@echo "  changes   to make an overview of all changed/added/deprecated items"
	@echo "  linkcheck to check all external links for integrity"

clean:
	-rm -rf docbuild/*

html:
	mkdir -p docbuild/html docbuild/doctrees
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) docbuild/html
	@echo
	@echo "Build finished. The HTML pages are in docbuild/html."

apidoc:
	mkdir -p docbuild/html/apidoc
	epydoc-2.6 --docformat restructuredtext ../lib/bx -o docbuild/html/apidoc
	@echo
	@echo "Epydoc finished. The pages are in docbuild/html/apidoc."

pickle:
	mkdir -p docbuild/pickle docbuild/doctrees
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) docbuild/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

web: pickle

json:
	mkdir -p docbuild/json docbuild/doctrees
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) docbuild/json
	@echo
	@echo "Build finished; now you can process the JSON files."

htmlhelp:
	mkdir -p docbuild/htmlhelp docbuild/doctrees
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) docbuild/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in docbuild/htmlhelp."

latex:
	mkdir -p docbuild/latex docbuild/doctrees
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) docbuild/latex
	@echo
	@echo "Build finished; the LaTeX files are in docbuild/latex."
	@echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
	      "run these through (pdf)latex."

changes:
	mkdir -p docbuild/changes docbuild/doctrees
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) docbuild/changes
	@echo
	@echo "The overview file is in docbuild/changes."

linkcheck:
	mkdir -p docbuild/linkcheck docbuild/doctrees
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) docbuild/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output" \
	      "or in docbuild/linkcheck/output.txt."
bx-python-0.8.13/doc/requirements.txt
https://bitbucket.org/james_taylor/python-lzo-static/get/63987d89fd1b.zip
numpy
bx-python-0.8.13/doc/source/conf.py
#
# BxPython documentation build configuration file, created by
# sphinx-quickstart on Fri May 08 10:18:22 2009.
#
# This file is execfile()d with the current directory set to its containing dir.
#
# The contents of this file are pickled, so don't put values in the namespace
# that aren't pickleable (module imports are okay, they're removed automatically).
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
# If your extensions are in another directory, add it here. If the directory
# is relative to the documentation root, use os.path.abspath to make it
# absolute, like shown here.
import bx
# General configuration
# ---------------------
# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.intersphinx']
# Add any paths that contain templates here, relative to this directory.
templates_path = ['templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = 'bx-python'
copyright = '2017, James Taylor'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = bx.__version__
# The full version, including alpha/beta/rc tags.
release = version
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of documents that shouldn't be included in the build.
#unused_docs = []
# List of directories, relative to source directory, that shouldn't be searched
# for source files.
exclude_trees = []
# The reST default role (used for this markup: `text`) to use for all documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# Options for HTML output
# -----------------------
# The style sheet to use for HTML and HTML Help pages. A file of that name
# must exist either in Sphinx' static/ path, or in one of the custom paths
# given in html_static_path.
html_style = 'base.css'
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['static']
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
html_index = 'index.html'
html_sidebars = {'index': 'indexsidebar.html'}
# Additional templates that should be rendered to pages, maps page names to
# template names.
##html_additional_pages = {
## 'index': 'index.html',
##}
# If false, no module index is generated.
#html_use_modindex = True
# If false, no index is generated.
#html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# If true, the reST sources are included in the HTML build as _sources/.
#html_copy_source = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = ''
# Output file base name for HTML help builder.
htmlhelp_basename = 'bx-doc'
# Options for LaTeX output
# ------------------------
# The paper size ('letter' or 'a4').
#latex_paper_size = 'letter'
# The font size ('10pt', '11pt' or '12pt').
#latex_font_size = '10pt'
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, document class [howto/manual]).
latex_documents = [(
'index', 'bx-python.tex', 'bx-python Documentation',
'James Taylor', 'manual'), ]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# Additional stuff for the LaTeX preamble.
#latex_preamble = ''
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_use_modindex = True
# Example configuration for intersphinx: refer to the Python standard library.
#intersphinx_mapping = {'http://docs.python.org/dev': None}
bx-python-0.8.13/doc/source/contents.rst
bx-python documentation contents
================================

Browse the Python API `class documentation <apidoc/index.html>`_

Contents:

.. toctree::
   :maxdepth: 2

   modules/index.rst

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
bx-python-0.8.13/doc/source/index.rst
About bx-python
===============

The bx-python project is a Python library and associated set of scripts for rapid implementation of genome-scale analyses. The library contains a variety of useful modules, but its particular strengths are:

* Classes for reading and working with genome-scale multiple local alignments (in MAF, AXT, and LAV formats)
* A generic data structure for indexing on-disk files that contain blocks of data associated with intervals on various sequences (used, for example, to provide random access to individual alignments in huge files; optimized for use over network filesystems)
* Data structures for working with intervals on sequences
* "Binned bitsets", which act just like chromosome-sized bit arrays but lazily allocate regions and store large blocks of all-set or all-unset bits compactly (see the sketch below)
* An "Intersecter" for performing fast intersection tests that preserve both query and target intervals and associated annotation
These tools have been used in a variety of published research, and are a fundamental part of the ongoing Galaxy and ESPERR projects.
Contents
========
.. toctree::
   :maxdepth: 5

   Application Documentation
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
bx-python-0.8.13/doc/source/lib/bx.align.axt.rst
bx.align.axt module
===================

.. automodule:: bx.align.axt
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.core.rst
bx.align.core module
====================

.. automodule:: bx.align.core
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.epo.rst
bx.align.epo module
===================

.. automodule:: bx.align.epo
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.epo_tests.rst
bx.align.epo_tests module
=========================

.. automodule:: bx.align.epo_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.lav.rst
bx.align.lav module
===================

.. automodule:: bx.align.lav
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.lav_tests.rst
bx.align.lav_tests module
=========================

.. automodule:: bx.align.lav_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.maf.rst
bx.align.maf module
===================

.. automodule:: bx.align.maf
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.maf_tests.rst
bx.align.maf_tests module
=========================

.. automodule:: bx.align.maf_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.rst
bx.align package
================

Subpackages
-----------

.. toctree::

   bx.align.sitemask
   bx.align.tools

Submodules
----------

.. toctree::

   bx.align.axt
   bx.align.core
   bx.align.epo
   bx.align.epo_tests
   bx.align.lav
   bx.align.lav_tests
   bx.align.maf
   bx.align.maf_tests
   bx.align.score
   bx.align.score_tests

Module contents
---------------

.. automodule:: bx.align
   :members:
   :undoc-members:
   :show-inheritance:
bx-python-0.8.13/doc/source/lib/bx.align.score.rst
bx.align.score module
=====================

.. automodule:: bx.align.score
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.score_tests.rst
bx.align.score_tests module
===========================

.. automodule:: bx.align.score_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.sitemask.core.rst
bx.align.sitemask.core module
=============================

.. automodule:: bx.align.sitemask.core
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.sitemask.cpg.rst
bx.align.sitemask.cpg module
============================

.. automodule:: bx.align.sitemask.cpg
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.sitemask.quality.rst
bx.align.sitemask.quality module
================================

.. automodule:: bx.align.sitemask.quality
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.sitemask.rst
bx.align.sitemask package
=========================

Submodules
----------

.. toctree::

   bx.align.sitemask.core
   bx.align.sitemask.cpg
   bx.align.sitemask.quality
   bx.align.sitemask.sitemask_tests

Module contents
---------------

.. automodule:: bx.align.sitemask
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.sitemask.sitemask_tests.rst
bx.align.sitemask.sitemask_tests module
=======================================

.. automodule:: bx.align.sitemask.sitemask_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.tools.chop.rst
bx.align.tools.chop module
==========================

.. automodule:: bx.align.tools.chop
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.tools.fuse.rst
bx.align.tools.fuse module
==========================

.. automodule:: bx.align.tools.fuse
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.tools.rst
bx.align.tools package
======================

Submodules
----------

.. toctree::

   bx.align.tools.chop
   bx.align.tools.fuse
   bx.align.tools.thread
   bx.align.tools.tile

Module contents
---------------

.. automodule:: bx.align.tools
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.tools.thread.rst
bx.align.tools.thread module
============================

.. automodule:: bx.align.tools.thread
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.align.tools.tile.rst
bx.align.tools.tile module
==========================

.. automodule:: bx.align.tools.tile
   :members:
   :undoc-members:
   :show-inheritance:
bx-python-0.8.13/doc/source/lib/bx.arrays.array_tree.rst
bx.arrays.array_tree module
===========================

.. automodule:: bx.arrays.array_tree
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.arrays.array_tree_tests.rst
bx.arrays.array_tree_tests module
=================================

.. automodule:: bx.arrays.array_tree_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.arrays.bed.rst
bx.arrays.bed module
====================

.. automodule:: bx.arrays.bed
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.arrays.rst
bx.arrays package
=================

Submodules
----------

.. toctree::

   bx.arrays.array_tree
   bx.arrays.array_tree_tests
   bx.arrays.bed
   bx.arrays.wiggle

Module contents
---------------

.. automodule:: bx.arrays
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.arrays.wiggle.rst
bx.arrays.wiggle module
=======================

.. automodule:: bx.arrays.wiggle
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.bbi.bbi_file.rst
bx.bbi.bbi_file module
======================

.. automodule:: bx.bbi.bbi_file
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.bbi.bigbed_file.rst
bx.bbi.bigbed_file module
=========================

.. automodule:: bx.bbi.bigbed_file
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.bbi.bigwig_file.rst
bx.bbi.bigwig_file module
=========================

.. automodule:: bx.bbi.bigwig_file
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.bbi.bigwig_tests.rst
bx.bbi.bigwig_tests module
==========================

.. automodule:: bx.bbi.bigwig_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.bbi.bpt_file.rst
bx.bbi.bpt_file module
======================

.. automodule:: bx.bbi.bpt_file
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.bbi.cirtree_file.rst
bx.bbi.cirtree_file module
==========================

.. automodule:: bx.bbi.cirtree_file
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.bbi.rst
bx.bbi package
==============

Submodules
----------

.. toctree::

   bx.bbi.bbi_file
   bx.bbi.bigbed_file
   bx.bbi.bigwig_file
   bx.bbi.bigwig_tests
   bx.bbi.bpt_file
   bx.bbi.cirtree_file

Module contents
---------------

.. automodule:: bx.bbi
   :members:
   :undoc-members:
   :show-inheritance:
bx-python-0.8.13/doc/source/lib/bx.binned_array.rst
bx.binned_array module
======================

.. automodule:: bx.binned_array
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.binned_array_tests.rst
bx.binned_array_tests module
============================

.. automodule:: bx.binned_array_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.bitset.rst
bx.bitset module
================

.. automodule:: bx.bitset
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.bitset_builders.rst
bx.bitset_builders module
=========================

.. automodule:: bx.bitset_builders
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.bitset_tests.rst
bx.bitset_tests module
======================

.. automodule:: bx.bitset_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.bitset_utils.rst
bx.bitset_utils module
======================

.. automodule:: bx.bitset_utils
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.cookbook.argparse.rst
bx.cookbook.argparse module
===========================

.. automodule:: bx.cookbook.argparse
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.cookbook.attribute.rst
bx.cookbook.attribute module
============================

.. automodule:: bx.cookbook.attribute
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.cookbook.doc_optparse.rst
bx.cookbook.doc_optparse module
===============================

.. automodule:: bx.cookbook.doc_optparse
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.cookbook.progress_bar.rst
bx.cookbook.progress_bar module
===============================

.. automodule:: bx.cookbook.progress_bar
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.cookbook.rst
bx.cookbook package
===================

Submodules
----------

.. toctree::

   bx.cookbook.argparse
   bx.cookbook.attribute
   bx.cookbook.doc_optparse
   bx.cookbook.progress_bar

Module contents
---------------

.. automodule:: bx.cookbook
   :members:
   :undoc-members:
   :show-inheritance:
bx-python-0.8.13/doc/source/lib/bx.filter.rst
bx.filter module
================

.. automodule:: bx.filter
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.gene_reader.rst
bx.gene_reader module
=====================

.. automodule:: bx.gene_reader
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.interval_index_file.rst
bx.interval_index_file module
=============================

.. automodule:: bx.interval_index_file
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.interval_index_file_tests.rst
bx.interval_index_file_tests module
===================================

.. automodule:: bx.interval_index_file_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intervals.cluster.rst
bx.intervals.cluster module
===========================

.. automodule:: bx.intervals.cluster
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intervals.cluster_tests.rst
bx.intervals.cluster_tests module
=================================

.. automodule:: bx.intervals.cluster_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intervals.intersection.rst
bx.intervals.intersection module
================================

.. automodule:: bx.intervals.intersection
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intervals.intersection_tests.rst
bx.intervals.intersection_tests module
======================================

.. automodule:: bx.intervals.intersection_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intervals.io.rst
bx.intervals.io module
======================

.. automodule:: bx.intervals.io
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intervals.operations.base_coverage.rst
bx.intervals.operations.base_coverage module
============================================

.. automodule:: bx.intervals.operations.base_coverage
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intervals.operations.complement.rst
bx.intervals.operations.complement module
=========================================

.. automodule:: bx.intervals.operations.complement
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intervals.operations.concat.rst
bx.intervals.operations.concat module
=====================================

.. automodule:: bx.intervals.operations.concat
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intervals.operations.coverage.rst
bx.intervals.operations.coverage module
=======================================

.. automodule:: bx.intervals.operations.coverage
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intervals.operations.find_clusters.rst
bx.intervals.operations.find_clusters module
============================================

.. automodule:: bx.intervals.operations.find_clusters
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intervals.operations.intersect.rst
bx.intervals.operations.intersect module
========================================

.. automodule:: bx.intervals.operations.intersect
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intervals.operations.join.rst
bx.intervals.operations.join module
===================================

.. automodule:: bx.intervals.operations.join
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intervals.operations.merge.rst
bx.intervals.operations.merge module
====================================

.. automodule:: bx.intervals.operations.merge
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intervals.operations.quicksect.rst
bx.intervals.operations.quicksect module
========================================

.. automodule:: bx.intervals.operations.quicksect
   :members:
   :undoc-members:
   :show-inheritance:
bx-python-0.8.13/doc/source/lib/bx.intervals.operations.rst
bx.intervals.operations package
===============================

Submodules
----------

.. toctree::

   bx.intervals.operations.base_coverage
   bx.intervals.operations.complement
   bx.intervals.operations.concat
   bx.intervals.operations.coverage
   bx.intervals.operations.find_clusters
   bx.intervals.operations.intersect
   bx.intervals.operations.join
   bx.intervals.operations.merge
   bx.intervals.operations.quicksect
   bx.intervals.operations.subtract

Module contents
---------------

.. automodule:: bx.intervals.operations
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intervals.operations.subtract.rst
bx.intervals.operations.subtract module
=======================================

.. automodule:: bx.intervals.operations.subtract
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intervals.random_intervals.rst
bx.intervals.random_intervals module
====================================

.. automodule:: bx.intervals.random_intervals
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intervals.rst
bx.intervals package
====================

Subpackages
-----------

.. toctree::

   bx.intervals.operations

Submodules
----------

.. toctree::

   bx.intervals.cluster
   bx.intervals.cluster_tests
   bx.intervals.intersection
   bx.intervals.intersection_tests
   bx.intervals.io
   bx.intervals.random_intervals

Module contents
---------------

.. automodule:: bx.intervals
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intseq.ngramcount.rst
bx.intseq.ngramcount module
===========================

.. automodule:: bx.intseq.ngramcount
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.intseq.rst
bx.intseq package
=================

Submodules
----------

.. toctree::

   bx.intseq.ngramcount

Module contents
---------------

.. automodule:: bx.intseq
   :members:
   :undoc-members:
   :show-inheritance:
bx-python-0.8.13/doc/source/lib/bx.misc.bgzf.rst
bx.misc.bgzf module
===================

.. automodule:: bx.misc.bgzf
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.misc.bgzf_tests.rst
bx.misc.bgzf_tests module
=========================

.. automodule:: bx.misc.bgzf_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.misc.binary_file.rst
bx.misc.binary_file module
==========================

.. automodule:: bx.misc.binary_file
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.misc.cdb.rst
bx.misc.cdb module
==================

.. automodule:: bx.misc.cdb
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.misc.cdb_tests.rst
bx.misc.cdb_tests module
========================

.. automodule:: bx.misc.cdb_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.misc.filecache.rst
bx.misc.filecache module
========================

.. automodule:: bx.misc.filecache
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.misc.filecache_tests.rst
bx.misc.filecache_tests module
==============================

.. automodule:: bx.misc.filecache_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.misc.readlengths.rst
bx.misc.readlengths module
==========================

.. automodule:: bx.misc.readlengths
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.misc.rst
bx.misc package
===============

Submodules
----------

.. toctree::

   bx.misc.bgzf
   bx.misc.bgzf_tests
   bx.misc.binary_file
   bx.misc.cdb
   bx.misc.cdb_tests
   bx.misc.filecache
   bx.misc.filecache_tests
   bx.misc.readlengths
   bx.misc.seekbzip2
   bx.misc.seekbzip2_tests
   bx.misc.seeklzop
   bx.misc.seeklzop_tests

Module contents
---------------

.. automodule:: bx.misc
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.misc.seekbzip2.rst
bx.misc.seekbzip2 module
========================

.. automodule:: bx.misc.seekbzip2
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.misc.seekbzip2_tests.rst
bx.misc.seekbzip2_tests module
==============================

.. automodule:: bx.misc.seekbzip2_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.misc.seeklzop.rst
bx.misc.seeklzop module
=======================

.. automodule:: bx.misc.seeklzop
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.misc.seeklzop_tests.rst
bx.misc.seeklzop_tests module
=============================

.. automodule:: bx.misc.seeklzop_tests
   :members:
   :undoc-members:
   :show-inheritance:
bx-python-0.8.13/doc/source/lib/bx.motif.io.rst
bx.motif.io package
===================

Submodules
----------

.. toctree::

   bx.motif.io.transfac
   bx.motif.io.transfac_tests

Module contents
---------------

.. automodule:: bx.motif.io
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.motif.io.transfac.rst
bx.motif.io.transfac module
===========================

.. automodule:: bx.motif.io.transfac
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.motif.io.transfac_tests.rst
bx.motif.io.transfac_tests module
=================================

.. automodule:: bx.motif.io.transfac_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.motif.logo.rst
bx.motif.logo package
=====================

Module contents
---------------

.. automodule:: bx.motif.logo
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.motif.pwm.rst
bx.motif.pwm module
===================

.. automodule:: bx.motif.pwm
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.motif.pwm_tests.rst
bx.motif.pwm_tests module
=========================

.. automodule:: bx.motif.pwm_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.motif.rst
bx.motif package
================

Subpackages
-----------

.. toctree::

   bx.motif.io
   bx.motif.logo

Submodules
----------

.. toctree::

   bx.motif.pwm
   bx.motif.pwm_tests

Module contents
---------------

.. automodule:: bx.motif
   :members:
   :undoc-members:
   :show-inheritance:
bx-python-0.8.13/doc/source/lib/bx.phylo.newick.rst
bx.phylo.newick module
======================

.. automodule:: bx.phylo.newick
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.phylo.newick_tests.rst
bx.phylo.newick_tests module
============================

.. automodule:: bx.phylo.newick_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.phylo.phast.rst
bx.phylo.phast module
=====================

.. automodule:: bx.phylo.phast
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.phylo.phast_tests.rst
bx.phylo.phast_tests module
===========================

.. automodule:: bx.phylo.phast_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.phylo.rst
bx.phylo package
================

Submodules
----------

.. toctree::

   bx.phylo.newick
   bx.phylo.newick_tests
   bx.phylo.phast
   bx.phylo.phast_tests

Module contents
---------------

.. automodule:: bx.phylo
   :members:
   :undoc-members:
   :show-inheritance:
bx-python-0.8.13/doc/source/lib/bx.pwm.bed_score_aligned_pwm.rst
bx.pwm.bed_score_aligned_pwm module
===================================

.. automodule:: bx.pwm.bed_score_aligned_pwm
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.pwm.bed_score_aligned_string.rst
bx.pwm.bed_score_aligned_string module
======================================

.. automodule:: bx.pwm.bed_score_aligned_string
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.pwm.maf_select_motifs.rst
bx.pwm.maf_select_motifs module
===============================

.. automodule:: bx.pwm.maf_select_motifs
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.pwm.position_weight_matrix.rst
bx.pwm.position_weight_matrix module
====================================

.. automodule:: bx.pwm.position_weight_matrix
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.pwm.pwm_score_maf.rst
bx.pwm.pwm_score_maf module
===========================

.. automodule:: bx.pwm.pwm_score_maf
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.pwm.pwm_score_motifs.rst
bx.pwm.pwm_score_motifs module
==============================

.. automodule:: bx.pwm.pwm_score_motifs
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.pwm.pwm_score_positions.rst
bx.pwm.pwm_score_positions module
=================================

.. automodule:: bx.pwm.pwm_score_positions
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.pwm.pwm_tests.rst
bx.pwm.pwm_tests module
=======================

.. automodule:: bx.pwm.pwm_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.pwm.rst
bx.pwm package
==============

Submodules
----------

.. toctree::

   bx.pwm.bed_score_aligned_pwm
   bx.pwm.bed_score_aligned_string
   bx.pwm.maf_select_motifs
   bx.pwm.position_weight_matrix
   bx.pwm.pwm_score_maf
   bx.pwm.pwm_score_motifs
   bx.pwm.pwm_score_positions
   bx.pwm.pwm_tests

Module contents
---------------

.. automodule:: bx.pwm
   :members:
   :undoc-members:
   :show-inheritance:
bx-python-0.8.13/doc/source/lib/bx.rst
bx package
==========

Subpackages
-----------

.. toctree::

   bx.align
   bx.arrays
   bx.bbi
   bx.cookbook
   bx.intervals
   bx.intseq
   bx.misc
   bx.motif
   bx.phylo
   bx.pwm
   bx.seq
   bx.tabular

Submodules
----------

.. toctree::

   bx.binned_array
   bx.binned_array_tests
   bx.bitset
   bx.bitset_builders
   bx.bitset_tests
   bx.bitset_utils
   bx.filter
   bx.gene_reader
   bx.interval_index_file
   bx.interval_index_file_tests
   bx.seqmapping
   bx.seqmapping_tests
   bx.wiggle
   bx.wiggle_tests

Module contents
---------------

.. automodule:: bx
   :members:
   :undoc-members:
   :show-inheritance:
bx-python-0.8.13/doc/source/lib/bx.seq.core.rst
bx.seq.core module
==================

.. automodule:: bx.seq.core
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.seq.fasta.rst
bx.seq.fasta module
===================

.. automodule:: bx.seq.fasta
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.seq.fasta_tests.rst
bx.seq.fasta_tests module
=========================

.. automodule:: bx.seq.fasta_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.seq.nib.rst
bx.seq.nib module
=================

.. automodule:: bx.seq.nib
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.seq.nib_tests.rst
bx.seq.nib_tests module
=======================

.. automodule:: bx.seq.nib_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.seq.qdna.rst
bx.seq.qdna module
==================

.. automodule:: bx.seq.qdna
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.seq.qdna_tests.rst
bx.seq.qdna_tests module
========================

.. automodule:: bx.seq.qdna_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.seq.rst
bx.seq package
==============

Submodules
----------

.. toctree::

   bx.seq.core
   bx.seq.fasta
   bx.seq.fasta_tests
   bx.seq.nib
   bx.seq.nib_tests
   bx.seq.qdna
   bx.seq.qdna_tests
   bx.seq.seq
   bx.seq.seq_tests
   bx.seq.twobit
   bx.seq.twobit_tests

Module contents
---------------

.. automodule:: bx.seq
   :members:
   :undoc-members:
   :show-inheritance:
bx-python-0.8.13/doc/source/lib/bx.seq.seq.rst
bx.seq.seq module
=================

.. automodule:: bx.seq.seq
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.seq.seq_tests.rst
bx.seq.seq_tests module
=======================

.. automodule:: bx.seq.seq_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.seq.twobit.rst
bx.seq.twobit module
====================

.. automodule:: bx.seq.twobit
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.seq.twobit_tests.rst
bx.seq.twobit_tests module
==========================

.. automodule:: bx.seq.twobit_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.seqmapping.rst
bx.seqmapping module
====================

.. automodule:: bx.seqmapping
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.seqmapping_tests.rst
bx.seqmapping_tests module
==========================

.. automodule:: bx.seqmapping_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.tabular.io.rst
bx.tabular.io module
====================

.. automodule:: bx.tabular.io
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.tabular.rst
bx.tabular package
==================

Submodules
----------

.. toctree::

   bx.tabular.io

Module contents
---------------

.. automodule:: bx.tabular
   :members:
   :undoc-members:
   :show-inheritance:
bx-python-0.8.13/doc/source/lib/bx.wiggle.rst
bx.wiggle module
================

.. automodule:: bx.wiggle
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx.wiggle_tests.rst
bx.wiggle_tests module
======================

.. automodule:: bx.wiggle_tests
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx_extras.fpconst.rst
bx_extras.fpconst module
========================

.. automodule:: bx_extras.fpconst
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx_extras.lrucache.rst
bx_extras.lrucache module
=========================

.. automodule:: bx_extras.lrucache
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx_extras.pstat.rst
bx_extras.pstat module
======================

.. automodule:: bx_extras.pstat
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx_extras.pyparsing.rst
bx_extras.pyparsing module
==========================

.. automodule:: bx_extras.pyparsing
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx_extras.rst
bx_extras package
=================

Submodules
----------

.. toctree::

   bx_extras.fpconst
   bx_extras.lrucache
   bx_extras.pstat
   bx_extras.pyparsing
   bx_extras.stats

Module contents
---------------

.. automodule:: bx_extras
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/bx_extras.stats.rst
bx_extras.stats module
======================

.. automodule:: bx_extras.stats
   :members:
   :undoc-members:
   :show-inheritance:

bx-python-0.8.13/doc/source/lib/modules.rst
lib
===

.. toctree::
   :maxdepth: 4

   bx
   bx_extras
   psyco_full

bx-python-0.8.13/doc/source/lib/psyco_full.rst
psyco_full module
=================

.. automodule:: psyco_full
   :members:
   :undoc-members:
   :show-inheritance:
bx-python-0.8.13/doc/source/static/ 0000775 0000000 0000000 00000000000 14156664651 0017105 5 ustar 00root root 0000000 0000000 bx-python-0.8.13/doc/source/static/base.css 0000664 0000000 0000000 00000004143 14156664651 0020533 0 ustar 00root root 0000000 0000000 @import url(tripoli.base.css);
html {
font-family: 'Verdana', sans-serif;
color: #333333;
}
body {
padding: 3em 3em;
}
h1.pageheader {
font-variant: small-caps;
margin-top: 0;
border-top: solid 1px;
padding-top: 2px;
border-bottom: solid 1px;
border-color: #CCCCCC;
margin-bottom: 1em;
}
h1.pageheader a {
color: inherit;
text-decoration: inherit;
border: none;
}
.content h1, .content h2, .content h3, .content h4, .content h5, .content h6 {
font-family: 'Hoefler Text', 'Georgia', serif;
font-weight: normal;
color: #666666;
/* border-bottom: solid #666666 1px; */
}
.content h1.pagetitle {
color: #c33;
}
#main {
}
.colpad {
padding: 0 2em;
}
#main > .inner {
min-width: 70em;
max-width: 90em;
margin: auto;
height: 100%;
}
#left {
background: white;
margin-right: 36%; /* 31em; */
padding-right: 3%;
height: 100%;
}
#right {
float: right;
width: 33%; /* 28em; */
padding-left: 3%;
border-left: solid #CCCCCC 1px;
}
.sidebar {
font-size: 1em;
}
.sidebar ul {
margin-left: 0;
}
.sidebar ul li {
list-style-type: none;
margin-bottom: 0.6em;
}
.sidebar ul.pages {
margin-left: 5px;
margin-top: 0.6em;
}
.sidebar ul.pages li {
background: url(hbullet.png) 0 0.4em no-repeat;
padding-left: 25px;
list-style-type: none;
}
.sidebar ul.pages li {
}
.sidebar h1 {
clear: both;
}
.sidebar .publications .info {
color: #666666;
}
.postinfo {
color: #666666;
font-size: 92%;
margin-top: -1em;
}
.postreadlink {
margin-top: -1em;
}
.sidebar .posts .info {
color: #666666;
}
.comments_title {
margin-top: 2em;
}
label {
display: block;
}
#footer {
clear: both;
}
a, a:link, a:visited {
text-decoration: none;
border-bottom: dotted #666666 1px;
color: black;
}
a:hover {
color: #CC3333;
}
li {
list-style: square;
}
table.layout td {
vertical-align: top;
padding-left: 2em;
padding-right: 2em;
border-left: solid #999999 1px
}
hr {
border: none;
height: 1px;
background: #999999;
}
bx-python-0.8.13/doc/source/static/tripoli.base.css 0000664 0000000 0000000 00000012655 14156664651 0022223 0 ustar 00root root 0000000 0000000 /*
* Tripoli is a generic CSS standard for HTML rendering.
* Copyright (C) 2007 David Hellsing
*
* http://devkick.com/lab/tripoli/
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
**/
/*
_______________________________
RESET */
*
{
text-decoration:none;
font-size:1em;
outline:none;
margin:0;
padding:0;
}
code,kbd,samp,pre,tt,var,textarea,input,select,isindex,listing,xmp,plaintext
{
font:inherit;
white-space:normal;
}
a,img,a img,iframe,form,abbr,acronym,object,applet,table,a abbr,a acronym
{
border-width:0;
}
dfn,i,cite,var,address,em
{
font-style:normal;
}
th,b,strong,h1,h2,h3,h4,h5,h6,dt
{
font-weight:normal;
}
caption,th,td
{
text-align:left;
}
html
{
background:white;
color:black;
line-height:1;
font-family:arial, sans-serif;
}
/* \*/
html
{
font-family:sans-serif;
}
/* */
q
{
quotes:"\201C""\201D""\2018""\2019";
}
ul,ol,dir,menu
{
list-style:none;
}
sub,sup
{
vertical-align:baseline;
}
a
{
color:inherit;
}
/*
_______________________________
DISABLE DEPRECATED HTML */
font,basefont
{
color:inherit;
font:inherit;
font-size:100%;
}
center,*[align]
{
text-align:inherit;
}
s,strike,u
{
text-decoration:inherit;
}
img
{
border:none;
margin:0;
}
ol
{
list-style-type:decimal;
}
body
{
background-color:transparent;
}
tr,th,td
{
width:auto;
height:auto;
background-color:transparent;
vertical-align:inherit;
border:none;
}
table[border],.content table[border]
{
border-collapse:separate;
border-spacing:0;
}
nobr
{
white-space:normal;
}
marquee
{
overflow:visible;
-moz-binding:none;
}
blink
{
text-decoration:none;
}
/*
_______________________________
GENERAL */
html
{
font-size:125%;
}
body
{
font-size:50%;
}
a
{
text-decoration:underline;
}
strong,th,thead td,h1,h2,h3,h4,h5,h6,dt
{
font-weight:bold;
}
cite,em,dfn
{
font-style:italic;
}
code,kbd,samp,pre,tt,var,input[type='text'],input[type='password'],textarea
{
font-size:100%;
font-family:mono-space,monospace;
}
pre
{
white-space:pre;
}
pre *
{
font-size:100%;
white-space:pre;
}
del
{
text-decoration:line-through;
}
ins,dfn
{
border-bottom:1px solid black;
}
small,sup,sub
{
font-size:85%;
}
big
{
font-size:125%;
line-height:80%;
}
abbr,acronym
{
text-transform:uppercase;
font-size:85%;
letter-spacing:.1em;
}
abbr[title],acronym[title],dfn[title]
{
cursor:help;
border-bottom:1px dotted black;
}
sup
{
vertical-align:super;
}
sub
{
vertical-align:sub;
}
blockquote
{
padding-left:2.2em;
}
hr
{
display:none; /* We will re-reset it later for content */
}
:lang(af),:lang(nl),:lang(pl)
{
quotes:'\201E' '\201D' '\201A' '\2019';
}
:lang(bg),:lang(cs),:lang(de),:lang(is),:lang(lt),:lang(sk),:lang(sr),:lang(ro)
{
quotes:'\201E' '\201C' '\201A' '\2018';
}
:lang(da),:lang(hr)
{
quotes:'\00BB' '\00AB' '\203A' '\2039';
}
:lang(el),:lang(es),:lang(sq),:lang(tr)
{
quotes:'\00AB' '\00BB' '\2039' '\203A';
}
:lang(en-GB)
{
quotes:'\2018' '\2019' '\201C' '\201D';
}
:lang(fi),:lang(sv)
{
quotes:'\201D' '\201D' '\2019' '\2019';
}
:lang(fr)
{
quotes:'\ab\2005' '\2005\bb' '\2039\2005' '\2005\203a';
}
*[lang|='en'] q:before
{
content:'\201C';
}
*[lang|='en'] q:after
{
content:'\201D';
}
*[lang|='en'] q q:before
{
content:'\2018';
}
*[lang|='en'] q q:after
{
content:'\2019';
}
input,select,button
{
cursor:pointer;
}
input[type='text'],input[type='password']
{
cursor:text;
}
input[type='hidden']
{
display:none;
}
/*
_______________________________
CONTENT */
.content
{
font-size:1.2em;
line-height:1.6em;
}
.content h1
{
font-size:1.6em;
line-height:1;
margin:1em 0 .5em;
}
.content h2
{
font-size:1.5em;
line-height:1;
margin:1.07em 0 .535em;
}
.content h3
{
font-size:1.4em;
line-height:1;
margin:1.14em 0 .57em;
}
.content h4
{
font-size:1.3em;
line-height:1;
margin:1.23em 0 .615em;
}
.content h5
{
font-size:1.2em;
line-height:1;
margin:1.33em 0 .67em;
}
.content h6
{
font-size:1em;
line-height:1;
margin:1.6em 0 .8em;
}
.content hr
{
display:block;
background:black;
color:black;
width:100%;
height:1px;
border:none;
}
.content ul
{
list-style:disc outside;
}
.content ol
{
list-style:decimal outside;
}
.content table
{
border-collapse:collapse;
}
.content hr,.content p,.content ul,.content ol,.content dl,.content pre, .content address,.content table,.content form
{
margin-bottom:1.6em;
}
.content p+p
{
margin-top:-.8em;
}
.content fieldset
{
margin:1.6em 0;
padding:1.6em;
}
/* \*/
.content legend
{
padding-left:.8em;
padding-right:.8em;
}
/* */
@media all and (min-width: 0px) /* for Opera 8 */
{
.content legend
{
margin-bottom:1.6em;
}
.content fieldset
{
margin-top:0;
}
.content[class^='content'] fieldset
{
margin-top:1.6em;
}
}
.content fieldset>*:first-child
{
margin-top:0;
}
.content textarea,.content input[type='text']
{
padding:.1em .2em;
}
.content input
{
padding:.2em .1em;
}
.content select
{
padding:.2em .1em 0;
}
.content select[multiple]
{
margin-bottom:.8em;
}
.content option
{
padding:0 .4em .1em;
}
.content button
{
padding:.3em .5em;
}
.content input[type='radio']
{
position:relative;
bottom:-.2em;
}
.content dt
{
margin-top:.8em;
margin-bottom:.4em;
}
.content ul,.content ol
{
margin-left:2.2em;
}
.content caption,.content form div
{
padding-bottom:.8em;
}
.content ul ul,.content ol ul,.content ul ol,.content ol ol
{
margin-bottom:0;
}
/*
_______________________________
END */
bx-python-0.8.13/doc/source/templates/ 0000775 0000000 0000000 00000000000 14156664651 0017614 5 ustar 00root root 0000000 0000000 bx-python-0.8.13/doc/source/templates/index.html 0000664 0000000 0000000 00000003207 14156664651 0021613 0 ustar 00root root 0000000 0000000 {% extends "layout.html" %}
{% set title = 'bx python' %}
{% block body %}
Welcome
The bx-python project is a python library and associated set of scripts to allow for rapid implementation of genome scale analyses. The library contains a variety of useful modules, but the particular strengths are:
Classes for reading and working with genome-scale multiple local alignments (in MAF, AXT, and LAV formats)
Generic data structure for indexing on disk files that contain blocks of data associated with intervals on various sequences (used, for example, to provide random access to individual alignments in huge files; optimized for use over network filesystems)
Data structures for working with intervals on sequences
"Binned bitsets" which act just like chromosome sized bit arrays, but lazily allocate regions and allow large blocks of all set or all unset bits to be stored compactly
"Intersecter" for performing fast intersection tests that preserve both query and target intervals and associated annotation
These tools have been used in a variety of published research, and are a fundamental part of the ongoing Galaxy and ESPERR projects.
{% endblock %}
{# Sidebar and already handled #}
{% block relbar1 %}{% endblock %}
{% block relbar2 %}{% endblock %}
{% block sidebar1 %}{% endblock %}
{% block sidebar2 %}{% endblock %}
{% block footer %}{% endblock %} bx-python-0.8.13/lib/ 0000775 0000000 0000000 00000000000 14156664651 0014317 5 ustar 00root root 0000000 0000000 bx-python-0.8.13/lib/bx/ 0000775 0000000 0000000 00000000000 14156664651 0014730 5 ustar 00root root 0000000 0000000 bx-python-0.8.13/lib/bx/__init__.py 0000664 0000000 0000000 00000000027 14156664651 0017040 0 ustar 00root root 0000000 0000000 __version__ = '0.8.13'
bx-python-0.8.13/lib/bx/_seqmapping.pyx 0000664 0000000 0000000 00000020065 14156664651 0020000 0 ustar 00root root 0000000 0000000 """
Pyrex extension classes used by `seqmapping.py`.
"""
cdef extern from "stdlib.h":
void* malloc( size_t )
void free( void* )
cdef extern from "Python.h":
int PyObject_AsReadBuffer(object, const void **, Py_ssize_t *) except -1
int PyObject_AsWriteBuffer(object, void **, Py_ssize_t *) except -1
int PyBytes_AsStringAndSize(object, char **, Py_ssize_t *) except -1
from numpy import zeros
from math import floor
import random
import sys
cdef class CharToIntArrayMapping:
"""Mapping for converting strings to int arrays"""
cdef int table[256]
cdef int out_size
cdef object reverse_table
def __cinit__( self ):
"""Init empty mapping (all characters map to -1)"""
cdef int i
for i from 0 <= i < 256: self.table[i] = -1
self.out_size = 0
def __init__( self ):
self.reverse_table = dict()
def set_mapping( self, c, int symbol ):
"""Modify mapping so 'chars' map to 'symbol'"""
char = ord( c )
self.table[ char ] = symbol
if self.out_size <= symbol:
self.out_size = symbol + 1
self.reverse_table[ symbol ] = chr( char )
def translate( self, string ):
"""Translate 'string' and return as int array"""
cdef Py_ssize_t s_len, t_len
cdef unsigned char * s_buf
cdef int * t_buf
# Get direct access to string
PyBytes_AsStringAndSize( string, <char **> &s_buf, &s_len )
# Initialize empty array
rval = zeros( s_len, 'i' )
PyObject_AsWriteBuffer( rval, <void **> &t_buf, &t_len )
# Translate
for i from 0 <= i < s_len:
t_buf[i] = self.table[ s_buf[ i ] ]
# Done
return rval
def translate_list( self, strings ):
"""Translate a list of strings into an int array"""
cdef Py_ssize_t text_len, i
cdef Py_ssize_t s_len, t_len
cdef int factor
cdef unsigned char * s_buf
cdef int * t_buf
# No input, no output
if len( strings ) < 1: return None
# Length of result
text_len = len( strings[0] )
# Init result array
rval = zeros( text_len, 'i' )
PyObject_AsWriteBuffer( rval, <void **> &t_buf, &t_len )
# Loop over seqs and accumulate result values
factor = 1
for string in strings:
PyBytes_AsStringAndSize( string, <char **> &s_buf, &s_len )
for i from 0 <= i < text_len:
if t_buf[i] >= 0:
if self.table[ s_buf[i] ] == -1:
t_buf[i] = -1
else:
t_buf[i] = t_buf[i] + ( self.table[ s_buf[i] ] * factor )
factor = factor * self.out_size
return rval
def reverse_map( self, val, nseqs ):
factor = self.out_size ** (nseqs-1)
rval = []
while factor > 0:
rval.append( self.reverse_table[ int( floor( val / factor ) ) ] )
val = val - ( floor(val/factor) * factor )
factor = floor( factor / self.out_size )
rval.reverse()
return rval
def get_out_size( self ):
return self.out_size
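# A minimal usage sketch (not part of the original module; values are
# illustrative): build a DNA alphabet and translate a byte string.
# Characters without a mapping, such as the gap below, come back as -1.
#
#     m = CharToIntArrayMapping()
#     for i, c in enumerate("ACGT"):
#         m.set_mapping(c, i)
#     m.translate(b"ACGT-")   # -> array([ 0,  1,  2,  3, -1], dtype=int32)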
cdef class IntToIntMapping:
cdef int* table
cdef int in_size
cdef int out_size
def __cinit__( self, int in_size ):
self.in_size = in_size
self.table = <int*> malloc( in_size * sizeof( int ) )
if self.table == NULL: raise MemoryError( "Malloc Failed" )
for i from 0 <= i < in_size: self.table[i] = -1
self.out_size = 0
def __dealloc__( self ):
# sys.stderr.write( "freeing mapping_helper.IntToIntMapping\n" ); sys.stderr.flush()
free( self.table )
def set_mapping( self, int index, int symbol ):
assert ( -1 <= index < self.in_size ), "%d not between 0 and %s" % ( index, self.in_size )
self.table[index] = symbol
if self.out_size <= symbol:
self.out_size = symbol + 1
def translate( self, src ):
"""Translate `string` and return as int array"""
cdef Py_ssize_t s_len, t_len
cdef int *s_buf
cdef int *t_buf
# Get direct access to string
PyObject_AsReadBuffer( src, <const void **> &s_buf, &s_len )
s_len = s_len / sizeof( int )
assert s_len == len( src ), "`src` argument must be a buffer of 32bit integers"
# Initialize empty array
rval = zeros( s_len, 'i' )
PyObject_AsWriteBuffer( rval, <void **> &t_buf, &t_len )
# Translate
for i from 0 <= i < s_len:
if s_buf[i] == -1:
t_buf[i] = -1
elif s_buf[i] >= self.in_size:
t_buf[i] = -1
else:
t_buf[i] = self.table[ s_buf[ i ] ]
# Done
return rval
def __getitem__( self, int x ):
if x == -1: return -1
assert 0 <= x < self.in_size
return self.table[ x ]
def collapse( self, int a, int b ):
cdef int i
cdef IntToIntMapping copy
copy = IntToIntMapping( self.in_size )
copy.out_size = self.out_size - 1
if a > b: a, b = b, a
for i from 0 <= i < self.in_size:
if self.table[i] == b: copy.table[i] = a
elif self.table[i] == copy.out_size: copy.table[i] = b
else: copy.table[i] = self.table[i]
return copy
def expand( self, int x ):
"""Grow the alphabet by making 'a' a seperate symbol. If it already mapped to a single symbol, do nothing"""
cdef int i, count, a, b
cdef IntToIntMapping copy
# Get the symbol x maps to
a = self.table[x]
# Symbols that map to -1 should not be touched
if a < 0: return self
# Count how many other input symbols map to a
count = 0
for i from 0 <= i < self.in_size:
if self.table[i] == a: count = count + 1
# Already a singleton
if count < 2: return self
# Otherwise, make a copy with the separated symbol
copy = IntToIntMapping( self.in_size )
copy.out_size = self.out_size + 1
for i from 0 <= i < self.in_size:
copy.table[i] = self.table[i]
copy.table[x] = self.out_size
return copy
def expand_out( self, int a ):
"""Grow the alphabet breaking 'a' into two symbols randomly"""
cdef int i, count, to_split, b
cdef IntToIntMapping copy
count = 0
for i from 0 <= i < self.in_size:
if self.table[i] == a: count = count + 1
if count < 2: return self
copy = IntToIntMapping( self.in_size )
copy.out_size = self.out_size + 1
b = self.out_size
to_split = random.randrange( count )
count = 0
for i from 0 <= i < self.in_size:
if self.table[i] == a:
if count == to_split: copy.table[i] = b
else: copy.table[i] = a
count = count + 1
else:
copy.table[i] = self.table[i]
return copy
def expand_random_split( self, int a ):
"""Grow the alphabet breaking 'a' into two symbols randomly"""
cdef int i, count, b
cdef IntToIntMapping copy
count = 0
for i from 0 <= i < self.in_size:
if self.table[i] == a: count = count + 1
if count < 2: return self
copy = IntToIntMapping( self.in_size )
copy.out_size = self.out_size + 1
b = self.out_size
to_split = random.sample( range( count ), count // 2 )  # integer division; count/2 would raise TypeError on Python 3
count = 0
for i from 0 <= i < self.in_size:
if self.table[i] == a:
if count in to_split: copy.table[i] = b
else: copy.table[i] = a
count = count + 1
else:
copy.table[i] = self.table[i]
return copy
def get_in_size( self ):
return self.in_size
def get_out_size( self ):
return self.out_size
def get_table( self ):
rval = zeros( self.in_size, 'i' )
for i in range( self.in_size ):
rval[i] = self.table[i]
return rval
bx-python-0.8.13/lib/bx/align/ 0000775 0000000 0000000 00000000000 14156664651 0016022 5 ustar 00root root 0000000 0000000 bx-python-0.8.13/lib/bx/align/__init__.py 0000664 0000000 0000000 00000000340 14156664651 0020130 0 ustar 00root root 0000000 0000000 """
Support for dealing with (genome scale) sequence alignments. See `core` for
the abstract alignment classes and `maf`, `axt`, and `lav` for readers and
writers in various formats.
"""
from bx.align.core import * # noqa
bx-python-0.8.13/lib/bx/align/_core.pyx 0000664 0000000 0000000 00000000454 14156664651 0017656 0 ustar 00root root 0000000 0000000 """
Pyrex extension to speed up some operations in `core.py`.
"""
def coord_to_col( int start, char * text, int pos ):
cdef int col
col = 0
while start < pos:
# Note: ord( '-' ) = 45
if text[col] != 45:
start = start + 1
col = col + 1
return col bx-python-0.8.13/lib/bx/align/_epo.pyx 0000664 0000000 0000000 00000013212 14156664651 0017505 0 ustar 00root root 0000000 0000000
import logging, gzip
from collections import namedtuple
import numpy
cimport numpy
log = logging.getLogger(__name__)
cimport cython
DTYPE = numpy.uint64
cdef inline int max2( int a, int b ):
if b > a:
return b
return a
cdef inline int min2( int a, int b ):
if b < a:
return b
return a
def rem_dash(p, q):
"""remove dash columns and shift match intervals to the left. both iterables
are read on the same direction left-to-right.
"""
def myp(l):
if l: return l.pop(0)
def adv(queue, i, d):
# shifted interval
shi = i[0]-d, i[1]-d
assert shi[0] >= 0
if queue and queue[-1][1] == shi[0]:
# join to the preceding one
queue[-1] = (queue[-1][0], shi[1])
else:
queue.append( shi )
return queue
p_card = sum( map(lambda i: p[i][1] - p[i][0], range(len(p))) )
q_card = sum( map(lambda i: q[i][1] - q[i][0], range(len(q))) )
P, Q = [], []
dash = 0 # dash (on both cigars) count so far
a, b = p.pop(0), q.pop(0)
#while p or q:
while a and b:
assert dash <= min(a[0], b[0])
i = max(a[0], b[0]) - min(a[1], b[1])
if i >= 0: # no intersection
if a[1] <= b[0]:
if p:
i = min(i, p[0][0] - a[1])
P = adv(P, a, dash)
a = myp(p)
else:
if q:
i = min(i, q[0][0] - b[1])
Q = adv(Q, b, dash)
b = myp(q)
dash += i
else: # intersection
if a[1] >= b[1]:
Q = adv(Q, b, dash); b = myp(q)
elif a[1] < b[1]:
P = adv(P, a, dash); a = myp(p)
#if not a or not b: # no more matchings
# break
assert (not p) or (not q), "one or both should be empty: p=%s, q=%s" % (str(p), str(q))
if a: P = adv(P, a, dash)
if b: Q = adv(Q, b, dash)
# remaining intervals (in q or p)
r, R = p, P
if q: r, R = q, Q
# just extend the last interval by the remaining bases
R[-1] = (R[-1][0], R[-1][1] + sum( map(lambda i: i[1]-i[0], r) ))
P_card = sum( map(lambda i: P[i][1] - P[i][0], range(len(P))) )
Q_card = sum( map(lambda i: Q[i][1] - Q[i][0], range(len(Q))) )
assert p_card == P_card, "%d != %d" % (p_card, P_card)
assert q_card == Q_card, "%d != %d" % (q_card, Q_card)
return P, Q
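# A worked example (hypothetical intervals, mirroring the sketch in
# epo_tests.TestEpo.test_rem_dash): both sides share four all-dash
# columns, which are removed and the later intervals shifted left:
#
#     P, Q = rem_dash([(0, 4), (6, 10), (17, 21)],  # 4M2D4M7D4M
#                     [(0, 7), (14, 21)])           # 7M7D7M
#     # P == [(0, 4), (6, 10), (13, 17)]            # 4M2D4M3D4M
#     # Q == [(0, 7), (10, 17)]                     # 7M3D7M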
def fastLoadChain(fname, hf):
data = []
open_f = (fname.endswith(".gz") and gzip.open or open)
with open_f(fname, "rt") as fd:
while True:
line = fd.readline()
if line == "":
break
hd = hf(line)
N = []
line = fd.readline().split()
while len(line) == 3:
N.append( (int(line[0]), int(line[1]), int(line[2])) )
line = fd.readline().split()
if len(line) != 1:
raise ValueError("last matching block expected (found %s)" % str(line))
N.append( (int(line[0]), 0, 0) )
s, t, q = zip( *N )
data.append( (hd,
numpy.array(s, dtype=int),  # numpy.int was deprecated and later removed; builtin int is equivalent
numpy.array(t, dtype=int),
numpy.array(q, dtype=int)) )
assert hd.tEnd - hd.tStart == sum(s) + sum(t)
assert hd.qEnd - hd.qStart == sum(s) + sum(q)
fd.readline() # a blank line
log.info("parsed %d elements from %s" % (len(data), fname))
return data
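# Usage sketch (the file name is hypothetical): parse a UCSC .chain file
# using the Chain header factory from bx.align.epo; each parsed element is
# a (header, sizes, target_gaps, query_gaps) tuple.
#
#     from bx.align.epo import Chain
#     for hd, S, T, Q in fastLoadChain("hg19.mm9.chain", Chain._strfactory):
#         print(hd.id, len(S))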
@cython.wraparound(False)
@cython.boundscheck(False)
cpdef numpy.ndarray[numpy.uint64_t, ndim=2] bed_union( numpy.ndarray[numpy.uint64_t, ndim=2] elements ):
"""compute the union of these elements. simply walk the sorted elements and join the intersecting ones
works on half-open intervals, i.e., [a, b), [b, c) ---> [a, c)
@param elements: 2-dim numpy array of unsigned64 ints
@return: 2-dim numpy array of unsigned64 ints"""
assert numpy.shape(elements)[0] > 0
cdef Py_ssize_t cst, cen, i, j
cdef numpy.ndarray[numpy.uint64_t, ndim=2] tmp_elems, final_elems
elements.sort(axis=0)
assert elements[0][0] <= elements[numpy.shape(elements)[0]-1][0]
tmp_elems = numpy.zeros((numpy.shape(elements)[0], 2), dtype=DTYPE)
cst = elements[0, 0]
cen = elements[0, 1]
j = 0
for i in range(1, numpy.shape(elements)[0]):
if elements[i, 0] <= cen: # overlaps with the last one
cen = max2(cen, elements[i, 1])
else:
tmp_elems[j, 0] = cst
tmp_elems[j, 1] = cen
j += 1
cst = elements[i, 0]
cen = elements[i, 1]
tmp_elems[j, 0] = cst
tmp_elems[j, 1] = cen
j += 1
final_elems = numpy.empty((j, 2), dtype=DTYPE)
for i in range(j):
final_elems[i, 0] = tmp_elems[i, 0]
final_elems[i, 1] = tmp_elems[i, 1]
assert final_elems[0, 0] == elements[0, 0], "fe=%d, e=%d" % (final_elems[0,0], elements[0,0])
return final_elems
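# Example (hypothetical half-open intervals): overlapping and back-to-back
# elements collapse into their union.
#
#     bed_union(numpy.array([[0, 5], [5, 9], [12, 20], [14, 16]],
#                           dtype=numpy.uint64))
#     # -> array([[ 0,  9], [12, 20]], dtype=uint64)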
#@cython.wraparound(False)
#@cython.boundscheck(False)
cpdef numpy.ndarray[numpy.int64_t, ndim=2] cummulative_intervals(numpy.ndarray[numpy.int64_t, ndim=1] S,
numpy.ndarray[numpy.int64_t, ndim=1] D ):
"""compute cummulative intervals for this side of an aligmnent. S and D are one side of
the alignment as described in the chain file format"""
cdef int N = S.shape[0]
cdef int i = 0, j = 0
assert N == D.shape[0]
cdef numpy.ndarray[numpy.int64_t, ndim=2] cumm_i = numpy.empty((N, 2), dtype=numpy.int64)
cumm_i[0,0] = 0
cumm_i[0,1] = S[0]
for i in range(N-1):
j = i + 1
cumm_i[j,0] = cumm_i[i, 1] + D[i]
cumm_i[j,1] = cumm_i[j,0] + S[j]
return cumm_i
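# Example (hypothetical chain side): block sizes S and the gap D that
# follows each block yield the cumulative match intervals.
#
#     cummulative_intervals(numpy.array([4, 4, 4], dtype=numpy.int64),
#                           numpy.array([2, 3, 0], dtype=numpy.int64))
#     # -> array([[ 0,  4], [ 6, 10], [13, 17]])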
bx-python-0.8.13/lib/bx/align/axt.py 0000664 0000000 0000000 00000016454 14156664651 0017202 0 ustar 00root root 0000000 0000000 """
Support for reading and writing the `AXT`_ format used for pairwise
alignments.
.. _AXT: http://genome.ucsc.edu/goldenPath/help/axt.html
"""
from bx import interval_index_file
from bx.align import (
Alignment,
Component,
src_split
)
# Tools for dealing with pairwise alignments in AXT format
class MultiIndexed:
"""Similar to 'indexed' but wraps more than one axt_file"""
def __init__(self, axt_filenames, keep_open=False):
self.indexes = [Indexed(axt_file, axt_file + ".index") for axt_file in axt_filenames]
def get(self, src, start, end):
blocks = []
for index in self.indexes:
blocks += index.get(src, start, end)
return blocks
class Indexed:
"""Indexed access to a axt using overlap queries, requires an index file"""
def __init__(self, axt_filename, index_filename=None, keep_open=False, species1=None, species2=None, species_to_lengths=None, support_ids=False):
if index_filename is None:
index_filename = axt_filename + ".index"
self.indexes = interval_index_file.Indexes(filename=index_filename)
self.axt_filename = axt_filename
# nota bene: (self.species1 = species1 or "species1") is incorrect if species1=""
self.species1 = species1
if self.species1 is None:
self.species1 = "species1"
self.species2 = species2
if self.species2 is None:
self.species2 = "species2"
self.species_to_lengths = species_to_lengths
self.support_ids = support_ids # for extra text at end of axt header lines
if keep_open:
self.f = open(axt_filename)
else:
self.f = None
def get(self, src, start, end):
intersections = self.indexes.find(src, start, end)
return (self.get_axt_at_offset(val) for start, end, val in intersections)
def get_axt_at_offset(self, offset):
if self.f:
self.f.seek(offset)
return read_next_axt(self.f, self.species1, self.species2, self.species_to_lengths, self.support_ids)
else:
f = open(self.axt_filename)
try:
f.seek(offset)
return read_next_axt(f, self.species1, self.species2, self.species_to_lengths, self.support_ids)
finally:
f.close()
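# Usage sketch (paths, species names and coordinates are hypothetical, and
# the ".index" file is assumed to have been built already): random access
# to the blocks overlapping an interval.
#
#     idx = Indexed("pair.axt", species1="hg19", species2="mm9")
#     for block in idx.get("hg19.chr1", 10000, 20000):
#         print(block)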
class Reader:
"""Iterate over all axt blocks in a file in order"""
def __init__(self, file, species1=None, species2=None, species_to_lengths=None, support_ids=False):
self.file = file
# nota bene: (self.species1 = species1 or "species1") is incorrect if species1=""
self.species1 = species1
if self.species1 is None:
self.species1 = "species1"
self.species2 = species2
if self.species2 is None:
self.species2 = "species2"
self.species_to_lengths = species_to_lengths
self.support_ids = support_ids # for extra text at end of axt header lines
self.attributes = {}
def __next__(self):
return read_next_axt(self.file, self.species1, self.species2, self.species_to_lengths, self.support_ids)
def __iter__(self):
return ReaderIter(self)
def close(self):
self.file.close()
class ReaderIter:
def __init__(self, reader):
self.reader = reader
def __iter__(self):
return self
def __next__(self):
v = next(self.reader)
if not v:
raise StopIteration
return v
class Writer:
def __init__(self, file, attributes=None):
if attributes is None:
attributes = {}
self.file = file
self.block = 0
self.src_split = True
if "src_split" in attributes:
self.src_split = attributes["src_split"]
def write(self, alignment):
if len(alignment.components) != 2:
raise ValueError(
"%d-component alignment is not compatible with axt" %
len(alignment.components))
c1 = alignment.components[0]
c2 = alignment.components[1]
if c1.strand != "+":
c1 = c1.reverse_complement()
c2 = c2.reverse_complement()
if self.src_split:
spec1, chr1 = src_split(c1.src)
spec2, chr2 = src_split(c2.src)
else:
chr1, chr2 = c1.src, c2.src
self.file.write(
"%d %s %d %d %s %d %d %s %s\n" %
(self.block, chr1, c1.start+1, c1.start+c1.size,
chr2, c2.start+1, c2.start+c2.size, c2.strand,
alignment.score))
self.file.write("%s\n" % c1.text)
self.file.write("%s\n" % c2.text)
self.file.write("\n")
self.block += 1
def close(self):
self.file.close()
# ---- Helper methods ---------------------------------------------------------
# typical axt block:
# 0 chr19 3001012 3001075 chr11 70568380 70568443 - 3500 [optional text]
# TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA
# TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA
# start and stop are origin-1, inclusive
# first species is always on plus strand
# when second species is on minus strand, start and stop are counted from sequence end
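# A minimal round-trip sketch (file names are hypothetical): Reader yields
# bx.align.core.Alignment objects, which Writer serializes back to axt.
#
#     with open("in.axt") as in_f, open("out.axt", "w") as out_f:
#         writer = Writer(out_f)
#         for block in Reader(in_f, species1="hg19", species2="mm9"):
#             writer.write(block)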
def read_next_axt(file, species1, species2, species_to_lengths=None, support_ids=False):
line = readline(file, skip_blank=True)
if not line:
return
fields = line.split()
if len(fields) < 9 or (not support_ids and len(fields) > 9):
raise ValueError("bad axt-block header: %s" % line)
attributes = {}
if len(fields) > 9:
attributes["id"] = "_".join(fields[9:])
seq1 = readline(file)
if not seq1 or seq1.isspace():
raise ValueError("incomplete axt-block; header: %s" % line)
seq2 = readline(file)
if not seq2 or seq2.isspace():
raise ValueError("incomplete axt-block; header: %s" % line)
# Build 2 component alignment
alignment = Alignment(attributes=attributes, species_to_lengths=species_to_lengths)
# Build component for species 1
component = Component()
component.src = fields[1]
if species1 != "":
component.src = species1 + "." + component.src
component.start = int(fields[2]) - 1 # (axt intervals are origin-1
end = int(fields[3]) # and inclusive on both ends)
component.size = end - component.start
component.strand = "+"
component.text = seq1.strip()
alignment.add_component(component)
# Build component for species 2
component = Component()
component.src = fields[4]
if species2 != "":
component.src = species2 + "." + component.src
component.start = int(fields[5]) - 1
end = int(fields[6])
component.size = end - component.start
component.strand = fields[7]
component.text = seq2.strip()
alignment.add_component(component)
# add score
try:
alignment.score = int(fields[8])
except ValueError:
try:
alignment.score = float(fields[8])
except ValueError:
alignment.score = fields[8]
return alignment
def readline(file, skip_blank=False):
"""Read a line from provided file, skipping any blank or comment lines"""
while True:
line = file.readline()
if not line:
return None
if line[0] != '#' and not (skip_blank and line.isspace()):
return line
bx-python-0.8.13/lib/bx/align/core.py 0000664 0000000 0000000 00000043447 14156664651 0017340 0 ustar 00root root 0000000 0000000 """
Classes that represent alignments between multiple sequences.
"""
import random
import weakref
from bx.misc.readlengths import read_lengths_file
# DNA reverse complement table
# DNA_COMP = " - " \
# " TVGH CD M KN YSA BWXR tvgh cd m kn ysa bwxr " \
# " " \
# " "
DNA_COMP = str.maketrans("ACGTacgt", "TGCAtgca")
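# e.g. "ACGT".translate(DNA_COMP) == "TGCA"; reversing the translated text
# gives the reverse complement, as done in Component.reverse_complement below.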
class Alignment:
def __init__(self, score=0, attributes=None, species_to_lengths=None):
# species_to_lengths is needed only for file formats that don't provide
# chromosome lengths; it maps each species name to one of these:
# - the name of a file that contains a list of chromosome length pairs
# - a dict mapping chromosome names to their length
# - a single length value (useful when we just have one sequence and no chromosomes)
# internally a file name is replaced by a dict, but only on an "as
# needed" basis
if attributes is None:
attributes = {}
self.score = score
self.text_size = 0
self.attributes = attributes
if species_to_lengths is None:
self.species_to_lengths = {}
else:
self.species_to_lengths = species_to_lengths
self.components = []
def add_component(self, component):
component._alignment = weakref.ref(self)
self.components.append(component)
if component.text is not None:
if self.text_size == 0:
self.text_size = len(component.text)
elif self.text_size != len(component.text):
raise Exception("Components must have same text length")
def get_score(self):
return self.__score
def set_score(self, score):
if isinstance(score, str):
try:
score = int(score)
except ValueError:
try:
score = float(score)
except ValueError:
pass
self.__score = score
score = property(fget=get_score, fset=set_score)
def __str__(self):
s = "a score=" + str(self.score)
for key in self.attributes:
s += f" {key}={self.attributes[key]}"
s += "\n"
# Components
for c in self.components:
s += str(c)
s += "\n"
return s
def src_size(self, src):
species, chrom = src_split(src)
if species in self.species_to_lengths:
chrom_to_length = self.species_to_lengths[species]
elif chrom in self.species_to_lengths:
chrom_to_length = self.species_to_lengths
else:
raise ValueError("no src_size (no length file for %s)" % species)
if isinstance(chrom_to_length, int): # (if it's a single length)
return chrom_to_length
if isinstance(chrom_to_length, str): # (if it's a file name)
chrom_to_length = read_lengths_file(chrom_to_length)
self.species_to_lengths[species] = chrom_to_length
if chrom not in chrom_to_length:
raise ValueError(f"no src_size ({species} has no length for {chrom})")
return chrom_to_length[chrom]
def get_component_by_src(self, src):
for c in self.components:
if c.src == src:
return c
return None
def get_components_by_src(self, src):
for c in self.components:
if c.src == src:
yield c
def get_component_by_src_start(self, src):
for c in self.components:
if c.src.startswith(src):
return c
return None
def slice(self, start, end):
new = Alignment(score=self.score, attributes=self.attributes)
for component in self.components:
new.components.append(component.slice(start, end))
new.text_size = end - start
return new
def reverse_complement(self):
new = Alignment(score=self.score, attributes=self.attributes)
for component in self.components:
new.components.append(component.reverse_complement())
new.text_size = self.text_size
return new
def slice_by_component(self, component_index, start, end):
"""
Return a slice of the alignment, corresponding to a coordinate interval in a specific component.
component_index is one of
an integer offset into the components list
a string indicating the src of the desired component
a component
start and end are relative to the + strand, regardless of the component's strand.
"""
if isinstance(component_index, int):
ref = self.components[component_index]
elif isinstance(component_index, str):
ref = self.get_component_by_src(component_index)
elif isinstance(component_index, Component):
ref = component_index
else:
raise ValueError("can't figure out what to do")
start_col = ref.coord_to_col(start)
end_col = ref.coord_to_col(end)
if ref.strand == '-':
(start_col, end_col) = (end_col, start_col)
return self.slice(start_col, end_col)
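# Usage sketch (source name and coordinates are hypothetical): extract
# the sub-alignment covering positions 100-200 of the human component,
# regardless of that component's strand:
#     sub = aln.slice_by_component("hg19.chr1", 100, 200)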
def column_iter(self):
# FIXME: Empty components are not present
# in column_iter.
# Maybe it would be good to use - and =
for i in range(self.text_size):
yield [c.text[i] for c in self.components if not c.empty]
def limit_to_species(self, species):
new = Alignment(score=self.score, attributes=self.attributes)
new.text_size = self.text_size
for component in self.components:
if component.src.split('.')[0] in species:
new.add_component(component)
return new
def remove_all_gap_columns(self):
"""
Remove any columns containing only gaps from alignment components,
text of components is modified IN PLACE.
"""
seqs = []
for c in self.components:
if c.empty:
# empty components have no text; skip them here so seqs stays aligned with self.components
seqs.append(None)
continue
try:
seqs.append(list(c.text))
except TypeError:
seqs.append(None)
i = 0
text_size = self.text_size
while i < text_size:
all_gap = True
for seq in seqs:
if seq is None:
continue
if seq[i] != '-':
all_gap = False
if all_gap:
for seq in seqs:
if seq is None:
continue
del seq[i]
text_size -= 1
else:
i += 1
for i in range(len(self.components)):
if seqs[i] is None:
continue
self.components[i].text = ''.join(seqs[i])
self.text_size = text_size
def __eq__(self, other):
if other is None or not isinstance(other, type(self)):
return False
if self.score != other.score:
return False
if self.attributes != other.attributes:
return False
if len(self.components) != len(other.components):
return False
for c1, c2 in zip(self.components, other.components):
if c1 != c2:
return False
return True
def __ne__(self, other):
return not(self.__eq__(other))
def __deepcopy__(self, memo):
from copy import deepcopy
new = Alignment(score=self.score, attributes=deepcopy(self.attributes), species_to_lengths=deepcopy(self.species_to_lengths))
for component in self.components:
new.add_component(deepcopy(component))
return new
class Component:
def __init__(self, src='', start=0, size=0, strand=None, src_size=None, text=''):
self._alignment = None
self.src = src
self.start = start # Nota Bene: start,size,strand are as they
self.size = size # .. appear in a MAF file-- origin-zero, end
self.strand = strand # .. excluded, and minus strand counts from
self._src_size = src_size # .. end of sequence
self.text = text
self.quality = None
# Optional fields to keep track of synteny status (only makes sense
# when the alignment is part of an ordered set)
self.synteny_left = None
self.synteny_right = None
self.synteny_empty = None
# If true, this component actually represents a non-aligning region,
# and text is None.
self.empty = False
# Index maps a coordinate (distance along + strand from + start) to alignment column
self.index = None
def __str__(self):
if self.empty:
rval = "e %s %d %d %s %d %s" % (
self.src, self.start, self.size, self.strand, self.src_size, self.synteny_empty)
else:
rval = "s %s %d %d %s %d %s" % (
self.src, self.start, self.size, self.strand, self.src_size, self.text)
if self.synteny_left and self.synteny_right:
rval += "\ni %s %s %d %s %d" % (
self.src, self.synteny_left[0], self.synteny_left[1],
self.synteny_right[0], self.synteny_right[1])
return rval
def get_end(self):
return self.start + self.size
end = property(fget=get_end)
def get_src_size(self):
if self._src_size is None:
if self._alignment is None:
raise Exception("component has no src_size")
self._src_size = self._alignment().src_size(self.src)
return self._src_size
def set_src_size(self, src_size):
self._src_size = src_size
src_size = property(fget=get_src_size, fset=set_src_size)
def get_forward_strand_start(self):
if self.strand == '-':
return self.src_size - self.end
else:
return self.start
forward_strand_start = property(fget=get_forward_strand_start)
def get_forward_strand_end(self):
if self.strand == '-':
return self.src_size - self.start
else:
return self.end
forward_strand_end = property(fget=get_forward_strand_end)
def reverse_complement(self):
start = self.src_size - self.end
if self.strand == "+":
strand = "-"
else:
strand = "+"
if self.empty:
text = None
else:
comp = [ch for ch in self.text.translate(DNA_COMP)]
comp.reverse()
text = "".join(comp)
new = Component(self.src, start, self.size, strand, self._src_size, text)
if self.empty:
new.empty = True
new.synteny_empty = self.synteny_empty
# Propagate supplementary info
if self.synteny_left:
new.synteny_right = self.synteny_left
if self.synteny_right:
new.synteny_left = self.synteny_right
new._alignment = self._alignment
return new
def slice(self, start, end):
new = Component(src=self.src, start=self.start, strand=self.strand, src_size=self._src_size)
new._alignment = self._alignment
if self.empty:
new.empty = True
new.size = self.size
new.text = None
new.synteny_empty = self.synteny_empty
return new
new.text = self.text[start:end]
# for i in range( 0, start ):
# if self.text[i] != '-': new.start += 1
# for c in new.text:
# if c != '-': new.size += 1
new.start += start - self.text.count('-', 0, start)
new.size = len(new.text) - new.text.count('-')
# FIXME: This annotation probably means nothing after slicing if
# one of the ends changes. In general the 'i' rows of a MAF only
# make sense in context (relative to the previous and next alignments
# in a stream, slicing breaks that).
# LD: Indeed, I think it is wrong to keep them. Let's keep the info
# only when the boundaries are kept.
if self.synteny_left:
if start == 0:
new.synteny_left = self.synteny_left
if self.synteny_right:
if end == len(self.text):
new.synteny_right = self.synteny_right
return new
def slice_by_coord(self, start, end):
"""
Return the slice of the component corresponding to a coordinate interval.
start and end are relative to the + strand, regardless of the component's strand.
"""
start_col = self.coord_to_col(start)
end_col = self.coord_to_col(end)
if (self.strand == '-'):
(start_col, end_col) = (end_col, start_col)
return self.slice(start_col, end_col)
def coord_to_col(self, pos):
"""
Return the alignment column index corresponding to coordinate pos.
pos is relative to the + strand, regardless of the component's strand.
"""
if self.empty:
raise ValueError("There is no column index. It is empty.")
start, end = self.get_forward_strand_start(), self.get_forward_strand_end()
if pos < start or pos > end:
raise ValueError("Range error: %d not in %d-%d" % (pos, start, end))
if not self.index:
self.index = list()
if self.strand == '-':
# nota bene: for - strand self.index[x] maps to one column
# higher than is actually associated with the position; thus
# when slice_by_component() and slice_by_coord() flip the ends,
# the resulting slice is correct
for x in range(len(self.text)-1, -1, -1):
if not self.text[x] == '-':
self.index.append(x + 1)
self.index.append(0)
else:
for x in range(len(self.text)):
if not self.text[x] == '-':
self.index.append(x)
self.index.append(len(self.text))
x = None
try:
x = self.index[pos - start]
except IndexError:
raise Exception("Error in index.")
return x
def __eq__(self, other):
if other is None or not isinstance(other, type(self)):
return False
return (self.src == other.src
and self.start == other.start
and self.size == other.size
and self.strand == other.strand
and self._src_size == other._src_size
and self.text == other.text
and self.synteny_left == other.synteny_left
and self.synteny_right == other.synteny_right
and self.synteny_empty == other.synteny_empty
and self.empty == other.empty)
def __ne__(self, other):
return not(self.__eq__(other))
def __deepcopy__(self, memo):
new = Component(src=self.src, start=self.start, size=self.size, strand=self.strand, src_size=self._src_size, text=self.text)
new._alignment = self._alignment
new.quality = self.quality
new.synteny_left = self.synteny_left
new.synteny_right = self.synteny_right
new.synteny_empty = self.synteny_empty
new.empty = self.empty
new.index = self.index
return new
def get_reader(format, infile, species_to_lengths=None):
import bx.align.axt
import bx.align.lav
import bx.align.maf
if format == "maf":
return bx.align.maf.Reader(infile, species_to_lengths)
elif format == "axt":
return bx.align.axt.Reader(infile, species_to_lengths)
elif format == "lav":
return bx.align.lav.Reader(infile)
else:
raise ValueError("Unknown alignment format %s" % format)
def get_writer(format, outfile, attributes=None):
import bx.align.axt
import bx.align.lav
import bx.align.maf
if attributes is None:
attributes = {}
if format == "maf":
return bx.align.maf.Writer(outfile, attributes)
elif format == "axt":
return bx.align.axt.Writer(outfile, attributes)
elif format == "lav":
return bx.align.lav.Writer(outfile, attributes)
else:
raise ValueError("Unknown alignment format %s" % format)
def get_indexed(format, filename, index_filename=None, keep_open=False, species_to_lengths=None):
import bx.align.axt
import bx.align.lav
import bx.align.maf
if format == "maf":
return bx.align.maf.Indexed(filename, index_filename, keep_open, species_to_lengths)
elif format == "axt":
return bx.align.axt.Indexed(filename, index_filename, keep_open, species_to_lengths)
elif format == "lav":
raise Exception("LAV support for Indexed has not been implemented")
else:
raise ValueError("Unknown alignment format %s" % format)
def shuffle_columns(a):
"""Randomize the columns of an alignment"""
mask = list(range(a.text_size))
random.shuffle(mask)
for c in a.components:
if not c.empty:
c.text = ''.join([c.text[i] for i in mask])
def src_split(src): # splits src into species,chrom
dot = src.rfind(".")
if dot == -1:
return None, src
else:
return src[:dot], src[dot+1:]
def src_merge(species, chrom, contig=None): # creates src (inverse of src_split)
if species is None:
src = chrom
else:
src = species + "." + chrom
if contig is not None:
src += "[%s]" % contig
return src
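# Example: src fields join species and chromosome with a dot, so the two
# functions are inverses for dotted names.
#
#     src_split("hg19.chr7")      # -> ("hg19", "chr7")
#     src_merge("hg19", "chr7")   # -> "hg19.chr7"
#     src_split("chr7")           # -> (None, "chr7")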
# ---- Read C extension if available ---------------------------------------
try:
from ._core import coord_to_col
except ImportError:
def coord_to_col(start, text, pos):
col = 0
while start < pos:
if text[col] != '-':
start += 1
col += 1
return col
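# Example (illustrative values): starting at sequence coordinate 10 in the
# gapped text "AC--GT", two bases must be consumed to reach coordinate 12,
# so the result is column 2; gap columns never advance the coordinate.
#
#     coord_to_col(10, "AC--GT", 12)   # -> 2
#     coord_to_col(10, "AC--GT", 13)   # -> 5 (steps over both gap columns)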
bx-python-0.8.13/lib/bx/align/epo.py 0000664 0000000 0000000 00000026403 14156664651 0017164 0 ustar 00root root 0000000 0000000 """Classes and utilities for multiple alignments from the EPO pipeline"""
import logging
import os
import pickle as cPickle
import re
from collections import namedtuple
from ._epo import ( # noqa: F401
bed_union,
cummulative_intervals,
fastLoadChain,
rem_dash
)
log = logging.getLogger(__name__)
class Chain(namedtuple('Chain', 'score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd id')):
"""A Chain header as in http://genome.ucsc.edu/goldenPath/help/chain.html
chain coordinates are with respect to the strand, so for example tStart on the + strand is the
distance from the leftmost position; tStart on the - strand is the distance from the rightmost position."""
__slots__ = ()
def __str__(self):
return "chain {score} {tName} {tSize} {tStrand} {tStart} {tEnd} {qName} {qSize} {qStrand} {qStart} {qEnd} {id}".format(**self._asdict())
@classmethod
def _strfactory(cls, line):
"""factory class method for Chain
:param line: header of a chain (in .chain format)
"""
assert isinstance(line, str), "this is a factory from string"
line = line.rstrip().split()[1:] # the first component is the keyword "chain"
tup = [t[0](t[1]) for t in zip([int, str, int, str, int, int, str, int, str, int, int, str], line)]
return tuple.__new__(cls, tup)
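# Usage sketch (the header values are hypothetical): parse one .chain
# header line into a Chain tuple.
#
#     Chain._strfactory(
#         "chain 3500 chr19 58617616 + 3001012 3001075 "
#         "chr11 122082543 - 70568380 70568443 1")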
@classmethod
def _make_from_epo(cls, trg_comp, qr_comp, trg_chrom_sizes, qr_chrom_sizes):
"""crate a chain of collinear rings from the given components.
The target of the chain will always be on the forward strand.
This is done to avoid confusion when mapping psl files. So,
if trg_comp.strand=-, qr_comp.strand=- (resp. +) the
chain header will have tStrand=+, qStrand=+ (resp. -). No strand
changes on the other cases.
:param trg_comp: target (i.e, the first) component
:type trg_comp: L{EPOitem}
:param qr_comp: query (i.e, the second) component
:type qr_comp: L{EPOitem}
:param trg_chrom_sizes: chromosome sizes of the target
:type trg_chrom_sizes: dictionary of the type (chrom) --> size
:param qr_chrom_sizes: chromosome sizes of the query
:type qr_chrom_sizes: dictionary of the type (chrom) --> size
:return: A L{Chain} instance"""
# size, target, query arrays
S, T, Q = [], [], []
# the target strand of the chain must be on the forward strand
trg_intervals = trg_comp.intervals(reverse=trg_comp.strand == '-')
qr_intervals = qr_comp.intervals(reverse=trg_comp.strand == '-')
if len(trg_intervals) == 0 or len(qr_intervals) == 0:
log.warning("deletion/insertion only intervals")
return None
A, B = rem_dash(trg_intervals, qr_intervals)
# correct for when cigar starts/ends with dashes (in number of bases)
tr_start_correction = max(B[0][0] - A[0][0], 0)
tr_end_correction = max(A[-1][1] - B[-1][1], 0)
qr_start_correction = max(A[0][0] - B[0][0], 0)
qr_end_correction = max(B[-1][1] - A[-1][1], 0)
a, b = A.pop(0), B.pop(0)
# intervals are 0-based, half-open => lengths = coordinate difference
while A or B:
if a[1] < b[1]:
T.append(0)
Q.append(A[0][0] - a[1])
S.append(min(a[1], b[1]) - max(a[0], b[0]))
a = A.pop(0)
elif b[1] < a[1]:
Q.append(0)
T.append(B[0][0] - b[1])
S.append(min(a[1], b[1]) - max(a[0], b[0]))
b = B.pop(0)
elif A and B:
assert 1 > 2, "there are dash columns"
else:
break
S.append(min(a[1], b[1]) - max(a[0], b[0]))
assert len(T) == len(Q) == len(S) - 1, "(S, T, Q) = (%d, %d, %d)" % tuple(map(len, (S, T, Q)))
tSize = trg_chrom_sizes[trg_comp.chrom]
qSize = qr_chrom_sizes[qr_comp.chrom]
# UCSC coordinates are 0-based, half-open and e! coordinates are 1-base, closed
# chain_start = epo_start - 1 and chain_end = epo_end
if qr_comp.strand == '+':
chain = Chain(
0, trg_comp.chrom, tSize, "+",
(trg_comp.start - 1) + tr_start_correction, trg_comp.end - tr_end_correction,
qr_comp.chrom, qSize, (qr_comp.strand == trg_comp.strand and '+' or '-'),
(qr_comp.start - 1) + qr_start_correction, qr_comp.end - qr_end_correction,
qr_comp.gabid)
else:
chain = Chain(
0, trg_comp.chrom, tSize, "+",
(trg_comp.start - 1) + tr_start_correction, trg_comp.end - tr_end_correction,
qr_comp.chrom, qSize, (qr_comp.strand == trg_comp.strand and '+' or '-'),
(qr_comp.start - 1) + qr_end_correction, qr_comp.end - qr_start_correction,
qr_comp.gabid)
# strand correction. in UCSC coordinates this is: size - coord
if chain.qStrand == '-':
chain = chain._replace(
qEnd=chain.qSize - chain.qStart,
qStart=chain.qSize - chain.qEnd)
assert chain.tEnd - chain.tStart == sum(S) + sum(T), "[%s] %d != %d" % (
str(chain), chain.tEnd - chain.tStart, sum(S) + sum(T))
assert chain.qEnd - chain.qStart == sum(S) + sum(Q), "[%s] %d != %d" % (
str(chain), chain.qEnd - chain.qStart, sum(S) + sum(Q))
return chain, S, T, Q
def slice(self, who):
"return the slice entry (in a bed6 format), AS IS in the chain header"
assert who in ('t', 'q'), "who should be 't' or 'q'"
if who == 't':
return (self.tName, self.tStart, self.tEnd, self.id, self.score, self.tStrand)
else:
return (self.qName, self.qStart, self.qEnd, self.id, self.score, self.qStrand)
def bedInterval(self, who):
"return a BED6 entry, thus DOES coordinate conversion for minus strands"
if who == 't':
st, en = self.tStart, self.tEnd
if self.tStrand == '-':
st, en = self.tSize-en, self.tSize-st
return (self.tName, st, en, self.id, self.score, self.tStrand)
else:
st, en = self.qStart, self.qEnd
if self.qStrand == '-':
st, en = self.qSize-en, self.qSize-st
assert en-st == self.qEnd - self.qStart
return (self.qName, st, en, self.id, self.score, self.qStrand)
@classmethod
def _parse_file(cls, path, pickle=False):
"""parse a .chain file into a list of the type [(L{Chain}, arr, arr, arr) ...]
:param path: name of the chain file (a cached or explicit "*.pkl" is loaded when available)"""
fname = path
if fname.endswith(".gz"):
fname = path[:-3]
if fname.endswith('.pkl'):
# you asked for the pickled file. I'll give it to you
log.debug("loading pickled file %s ...", fname)
with open(fname, "rb") as f:
return cPickle.load(f)
elif os.path.isfile("%s.pkl" % fname):
# there is a cached version I can give to you
log.info("loading pickled file %s.pkl ...", fname)
if os.stat(path).st_mtime > os.stat("%s.pkl" % fname).st_mtime:
log.critical("*** pickled file %s.pkl is not up to date ***", fname)
try:
with open("%s.pkl" % fname, "rb") as f:
return cPickle.load(f)
except Exception:
log.warning("Loading pickled file %s.pkl failed", fname)
data = fastLoadChain(path, cls._strfactory)
if pickle and not os.path.isfile('%s.pkl' % fname):
log.info("pickling to %s.pkl", fname)
with open('%s.pkl' % fname, 'wb') as f:
cPickle.dump(data, f)
return data
class EPOitem(namedtuple('Epo_item', 'species gabid chrom start end strand cigar')):
"this format is how alignments are delivered from e!"
__slots__ = ()
cigar_pattern = re.compile(r"(\d*)([MD])")
def __repr__(self):
return str(self)
def __str__(self):
c = self.cigar[:5] + "..." + self.cigar[-5:]
return "(%s %s %s %d %d %s %s)" % tuple(self[:6] + (c,))
@classmethod
def _strfactory(cls, line):
"""factory method for an EPOitem
:param line: a line of input"""
cmp = line.rstrip().split()
chrom = cmp[2]
if not chrom.startswith("chr"):
chrom = "chr%s" % chrom
instance = tuple.__new__(
cls,
(cmp[0], cmp[1], chrom, int(cmp[3]), int(cmp[4]), {'1': '+', '-1': '-'}[cmp[5]], cmp[6]))
span = instance.end - instance.start + 1
m_num = sum((t[1] == "M" and [t[0]] or [0])[0] for t in instance.cigar_iter(False))
if span != m_num:
log.warning("[{gabid}] {species}.{chrom}:{start}-{end}.".format(**instance._asdict()) + "(span) %d != %d (matches)" % (span, m_num))
return None
return instance
@classmethod
def _parse_epo(cls, fname):
"""Load an entire file in the EPO format into a dictionary of the type {gab_id => [Epoitem, ...]}
:param fname: file name"""
data = {}
with open(fname) as fd:
for el in (cls._strfactory(_) for _ in fd):
if el:
data.setdefault(el.gabid, []).append(el)
log.info("parsed %d elements from %s", len(data), fname)
return data
def cigar_iter(self, reverse):
"""self.cigar => [(length, type) ... ] iterate the cigar
:param reverse: whether to iterate in the reverse direction (right-to-left)
:type reverse: boolean
:return a list of pairs of the type [(length, M/D) ..]
"""
l = 0
P = self.cigar_pattern
data = []
cigar = self.cigar
parsed_cigar = re.findall(P, cigar)
if reverse:
parsed_cigar = parsed_cigar[::-1]
for _l, t in parsed_cigar:
# 1M is encoded as M
l = (_l and int(_l) or 1) # int(_l) cannot be 0
data.append((l, t))
return data
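# Example: "4MD4M2DM" iterates (left-to-right) to
# [(4, 'M'), (1, 'D'), (4, 'M'), (2, 'D'), (1, 'M')]; a bare letter means
# length 1. Compare the intervals() docstring below, which uses the same cigar.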
def intervals(self, reverse, thr=0):
"""return a list of (0-based half-open) intervals representing the match regions of the cigar
for example 4MD4M2DM with reverse=False will produce [(0,4), (5,9), (11,12)]
4MD4M2DM with reverse=True will produce [(0,1), (3,7), (8,12)] (= 12 - previous interval)
:param reverse: whether to iterate in the reverse direction (right-to-left) (this is passed as is to self.cigar_iter)
:type reverse: boolean
:param thr: shift all intervals by this much
:type thr: integer
:return: list of pairs"""
d = [(thr, thr)]
dl = 0
for tup in self.cigar_iter(reverse):
if tup[1] == "D":
dl = tup[0]
else:
s = d[-1][1] + dl
d.append((s, s+tup[0]))
assert d[0] == (thr, thr)
# assert that nr. of Ms in the interval == sum of produced intervals
assert sum(t[0] for t in self.cigar_iter(False) if t[1] == "M") == sum(t[1]-t[0] for t in d)
d_sum = sum(t[1]-t[0] for t in d)
assert self.end - self.start + 1 == d_sum, "[ (%d, %d) = %d ] != %d" % (
self.start, self.end, self.end-self.start+1, d_sum)
return d[1:] # clip the (thr, thr) entry
bx-python-0.8.13/lib/bx/align/epo_tests.py 0000664 0000000 0000000 00000021214 14156664651 0020401 0 ustar 00root root 0000000 0000000 "tests for bx.align.epo"
import pdb
import random
import unittest
import numpy as np
from bx.align._epo import (
bed_union,
cummulative_intervals,
)
from bx.align.epo import (
Chain,
EPOitem
)
class TestBed(unittest.TestCase):
def setUp(self):
self.N = random.randint(1, 1000)
def test_ci(self):
S, D = [], []
for i in range(self.N):
S.append(random.randint(10, 50))
D.append(random.randint(10, 50))
D[-1] = 0
C = cummulative_intervals(np.array(S, dtype=np.int64), np.array(D, dtype=np.int64))
for i in range(self.N):
assert C[i, 1] - C[i, 0] == S[i]
for i in range(1, self.N):
assert C[i, 0] - C[i-1, 1] == D[i-1], "[%d] %d != %d" % (i, C[i, 0] - C[i-1, 1], D[i-1])
def test_elem_u(self):
# back to back, so should return a single interval
EL = []
th = 0
for i in range(self.N):
size = random.randint(1, 20)
EL.append((th, th+size))
th += size
U = bed_union(np.array(EL, dtype=np.uint64))
assert U[0, 0] == 0 and U[0, 1] == th
# disjoint
EL = []
th = 0
for i in range(self.N):
size = random.randint(1, 20)
EL.append((th, th+size))
th += (size + 1)
U = bed_union(np.array(EL, dtype=np.uint64))
for i in range(U.shape[0]):
assert (U[i, 0], U[i, 1]) == EL[i]
# random with some empty elements
EL = []
th = 0
for i in range(self.N):
size = random.randint(1, 20)
EL.append((th, th+size))
th += random.randint(1, size+size) # 50% of overlapping
U = bed_union(np.array(EL, dtype=np.uint64))
assert U[0, 1] > U[0, 0]
for i in range(1, U.shape[0]):
assert U[i, 1] > U[i, 0]
assert U[i, 0] > U[i-1, 1]
cigar_pairs = [
("GGACCTGGAGAGATCAG---------------------------GACTTCAACTGTGTG-------------TCTTAGACTGGG--------AGGGTGTTA",
"AGGCCAGGAGAGATCAGGTAAGTCTTAATTTAATAAAGAGATAGGACCTGAACTGTGTCTAACAATAGGTAATATTAGACTGGGGGAGAGAGAAGACTTTC"),
("TTT--------------------------------------------------------------------------------------------------------------------T",
"CTTGTACCAAGGACAGTACTGGCAGCCTAATTGCTAACACTTTGTGGTGGATTGGTCCACTCAATATTTGTTCCCACCTCTTTTCAGTCCAGTTCTATAAAGGACAGAAAGTTGAAAACT"),
("A-------------------------------------------------ACACTGGACACAGCACTAACACGATTACTTA",
"ACATTTCCCACACTCCCTTGCAGCTAGGTTTCTAGATATAATTTAGATTCCA----------------------------A"),
("TTTGGTCCTCTGGA------CGAGCAGCCAGTGCT---------------------------------------------------------------------------AAAAAAAA",
"T---CATTCTAGCAGGTGCTGCAGCAGCAGGTAGCCCTGGAGCCAACAGTTGTGGCTATGATTCTTGATCATCAGATTTGGCTCAAGTGATGTGTTCCTCTAGCATGCACTTGAGATA"),
("G-----------------------C----------------------------------------------------------------------------------------A",
"GGCCTGCACTGCCAGTAATTTTAACAAATTTTTAGGCACTGAATTCCCTGTATTAAATCTGTTTTCCTTAGCGTAAACAGATCTCTGTTAAATGAAACTAAACCCTGACTGATA"),
("TATT----------------------------------T",
"TCCTTCATTTTATTTCTCCCTTAAAATTTTTTTTATTACT"),
("TAAAAA--A------A------------------------------------------------------------TTTTTTTTTTT",
"T---AATTATTTTGCAGCAGGTCCTTGATAACATATCATCTATAAATATTTCAGCAAGAATCTCTAAAAGGCAAGAACCTCCTTCTT"),
("AAACAA---------------------------------------TT---T",
"AAACAATACCACTGCATCACTATCAAACCCAAAAAATAACAAAAATTGGGT"),
("TCTTAAC---TGCTGAGCCATCCCTCCAGCTCCTGTTTTATTTTTATTATGAAGTAATAATA--ATAG--TAATAATAATGATG",
"TACACTTAATTCTAAAACTTGTTATGAATCATCA----------TTGG--TTTTTTATTGTGAAGAACTAATATAATCAGA--G"),
("ATGATAATGGTATCCTAGCTCAACACCTG-GAGTTCACCCCAACAGTTAACTAA----GTTTGAGGAAGTGTTAACAAGCCTA---ACAAAGAGGACATGCCAATAGCTGACAGAGTCAC",
"A-------CCTCTGCTAGCTCAACTCCTGAGAATCAATTATATAAGCTAGGTCAGTGGTTTTGAGAAAGTATTAGTAGACATTTCTCCAAAGAATACATAAAAATGGCC-A--CAAGTAT")
]
def toCigar(species, id, s):
I = [(0, 0)]
L = [len(_) for _ in s.split("-")]
NZ = [_ for _ in L if _]
if L[0] > 0:
I.append((0, L[0]))
NZ = NZ[1:]
L = L[1:]
for i in range(len(NZ)):
L.insert(0, 0)
size = NZ[i]
start = L.index(size)
I.append((I[-1][1] + start, I[-1][1]+start+size))
L = L[start+1:]
if len(L):
I.append((I[-1][1] + len(L), I[-1][1] + len(L)))
C = []
for i in range(1, len(I)):
dl = I[i][0] - I[i-1][1]
ml = I[i][1] - I[i][0]
dc = ""
if dl:
dc = (dl > 1 and str(dl) or "") + "D"
mc = ""
if ml:
mc = (ml > 1 and str(ml) or "") + "M"
C.append(dc+mc)
MSUM = sum(i[1]-i[0] for i in I)
start = random.randint(50, 10000)
return "%s\t%d\t1\t%d\t%d\t%d\t%s" % (species, id, start, start+MSUM-1, random.choice((-1, 1)), "".join(C))
class TestEpo(unittest.TestCase):
def setUp(self):
self.epo_records = []
for i, (t, q) in enumerate(cigar_pairs):
gab_pair = (toCigar("homo_sapiens", i, t), toCigar("mus_musculus", i, q))
A = EPOitem._strfactory(gab_pair[0])
B = EPOitem._strfactory(gab_pair[1])
if A and B:
self.epo_records.append((A, B))
def test_out(self):
def ch(c, ci):
th = 0
for l, t in ci:
if t == 'M':
assert c[th:th+l].find('-') == -1
else:
assert c[th:th+l] == '-' * l
th += l
for (a, b) in self.epo_records:
ca, cb = cigar_pairs[int(a.gabid)]
ch(ca, a.cigar_iter(False))
ch(cb, b.cigar_iter(False))
def test_make_chain(self):
def cch(cigar, s, e):
return cigar[s:e].find('-') == -1
for p in self.epo_records:
chain = Chain._make_from_epo(p[0], p[1], {"chr1": 500}, {"chr1": 800})
if not chain:
continue
ch, S, T, Q = chain
i = int(ch.id)
c1, c2 = cigar_pairs[i]
if p[0].strand == '-':
c1 = c1[::-1]
c2 = c2[::-1]
th = 0
for s, t, q in zip(S, T, Q):
if not (cch(c1, th, th+s) and cch(c2, th, th+s)):
pdb.set_trace()
assert cch(c1, th, th+s) and cch(c2, th, th+s), f"{c1[th:th+s]} and {c2[th:th+s]}"
if t > q:
# target advances over the gap: it should be dash-free while the query is all dashes
assert cch(c1, th+s, th+s+t) and c2[th+s:th+s+t] == '-'*t
else:
# query advances over the gap: it should be dash-free while the target is all dashes
assert cch(c2, th+s, th+s+q) and c1[th+s:th+s+q] == '-'*q
th = th + s + max(t, q)
def test_rem_dash(self):
# ****--****-------**** 4M2D4M7D4M
# *******-------******* 7M7D7M
# has 4 dash columns and should become
# ****--****---**** 4M2D4M3D4M
# *******---******* 7M3D7M
for i in range(100):
dash_cols = random.randint(0, 10)
tStart = random.randint(0, 1000)
qStart = random.randint(0, 1000)
epo_pair = (
EPOitem._strfactory("homo_sapiens\t0\t1\t%d\t%d\t1\t%s" % (tStart, tStart+12-1, "4M2D4M%dD4M" % (dash_cols+3))),
EPOitem._strfactory("mus_musculus\t0\t1\t%d\t%d\t1\t%s" % (qStart, qStart+14-1, "7M%dD7M" % (dash_cols+3))))
chain = Chain._make_from_epo(epo_pair[0], epo_pair[1], {"chr1": 500}, {"chr1": 800})
ti = epo_pair[0].intervals(False)
qi = epo_pair[1].intervals(False)
assert ti[2][0] - ti[1][1] - dash_cols == chain[2][1]
assert qi[1][0] - qi[0][1] - dash_cols == chain[2][1]
# ----*****
# *-------*
# has 3 dash cols and should become
# *
# *
# with the qStart += 1 and tStart += 4
for i in range(100):
dash_cols = random.randint(0, 10)
tm = random.randint(6, 10)
qm = random.randint(1, 5)
tStart = random.randint(0, 1000)
qStart = random.randint(0, 1000)
epo_pair = (
EPOitem._strfactory("homo_sapiens\t0\t1\t%d\t%d\t1\t%s" % (tStart, tStart+tm-1, "%dD%dM" % (dash_cols+1, tm))),
EPOitem._strfactory("mus_musculus\t0\t1\t%d\t%d\t1\t%s" % (qStart, qStart+qm+1-1, "M%dD%dM" % (dash_cols+tm-qm, qm))))
chain = Chain._make_from_epo(epo_pair[0], epo_pair[1], {"chr1": 500}, {"chr1": 800})
if chain[1][-1] != qm:
pdb.set_trace()
assert chain[1][-1] == qm
# correct also for coordinate interpretation differences between UCSC and EPO
assert (qStart + 1) - 1 == chain[0].qStart, "%d != %d" % ((qStart + 1) - 1, chain[0].qStart)
if __name__ == '__main__':
unittest.main()
bx-python-0.8.13/lib/bx/align/lav.py 0000664 0000000 0000000 00000052445 14156664651 0017170 0 ustar 00root root 0000000 0000000 """
Support for reading and writing the LAV format produced by the `blastz`_
pairwise aligner.
.. _blastz: http://www.bx.psu.edu/miller_lab/
"""
import sys
from io import StringIO
import bx.seq
from bx.align import (
Alignment,
Component,
src_merge,
src_split
)
class Reader:
"""Iterate over all lav blocks in a file in order"""
def __init__(self, file, path_subs=None, fail_to_ns=False):
self.file = file
self.lineNumber = 0
self.path_subs = path_subs # list of (prefix,replacement) to allow
if self.path_subs is None: # .. redirection of sequence file paths
self.path_subs = [] # .. on different machines
self.fail_to_ns = fail_to_ns # True => if sequences fail to open, create a fake file of all Ns
self.d_stanza_text = None
self.seq1_filename = None
self.seq1_file = None
self.seq1_header = None
self.seq1_start = None
self.seq1_end = None
self.seq1_strand = None
self.seq1_contig = None
self.seq1_src = None
self.seq1_gap = None
self.seq2_filename = None
self.seq2_file = None
self.seq2_header = None
self.seq2_start = None
self.seq2_end = None
self.seq2_strand = None
self.seq2_contig = None
self.seq2_src = None
self.seq2_gap = None
def __next__(self):
while True:
line = self.fetch_line(strip=None, requireLine=False)
assert (line), "unexpected end of file (missing #:eof)"
line = line.rstrip()
if line == "": # (allow blank lines between stanzas)
continue
if line == "#:eof":
line = self.file.readline().rstrip()
assert (not line), "extra line after #:eof (line %d, \"%s\")" % (self.lineNumber, line)
return None
if line == "#:lav":
continue
if line.startswith("d {"):
self.d_stanza_text = self.parse_unknown_stanza()
continue
if line.startswith("s {"):
self.parse_s_stanza()
continue
if line.startswith("h {"):
self.parse_h_stanza()
continue
if line.startswith("a {"):
(score, pieces) = self.parse_a_stanza()
break
if line.endswith("{"):
self.parse_unknown_stanza()
continue
raise ValueError("incomprehensible line (line %d, \"%s\")" % (self.lineNumber, line))
return self.build_alignment(score, pieces)
def __iter__(self):
return ReaderIter(self)
def close(self):
self.file.close()
def open_seqs(self):
if self.seq1_file is not None and self.seq2_file is not None:
return
if self.seq1_file is None:
if self.seq1_strand == "+":
revcomp = False
else:
revcomp = "-5'"
if self.seq1_contig == 1:
contig = None
else:
contig = self.seq1_contig
try:
f = open(self.seq1_filename, "rb")
except Exception:
if self.fail_to_ns:
f = StringIO(">seq1\n" + ("n" * (self.seq1_end - self.seq1_start)))
revcomp = False
contig = 1
else:
raise Exception("failed to open %s" % self.seq1_filename)
self.seq1_file = bx.seq.seq_file(f, revcomp=revcomp, contig=contig)
self.seq1_gap = self.seq1_file.gap
try:
name1 = self.header_to_src_name(self.seq1_header)
except ValueError:
try:
name1 = self.path_to_src_name(self.seq1_filename)
except ValueError:
name1 = "seq1"
(species1, chrom1) = src_split(name1)
self.seq1_src = src_merge(species1, chrom1, contig)
if contig is not None:
chrom1 += "[%s]" % contig
if self.seq2_file is None:
if self.seq2_strand == "+":
revcomp = False
else:
revcomp = "-5'"
if self.seq2_contig == 1:
contig = None
else:
contig = self.seq2_contig
try:
f = open(self.seq2_filename, "rb")
except Exception:
if self.fail_to_ns:
f = StringIO(">seq2\n" + ("n" * (self.seq2_end - self.seq2_start)))
revcomp = False
contig = 1
else:
raise Exception("failed to open %s" % self.seq1_filename)
self.seq2_file = bx.seq.seq_file(f, revcomp=revcomp, contig=contig)
self.seq2_gap = self.seq2_file.gap
try:
name2 = self.header_to_src_name(self.seq2_header)
except ValueError:
try:
name2 = self.path_to_src_name(self.seq2_filename)
except ValueError:
name2 = "seq2"
(species2, chrom2) = src_split(name2)
self.seq2_src = src_merge(species2, chrom2, contig)
if contig is not None:
chrom2 += "[%s]" % contig
length1 = self.seq1_file.length
length2 = self.seq2_file.length
assert (species1 != species2) or (chrom1 != chrom2) or (length1 == length2), \
"conflicting lengths for %s (%d and %d)" % (self.seq1_src, length1, length2)
self.species_to_lengths = {}
self.species_to_lengths[species1] = {}
self.species_to_lengths[species2] = {} # (OK if it clobbers line above)
self.species_to_lengths[species1][chrom1] = self.seq1_file.length
self.species_to_lengths[species2][chrom2] = self.seq2_file.length
def close_seqs(self):
if self.seq1_file is not None:
self.seq1_file.close()
self.seq1_file = None
if self.seq2_file is not None:
self.seq2_file.close()
self.seq2_file = None
def parse_s_stanza(self):
self.close_seqs()
line = self.fetch_line(report=" in s-stanza")
(self.seq1_filename,
self.seq1_start,
self.seq1_end,
self.seq1_strand,
self.seq1_contig) = self.parse_s_seq(line)
line = self.fetch_line(report=" in s-stanza")
(self.seq2_filename,
self.seq2_start,
self.seq2_end,
self.seq2_strand,
self.seq2_contig) = self.parse_s_seq(line)
line = self.fetch_line(report=" in s-stanza")
assert (line == "}"), "improper s-stanza terminator (line %d, \"%s\")" \
% (self.lineNumber, line)
def parse_s_seq(self, line):
fields = line.split()
filename = fields[0].strip('"')
start = int(fields[1]) - 1
end = int(fields[2])
contig = int(fields[4])
if fields[3] == "1":
strand = "-"
else:
strand = "+"
if filename.endswith("-"):
assert (strand == "-"), "strand mismatch in \"%s\"" % line
filename = filename[:-1]
filename = do_path_subs(filename, self.path_subs)
return (filename, start, end, strand, contig)
def parse_h_stanza(self):
line = self.fetch_line(strip='"', report=" in h-stanza")
self.seq1_header = line
self.seq1_header_prefix = ""
if line.startswith(">"):
self.seq1_header = line[1:].strip()
self.seq1_header_prefix = ">"
self.seq1_header = self.seq1_header.split(None, 1)
if len(self.seq1_header) > 0:
self.seq1_header = self.seq1_header[0]
else:
self.seq1_header = "seq1"
line = self.fetch_line(strip='"', report=" in h-stanza")
self.seq2_header = line
self.seq2_header_prefix = ""
if line.startswith(">"):
self.seq2_header = line[1:].strip()
self.seq2_header_prefix = ">"
self.seq2_header = self.seq2_header.split(None, 1)
if len(self.seq2_header) > 0:
self.seq2_header = self.seq2_header[0]
else:
self.seq2_header = "seq2"
line = self.fetch_line(report=" in h-stanza")
assert (line == "}"), "improper h-stanza terminator (line %d, \"%s\")" \
% (self.lineNumber, line)
def parse_a_stanza(self):
"""returns the pair (score,pieces)
where pieces is a list of ungapped segments (start1,start2,length,pctId)
with start1,start2 origin-0"""
# 's' line -- score, 1 field
line = self.fetch_line(report=" in a-stanza")
fields = line.split()
assert (fields[0] == "s"), "s line expected in a-stanza (line %d, \"%s\")" \
% (self.lineNumber, line)
try:
score = int(fields[1])
except ValueError:
score = float(fields[1])
# 'b' line -- begin positions in seqs, 2 fields
line = self.fetch_line(report=" in a-stanza")
fields = line.split()
assert (fields[0] == "b"), "b line expected in a-stanza (line %d, \"%s\")" \
% (self.lineNumber, line)
# 'e' line -- end positions in seqs, 2 fields
line = self.fetch_line(report=" in a-stanza")
fields = line.split()
assert (fields[0] == "e"), "e line expected in a-stanza (line %d, \"%s\")" \
% (self.lineNumber, line)
# 'l' lines
pieces = []
while (True):
line = self.fetch_line(report=" in a-stanza")
fields = line.split()
if fields[0] != "l":
break
start1 = int(fields[1]) - 1
start2 = int(fields[2]) - 1
length = int(fields[3]) - start1
length2 = int(fields[4]) - start2
try:
pctId = int(fields[5])
except ValueError:
pctId = float(fields[5])
assert (length2 == length), "length mismatch in a-stanza"
pieces.append((start1+self.seq1_start, start2+self.seq2_start, length, pctId))
assert (line == "}"), "improper a-stanza terminator (line %d, \"%s\")" \
% (self.lineNumber, line)
return (score, pieces)
def parse_unknown_stanza(self):
lines = []
while (True):
line = self.fetch_line()
assert (line), "unexpected end of file (missing #:eof)"
if line == "}":
break
lines.append(line)
return " " + "\n ".join(lines) + "\n"
def fetch_line(self, strip=True, requireLine=True, report=""):
if strip is None:
line = self.file.readline()
elif strip is True:
line = self.file.readline().strip()
else:
line = self.file.readline().strip().strip(strip)
self.lineNumber += 1
if requireLine:
assert (line), "unexpected blank line or end of file%s (line %d)" \
% (report, self.lineNumber)
return line
def d_stanza(self):
if self.d_stanza_text is None:
return ""
return "d {\n%s}" % self.d_stanza_text
def s_stanza(self):
if self.seq1_filename is None:
return ""
if self.seq1_strand == "-":
seq1_strand = "1"
else:
seq1_strand = "0"
if self.seq2_strand == "-":
seq2_strand = "1"
else:
seq2_strand = "0"
s = " \"%s\" %d %d %s %d\n"\
% (self.seq1_filename, self.seq2_start+1, self.seq1_end,
seq1_strand, self.seq1_contig)
s += " \"%s\" %d %d %s %d\n"\
% (self.seq2_filename, self.seq2_start+1, self.seq2_end,
seq2_strand, self.seq2_contig)
return "s {\n%s}" % s
def h_stanza(self):
if self.seq1_header is None:
return ""
s = f" \"{self.seq1_header_prefix}{self.seq1_header}\"\n"
s += f" \"{self.seq2_header_prefix}{self.seq2_header}\"\n"
return "h {\n%s}" % s
def build_alignment(self, score, pieces):
"""converts a score and pieces to an alignment"""
# build text
self.open_seqs()
text1 = text2 = ""
end1 = end2 = None
for (start1, start2, length, _pctId) in pieces:
if end1 is not None:
if start1 == end1: # insertion in sequence 2
text1 += self.seq1_gap * (start2-end2)
text2 += self.seq2_file.get(end2, start2-end2)
else: # insertion in sequence 1
text1 += self.seq1_file.get(end1, start1-end1)
text2 += self.seq2_gap * (start1-end1)
text1 += self.seq1_file.get(start1, length)
text2 += self.seq2_file.get(start2, length)
end1 = start1 + length
end2 = start2 + length
# create alignment
start1 = pieces[0][0]
start2 = pieces[0][1]
end1 = pieces[-1][0] + pieces[-1][2]
end2 = pieces[-1][1] + pieces[-1][2]
size1 = end1 - start1
size2 = end2 - start2
a = Alignment(score=score, species_to_lengths=self.species_to_lengths)
# if (self.seq1_strand == "-"): start1 = self.seq1_file.length - end1
a.add_component(Component(self.seq1_src, start1, size1, self.seq1_strand, text=text1))
# if (self.seq2_strand == "-"): start2 = self.seq2_file.length - end2
a.add_component(Component(self.seq2_src, start2, size2, self.seq2_strand, text=text2))
return a
def path_to_src_name(self, path_name):
# converts, e.g. ".../hg18/seq/chr13.nib" to "hg18.chr13"
if path_name is None or path_name == "":
raise ValueError
if path_name.endswith(".nib"):
path_name = path_name[:-4]
if path_name.endswith(".fa"):
path_name = path_name[:-3]
if path_name.endswith(".fasta"):
path_name = path_name[:-6]
slash = path_name.rfind("/")
if slash == -1:
return path_name
name = path_name[slash+1:]
path_name = path_name[:slash]
if path_name.endswith("/seq"):
path_name = path_name[:-4]
slash = path_name.rfind("/")
if slash != -1:
path_name = path_name[slash+1:]
return path_name + "." + name
def header_to_src_name(self, header):
# converts, e.g. "hg18.chr13:115404472-117281897" to "hg18.chr13"
if header is None or header == "":
raise ValueError
colon = header.rfind(":")
if colon != -1:
header = header[:colon]
if "/" in header:
raise ValueError
if header.count(".") == 0:
return header
header = header.split(".")
if header[0] == "" or header[1] == "":
raise ValueError
return ".".join(header)
class ReaderIter:
def __init__(self, reader):
self.reader = reader
def __iter__(self):
return self
def __next__(self):
v = next(self.reader)
if not v:
raise StopIteration
return v
class LavAsPiecesReader(Reader):
"""Iterate over all lav blocks in a file in order, returning alignments
as score and pieces, as returned by Reader.parse_a_stanza"""
def build_alignment(self, score, pieces):
return (score, pieces)
class Writer:
# blockHash is a hash from (src1,strand1,src2,strand2) to a list of blocks;
# the blocks are collected on each call to write(), but the actual writing
# does not occur until close().
def __init__(self, file, attributes=None):
if attributes is None:
attributes = {}
self.file = file
self.fname1 = None
self.fname2 = None
self.block = 0
self.blockHash = {} # (see note above)
if "name_format_1" in attributes:
self.fname1 = attributes["name_format_1"]
if "name_format_2" in attributes:
self.fname2 = attributes["name_format_2"]
if "d_stanza" in attributes:
self.write_lav_marker()
print("d {", file=self.file)
print(attributes["d_stanza"], file=self.file)
print("}", file=self.file)
def write(self, alignment):
if len(alignment.components) != 2:
raise ValueError(
"%d-component alignment is not compatible with lav" %
len(alignment.components))
c1 = alignment.components[0]
c2 = alignment.components[1]
key = (c1.src, c1.strand, c2.src, c2.strand)
if key not in self.blockHash:
self.blockHash[key] = []
self.blockHash[key].append(alignment)
self.block += 1
def close(self):
keys = [key for key in self.blockHash]
keys = sort_keys_by_chrom(keys)
for key in keys:
(src1, strand1, src2, strand2) = key
alignment = self.blockHash[key][0]
self.src1 = src1
self.strand1 = strand1
self.length1 = alignment.src_size(src1)
self.src2 = src2
self.strand2 = strand2
self.length2 = alignment.src_size(src2)
self.write_s_stanza()
self.write_h_stanza()
for alignment in self.blockHash[key]:
self.write_a_stanza(alignment)
self.write_trailer()
if self.file != sys.stdout:
self.file.close()
def write_s_stanza(self):
self.write_lav_marker()
(strand1, flag1) = minus_or_nothing(self.strand1)
(strand2, flag2) = minus_or_nothing(self.strand2)
fname1 = build_filename(self.fname1, self.src1)
fname2 = build_filename(self.fname2, self.src2)
print("s {", file=self.file)
print(" \"%s%s\" 1 %d %d 1" % (fname1, strand1, self.length1, flag1), file=self.file)
print(" \"%s%s\" 1 %d %d 1" % (fname2, strand2, self.length2, flag2), file=self.file)
print("}", file=self.file)
def write_h_stanza(self):
strand1 = rc_or_nothing(self.strand1)
strand2 = rc_or_nothing(self.strand2)
print("h {", file=self.file)
print(f" \"> {self.src1}{strand1}\"", file=self.file)
print(f" \"> {self.src2}{strand2}\"", file=self.file)
print("}", file=self.file)
def write_a_stanza(self, alignment):
c1 = alignment.components[0]
pos1 = c1.start
text1 = c1.text.upper()
c2 = alignment.components[1]
pos2 = c2.start
text2 = c2.text.upper()
# collect ungapped pieces
pieces = []
piece1 = None
for ix in range(len(text1)):
ch1 = text1[ix]
ch2 = text2[ix]
nonGap = (ch1 != "-") and (ch2 != "-")
if nonGap:
if piece1 is None: # new piece starts
(piece1, piece2, idCount) = (pos1, pos2, 0)
if ch1 == ch2:
idCount += 1
elif piece1 is not None: # new gap starts
size = pos1 - piece1
pctId = (200*idCount + size) / (2*size)
pieces.append((piece1, piece2, size, pctId))
piece1 = None
if ch1 != "-":
pos1 += 1
if ch2 != "-":
pos2 += 1
if piece1 is not None:
size = pos1 - piece1
pctId = (200*idCount + size) / (2*size)
pieces.append((piece1, piece2, size, pctId))
# write the block
(start1, start2, size, pctId) = pieces[-1] # get end of final piece
end1 = start1 + size
end2 = start2 + size
(start1, start2, size, pctId) = pieces[0] # get start of first piece
score = int(round(alignment.score))
print("a {", file=self.file)
print(" s %s" % score, file=self.file)
print(" b %d %d" % (start1 + 1, start2 + 1), file=self.file)
print(" e %d %d" % (end1, end2), file=self.file)
for (start1, start2, size, pctId) in pieces:
print(" l %d %d %d %d %d" % (start1 + 1, start2 + 1, start1 + size, start2 + size, pctId), file=self.file)
print("}", file=self.file)
def write_lav_marker(self):
print("#:lav", file=self.file)
def write_trailer(self):
print("#:eof", file=self.file)
def sort_keys_by_chrom(keys):
decorated = sorted((chrom_key(src1), strand1, chrom_key(src2), strand2, (src1, strand1, src2, strand2))
for (src1, strand1, src2, strand2) in keys)
return [key for (src1, strand1, src2, strand2, key) in decorated]
def chrom_key(src):
(species, chrom) = src_split(src)
if chrom.startswith("chr"):
chrom = chrom[3:]
try:
# numeric chromosomes sort first, in numeric order; returning a tagged
# tuple avoids the int/str comparison that raises TypeError in Python 3
return (0, int(chrom))
except ValueError:
return (1, chrom)
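# For example (editor's note, assuming src_split("hg18.chr13") yields
# ("hg18", "chr13")): "chr13" maps to the numeric key 13 while "chrX" maps
# to "X", so numerically named chromosomes sort in numeric order ahead of
# the alphabetic ones like X and Y.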
def build_filename(fmt, src):
if fmt is None:
return src
num = fmt.count("%s")
if num == 0:
return fmt
(species, chrom) = src_split(src)
if num == 1:
return fmt % chrom
return fmt % (species, chrom)
def minus_or_nothing(strand):
if strand == "-":
return ("-", 1)
else:
return ("", 0)
def rc_or_nothing(strand):
if strand == "-":
return " (reverse complement)"
else:
return ""
def do_path_subs(path, path_subs):
for (prefix, replacement) in path_subs:
if path.startswith(prefix):
return replacement + path[len(prefix):]
return path
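# Example usage (editor's sketch, not part of the module; the file name is
# hypothetical). Each block yields a two-component Alignment; fail_to_ns=True
# substitutes all-N sequence when the files named in the s-stanza cannot be
# opened:
#
#     import bx.align.lav as lav
#     with open("example.lav") as f:
#         for a in lav.Reader(f, fail_to_ns=True):
#             print(a.score, a.components[0].src, a.components[1].src)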
bx-python-0.8.13/lib/bx/align/lav_tests.py 0000664 0000000 0000000 00000004105 14156664651 0020400 0 ustar 00root root 0000000 0000000 """
Tests for `bx.align.lav`.
"""
import unittest
import bx.align.lav as lav
test_lav = "test_data/lav_tests/apple_orange.lav"
class lavTestCase(unittest.TestCase):
def testReader(self):
reader = lav.Reader(open(test_lav))
a = next(reader)
assert a.score == 10286, "a.score is wrong: %s" % a.score
assert len(a.components) == 2
check_component(a.components[0], "apple", 106, 252, "+", 411, "GTCCGGCCGGCTGAGAGCTACAATACACATGCACGCAGTTTGGCCACTCACATTAAGTATATGAGGAAGGGTTAGCATGAGTTGTACTATAAGGCAGCGGATAGCAGGTTGTGGAAAAATATCCTCCCGATTCAAATCCCCAGGTGCCTAAA----------------GTAGGGCCGGTAGTTGAATGCTTGCCTGTCAGACTGGATGACCAAGTTCAGTATCAACACAATATAGTGCCAGGAGCTAATTGTTCCCCAGCAGCGTGAC")
check_component(a.components[1], "lav_tests.orange", 53, 252, "+", 361, "GTCCGGCCGGCTGTGTGCTACAATACACGTTCACGCAGTTTGGCCAATCACTTTAAGTATATACGAAATGGTTACCATGAGTTGTACTGTAAGGCAGCGGAAAGC---TTGTTAA--------CTCCTGGGCGACATT----GGGGCTGCAACATCGTTTATCCTCCTCTACAACCAATAGCTG-TTGCTTCTTGGTTCAAGTATATCCCATGGATTAGTATCAACACGATATAGTGTCAGGAGCTAATTGTTCCCCAGCAGCGTGAC")
a = next(reader)
assert a.score == 3586, "a.score is wrong: %s" % a.score
assert len(a.components) == 2
check_component(a.components[0], "apple", 52, 72, "+", 411, "TGCATATCGACTATTACAGCCACGCGAGTTACATTCCTCTTTTTTTTTGCTGGCGTCCGGCCGGCTGAGAGC")
check_component(a.components[1], "lav_tests.orange", 2, 72, "-", 361, "TGCATATCGACTAGTACAGCCTCTCGAGTTACCCCCCCCATTCCTCTTGCTGACGTCACGCTGCTGGGGAAC")
a = next(reader)
assert a is None
reader.close()
def check_component(c, src, start, size, strand, src_size, text):
# ..print "\"%s\" == \"%s\"" % (c.src,src)
assert c.src == src, f"c.src = {c.src} (expected {src})"
assert c.start == start, f"c.start = {c.start} (expected {start})"
assert c.size == size, f"c.size = {c.size} (expected {size})"
assert c.strand == strand, f"c.strand = {c.strand} (expected {strand})"
assert c.src_size == src_size, f"c.src_size = {c.src_size} (expected {src_size})"
assert c.text == text, f"c.text = \"{c.text}\" (expected \"{text}\")"
bx-python-0.8.13/lib/bx/align/maf.py 0000664 0000000 0000000 00000020411 14156664651 0017135 0 ustar 00root root 0000000 0000000 """
Support for the `MAF`_ multiple sequence alignment format used by `multiz`_.
.. _MAF: http://genome.ucsc.edu/FAQ/FAQformat.html#format5
.. _multiz: http://www.bx.psu.edu/miller_lab/
"""
from io import (
StringIO,
TextIOWrapper,
)
from bx import interval_index_file
from bx.align import (
Alignment,
Component
)
MAF_INVERSE_STATUS = 'V'
MAF_INSERT_STATUS = 'I'
MAF_CONTIG_STATUS = 'C'
MAF_CONTIG_NESTED_STATUS = 'c'
MAF_NEW_STATUS = 'N'
MAF_NEW_NESTED_STATUS = 'n'
MAF_MAYBE_NEW_STATUS = 'S'
MAF_MAYBE_NEW_NESTED_STATUS = 's'
MAF_MISSING_STATUS = 'M'
class MAFIndexedAccess(interval_index_file.AbstractIndexedAccess):
"""
Indexed access to a MAF file.
"""
def read_at_current_offset(self, file, **kwargs):
"""
Read the MAF block at the current position in `file` and return an
instance of `Alignment`.
"""
return read_next_maf(file, **kwargs)
def open_data(self):
data = super().open_data()
return TextIOWrapper(data, encoding="ascii")
class MAFMultiIndexedAccess(interval_index_file.AbstractMultiIndexedAccess):
"""
Indexed access to multiple MAF files.
"""
indexed_access_class = MAFIndexedAccess
Indexed = MAFIndexedAccess
"""Deprecated: `MAFIndexedAccess` is also available under the name `Indexed`."""
MultiIndexed = MAFMultiIndexedAccess
"""Deprecated: `MAFMultiIndexedAccess` is also available under the name `MultiIndexed`."""
class Reader:
"""
Iterate over all maf blocks in a file in order
"""
def __init__(self, file, **kwargs):
self.file = file
self.maf_kwargs = kwargs
# Read and verify maf header, store any attributes
fields = self.file.readline().split()
if fields[0] != '##maf':
raise Exception("File does not have MAF header")
self.attributes = parse_attributes(fields[1:])
def __next__(self):
return read_next_maf(self.file, **self.maf_kwargs)
def __iter__(self):
return ReaderIter(self)
def close(self):
self.file.close()
class ReaderIter:
"""
Adapts a `Reader` to the iterator protocol.
"""
def __init__(self, reader):
self.reader = reader
def __iter__(self):
return self
def __next__(self):
v = next(self.reader)
if not v:
raise StopIteration
return v
class Writer:
def __init__(self, file, attributes=None):
if attributes is None:
attributes = {}
self.file = file
# Write header, Webb's maf code wants version first, we accommodate
if 'version' not in attributes:
attributes['version'] = 1
self.file.write("##maf version=%s" % attributes['version'])
for key in attributes:
if key == 'version':
continue
self.file.writelines(f" {key}={attributes[key]}")
self.file.write("\n")
def write(self, alignment):
self.file.write("a score=" + str(alignment.score))
for key in alignment.attributes:
self.file.write(f" {key}={alignment.attributes[key]}")
self.file.write("\n")
# Components
rows = []
for c in alignment.components:
# "Empty component" generates an 'e' row
if c.empty:
rows.append(("e", c.src, str(c.start), str(c.size), c.strand, str(c.src_size), c.synteny_empty))
continue
# Regular component
rows.append(("s", c.src, str(c.start), str(c.size), c.strand, str(c.src_size), c.text))
# If component has quality, write a q row
if c.quality is not None:
rows.append(("q", c.src, "", "", "", "", c.quality))
# If component has synteny follow up with an 'i' row
if c.synteny_left and c.synteny_right:
rows.append(("i", c.src, "", "", "", "", " ".join(map(str, c.synteny_left + c.synteny_right))))
self.file.write(format_tabular(rows, "llrrrrl"))
self.file.write("\n")
def close(self):
self.file.close()
# ---- Helper methods -------------------------------------------------------
def from_string(string, **kwargs):
return read_next_maf(StringIO(string), **kwargs)
def read_next_maf(file, species_to_lengths=None, parse_e_rows=False):
"""
Read the next MAF block from `file` and return as an `Alignment`
instance. If `parse_e_rows` is true, empty components will be created
when e rows are encountered.
"""
alignment = Alignment(species_to_lengths=species_to_lengths)
# Attributes line
line = readline(file, skip_blank=True)
if not line:
return None
fields = line.split()
if fields[0] != 'a':
raise Exception("Expected 'a ...' line")
alignment.attributes = parse_attributes(fields[1:])
if 'score' in alignment.attributes:
alignment.score = alignment.attributes['score']
del alignment.attributes['score']
else:
alignment.score = 0
# Sequence lines
last_component = None
while True:
line = readline(file)
# EOF or Blank line terminates alignment components
if not line or line.isspace():
break
# Parse row
fields = line.split()
if fields[0] == 's':
# An 's' row contains sequence for a component
component = Component()
component.src = fields[1]
component.start = int(fields[2])
component.size = int(fields[3])
component.strand = fields[4]
component.src_size = int(fields[5])
if len(fields) > 6:
component.text = fields[6].strip()
# Add to set
alignment.add_component(component)
last_component = component
elif fields[0] == 'e':
# An 'e' row, when no bases align for a given species this tells
# us something about the synteny
if parse_e_rows:
component = Component()
component.empty = True
component.src = fields[1]
component.start = int(fields[2])
component.size = int(fields[3])
component.strand = fields[4]
component.src_size = int(fields[5])
component.text = None
synteny = fields[6].strip()
assert len(synteny) == 1, \
"Synteny status in 'e' rows should be denoted with a single character code"
component.synteny_empty = synteny
alignment.add_component(component)
last_component = component
elif fields[0] == 'i':
# An 'i' row, indicates left and right synteny status for the
# previous component, we hope ;)
assert fields[1] == last_component.src, "'i' row does not follow matching 's' row"
last_component.synteny_left = (fields[2], int(fields[3]))
last_component.synteny_right = (fields[4], int(fields[5]))
elif fields[0] == 'q':
assert fields[1] == last_component.src, "'q' row does not follow matching 's' row"
# TODO: Should convert this to an integer array?
last_component.quality = fields[2]
return alignment
def readline(file, skip_blank=False):
"""Read a line from provided file, skipping any blank or comment lines"""
while True:
line = file.readline()
if not line:
return None
if line[0] != '#' and not (skip_blank and line.isspace()):
return line
def parse_attributes(fields):
"""Parse list of key=value strings into a dict"""
attributes = {}
for field in fields:
pair = field.split('=', 1)
attributes[pair[0]] = pair[1]
return attributes
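# For example (editor's note):
# parse_attributes(["version=1", "scoring=tba.v8"]) returns
# {"version": "1", "scoring": "tba.v8"}.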
def format_tabular(rows, align=None):
if len(rows) == 0:
return ""
lengths = [len(col) for col in rows[0]]
for row in rows[1:]:
for i in range(0, len(row)):
lengths[i] = max(lengths[i], len(row[i]))
rval = ""
for row in rows:
for i in range(0, len(row)):
if align and align[i] == "l":
rval += row[i].ljust(lengths[i])
else:
rval += row[i].rjust(lengths[i])
rval += " "
rval += "\n"
return rval
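# Example usage (editor's sketch, not part of the module; the sequence names
# are hypothetical). from_string parses a single block from a string:
#
#     import bx.align.maf as maf
#     block = maf.from_string(
#         "a score=23.0\n"
#         "s hg18.chr1 10 5 + 247249719 ACGTA\n"
#         "s mm9.chr2 20 5 - 181748087 ACGTA\n")
#     assert block.components[0].src == "hg18.chr1"
#     assert block.components[1].strand == "-"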
bx-python-0.8.13/lib/bx/align/maf_tests.py 0000664 0000000 0000000 00000026111 14156664651 0020362 0 ustar 00root root 0000000 0000000 """
Tests for `bx.align.maf`.
"""
from io import StringIO
import bx.align as align
import bx.align.maf as maf
# A simple MAF from the rat paper days
test_maf = """##maf version=1 scoring=humor.v4
# humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf
# /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf
a score=0.128
s human_hoxa 100 8 + 100257 ACA-TTACT
s horse_hoxa 120 9 - 98892 ACAATTGCT
s fugu_hoxa 88 7 + 90788 ACA--TGCT
a score=0.071
s human_unc 9077 8 + 10998 ACAGTATT
# Comment
s horse_unc 4555 6 - 5099 ACA--ATT
s fugu_unc 4000 4 + 4038 AC----TT
"""
# A more complicated MAF with synteny annotation and such
test_maf_2 = """##maf version=1 scoring=autoMZ.v1
a score=3656.000000
s hg17.chr1 2005 34 + 245522847 TGTAACTTAATACCACAACCAGGCATAGGGG--AAA-------------
s rheMac2.chr11 9625228 31 + 134511895 TGTAACCTCTTACTGCAACAAGGCACAGGGG------------------
i rheMac2.chr11 C 0 I 1678
s panTro1.chr1 2014 34 + 229575298 TGTAACTTAATACCACAACCAGGCATGGGGG--AAA-------------
i panTro1.chr1 C 0 C 0
s bosTau2.chr5 64972365 47 + 76426644 TCCAGCCATGTGTTGTGATCAG--CCAGGGGCTAAAGCCATGGCGGTAG
i bosTau2.chr5 C 0 I 1462
s canFam2.chr27 45129665 31 + 48908698 TTTGACTCTGTGCTCTTATCAGGCCCAAGGG------------------
i canFam2.chr27 C 0 I 1664
e danRer3.chr18 2360867 428 + 50308305 I
e oryCun1.scaffold_139397 643 1271 - 4771 I
e loxAfr1.scaffold_5603 58454 1915 + 68791 I
e echTel1.scaffold_212365 4641 1430 + 9822 I
e echTel1.scaffold_212365 4641 1430 + 9822 I
e rn3.chr4 29161032 1524 - 187371129 I
e mm7.chr6 28091695 3290 - 149646834 I
"""
# A MAF to test slicing upon
test_maf_3 = """##maf version=1 scoring=none
a score=0
s apple 34 64 + 110 AGGGA---GTTCGTCACT------GTCGTAAGGGTTCAGA--CTGTCTATGTATACACAAGTTGTGTTGCA--ACCG
s orange 19 61 - 100 AGGGATGCGTT--TCACTGCTATCGTCGTA----TTCAGACTTCG-CTATCT------GAGTTGT---GCATTACCG
"""
complex_maf = align.Alignment()
complex_maf.score = "7009"
complex_maf.components.append(align.Component(src="human_hoxa", start=100, size=8, strand="+", src_size=100257, text="ACA-TTACT"))
complex_maf.components.append(align.Component(src="horse_hoxa", start=120, size=9, strand="-", src_size=98892, text="ACAATTGCT"))
complex_maf.components[-1].synteny_left = (maf.MAF_NEW_STATUS, 0)
complex_maf.components[-1].synteny_right = (maf.MAF_CONTIG_STATUS, 0)
complex_maf.components.append(align.Component(src="unknown_1", start=150, size=3, strand="-", src_size=98892, text="---ATT---"))
complex_maf.components.append(align.Component(src="unknown_2", start=12, size=1000, strand="+", src_size=1200, text=None))
complex_maf.components[-1].empty = True
complex_maf.components[-1].synteny_empty = maf.MAF_INSERT_STATUS
complex_maf.text_size = 9
def test_reader():
reader = maf.Reader(StringIO(test_maf))
assert reader.attributes["version"] == "1"
assert reader.attributes["scoring"] == "humor.v4"
a = next(reader)
assert a.score == 0.128
assert len(a.components) == 3
check_component(a.components[0], "human_hoxa", 100, 8, "+", 100257, "ACA-TTACT")
check_component(a.components[1], "horse_hoxa", 120, 9, "-", 98892, "ACAATTGCT")
check_component(a.components[2], "fugu_hoxa", 88, 7, "+", 90788, "ACA--TGCT")
a = next(reader)
assert a.score == 0.071
assert len(a.components) == 3
check_component(a.components[0], "human_unc", 9077, 8, "+", 10998, "ACAGTATT")
check_component(a.components[1], "horse_unc", 4555, 6, "-", 5099, "ACA--ATT")
check_component(a.components[2], "fugu_unc", 4000, 4, "+", 4038, "AC----TT")
a = next(reader)
assert a is None
reader.close()
def test_writer():
val = StringIO()
writer = maf.Writer(val, {'scoring': 'foobar'})
a = align.Alignment()
a.score = 7009
a.components.append(align.Component(src="human_hoxa", start=100, size=9, strand="+", src_size=1000257, text="ACA-TTACT"))
a.components.append(align.Component(src="horse_hoxa", start=120, size=10, strand="-", src_size=98892, text="ACAATTGCT"))
check_component(a.components[0], "human_hoxa", 100, 9, "+", 1000257, "ACA-TTACT")
check_component(a.components[1], "horse_hoxa", 120, 10, "-", 98892, "ACAATTGCT")
writer.write(a)
assert val.getvalue() == """##maf version=1 scoring=foobar
a score=7009
s human_hoxa 100 9 + 1000257 ACA-TTACT
s horse_hoxa 120 10 - 98892 ACAATTGCT
""" # noqa: W291
def test_slice():
b = complex_maf.slice_by_component(0, 101, 105)
check_component(b.components[0], src="human_hoxa", start=101, size=4, strand="+", src_size=100257, text="CA-TT")
check_component(b.components[1], src="horse_hoxa", start=121, size=5, strand="-", src_size=98892, text="CAATT")
check_component(b.components[2], src="unknown_1", start=150, size=3, strand="-", src_size=98892, text="--ATT")
check_component(b.components[3], src="unknown_2", start=12, size=1000, strand="+", src_size=1200, text=None)
assert b.components[3].empty
assert b.components[3].synteny_empty == maf.MAF_INSERT_STATUS
# test slicing with + strand src
reader = maf.Reader(StringIO(test_maf_3))
a = next(reader)
b = a.slice_by_component(0, 40, 62)
check_component(b.components[0], src="apple", start=40, size=22, strand="+", src_size=110, text="TTCGTCACT------GTCGTAAGGGTTC")
check_component(b.components[1], src="orange", start=28, size=22, strand="-", src_size=100, text="TT--TCACTGCTATCGTCGTA----TTC")
# test slicing with - strand src
b = a.slice_by_component(1, 30, 68)
check_component(b.components[0], src="apple", start=46, size=41, strand="+", src_size=110, text="ACT------GTCGTAAGGGTTCAGA--CTGTCTATGTATACACAAGTTG")
check_component(b.components[1], src="orange", start=32, size=38, strand="-", src_size=100, text="ACTGCTATCGTCGTA----TTCAGACTTCG-CTATCT------GAGTTG")
a = next(reader)
assert a is None
def test_reverse_complement():
b = complex_maf.reverse_complement()
check_component(b.components[0], src="human_hoxa", start=100257-100-8, size=8, strand="-", src_size=100257, text="AGTAA-TGT")
check_component(b.components[1], src="horse_hoxa", start=98892-120-9, size=9, strand="+", src_size=98892, text="AGCAATTGT")
assert b.components[1].synteny_right == (maf.MAF_NEW_STATUS, 0)
assert b.components[1].synteny_left == (maf.MAF_CONTIG_STATUS, 0)
check_component(b.components[2], src="unknown_1", start=98892-150-3, size=3, strand="+", src_size=98892, text="---AAT---")
check_component(b.components[3], src="unknown_2", start=1200-12-1000, size=1000, strand="-", src_size=1200, text=None)
assert b.components[3].empty
assert b.components[3].synteny_empty == maf.MAF_INSERT_STATUS
def test_column_iter():
expected = [['A', 'A', '-'],
['C', 'C', '-'],
['A', 'A', '-'],
['-', 'A', 'A'],
['T', 'T', 'T'],
['T', 'T', 'T'],
['A', 'G', '-'],
['C', 'C', '-'],
['T', 'T', '-']]
for i, c in enumerate(complex_maf.column_iter()):
assert c == expected[i]
def test_remove_all_gap_column():
complex_maf_gap = align.Alignment()
complex_maf_gap.score = "7009"
complex_maf_gap.components.append(align.Component(src="human_hoxa", start=100, size=8, strand="+", src_size=100257, text="-ACA--TTACT"))
complex_maf_gap.components.append(align.Component(src="horse_hoxa", start=120, size=9, strand="-", src_size=98892, text="-ACA-ATTGCT"))
complex_maf_gap.components[-1].synteny_left = (maf.MAF_NEW_STATUS, 0)
complex_maf_gap.components[-1].synteny_right = (maf.MAF_CONTIG_STATUS, 0)
complex_maf_gap.components.append(align.Component(src="unknown_1", start=150, size=3, strand="-", src_size=98892, text="-----ATT---"))
complex_maf_gap.components.append(align.Component(src="unknown_2", start=12, size=1000, strand="+", src_size=1200, text=None))
complex_maf_gap.components[-1].empty = True
complex_maf_gap.components[-1].synteny_empty = maf.MAF_INSERT_STATUS
complex_maf_gap.text_size = 11
complex_maf_gap.remove_all_gap_columns()
assert complex_maf_gap == complex_maf
def test_read_with_synteny():
reader = maf.Reader(StringIO(test_maf_2), parse_e_rows=True)
a = next(reader)
check_component(a.components[0], "hg17.chr1", 2005, 34, "+", 245522847, "TGTAACTTAATACCACAACCAGGCATAGGGG--AAA-------------")
check_component(a.components[1], "rheMac2.chr11", 9625228, 31, "+", 134511895, "TGTAACCTCTTACTGCAACAAGGCACAGGGG------------------")
print(a.components[1].synteny_left)
assert a.components[1].synteny_left == (maf.MAF_CONTIG_STATUS, 0)
assert a.components[1].synteny_right == (maf.MAF_INSERT_STATUS, 1678)
rat = a.get_component_by_src_start("rn3.")
check_component(rat, "rn3.chr4", 29161032, 1524, "-", 187371129, None)
assert rat.synteny_empty == maf.MAF_INSERT_STATUS
def test_write_with_synteny():
reader = maf.Reader(StringIO(test_maf_2), parse_e_rows=True)
a = next(reader)
val = StringIO()
writer = maf.Writer(val, {'scoring': 'foobar'})
writer.write(a)
actual = val.getvalue()
expected = """##maf version=1 scoring=foobar
a score=3656.0
s hg17.chr1 2005 34 + 245522847 TGTAACTTAATACCACAACCAGGCATAGGGG--AAA-------------
s rheMac2.chr11 9625228 31 + 134511895 TGTAACCTCTTACTGCAACAAGGCACAGGGG------------------
i rheMac2.chr11 C 0 I 1678
s panTro1.chr1 2014 34 + 229575298 TGTAACTTAATACCACAACCAGGCATGGGGG--AAA-------------
i panTro1.chr1 C 0 C 0
s bosTau2.chr5 64972365 47 + 76426644 TCCAGCCATGTGTTGTGATCAG--CCAGGGGCTAAAGCCATGGCGGTAG
i bosTau2.chr5 C 0 I 1462
s canFam2.chr27 45129665 31 + 48908698 TTTGACTCTGTGCTCTTATCAGGCCCAAGGG------------------
i canFam2.chr27 C 0 I 1664
e danRer3.chr18 2360867 428 + 50308305 I
e oryCun1.scaffold_139397 643 1271 - 4771 I
e loxAfr1.scaffold_5603 58454 1915 + 68791 I
e echTel1.scaffold_212365 4641 1430 + 9822 I
e echTel1.scaffold_212365 4641 1430 + 9822 I
e rn3.chr4 29161032 1524 - 187371129 I
e mm7.chr6 28091695 3290 - 149646834 I
""" # noqa: W291
print(actual)
print("---")
print(expected)
assert actual == expected
def check_component(c, src, start, size, strand, src_size, text):
assert c.src == src
assert c.start == start
assert c.size == size
assert c.strand == strand
assert c.src_size == src_size
assert c.text == text
bx-python-0.8.13/lib/bx/align/score.py 0000664 0000000 0000000 00000027156 14156664651 0017522 0 ustar 00root root 0000000 0000000 """
Support for scoring alignments using arbitrary scoring matrices, arbitrary
alphabets, and affine gap penalties.
"""
from numpy import (
float32,
int32,
ones,
zeros
)
class ScoringScheme:
# note that gap_open and gap_extend are penalties, which means you should make them positive
def __init__(self, gap_open, gap_extend, default=-100, alphabet1="ACGT", alphabet2=None, gap1="-", gap2=None, text1_range=128, text2_range=None, typecode=int32):
if text2_range is None:
text2_range = text1_range
if alphabet2 is None:
alphabet2 = alphabet1
if gap2 is None:
gap2 = gap1 # (scheme with gap1=gap2=None is legit)
if isinstance(alphabet1, str):
alphabet1 = list(alphabet1)
if isinstance(alphabet2, str):
alphabet2 = list(alphabet2)
self.table = ones((text1_range, text2_range), typecode)
self.table *= default
self.gap_open = gap_open
self.gap_extend = gap_extend
self.gap1 = gap1
self.gap2 = gap2
self.alphabet1 = alphabet1
self.alphabet2 = alphabet2
# private _set_score and _get_score allow subclasses to override them to
# implement a different underlying table object
def _set_score(self, a_b_pair, val):
(a, b) = a_b_pair
self.table[a, b] = val
def _get_score(self, a_b_pair):
(a, b) = a_b_pair
return self.table[a, b]
def set_score(self, a, b, val, foldcase1=False, foldcase2=False):
self._set_score((a, b), val)
if foldcase1:
aCh = chr(a)
if (aCh.isupper()):
aa = ord(aCh.lower())
elif (aCh.islower()):
aa = ord(aCh.upper())
else:
foldcase1 = False
if foldcase2:
bCh = chr(b)
if (bCh.isupper()):
bb = ord(bCh.lower())
elif (bCh.islower()):
bb = ord(bCh.upper())
else:
foldcase2 = False
if foldcase1 and foldcase2:
self._set_score((aa, b), val)
self._set_score((a, bb), val)
self._set_score((aa, bb), val)
elif foldcase1:
self._set_score((aa, b), val)
elif foldcase2:
self._set_score((a, bb), val)
def score_alignment(self, a):
return score_alignment(self, a)
def score_texts(self, text1, text2):
return score_texts(self, text1, text2)
def __str__(self):
isDna1 = "".join(self.alphabet1) == "ACGT"
isDna2 = "".join(self.alphabet2) == "ACGT"
labelRows = not (isDna1 and isDna2)
width = 3
for a in self.alphabet1:
for b in self.alphabet2:
score = self._get_score((ord(a), ord(b)))
if (isinstance(score, float)):
s = "%8.6f" % score
else:
s = "%s" % score
if (len(s)+1 > width):
width = len(s)+1
lines = []
line = []
if labelRows:
if isDna1:
line.append(" ")
else:
line.append(" ")
for b in self.alphabet2:
if isDna2:
s = b
else:
s = "%02X" % ord(b)
line.append("%*s" % (width, s))
lines.append(("".join(line))+"\n")
for a in self.alphabet1:
line = []
if labelRows:
if isDna1:
line.append(a)
else:
line.append("%02X" % ord(a))
for b in self.alphabet2:
score = self._get_score((ord(a), ord(b)))
if (isinstance(score, float)):
s = "%8.6f" % score
else:
s = "%s" % score
line.append("%*s" % (width, s))
lines.append(("".join(line))+"\n")
return "".join(lines)
def read_scoring_scheme(f, gap_open, gap_extend, gap1="-", gap2=None, **kwargs):
"""
Initialize scoring scheme from a file containing a blastz-style text blob.
f can be either a file or the name of a file.
"""
close_it = False
if (isinstance(f, str)):
f = open(f)
close_it = True
ss = build_scoring_scheme("".join([line for line in f]), gap_open, gap_extend, gap1=gap1, gap2=gap2, **kwargs)
if (close_it):
f.close()
return ss
def build_scoring_scheme(s, gap_open, gap_extend, gap1="-", gap2=None, **kwargs):
"""
Initialize scoring scheme from a blastz style text blob, first line
specifies the bases for each row/col, subsequent lines contain the
corresponding scores. Slaw extensions allow for unusual and/or
asymmetric alphabets. Symbols can be two-digit hex, and each row
begins with its symbol. Note that a row corresponds to a symbol in text1
and a column to a symbol in text2.
examples:
blastz slaw
A C G T 01 02 A C G T
91 -114 -31 -123 01 200 -200 -50 100 -50 100
-114 100 -125 -31 02 -200 200 100 -50 100 -50
-31 -125 100 -114
-123 -31 -114 91
"""
# perform initial parse to determine alphabets and locate scores
bad_matrix = "invalid scoring matrix"
s = s.rstrip("\n")
lines = s.split("\n")
rows = []
symbols2 = lines.pop(0).split()
symbols1 = None
rows_have_syms = False
a_la_blastz = True
for i, line in enumerate(lines):
row_scores = line.split()
if len(row_scores) == len(symbols2): # blastz-style row
if symbols1 is None:
if len(lines) != len(symbols2):
raise bad_matrix
symbols1 = symbols2
elif (rows_have_syms):
raise bad_matrix
elif len(row_scores) == len(symbols2) + 1: # row starts with symbol
if symbols1 is None:
symbols1 = []
rows_have_syms = True
a_la_blastz = False
elif not rows_have_syms:
raise bad_matrix
symbols1.append(row_scores.pop(0))
else:
raise bad_matrix
rows.append(row_scores)
# convert alphabets from strings to characters
try:
alphabet1 = [sym_to_char(sym) for sym in symbols1]
alphabet2 = [sym_to_char(sym) for sym in symbols2]
except ValueError:
raise bad_matrix
if (alphabet1 != symbols1) or (alphabet2 != symbols2):
a_la_blastz = False
if a_la_blastz:
alphabet1 = [ch.upper() for ch in alphabet1]
alphabet2 = [ch.upper() for ch in alphabet2]
# decide if rows and/or columns should reflect case
if a_la_blastz:
foldcase1 = foldcase2 = True
else:
foldcase1 = "".join(alphabet1) == "ACGT"
foldcase2 = "".join(alphabet2) == "ACGT"
# create appropriately sized matrix
text1_range = text2_range = 128
if ord(max(alphabet1)) >= 128:
text1_range = 256
if ord(max(alphabet2)) >= 128:
text2_range = 256
typecode = int32
for i, row_scores in enumerate(rows):
for j, score in enumerate(map(int_or_float, row_scores)):
if isinstance(score, float):
typecode = float32
if isinstance(gap_open, float):
typecode = float32
if isinstance(gap_extend, float):
typecode = float32
ss = ScoringScheme(gap_open, gap_extend, alphabet1=alphabet1, alphabet2=alphabet2, gap1=gap1, gap2=gap2, text1_range=text1_range, text2_range=text2_range, typecode=typecode, **kwargs)
# fill matrix
for i, row_scores in enumerate(rows):
for j, score in enumerate(map(int_or_float, row_scores)):
ss.set_score(ord(alphabet1[i]), ord(alphabet2[j]), score)
if foldcase1 and foldcase2:
ss.set_score(ord(alphabet1[i].lower()), ord(alphabet2[j].upper()), score)
ss.set_score(ord(alphabet1[i].upper()), ord(alphabet2[j].lower()), score)
ss.set_score(ord(alphabet1[i].lower()), ord(alphabet2[j].lower()), score)
elif foldcase1:
ss.set_score(ord(alphabet1[i].lower()), ord(alphabet2[j]), score)
elif foldcase2:
ss.set_score(ord(alphabet1[i]), ord(alphabet2[j].lower()), score)
return ss
def int_or_float(s):
try:
return int(s)
except ValueError:
return float(s)
# convert possible two-char symbol to a single character
def sym_to_char(sym):
if len(sym) == 1:
return sym
elif len(sym) != 2:
raise ValueError
else:
return chr(int(sym, base=16))
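# For example (editor's note): sym_to_char("A") -> "A", while the two-digit
# hex symbol "01" maps to chr(0x01), i.e. sym_to_char("01") -> "\x01".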
def score_alignment(scoring_scheme, a):
score = 0
ncomps = len(a.components)
for i in range(ncomps):
for j in range(i+1, ncomps):
score += score_texts(scoring_scheme, a.components[i].text, a.components[j].text)
return score
def score_texts(scoring_scheme, text1, text2):
rval = 0
last_gap_a = last_gap_b = False
for i in range(len(text1)):
a = text1[i]
b = text2[i]
# Ignore gap/gap pair
if a == scoring_scheme.gap1 and b == scoring_scheme.gap2:
continue
# Gap in first species
elif a == scoring_scheme.gap1:
rval -= scoring_scheme.gap_extend
if not last_gap_a:
rval -= scoring_scheme.gap_open
last_gap_a = True
last_gap_b = False
# Gap in second species
elif b == scoring_scheme.gap2:
rval -= scoring_scheme.gap_extend
if not last_gap_b:
rval -= scoring_scheme.gap_open
last_gap_a = False
last_gap_b = True
# Aligned base
else:
rval += scoring_scheme._get_score((ord(a), ord(b)))
last_gap_a = last_gap_b = False
return rval
def accumulate_scores(scoring_scheme, text1, text2, skip_ref_gaps=False):
"""
Return cumulative scores for each position in alignment as a 1d array.
If `skip_ref_gaps` is False positions in returned array correspond to each
column in alignment, if True they correspond to each non-gap position (each
base) in text1.
"""
if skip_ref_gaps:
rval = zeros(len(text1) - text1.count(scoring_scheme.gap1))
else:
rval = zeros(len(text1))
score = 0
pos = 0
last_gap_a = last_gap_b = False
for i in range(len(text1)):
a = text1[i]
b = text2[i]
# Ignore gap/gap pair
if a == scoring_scheme.gap1 and b == scoring_scheme.gap2:
continue
# Gap in first species
elif a == scoring_scheme.gap1:
score -= scoring_scheme.gap_extend
if not last_gap_a:
score -= scoring_scheme.gap_open
last_gap_a = True
last_gap_b = False
# Gap in second species
elif b == scoring_scheme.gap2:
score -= scoring_scheme.gap_extend
if not last_gap_b:
score -= scoring_scheme.gap_open
last_gap_a = False
last_gap_b = True
# Aligned base
else:
score += scoring_scheme._get_score((ord(a), ord(b)))
last_gap_a = last_gap_b = False
if not skip_ref_gaps or a != scoring_scheme.gap1:
rval[pos] = score
pos += 1
return rval
hox70 = build_scoring_scheme(""" A C G T
91 -114 -31 -123
-114 100 -125 -31
-31 -125 100 -114
-123 -31 -114 91 """, 400, 30)
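# Example usage (editor's sketch, not part of the module): with the hox70
# matrix above, a perfect 4 bp match scores 91+100+100+91 = 382, and a
# single-column gap pays the affine penalty of 400 (open) + 30 (extend):
#
#     from bx.align.score import hox70, score_texts
#     assert score_texts(hox70, "ACGT", "ACGT") == 382
#     assert score_texts(hox70, "ACGT", "A-GT") == 382 - 100 - 430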
bx-python-0.8.13/lib/bx/align/score_tests.py 0000664 0000000 0000000 00000007166 14156664651 0020743 0 ustar 00root root 0000000 0000000 """
Tests for `bx.align.score`.
"""
import unittest
from io import StringIO
from numpy import (
allclose,
array,
cumsum,
)
import bx.align.maf
import bx.align.score
aligns = [("CCACTAGTTTTTAAATAATCTACTATCAAATAAAAGATTTGTTAATAATAAATTTTAAATCATTAACACTT",
"CCATTTGGGTTCAAAAATTGATCTATCA----------TGGTGGATTATTATTTAGCCATTAAGGACAAAT",
-111),
("CCACTAGTTTTTAAATAATCTAC-----AATAAAAGATTTGTTAATAAT---AAATTTTAAATCATTAA-----CACTT",
"CCATTTGGGTTCAAAAATTGATCTATCA----------TGGTGGAT---TATTATTT-----AGCCATTAAGGACAAAT",
-3626),
("CCACTAGTTTTTGATTC",
"CCATTTGGGTTC-----",
-299),
("CTTAGTTTTTGATCACC",
"-----CTTGGGTTTACC",
-299),
("gggaattgaacaatgagaacacatggacacaggaaggggaacatcacacacc----------ggggcctgttgtggggtggggggaag",
"ggaactagaacaagggagacacatacaaacaacaacaacaacaacacagcccttcccttcaaagagcttatagtctgatggaggagag",
1690)]
mafs = """##maf
a score=2883.0
s hg17.chr1 6734 30 + 245522847 CTACCTCAGTGTGGAAGGTGGGCAGTTCTG
s rheMac1.SCAFFOLD71394 9319 30 - 13789 CTACCTCAGTGTGGAAGGTGGGCAGTTCTG
a score=8167.0
s hg17.chr1 41401 40 + 245522847 TGTGTGATTAATGCCTGAGACTGTGTGAAGTAAGAGATGG
s panTro1.chr1 49673 40 + 229575298 TGCGTGATTAATGCCTGAGATTGTGTGAAGTAAAAGATGG
s rheMac1.SCAFFOLD45837 26063 33 - 31516 TGTGTGATTAATGCCTGAGATTGTGTGAAGTAA-------
"""
nonsymm_scheme = bx.align.score.build_scoring_scheme(""" A C G T
91 0 -31 -123
-114 100 -125 -31
-31 -125 100 -114
-123 -31 -114 91 """, 400, 30)
aligns_for_nonsymm_scheme = [("AAAACCCCGGGGTTTT",
"ACGTACGTACGTACGT",
-580)]
asymm_scheme = bx.align.score.build_scoring_scheme(""" 01 02 A C G T
01 200 -200 -50 100 -50 100
02 -200 200 100 -50 100 -50 """,
0, 0, gap1='\x00')
aligns_for_asymm_scheme = [("\x01\x01\x01\x01\x01\x01",
"ACGT\x01\x02",
100)]
class BasicTests(unittest.TestCase):
def test_scoring_text(self):
ss = bx.align.score.hox70
for t1, t2, score in aligns:
self.assertEqual(bx.align.score.score_texts(ss, t1, t2), score)
def test_align(self):
ss = bx.align.score.hox70
for block in bx.align.maf.Reader(StringIO(mafs)):
self.assertEqual(bx.align.score.score_alignment(ss, block), float(block.score))
def test_accumulate(self):
ss = bx.align.score.hox70
self.assertTrue(allclose(
bx.align.score.accumulate_scores(ss, "-----CTTT", "CTTAGTTTA"),
cumsum(array([-430, -30, -30, -30, -30, -31, 91, 91, -123]))
))
self.assertTrue(allclose(
bx.align.score.accumulate_scores(ss, "-----CTTT", "CTTAGTTTA", skip_ref_gaps=True),
cumsum(array([-581, 91, 91, -123]))
))
def test_nonsymm_scoring(self):
ss = nonsymm_scheme
for t1, t2, score in aligns_for_nonsymm_scheme:
self.assertEqual(bx.align.score.score_texts(ss, t1, t2), score)
def test_asymm_scoring(self):
ss = asymm_scheme
for t1, t2, score in aligns_for_asymm_scheme:
self.assertEqual(bx.align.score.score_texts(ss, t1, t2), score)
bx-python-0.8.13/lib/bx/align/sitemask/ 0000775 0000000 0000000 00000000000 14156664651 0017642 5 ustar 00root root 0000000 0000000 bx-python-0.8.13/lib/bx/align/sitemask/__init__.py 0000664 0000000 0000000 00000000304 14156664651 0021750 0 ustar 00root root 0000000 0000000 """
Tools for masking out specific sites in alignments by various criteria, for
example masking CpG sites or sites with low sequence quality.
"""
from bx.align.sitemask.core import * # noqa: F403
bx-python-0.8.13/lib/bx/align/sitemask/_cpg.pyx 0000664 0000000 0000000 00000005670 14156664651 0021324 0 ustar 00root root 0000000 0000000 """
Pyrex/C extension for quickly finding potential CpG sites in pairs of
sequences.
"""
from cpython.version cimport PY_MAJOR_VERSION
cdef extern from "find_cpg.h":
int next_cpg( char * sp1, char * sp2, int start)
int next_cpg_restricted( char * sp1, char *sp2, int start)
int next_non_cpg( char * sp1, char * sp2, int start)
def find_cpg( sp1, sp2, start ):
cdef char* a
cdef char* b
cdef int pos
if PY_MAJOR_VERSION >= 3:
bytes_sp1, bytes_sp2 = sp1.encode(), sp2.encode()
else:
bytes_sp1, bytes_sp2 = sp1, sp2
a = bytes_sp1
b = bytes_sp2
pos = start
if pos >= len(sp1): return -1
return next_cpg( a, b, pos )
def find_cpg_restricted( sp1, sp2, start ):
cdef char* a
cdef char* b
cdef int pos
if PY_MAJOR_VERSION >= 3:
bytes_sp1, bytes_sp2 = sp1.encode(), sp2.encode()
else:
bytes_sp1, bytes_sp2 = sp1, sp2
a = bytes_sp1
b = bytes_sp2
pos = start
if pos >= len(sp1): return -1
return next_cpg_restricted( a, b, pos )
def find_non_cpg( sp1, sp2, start ):
cdef char* a
cdef char* b
cdef int pos
if PY_MAJOR_VERSION >= 3:
bytes_sp1, bytes_sp2 = sp1.encode(), sp2.encode()
else:
bytes_sp1, bytes_sp2 = sp1, sp2
a = bytes_sp1
b = bytes_sp2
pos = start
if pos >= len(sp1): return -1
return next_non_cpg( a, b, pos )
def list_cpg( sp1, sp2 ):
cdef char * a
cdef char * b
cdef int start
if PY_MAJOR_VERSION >= 3:
bytes_sp1, bytes_sp2 = sp1.encode(), sp2.encode()
else:
bytes_sp1, bytes_sp2 = sp1, sp2
a = bytes_sp1
b = bytes_sp2
start = 0
cpglist = list()
while start > -1 and start < len(sp1):
start = next_cpg( a, b, start )
if start == -1: break
cpglist.append(start)
start = start + 1
return cpglist
def list_cpg_restricted( sp1, sp2 ):
cdef char * a
cdef char * b
cdef int start
if PY_MAJOR_VERSION >= 3:
bytes_sp1, bytes_sp2 = sp1.encode(), sp2.encode()
else:
bytes_sp1, bytes_sp2 = sp1, sp2
a = bytes_sp1
b = bytes_sp2
start = 0
cpglist = list()
while start > -1 and start < len(sp1):
start = next_cpg_restricted( a, b, start )
if start == -1: break
cpglist.append(start)
start = start + 1
return cpglist
def list_non_cpg( sp1, sp2 ):
cdef char * a
cdef char * b
cdef int start
if PY_MAJOR_VERSION >= 3:
bytes_sp1, bytes_sp2 = sp1.encode(), sp2.encode()
else:
bytes_sp1, bytes_sp2 = sp1, sp2
a = bytes_sp1
b = bytes_sp2
start = 0
cpglist = list()
while start > -1 and start < len(sp1):
start = next_non_cpg( a, b, start )
if start == -1: break
cpglist.append(start)
start = start + 1
return cpglist
def remove_gaps( sp, cpglist ):
# build a new list rather than removing from cpglist while iterating
# over it, which would skip elements
return [item for item in cpglist if sp[item] != '-']
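# Example (editor's sketch, not part of the module): sequences are expected
# uppercase. In the aligned pair ("ACGT", "ACGT") both the C (column 1) and
# the G (column 2) of the CpG dinucleotide are reported:
#
#     from bx.align.sitemask._cpg import list_cpg
#     list_cpg("ACGT", "ACGT") # -> [1, 2]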
bx-python-0.8.13/lib/bx/align/sitemask/core.py 0000664 0000000 0000000 00000002156 14156664651 0021150 0 ustar 00root root 0000000 0000000 """
Base classes for site maskers.
"""
from bx.filter import (
Filter,
Pipeline,
)
class Masker(Filter):
def __init__(self, **kwargs):
self.masked = 0
self.total = 0
Exception("Abstract class")
class MaskPipeline(Pipeline):
"""
MaskPipeline implements a Pipeline through which alignments can be
pushed and masked. Pipelines can be aggregated.
"""
def get_masked(self):
masked = 0
for masker in self.pipeline:
try:
masked += masker.masked
except AttributeError:
pass
return masked
masked = property(fget=get_masked)
def __call__(self, block):
if not block:
return
# push alignment block through all filters
self.total += len(block.components[0].text)
for masker in self.filters:
if not block:
return
try:
masker.__call__
except AttributeError:
raise Exception("Masker in pipeline does not implement \"filter( self, block )\".")
masker(block)
bx-python-0.8.13/lib/bx/align/sitemask/cpg.py 0000664 0000000 0000000 00000005347 14156664651 0020776 0 ustar 00root root 0000000 0000000 """
Support for masking potential CpG sites in *pairwise* alignments.
"""
from bx.align.sitemask import Masker
from ._cpg import (
list_cpg,
list_cpg_restricted,
list_non_cpg,
)
# Restricted. Only mask out sites that are definitely CpG
class Restricted(Masker):
def __init__(self, mask='?'):
self.mask = mask
self.masked = 0
self.total = 0
def __call__(self, block):
if not block:
return block
if len(block.components) < 2:
return
cpglist = list_cpg_restricted(
block.components[0].text.upper(),
block.components[1].text.upper())
# now we have a fast list of CpG columns, iterate/mask
self.masked += len(cpglist)
self.total += len(block.components[0].text)
for component in block.components:
component.text = mask_columns(cpglist, component.text, self.mask)
return block
# Inclusive. Mask out every site that could be CpG, i.e. all sites that are not definitely non-CpG.
class Inclusive(Masker):
def __init__(self, mask='?'):
self.mask = mask
self.masked = 0
self.total = 0
def __call__(self, block):
if not block:
return block
if len(block.components) < 2:
return
cpglist = list_cpg(
block.components[0].text.upper(),
block.components[1].text.upper())
self.masked += len(cpglist)
self.total += len(block.components[0].text)
for component in block.components:
component.text = mask_columns(cpglist, component.text, self.mask)
return block
# Mask non-CpG sites
class nonCpG(Masker):
def __init__(self, mask='?'):
self.mask = mask
self.masked = 0
self.total = 0
def __call__(self, block):
if not block:
return block
if len(block.components) < 2:
return
noncpglist = list_non_cpg(
block.components[0].text.upper(),
block.components[1].text.upper())
# now we have a fast list of non-CpG columns, iterate/mask
self.masked += len(noncpglist)
self.total += len(block.components[0].text)
for component in block.components:
component.text = mask_columns(noncpglist, component.text, self.mask)
return block
def mask_columns(masklist, text, mask):
templist = list()
for position in masklist:
if text[position] != "-":
templist.append(position)
templist.append(len(text)) # Add the end of the text
# cut string
newtext = list()
c = 0
for position in templist:
newtext.append(text[c:position])
        c = position + 1  # skip over the masked character (each masked column is one character wide)
joinedtext = mask.join(newtext)
return joinedtext
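# Worked examples added for illustration (not in the original source):
#
#     >>> mask_columns([1, 3], "ACGT", "#")
#     'A#G#'
#     >>> mask_columns([1, 2], "A-GT", "#")   # column 1 is a gap and survives
#     'A-#T'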
bx-python-0.8.13/lib/bx/align/sitemask/find_cpg.c 0000664 0000000 0000000 00000003236 14156664651 0021563 0 ustar 00root root 0000000 0000000 #include
/*
Author: Ian N Schenck
Version: 7/21/2006
Most of this was ripped out of James Taylor's never-released code,
and plugged in here for use in Python. Slight modifications were
made where I saw fit.
It looks as if CpG's are typically not next to gaps.
*/
static inline int is_cpg( char * sp1, char * sp2, int pos)
{
if ( pos < 1 ) return 0;
if ( sp1[pos + 1] == '\0' ) return 0;
if ( sp1[pos - 1] != 'C' && sp2[pos - 1] != 'C' &&
sp1[pos + 1] == 'G' && sp2[pos + 1] == 'G' &&
(sp1[pos] == 'C' || sp2[pos] == 'C') ) return 1;
if ( sp1[pos + 1] != 'G' && sp2[pos + 1] != 'G' &&
sp1[pos - 1] == 'C' && sp2[pos - 1] == 'C' &&
(sp1[pos] == 'G' || sp2[pos] == 'G') ) return 1;
return 0;
}
static inline int is_non_cpg( char * sp1, char * sp2, int pos)
{
// first one can't assuredly be cpg
if ( pos < 1 ) return 1;
if ( sp1[pos + 1] == '\0' ) return 0;
return
( sp1[pos - 1] != 'C' && sp2[pos - 1] != 'C' &&
sp1[pos + 1] != 'G' && sp2[pos + 1] != 'G' );
}
static inline int is_cpg_restricted( char * sp1, char * sp2, int pos )
{
return !is_non_cpg( sp1, sp2, pos );
}
int next( char * sp1, char * sp2, int start, int (*func)(char*,char*,int))
{
while( sp1[start+1] != '\0')
{
if( func(sp1, sp2, start) )
return start;
start++;
}
// nothing found
return -1;
}
int next_cpg( char * sp1, char * sp2, int start)
{
return next( sp1, sp2, start, &is_cpg);
}
int next_cpg_restricted( char * sp1, char *sp2, int start)
{
return next( sp1, sp2, start, &is_cpg_restricted );
}
int next_non_cpg( char * sp1, char * sp2, int start)
{
return next( sp1, sp2, start, &is_non_cpg);
}
bx-python-0.8.13/lib/bx/align/sitemask/find_cpg.h 0000664 0000000 0000000 00000000441 14156664651 0021563 0 ustar 00root root 0000000 0000000 #ifndef __find_cpg__
#define __find_cpg__
int next( char * sp1, char * sp2, int start, int (*func)(char*,char*,int));
int next_cpg( char * sp1, char * sp2, int start);
int next_cpg_restricted( char * sp1, char *sp2, int start);
int next_non_cpg( char * sp1, char * sp2, int start);
#endif
bx-python-0.8.13/lib/bx/align/sitemask/quality.py 0000664 0000000 0000000 00000013012 14156664651 0021701 0 ustar 00root root 0000000 0000000 """
Support for masking out sites in alignments based on sequence quality. Both
simple masking of regions below some threshold and masking using the
neighborhood quality standard (NQS) are supported. Uses sequence quality
values stored in a `bx.binned_array.FileBinnedArray`.
"""
from bx.align.sitemask import Masker
from bx.binned_array import FileBinnedArray
# Simple quality masking: mask any column whose base quality is below
# minqual.
class Simple(Masker):
# keys should be:
# qualspecies: dictionary of species as key, lengths
# dict by chromosome or chromosome list as value
# qualfiles: prefix for quality file for each species in qualspecies
# mask: mask character (default is '?')
# minqual: minimum quality
# cache: optional, but sets the number of megabytes allowed in cache per quality masked species
def __init__(self, qualfiles=None, qualspecies=None, minqual=None, mask="?", cache=100):
if not qualfiles:
raise Exception("No quality files.")
if not qualspecies:
raise Exception("No species dictionary.")
if not minqual:
raise Exception("No minimum quality specified.")
self.mask = "?"
self.minqual = minqual
self.mask = mask
self.total = 0
self.masked = 0
self.qualfiles = qualfiles
self.qualspecies = qualspecies
self.cache = cache * 2 # typical bin size is 512K
# load quality files into FileBinnedArray
self.qualities = {}
for species, qualfile in self.qualfiles.items():
specdict = {}
for chrom in self.qualspecies[species]:
specdict[chrom] = FileBinnedArray(
open(qualfile + "." + chrom + ".bqv", "rb"),
                    cache=self.cache // len(qualfiles))
self.qualities[species] = specdict
def __call__(self, block):
if not block:
return
for qualspec in self.qualities:
comp = block.get_component_by_src_start(qualspec)
if not comp:
continue
chrom = comp.src.split(".")[1]
start, end = comp.get_forward_strand_start(), comp.get_forward_strand_end()
# get quality slice, for + strand
qual = self.qualities[qualspec][chrom][start:end]
x = 0
while start+x < end:
self.total += 1
# got the column in the alignment for this particular base
if qual[x] < self.minqual:
col = comp.coord_to_col(start+x)
self.masked += 1
for component in block.components:
if component.text[col] != "-":
component.text = component.text[0:col] + \
self.mask + \
component.text[col+1:len(component.text)]
# iterate through quality
x += 1
return block
class NQS(Masker):
# keys should be:
# qualspecies: dictionary of species as key, lengths
# dict by chromosome or chromosome list as value
# qualfiles: prefix for quality file for each species in qualspecies
# mask: mask character (default is '?')
# minqual: minimum quality
    # neighborqual: neighborhood minimum quality (bases within 5 bp are masked); not currently accepted by the constructor below
# cache: optional, but sets the number of megabytes allowed in cache per quality masked species
def __init__(self, qualfiles=None, qualspecies=None, minqual=None, mask="?", cache=100):
if not qualfiles:
raise Exception("No quality files.")
if not qualspecies:
raise Exception("No species dictionary.")
if not minqual:
raise Exception("No minimum quality specified.")
self.mask = "?"
self.minqual = minqual
self.mask = mask
self.total = 0
self.masked = 0
self.qualfiles = qualfiles
self.qualspecies = qualspecies
self.cache = cache * 2 # typical bin size is 512K
# load quality files into FileBinnedArray
self.qualities = {}
for species, qualfile in self.qualfiles.items():
specdict = {}
for chrom in self.qualspecies[species]:
specdict[chrom] = FileBinnedArray(
open(qualfile + "." + chrom + ".bqv", "rb"),
                    cache=self.cache // len(qualfiles))
self.qualities[species] = specdict
def __call__(self, block):
if not block:
return
for qualspec in self.qualities:
            comp = block.get_component_by_src_start(qualspec)
            if not comp:
                continue
            chrom = comp.src.split(".")[1]
start, end = comp.get_forward_strand_start(), comp.get_forward_strand_end()
# get quality slice, for + strand
qual = self.qualities[qualspec][chrom][start:end]
x = 0
while start+x < end:
self.total += 1
# got the column in the alignment for this particular base
if qual[x] < self.minqual:
col = comp.coord_to_col(start+x)
self.masked += 1
for component in block.components:
if component.text[col] != "-":
component.text = component.text[0:col] + \
self.mask + \
component.text[col+1:len(component.text)]
# iterate through quality
x += 1
return block
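# A usage sketch added for illustration (not in the original source); the
# paths, species and chromosome names are hypothetical, but the ".bqv" file
# layout matches the open() call in __init__ above.
#
#     masker = Simple(
#         qualfiles={"hg18": "/data/quals/hg18"},   # expects /data/quals/hg18.chr10.bqv
#         qualspecies={"hg18": ["chr10"]},
#         minqual=20,
#         mask="?")
#     for block in maf_reader:
#         block = masker(block)   # columns with quality below 20 become '?'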
bx-python-0.8.13/lib/bx/align/sitemask/sitemask_tests.py 0000664 0000000 0000000 00000005002 14156664651 0023253 0 ustar 00root root 0000000 0000000 """
Tests for `bx.align.sitemask`.
"""
import tempfile
from io import StringIO
import bx.align.maf
from . import cpg
test_maf_cpg = """##maf version=1 scoring=none
a score=0
s apple 34 64 + 110 AGGGA---GTTCGTCACT------GTCGTAAGGGTTCAGA--CTGTCTATGTATACACAAGTTGTGTTGCA--ACCG
s orange 19 61 - 100 AGGGATGCGTT--TCACTGCTATCGTCGTA----TTCAGACTTCG-CTATCT------GAGTTGT---GCATTACCG
"""
cpg_inclusive_result = [
"##maf,version=1",
"a,score=0",
"s,apple,34,64,+,110,AGGGA---GTTCGTCACT------GT##TAAGGGTTCAGA--CTGTCTATGTATACACAAGTTGTGTTGCA--ACCG",
"s,orange,19,61,-,100,AGGGATG#GTT--TCACTGCTAT#GT##TA----TTCAGACTTCG-CTATCT------GAGTTGT---GCATTACCG"
]
cpg_restricted_result = [
"##maf,version=1",
"a,score=0",
"s,apple,34,64,+,110,A##GA---#TT##TC#C#------#T##TA###GTTC#GA--C##TC#A#G#ATAC####GT#G#GT#GC#--AC#G",
"s,orange,19,61,-,100,A##GA#G##TT--TC#C#GC#AT##T##TA----TTC#GAC#T##-C#A#C#------##GT#G#---GC#TTAC#G"
]
noncpg_result = [
"##maf,version=1",
"a,score=0",
"s,apple,34,64,+,110,#GG##---G##CG##A#T------G#CG##AGG####A##--#TG##T#T#T####ACAA##T#T##T##A--##CG",
"s,orange,19,61,-,100,#GG##T#CG##--##A#T##T##CG#CG##----###A###T#CG-#T#T#T------GA##T#T---##A####CG"
]
def test_cpg_inclusive():
reader = bx.align.maf.Reader(StringIO(test_maf_cpg))
out = tempfile.NamedTemporaryFile('w')
writer = bx.align.maf.Writer(out)
cpgfilter = cpg.Inclusive(mask='#')
cpgfilter.run(reader, writer.write)
out.seek(0)
j = 0
for line in open(out.name):
line = line.strip()
        if not line:
continue
assert cpg_inclusive_result[j] == ",".join(line.split())
j += 1
def test_cpg_restricted():
reader = bx.align.maf.Reader(StringIO(test_maf_cpg))
out = tempfile.NamedTemporaryFile('w')
writer = bx.align.maf.Writer(out)
cpgfilter = cpg.Restricted(mask='#')
cpgfilter.run(reader, writer.write)
out.seek(0)
j = 0
for line in open(out.name):
line = line.strip()
        if not line:
continue
assert cpg_restricted_result[j] == ",".join(line.split())
j += 1
def test_non_cpg():
reader = bx.align.maf.Reader(StringIO(test_maf_cpg))
out = tempfile.NamedTemporaryFile('w')
writer = bx.align.maf.Writer(out)
cpgfilter = cpg.nonCpG(mask='#')
cpgfilter.run(reader, writer.write)
out.seek(0)
j = 0
for line in open(out.name):
line = line.strip()
        if not line:
continue
assert noncpg_result[j] == ",".join(line.split())
j += 1
bx-python-0.8.13/lib/bx/align/tools/ 0000775 0000000 0000000 00000000000 14156664651 0017162 5 ustar 00root root 0000000 0000000 bx-python-0.8.13/lib/bx/align/tools/__init__.py 0000664 0000000 0000000 00000000320 14156664651 0021266 0 ustar 00root root 0000000 0000000 """
Various utilities for working with `bx.align.Alignment` objects.
"""
from .chop import * # noqa: F40
from .fuse import * # noqa: F40
from .thread import * # noqa: F40
from .tile import * # noqa: F40
bx-python-0.8.13/lib/bx/align/tools/chop.py 0000664 0000000 0000000 00000002030 14156664651 0020460 0 ustar 00root root 0000000 0000000 """
Support for chopping a list of alignment blocks to only the portion that
intersects a particular interval.
"""
def chop_list(blocks, src, start, end):
"""
For each alignment block in the sequence `blocks`, chop out the portion
of the block that overlaps the interval [`start`,`end`) in the
component/species named `src`.
"""
new_blocks = []
for block in blocks:
ref = block.get_component_by_src(src)
# If the reference component is on the '-' strand we should complement the interval
if ref.strand == '-':
slice_start = max(ref.src_size - end, ref.start)
            slice_end = min(ref.src_size - start, ref.end)
else:
slice_start = max(start, ref.start)
slice_end = min(end, ref.end)
sliced = block.slice_by_component(ref, slice_start, slice_end)
good = True
for c in sliced.components:
if c.size < 1:
good = False
if good:
new_blocks.append(sliced)
return new_blocks
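# Worked example for the '-' strand arithmetic above (added for illustration):
# with ref.src_size == 100, the forward-strand interval [10, 30) maps to
# [100 - 30, 100 - 10) == [70, 90) in reverse-strand coordinates, which
# max()/min() then clamp to the component's own [ref.start, ref.end).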
bx-python-0.8.13/lib/bx/align/tools/fuse.py 0000664 0000000 0000000 00000006032 14156664651 0020477 0 ustar 00root root 0000000 0000000 """
Tools for fusing contiguous alignment blocks together.
"""
from copy import deepcopy
def fuse_list(mafs):
"""
Try to fuse a list of blocks by progressively fusing each adjacent pair.
"""
last = None
for m in mafs:
if last is None:
last = m
else:
fused = fuse(last, m)
if fused:
last = fused
else:
yield last
last = m
if last:
yield last
def fuse(m1, m2):
"""
    Attempt to fuse two blocks. If they can be fused, returns a new block;
    otherwise returns None.
Example:
>>> import bx.align.maf
>>> block1 = bx.align.maf.from_string( '''
... a score=0.0
... s hg18.chr10 52686 44 + 135374737 GTGCTAACTTACTGCTCCACAGAAAACATCAATTCTGCTCATGC
... i hg18.chr10 N 0 C 0
... s panTro1.chrUn_random 208115356 44 - 240967748 GTGCTAACTGACTGCTCCAGAGAAAACATCAATTCTGTTCATGT
... ''' )
>>> block2 = bx.align.maf.from_string( '''
... a score=0.0
... s hg18.chr10 52730 69 + 135374737 GCAGGTACAATTCATCAAGAAAGGAATTACAACTTCAGAAATGTGTTCAAAATATATCCATACTTTGAC
... i hg18.chr10 C 0 I 12
... s panTro1.chrUn_random 208115400 69 - 240967748 GCAGCTACTATTCATCAAGAAAGGGATTACAACTTCAGAAATGTGTTCAAAGTGTATCCATACTTTGAT
... ''' )
>>> fused = fuse( block1, block2 )
>>> print(fused)
a score=0.0
s hg18.chr10 52686 113 + 135374737 GTGCTAACTTACTGCTCCACAGAAAACATCAATTCTGCTCATGCGCAGGTACAATTCATCAAGAAAGGAATTACAACTTCAGAAATGTGTTCAAAATATATCCATACTTTGAC
i hg18.chr10 N 0 I 12
s panTro1.chrUn_random 208115356 113 - 240967748 GTGCTAACTGACTGCTCCAGAGAAAACATCAATTCTGTTCATGTGCAGCTACTATTCATCAAGAAAGGGATTACAACTTCAGAAATGTGTTCAAAGTGTATCCATACTTTGAT
"""
    # Check if the blocks are adjacent and easily fusable,
    # return None if not.
if len(m1.components) != len(m2.components):
return None
for c1, c2 in zip(m1.components, m2.components):
if c1.src != c2.src:
return None
if c1.strand != c2.strand:
return None
if c1.end != c2.start:
return None
if c1.empty or c2.empty:
return None
# Try to fuse:
n = deepcopy(m1)
for c1, c2 in zip(n.components, m2.components):
c1.text += c2.text
c1.size += c2.size
# Propagate the synteny right
c1.synteny_right = c2.synteny_right
n.text_size = len(n.components[0].text)
return n
class FusingAlignmentWriter:
"""
Wrapper for an alignment Writer which attempts to fuse adjacent blocks
"""
def __init__(self, maf_writer):
self.maf_writer = maf_writer
self.last = None
def write(self, m):
if not self.last:
self.last = m
else:
fused = fuse(self.last, m)
if fused:
self.last = fused
else:
self.maf_writer.write(self.last)
self.last = m
def close(self):
if self.last:
self.maf_writer.write(self.last)
self.maf_writer.close()
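# A usage sketch added for illustration (not in the original source):
# FusingAlignmentWriter drops in wherever a maf Writer is used and fuses
# abutting blocks on the fly.
#
#     import bx.align.maf
#     writer = FusingAlignmentWriter(bx.align.maf.Writer(out_file))
#     for block in bx.align.maf.Reader(in_file):
#         writer.write(block)
#     writer.close()   # flushes the final (possibly fused) block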
bx-python-0.8.13/lib/bx/align/tools/thread.py 0000664 0000000 0000000 00000006203 14156664651 0021004 0 ustar 00root root 0000000 0000000 """
Tools for "threading" out specific species from alignments (removing other
species and fixing alignment text).
"""
from copy import deepcopy
def thread(mafs, species):
"""
Restrict an list of alignments to a given list of species by:
1) Removing components for any other species
2) Remove any columns containing all gaps
Example:
>>> import bx.align.maf
>>> block1 = bx.align.maf.from_string( '''
... a score=4964.0
... s hg18.chr10 52686 44 + 135374737 GTGCTAACTTACTGCTCCACAGAAAACATCAATTCTGCTCATGC
... s rheMac2.chr20 58163346 43 - 88221753 ATATTATCTTAACATTAAAGA-AGAACAGTAATTCTGGTCATAA
... s panTro1.chrUn_random 208115356 44 - 240967748 GTGCTAACTGACTGCTCCAGAGAAAACATCAATTCTGTTCATGT
... s oryCun1.scaffold_175207 85970 22 + 212797 ----------------------AAAATATTAGTTATCACCATAT
... s bosTau2.chr23 23894492 43 + 41602928 AAACTACCTTAATGTCACAGG-AAACAATGTATgctgctgctgc
... ''' )
>>> block2 = bx.align.maf.from_string( '''
... a score=9151.0
... s hg18.chr10 52730 69 + 135374737 GCAGGTACAATTCATCAAGAAAG-GAATTACAACTTCAGAAATGTGTTCAAAATATATCCATACTT-TGAC
... s oryCun1.scaffold_175207 85992 71 + 212797 TCTAGTGCTCTCCAATAATATAATAGATTATAACTTCATATAATTATGTGAAATATAAGATTATTTATCAG
... s panTro1.chrUn_random 208115400 69 - 240967748 GCAGCTACTATTCATCAAGAAAG-GGATTACAACTTCAGAAATGTGTTCAAAGTGTATCCATACTT-TGAT
... s rheMac2.chr20 58163389 69 - 88221753 ACACATATTATTTCTTAACATGGAGGATTATATCTT-AAACATGTGTGCaaaatataaatatatat-tcaa
... ''' )
>>> mafs = [ block1, block2 ]
>>> threaded = [ t for t in thread( mafs, [ "hg18", "panTro1" ] ) ]
>>> len( threaded )
2
>>> print(threaded[0])
a score=0.0
s hg18.chr10 52686 44 + 135374737 GTGCTAACTTACTGCTCCACAGAAAACATCAATTCTGCTCATGC
s panTro1.chrUn_random 208115356 44 - 240967748 GTGCTAACTGACTGCTCCAGAGAAAACATCAATTCTGTTCATGT
>>> print(threaded[1])
a score=0.0
s hg18.chr10 52730 69 + 135374737 GCAGGTACAATTCATCAAGAAAGGAATTACAACTTCAGAAATGTGTTCAAAATATATCCATACTTTGAC
s panTro1.chrUn_random 208115400 69 - 240967748 GCAGCTACTATTCATCAAGAAAGGGATTACAACTTCAGAAATGTGTTCAAAGTGTATCCATACTTTGAT
"""
for m in mafs:
new_maf = deepcopy(m)
new_components = get_components_for_species(new_maf, species)
if new_components:
new_maf.components = new_components
new_maf.score = 0.0
new_maf.text_size = len(new_components[0].text)
new_maf.remove_all_gap_columns()
yield new_maf
def get_components_for_species(alignment, species):
"""Return the component for each species in the list `species` or None"""
    # If the alignment has fewer components than the requested number of
    # species, it cannot contain all of them, so fail immediately
if len(alignment.components) < len(species):
return None
# Otherwise, build an index of components by species, then lookup
index = {c.src.split('.')[0]: c for c in alignment.components}
try:
return [index[s] for s in species]
except Exception:
return None
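# Illustration (added, not in the original source): the index above keys
# components by the species prefix of `src`, so a component whose src is
# "hg18.chr10" is looked up under "hg18".
#
#     comps = get_components_for_species(alignment, ["hg18", "panTro1"])
#     # -> [hg18 component, panTro1 component], or None if either is missing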
bx-python-0.8.13/lib/bx/align/tools/tile.py 0000664 0000000 0000000 00000005406 14156664651 0020476 0 ustar 00root root 0000000 0000000 """
Tools for tiling / projecting alignments onto an interval of a sequence.
"""
import bx.seq.nib
def tile_interval(sources, index, ref_src, start, end, seq_db=None):
"""
Tile maf blocks onto an interval. The resulting block will span the interval
exactly and contain the column from the highest scoring alignment at each
position.
`sources`: list of sequence source names to include in final block
    `index`: an instance that can return maf blocks overlapping intervals
    `ref_src`: source name of the interval (e.g., hg17.chr7)
`start`: start of interval
`end`: end of interval
`seq_db`: a mapping for source names in the reference species to nib files
"""
    # First entry in sources must be from the reference species
assert sources[0].split('.')[0] == ref_src.split('.')[0], \
"{} != {}".format(sources[0].split('.')[0], ref_src.split('.')[0])
base_len = end - start
blocks = index.get(ref_src, start, end)
# From low to high score
blocks.sort(key=lambda t: t.score)
mask = [-1] * base_len
for i, block in enumerate(blocks):
ref = block.get_component_by_src_start(ref_src)
assert ref.strand == "+"
slice_start = max(start, ref.start)
slice_end = min(end, ref.end)
for j in range(slice_start, slice_end):
mask[j-start] = i
tiled = []
for i in range(len(sources)):
tiled.append([])
for ss, ee, index in intervals_from_mask(mask):
# Interval with no covering alignments
if index < 0:
# Get sequence if available, otherwise just use 'N'
if seq_db:
tiled[0].append(bx.seq.nib.NibFile(open(seq_db[ref_src])).get(start+ss, ee-ss))
else:
tiled[0].append("N" * (ee-ss))
# Gaps in all other species
for row in tiled[1:]:
row.append("-" * (ee - ss))
else:
slice_start = start + ss
slice_end = start + ee
block = blocks[index]
ref = block.get_component_by_src_start(ref_src)
sliced = block.slice_by_component(ref, slice_start, slice_end)
sliced = sliced.limit_to_species(sources)
sliced.remove_all_gap_columns()
for i, src in enumerate(sources):
comp = sliced.get_component_by_src_start(src)
if comp:
tiled[i].append(comp.text)
else:
tiled[i].append("-" * sliced.text_size)
return ["".join(t) for t in tiled]
def intervals_from_mask(mask):
start = 0
last = mask[0]
for i in range(1, len(mask)):
if mask[i] != last:
yield start, i, last
start = i
last = mask[i]
yield start, len(mask), last
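# Worked example added for illustration (not in the original source):
# intervals_from_mask collapses runs of equal values into (start, end, value).
#
#     >>> list(intervals_from_mask([-1, -1, 0, 0, 1]))
#     [(0, 2, -1), (2, 4, 0), (4, 5, 1)]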
bx-python-0.8.13/lib/bx/arrays/ 0000775 0000000 0000000 00000000000 14156664651 0016231 5 ustar 00root root 0000000 0000000 bx-python-0.8.13/lib/bx/arrays/__init__.py 0000664 0000000 0000000 00000000061 14156664651 0020337 0 ustar 00root root 0000000 0000000 """
Classes for working with arrays of data.
"""
bx-python-0.8.13/lib/bx/arrays/array_tree.pyx 0000664 0000000 0000000 00000046423 14156664651 0021141 0 ustar 00root root 0000000 0000000 from __future__ import division
__all__ = [ 'ArrayTree', 'FileArrayTreeDict', 'array_tree_dict_from_reader' ]
import numpy
from numpy import *
cimport numpy
cimport bx.arrays.wiggle
from bx.misc.binary_file import BinaryFileWriter, BinaryFileReader
from bx.misc.cdb import FileCDBDict
"""
Classes for storing binary data on disk in a tree structure that allows for
efficient sparse storage (when the data occurs in contiguous blocks), fast
access to a specific block of data, and fast access to summaries at different
resolutions.
On disk format
--------------
Blocks are stored contiguously on disk in level-order. Contents should always be
network byte order (big endian), however this implementation will byte-swap when
reading if necessary. File contents:
- magic: uint32
- version: unit32
- array size: uint32
- block size: uint32
- array type: 4 chars (numpy typecode, currently only simple types represented by one char are supported)
- Internal nodes in level order
- Summary
- count of valid values in each subtree : sizeof( dtype ) * block_size
- frequencies : sizeof( dtype ) * block_size
- sum of valid values in each subtree : sizeof( dtype ) * block_size
- min of valid values in each subtree : sizeof( dtype ) * block_size
- max of valid values in each subtree : sizeof( dtype ) * block_size
- sum of squares of valid values in each subtree : sizeof( dtype ) * block_size
- File offsets of each child node: uint64 * block_size
- Leaf nodes
- data points: sizeof( dtype ) * block_size
- Version 1 reads version 0 and version 1
"""
## Enhancement ideas:
##
## - Write markers of the number of blocks skipped between blocks. This would
## allow fast striding across data or summaries (use the indexes to get to
## the start of a block, then just read straight through). Would this help?
##
## - Compression for blocks?
MAGIC = 0x310ec7dc
VERSION = 1
NUM_SUMMARY_ARRAYS = 6
def array_tree_dict_from_reader( reader, sizes, default_size=2147483647, block_size=1000, no_leaves=False ):
# Create empty array trees
rval = {}
## for key, size in sizes.iteritems():
## rval[ key ] = ArrayTree( size, 1000 )
# Fill
last_chrom = None
last_array_tree = None
for chrom, start, end, _, val in reader:
        if chrom != last_chrom:
            if chrom not in rval:
                rval[chrom] = ArrayTree( sizes.get( chrom, default_size ), block_size, no_leaves=no_leaves )
            last_array_tree = rval[chrom]
            last_chrom = chrom
last_array_tree.set_range( start, end, val )
return rval
cdef class FileArrayTreeDict:
"""
Access to a file containing multiple array trees indexed by a string key.
"""
cdef object io
cdef object cdb_dict
def __init__( self, file ):
self.io = io = BinaryFileReader( file, MAGIC )
assert (0 <= io.read_uint32() <= 1) # Check for version 0 or 1
self.cdb_dict = FileCDBDict( file, is_little_endian=io.is_little_endian )
def __getitem__( self, key ):
offset = self.cdb_dict[key]
offset = self.io.unpack( "L", offset.encode() )[0]
self.io.seek( offset )
return FileArrayTree( self.io.file, self.io.is_little_endian )
@classmethod
def dict_to_file( Class, dict, file, is_little_endian=True, no_leaves=False ):
"""
Writes a dictionary of array trees to a file that can then be
read efficiently using this class.
"""
io = BinaryFileWriter( file, is_little_endian=is_little_endian )
# Write magic number and version
io.write_uint32( MAGIC )
io.write_uint32( VERSION )
# Write cdb index with fake values just to fill space
cdb_dict = {}
        for key in dict.keys():
cdb_dict[ key ] = io.pack( "L", 0 )
cdb_offset = io.tell()
FileCDBDict.to_file( cdb_dict, file, is_little_endian=is_little_endian )
# Write each tree and save offset
        for key, value in dict.items():
offset = io.tell()
cdb_dict[ key ] = io.pack( "L", offset )
value.to_file( file, is_little_endian=is_little_endian, no_leaves=no_leaves )
# Go back and write the index again
io.seek( cdb_offset )
FileCDBDict.to_file( cdb_dict, file, is_little_endian=is_little_endian )
cdef class FileArrayTree:
"""
Wrapper for ArrayTree stored in file that reads as little as possible
"""
cdef public int max
cdef public int block_size
cdef public object dtype
cdef public int levels
cdef public int offset
cdef public int root_offset
cdef object io
def __init__( self, file, is_little_endian=True ):
self.io = BinaryFileReader( file, is_little_endian=is_little_endian )
self.offset = self.io.tell()
# Read basic info about the tree
self.max = self.io.read_uint32()
self.block_size = self.io.read_uint32()
# Read dtype and canonicalize
dt = self.io.read( 1 )
self.dtype = numpy.dtype( dt )
self.io.skip( 3 )
# How many levels are needed to cover the entire range?
self.levels = 0
while ( self.block_size ) ** ( self.levels + 1 ) < self.max:
self.levels += 1
# Not yet dealing with the case where the root is a Leaf
assert self.levels > 0, "max < block_size not yet handled"
# Save offset of root
self.root_offset = self.io.tell()
def __getitem__( self, index ):
min = self.r_seek_to_node( index, 0, self.root_offset, self.levels, 0 )
if min < 0:
return nan
self.io.skip( self.dtype.itemsize * ( index - min ) )
return self.io.read_raw_array( self.dtype, 1 )[0]
def get_summary( self, index, level ):
        if level <= 0 or level > self.levels:
            raise ValueError("level must be > 0 and <= self.levels")
if self.r_seek_to_node( index, 0, self.root_offset, self.levels, level ) < 0:
return None
# Read summary arrays
s = Summary()
s.counts = self.io.read_raw_array( self.dtype, self.block_size )
s.frequencies = self.io.read_raw_array( self.dtype, self.block_size )
s.sums = self.io.read_raw_array( self.dtype, self.block_size )
s.mins = self.io.read_raw_array( self.dtype, self.block_size)
s.maxs = self.io.read_raw_array( self.dtype, self.block_size )
s.sumsquares = self.io.read_raw_array( self.dtype, self.block_size )
return s
def get_leaf( self, index ):
if self.r_seek_to_node( index, 0, self.root_offset, self.levels, 0 ) < 0:
return []
return self.io.read_raw_array( self.dtype, self.block_size )
cdef int r_seek_to_node( self, int index, int min, long long offset, int level, int desired_level ):
"""
Seek to the start of the node at `desired_level` that contains `index`.
Returns the minimum value represented in that node.
"""
cdef int child_size, bin_index, child_min
self.io.seek( offset )
if level > desired_level:
child_size = self.block_size ** level
bin_index = ( index - min ) // ( child_size )
child_min = min + ( bin_index * child_size )
# Skip summary arrays -- # arrays * itemsize * block_size
self.io.skip( NUM_SUMMARY_ARRAYS * self.dtype.itemsize * self.block_size )
# Skip to offset of correct child -- offsets are 8 bytes
self.io.skip( 8 * bin_index )
# Read offset of child
child_offset = self.io.read_uint64()
# print "co: %s\tbi: %s\tcm: %s\n" % (child_offset, bin_index, child_min)
if child_offset == 0:
return -1
return self.r_seek_to_node( index, child_min, child_offset, level - 1, desired_level )
else:
# The file pointer is at the start of the desired node, do nothing
return min
cdef class Summary:
"""
Summary for a non-leaf level of the tree, contains arrays of the min, max,
valid count, sum, and sum-of-squares for each child.
"""
cdef public object counts
cdef public object frequencies
cdef public object mins
cdef public object maxs
cdef public object sums
cdef public object sumsquares
cdef class ArrayTreeNode
cdef class ArrayTreeLeaf
cdef class ArrayTree:
"""
Stores a sparse array of data as a tree.
An array of `self.max` values is stored in a tree in which each leaf
contains `self.block_size` values and each internal node contains
`self.block_size` children.
Entirely empty subtrees are not stored. Thus, the storage is efficient for
data that is block sparse -- having contiguous chunks of `self.block_size` or
larger data. Currently it is not efficient if the data is strided (e.g.
one or two data points in every interval of length `self.block_size`).
Internal nodes store `Summary` instances for their subtrees.
"""
cdef public int max
cdef public int block_size
cdef public object dtype
cdef public int levels
cdef public int no_leaves
cdef public ArrayTreeNode root
def __init__( self, int max, int block_size, dtype=float32, no_leaves=False ):
"""
Create a new array tree of size `max`
"""
self.max = max
self.block_size = block_size
self.no_leaves = no_leaves
# Force the dtype argument to its canonical dtype object
self.dtype = numpy.dtype( dtype )
# How many levels are needed to cover the entire range?
self.levels = 0
while ( self.block_size ) ** ( self.levels + 1 ) < self.max:
self.levels += 1
# Not yet dealing with the case where the root is a Leaf
assert self.levels > 0, "max < block_size not yet handled"
        # Create the root node
self.root = ArrayTreeNode( self, 0, max, block_size, self.levels )
def __setitem__( self, int index, value ):
self.root.set( index, value )
def set_range( self, int start, int end, value ):
for i from start <= i < end:
self.root.set( i, value )
def __getitem__( self, int index ):
return self.root.get( index )
def to_file( self, f, is_little_endian=True, no_leaves=False ):
io = BinaryFileWriter( f, is_little_endian=is_little_endian )
## io.write_uint32( VERSION )
io.write_uint32( self.max )
io.write_uint32( self.block_size )
io.write( self.dtype.char )
io.write( "\0\0\0" )
# Data pass, level order
if no_leaves:
bottom_level = 0
else:
bottom_level = -1
for level in range( self.levels, bottom_level, -1 ):
self.root.to_file_data_pass( io, level )
# Offset pass to fix up indexes
self.root.to_file_offset_pass( io )
@classmethod
def from_file( Class, f, is_little_endian=True ):
io = BinaryFileReader( f, is_little_endian=is_little_endian )
## assert io.read_uint32() == VERSION
max = io.read_uint32()
block_size = io.read_uint32()
dt = io.read( 1 )
io.read( 3 )
tree = Class( max, block_size, dt )
tree.root.from_file( io )
return tree
@classmethod
def from_sequence( Class, s, block_size=1000 ):
"""
Build an ArrayTree from a sequence like object (must have at least
length and getitem).
"""
tree = Class( len( s ), block_size )
for i in range( len( s ) ):
tree[i] = s[i]
return tree
cdef class ArrayTreeNode:
"""
Internal node of an ArrayTree. Contains summary data and pointers to
subtrees.
"""
cdef ArrayTree tree
cdef int min
cdef int max
cdef int block_size
cdef int level
cdef int child_size
cdef object children
cdef public Summary summary
cdef public long start_offset
def __init__( self, ArrayTree tree, int min, int max, int block_size, int level ):
self.tree = tree
self.min = min
self.max = max
self.block_size = block_size
self.level = level
# Each of my children represents block_size ** level values
self.child_size = self.block_size ** self.level
self.children = [None] * self.block_size
self.summary = None
self.start_offset = 0
cdef inline init_bin( self, int index ):
cdef int min = self.min + ( index * self.child_size )
cdef int max = min + self.child_size
if self.level == 1:
self.children[ index ] = ArrayTreeLeaf( self.tree, min, max )
else:
self.children[ index ] = ArrayTreeNode( self.tree, min, max, self.block_size, self.level - 1 )
def set( self, int index, value ):
cdef int bin_index = ( index - self.min ) // ( self.child_size )
if self.children[ bin_index ] is None:
self.init_bin( bin_index )
self.children[ bin_index ].set( index, value )
def get( self, int index ):
cdef int bin_index = ( index - self.min ) // ( self.child_size )
if self.children[ bin_index ] is None:
return nan
else:
return self.children[ bin_index ].get( index )
cpdef build_summary( self ):
"""
Build summary of children.
"""
counts = empty( self.tree.block_size, self.tree.dtype )
frequencies = empty( self.tree.block_size, self.tree.dtype )
mins = empty( self.tree.block_size, self.tree.dtype )
maxs = empty( self.tree.block_size, self.tree.dtype )
sums = empty( self.tree.block_size, self.tree.dtype )
sumsquares = empty( self.tree.block_size, self.tree.dtype )
for i in range( len( self.children ) ):
if self.children[i]:
if self.level == 1:
v = self.children[i].values
counts[i] = sum( ~isnan( v ) )
frequencies[i] = self.children[i].frequency
mins[i] = nanmin( v )
maxs[i] = nanmax( v )
sums[i] = nansum( v )
sumsquares[i] = nansum( v ** 2 )
else:
c = self.children[i]
c.build_summary()
counts[i] = sum( c.summary.counts )
frequencies[i] = sum( c.summary.frequencies )
mins[i] = nanmin( c.summary.mins )
maxs[i] = nanmax( c.summary.maxs )
sums[i] = nansum( c.summary.sums )
sumsquares[i] = nansum( c.summary.sumsquares )
else:
counts[i] = 0
frequencies[i] = 0
mins[i] = nan
maxs[i] = nan
sums[i] = nan
sumsquares[i] = nan
s = Summary()
s.counts = counts
s.frequencies = frequencies
s.mins = mins
s.maxs = maxs
s.sums = sums
s.sumsquares = sumsquares
self.summary = s
def to_file_data_pass( self, io, level ):
"""
First pass of writing to file, writes data and saves position of block.
"""
assert self.summary, "Writing without summaries is currently not supported"
# If we are at the current level being written, write a block
if self.level == level:
# Save file offset where this block starts
self.start_offset = io.tell()
# Write out summary data
io.write_raw_array( self.summary.counts )
io.write_raw_array( self.summary.frequencies )
io.write_raw_array( self.summary.sums )
io.write_raw_array( self.summary.mins )
io.write_raw_array( self.summary.maxs )
io.write_raw_array( self.summary.sumsquares )
# Skip enough room for child offsets (block_size children * 64bits)
io.skip( self.tree.block_size * 8 )
# Must be writing a lower level, so recurse
else:
# Write all non-empty children
for i in range( len( self.children ) ):
if self.children[i] is not None:
self.children[i].to_file_data_pass( io, level )
def to_file_offset_pass( self, io ):
"""
Second pass of writing to file, seek to appropriate position and write
offsets of children.
"""
        # Seek to location of child offsets (skip over the summary arrays)
skip_amount = NUM_SUMMARY_ARRAYS * self.tree.dtype.itemsize * self.block_size
io.seek( self.start_offset + skip_amount )
# Write the file offset of each child into the index
for child in self.children:
if child is None:
io.write_uint64( 0 )
else:
io.write_uint64( child.start_offset )
# Recursively write offsets in child nodes
for child in self.children:
if child is not None:
child.to_file_offset_pass( io )
def from_file( self, io ):
"""
Load entire summary and all children into memory.
"""
dtype = self.tree.dtype
block_size = self.tree.block_size
# Read summary arrays
s = Summary()
s.counts = io.read_raw_array( dtype, block_size )
        s.frequencies = io.read_raw_array( dtype, block_size )  # frequencies are written with the tree's dtype (see build_summary)
s.sums = io.read_raw_array( dtype, block_size )
s.mins = io.read_raw_array( dtype, block_size)
s.maxs = io.read_raw_array( dtype, block_size )
s.sumsquares = io.read_raw_array( dtype, block_size )
self.summary = s
# Read offset of all children
child_offsets = [ io.read_uint64() for i in range( block_size ) ]
for i in range( block_size ):
if child_offsets[i] > 0:
self.init_bin( i )
io.seek( child_offsets[i] )
self.children[i].from_file( io )
def get_from_file( self, io, index ):
cdef int bin_index = ( index - self.min ) //( self.child_size )
if self.children[ bin_index ] is None:
return nan
else:
return self.children[ bin_index ].get( index )
cdef class ArrayTreeLeaf:
"""
Leaf node of an ArrayTree, contains data values.
"""
cdef ArrayTree tree
cdef int min
cdef int max
cdef public int frequency
cdef public numpy.ndarray values
cdef public long start_offset
def __init__( self, ArrayTree tree, int min, int max ):
self.tree = tree
self.min = min
self.max = max
self.frequency = 0
self.values = empty( max - min, self.tree.dtype )
self.values[:] = nan
self.start_offset = 0
def set( self, index, value ):
self.frequency += 1
self.values[ index - self.min ] = value
def get( self, index ):
return self.values[ index - self.min ]
def to_file_data_pass( self, io, level ):
assert level == 0
self.start_offset = io.tell()
io.write_raw_array( self.values )
def to_file_offset_pass( self, io ):
pass
def from_file( self, io ):
self.values = io.read_raw_array( self.tree.dtype, self.tree.block_size )
bx-python-0.8.13/lib/bx/arrays/array_tree_tests.py 0000664 0000000 0000000 00000007220 14156664651 0022163 0 ustar 00root root 0000000 0000000 import os
import sys
import tempfile
import unittest
try:
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
except Exception:
sys.path.insert(0, os.path.dirname(os.path.abspath(".")))
from bx.arrays.array_tree import ArrayTree, FileArrayTreeDict
class TestArrayTree(unittest.TestCase):
def setUp(self):
tree = ArrayTree(10000, 10) # max value of 10000, each block has 10 numbers
for i in range(5000):
tree[i] = i
# Insert extra copies to test frequency
for i in range(3000):
tree[i] = i
tree.set_range(5000, 9001, 100)
tree.root.build_summary()
d = {'test': tree}
f = tempfile.TemporaryFile()
FileArrayTreeDict.dict_to_file(d, f)
f.seek(0)
self.filearraytreedict = FileArrayTreeDict(f)
self.filearraytree = self.filearraytreedict['test']
def test_get_summary(self):
f = self.filearraytree
lvl1 = f.get_summary(0, 1)
self.assertEqual([float(_) for _ in lvl1.sums/lvl1.counts], [4.5, 14.5, 24.5, 34.5, 44.5, 54.5, 64.5, 74.5, 84.5, 94.5])
lvl2 = f.get_summary(0, 2)
self.assertEqual([float(_) for _ in lvl2.sums/lvl2.counts], [49.5, 149.5, 249.5, 349.5, 449.5, 549.5, 649.5, 749.5, 849.5, 949.5])
lvl3 = f.get_summary(0, 3)
self.assertEqual([float(_) for _ in lvl3.sums/lvl3.counts], [499.5, 1499.5, 2499.5, 3499.5, 4499.5, 100.0, 100.0, 100.0, 100.0, 100.0])
lvl2_2 = f.get_summary(3000, 2)
self.assertEqual([float(_) for _ in lvl2_2.sums/lvl2_2.counts], [3049.5, 3149.5, 3249.5, 3349.5, 3449.5, 3549.5, 3649.5, 3749.5, 3849.5, 3949.5])
def test_get_leaf(self):
f = self.filearraytree
from_start = [int(i) for i in f.get_leaf(0)]
from_middle = [int(i) for i in f.get_leaf(5)]
self.assertEqual(from_start, from_middle)
self.assertEqual(from_start, list(range(10)))
from_start = [int(i) for i in f.get_leaf(4999)]
self.assertEqual(from_start, list(range(4990, 5000)))
from_start = [int(i) for i in f.get_leaf(9600)]
self.assertEqual(from_start, [])
def test_big(self):
tree = ArrayTree(2147483647, 1000) # What we use for tracks
for i in range(5000):
tree[i] = i
# Insert extra copies to test frequency
for i in range(3000):
tree[i] = i
tree.set_range(5000, 9001, 100)
tree.set_range(14000000, 15000000, 200)
tree.root.build_summary()
d = {'test': tree}
f = tempfile.TemporaryFile()
FileArrayTreeDict.dict_to_file(d, f)
f.seek(0)
at = FileArrayTreeDict(f)['test']
lvl1 = at.get_summary(14000000, 1)
avgs = [float(_) for _ in lvl1.sums/lvl1.counts]
self.assertEqual(len(avgs), 1000)
self.assertEqual(avgs, [200 for i in range(0, 1000)])
def test_get_frequencies(self):
f = self.filearraytree
self.assertEqual([float(_) for _ in f.get_summary(0, 1).frequencies], ([20] * 10))
self.assertEqual([float(_) for _ in f.get_summary(4000, 1).frequencies], ([10] * 10))
self.assertEqual([float(_) for _ in f.get_summary(0, 2).frequencies], ([200] * 10))
self.assertEqual([int(_) for _ in f.get_summary(0, 3).frequencies], [2000, 2000, 2000, 1000, 1000, 1000, 1000, 1000, 1000, 1])
def test_wrong_dictkey(self):
self.assertRaises(KeyError, self.filearraytreedict.__getitem__, "non-existing")
def test_higher_level_than_tree(self):
f = self.filearraytree
self.assertEqual(3, f.levels)
self.assertRaises(ValueError, f.get_summary, 0, 4)
if __name__ == '__main__':
unittest.main()
bx-python-0.8.13/lib/bx/arrays/bed.pyx 0000664 0000000 0000000 00000002540 14156664651 0017526 0 ustar 00root root 0000000 0000000 """
Iterator for the BED format ( http://genome.ucsc.edu/FAQ/FAQformat.html#format1 )
Returns chrom, chromStart, chromEnd, name, score
"""
cdef class BedReader:
cdef object f
def __init__( self, f ):
self.f = f
def __iter__( self ):
return self
def __next__( self ):
while True:
line = self.f.readline()
if not line:
raise StopIteration()
if line.isspace():
continue
if line[0] == "#":
continue
if line[0].isalpha():
if line.startswith( "track" ) or line.startswith( "browser" ):
continue
feature = line.strip().split()
chrom = feature[0]
chrom_start = int(feature[1])
chrom_end = int(feature[2])
if len(feature) > 3:
name = feature[3]
else:
name = None
if len(feature) > 4:
score = int(feature[4])
else:
score = None
return chrom, chrom_start, chrom_end, name, score
else:
raise "Unexpected input line: %s" % line.strip()
bx-python-0.8.13/lib/bx/arrays/wiggle.pxd 0000664 0000000 0000000 00000000370 14156664651 0020224 0 ustar 00root root 0000000 0000000 cdef enum linemode:
MODE_BED
MODE_FIXED
MODE_VARIABLE
cdef class WiggleReader:
cdef object file
cdef object current_chrom
cdef long current_pos
cdef long current_step
cdef long current_span
cdef linemode mode
bx-python-0.8.13/lib/bx/arrays/wiggle.pyx 0000664 0000000 0000000 00000007172 14156664651 0020260 0 ustar 00root root 0000000 0000000 """
Support for scores in the `wiggle`_ file format used by the UCSC Genome
Browser.
The positions in the wiggle format are 1-relative; however, the positions
returned here follow the BED/interval convention: zero-based, half-open.
.. _wiggle: http://genome.ucsc.edu/goldenPath/help/wiggle.html
"""
def parse_header( line ):
return dict( [ field.split( '=' ) for field in line.split()[1:] ] )
cdef class WiggleReader:
"""
Iterator yielding chrom, start, end, strand, value.
    Intervals are zero-based, half-open.
Regions which lack a score are ignored.
"""
#cdef object file
#cdef object current_chrom
#cdef long current_pos
#cdef long current_step
#cdef long current_span
#cdef linemode mode
def __init__( self, file ):
self.file = file
self.current_chrom = None
self.current_pos = -1
self.current_step = -1
self.current_span = -1
self.mode = MODE_BED
def __iter__( self ):
return self
def __next__( self ):
while True:
line = self.file.readline()
if not line:
raise StopIteration()
if line.isspace():
continue
if line[0] == "#":
continue
if line[0].isalpha():
if line.startswith( "track" ) or line.startswith( "browser" ):
continue
elif line.startswith( "variableStep" ):
header = parse_header( line )
self.current_chrom = header['chrom']
self.current_pos = -1
self.current_step = -1
if 'span' in header:
self.current_span = int( header['span'] )
else:
self.current_span = 1
self.mode = MODE_VARIABLE
continue
elif line.startswith( "fixedStep" ):
header = parse_header( line )
self.current_chrom = header['chrom']
self.current_pos = int( header['start'] ) - 1
self.current_step = int( header['step'] )
if 'span' in header:
self.current_span = int( header['span'] )
else:
self.current_span = 1
self.mode = MODE_FIXED
continue
elif self.mode == MODE_BED:
fields = line.split()
if len( fields ) > 3:
if len( fields ) > 5:
return fields[0], int( fields[1] ), int( fields[2] ), fields[5], float( fields[3] )
else:
return fields[0], int( fields[1] ), int( fields[2] ), "+", float( fields[3] )
elif self.mode == MODE_VARIABLE:
fields = line.split()
try:
pos = int( fields[0] ) - 1
val = float( fields[1] )
except ValueError:
continue
return self.current_chrom, pos, pos + self.current_span, "+", val
            elif self.mode == MODE_FIXED:
                fields = line.split()
                try:
                    val = float( fields[0] )
                except ValueError:
                    continue
                # Advance the position before returning so successive data
                # lines step along the chromosome as fixedStep requires
                pos = self.current_pos
                self.current_pos += self.current_step
                return self.current_chrom, pos, pos + self.current_span, "+", val
else:
raise "Unexpected input line: %s" % line.strip()
bx-python-0.8.13/lib/bx/bbi/ 0000775 0000000 0000000 00000000000 14156664651 0015464 5 ustar 00root root 0000000 0000000 bx-python-0.8.13/lib/bx/bbi/__init__.py 0000664 0000000 0000000 00000000123 14156664651 0017571 0 ustar 00root root 0000000 0000000 """
Support for the UCSC "Big Binary Indexed" file formats (bigWig and bigBed)
"""
bx-python-0.8.13/lib/bx/bbi/bbi_file.pxd 0000664 0000000 0000000 00000006176 14156664651 0017746 0 ustar 00root root 0000000 0000000 from bpt_file cimport BPTFile
from cirtree_file cimport CIRTreeFile
from types cimport *
import numpy
cimport numpy
cdef class SummaryBlock:
"""
A block of summary data from disk
"""
cdef public bits32 chrom_id
cdef public bits32 start
cdef public bits32 end
cdef public bits32 valid_count
cdef public double min_val
cdef public double max_val
cdef public double sum_data
cdef public double sum_squares
cdef class SummarizedData:
"""
    The result of using SummaryBlocks read from the file to produce an
aggregation over a particular range and resolution
"""
cdef public bits32 start
cdef public bits32 end
cdef public int size
cdef public numpy.ndarray valid_count
cdef public numpy.ndarray min_val
cdef public numpy.ndarray max_val
cdef public numpy.ndarray sum_data
cdef public numpy.ndarray sum_squares
cdef accumulate_interval_value( self, bits32 s, bits32 e, float val )
cdef class BBIFile
cdef class BlockHandler:
"""
Callback for `BBIFile.visit_blocks_in_region`
"""
cdef handle_block( self, bytes block_data, BBIFile bbi_file )
cdef class BBIFile:
"""
A "big binary indexed" file. Stores blocks of raw data and numeric
summaries of that data at different levels of aggregation ("zoom levels").
Generic enough to accommodate both wiggle and bed data.
"""
# Probably a PyFileObject, or any seekable file-like
cdef object file
# A BinaryFileReader created from file
cdef object reader
# The magic number or type signature (whether the file is bigWig or bigBed or...)
cdef public bits32 magic
# Is the file byteswapped relative to our native byte order?
cdef boolean is_byteswapped
# The index to the chromosomes, an embedded BPT file
cdef BPTFile chrom_bpt
# Version number
cdef public bits16 version
# Number of zoom levels
cdef public bits16 zoom_levels
# Offset to chromosome index
cdef bits64 chrom_tree_offset
# Offset to unzoomed data
cdef bits64 unzoomed_data_offset
# Offset to unzoomed index
cdef bits64 unzoomed_index_offset
# If bed, number of columns
cdef bits16 field_count
cdef bits16 defined_field_count
# Offset to an embedded string containing "AutoSQL" format data that defines the columns
cdef bits64 as_offset
# Offset to total summary information (if any)
cdef bits64 total_summary_offset
# Size of uncompression buffer, 0 if no compression
cdef bits32 uncompress_buf_size
# Zoom levels list
cdef public object level_list
cdef visit_blocks_in_region( self, bits32 chrom_id, bits32 start, bits32 end, BlockHandler handler )
cdef _get_chrom_id_and_size( self, char * chrom )
cdef _best_zoom_level( self, int desired_reduction )
cpdef summarize( self, object chrom, bits32 start, bits32 end, int summary_size )
cpdef summarize_from_full( self, char * chrom, bits32 start, bits32 end, int summary_size )
cpdef query( self, object chrom, bits32 start, bits32 end, int summary_size )
cdef _summarize_from_full( self, bits32 chrom_id, bits32 start, bits32 end, int summary_size )
bx-python-0.8.13/lib/bx/bbi/bbi_file.pyx 0000664 0000000 0000000 00000040711 14156664651 0017764 0 ustar 00root root 0000000 0000000 # cython: profile=False
"""
Core implementation for reading UCSC "big binary indexed" files.
There isn't really any specification for the format beyond the code, so this
mirrors Jim Kent's 'bbiRead.c' mostly.
"""
from cpython.version cimport PY_MAJOR_VERSION
import sys
cimport cython
from collections import deque
from bpt_file cimport BPTFile
from cirtree_file cimport CIRTreeFile
from types cimport *
from libc cimport limits
import numpy
cimport numpy
from bx.misc.binary_file import BinaryFileReader
from io import BytesIO
import zlib, math
cdef extern from "Python.h":
char * PyBytes_AsString( object )
# Signatures for bbi related file types
cdef public int big_wig_sig = 0x888FFC26
cdef public int big_bed_sig = 0x8789F2EB
# Some record sizes for parsing
DEF summary_on_disk_size = 32
@cython.profile(False)
cdef inline int range_intersection( int start1, int end1, int start2, int end2 ):
return min( end1, end2 ) - max( start1, start2 )
@cython.profile(False)
cdef inline int imax(int a, int b): return a if a >= b else b
@cython.profile(False)
cdef inline int imin(int a, int b): return a if a <= b else b
cdef enum summary_type:
summary_type_mean = 0
summary_type_max = 1
summary_type_min = 2
summary_type_coverage = 3
summary_type_sd = 4
cdef class SummaryBlock:
"""
A block of summary data from disk
"""
pass
cdef class SummarizedData:
"""
    The result of using SummaryBlocks read from the file to produce an
aggregation over a particular range and resolution
"""
def __init__( self, bits32 start, bits32 end, int size ):
self.start = start
self.end = end
self.size = size
self.valid_count = numpy.zeros( self.size, dtype=numpy.float64 )
self.min_val = numpy.zeros( self.size, dtype=numpy.float64 )
self.max_val = numpy.zeros( self.size, dtype=numpy.float64 )
self.sum_data = numpy.zeros( self.size, dtype=numpy.float64 )
self.sum_squares = numpy.zeros( self.size, dtype=numpy.float64 )
cdef accumulate_interval_value( self, bits32 s, bits32 e, float val ):
cdef int base_start, base_end, base_step, overlap, j, interval_size
cdef double overlap_factor, interval_weight
# We locally cdef the arrays so all indexing will be at C speeds
cdef numpy.ndarray[numpy.float64_t, ndim=1] valid_count = self.valid_count
cdef numpy.ndarray[numpy.float64_t, ndim=1] min_val = self.min_val
cdef numpy.ndarray[numpy.float64_t, ndim=1] max_val = self.max_val
cdef numpy.ndarray[numpy.float64_t, ndim=1] sum_data = self.sum_data
cdef numpy.ndarray[numpy.float64_t, ndim=1] sum_squares = self.sum_squares
# Trim interval down to region of interest
if s < self.start:
s = self.start
if e > self.end:
e = self.end
if s >= e:
return
base_step = ( self.end - self.start ) / self.size
for j from 0 <= j < self.size:
base_start = self.start + ( base_step * j )
base_end = base_start + base_step
overlap = range_intersection( base_start, base_end, s, e )
if overlap > 0:
interval_size = e - s
                overlap_factor = <double> overlap / interval_size
interval_weight = interval_size * overlap_factor
valid_count[j] += interval_weight
sum_data[j] += val * interval_weight
sum_squares[j] += val * val * interval_weight
if max_val[j] < val:
max_val[j] = val
if min_val[j] > val:
min_val[j] = val
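# Worked example for accumulate_interval_value (added for illustration): with
# SummarizedData(start=0, end=100, size=10), base_step is 10; an interval
# s=5, e=25 with val=2.0 has interval_size 20 and overlaps bins 0, 1, 2 by
# 5, 10 and 5 bases, so interval_weight adds 5, 10 and 5 to valid_count and
# val times that (10, 20 and 10) to sum_data for those bins.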
cdef class BlockHandler:
"""
Callback for `BBIFile.visit_blocks_in_region`
"""
cdef handle_block( self, bytes block_data, BBIFile bbi_file ):
pass
cdef class BBIFile:
"""
A "big binary indexed" file. Stores blocks of raw data and numeric
summaries of that data at different levels of aggregation ("zoom levels").
Generic enough to accommodate both wiggle and bed data.
"""
def __init__( self, file=None, expected_sig=None, type_name=None ):
if file is not None:
self.open( file, expected_sig, type_name )
def open( self, file, expected_sig, type_name ):
"""
Initialize from an existing bbi file, signature (magic) must be passed
in since this is generic.
"""
assert expected_sig is not None
self.file = file
# Open the file in a BinaryFileReader, handles magic and byteswapping
self.reader = reader = BinaryFileReader( file, expected_sig )
self.magic = expected_sig
self.is_byteswapped = self.reader.byteswap_needed
# Read header stuff
self.version = reader.read_uint16()
self.zoom_levels = reader.read_uint16()
self.chrom_tree_offset = reader.read_uint64()
self.unzoomed_data_offset = reader.read_uint64()
self.unzoomed_index_offset = reader.read_uint64()
self.field_count = reader.read_uint16()
self.defined_field_count = reader.read_uint16()
self.as_offset = reader.read_uint64()
self.total_summary_offset = reader.read_uint64()
self.uncompress_buf_size = reader.read_uint32()
# Skip reserved
reader.seek( 64 )
# Read zoom headers
self.level_list = []
for i from 0 <= i < self.zoom_levels:
level = ZoomLevel()
level.bbi_file = self
level.reduction_level = reader.read_uint32()
level.reserved = reader.read_uint32()
level.data_offset = reader.read_uint64()
level.index_offset = reader.read_uint64()
self.level_list.append( level )
# Initialize and attach embedded BPTFile containing chromosome names and ids
reader.seek( self.chrom_tree_offset )
self.chrom_bpt = BPTFile( file=self.file )
cdef visit_blocks_in_region( self, bits32 chrom_id, bits32 start, bits32 end, BlockHandler handler ):
"""
Visit each block from the full data that overlaps a specific region
"""
cdef CIRTreeFile ctf
reader = self.reader
reader.seek( self.unzoomed_index_offset )
ctf = CIRTreeFile( reader.file )
block_list = ctf.find_overlapping_blocks( chrom_id, start, end )
for offset, size in block_list:
# Seek to and read all data for the block
reader.seek( offset )
block_data = reader.read( size )
# Might need to uncompress
if self.uncompress_buf_size > 0:
block_data = zlib.decompress( block_data )
handler.handle_block( block_data, self )
cpdef summarize( self, object chrom, bits32 start, bits32 end, int summary_size ):
"""
Gets `summary_size` data points over the regions `chrom`:`start`-`end`.
"""
cdef char * cchrom
if PY_MAJOR_VERSION >= 3:
bytes_chrom = chrom.encode()
else:
bytes_chrom = chrom
cchrom = PyBytes_AsString(bytes_chrom)
if start >= end:
return None
chrom_id, chrom_size = self._get_chrom_id_and_size( cchrom )
if chrom_id is None:
return None
        # Return value will be a structured array (rather than an array
        # of summary element structures)
# Find appropriate zoom level
cdef bits32 base_size = end - start
cdef int full_reduction = base_size / summary_size
cdef int zoom = full_reduction / 2
if zoom < 0:
zoom = 0
cdef ZoomLevel zoom_level = self._best_zoom_level( zoom )
if zoom_level is not None:
return zoom_level._summarize( chrom_id, start, end, summary_size )
else:
return self._summarize_from_full( chrom_id, start, end, summary_size )
cpdef summarize_from_full( self, char * chrom, bits32 start, bits32 end, int summary_size ):
"""
Gets `summary_size` data points over the regions `chrom`:`start`-`end`,
always using the raw data points
"""
if start >= end:
return None
chrom_id, chrom_size = self._get_chrom_id_and_size( chrom )
if chrom_id is None:
return None
        # Return value will be a structured array (rather than an array
        # of summary element structures)
return self._summarize_from_full( chrom_id, start, end, summary_size )
cpdef query( self, object chrom, bits32 start, bits32 end, int summary_size ):
"""
Provides a different view of summary for region, a list of dictionaries
with keys: mean, max, min, coverage, std_dev
"""
if end > 2147483647 or start < 0:
raise ValueError
results = self.summarize(chrom, start, end, summary_size)
if not results:
return None
rval = []
for i in range(summary_size):
sum_data = results.sum_data[i]
valid_count = results.valid_count[i]
mean = sum_data / valid_count
            coverage = <double> summary_size / (end - start) * valid_count
# print results.sum_squares[i], sum_data, valid_count
variance = results.sum_squares[i] - sum_data * sum_data / valid_count
if valid_count > 1:
variance /= valid_count - 1
std_dev = math.sqrt(max(variance, 0))
rval.append( { "mean": mean, "max": results.max_val[i], "min": results.min_val[i], \
"coverage": coverage, "std_dev": std_dev } )
return rval
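    # Worked check of the variance algebra above (added for illustration):
    # for raw values [1, 2, 3], sum_data = 6, sum_squares = 14 and
    # valid_count = 3, so variance = 14 - 6*6/3 = 2; dividing by
    # (valid_count - 1) gives 1.0 and std_dev = sqrt(1.0) = 1.0, the sample
    # standard deviation of [1, 2, 3].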
cdef _get_chrom_id_and_size( self, char * chrom ):
"""
Lookup id and size from the chromosome named `chrom`
"""
bytes = self.chrom_bpt.find( chrom )
if bytes is not None:
# The value is two 32 bit uints, use the BPT's reader for checking byteswapping
assert len( bytes ) == 8
chrom_id, chrom_size = self.chrom_bpt.reader.unpack( "II", bytes )
return chrom_id, chrom_size
else:
return None, None
cdef _summarize_from_full( self, bits32 chrom_id, bits32 start, bits32 end, int summary_size ):
"""
Create summary from full data. This is data specific so must be overridden.
"""
pass
cdef _best_zoom_level( self, int desired_reduction ):
if desired_reduction <= 1:
return None
cdef ZoomLevel level, closest_level
cdef int diff, closest_diff = limits.INT_MAX
closest_level = None
for level in self.level_list:
diff = desired_reduction - level.reduction_level
if diff >= 0 and diff < closest_diff:
closest_diff = diff
closest_level = level
return closest_level
cdef class ZoomLevel:
cdef BBIFile bbi_file
cdef public bits32 reduction_level
cdef bits32 reserved
cdef public bits64 data_offset
cdef public bits64 index_offset
cdef int item_count
def _summary_blocks_in_region( self, bits32 chrom_id, bits32 start, bits32 end ):
"""
Return a list of all SummaryBlocks that overlap the region
`chrom_id`:`start`-`end`
"""
cdef CIRTreeFile ctf
cdef SummaryBlock summary
rval = deque()
reader = self.bbi_file.reader
reader.seek( self.index_offset )
ctf = CIRTreeFile( reader.file )
block_list = ctf.find_overlapping_blocks( chrom_id, start, end )
for offset, size in block_list:
# Seek to and read all data for the block
reader.seek( offset )
block_data = reader.read( size )
# Might need to uncompress
if self.bbi_file.uncompress_buf_size > 0:
## block_data = zlib.decompress( block_data, buf_size = self.bbi_file.uncompress_buf_size )
block_data = zlib.decompress( block_data )
block_size = len( block_data )
# The block should be a bunch of summaries.
assert block_size % summary_on_disk_size == 0
item_count = block_size / summary_on_disk_size
# Create another reader just for the block, shouldn't be too expensive
block_reader = BinaryFileReader( BytesIO( block_data ), is_little_endian=reader.is_little_endian )
for i from 0 <= i < item_count:
## NOTE: Look carefully at bbiRead again to be sure the endian
## conversion here is all correct. It looks like it is
## just pushing raw data into memory and not swapping
sum_chrom_id = block_reader.read_uint32()
                # A block can contain summaries from more than one chrom_id
if sum_chrom_id != chrom_id:
block_reader.skip(7*4)
continue
summary = SummaryBlock()
summary.chrom_id = sum_chrom_id
summary.start = block_reader.read_uint32()
summary.end = block_reader.read_uint32()
summary.valid_count = block_reader.read_uint32()
summary.min_val = block_reader.read_float()
summary.max_val = block_reader.read_float()
summary.sum_data = block_reader.read_float()
summary.sum_squares = block_reader.read_float()
rval.append( summary )
return rval
cdef _get_summary_slice( self, bits32 base_start, bits32 base_end, summaries ):
cdef float valid_count = 0.0
cdef float sum_data = 0.0
cdef float sum_squares = 0.0
cdef float min_val = numpy.nan
cdef float max_val = numpy.nan
cdef float overlap_factor
cdef int overlap
if summaries:
min_val = summaries[0].min_val
max_val = summaries[0].max_val
for summary in summaries:
if summary.start >= base_end:
break
overlap = range_intersection( base_start, base_end, summary.start, summary.end )
if overlap > 0:
# Cast so the ratio is computed in floating point, not integer division
overlap_factor = <float> overlap / (summary.end - summary.start)
valid_count += summary.valid_count * overlap_factor
sum_data += summary.sum_data * overlap_factor
sum_squares += summary.sum_squares * overlap_factor
if max_val < summary.max_val:
max_val = summary.max_val
if min_val > summary.min_val:
min_val = summary.min_val
return valid_count, sum_data, sum_squares, min_val, max_val
cdef _summarize( self, bits32 chrom_id, bits32 start, bits32 end, int summary_size ):
"""
Summarize directly from file.
Looking at Jim's code, it appears that
- bbiSummariesInRegion returns all summaries that span start-end in
sorted order
- bbiSummarySlice is then used to aggregate over the subset of those
summaries that overlap a single summary element
"""
cdef bits32 base_start, base_end, base_step
# We locally cdef the arrays so all indexing will be at C speeds
cdef numpy.ndarray[numpy.float64_t] valid_count
cdef numpy.ndarray[numpy.float64_t] min_val
cdef numpy.ndarray[numpy.float64_t] max_val
cdef numpy.ndarray[numpy.float64_t] sum_data
cdef numpy.ndarray[numpy.float64_t] sum_squares
# What we will load into
rval = SummarizedData( start, end, summary_size )
valid_count = rval.valid_count
min_val = rval.min_val
max_val = rval.max_val
sum_data = rval.sum_data
sum_squares = rval.sum_squares
# First, load up summaries
reader = self.bbi_file.reader
reader.seek( self.index_offset )
summaries = self._summary_blocks_in_region(chrom_id, start, end)
base_step = (end - start) / summary_size
base_start = start
base_end = start
for i in range(summary_size):
base_end += base_step
while summaries and summaries[0].end <= base_start:
summaries.popleft()
valid_count[i], sum_data[i], sum_squares[i], min_val[i], max_val[i] = self._get_summary_slice(base_start, base_end, summaries)
base_start = base_end
return rval
bx-python-0.8.13/lib/bx/bbi/bigbed_file.pyx 0000664 0000000 0000000 00000010631 14156664651 0020442 0 ustar 00root root 0000000 0000000 """
BigBed file.
"""
from bbi_file cimport *
from cirtree_file cimport CIRTreeFile
import numpy
cimport numpy
from types cimport *
from bx.intervals.io import GenomicInterval
from bx.misc.binary_file import BinaryFileReader
from io import BytesIO
import zlib
DEF big_bed_sig = 0x8789F2EB
cdef inline int range_intersection( int start1, int end1, int start2, int end2 ):
return min( end1, end2 ) - max( start1, start2 )
cdef class BigBedBlockHandler( BlockHandler ):
"""
BlockHandler that parses the block into a series of BED records
"""
cdef bits32 chrom_id
cdef bits32 start
cdef bits32 end
def __init__( self, bits32 chrom_id, bits32 start, bits32 end ):
BlockHandler.__init__( self )
self.chrom_id = chrom_id
self.start = start
self.end = end
cdef handle_block( self, bytes block_data, BBIFile bbi_file ):
cdef object bytes_io
cdef int length
cdef bits32 chrom_id, s, e
cdef str rest
# Now we parse the block, which should just be a bunch of BED records
bytes_io = BytesIO( block_data )
block_reader = BinaryFileReader( bytes_io, is_little_endian=bbi_file.reader.is_little_endian )
length = len( block_data )
while bytes_io.tell() < length:
chrom_id = block_reader.read_uint32()
s = block_reader.read_uint32()
e = block_reader.read_uint32()
rest = block_reader.read_c_string()
if chrom_id != self.chrom_id:
continue
if s < self.end and e > self.start:
self.handle_interval_value( s, e, rest )
cdef handle_interval_value( self, bits32 s, bits32 e, str rest ):
pass
cdef class SummarizingBlockHandler( BigBedBlockHandler ):
"""
Accumulates intervals into a SummarizedData
"""
cdef SummarizedData sd
def __init__( self, bits32 chrom_id, bits32 start, bits32 end, int summary_size ):
BigBedBlockHandler.__init__( self, chrom_id, start, end )
# What we will load into
self.sd = SummarizedData( start, end, summary_size )
for i in range(summary_size):
self.sd.min_val[i] = +numpy.inf
for i in range(summary_size):
self.sd.max_val[i] = -numpy.inf
cdef handle_interval_value( self, bits32 s, bits32 e, str rest ):
# FIXME: Does this really obvious thing actually do what we want?
# No... sum_data will end up being the coverage, but min/max/etc are wrong
self.sd.accumulate_interval_value( s, e, 1 )
cdef class IntervalAccumulatingBlockHandler( BigBedBlockHandler ):
cdef list intervals
"""
Accumulates intervals into a list of intervals with values
"""
def __init__( self, bits32 chrom_id, bits32 start, bits32 end ):
BigBedBlockHandler.__init__( self, chrom_id, start, end )
self.intervals = []
cdef handle_interval_value( self, bits32 s, bits32 e, str rest ):
self.intervals.append( ( s, e, rest ) )
cdef class BigBedFile( BBIFile ):
"""
A "big binary indexed" file whose raw data is in BED format.
"""
def __init__( self, file=None ):
BBIFile.__init__( self, file, big_bed_sig, "bigbed" )
cdef _summarize_from_full( self, bits32 chrom_id, bits32 start, bits32 end, int summary_size ):
"""
Create summary from full data.
"""
v = SummarizingBlockHandler( chrom_id, start, end, summary_size )
self.visit_blocks_in_region( chrom_id, start, end, v )
# Round valid count, in place
for i from 0 <= i < summary_size:
v.sd.valid_count[i] = round( v.sd.valid_count[i] )
return v.sd
cpdef get( self, char * chrom, bits32 start, bits32 end ):
"""
Gets all BED intervals overlapping the region `chrom`:`start`-`end`.
"""
if start >= end:
return None
chrom_id, chrom_size = self._get_chrom_id_and_size( chrom )
if chrom_id is None:
return None
v = IntervalAccumulatingBlockHandler( chrom_id, start, end )
self.visit_blocks_in_region( chrom_id, start, end, v )
rval = []
# FIXME: Not sure the best way to return, will use GenomicInterval for
# now.
for ( s, e, rest ) in v.intervals:
fields = [ chrom, str( s ), str( e ) ] + rest.split( "\t" )
rval.append( GenomicInterval( None, fields, 0, 1, 2, 5, "+" ) )
return rval
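# A minimal usage sketch (the file path is hypothetical); `get` returns a
# list of GenomicInterval objects, or None for an unknown chromosome:
#
#     from bx.bbi.bigbed_file import BigBedFile
#     bb = BigBedFile( open( "example.bb", "rb" ) )
#     for interval in bb.get( "chr1", 0, 100000 ) or []:
#         print( interval )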
bx-python-0.8.13/lib/bx/bbi/bigwig_file.pyx 0000664 0000000 0000000 00000017353 14156664651 0020506 0 ustar 00root root 0000000 0000000 """
BigWig file.
"""
from collections import deque
from bbi_file cimport *
from cirtree_file cimport CIRTreeFile
import numpy
cimport numpy
from types cimport *
from bx.misc.binary_file import BinaryFileReader
from io import BytesIO
import zlib
DEF big_wig_sig = 0x888FFC26
DEF bwg_bed_graph = 1
DEF bwg_variable_step = 2
DEF bwg_fixed_step = 3
cdef inline int range_intersection( int start1, int end1, int start2, int end2 ):
return min( end1, end2 ) - max( start1, start2 )
cdef class BigWigBlockHandler( BlockHandler ):
"""
BlockHandler that parses the block into a series of wiggle records, and calls `handle_interval_value` for each.
"""
cdef bits32 start
cdef bits32 end
def __init__( self, bits32 start, bits32 end ):
BlockHandler.__init__( self )
self.start = start
self.end = end
cdef handle_block( self, bytes block_data, BBIFile bbi_file ):
cdef bits32 b_chrom_id, b_start, b_end, b_valid_count
cdef bits32 b_item_step, b_item_span
cdef bits16 b_item_count
cdef UBYTE b_type
cdef int s, e
cdef float val
# Now we parse the block, first the header
block_reader = BinaryFileReader( BytesIO( block_data ), is_little_endian=bbi_file.reader.is_little_endian )
b_chrom_id = block_reader.read_uint32()
b_start = block_reader.read_uint32()
b_end = block_reader.read_uint32()
b_item_step = block_reader.read_uint32()
b_item_span = block_reader.read_uint32()
b_type = block_reader.read_uint8()
block_reader.skip(1)
b_item_count = block_reader.read_uint16()
for i from 0 <= i < b_item_count:
# Depending on the type, s and e are either read directly or
# generated from the header; val is always read
if b_type == bwg_bed_graph:
s = block_reader.read_uint32()
e = block_reader.read_uint32()
val = block_reader.read_float()
elif b_type == bwg_variable_step:
s = block_reader.read_uint32()
e = s + b_item_span
val = block_reader.read_float()
elif b_type == bwg_fixed_step:
s = b_start + ( i * b_item_span )
e = s + b_item_span
val = block_reader.read_float()
else:
# Unknown record type: skip it so stale s, e, val from a previous
# iteration are never emitted (FIXME: raise an exception instead?)
continue
if s < self.start:
s = self.start
if e > self.end:
e = self.end
if s >= e:
continue
self.handle_interval_value( s, e, val )
cdef handle_interval_value( self, bits32 s, bits32 e, float val ):
pass
cdef class SummarizingBlockHandler( BigWigBlockHandler ):
"""
Accumulates intervals into a SummarizedData
"""
cdef SummarizedData sd
def __init__( self, bits32 start, bits32 end, int summary_size ):
BigWigBlockHandler.__init__( self, start, end )
# What we will load into
self.sd = SummarizedData( start, end, summary_size )
for i in range(summary_size):
self.sd.min_val[i] = +numpy.inf
for i in range(summary_size):
self.sd.max_val[i] = -numpy.inf
cdef handle_interval_value( self, bits32 s, bits32 e, float val ):
self.sd.accumulate_interval_value( s, e, val )
cdef class IntervalAccumulatingBlockHandler( BigWigBlockHandler ):
cdef list intervals
"""
Accumulates intervals into a list of intervals with values
"""
def __init__( self, bits32 start, bits32 end ):
BigWigBlockHandler.__init__( self, start, end )
self.intervals = []
cdef handle_interval_value( self, bits32 s, bits32 e, float val ):
self.intervals.append( ( s, e, val ) )
cdef class ArrayAccumulatingBlockHandler( BigWigBlockHandler ):
"""
Accumulates intervals into a list of intervals with values
"""
cdef numpy.ndarray array
def __init__( self, bits32 start, bits32 end ):
BigWigBlockHandler.__init__( self, start, end )
self.array = numpy.zeros( end - start, dtype=numpy.float32 )
self.array[...] = numpy.nan
cdef handle_interval_value( self, bits32 s, bits32 e, float val ):
cdef numpy.ndarray[ numpy.float32_t, ndim=1 ] array = self.array
cdef int i
# Slicing is not optimized by Cython
for i from s - self.start <= i < e - self.start:
array[ i ] = val
cdef class BigWigHeaderBlockHandler( BigWigBlockHandler ):
"Reads and returns headers"
cdef list headers
def __init__( self, bits32 start, bits32 end ):
BigWigBlockHandler.__init__( self, start, end )
self.headers = []
cdef handle_block( self, bytes block_data, BBIFile bbi_file ):
cdef bits32 b_chrom_id, b_start, b_end, b_valid_count
cdef bits32 b_item_step, b_item_span
cdef bits16 b_item_count
cdef UBYTE b_type
cdef int s, e
cdef float val
# parse the block header
block_reader = BinaryFileReader( BytesIO( block_data ), is_little_endian=bbi_file.reader.is_little_endian )
b_chrom_id = block_reader.read_uint32()
b_start = block_reader.read_uint32()
b_end = block_reader.read_uint32()
b_item_step = block_reader.read_uint32()
b_item_span = block_reader.read_uint32()
b_type = block_reader.read_uint8()
block_reader.skip(1)
b_item_count = block_reader.read_uint16()
self.handle_header( b_start, b_end, b_item_step, b_item_span, b_type, b_item_count )
cdef handle_header( self, bits32 start, bits32 end, bits32 step, bits32 span, bits8 type, bits16 itemCount ):
self.headers.append( ( start, end, step, span, type, itemCount ) )
cdef class BigWigFile( BBIFile ):
"""
A "big binary indexed" file whose raw data is in wiggle format.
"""
def __init__( self, file=None ):
BBIFile.__init__( self, file, big_wig_sig, "bigwig" )
cdef _summarize_from_full( self, bits32 chrom_id, bits32 start, bits32 end, int summary_size ):
"""
Create summary from full data.
"""
v = SummarizingBlockHandler( start, end, summary_size )
self.visit_blocks_in_region( chrom_id, start, end, v )
# Round valid count, in place
for i from 0 <= i < summary_size:
v.sd.valid_count[i] = round( v.sd.valid_count[i] )
return v.sd
cpdef get( self, char * chrom, bits32 start, bits32 end ):
"""
Gets all data points over the region `chrom`:`start`-`end`.
"""
if start >= end:
return None
chrom_id, chrom_size = self._get_chrom_id_and_size( chrom )
if chrom_id is None:
return None
v = IntervalAccumulatingBlockHandler( start, end )
self.visit_blocks_in_region( chrom_id, start, end, v )
return v.intervals
cpdef get_as_array( self, char * chrom, bits32 start, bits32 end ):
"""
Gets all data points over the region `chrom`:`start`-`end` as an array.
"""
if start >= end:
return None
chrom_id, chrom_size = self._get_chrom_id_and_size( chrom )
if chrom_id is None:
return None
v = ArrayAccumulatingBlockHandler( start, end )
self.visit_blocks_in_region( chrom_id, start, end, v )
return v.array
cpdef get_headers( self, char * chrom, bits32 start, bits32 end ):
if start >= end:
return None
chrom_id, chrom_size = self._get_chrom_id_and_size( chrom )
if chrom_id is None:
return None
v = BigWigHeaderBlockHandler( start, end )
self.visit_blocks_in_region( chrom_id, start, end, v )
return v.headers
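# A minimal usage sketch (the file path is hypothetical), mirroring the
# calls exercised in bigwig_tests.py:
#
#     from bx.bbi.bigwig_file import BigWigFile
#     bw = BigWigFile( file=open( "test.bw", "rb" ) )
#     summaries = bw.query( "chr1", 10000, 20000, 10 )  # dicts with mean/min/max/coverage/std_dev
#     values = bw.get_as_array( "chr1", 10000, 20000 )  # float32 array, NaN where no data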
bx-python-0.8.13/lib/bx/bbi/bigwig_tests.py 0000664 0000000 0000000 00000006135 14156664651 0020535 0 ustar 00root root 0000000 0000000 import os
import sys
import numpy
import pytest
try:
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
except Exception:
sys.path.insert(0, os.path.dirname(os.path.abspath(".")))
from bx.bbi.bigwig_file import BigWigFile
def allclose(a, b, tol=0.00001):
"""
Like numpy.allclose but treats NaN == NaN
"""
d = numpy.absolute(a - b)
return numpy.all(numpy.isnan(d) | (d < tol))
class TestBigWig:
@pytest.fixture(autouse=True)
def setUp(self):
f = open("test_data/bbi_tests/test.bw", 'rb')
self.bw = BigWigFile(file=f)
def test_get_summary(self):
data = self.bw.query("chr1", 10000, 20000, 10)
means = [x['mean'] for x in data]
assert numpy.allclose([float(_) for _ in means], [-0.17557571594973645, -0.054009292602539061, -0.056892242431640622, -0.03650328826904297, 0.036112907409667966, 0.0064466032981872557, 0.036949024200439454, 0.076638259887695306, 0.043518108367919923, 0.01554749584197998])
# Summarize variant
sd = self.bw.summarize("chr1", 10000, 20000, 10)
assert numpy.allclose(sd.sum_data / sd.valid_count, [-0.17557571594973645, -0.054009292602539061, -0.056892242431640622, -0.03650328826904297, 0.036112907409667966, 0.0064466032981872557, 0.036949024200439454, 0.076638259887695306, 0.043518108367919923, 0.01554749584197998])
# Test min and max for this entire summary region
data = self.bw.query("chr1", 10000, 20000, 1)
maxs = [x['max'] for x in data]
mins = [x['min'] for x in data]
assert [float(_) for _ in maxs] == [0.289000004529953]
assert [float(_) for _ in mins] == [-3.9100000858306885]
def test_get_leaf(self):
data = self.bw.query("chr1", 11000, 11005, 5)
means = [x['mean'] for x in data]
assert numpy.allclose([float(_) for _ in means], [0.050842501223087311, -2.4589500427246094, 0.050842501223087311, 0.050842501223087311, 0.050842501223087311])
# Test min and max for this entire leaf region
data = self.bw.query("chr1", 11000, 11005, 1)
maxs = [x['max'] for x in data]
mins = [x['min'] for x in data]
assert [float(_) for _ in maxs] == [0.050842501223087311]
assert [float(_) for _ in mins] == [-2.4589500427246094]
def test_wrong_nochrom(self):
data = self.bw.query("chr2", 0, 10000, 10)
assert data is None
@pytest.mark.parametrize("line", open("test_data/bbi_tests/test.expectation").readlines())
def test_summary_from_file(self, line):
fields = line.split()
chrom = fields[0]
start = int(fields[1])
end = int(fields[2])
n = int(fields[3])
t = fields[4]
values = [float(v.replace('n/a', 'NaN')) for v in fields[5:]]
sd = self.bw.summarize(chrom, start, end, n)
if t == 'mean':
assert allclose(sd.sum_data / sd.valid_count, values)
elif t == 'min':
assert allclose(sd.min_val, values)
elif t == 'max':
assert allclose(sd.max_val, values)
# elif t == 'std':
# assert numpy.allclose( sd.max_val, values )
bx-python-0.8.13/lib/bx/bbi/bpt_file.pxd 0000664 0000000 0000000 00000000571 14156664651 0017770 0 ustar 00root root 0000000 0000000 from bx.misc.binary_file import BinaryFileReader
from types cimport *
cdef class BPTFile:
"""
On disk B+ tree compatible with Jim Kent's bPlusTree.c
"""
cdef object file
cdef object reader
cdef boolean is_byteswapped
cdef bits32 block_size
cdef bits32 key_size
cdef bits32 value_size
cdef bits64 item_count
cdef bits64 root_offset
bx-python-0.8.13/lib/bx/bbi/bpt_file.pyx 0000664 0000000 0000000 00000005012 14156664651 0020010 0 ustar 00root root 0000000 0000000 from bx.misc.binary_file import BinaryFileReader
DEF bpt_sig = 0x78CA8C91
# bptFileHeaderSize = 32
# bptBlockHeaderSize = 4
cdef class BPTFile:
"""
On disk B+ tree compatible with Jim Kent's bPlusTree.c
"""
def __init__(self, file=None):
if file is not None:
self.attach(file)
def attach(self, file):
"""
Attach to an open file
"""
self.file = file
self.reader = reader = BinaryFileReader(file, bpt_sig)
self.is_byteswapped = self.reader.byteswap_needed
# Read header stuff
self.block_size = reader.read_uint32()
self.key_size = reader.read_uint32()
self.value_size = reader.read_uint32()
self.item_count = reader.read_uint64()
reader.skip(8)
self.root_offset = reader.tell()
def r_find(self, bits64 block_start, key):
"""
Recursively seek the value matching key under the subtree starting
at file offset `block_start`
"""
cdef UBYTE is_leaf
cdef bits16 child_count
cdef bits64 offset
self.reader.seek(block_start)
# Block header
is_leaf = self.reader.read_uint8()
self.reader.read_uint8()
child_count = self.reader.read_uint16()
if is_leaf:
for i from 0 <= i < child_count:
node_key = self.reader.read(self.key_size)
node_value = self.reader.read(self.value_size)
if node_key == key:
return node_value
return None
else:
# Read and discard first key, store offset
self.reader.read(self.key_size)
offset = self.reader.read_uint64()
# Loop until correct subtree is found
for i from 0 <= i < child_count - 1:
node_key = self.reader.read(self.key_size)
if node_key > key:
break
offset = self.reader.read_uint64()
return self.r_find(offset, key)
def find(self, key):
"""
Find the value matching `key` (a bytestring). Returns the matching
value as a bytestring if found, or None
"""
# Key is longer than key_size, can't possibly match
if len(key) > self.key_size:
return None
# Key is shorter than key_size, right pad with 0 bytes
if len(key) < self.key_size:
key += b'\0' * (self.key_size - len(key))
# Call the recursive finder
return self.r_find(self.root_offset, key)
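# A minimal usage sketch: the underlying reader must be positioned at the
# start of the B+ tree header when attaching. `bpt_offset` is hypothetical,
# normally taken from the enclosing BBI file's header:
#
#     f = open("example.bb", "rb")
#     f.seek(bpt_offset)
#     bpt = BPTFile(f)
#     value = bpt.find(b"chr1")  # raw value bytes, or None if absent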
bx-python-0.8.13/lib/bx/bbi/cirtree_file.pxd 0000664 0000000 0000000 00000000565 14156664651 0020643 0 ustar 00root root 0000000 0000000 from types cimport *
cdef class CIRTreeFile:
cdef object file
cdef object reader
cdef boolean is_byteswapped
cdef bits64 root_offset
cdef bits32 block_size
cdef bits64 item_count
cdef bits32 start_chrom_ix
cdef bits32 start_base
cdef bits32 end_chrom_ix
cdef bits32 end_base
cdef bits64 file_size
cdef bits32 items_per_slot
bx-python-0.8.13/lib/bx/bbi/cirtree_file.pyx 0000664 0000000 0000000 00000010617 14156664651 0020667 0 ustar 00root root 0000000 0000000 from bx.misc.binary_file import BinaryFileReader
DEF cir_tree_sig = 0x2468ACE0
cdef int ovcmp( bits32 a_hi, bits32 a_lo, bits32 b_hi, bits32 b_lo ):
if a_hi < b_hi:
return 1
elif a_hi > b_hi:
return -1
else:
if a_lo < b_lo:
return 1
elif a_lo > b_lo:
return -1
else:
return 0
cdef overlaps( qchrom, qstart, qend, rstartchrom, rstartbase, rendchrom, rendbase ):
return ( ovcmp( qchrom, qstart, rendchrom, rendbase ) > 0 ) and \
( ovcmp( qchrom, qend, rstartchrom, rstartbase ) < 0 )
cdef class CIRTreeFile:
def __init__( self, file=None ):
if file is not None:
self.attach( file )
def attach( self, file ):
"""
Attach to an open file
"""
self.file = file
self.reader = reader = BinaryFileReader( file, cir_tree_sig )
self.is_byteswapped = self.reader.byteswap_needed
# Header
self.block_size = reader.read_uint32()
self.item_count = reader.read_uint64()
self.start_chrom_ix = reader.read_uint32()
self.start_base = reader.read_uint32()
self.end_chrom_ix = reader.read_uint32()
self.end_base = reader.read_uint32()
self.file_size = reader.read_uint64()
self.items_per_slot = reader.read_uint32()
# Skip reserved
reader.read_uint32()
# Save root
self.root_offset = reader.tell()
def r_find_overlapping( self, int level, bits64 index_file_offset, bits32 chrom_ix, bits32 start, bits32 end, object rval, object reader ):
cdef UBYTE is_leaf
cdef bits16 child_count
reader.seek( index_file_offset )
# Block header
is_leaf = reader.read_uint8()
assert is_leaf == 0 or is_leaf == 1
reader.read_uint8()
child_count = reader.read_uint16()
# Read block
if is_leaf:
self.r_find_overlapping_leaf( level, chrom_ix, start, end, rval, child_count, reader )
else:
self.r_find_overlapping_parent( level, chrom_ix, start, end, rval, child_count, reader )
def r_find_overlapping_leaf( self, int level, bits32 chrom_ix, bits32 start, bits32 end, object rval,
bits16 child_count, object reader ):
cdef bits32 start_chrom_ix, start_base, end_chrom_ix, end_base
cdef bits64 offset
cdef bits64 size
for i from 0 <= i < child_count:
start_chrom_ix = reader.read_uint32()
start_base = reader.read_uint32()
end_chrom_ix = reader.read_uint32()
end_base = reader.read_uint32()
offset = reader.read_uint64()
size = reader.read_uint64()
if overlaps( chrom_ix, start, end, start_chrom_ix, start_base, end_chrom_ix, end_base ):
rval.append( ( offset, size ) )
def r_find_overlapping_parent( self, int level, bits32 chrom_ix, bits32 start, bits32 end, object rval,
bits16 child_count, object reader ):
# Read and cache offsets for all children to avoid excessive seeking
## cdef bits32 start_chrom_ix[child_count], start_base[child_count], end_chrom_ix[child_count], end_base[child_count]
## cdef bits64 offset[child_count]
start_chrom_ix = []; start_base = []; end_chrom_ix = []; end_base = []
offset = []
for i from 0 <= i < child_count:
## start_chrom_ix[i] = reader.read_bits32()
## start_base[i] = reader.read_bits32()
## end_chrom_ix[i] = reader.read_bits32()
## end_base[i] = reader.read_bits32()
## offset[i] = reader.read_bits64()
start_chrom_ix.append( reader.read_uint32() )
start_base.append( reader.read_uint32() )
end_chrom_ix.append( reader.read_uint32() )
end_base.append( reader.read_uint32() )
offset.append( reader.read_uint64() )
# Now recurse
for i from 0 <= i < child_count:
if overlaps( chrom_ix, start, end, start_chrom_ix[i], start_base[i], end_chrom_ix[i], end_base[i] ):
self.r_find_overlapping( level + 1, offset[i], chrom_ix, start, end, rval, reader )
def find_overlapping_blocks( self, bits32 chrom_ix, bits32 start, bits32 end ):
rval = []
self.r_find_overlapping( 0, self.root_offset, chrom_ix, start, end, rval, self.reader )
return rval
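# A minimal usage sketch, mirroring how bbi_file.pyx drives this class:
# seek the shared reader to the R tree index offset, attach, then query
# for (offset, size) pairs of overlapping data blocks:
#
#     reader.seek( index_offset )
#     ctf = CIRTreeFile( reader.file )
#     blocks = ctf.find_overlapping_blocks( chrom_id, start, end )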
bx-python-0.8.13/lib/bx/bbi/types.pxd 0000664 0000000 0000000 00000000440 14156664651 0017343 0 ustar 00root root 0000000 0000000 ctypedef unsigned char UBYTE
ctypedef signed char BYTE
ctypedef unsigned short UWORD
ctypedef short WORD
ctypedef unsigned long long bits64
ctypedef unsigned bits32
ctypedef unsigned short bits16
ctypedef unsigned char bits8
ctypedef int signed32
ctypedef bint boolean
bx-python-0.8.13/lib/bx/binned_array.py 0000664 0000000 0000000 00000030124 14156664651 0017737 0 ustar 00root root 0000000 0000000 """
Numeric arrays stored as individually compressed blocks on disk, allowing
pseudo-random access.
`BinnedArray` is used to build such an array in memory and save it to disk.
`BinnedArrayWriter` can instead be used when creating the array sequentially
(does not require keeping all data in memory). `FileBinnedArray` provides
read-only access to an on-disk binned array.
"""
import math
import sys
from struct import (
calcsize,
pack,
unpack,
)
from numpy import (
array,
concatenate,
frombuffer,
NaN,
resize,
zeros
)
from bx_extras.lrucache import LRUCache
platform_is_little_endian = (sys.byteorder == 'little')
MAGIC = 0x4AB04612
# Version incremented from version 0 to version 1 by Ian Schenck, June
# 23, 2006. Version 1 supports different typecodes, and in doing so
# breaks the original header format. The new FileBinnedArray is
# backwards compatible with version 0.
# Version 1 -> 2 by James Taylor, allow specifying different compression
# types.
VERSION = 2
# Compression types
comp_types = {
'none': (lambda x: x, lambda x: x)
}
try:
import zlib
comp_types['zlib'] = (zlib.compress, zlib.decompress)
except Exception:
pass
try:
import lzo
comp_types['lzo'] = (lzo.compress, lzo.decompress)
except Exception:
pass
MAX = 512*1024*1024
def bytesify(s):
if isinstance(s, bytes):
return s
else:
return s.encode()
class BinnedArray:
def __init__(self, bin_size=512*1024, default=NaN, max_size=MAX, typecode="f"):
self.max_size = max_size
self.bin_size = bin_size
self.nbins = int(math.ceil(max_size / self.bin_size))
self.bins = [None] * self.nbins
self.default = default
self.typecode = typecode
def get_bin_offset(self, index):
return index // self.bin_size, index % self.bin_size
def init_bin(self, index):
# self.bins[index] = zeros( self.bin_size ) * self.default
self.bins[index] = zeros(self.bin_size, self.typecode)
self.bins[index][:] = self.default
def get(self, key):
bin, offset = self.get_bin_offset(key)
if self.bins[bin] is None:
return self.default
else:
return self.bins[bin][offset]
def set(self, key, value):
bin, offset = self.get_bin_offset(key)
if self.bins[bin] is None:
self.init_bin(bin)
self.bins[bin][offset] = value
def get_range(self, start, end):
size = end - start
assert size >= 0
rval = []
while size > 0:
bin, offset = self.get_bin_offset(start)
delta = self.bin_size - offset
if self.bins[bin] is None:
if delta < size:
rval.append(resize(array(self.default, self.typecode), (delta,)))
size -= delta
start += delta
else:
rval.append(resize(array(self.default, "f"), (size,)))
size = 0
else:
if delta < size:
rval.append(self.bins[bin][offset:offset+delta])
size -= delta
start += delta
else:
rval.append(self.bins[bin][offset:offset+size])
size = 0
return concatenate(rval)
def __getitem__(self, key):
if isinstance(key, slice):
start, stop, stride = key.indices(self.max_size)
assert stride == 1, "Slices with strides are not supported"
return self.get_range(start, stop)
else:
return self.get(key)
def __setitem__(self, key, value):
return self.set(key, value)
def to_file(self, f, comp_type='zlib'):
# Get compress method
compress, _ = comp_types[comp_type]
# Write header
write_packed(f, ">5I", MAGIC, VERSION, self.max_size, self.bin_size, self.nbins)
# save type code
f.write(pack('c', bytesify(self.typecode)))
# save compression type
f.write(bytesify(comp_type[0:4].ljust(4)))
# write default value
a = array(self.default, self.typecode)
# Struct module can't deal with NaN and endian conversion, we'll hack
# around that by byteswapping the array
if platform_is_little_endian:
a = a.byteswap()
f.write(a.tobytes())
# Save current position (start of bin offsets)
index_start_pos = f.tell()
# Skip forward to save space for index
f.seek(calcsize(">2I") * self.nbins, 1)
bin_pos_and_size = []
# Write each bin
for bin in self.bins:
if bin is None:
bin_pos_and_size.append((0, 0))
else:
assert bin.dtype.char == self.typecode
if platform_is_little_endian:
s = bin.byteswap().tobytes()
else:
s = bin.tobytes()
compressed = compress(s)
bin_pos_and_size.append((f.tell(), len(compressed)))
f.write(compressed)
# Go back and fill in table
f.seek(index_start_pos)
for pos, size in bin_pos_and_size:
write_packed(f, ">2I", pos, size)
class FileBinnedArray:
def __init__(self, f, cache=32):
# If cache=None, everything is allowed to stay in memory; otherwise at
# most `cache` bins are kept at once in an LRU cache (default 32).
self.f = f
M, V, max_size, bin_size, nbins = read_packed(f, ">5I")
assert M == MAGIC
# assert version less than max supported
assert V <= VERSION, "File is version %d but I don't know about anything beyond %d" % (V, VERSION)
self.max_size = max_size
self.bin_size = bin_size
self.nbins = nbins
self.bins = LRUCache(size=cache)
# Read typecode
if V >= 1:
self.typecode = (unpack('c', f.read(1))[0]).decode()
else:
self.typecode = 'f'
# Read compression type
if V >= 2:
self.comp_type = f.read(4).strip().decode()
else:
self.comp_type = 'zlib'
self.decompress = comp_types[self.comp_type][1]
# Read default value
s = f.read(calcsize(self.typecode))
a = frombuffer(s, self.typecode)
if platform_is_little_endian:
a = a.byteswap()
self.default = a[0]
# Read bin sizes and offsets
self.bin_pos = []
self.bin_sizes = []
for _ in range(nbins):
pos, size = read_packed(f, ">2I")
self.bin_pos.append(pos)
self.bin_sizes.append(size)
def get_bin_offset(self, index):
return int(index // self.bin_size), int(index % self.bin_size)
def load_bin(self, index):
assert self.bin_pos[index] != 0
self.f.seek(self.bin_pos[index])
raw = self.f.read(self.bin_sizes[index])
a = frombuffer(self.decompress(raw), self.typecode)
if platform_is_little_endian:
a = a.byteswap()
assert len(a) == self.bin_size
self.bins[index] = a
def get(self, key):
bin, offset = self.get_bin_offset(key)
if bin in self.bins:
return self.bins[bin][offset]
elif self.bin_pos[bin]:
self.load_bin(bin)
return self.bins[bin][offset]
else:
return self.default
def get_range(self, start, end):
size = end - start
assert size >= 0
rval = []
while size > 0:
bin, offset = self.get_bin_offset(start)
delta = self.bin_size - offset
if bin not in self.bins and self.bin_pos[bin] != 0:
self.load_bin(bin)
if self.bins[bin] is None:
if delta < size:
rval.append(resize(array(self.default, self.typecode), (delta,)))
size -= delta
start += delta
else:
rval.append(resize(array(self.default, self.typecode), (size,)))
size = 0
else:
if delta < size:
rval.append(self.bins[bin][offset:offset+delta])
size -= delta
start += delta
else:
rval.append(self.bins[bin][offset:offset+size])
size = 0
return concatenate(rval)
def __getitem__(self, key):
if isinstance(key, slice):
start, stop, stride = key.indices(self.max_size)
assert stride == 1, "Slices with strides are not supported"
return self.get_range(start, stop)
else:
return self.get(key)
class BinnedArrayWriter:
def __init__(self, f, bin_size=512*1024, default=NaN, max_size=MAX, typecode="f", comp_type='zlib'):
# All parameters in the constructor are immutable after creation
self.f = f
self.max_size = max_size
self.bin_size = bin_size
self.nbins = int(math.ceil(max_size / self.bin_size))
self.default = default
self.typecode = typecode
self.bin = 0
self.bin_pos = 0
self.bin_index = []
self.buffer = resize(array(self.default, self.typecode), (self.bin_size,))
self.buffer_contains_values = False
self.comp_type = comp_type
self.compress = comp_types[comp_type][0]
self.write_header()
# Put the fp at the start of the data (we go back and fill in the index at the end)
self.f.seek(self.data_offset)
def write_header(self):
self.f.seek(0)
# Write header
write_packed(self.f, ">5I", MAGIC, VERSION, self.max_size, self.bin_size, self.nbins)
# save type code
self.f.write(pack('c', bytesify(self.typecode)))
# write default value
a = array(self.default, self.typecode)
# write comp type
self.f.write(bytesify(self.comp_type[0:4].ljust(4)))
# write default
# Struct module can't deal with NaN and endian conversion, we'll hack
# around that by byteswapping the array
if platform_is_little_endian:
a = a.byteswap()
self.f.write(a.tobytes())
# Save current position (start of bin offsets)
self.index_pos = self.f.tell()
self.data_offset = self.index_pos + (self.nbins * calcsize(">2I"))
def write_index(self):
self.f.seek(self.index_pos)
for pos, size in self.bin_index:
write_packed(self.f, ">2I", pos, size)
def skip(self):
self.bin_pos += 1
if self.bin_pos == self.bin_size:
self.flush()
self.bin_pos = 0
self.bin += 1
assert self.bin <= self.nbins
self.buffer = resize(array(self.default, self.typecode), (self.bin_size,))
self.buffer_contains_values = False
def write(self, data):
self.buffer[self.bin_pos] = data
self.buffer_contains_values = True
self.bin_pos += 1
if self.bin_pos == self.bin_size:
self.flush()
self.bin_pos = 0
self.bin += 1
assert self.bin <= self.nbins
self.buffer = resize(array(self.default, self.typecode), (self.bin_size,))
self.buffer_contains_values = False
def flush(self):
# Flush buffer to file
if self.buffer_contains_values:
pos = self.f.tell()
if platform_is_little_endian:
s = self.buffer.byteswap().tobytes()
else:
s = self.buffer.tobytes()
compressed = self.compress(s)
size = len(compressed)
assert len(self.bin_index) == self.bin
self.bin_index.append((pos, size))
self.f.write(compressed)
else:
assert len(self.bin_index) == self.bin
self.bin_index.append((0, 0))
def finish(self):
self.flush()
self.nbins = self.bin + 1
self.write_header()
self.write_index()
def write_packed(f, pattern, *vals):
f.write(pack(pattern, *vals))
def read_packed(f, pattern):
rval = unpack(pattern, f.read(calcsize(pattern)))
if len(rval) == 1:
return rval[0]
return rval
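# A minimal round-trip sketch (the path is hypothetical): build an array in
# memory, save it, then read values back through the file-backed class.
#
#     ba = BinnedArray()
#     ba[100] = 1.5
#     with open("/tmp/example.ba", "wb") as out:
#         ba.to_file(out)
#     fba = FileBinnedArray(open("/tmp/example.ba", "rb"))
#     assert fba[100] == 1.5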
bx-python-0.8.13/lib/bx/binned_array_tests.py 0000664 0000000 0000000 00000006511 14156664651 0021164 0 ustar 00root root 0000000 0000000 """
Tests for `bx.binned_array`.
"""
from numpy import (
allclose,
concatenate,
NaN,
zeros
)
from numpy.random import random_sample as random
from .binned_array import (
BinnedArray,
BinnedArrayWriter,
FileBinnedArray,
)
# Bigger values take longer, but exercise more bins
CHUNK_SIZE_RANDOM = 945
CHUNK_SIZE_ZEROS = 897
# CHUNK_SIZE_RANDOM=9456
# CHUNK_SIZE_ZEROS=8972
source = target = None
def setup():
global source
global target
source = []
for _ in range(13):
if random() < 0.5:
source = concatenate((source, random(CHUNK_SIZE_RANDOM)))
else:
source = concatenate((source, zeros(CHUNK_SIZE_ZEROS, 'f')))
source = source.astype('f')
# Set on target
target = BinnedArray(128, NaN, len(source))
for i in range(len(source)):
# if not isNaN( source[i] ):
target[i] = source[i]
return source, target
def test_simple():
# Verify
for i in range(len(source)):
assert source[i] == target[i], "No match, index: %d, source: %f, target: %f, len( source ): %d" % (i, source[i], target[i], len(source))
# Verify with slices
for _ in range(10):
a = int(random() * len(source))
b = int(random() * len(source))
if b < a:
a, b = b, a
assert allclose(source[a:b], target[a:b]), "No match, index: %d:%d, source: %s, target: %s" % \
(a, b, ",".join(map(str, source[a:a+10])), ",".join(map(str, target[a:a+10])))
def test_file():
# With a file (zlib)
target.to_file(open("/tmp/foo", "wb"))
target2 = FileBinnedArray(open("/tmp/foo", 'rb'))
for i in range(len(source)):
assert source[i] == target2[i], "No match, index: %d, source: %d, target: %d" % (i, source[i], target2[i])
# Verify with slices
target2 = FileBinnedArray(open("/tmp/foo", 'rb'))
for _ in range(10):
a = int(random() * len(source))
b = int(random() * len(source))
if b < a:
a, b = b, a
assert allclose(source[a:b], target2[a:b]), "No match, index: %d:%d, source: %s, target: %s" % \
(a, b, ",".join(map(str, source[a:a+10])), ",".join(map(str, target2[a:a+10])))
def test_file_lzo():
# With a file (lzo)
target.to_file(open("/tmp/foo3", "wb"), comp_type="lzo")
target3 = FileBinnedArray(open("/tmp/foo3", 'rb'))
# Verify
for i in range(len(source)):
assert source[i] == target3[i], "No match, index: %d, source: %d, target: %d" % (i, source[i], target3[i])
# Verify with slices
target3 = FileBinnedArray(open("/tmp/foo3", 'rb'))
for _ in range(10):
a = int(random() * len(source))
b = int(random() * len(source))
if b < a:
a, b = b, a
assert allclose(source[a:b], target3[a:b]), "No match, index: %d:%d, source: %s, target: %s" % \
(a, b, ",".join(map(str, source[a:a+10])), ",".join(map(str, target3[a:a+10])))
def test_binned_array_writer():
# Test with ba writer
o = open("/tmp/foo4", "wb")
w = BinnedArrayWriter(o, 128, comp_type='lzo')
for val in source:
w.write(val)
w.finish()
o.close()
# Verify
target4 = FileBinnedArray(open("/tmp/foo4", 'rb'))
for i in range(len(source)):
assert allclose(source[i], target4[i]), "No match, index: %d, source: %d, target: %d" % (i, source[i], target4[i])
bx-python-0.8.13/lib/bx/bitset.pyx 0000664 0000000 0000000 00000022036 14156664651 0016767 0 ustar 00root root 0000000 0000000 """
Compact mutable sequences of bits (vectors of 0s and 1s) supporting various
boolean operations, and a "binned" variation which stores long runs of
identical bits compactly.
Because the binned implementation avoids a lot of memory allocation and access
when working with either small subregions of the total interval or setting /
testing spans larger than the bin size, it can be much faster.
"""
import sys
cdef extern from "common.h":
ctypedef int boolean
cdef extern from "bits.h":
ctypedef unsigned char Bits
# Allocate bits.
Bits * bitAlloc( int bitCount )
# Clone bits.
Bits * bitClone(Bits* orig, int bitCount )
# Free bits.
void bitFree(Bits **pB)
# Set a single bit.
void bitSetOne(Bits *b, int bitIx)
# Clear a single bit.
void bitClearOne(Bits *b, int bitIx)
# Set a range of bits.
void bitSetRange(Bits *b, int startIx, int bitCount)
# Read a single bit.
int bitReadOne(Bits *b, int bitIx)
# Count number of bits set in range.
int bitCountRange(Bits *b, int startIx, int bitCount)
# Find the index of the next set bit.
int bitFindSet(Bits *b, int startIx, int bitCount)
# Find the index of the next clear bit.
int bitFindClear(Bits *b, int startIx, int bitCount)
# Clear many bits.
void bitClear(Bits *b, int bitCount)
# And two bitmaps. Put result in a.
void bitAnd(Bits *a, Bits *b, int bitCount)
# Or two bitmaps. Put result in a.
void bitOr(Bits *a, Bits *b, int bitCount)
# Xor two bitmaps. Put result in a.
void bitXor(Bits *a, Bits *b, int bitCount)
# Flip all bits in a.
void bitNot(Bits *a, int bitCount)
## # Print part or all of bit map as a string of 0s and 1s. Mostly useful for
## void bitPrint(Bits *a, int startIx, int bitCount, FILE* out)
cdef extern from "binBits.h":
struct BinBits:
int size
int bin_size
int nbins
Bits ** bins
BinBits* binBitsAlloc( int size, int granularity )
void binBitsFree( BinBits * bb )
int binBitsReadOne( BinBits * bb, int pos )
void binBitsSetOne( BinBits * bb, int pos )
void binBitsClearOne( BinBits * bb, int pos )
void binBitsSetRange( BinBits *bb, int start, int size )
int binBitsCountRange( BinBits *bb, int start, int size )
int binBitsFindSet( BinBits *bb, int start )
int binBitsFindClear( BinBits *bb, int start )
void binBitsAnd( BinBits *bb1, BinBits *bb2 )
void binBitsOr( BinBits *bb1, BinBits *bb2 )
void binBitsNot( BinBits *bb )
## ---- Forward declarations -------------------------------------------------
cdef class BitSet
cdef class BinnedBitSet
## ---- BitSet bounds checking ----------------------------------------------
cdef inline b_check_index( BitSet b, index ):
if index < 0:
raise IndexError( "BitSet index (%d) must be non-negative." % index )
if index >= b.bitCount:
raise IndexError( "%d is larger than the size of this BitSet (%d)." % ( index, b.bitCount ) )
cdef inline b_check_range( BitSet b, start, end ):
b_check_index( b, start )
if end < start:
raise IndexError( "Range end (%d) must be greater than range start(%d)." % ( end, start ) )
if end > b.bitCount:
raise IndexError( "End %d is larger than the size of this BitSet (%d)." % ( end, b.bitCount ) )
cdef inline b_check_range_count( BitSet b, start, count ):
b_check_index( b, start )
if count < 0:
raise IndexError( "Count (%d) must be non-negative." % count )
if start + count > b.bitCount:
raise IndexError( "End %d is larger than the size of this BitSet (%d)." % ( start + count, b.bitCount ) )
cdef inline b_check_same_size( BitSet b, BitSet other ):
if b.bitCount != other.bitCount:
raise ValueError( "BitSets must have the same size" )
## ---- BitSet --------------------------------------------------------------
# Maximum value of a signed 32 bit integer ( 2**31 - 1 )
cdef int MAX_INT = 2147483647
cdef class BitSet:
cdef Bits * bits
cdef int bitCount
def __cinit__( self, bitCount ):
if bitCount > MAX_INT:
raise ValueError( "%d is larger than the maximum BitSet size of %d." % ( bitCount, MAX_INT ) )
self.bitCount = bitCount
self.bits = bitAlloc( bitCount )
def __dealloc__( self ):
if self.bits:
bitFree( & self.bits )
property size:
def __get__( self ):
return self.bitCount
def set( self, index ):
b_check_index( self, index )
bitSetOne( self.bits, index )
def clear( self, index ):
b_check_index( self, index )
bitClearOne( self.bits, index )
def clone( self ):
other = BitSet( self.bitCount )
other.ior( self )
return other
def set_range( self, start, count ):
b_check_range_count( self, start, count )
bitSetRange( self.bits, start, count )
def get( self, index ):
b_check_index( self, index )
return bitReadOne( self.bits, index )
def count_range( self, start=0, count=None ):
if count is None:
count = self.bitCount - start
b_check_range_count( self, start, count )
return bitCountRange( self.bits, start, count )
def next_set( self, start, end=None ):
if end is None:
end = self.bitCount
b_check_range( self, start, end )
return bitFindSet( self.bits, start, end )
def next_clear( self, start, end=None ):
if end is None:
end = self.bitCount
b_check_range( self, start, end )
return bitFindClear( self.bits, start, end )
def iand( self, BitSet other ):
b_check_same_size( self, other )
bitAnd( self.bits, other.bits, self.bitCount )
def ior( self, BitSet other ):
b_check_same_size( self, other )
bitOr( self.bits, other.bits, self.bitCount )
def ixor( self, BitSet other ):
b_check_same_size( self, other )
bitXor( self.bits, other.bits, self.bitCount )
def invert( self ):
bitNot( self.bits, self.bitCount)
def __getitem__( self, index ):
return self.get( index )
def __iand__( self, other ):
self.iand( other )
return self
def __ior__( self, other ):
self.ior( other )
return self
def __invert__( self ):
self.invert()
return self
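# A small sketch of typical BitSet use:
#
#     bs = BitSet( 100 )
#     bs.set_range( 10, 5 )      # set bits 10..14
#     bs.count_range( 0, 100 )   # -> 5
#     bs.next_set( 0 )           # -> 10
#     bs.next_clear( 10 )        # -> 15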
## ---- BinnedBitSet bounds checking ----------------------------------------
cdef inline bb_check_index( BinnedBitSet bb, index ):
if index < 0:
raise IndexError( "BitSet index (%d) must be non-negative." % index )
if index >= bb.bb.size:
raise IndexError( "%d is larger than the size of this BitSet (%d)." % ( index, bb.bb.size ) )
cdef inline bb_check_start( BinnedBitSet bb, start ):
bb_check_index( bb, start )
cdef inline bb_check_range_count( BinnedBitSet bb, start, count ):
bb_check_index( bb, start )
if count < 0:
raise IndexError( "Count (%d) must be non-negative." % count )
if start + count > bb.bb.size:
raise IndexError( "End (%d) is larger than the size of this BinnedBitSet (%d)." % ( start + count, bb.bb.size ) )
cdef inline bb_check_same_size( BinnedBitSet bb, BinnedBitSet other ):
if bb.bb.size != other.bb.size:
raise ValueError( "BitSets must have the same size" )
## ---- BinnedBitSet --------------------------------------------------------
MAX=512*1024*1024
cdef class BinnedBitSet:
cdef BinBits * bb
def __cinit__( self, size=MAX, granularity=1024 ):
if size > MAX_INT:
raise ValueError( "%d is larger than the maximum BinnedBitSet size of %d." % ( size, MAX_INT ) )
self.bb = binBitsAlloc( size, granularity )
def __dealloc__( self ):
if self.bb:
binBitsFree( self.bb )
def __getitem__( self, index ):
bb_check_index( self, index )
return binBitsReadOne( self.bb, index )
def set( self, index ):
bb_check_index( self, index )
binBitsSetOne( self.bb, index )
def clear( self, index ):
bb_check_index( self, index )
binBitsClearOne( self.bb, index )
def set_range( self, int start, count ):
bb_check_range_count( self, start, count )
binBitsSetRange( self.bb, start, count )
def count_range( self, start, count ):
bb_check_range_count( self, start, count )
return binBitsCountRange( self.bb, start, count )
def next_set( self, start ):
bb_check_start( self, start )
return binBitsFindSet( self.bb, start )
def next_clear( self, start ):
bb_check_start( self, start )
return binBitsFindClear( self.bb, start )
property size:
def __get__( self ):
return self.bb.size
property bin_size:
def __get__( self ):
return self.bb.bin_size
def iand( self, BinnedBitSet other ):
bb_check_same_size( self, other )
binBitsAnd( self.bb, other.bb )
def ior( self, BinnedBitSet other ):
bb_check_same_size( self, other )
binBitsOr( self.bb, other.bb )
def invert( self ):
binBitsNot( self.bb )
bx-python-0.8.13/lib/bx/bitset_builders.py 0000664 0000000 0000000 00000013343 14156664651 0020471 0 ustar 00root root 0000000 0000000 """
Support for creating dictionaries of `Bitset`s / `BinnedBitset`s from text
files containing sets of "covered" intervals in sequences (e.g. `BED`_ files).
.. _BED: http://genome.ucsc.edu/FAQ/FAQformat.html#format1
"""
import re
from warnings import warn
from bx.bitset import (
BinnedBitSet,
MAX
)
def binned_bitsets_from_file(f, chrom_col=0, start_col=1, end_col=2, strand_col=5, upstream_pad=0, downstream_pad=0, lens={}):
"""
Read a file into a dictionary of bitsets. Notes on the arguments:
- 'f' should be a file-like object (or any iterable of strings)
- 'chrom_col', 'start_col', and 'end_col' must exist in each line
- 'strand_col' is optional; any line without it is assumed to be '+'
- if 'lens' is provided, bitset sizes are looked up from it; otherwise
chromosomes are assumed to be the maximum size
"""
last_chrom = None
last_bitset = None
bitsets = dict()
for line in f:
if line.startswith("#") or line.isspace():
continue
fields = line.split()
chrom = fields[chrom_col]
if chrom != last_chrom:
if chrom not in bitsets:
if chrom in lens:
size = lens[chrom]
else:
size = MAX
bitsets[chrom] = BinnedBitSet(size)
last_chrom = chrom
last_bitset = bitsets[chrom]
start, end = int(fields[start_col]), int(fields[end_col])
if upstream_pad:
start = max(0, start - upstream_pad)
if downstream_pad:
end = min(size, end + downstream_pad)
if start > end:
warn("Interval start after end!")
last_bitset.set_range(start, end-start)
return bitsets
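# For example (a sketch; any iterable of BED-like lines works):
#
#     lines = ["chr1\t10\t20\n", "chr1\t15\t30\n"]
#     bitsets = binned_bitsets_from_file(lines)
#     bitsets["chr1"].count_range(0, 40)  # -> 20, the union of [10, 30)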
def binned_bitsets_from_bed_file(f, chrom_col=0, start_col=1, end_col=2, strand_col=5, upstream_pad=0, downstream_pad=0, lens={}):
"""
Read a file into a dictionary of bitsets. Notes on the arguments:
- 'f' should be a file-like object (or any iterable of strings)
- 'chrom_col', 'start_col', and 'end_col' must exist in each line
- 'strand_col' is optional; any line without it is assumed to be '+'
- if 'lens' is provided, bitset sizes are looked up from it; otherwise
chromosomes are assumed to be the maximum size
"""
last_chrom = None
last_bitset = None
bitsets = dict()
offset = 0
for line in f:
if line.startswith("#") or line.isspace():
continue
# Ignore browser lines completely
if line.startswith("browser"):
continue
# Need to check track lines due to the offset
if line.startswith("track"):
m = re.search(r"offset=(\d+)", line)
if m and m.group(1):
offset = int(m.group(1))
continue
fields = line.split()
chrom = fields[chrom_col]
if chrom != last_chrom:
if chrom not in bitsets:
if chrom in lens:
size = lens[chrom]
else:
size = MAX
bitsets[chrom] = BinnedBitSet(size)
last_chrom = chrom
last_bitset = bitsets[chrom]
start, end = int(fields[start_col]) + offset, int(fields[end_col]) + offset
if upstream_pad:
start = max(0, start - upstream_pad)
if downstream_pad:
end = min(size, end + downstream_pad)
if start > end:
warn("Interval start after end!")
last_bitset.set_range(start, end-start)
return bitsets
def binned_bitsets_proximity(f, chrom_col=0, start_col=1, end_col=2, strand_col=5, upstream=0, downstream=0):
"""Read a file into a dictionary of bitsets"""
last_chrom = None
last_bitset = None
bitsets = dict()
for line in f:
if line.startswith("#"):
continue
# print "input=%s" % ( line ),
fields = line.split()
strand = "+"
if len(fields) >= strand_col + 1:
if fields[strand_col] == "-":
strand = "-"
chrom = fields[chrom_col]
if chrom != last_chrom:
if chrom not in bitsets:
bitsets[chrom] = BinnedBitSet(MAX)
last_chrom = chrom
last_bitset = bitsets[chrom]
start, end = int(fields[start_col]), int(fields[end_col])
if strand == "+":
if upstream:
start = max(0, start - upstream)
if downstream:
end = min(MAX, end + downstream)
if strand == "-":
if upstream:
end = min(MAX, end + upstream)
if downstream:
start = max(0, start - downstream)
# print "set: start=%d\tend=%d" % ( start, end )
if end-start > 0:
last_bitset.set_range(start, end-start)
return bitsets
def binned_bitsets_from_list(list=[]):
"""Read a list into a dictionary of bitsets"""
last_chrom = None
last_bitset = None
bitsets = dict()
for l in list:
chrom = l[0]
if chrom != last_chrom:
if chrom not in bitsets:
bitsets[chrom] = BinnedBitSet(MAX)
last_chrom = chrom
last_bitset = bitsets[chrom]
start, end = int(l[1]), int(l[2])
last_bitset.set_range(start, end - start)
return bitsets
def binned_bitsets_by_chrom(f, chrom, chrom_col=0, start_col=1, end_col=2):
"""Read a file by chrom name into a bitset"""
bitset = BinnedBitSet(MAX)
for line in f:
if line.startswith("#"):
continue
fields = line.split()
if fields[chrom_col] == chrom:
start, end = int(fields[start_col]), int(fields[end_col])
bitset.set_range(start, end-start)
return bitset
bx-python-0.8.13/lib/bx/bitset_tests.py 0000664 0000000 0000000 00000006736 14156664651 0020032 0 ustar 00root root 0000000 0000000 """
Tests for `bx.bitset`.
"""
import unittest
import bx.bitset
class AbstractTests:
def assert_bits(self, bits, list):
assert bits.size == len(list), "Bitset size and verification list size do not match"
for i in range(bits.size):
self.assertEqual(bits[i], list[i])
def test_overflow_create(self):
self.assertRaises(ValueError, self.new_bits, 4000000000)
def test_overflow_access(self):
bits = self.new_bits(100)
self.assertRaises(IndexError, bits.set, -5)
self.assertRaises(IndexError, bits.set, 110)
def test_access(self):
# Create and assert empty
bits = self.new_bits(100)
l = [0] * 100
self.assert_bits(bits, l)
# Set some positions
for pos in (11, 14, 70, 16):
bits.set(pos)
l[pos] = 1
# Clear some positions
for pos in (14, 80, 16):
bits.clear(pos)
l[pos] = 0
self.assert_bits(bits, l)
def test_range_access(self):
# Create and assert empty
bits = self.new_bits(100)
l = [0] * 100
self.assert_bits(bits, l)
# Set some positions
for b, e in ((11, 14), (20, 75), (90, 99)):
bits.set_range(b, e-b)
for pos in range(b, e):
l[pos] = 1
self.assert_bits(bits, l)
def test_count(self):
# Create and assert empty
bits = self.new_bits(100)
# Set some positions
for b, e in ((11, 14), (20, 75), (90, 100)):
bits.set_range(b, e-b)
self.assertEqual(bits.count_range(0, 0), 0)
self.assertEqual(bits.count_range(0, 20), 3)
self.assertEqual(bits.count_range(25, 25), 25)
self.assertEqual(bits.count_range(80, 20), 10)
self.assertEqual(bits.count_range(0, 100), 68)
def test_find(self):
# Create and assert empty
bits = self.new_bits(100)
# Set some positions
for b, e in ((11, 14), (20, 75), (90, 100)):
bits.set_range(b, e-b)
# Next set
self.assertEqual(bits.next_set(0), 11)
self.assertEqual(bits.next_set(13), 13)
self.assertEqual(bits.next_set(15), 20)
# Next clear
self.assertEqual(bits.next_clear(0), 0)
self.assertEqual(bits.next_clear(11), 14)
self.assertEqual(bits.next_clear(20), 75)
self.assertEqual(bits.next_clear(92), 100)
def test_and(self):
bits1 = self.new_bits(100)
bits2 = self.new_bits(100)
bits1.set_range(20, 40)
bits2.set_range(50, 25)
bits1.iand(bits2)
l = [0]*100
for i in range(50, 60):
l[i] = 1
self.assert_bits(bits1, l)
def test_or(self):
bits1 = self.new_bits(100)
bits2 = self.new_bits(100)
bits1.set_range(20, 40)
bits2.set_range(50, 25)
bits1.ior(bits2)
l = [0]*100
for i in range(20, 75):
l[i] = 1
self.assert_bits(bits1, l)
def test_not(self):
bits = self.new_bits(100)
bits.set_range(20, 40)
bits.invert()
l = [1]*100
for i in range(20, 60):
l[i] = 0
self.assert_bits(bits, l)
class BitSetTests(AbstractTests, unittest.TestCase):
def new_bits(self, size):
return bx.bitset.BitSet(size)
class BinnedBitSetTests(AbstractTests, unittest.TestCase):
def new_bits(self, size):
granularity = size % 11
return bx.bitset.BinnedBitSet(size, granularity)
bx-python-0.8.13/lib/bx/bitset_utils.py 0000664 0000000 0000000 00000003537 14156664651 0020024 0 ustar 00root root 0000000 0000000 """
Utility functions for working with `Bitset`s and treating lists of (start,end)
as `Bitset`s.
"""
from bx.bitset import (
BinnedBitSet,
MAX,
)
def bitset_intersect(ex1, ex2):
bits1 = list2bits(ex1)
bits2 = list2bits(ex2)
bits1.iand(bits2)
return bits2list(bits1)
def bitset_subtract(ex1, ex2):
bits1 = list2bits(ex1)
bits2 = list2bits(ex2)
bits2.invert()
bits1.iand(bits2)
return bits2list(bits1)
def list2bits(ex):
bits = BinnedBitSet(MAX)
for start, end in ex:
bits.set_range(start, end - start)
return bits
def bits2list(bits):
ex = []
end = 0
while True:
start = bits.next_set(end)
if start == bits.size:
break
end = bits.next_clear(start)
ex.append((start, end))
return ex
def bitset_complement(exons):
bits = BinnedBitSet(MAX)
introns = []
for start, end in exons:
bits.set_range(start, end - start)
bits.invert()
# only complement within the range of the list
ex_start = min(a[0] for a in exons)
ex_end = max(a[1] for a in exons)
end = ex_start
limit = ex_end
while True:
start = bits.next_set(end)
if start == bits.size:
break
end = bits.next_clear(start)
if end > limit:
end = limit
if start != end:
introns.append((start, end))
if end == limit:
break
return introns
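# For example, complementing within the span of the input intervals:
#
#     bitset_complement([(10, 20), (30, 40)])  # -> [(20, 30)]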
def bitset_interval_intersect(bits, istart, iend):
rval = []
end = istart
limit = iend
while True:
start = bits.next_set(end)
if start >= limit:
break
end = bits.next_clear(start)
if start != end:
rval.append((start, end))
if end >= limit:
break
return rval
def bitset_union(exons):
bits = list2bits(exons)
return bits2list(bits)
bx-python-0.8.13/lib/bx/cookbook/ 0000775 0000000 0000000 00000000000 14156664651 0016536 5 ustar 00root root 0000000 0000000 bx-python-0.8.13/lib/bx/cookbook/__init__.py 0000664 0000000 0000000 00000005505 14156664651 0020654 0 ustar 00root root 0000000 0000000 """
Various useful utilities, mostly taken from the ASPN Python cookbook.
"""
import types
seq_types = (tuple, list)
def flatten(*args):
for arg in args:
if type(arg) in seq_types:
for elem in arg:
yield from flatten(elem)
else:
yield arg
def cross_lists(*sets):
"""Return the cross product of the arguments"""
wheels = [iter(_) for _ in sets]
digits = [next(it) for it in wheels]
while True:
yield digits[:]
for i in range(len(digits)-1, -1, -1):
try:
digits[i] = next(wheels[i])
break
except StopIteration:
wheels[i] = iter(sets[i])
digits[i] = next(wheels[i])
else:
break
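# For example:
#
#     list(cross_lists([1, 2], "ab"))
#     # -> [[1, 'a'], [1, 'b'], [2, 'a'], [2, 'b']]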
# Cached / memoized methods
def cachedmethod(function):
return types.MethodType(Memoize(function), None)
class Memoize:
def __init__(self, function):
self._cache = {}
self._callable = function
def __call__(self, *args, **kwds):
cache = self._cache
key = self._getKey(*args, **kwds)
try:
return cache[key]
except KeyError:
cachedValue = cache[key] = self._callable(*args, **kwds)
return cachedValue
def _getKey(self, *args, **kwds):
return (args, ImmutableDict(kwds)) if kwds else args
class memoized:
"""Decorator that caches a function's return value each time it is called.
If called later with the same arguments, the cached value is returned, and
not re-evaluated.
"""
def __init__(self, func):
self.func = func
self.cache = {}
def __call__(self, *args):
try:
return self.cache[args]
except KeyError:
self.cache[args] = value = self.func(*args)
return value
except TypeError:
# uncachable -- for instance, passing a list as an argument.
# Better to not cache than to blow up entirely.
return self.func(*args)
def __repr__(self):
"""Return the function's docstring."""
return self.func.__doc__
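# Sketch of use as a decorator:
#
#     @memoized
#     def fib(n):
#         return n if n < 2 else fib(n - 1) + fib(n - 2)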
class ImmutableDict(dict):
'''A hashable dict.'''
def __init__(self, *args, **kwds):
dict.__init__(self, *args, **kwds)
def __setitem__(self, key, value):
raise NotImplementedError("dict is immutable")
def __delitem__(self, key):
raise NotImplementedError("dict is immutable")
def clear(self):
raise NotImplementedError("dict is immutable")
def setdefault(self, k, default=None):
raise NotImplementedError("dict is immutable")
def popitem(self):
raise NotImplementedError("dict is immutable")
def update(self, other):
raise NotImplementedError("dict is immutable")
def __hash__(self):
return hash(tuple(self.items()))
bx-python-0.8.13/lib/bx/cookbook/argparse.py 0000664 0000000 0000000 00000252244 14156664651 0020725 0 ustar 00root root 0000000 0000000 # Author: Steven J. Bethard .
"""Command-line parsing library
This module is an optparse-inspired command-line parsing library that:
- handles both optional and positional arguments
- produces highly informative usage messages
- supports parsers that dispatch to sub-parsers
The following is a simple usage example that sums integers from the
command-line and writes the result to a file::
parser = argparse.ArgumentParser(
description='sum the integers at the command line')
parser.add_argument(
'integers', metavar='int', nargs='+', type=int,
help='an integer to be summed')
parser.add_argument(
'--log', default=sys.stdout, type=argparse.FileType('w'),
help='the file where the sum should be written')
args = parser.parse_args()
args.log.write('%s' % sum(args.integers))
args.log.close()
The module contains the following public classes:
- ArgumentParser -- The main entry point for command-line parsing. As the
example above shows, the add_argument() method is used to populate
the parser with actions for optional and positional arguments. Then
the parse_args() method is invoked to convert the args at the
command-line into an object with attributes.
- ArgumentError -- The exception raised by ArgumentParser objects when
there are errors with the parser's actions. Errors raised while
parsing the command-line are caught by ArgumentParser and emitted
as command-line messages.
- FileType -- A factory for defining types of files to be created. As the
example above shows, instances of FileType are typically passed as
the type= argument of add_argument() calls.
- Action -- The base class for parser actions. Typically actions are
selected by passing strings like 'store_true' or 'append_const' to
the action= argument of add_argument(). However, for greater
customization of ArgumentParser actions, subclasses of Action may
be defined and passed as the action= argument.
- HelpFormatter, RawDescriptionHelpFormatter, RawTextHelpFormatter,
ArgumentDefaultsHelpFormatter -- Formatter classes which
may be passed as the formatter_class= argument to the
ArgumentParser constructor. HelpFormatter is the default,
RawDescriptionHelpFormatter and RawTextHelpFormatter tell the parser
not to change the formatting for help text, and
ArgumentDefaultsHelpFormatter adds information about argument defaults
to the help.
All other classes in this module are considered implementation details.
(Also note that HelpFormatter and RawDescriptionHelpFormatter are only
considered public as object names -- the API of the formatter objects is
still considered an implementation detail.)
"""
__version__ = '1.2.1'
__all__ = [
'ArgumentParser',
'ArgumentError',
'ArgumentTypeError',
'FileType',
'HelpFormatter',
'ArgumentDefaultsHelpFormatter',
'RawDescriptionHelpFormatter',
'RawTextHelpFormatter',
'Namespace',
'Action',
'ONE_OR_MORE',
'OPTIONAL',
'PARSER',
'REMAINDER',
'SUPPRESS',
'ZERO_OR_MORE',
]
import copy as _copy
import os as _os
import re as _re
import sys as _sys
import textwrap as _textwrap
from gettext import gettext as _
try:
set
except NameError:
# for python < 2.4 compatibility (sets module is there since 2.3):
from sets import Set as set
try:
basestring
except NameError:
basestring = str
try:
sorted
except NameError:
# for python < 2.4 compatibility:
def sorted(iterable, reverse=False):
# the builtin is absent in this branch, so sort a copy by hand;
# calling sorted() from inside this shim would recurse forever
result = list(iterable)
result.sort()
if reverse:
result.reverse()
return result
def _callable(obj):
return hasattr(obj, '__call__') or hasattr(obj, '__bases__')
SUPPRESS = '==SUPPRESS=='
OPTIONAL = '?'
ZERO_OR_MORE = '*'
ONE_OR_MORE = '+'
PARSER = 'A...'
REMAINDER = '...'
_UNRECOGNIZED_ARGS_ATTR = '_unrecognized_args'
# =============================
# Utility functions and classes
# =============================
class _AttributeHolder:
"""Abstract base class that provides __repr__.
The __repr__ method returns a string in the format::
ClassName(attr=name, attr=name, ...)
The attributes are determined either by a class-level attribute,
'_kwarg_names', or by inspecting the instance __dict__.
"""
def __repr__(self):
type_name = type(self).__name__
arg_strings = []
for arg in self._get_args():
arg_strings.append(repr(arg))
for name, value in self._get_kwargs():
arg_strings.append(f'{name}={value!r}')
return '{}({})'.format(type_name, ', '.join(arg_strings))
def _get_kwargs(self):
return sorted(self.__dict__.items())
def _get_args(self):
return []
def _ensure_value(namespace, name, value):
if getattr(namespace, name, None) is None:
setattr(namespace, name, value)
return getattr(namespace, name)
# ===============
# Formatting Help
# ===============
class HelpFormatter:
"""Formatter for generating usage messages and argument help strings.
Only the name of this class is considered a public API. All the methods
provided by the class are considered an implementation detail.
"""
def __init__(self,
prog,
indent_increment=2,
max_help_position=24,
width=None):
# default setting for width
if width is None:
try:
width = int(_os.environ['COLUMNS'])
except (KeyError, ValueError):
width = 80
width -= 2
self._prog = prog
self._indent_increment = indent_increment
self._max_help_position = max_help_position
self._width = width
self._current_indent = 0
self._level = 0
self._action_max_length = 0
self._root_section = self._Section(self, None)
self._current_section = self._root_section
self._whitespace_matcher = _re.compile(r'\s+')
self._long_break_matcher = _re.compile(r'\n\n\n+')
# ===============================
# Section and indentation methods
# ===============================
def _indent(self):
self._current_indent += self._indent_increment
self._level += 1
def _dedent(self):
self._current_indent -= self._indent_increment
assert self._current_indent >= 0, 'Indent decreased below 0.'
self._level -= 1
class _Section:
def __init__(self, formatter, parent, heading=None):
self.formatter = formatter
self.parent = parent
self.heading = heading
self.items = []
def format_help(self):
# format the indented section
if self.parent is not None:
self.formatter._indent()
join = self.formatter._join_parts
item_help = join([func(*args) for func, args in self.items])
if self.parent is not None:
self.formatter._dedent()
# return nothing if the section was empty
if not item_help:
return ''
# add the heading if the section was non-empty
if self.heading is not SUPPRESS and self.heading is not None:
current_indent = self.formatter._current_indent
heading = '%*s%s:\n' % (current_indent, '', self.heading)
else:
heading = ''
# join the section-initial newline, the heading and the help
return join(['\n', heading, item_help, '\n'])
def _add_item(self, func, args):
self._current_section.items.append((func, args))
# ========================
# Message building methods
# ========================
def start_section(self, heading):
self._indent()
section = self._Section(self, self._current_section, heading)
self._add_item(section.format_help, [])
self._current_section = section
def end_section(self):
self._current_section = self._current_section.parent
self._dedent()
def add_text(self, text):
if text is not SUPPRESS and text is not None:
self._add_item(self._format_text, [text])
def add_usage(self, usage, actions, groups, prefix=None):
if usage is not SUPPRESS:
args = usage, actions, groups, prefix
self._add_item(self._format_usage, args)
def add_argument(self, action):
if action.help is not SUPPRESS:
# find all invocations
get_invocation = self._format_action_invocation
invocations = [get_invocation(action)]
for subaction in self._iter_indented_subactions(action):
invocations.append(get_invocation(subaction))
# update the maximum item length
invocation_length = max(len(s) for s in invocations)
action_length = invocation_length + self._current_indent
self._action_max_length = max(self._action_max_length,
action_length)
# add the item to the list
self._add_item(self._format_action, [action])
def add_arguments(self, actions):
for action in actions:
self.add_argument(action)
# =======================
# Help-formatting methods
# =======================
def format_help(self):
help = self._root_section.format_help()
if help:
help = self._long_break_matcher.sub('\n\n', help)
help = help.strip('\n') + '\n'
return help
def _join_parts(self, part_strings):
return ''.join([part
for part in part_strings
if part and part is not SUPPRESS])
def _format_usage(self, usage, actions, groups, prefix):
if prefix is None:
prefix = _('usage: ')
# if usage is specified, use that
if usage is not None:
usage = usage % dict(prog=self._prog)
# if no optionals or positionals are available, usage is just prog
elif usage is None and not actions:
usage = '%(prog)s' % dict(prog=self._prog)
# if optionals and positionals are available, calculate usage
elif usage is None:
prog = '%(prog)s' % dict(prog=self._prog)
# split optionals from positionals
optionals = []
positionals = []
for action in actions:
if action.option_strings:
optionals.append(action)
else:
positionals.append(action)
# build full usage string
format = self._format_actions_usage
action_usage = format(optionals + positionals, groups)
usage = ' '.join([s for s in [prog, action_usage] if s])
# wrap the usage parts if it's too long
text_width = self._width - self._current_indent
if len(prefix) + len(usage) > text_width:
# break usage into wrappable parts
part_regexp = r'\(.*?\)+|\[.*?\]+|\S+'
opt_usage = format(optionals, groups)
pos_usage = format(positionals, groups)
opt_parts = _re.findall(part_regexp, opt_usage)
pos_parts = _re.findall(part_regexp, pos_usage)
assert ' '.join(opt_parts) == opt_usage
assert ' '.join(pos_parts) == pos_usage
# helper for wrapping lines
def get_lines(parts, indent, prefix=None):
lines = []
line = []
if prefix is not None:
line_len = len(prefix) - 1
else:
line_len = len(indent) - 1
for part in parts:
if line_len + 1 + len(part) > text_width:
lines.append(indent + ' '.join(line))
line = []
line_len = len(indent) - 1
line.append(part)
line_len += len(part) + 1
if line:
lines.append(indent + ' '.join(line))
if prefix is not None:
lines[0] = lines[0][len(indent):]
return lines
# if prog is short, follow it with optionals or positionals
if len(prefix) + len(prog) <= 0.75 * text_width:
indent = ' ' * (len(prefix) + len(prog) + 1)
if opt_parts:
lines = get_lines([prog] + opt_parts, indent, prefix)
lines.extend(get_lines(pos_parts, indent))
elif pos_parts:
lines = get_lines([prog] + pos_parts, indent, prefix)
else:
lines = [prog]
# if prog is long, put it on its own line
else:
indent = ' ' * len(prefix)
parts = opt_parts + pos_parts
lines = get_lines(parts, indent)
if len(lines) > 1:
lines = []
lines.extend(get_lines(opt_parts, indent))
lines.extend(get_lines(pos_parts, indent))
lines = [prog] + lines
# join lines into usage
usage = '\n'.join(lines)
# prefix with 'usage:'
return f'{prefix}{usage}\n\n'
def _format_actions_usage(self, actions, groups):
# find group indices and identify actions in groups
group_actions = set()
inserts = {}
for group in groups:
try:
start = actions.index(group._group_actions[0])
except ValueError:
continue
else:
end = start + len(group._group_actions)
if actions[start:end] == group._group_actions:
for action in group._group_actions:
group_actions.add(action)
if not group.required:
if start in inserts:
inserts[start] += ' ['
else:
inserts[start] = '['
inserts[end] = ']'
else:
if start in inserts:
inserts[start] += ' ('
else:
inserts[start] = '('
inserts[end] = ')'
for i in range(start + 1, end):
inserts[i] = '|'
# collect all actions format strings
parts = []
for i, action in enumerate(actions):
# suppressed arguments are marked with None
# remove | separators for suppressed arguments
if action.help is SUPPRESS:
parts.append(None)
if inserts.get(i) == '|':
inserts.pop(i)
elif inserts.get(i + 1) == '|':
inserts.pop(i + 1)
# produce all arg strings
elif not action.option_strings:
part = self._format_args(action, action.dest)
# if it's in a group, strip the outer []
if action in group_actions:
if part[0] == '[' and part[-1] == ']':
part = part[1:-1]
# add the action string to the list
parts.append(part)
# produce the first way to invoke the option in brackets
else:
option_string = action.option_strings[0]
# if the Optional doesn't take a value, format is:
# -s or --long
if action.nargs == 0:
part = '%s' % option_string
# if the Optional takes a value, format is:
# -s ARGS or --long ARGS
else:
default = action.dest.upper()
args_string = self._format_args(action, default)
part = f'{option_string} {args_string}'
# make it look optional if it's not required or in a group
if not action.required and action not in group_actions:
part = '[%s]' % part
# add the action string to the list
parts.append(part)
# insert things at the necessary indices
for i in sorted(inserts, reverse=True):
parts[i:i] = [inserts[i]]
# join all the action items with spaces
text = ' '.join([item for item in parts if item is not None])
# clean up separators for mutually exclusive groups
open = r'[\[(]'
close = r'[\])]'
text = _re.sub(r'(%s) ' % open, r'\1', text)
text = _re.sub(r' (%s)' % close, r'\1', text)
text = _re.sub(fr'{open} *{close}', r'', text)
text = _re.sub(r'\(([^|]*)\)', r'\1', text)
text = text.strip()
# return the text
return text
def _format_text(self, text):
if '%(prog)' in text:
text = text % dict(prog=self._prog)
text_width = self._width - self._current_indent
indent = ' ' * self._current_indent
return self._fill_text(text, text_width, indent) + '\n\n'
def _format_action(self, action):
# determine the required width and the entry label
help_position = min(self._action_max_length + 2,
self._max_help_position)
help_width = self._width - help_position
action_width = help_position - self._current_indent - 2
action_header = self._format_action_invocation(action)
# no help; start on same line and add a final newline
if not action.help:
tup = self._current_indent, '', action_header
action_header = '%*s%s\n' % tup
# short action name; start on the same line and pad two spaces
elif len(action_header) <= action_width:
tup = self._current_indent, '', action_width, action_header
action_header = '%*s%-*s ' % tup
indent_first = 0
# long action name; start on the next line
else:
tup = self._current_indent, '', action_header
action_header = '%*s%s\n' % tup
indent_first = help_position
# collect the pieces of the action help
parts = [action_header]
# if there was help for the action, add lines of help text
if action.help:
help_text = self._expand_help(action)
help_lines = self._split_lines(help_text, help_width)
parts.append('%*s%s\n' % (indent_first, '', help_lines[0]))
for line in help_lines[1:]:
parts.append('%*s%s\n' % (help_position, '', line))
# or add a newline if the description doesn't end with one
elif not action_header.endswith('\n'):
parts.append('\n')
# if there are any sub-actions, add their help as well
for subaction in self._iter_indented_subactions(action):
parts.append(self._format_action(subaction))
# return a single string
return self._join_parts(parts)
def _format_action_invocation(self, action):
if not action.option_strings:
metavar, = self._metavar_formatter(action, action.dest)(1)
return metavar
else:
parts = []
# if the Optional doesn't take a value, format is:
# -s, --long
if action.nargs == 0:
parts.extend(action.option_strings)
# if the Optional takes a value, format is:
# -s ARGS, --long ARGS
else:
default = action.dest.upper()
args_string = self._format_args(action, default)
for option_string in action.option_strings:
parts.append(f'{option_string} {args_string}')
return ', '.join(parts)
def _metavar_formatter(self, action, default_metavar):
if action.metavar is not None:
result = action.metavar
elif action.choices is not None:
choice_strs = [str(choice) for choice in action.choices]
result = '{%s}' % ','.join(choice_strs)
else:
result = default_metavar
def format(tuple_size):
if isinstance(result, tuple):
return result
else:
return (result, ) * tuple_size
return format
def _format_args(self, action, default_metavar):
get_metavar = self._metavar_formatter(action, default_metavar)
if action.nargs is None:
result = '%s' % get_metavar(1)
elif action.nargs == OPTIONAL:
result = '[%s]' % get_metavar(1)
elif action.nargs == ZERO_OR_MORE:
result = '[%s [%s ...]]' % get_metavar(2)
elif action.nargs == ONE_OR_MORE:
result = '%s [%s ...]' % get_metavar(2)
elif action.nargs == REMAINDER:
result = '...'
elif action.nargs == PARSER:
result = '%s ...' % get_metavar(1)
else:
formats = ['%s' for _ in range(action.nargs)]
result = ' '.join(formats) % get_metavar(action.nargs)
return result
def _expand_help(self, action):
params = dict(vars(action), prog=self._prog)
for name in list(params):
if params[name] is SUPPRESS:
del params[name]
for name in list(params):
if hasattr(params[name], '__name__'):
params[name] = params[name].__name__
if params.get('choices') is not None:
choices_str = ', '.join([str(c) for c in params['choices']])
params['choices'] = choices_str
return self._get_help_string(action) % params
def _iter_indented_subactions(self, action):
try:
get_subactions = action._get_subactions
except AttributeError:
pass
else:
self._indent()
yield from get_subactions()
self._dedent()
def _split_lines(self, text, width):
text = self._whitespace_matcher.sub(' ', text).strip()
return _textwrap.wrap(text, width)
def _fill_text(self, text, width, indent):
text = self._whitespace_matcher.sub(' ', text).strip()
return _textwrap.fill(text, width, initial_indent=indent, subsequent_indent=indent)
def _get_help_string(self, action):
return action.help
class RawDescriptionHelpFormatter(HelpFormatter):
"""Help message formatter which retains any formatting in descriptions.
Only the name of this class is considered a public API. All the methods
provided by the class are considered an implementation detail.
"""
def _fill_text(self, text, width, indent):
return ''.join([indent + line for line in text.splitlines(True)])
class RawTextHelpFormatter(RawDescriptionHelpFormatter):
"""Help message formatter which retains formatting of all help text.
Only the name of this class is considered a public API. All the methods
provided by the class are considered an implementation detail.
"""
def _split_lines(self, text, width):
return text.splitlines()
class ArgumentDefaultsHelpFormatter(HelpFormatter):
"""Help message formatter which adds default values to argument help.
Only the name of this class is considered a public API. All the methods
provided by the class are considered an implementation detail.
"""
def _get_help_string(self, action):
help = action.help
if '%(default)' not in action.help:
if action.default is not SUPPRESS:
defaulting_nargs = [OPTIONAL, ZERO_OR_MORE]
if action.option_strings or action.nargs in defaulting_nargs:
help += ' (default: %(default)s)'
return help
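# Illustrative use of ArgumentDefaultsHelpFormatter (a sketch; the option
# name is hypothetical): defaults are appended to each option's help text.
#
#     parser = ArgumentParser(prog='tool',
#                             formatter_class=ArgumentDefaultsHelpFormatter)
#     parser.add_argument('--retries', type=int, default=3, help='retry count')
#     parser.print_help()    # help shows "retry count (default: 3)"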
# =====================
# Options and Arguments
# =====================
def _get_action_name(argument):
if argument is None:
return None
elif argument.option_strings:
return '/'.join(argument.option_strings)
elif argument.metavar not in (None, SUPPRESS):
return argument.metavar
elif argument.dest not in (None, SUPPRESS):
return argument.dest
else:
return None
class ArgumentError(Exception):
"""An error from creating or using an argument (optional or positional).
The string value of this exception is the message, augmented with
information about the argument that caused it.
"""
def __init__(self, argument, message):
self.argument_name = _get_action_name(argument)
self.message = message
def __str__(self):
if self.argument_name is None:
format = '%(message)s'
else:
format = 'argument %(argument_name)s: %(message)s'
return format % dict(message=self.message,
argument_name=self.argument_name)
class ArgumentTypeError(Exception):
"""An error from trying to convert a command line string to a type."""
# ==============
# Action classes
# ==============
class Action(_AttributeHolder):
"""Information about how to convert command line strings to Python objects.
Action objects are used by an ArgumentParser to represent the information
needed to parse a single argument from one or more strings from the
command line. The keyword arguments to the Action constructor are also
all attributes of Action instances.
Keyword Arguments:
- option_strings -- A list of command-line option strings which
should be associated with this action.
- dest -- The name of the attribute to hold the created object(s)
- nargs -- The number of command-line arguments that should be
consumed. By default, one argument will be consumed and a single
value will be produced. Other values include:
- N (an integer) consumes N arguments (and produces a list)
- '?' consumes zero or one arguments
- '*' consumes zero or more arguments (and produces a list)
- '+' consumes one or more arguments (and produces a list)
Note that the difference between the default and nargs=1 is that
with the default, a single value will be produced, while with
nargs=1, a list containing a single value will be produced.
- const -- The value to be produced if the option is specified and the
option uses an action that takes no values.
- default -- The value to be produced if the option is not specified.
- type -- The type which the command-line arguments should be converted
to, should be one of 'string', 'int', 'float', 'complex' or a
callable object that accepts a single string argument. If None,
'string' is assumed.
- choices -- A container of values that should be allowed. If not None,
after a command-line argument has been converted to the appropriate
type, an exception will be raised if it is not a member of this
collection.
- required -- True if the action must always be specified at the
command line. This is only meaningful for optional command-line
arguments.
- help -- The help string describing the argument.
- metavar -- The name to be used for the option's argument with the
help string. If None, the 'dest' value will be used as the name.
"""
def __init__(self,
option_strings,
dest,
nargs=None,
const=None,
default=None,
type=None,
choices=None,
required=False,
help=None,
metavar=None):
self.option_strings = option_strings
self.dest = dest
self.nargs = nargs
self.const = const
self.default = default
self.type = type
self.choices = choices
self.required = required
self.help = help
self.metavar = metavar
def _get_kwargs(self):
names = [
'option_strings',
'dest',
'nargs',
'const',
'default',
'type',
'choices',
'help',
'metavar',
]
return [(name, getattr(self, name)) for name in names]
def __call__(self, parser, namespace, values, option_string=None):
raise NotImplementedError(_('.__call__() not defined'))
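# The nargs values documented above map to parse results roughly like
# this (an illustrative sketch; flag names are hypothetical):
#
#     parser = ArgumentParser()
#     parser.add_argument('--a')              # nargs=None -> single value
#     parser.add_argument('--b', nargs=1)     # -> list with one value
#     parser.add_argument('--c', nargs='*')   # -> list, possibly empty
#     parser.parse_args('--a 1 --b 2 --c 3 4'.split())
#     # Namespace(a='1', b=['2'], c=['3', '4'])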
class _StoreAction(Action):
def __init__(self,
option_strings,
dest,
nargs=None,
const=None,
default=None,
type=None,
choices=None,
required=False,
help=None,
metavar=None):
if nargs == 0:
raise ValueError('nargs for store actions must be > 0; if you '
'have nothing to store, actions such as store '
'true or store const may be more appropriate')
if const is not None and nargs != OPTIONAL:
raise ValueError('nargs must be %r to supply const' % OPTIONAL)
super().__init__(
option_strings=option_strings,
dest=dest,
nargs=nargs,
const=const,
default=default,
type=type,
choices=choices,
required=required,
help=help,
metavar=metavar)
def __call__(self, parser, namespace, values, option_string=None):
setattr(namespace, self.dest, values)
class _StoreConstAction(Action):
def __init__(self,
option_strings,
dest,
const,
default=None,
required=False,
help=None,
metavar=None):
super().__init__(
option_strings=option_strings,
dest=dest,
nargs=0,
const=const,
default=default,
required=required,
help=help)
def __call__(self, parser, namespace, values, option_string=None):
setattr(namespace, self.dest, self.const)
class _StoreTrueAction(_StoreConstAction):
def __init__(self,
option_strings,
dest,
default=False,
required=False,
help=None):
super().__init__(
option_strings=option_strings,
dest=dest,
const=True,
default=default,
required=required,
help=help)
class _StoreFalseAction(_StoreConstAction):
def __init__(self,
option_strings,
dest,
default=True,
required=False,
help=None):
super().__init__(
option_strings=option_strings,
dest=dest,
const=False,
default=default,
required=required,
help=help)
class _AppendAction(Action):
def __init__(self,
option_strings,
dest,
nargs=None,
const=None,
default=None,
type=None,
choices=None,
required=False,
help=None,
metavar=None):
if nargs == 0:
raise ValueError('nargs for append actions must be > 0; if arg '
'strings are not supplying the value to append, '
'the append const action may be more appropriate')
if const is not None and nargs != OPTIONAL:
raise ValueError('nargs must be %r to supply const' % OPTIONAL)
super().__init__(
option_strings=option_strings,
dest=dest,
nargs=nargs,
const=const,
default=default,
type=type,
choices=choices,
required=required,
help=help,
metavar=metavar)
def __call__(self, parser, namespace, values, option_string=None):
items = _copy.copy(_ensure_value(namespace, self.dest, []))
items.append(values)
setattr(namespace, self.dest, items)
class _AppendConstAction(Action):
def __init__(self,
option_strings,
dest,
const,
default=None,
required=False,
help=None,
metavar=None):
super().__init__(
option_strings=option_strings,
dest=dest,
nargs=0,
const=const,
default=default,
required=required,
help=help,
metavar=metavar)
def __call__(self, parser, namespace, values, option_string=None):
items = _copy.copy(_ensure_value(namespace, self.dest, []))
items.append(self.const)
setattr(namespace, self.dest, items)
class _CountAction(Action):
def __init__(self,
option_strings,
dest,
default=None,
required=False,
help=None):
super().__init__(
option_strings=option_strings,
dest=dest,
nargs=0,
default=default,
required=required,
help=help)
def __call__(self, parser, namespace, values, option_string=None):
new_count = _ensure_value(namespace, self.dest, 0) + 1
setattr(namespace, self.dest, new_count)
class _HelpAction(Action):
def __init__(self,
option_strings,
dest=SUPPRESS,
default=SUPPRESS,
help=None):
super().__init__(
option_strings=option_strings,
dest=dest,
default=default,
nargs=0,
help=help)
def __call__(self, parser, namespace, values, option_string=None):
parser.print_help()
parser.exit()
class _VersionAction(Action):
def __init__(self,
option_strings,
version=None,
dest=SUPPRESS,
default=SUPPRESS,
help="show program's version number and exit"):
super().__init__(
option_strings=option_strings,
dest=dest,
default=default,
nargs=0,
help=help)
self.version = version
def __call__(self, parser, namespace, values, option_string=None):
version = self.version
if version is None:
version = parser.version
formatter = parser._get_formatter()
formatter.add_text(version)
parser.exit(message=formatter.format_help())
class _SubParsersAction(Action):
class _ChoicesPseudoAction(Action):
def __init__(self, name, help):
sup = super(_SubParsersAction._ChoicesPseudoAction, self)
sup.__init__(option_strings=[], dest=name, help=help)
def __init__(self,
option_strings,
prog,
parser_class,
dest=SUPPRESS,
help=None,
metavar=None):
self._prog_prefix = prog
self._parser_class = parser_class
self._name_parser_map = {}
self._choices_actions = []
super().__init__(
option_strings=option_strings,
dest=dest,
nargs=PARSER,
choices=self._name_parser_map,
help=help,
metavar=metavar)
def add_parser(self, name, **kwargs):
# set prog from the existing prefix
if kwargs.get('prog') is None:
kwargs['prog'] = f'{self._prog_prefix} {name}'
# create a pseudo-action to hold the choice help
if 'help' in kwargs:
help = kwargs.pop('help')
choice_action = self._ChoicesPseudoAction(name, help)
self._choices_actions.append(choice_action)
# create the parser and add it to the map
parser = self._parser_class(**kwargs)
self._name_parser_map[name] = parser
return parser
def _get_subactions(self):
return self._choices_actions
def __call__(self, parser, namespace, values, option_string=None):
parser_name = values[0]
arg_strings = values[1:]
# set the parser name if requested
if self.dest is not SUPPRESS:
setattr(namespace, self.dest, parser_name)
# select the parser
try:
parser = self._name_parser_map[parser_name]
except KeyError:
tup = parser_name, ', '.join(self._name_parser_map)
msg = _('unknown parser %r (choices: %s)' % tup)
raise ArgumentError(self, msg)
# parse all the remaining options into the namespace
# store any unrecognized options on the object, so that the top
# level parser can decide what to do with them
namespace, arg_strings = parser.parse_known_args(arg_strings, namespace)
if arg_strings:
vars(namespace).setdefault(_UNRECOGNIZED_ARGS_ATTR, [])
getattr(namespace, _UNRECOGNIZED_ARGS_ATTR).extend(arg_strings)
# ==============
# Type classes
# ==============
class FileType:
"""Factory for creating file object types
Instances of FileType are typically passed as type= arguments to the
ArgumentParser add_argument() method.
Keyword Arguments:
- mode -- A string indicating how the file is to be opened. Accepts the
same values as the builtin open() function.
- bufsize -- The file's desired buffer size. Accepts the same values as
the builtin open() function.
"""
def __init__(self, mode='r', bufsize=None):
self._mode = mode
self._bufsize = bufsize
def __call__(self, string):
# the special argument "-" means sys.std{in,out}
if string == '-':
if 'r' in self._mode:
return _sys.stdin
elif 'w' in self._mode:
return _sys.stdout
else:
msg = _('argument "-" with mode %r' % self._mode)
raise ValueError(msg)
# all other arguments are used as file names
if self._bufsize:
return open(string, self._mode, self._bufsize)
else:
return open(string, self._mode)
def __repr__(self):
args = [self._mode, self._bufsize]
args_str = ', '.join([repr(arg) for arg in args if arg is not None])
return f'{type(self).__name__}({args_str})'
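# FileType in use (an illustrative sketch): '-' maps to stdin/stdout as
# described above; any other string is opened as a file name.
#
#     parser = ArgumentParser()
#     parser.add_argument('infile', type=FileType('r'))
#     args = parser.parse_args(['-'])    # args.infile is sys.stdin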
# ===========================
# Optional and Positional Parsing
# ===========================
class Namespace(_AttributeHolder):
"""Simple object for storing attributes.
Implements equality by attribute names and values, and provides a simple
string representation.
"""
def __init__(self, **kwargs):
for name in kwargs:
setattr(self, name, kwargs[name])
__hash__ = None
def __eq__(self, other):
return vars(self) == vars(other)
def __ne__(self, other):
return not (self == other)
def __contains__(self, key):
return key in self.__dict__
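# Namespace equality and membership are attribute-based (illustrative):
#
#     Namespace(x=1) == Namespace(x=1)    # True
#     'x' in Namespace(x=1)               # True, via __contains__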
class _ActionsContainer:
def __init__(self,
description,
prefix_chars,
argument_default,
conflict_handler):
super().__init__()
self.description = description
self.argument_default = argument_default
self.prefix_chars = prefix_chars
self.conflict_handler = conflict_handler
# set up registries
self._registries = {}
# register actions
self.register('action', None, _StoreAction)
self.register('action', 'store', _StoreAction)
self.register('action', 'store_const', _StoreConstAction)
self.register('action', 'store_true', _StoreTrueAction)
self.register('action', 'store_false', _StoreFalseAction)
self.register('action', 'append', _AppendAction)
self.register('action', 'append_const', _AppendConstAction)
self.register('action', 'count', _CountAction)
self.register('action', 'help', _HelpAction)
self.register('action', 'version', _VersionAction)
self.register('action', 'parsers', _SubParsersAction)
# raise an exception if the conflict handler is invalid
self._get_handler()
# action storage
self._actions = []
self._option_string_actions = {}
# groups
self._action_groups = []
self._mutually_exclusive_groups = []
# defaults storage
self._defaults = {}
# determines whether an "option" looks like a negative number
self._negative_number_matcher = _re.compile(r'^-\d+$|^-\d*\.\d+$')
# whether or not there are any optionals that look like negative
# numbers -- uses a list so it can be shared and edited
self._has_negative_number_optionals = []
# ====================
# Registration methods
# ====================
def register(self, registry_name, value, object):
registry = self._registries.setdefault(registry_name, {})
registry[value] = object
def _registry_get(self, registry_name, value, default=None):
return self._registries[registry_name].get(value, default)
# ==================================
# Namespace default accessor methods
# ==================================
def set_defaults(self, **kwargs):
self._defaults.update(kwargs)
# if these defaults match any existing arguments, replace
# the previous default on the object with the new one
for action in self._actions:
if action.dest in kwargs:
action.default = kwargs[action.dest]
def get_default(self, dest):
for action in self._actions:
if action.dest == dest and action.default is not None:
return action.default
return self._defaults.get(dest, None)
# =======================
# Adding argument actions
# =======================
def add_argument(self, *args, **kwargs):
"""
add_argument(dest, ..., name=value, ...)
add_argument(option_string, option_string, ..., name=value, ...)
"""
# if no positional args are supplied or only one is supplied and
# it doesn't look like an option string, parse a positional
# argument
chars = self.prefix_chars
if not args or len(args) == 1 and args[0][0] not in chars:
if args and 'dest' in kwargs:
raise ValueError('dest supplied twice for positional argument')
kwargs = self._get_positional_kwargs(*args, **kwargs)
# otherwise, we're adding an optional argument
else:
kwargs = self._get_optional_kwargs(*args, **kwargs)
# if no default was supplied, use the parser-level default
if 'default' not in kwargs:
dest = kwargs['dest']
if dest in self._defaults:
kwargs['default'] = self._defaults[dest]
elif self.argument_default is not None:
kwargs['default'] = self.argument_default
# create the action object, and add it to the parser
action_class = self._pop_action_class(kwargs)
if not _callable(action_class):
raise ValueError('unknown action "%s"' % action_class)
action = action_class(**kwargs)
# raise an error if the action type is not callable
type_func = self._registry_get('type', action.type, action.type)
if not _callable(type_func):
raise ValueError('%r is not callable' % type_func)
return self._add_action(action)
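# Both calling conventions from the docstring above, sketched (argument
# names are hypothetical):
#
#     parser.add_argument('src')                      # positional: dest='src'
#     parser.add_argument('-o', '--out', dest='out')  # optional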
def add_argument_group(self, *args, **kwargs):
group = _ArgumentGroup(self, *args, **kwargs)
self._action_groups.append(group)
return group
def add_mutually_exclusive_group(self, **kwargs):
group = _MutuallyExclusiveGroup(self, **kwargs)
self._mutually_exclusive_groups.append(group)
return group
def _add_action(self, action):
# resolve any conflicts
self._check_conflict(action)
# add to actions list
self._actions.append(action)
action.container = self
# index the action by any option strings it has
for option_string in action.option_strings:
self._option_string_actions[option_string] = action
# set the flag if any option strings look like negative numbers
for option_string in action.option_strings:
if self._negative_number_matcher.match(option_string):
if not self._has_negative_number_optionals:
self._has_negative_number_optionals.append(True)
# return the created action
return action
def _remove_action(self, action):
self._actions.remove(action)
def _add_container_actions(self, container):
# collect groups by titles
title_group_map = {}
for group in self._action_groups:
if group.title in title_group_map:
msg = _('cannot merge actions - two groups are named %r')
raise ValueError(msg % (group.title))
title_group_map[group.title] = group
# map each action to its group
group_map = {}
for group in container._action_groups:
# if a group with the title exists, use that, otherwise
# create a new group matching the container's group
if group.title not in title_group_map:
title_group_map[group.title] = self.add_argument_group(
title=group.title,
description=group.description,
conflict_handler=group.conflict_handler)
# map the actions to their new group
for action in group._group_actions:
group_map[action] = title_group_map[group.title]
# add container's mutually exclusive groups
# NOTE: if add_mutually_exclusive_group ever gains title= and
# description= then this code will need to be expanded as above
for group in container._mutually_exclusive_groups:
mutex_group = self.add_mutually_exclusive_group(
required=group.required)
# map the actions to their new mutex group
for action in group._group_actions:
group_map[action] = mutex_group
# add all actions to this container or their group
for action in container._actions:
group_map.get(action, self)._add_action(action)
def _get_positional_kwargs(self, dest, **kwargs):
# make sure required is not specified
if 'required' in kwargs:
msg = _("'required' is an invalid argument for positionals")
raise TypeError(msg)
# mark positional arguments as required if at least one is
# always required
if kwargs.get('nargs') not in [OPTIONAL, ZERO_OR_MORE]:
kwargs['required'] = True
if kwargs.get('nargs') == ZERO_OR_MORE and 'default' not in kwargs:
kwargs['required'] = True
# return the keyword arguments with no option strings
return dict(kwargs, dest=dest, option_strings=[])
def _get_optional_kwargs(self, *args, **kwargs):
# determine short and long option strings
option_strings = []
long_option_strings = []
for option_string in args:
# error on strings that don't start with an appropriate prefix
if not option_string[0] in self.prefix_chars:
msg = _('invalid option string %r: '
'must start with a character %r')
tup = option_string, self.prefix_chars
raise ValueError(msg % tup)
# strings starting with two prefix characters are long options
option_strings.append(option_string)
if option_string[0] in self.prefix_chars:
if len(option_string) > 1:
if option_string[1] in self.prefix_chars:
long_option_strings.append(option_string)
# infer destination, '--foo-bar' -> 'foo_bar' and '-x' -> 'x'
dest = kwargs.pop('dest', None)
if dest is None:
if long_option_strings:
dest_option_string = long_option_strings[0]
else:
dest_option_string = option_strings[0]
dest = dest_option_string.lstrip(self.prefix_chars)
if not dest:
msg = _('dest= is required for options like %r')
raise ValueError(msg % option_string)
dest = dest.replace('-', '_')
# return the updated keyword arguments
return dict(kwargs, dest=dest, option_strings=option_strings)
def _pop_action_class(self, kwargs, default=None):
action = kwargs.pop('action', default)
return self._registry_get('action', action, action)
def _get_handler(self):
# determine function from conflict handler string
handler_func_name = '_handle_conflict_%s' % self.conflict_handler
try:
return getattr(self, handler_func_name)
except AttributeError:
msg = _('invalid conflict_resolution value: %r')
raise ValueError(msg % self.conflict_handler)
def _check_conflict(self, action):
# find all options that conflict with this option
confl_optionals = []
for option_string in action.option_strings:
if option_string in self._option_string_actions:
confl_optional = self._option_string_actions[option_string]
confl_optionals.append((option_string, confl_optional))
# resolve any conflicts
if confl_optionals:
conflict_handler = self._get_handler()
conflict_handler(action, confl_optionals)
def _handle_conflict_error(self, action, conflicting_actions):
message = _('conflicting option string(s): %s')
conflict_string = ', '.join([option_string
for option_string, _2
in conflicting_actions])
raise ArgumentError(action, message % conflict_string)
def _handle_conflict_resolve(self, action, conflicting_actions):
# remove all conflicting options
for option_string, action in conflicting_actions:
# remove the conflicting option
action.option_strings.remove(option_string)
self._option_string_actions.pop(option_string, None)
# if the option now has no option string, remove it from the
# container holding it
if not action.option_strings:
action.container._remove_action(action)
class _ArgumentGroup(_ActionsContainer):
def __init__(self, container, title=None, description=None, **kwargs):
# add any missing keyword arguments by checking the container
update = kwargs.setdefault
update('conflict_handler', container.conflict_handler)
update('prefix_chars', container.prefix_chars)
update('argument_default', container.argument_default)
super_init = super().__init__
super_init(description=description, **kwargs)
# group attributes
self.title = title
self._group_actions = []
# share most attributes with the container
self._registries = container._registries
self._actions = container._actions
self._option_string_actions = container._option_string_actions
self._defaults = container._defaults
self._has_negative_number_optionals = \
container._has_negative_number_optionals
def _add_action(self, action):
action = super()._add_action(action)
self._group_actions.append(action)
return action
def _remove_action(self, action):
super()._remove_action(action)
self._group_actions.remove(action)
class _MutuallyExclusiveGroup(_ArgumentGroup):
def __init__(self, container, required=False):
super().__init__(container)
self.required = required
self._container = container
def _add_action(self, action):
if action.required:
msg = _('mutually exclusive arguments must be optional')
raise ValueError(msg)
action = self._container._add_action(action)
self._group_actions.append(action)
return action
def _remove_action(self, action):
self._container._remove_action(action)
self._group_actions.remove(action)
class ArgumentParser(_AttributeHolder, _ActionsContainer):
"""Object for parsing command line strings into Python objects.
Keyword Arguments:
- prog -- The name of the program (default: sys.argv[0])
- usage -- A usage message (default: auto-generated from arguments)
- description -- A description of what the program does
- epilog -- Text following the argument descriptions
- parents -- Parsers whose arguments should be copied into this one
- formatter_class -- HelpFormatter class for printing help messages
- prefix_chars -- Characters that prefix optional arguments
- fromfile_prefix_chars -- Characters that prefix files containing
additional arguments
- argument_default -- The default value for all arguments
- conflict_handler -- String indicating how to handle conflicts
- add_help -- Add a -h/--help option
"""
def __init__(self,
prog=None,
usage=None,
description=None,
epilog=None,
version=None,
parents=[],
formatter_class=HelpFormatter,
prefix_chars='-',
fromfile_prefix_chars=None,
argument_default=None,
conflict_handler='error',
add_help=True):
if version is not None:
import warnings
warnings.warn(
"""The "version" argument to ArgumentParser is deprecated. """
"""Please use """
""""add_argument(..., action='version', version="N", ...)" """
"""instead""", DeprecationWarning)
superinit = super().__init__
superinit(description=description,
prefix_chars=prefix_chars,
argument_default=argument_default,
conflict_handler=conflict_handler)
# default setting for prog
if prog is None:
prog = _os.path.basename(_sys.argv[0])
self.prog = prog
self.usage = usage
self.epilog = epilog
self.version = version
self.formatter_class = formatter_class
self.fromfile_prefix_chars = fromfile_prefix_chars
self.add_help = add_help
add_group = self.add_argument_group
self._positionals = add_group(_('positional arguments'))
self._optionals = add_group(_('optional arguments'))
self._subparsers = None
# register types
def identity(string):
return string
self.register('type', None, identity)
# add help and version arguments if necessary
# (using explicit default to override global argument_default)
if '-' in prefix_chars:
default_prefix = '-'
else:
default_prefix = prefix_chars[0]
if self.add_help:
self.add_argument(
default_prefix+'h', default_prefix*2+'help',
action='help', default=SUPPRESS,
help=_('show this help message and exit'))
if self.version:
self.add_argument(
default_prefix+'v', default_prefix*2+'version',
action='version', default=SUPPRESS,
version=self.version,
help=_("show program's version number and exit"))
# add parent arguments and defaults
for parent in parents:
self._add_container_actions(parent)
try:
defaults = parent._defaults
except AttributeError:
pass
else:
self._defaults.update(defaults)
# =======================
# Pretty __repr__ methods
# =======================
def _get_kwargs(self):
names = [
'prog',
'usage',
'description',
'version',
'formatter_class',
'conflict_handler',
'add_help',
]
return [(name, getattr(self, name)) for name in names]
# ==================================
# Optional/Positional adding methods
# ==================================
def add_subparsers(self, **kwargs):
if self._subparsers is not None:
self.error(_('cannot have multiple subparser arguments'))
# add the parser class to the arguments if it's not present
kwargs.setdefault('parser_class', type(self))
if 'title' in kwargs or 'description' in kwargs:
title = _(kwargs.pop('title', 'subcommands'))
description = _(kwargs.pop('description', None))
self._subparsers = self.add_argument_group(title, description)
else:
self._subparsers = self._positionals
# prog defaults to the usage message of this parser, skipping
# optional arguments and with no "usage:" prefix
if kwargs.get('prog') is None:
formatter = self._get_formatter()
positionals = self._get_positional_actions()
groups = self._mutually_exclusive_groups
formatter.add_usage(self.usage, positionals, groups, '')
kwargs['prog'] = formatter.format_help().strip()
# create the parsers action and add it to the positionals list
parsers_class = self._pop_action_class(kwargs, 'parsers')
action = parsers_class(option_strings=[], **kwargs)
self._subparsers._add_action(action)
# return the created parsers action
return action
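# Sub-parser wiring in brief (an illustrative sketch; command names are
# hypothetical):
#
#     parser = ArgumentParser(prog='tool')
#     sub = parser.add_subparsers(dest='command')
#     fetch = sub.add_parser('fetch', help='download data')
#     fetch.add_argument('url')
#     args = parser.parse_args(['fetch', 'http://example.org'])
#     # args.command == 'fetch', args.url == 'http://example.org'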
def _add_action(self, action):
if action.option_strings:
self._optionals._add_action(action)
else:
self._positionals._add_action(action)
return action
def _get_optional_actions(self):
return [action
for action in self._actions
if action.option_strings]
def _get_positional_actions(self):
return [action
for action in self._actions
if not action.option_strings]
# =====================================
# Command line argument parsing methods
# =====================================
def parse_args(self, args=None, namespace=None):
args, argv = self.parse_known_args(args, namespace)
if argv:
msg = _('unrecognized arguments: %s')
self.error(msg % ' '.join(argv))
return args
def parse_known_args(self, args=None, namespace=None):
# args default to the system args
if args is None:
args = _sys.argv[1:]
# default Namespace built from parser defaults
if namespace is None:
namespace = Namespace()
# add any action defaults that aren't present
for action in self._actions:
if action.dest is not SUPPRESS:
if not hasattr(namespace, action.dest):
if action.default is not SUPPRESS:
default = action.default
if isinstance(action.default, basestring):
default = self._get_value(action, default)
setattr(namespace, action.dest, default)
# add any parser defaults that aren't present
for dest in self._defaults:
if not hasattr(namespace, dest):
setattr(namespace, dest, self._defaults[dest])
# parse the arguments and exit if there are any errors
try:
namespace, args = self._parse_known_args(args, namespace)
if hasattr(namespace, _UNRECOGNIZED_ARGS_ATTR):
args.extend(getattr(namespace, _UNRECOGNIZED_ARGS_ATTR))
delattr(namespace, _UNRECOGNIZED_ARGS_ATTR)
return namespace, args
except ArgumentError:
err = _sys.exc_info()[1]
self.error(str(err))
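# parse_known_args() returns unrecognized strings instead of erroring on
# them (illustrative):
#
#     parser = ArgumentParser()
#     parser.add_argument('--known')
#     ns, extras = parser.parse_known_args(['--known', '1', '--other'])
#     # ns.known == '1', extras == ['--other']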
def _parse_known_args(self, arg_strings, namespace):
# replace arg strings that are file references
if self.fromfile_prefix_chars is not None:
arg_strings = self._read_args_from_files(arg_strings)
# map all mutually exclusive arguments to the other arguments
# they can't occur with
action_conflicts = {}
for mutex_group in self._mutually_exclusive_groups:
group_actions = mutex_group._group_actions
for i, mutex_action in enumerate(mutex_group._group_actions):
conflicts = action_conflicts.setdefault(mutex_action, [])
conflicts.extend(group_actions[:i])
conflicts.extend(group_actions[i + 1:])
# find all option indices, and determine the arg_string_pattern
# which has an 'O' if there is an option at an index,
# an 'A' if there is an argument, or a '-' if there is a '--'
option_string_indices = {}
arg_string_pattern_parts = []
arg_strings_iter = iter(arg_strings)
for i, arg_string in enumerate(arg_strings_iter):
# all args after -- are non-options
if arg_string == '--':
arg_string_pattern_parts.append('-')
for arg_string in arg_strings_iter:
arg_string_pattern_parts.append('A')
# otherwise, add the arg to the arg strings
# and note the index if it was an option
else:
option_tuple = self._parse_optional(arg_string)
if option_tuple is None:
pattern = 'A'
else:
option_string_indices[i] = option_tuple
pattern = 'O'
arg_string_pattern_parts.append(pattern)
# join the pieces together to form the pattern
arg_strings_pattern = ''.join(arg_string_pattern_parts)
# convert arg strings to the appropriate type and then take the action
seen_actions = set()
seen_non_default_actions = set()
def take_action(action, argument_strings, option_string=None):
seen_actions.add(action)
argument_values = self._get_values(action, argument_strings)
# error if this argument is not allowed with other previously
# seen arguments, assuming that actions that use the default
# value don't really count as "present"
if argument_values is not action.default:
seen_non_default_actions.add(action)
for conflict_action in action_conflicts.get(action, []):
if conflict_action in seen_non_default_actions:
msg = _('not allowed with argument %s')
action_name = _get_action_name(conflict_action)
raise ArgumentError(action, msg % action_name)
# take the action if we didn't receive a SUPPRESS value
# (e.g. from a default)
if argument_values is not SUPPRESS:
action(self, namespace, argument_values, option_string)
# function to convert arg_strings into an optional action
def consume_optional(start_index):
# get the optional identified at this index
option_tuple = option_string_indices[start_index]
action, option_string, explicit_arg = option_tuple
# identify additional optionals in the same arg string
# (e.g. -xyz is the same as -x -y -z if no args are required)
match_argument = self._match_argument
action_tuples = []
while True:
# if we found no optional action, skip it
if action is None:
extras.append(arg_strings[start_index])
return start_index + 1
# if there is an explicit argument, try to match the
# optional's string arguments to only this
if explicit_arg is not None:
arg_count = match_argument(action, 'A')
# if the action is a single-dash option and takes no
# arguments, try to parse more single-dash options out
# of the tail of the option string
chars = self.prefix_chars
if arg_count == 0 and option_string[1] not in chars:
action_tuples.append((action, [], option_string))
char = option_string[0]
option_string = char + explicit_arg[0]
new_explicit_arg = explicit_arg[1:] or None
optionals_map = self._option_string_actions
if option_string in optionals_map:
action = optionals_map[option_string]
explicit_arg = new_explicit_arg
else:
msg = _('ignored explicit argument %r')
raise ArgumentError(action, msg % explicit_arg)
# if the action expects exactly one argument, we've
# successfully matched the option; exit the loop
elif arg_count == 1:
stop = start_index + 1
args = [explicit_arg]
action_tuples.append((action, args, option_string))
break
# error if a double-dash option did not use the
# explicit argument
else:
msg = _('ignored explicit argument %r')
raise ArgumentError(action, msg % explicit_arg)
# if there is no explicit argument, try to match the
# optional's string arguments with the following strings
# if successful, exit the loop
else:
start = start_index + 1
selected_patterns = arg_strings_pattern[start:]
arg_count = match_argument(action, selected_patterns)
stop = start + arg_count
args = arg_strings[start:stop]
action_tuples.append((action, args, option_string))
break
# add the Optional to the list and return the index at which
# the Optional's string args stopped
assert action_tuples
for action, args, option_string in action_tuples:
take_action(action, args, option_string)
return stop
# the list of Positionals left to be parsed; this is modified
# by consume_positionals()
positionals = self._get_positional_actions()
# function to convert arg_strings into positional actions
def consume_positionals(start_index):
# match as many Positionals as possible
match_partial = self._match_arguments_partial
selected_pattern = arg_strings_pattern[start_index:]
arg_counts = match_partial(positionals, selected_pattern)
# slice off the appropriate arg strings for each Positional
# and add the Positional and its args to the list
for action, arg_count in zip(positionals, arg_counts):
args = arg_strings[start_index: start_index + arg_count]
start_index += arg_count
take_action(action, args)
# slice off the Positionals that we just parsed and return the
# index at which the Positionals' string args stopped
positionals[:] = positionals[len(arg_counts):]
return start_index
# consume Positionals and Optionals alternately, until we have
# passed the last option string
extras = []
start_index = 0
if option_string_indices:
max_option_string_index = max(option_string_indices)
else:
max_option_string_index = -1
while start_index <= max_option_string_index:
# consume any Positionals preceding the next option
next_option_string_index = min(
index
for index in option_string_indices
if index >= start_index)
if start_index != next_option_string_index:
positionals_end_index = consume_positionals(start_index)
# only try to parse the next optional if we didn't consume
# the option string during the positionals parsing
if positionals_end_index > start_index:
start_index = positionals_end_index
continue
else:
start_index = positionals_end_index
# if we consumed all the positionals we could and we're not
# at the index of an option string, there were extra arguments
if start_index not in option_string_indices:
strings = arg_strings[start_index:next_option_string_index]
extras.extend(strings)
start_index = next_option_string_index
# consume the next optional and any arguments for it
start_index = consume_optional(start_index)
# consume any positionals following the last Optional
stop_index = consume_positionals(start_index)
# if we didn't consume all the argument strings, there were extras
extras.extend(arg_strings[stop_index:])
# if we didn't use all the Positional objects, there were too few
# arg strings supplied.
if positionals:
self.error(_('too few arguments'))
# make sure all required actions were present
for action in self._actions:
if action.required:
if action not in seen_actions:
name = _get_action_name(action)
self.error(_('argument %s is required') % name)
# make sure all required groups had one option present
for group in self._mutually_exclusive_groups:
if group.required:
for action in group._group_actions:
if action in seen_non_default_actions:
break
# if no actions were used, report the error
else:
names = [_get_action_name(action)
for action in group._group_actions
if action.help is not SUPPRESS]
msg = _('one of the arguments %s is required')
self.error(msg % ' '.join(names))
# return the updated namespace and the extra arguments
return namespace, extras
def _read_args_from_files(self, arg_strings):
# expand arguments referencing files
new_arg_strings = []
for arg_string in arg_strings:
# for regular arguments, just add them back into the list
if arg_string[0] not in self.fromfile_prefix_chars:
new_arg_strings.append(arg_string)
# replace arguments referencing files with the file content
else:
try:
args_file = open(arg_string[1:])
try:
arg_strings = []
for arg_line in args_file.read().splitlines():
for arg in self.convert_arg_line_to_args(arg_line):
arg_strings.append(arg)
arg_strings = self._read_args_from_files(arg_strings)
new_arg_strings.extend(arg_strings)
finally:
args_file.close()
except OSError:
err = _sys.exc_info()[1]
self.error(str(err))
# return the modified argument list
return new_arg_strings
def convert_arg_line_to_args(self, arg_line):
return [arg_line]
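# Reading arguments from files: with fromfile_prefix_chars='@', an
# argument like '@args.txt' is replaced by one argument per line of the
# file; convert_arg_line_to_args() can be overridden to split lines
# differently (a sketch; the subclass is hypothetical):
#
#     class WordSplittingParser(ArgumentParser):
#         def convert_arg_line_to_args(self, arg_line):
#             return arg_line.split()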
def _match_argument(self, action, arg_strings_pattern):
# match the pattern for this action to the arg strings
nargs_pattern = self._get_nargs_pattern(action)
match = _re.match(nargs_pattern, arg_strings_pattern)
# raise an exception if we weren't able to find a match
if match is None:
nargs_errors = {
None: _('expected one argument'),
OPTIONAL: _('expected at most one argument'),
ONE_OR_MORE: _('expected at least one argument'),
}
default = _('expected %s argument(s)') % action.nargs
msg = nargs_errors.get(action.nargs, default)
raise ArgumentError(action, msg)
# return the number of arguments matched
return len(match.group(1))
def _match_arguments_partial(self, actions, arg_strings_pattern):
# progressively shorten the actions list by slicing off the
# final actions until we find a match
result = []
for i in range(len(actions), 0, -1):
actions_slice = actions[:i]
pattern = ''.join([self._get_nargs_pattern(action)
for action in actions_slice])
match = _re.match(pattern, arg_strings_pattern)
if match is not None:
result.extend([len(string) for string in match.groups()])
break
# return the list of arg string counts
return result
def _parse_optional(self, arg_string):
# if it's an empty string, it was meant to be a positional
if not arg_string:
return None
# if it doesn't start with a prefix, it was meant to be positional
if not arg_string[0] in self.prefix_chars:
return None
# if the option string is present in the parser, return the action
if arg_string in self._option_string_actions:
action = self._option_string_actions[arg_string]
return action, arg_string, None
# if it's just a single character, it was meant to be positional
if len(arg_string) == 1:
return None
# if the option string before the "=" is present, return the action
if '=' in arg_string:
option_string, explicit_arg = arg_string.split('=', 1)
if option_string in self._option_string_actions:
action = self._option_string_actions[option_string]
return action, option_string, explicit_arg
# search through all possible prefixes of the option string
# and all actions in the parser for possible interpretations
option_tuples = self._get_option_tuples(arg_string)
# if multiple actions match, the option string was ambiguous
if len(option_tuples) > 1:
options = ', '.join([_1 for _0, _1, _2 in option_tuples])
tup = arg_string, options
self.error(_('ambiguous option: %s could match %s') % tup)
# if exactly one action matched, this segmentation is good,
# so return the parsed action
elif len(option_tuples) == 1:
option_tuple, = option_tuples
return option_tuple
# if it was not found as an option, but it looks like a negative
# number, it was meant to be positional
# unless there are negative-number-like options
if self._negative_number_matcher.match(arg_string):
if not self._has_negative_number_optionals:
return None
# if it contains a space, it was meant to be a positional
if ' ' in arg_string:
return None
# it was meant to be an optional but there is no such option
# in this parser (though it might be a valid option in a subparser)
return None, arg_string, None
def _get_option_tuples(self, option_string):
result = []
# option strings starting with two prefix characters are only
# split at the '='
chars = self.prefix_chars
if option_string[0] in chars and option_string[1] in chars:
if '=' in option_string:
option_prefix, explicit_arg = option_string.split('=', 1)
else:
option_prefix = option_string
explicit_arg = None
for option_string in self._option_string_actions:
if option_string.startswith(option_prefix):
action = self._option_string_actions[option_string]
tup = action, option_string, explicit_arg
result.append(tup)
# single character options can be concatenated with their arguments
# but multiple character options always have to have their argument
# separate
elif option_string[0] in chars and option_string[1] not in chars:
option_prefix = option_string
explicit_arg = None
short_option_prefix = option_string[:2]
short_explicit_arg = option_string[2:]
for option_string in self._option_string_actions:
if option_string == short_option_prefix:
action = self._option_string_actions[option_string]
tup = action, option_string, short_explicit_arg
result.append(tup)
elif option_string.startswith(option_prefix):
action = self._option_string_actions[option_string]
tup = action, option_string, explicit_arg
result.append(tup)
# shouldn't ever get here
else:
self.error(_('unexpected option string: %s') % option_string)
# return the collected option tuples
return result
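    # Illustrative note (not part of the original argparse source): this
    # method is what supports abbreviated long options. For a parser that
    # defines the hypothetical options "--verbose" and "--version", the
    # string "--verb" yields exactly one tuple here (the "--verbose"
    # action), while "--ver" yields two tuples and is then reported as an
    # "ambiguous option" by _parse_optional above.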
def _get_nargs_pattern(self, action):
# in all examples below, we have to allow for '--' args
# which are represented as '-' in the pattern
nargs = action.nargs
# the default (None) is assumed to be a single argument
if nargs is None:
nargs_pattern = '(-*A-*)'
# allow zero or one arguments
elif nargs == OPTIONAL:
nargs_pattern = '(-*A?-*)'
# allow zero or more arguments
elif nargs == ZERO_OR_MORE:
nargs_pattern = '(-*[A-]*)'
# allow one or more arguments
elif nargs == ONE_OR_MORE:
nargs_pattern = '(-*A[A-]*)'
# allow any number of options or arguments
elif nargs == REMAINDER:
nargs_pattern = '([-AO]*)'
# allow one argument followed by any number of options or arguments
elif nargs == PARSER:
nargs_pattern = '(-*A[-AO]*)'
# all others should be integers
else:
nargs_pattern = '(-*%s-*)' % '-*'.join('A' * nargs)
# if this is an optional action, -- is not allowed
if action.option_strings:
nargs_pattern = nargs_pattern.replace('-*', '')
nargs_pattern = nargs_pattern.replace('-', '')
# return the pattern
return nargs_pattern
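    # Illustrative note (not part of the original argparse source): these
    # patterns are matched against a string of 'A' (argument) and 'O'
    # (option) markers built during parsing. For example, with nargs=2 the
    # pattern is '(-*A-*A-*)', so the marker string 'AA' produced by
    # "cmd val1 val2" matches and len(match.group(1)) == 2 arguments are
    # consumed.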
# ========================
# Value conversion methods
# ========================
def _get_values(self, action, arg_strings):
# for everything but PARSER args, strip out '--'
if action.nargs not in [PARSER, REMAINDER]:
arg_strings = [s for s in arg_strings if s != '--']
# optional argument produces a default when not present
if not arg_strings and action.nargs == OPTIONAL:
if action.option_strings:
value = action.const
else:
value = action.default
if isinstance(value, basestring):
value = self._get_value(action, value)
self._check_value(action, value)
# when nargs='*' on a positional, if there were no command-line
# args, use the default if it is anything other than None
elif (not arg_strings and action.nargs == ZERO_OR_MORE
and not action.option_strings):
if action.default is not None:
value = action.default
else:
value = arg_strings
self._check_value(action, value)
# single argument or optional argument produces a single value
elif len(arg_strings) == 1 and action.nargs in [None, OPTIONAL]:
arg_string, = arg_strings
value = self._get_value(action, arg_string)
self._check_value(action, value)
# REMAINDER arguments convert all values, checking none
elif action.nargs == REMAINDER:
value = [self._get_value(action, v) for v in arg_strings]
# PARSER arguments convert all values, but check only the first
elif action.nargs == PARSER:
value = [self._get_value(action, v) for v in arg_strings]
self._check_value(action, value[0])
# all other types of nargs produce a list
else:
value = [self._get_value(action, v) for v in arg_strings]
for v in value:
self._check_value(action, v)
# return the converted value
return value
def _get_value(self, action, arg_string):
type_func = self._registry_get('type', action.type, action.type)
if not _callable(type_func):
msg = _('%r is not callable')
raise ArgumentError(action, msg % type_func)
# convert the value to the appropriate type
try:
result = type_func(arg_string)
# ArgumentTypeErrors indicate errors
except ArgumentTypeError:
name = getattr(action.type, '__name__', repr(action.type))
msg = str(_sys.exc_info()[1])
raise ArgumentError(action, msg)
# TypeErrors or ValueErrors also indicate errors
except (TypeError, ValueError):
name = getattr(action.type, '__name__', repr(action.type))
msg = _('invalid %s value: %r')
raise ArgumentError(action, msg % (name, arg_string))
# return the converted value
return result
def _check_value(self, action, value):
# converted value must be one of the choices (if specified)
if action.choices is not None and value not in action.choices:
tup = value, ', '.join(map(repr, action.choices))
msg = _('invalid choice: %r (choose from %s)') % tup
raise ArgumentError(action, msg)
# =======================
# Help-formatting methods
# =======================
def format_usage(self):
formatter = self._get_formatter()
formatter.add_usage(self.usage, self._actions,
self._mutually_exclusive_groups)
return formatter.format_help()
def format_help(self):
formatter = self._get_formatter()
# usage
formatter.add_usage(self.usage, self._actions,
self._mutually_exclusive_groups)
# description
formatter.add_text(self.description)
# positionals, optionals and user-defined groups
for action_group in self._action_groups:
formatter.start_section(action_group.title)
formatter.add_text(action_group.description)
formatter.add_arguments(action_group._group_actions)
formatter.end_section()
# epilog
formatter.add_text(self.epilog)
# determine help from format above
return formatter.format_help()
def format_version(self):
import warnings
warnings.warn(
'The format_version method is deprecated -- the "version" '
'argument to ArgumentParser is no longer supported.',
DeprecationWarning)
formatter = self._get_formatter()
formatter.add_text(self.version)
return formatter.format_help()
def _get_formatter(self):
return self.formatter_class(prog=self.prog)
# =====================
# Help-printing methods
# =====================
def print_usage(self, file=None):
if file is None:
file = _sys.stdout
self._print_message(self.format_usage(), file)
def print_help(self, file=None):
if file is None:
file = _sys.stdout
self._print_message(self.format_help(), file)
def print_version(self, file=None):
import warnings
warnings.warn(
'The print_version method is deprecated -- the "version" '
'argument to ArgumentParser is no longer supported.',
DeprecationWarning)
self._print_message(self.format_version(), file)
def _print_message(self, message, file=None):
if message:
if file is None:
file = _sys.stderr
file.write(message)
# ===============
# Exiting methods
# ===============
def exit(self, status=0, message=None):
if message:
self._print_message(message, _sys.stderr)
_sys.exit(status)
def error(self, message):
"""error(message: string)
Prints a usage message incorporating the message to stderr and
exits.
If you override this in a subclass, it should not return -- it
should either exit or raise an exception.
"""
self.print_usage(_sys.stderr)
self.exit(2, _('%s: error: %s\n') % (self.prog, message))
bx-python-0.8.13/lib/bx/cookbook/attribute.py 0000664 0000000 0000000 00000007636 14156664651 0021127 0 ustar 00root root 0000000 0000000 """
Provides functions for creating simple properties.
If, inside a class definition, you write:
attribute(foo=1, bar=2)
simple properties named 'foo' and 'bar' are created for this class.
Also, private instance variables '__foo' and '__bar' will be added
to instances of this class.
USAGE:
# assumes attribute.py is on path
from attribute import *
class MyClass(object):
readable(foo=1, bar=2) # or, attribute('r', foo=1, bar=2)
writable(fro=3, boz=4) # or, attribute('w', fro=3, boz=4)
attribute(baz=5)
This is equivalent to the following:
class MyClass(object):
def __init__(self):
self.__foo = 1
self.__bar = 2
self.__fro = 3
self.__boz = 4
self.__baz = 5
def get_foo(self):
return self.__foo
def get_bar(self):
return self.__bar
def set_fro(self, value):
self.__fro = value
def set_boz(self, value):
self.__boz = value
def get_baz(self):
return self.__baz
def set_baz(self, value):
self.__baz = value
def del_baz(self):
del self.__baz
foo = property(fget=get_foo, doc="foo")
bar = property(fget=get_bar, doc="bar")
fro = property(fset=set_fro, doc="fro")
boz = property(fset=set_boz, doc="boz")
baz = property(fget=get_baz, fset=set_baz, fdel=del_baz, doc="baz")
"""
__all__ = ['attribute', 'readable', 'writable']
__version__ = '3.0'
__author__ = 'Sean Ross'
__credits__ = ['Guido van Rossum', 'Garth Kidd']
__created__ = '10/21/02'
import sys
def mangle(classname, attrname):
"""mangles name according to python name-mangling
conventions for private variables"""
return f"_{classname}__{attrname}"
def class_space(classlevel=3):
"returns the calling class' name and dictionary"
frame = sys._getframe(classlevel)
classname = frame.f_code.co_name
classdict = frame.f_locals
return classname, classdict
# convenience function
def readable(**kwds):
"returns one read-only property for each (key,value) pair in kwds"
return _attribute(permission='r', **kwds)
# convenience function
def writable(**kwds):
"returns one write-only property for each (key,value) pair in kwds"
return _attribute(permission='w', **kwds)
# needed because of the way class_space is resolved in _attribute
def attribute(permission='rwd', **kwds):
"""returns one property for each (key,value) pair in kwds;
each property provides the specified level of access(permission):
'r': readable, 'w':writable, 'd':deletable
"""
return _attribute(permission, **kwds)
# based on code by Guido van Rossum, comp.lang.python 2001-07-31
def _attribute(permission='rwd', **kwds):
"""returns one property for each (key,value) pair in kwds;
each property provides the specified level of access(permission):
'r': readable, 'w':writable, 'd':deletable
"""
classname, classdict = class_space()
def _property(attrname, default):
propname, attrname = attrname, mangle(classname, attrname)
fget, fset, fdel, doc = None, None, None, propname
if 'r' in permission:
def fget(self):
value = default
try:
value = getattr(self, attrname)
except AttributeError:
setattr(self, attrname, default)
return value
if 'w' in permission:
def fset(self, value):
setattr(self, attrname, value)
if 'd' in permission:
def fdel(self):
try:
delattr(self, attrname)
except AttributeError:
pass
# calling fget can restore this attribute, so remove property
delattr(self.__class__, propname)
return property(fget=fget, fset=fset, fdel=fdel, doc=doc)
for attrname, default in kwds.items():
classdict[attrname] = _property(attrname, default)
bx-python-0.8.13/lib/bx/cookbook/doc_optparse.py 0000664 0000000 0000000 00000005430 14156664651 0021574 0 ustar 00root root 0000000 0000000 """
:Author: M. Simionato
:Date: April 2004
:Title: A much simplified interface to optparse.
You should use optionparse in your scripts as follows.
First, write a module level docstring containing something like this
(this is just an example)::
'''usage: %prog files [options]
-d, --delete: delete all files
-e, --erase = ERASE: erase the given file'''
Then write a main program of this kind:
# sketch of a script to delete files::
if __name__=='__main__':
import optionparse
option,args=optionparse.parse(__doc__)
if not args and not option: optionparse.exit()
        elif option.delete: print("Delete all files")
        elif option.erase: print("Erase the given file")
Notice that ``optionparse`` parses the docstring by looking at the
characters ",", ":", "=", "\\n", so be careful in using them. If
the docstring is not correctly formatted you will get a SyntaxError
or worse, the script will not work as expected.
"""
import optparse
import re
import sys
import traceback
USAGE = re.compile(r'(?s)\s*usage: (.*?)(\n[ \t]*\n|$)')
def nonzero(self): # will become the nonzero method of optparse.Values
"True if options were given"
for v in self.__dict__.values():
if v is not None:
return True
return False
optparse.Values.__bool__ = optparse.Values.__nonzero__ = nonzero  # dynamically fix optparse.Values (__bool__ is the Python 3 name)
class ParsingError(Exception):
pass
optionstring = ""
def exception(msg=""):
print("Exception while parsing command line:", file=sys.stderr)
print(traceback.format_exc(), file=sys.stderr)
exit(msg)
def exit(msg=""):
raise SystemExit(msg or optionstring.replace("%prog", sys.argv[0]))
def parse(docstring, arglist=None):
global optionstring
optionstring = docstring
match = USAGE.search(optionstring)
if not match:
raise ParsingError("Cannot find the option string")
optlines = match.group(1).splitlines()
try:
p = optparse.OptionParser(optlines[0], conflict_handler="resolve")
for line in optlines[1:]:
opt, help = line.split(':')[:2]
# Make both short and long optional (but at least one)
# Old: short,long=opt.split(',')[:2]
opt_strings = []
action = "store_true"
for k in opt.split(', '):
k = k.strip()
if k.startswith("--") and "=" in k:
action = "store"
k = k.split("=")[0]
opt_strings.append(k)
            p.add_option(*opt_strings, action=action, help=help.strip())
except (IndexError, ValueError):
raise ParsingError("Cannot parse the option string correctly")
return p.parse_args(arglist)
def help_callback(option, opt, value, parser, help):
print(help, file=sys.stderr)
sys.exit(1)
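# A minimal usage sketch (not part of the original module); the docstring and
# option names below are hypothetical:
#
#     '''usage: %prog files [options]
#     -d, --delete: delete all files'''
#
#     options, args = parse(__doc__, ['-d', 'somefile'])
#     if options.delete:
#         print("would delete", args)   # -> would delete ['somefile']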
bx-python-0.8.13/lib/bx/cookbook/progress_bar.py 0000664 0000000 0000000 00000005076 14156664651 0021610 0 ustar 00root root 0000000 0000000 """
An ASCII text progress bar. See __main__ for command-line use. Using \r
to move the cursor back to the start of the current line is the key; on
terminals that do not support this functionality, the progress bar will
not work as well.
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/168639
"""
import sys
class ProgressBar:
def __init__(self, minValue=0, maxValue=10, totalWidth=72):
self.progBar = "[]" # This holds the progress bar string
self.min = minValue
self.max = maxValue
self.span = maxValue - minValue
self.width = totalWidth
self.amount = 0 # When amount == max, we are 100% done
self.update(0) # Build progress bar string
def update(self, newAmount=0):
if newAmount < self.min:
newAmount = self.min
if newAmount > self.max:
newAmount = self.max
self.amount = newAmount
# Figure out the new percent done, round to an integer
diffFromMin = float(self.amount - self.min)
percentDone = (diffFromMin / float(self.span)) * 100.0
percentDone = round(percentDone)
percentDone = int(percentDone)
# Figure out how many hash bars the percentage should be
allFull = self.width - 2
numHashes = (percentDone / 100.0) * allFull
numHashes = int(round(numHashes))
# build a progress bar with hashes and spaces
if allFull == numHashes:
self.progBar = "[" + '='*(numHashes) + "]"
else:
self.progBar = "[" + '='*(numHashes-1) + '>' + ' '*(allFull-numHashes) + "]"
# figure out where to put the percentage, roughly centered
        percentPlace = (len(self.progBar) // 2) - len(str(percentDone))  # integer division so the slice indices below stay ints
percentString = str(percentDone) + "%"
# slice the percentage into the bar
self.progBar = self.progBar[0:percentPlace] + percentString + self.progBar[percentPlace+len(percentString):]
def update_and_print(self, newAmount=0, f=sys.stdout):
self.update(newAmount)
print("\r", self, end=' ', file=f)
f.flush()
def __str__(self):
return str(self.progBar)
def iterprogress(sized_iterable):
"""
    Iterate over a sized iterable, printing a progress bar to stderr
"""
pb = ProgressBar(0, len(sized_iterable))
for i, value in enumerate(sized_iterable):
yield value
pb.update_and_print(i, sys.stderr)
if __name__ == "__main__":
bar = ProgressBar(0, 1000, 80)
for i in range(1000):
bar.update(i)
print("\r", bar, end=' ')
sys.stdout.flush()
print()
bx-python-0.8.13/lib/bx/filter.py 0000664 0000000 0000000 00000004045 14156664651 0016572 0 ustar 00root root 0000000 0000000 """
Classes for implementing `Pipeline`s composed of `Filter`s (intended to be
subclassed).
"""
class Filter:
def __init__(self, **kwargs):
raise Exception("AbstractClass")
def run(self, reader, writer):
for block in reader:
block = self(block)
if block:
writer(block)
def step(self, reader, writer):
block = next(reader)
if not block:
raise StopIteration
block = self(block)
if block:
writer(block)
def __call__(self, block):
raise Exception("AbstractMethod")
class Pipeline(Filter):
def __init__(self, **kwargs):
self.pipeline = list()
def __call__(self, block):
for function in self.pipeline:
if not block:
return block
try:
f = function.__call__
except AttributeError:
raise TypeError("'" + function.__class__.__name__ + "' is not callable.")
block = f(block)
return block
def append(self, function):
try:
function.__call__
except AttributeError:
raise TypeError("'" + function.__class__.__name__ + "' is not callable.")
return self.pipeline.append(function)
def remove(self, function):
return self.pipeline.remove(function)
def extend(self, pipeline):
for item in pipeline:
self.append(item)
# Container interface
def __len__(self):
return len(self.pipeline)
def __getitem__(self, key):
return self.pipeline[key]
def __setitem__(self, key, value):
try:
value.__call__
except AttributeError:
raise TypeError("'" + value.__class__.__name__ + "' is not callable.")
return self.pipeline.__setitem__(key, value)
def __delitem__(self, key):
return self.pipeline.__delitem__(key)
def __iter__(self):
return self.pipeline.__iter__()
def __contains__(self, item):
return self.pipeline.__contains__(item)
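# A minimal sketch (not part of the original module) of how Filter and
# Pipeline compose; UpperFilter and the reader/writer below are hypothetical:
#
#     class UpperFilter(Filter):
#         def __init__(self, **kwargs):
#             pass  # bypass the abstract-class guard in Filter.__init__
#         def __call__(self, block):
#             return block.upper()
#
#     pipeline = Pipeline()
#     pipeline.append(UpperFilter())
#     pipeline.run(iter(["a", "b"]), print)   # prints A then B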
bx-python-0.8.13/lib/bx/gene_reader.py 0000664 0000000 0000000 00000024321 14156664651 0017544 0 ustar 00root root 0000000 0000000 """
Readers extracting gene (exon and intron) information from bed / gtf / gff
formats.
- GeneReader: yields exons
- CDSReader: yields cds_exons
- FeatureReader: yields cds_exons, introns, exons
For gff/gtf, the start_codon and stop_codon line types are merged with CDSs.
"""
import sys
from bx.bitset_utils import (
bitset_complement,
bitset_intersect,
bitset_subtract,
bitset_union,
)
def GeneReader(fh, format='gff'):
""" yield chrom, strand, gene_exons, name """
known_formats = ('gff', 'gtf', 'bed')
if format not in known_formats:
print('{} format not in {}'.format(format, ",".join(known_formats)), file=sys.stderr)
raise Exception('?')
if format == 'bed':
for line in fh:
f = line.strip().split()
chrom = f[0]
chrom_start = int(f[1])
name = f[4]
strand = f[5]
int(f[6]) # cdsStart
int(f[7]) # cdsEnd
int(f[9]) # blockCount
blockSizes = [int(i) for i in f[10].strip(',').split(',')]
blockStarts = [chrom_start + int(i) for i in f[11].strip(',').split(',')]
# grab cdsStart - cdsEnd
gene_exons = []
for base, offset in zip(blockStarts, blockSizes):
exon_start = base
exon_end = base+offset
gene_exons.append((exon_start, exon_end))
yield chrom, strand, gene_exons, name
genelist = {}
grouplist = []
if format == 'gff' or format == 'gtf':
for line in fh:
if line.startswith('#'):
continue
fields = line.strip().split('\t')
if len(fields) < 9:
continue
# fields
chrom = fields[0]
            ex_st = int(fields[3]) - 1  # convert to zero-based start
ex_end = int(fields[4]) # + 1 # make exclusive
strand = fields[6]
if format == 'gtf':
group = fields[8].split(';')[0]
else:
group = fields[8]
if group not in grouplist:
grouplist.append(group)
if group not in genelist:
genelist[group] = (chrom, strand, [])
exons_i = 2
genelist[group][exons_i].append((ex_st, ex_end))
# for gene in genelist.values():
for gene in grouplist:
chrom, strand, gene_exons = genelist[gene]
gene_exons = bitset_union(gene_exons)
yield chrom, strand, gene_exons, gene
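# A minimal usage sketch (not part of the original module); 'genes.bed' is a
# hypothetical 12-column BED file:
#
#     with open('genes.bed') as fh:
#         for chrom, strand, exons, name in GeneReader(fh, format='bed'):
#             print(name, chrom, strand, len(exons))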
def CDSReader(fh, format='gff'):
""" yield chrom, strand, cds_exons, name """
known_formats = ('gff', 'gtf', 'bed')
if format not in known_formats:
print('{} format not in {}'.format(format, ",".join(known_formats)), file=sys.stderr)
raise Exception('?')
if format == 'bed':
for line in fh:
f = line.strip().split()
chrom = f[0]
chrom_start = int(f[1])
name = f[4]
strand = f[5]
cdsStart = int(f[6])
cdsEnd = int(f[7])
int(f[9]) # blockCount
blockSizes = [int(i) for i in f[10].strip(',').split(',')]
blockStarts = [chrom_start + int(i) for i in f[11].strip(',').split(',')]
# grab cdsStart - cdsEnd
cds_exons = []
for base, offset in zip(blockStarts, blockSizes):
if (base + offset) < cdsStart:
continue
if base > cdsEnd:
continue
exon_start = max(base, cdsStart)
exon_end = min(base+offset, cdsEnd)
cds_exons.append((exon_start, exon_end))
yield chrom, strand, cds_exons, name
genelist = {}
grouplist = []
if format == 'gff' or format == 'gtf':
for line in fh:
if line.startswith('#'):
continue
fields = line.strip().split('\t')
if len(fields) < 9:
continue
if fields[2] not in ('CDS', 'stop_codon', 'start_codon'):
continue
# fields
chrom = fields[0]
            ex_st = int(fields[3]) - 1  # convert to zero-based start
ex_end = int(fields[4]) # + 1 # make exclusive
strand = fields[6]
if format == 'gtf':
group = fields[8].split(';')[0]
else:
group = fields[8]
if group not in grouplist:
grouplist.append(group)
if group not in genelist:
genelist[group] = (chrom, strand, [])
genelist[group][2].append((ex_st, ex_end))
# for gene in genelist.values():
for gene in grouplist:
chrom, strand, cds_exons = genelist[gene]
seqlen = sum(a[1]-a[0] for a in cds_exons)
overhang = seqlen % 3
if overhang > 0:
if strand == '+':
cds_exons[-1] = (cds_exons[-1][0], cds_exons[-1][1] - overhang)
else:
cds_exons[0] = (cds_exons[0][0] + overhang, cds_exons[0][1])
cds_exons = bitset_union(cds_exons)
yield chrom, strand, cds_exons, gene
def FeatureReader(fh, format='gff', alt_introns_subtract="exons", gtf_parse=None):
"""
yield chrom, strand, cds_exons, introns, exons, name
gtf_parse Example:
# parse gene_id from transcript_id "AC073130.2-001"; gene_id "TES";
gene_name = lambda s: s.split(';')[1].split()[1].strip('"')
for chrom, strand, cds_exons, introns, exons, name in FeatureReader( sys.stdin, format='gtf', gtf_parse=gene_name )
"""
known_formats = ('gff', 'gtf', 'bed')
if format not in known_formats:
print('{} format not in {}'.format(format, ",".join(known_formats)), file=sys.stderr)
raise Exception('?')
if format == 'bed':
for line in fh:
f = line.strip().split()
chrom = f[0]
chrom_start = int(f[1])
name = f[4]
strand = f[5]
cdsStart = int(f[6])
cdsEnd = int(f[7])
int(f[9]) # blockCount
blockSizes = [int(i) for i in f[10].strip(',').split(',')]
blockStarts = [chrom_start + int(i) for i in f[11].strip(',').split(',')]
# grab cdsStart - cdsEnd
cds_exons = []
exons = []
for base, offset in zip(blockStarts, blockSizes):
if (base + offset) < cdsStart:
continue
if base > cdsEnd:
continue
# exons
exon_start = base
exon_end = base+offset
exons.append((exon_start, exon_end))
# cds exons
exon_start = max(base, cdsStart)
exon_end = min(base+offset, cdsEnd)
cds_exons.append((exon_start, exon_end))
cds_exons = bitset_union(cds_exons)
exons = bitset_union(exons)
introns = bitset_complement(exons)
yield chrom, strand, cds_exons, introns, exons, name
genelist = {}
grouplist = []
if format == 'gff' or format == 'gtf':
for line in fh:
if line.startswith('#'):
continue
fields = line.strip().split('\t')
if len(fields) < 9:
continue
# fields
chrom = fields[0]
            ex_st = int(fields[3]) - 1  # convert to zero-based start
ex_end = int(fields[4]) # + 1 # make exclusive
strand = fields[6]
if format == 'gtf':
if not gtf_parse:
group = fields[8].split(';')[0]
else:
group = gtf_parse(fields[8])
else:
group = fields[8]
# Results are listed in the same order as encountered
if group not in grouplist:
grouplist.append(group)
if group not in genelist:
# chrom, strand, cds_exons, introns, exons, cds_start, cds_end
genelist[group] = [chrom, strand, [], [], [], None, None]
if fields[2] == 'exon':
genelist[group][4].append((ex_st, ex_end))
elif fields[2] in ('CDS', 'stop_codon', 'start_codon'):
genelist[group][2].append((ex_st, ex_end))
if fields[2] == 'start_codon':
if strand == '+':
genelist[group][5] = ex_st
else:
genelist[group][5] = ex_end
                if fields[2] == 'stop_codon':
                    # cds_end is stored at index 6 (index 5 is cds_start)
                    if strand == '+':
                        genelist[group][6] = ex_end
                    else:
                        genelist[group][6] = ex_st
elif fields[2] == 'intron':
genelist[group][3].append((ex_st, ex_end))
for gene in grouplist:
chrom, strand, cds_exons, introns, exons, cds_start, cds_end = genelist[gene]
cds_exons = bitset_union(cds_exons)
exons = bitset_union(exons)
# assure that cds exons were within the cds range
if cds_start is not None and cds_end is not None:
if strand == '+':
cds_exons = bitset_intersect(cds_exons, [(cds_start, cds_end)])
else:
cds_exons = bitset_intersect(cds_exons, [(cds_end, cds_start)])
# assure that introns are non-overlapping with themselves or exons
if alt_introns_subtract:
if alt_introns_subtract == 'exons':
introns = bitset_subtract(introns, exons)
if alt_introns_subtract == 'cds_exons':
introns = bitset_subtract(introns, cds_exons)
else:
introns = bitset_union(introns)
# assure CDS is a multiple of 3, trim from last exon if necessary
seqlen = sum(a[1]-a[0] for a in cds_exons)
overhang = seqlen % 3
if overhang > 0:
if strand == '+':
cds_exons[-1] = (cds_exons[-1][0], cds_exons[-1][1] - overhang)
else:
cds_exons[0] = (cds_exons[0][0] + overhang, cds_exons[0][1])
yield chrom, strand, cds_exons, introns, exons, gene
bx-python-0.8.13/lib/bx/interval_index_file.py 0000664 0000000 0000000 00000045500 14156664651 0021320 0 ustar 00root root 0000000 0000000 """
Classes for index files that map genomic intervals to values.
:Authors: James Taylor (james@bx.psu.edu), Bob Harris (rsharris@bx.psu.edu)
An interval index file maps genomic intervals to values.
This implementation writes version 2 of the file format, and reads versions 0, 1, and 2.
Index File Format
-----------------
All fields are in big-endian format (most significant byte first).
All intervals are origin-zero, inclusive start, exclusive end.
The file begins with an index file header, then is immediately followed
by an index table. The index table points to index headers, and index
headers point to bins. Index headers and bins are referenced via pointers
(file offsets), and can be placed more or less anywhere in the file.
File header
~~~~~~~~~~~
============ =========== =================================================
offset 0x00: 2C FF 80 0A magic number
offset 0x04: 00 00 00 02 version (00 00 00 00 and 00 00 00 01 are also supported)
offset 0x08: 00 00 00 2A (N) number of index sets
offset 0x0C: ... index table
============ =========== =================================================
Index table
~~~~~~~~~~~
The index table is a list of N index headers, packed sequentially and
sorted by name. The first begins at offset 0x0C. Each header describes
one set of intervals.
============ =========== =================================================
offset: xx xx xx xx (L) length of index src name
offset+4: ... index src name (e.g. canFam1.chr1)
offset+4+L: xx xx xx xx offset (in this file) to index data
offset+8+L: xx xx xx xx (B) number of bytes in each value; for version
0, this field is absent, and B is assumed to be 4
============ =========== =================================================
Index data
~~~~~~~~~~
The index data (for one index table) consists of the overall range of
intervals followed by an array of pointers to bins. The length of the
array is 1+binForRange(maxEnd-1,maxEnd), where maxEnd is the maximum
interval end.
============ =========== =================================================
offset: xx xx xx xx minimum interval start
offset+4: xx xx xx xx maximum interval end
offset+8: xx xx xx xx offset (in this file) to bin 0
offset+12: xx xx xx xx number of intervals in bin 0
offset+16: xx xx xx xx offset (in this file) to bin 1
offset+20: xx xx xx xx number of intervals in bin 1
... ... ...
============ =========== =================================================
Bin
~~~
A bin is an array of (start,end,val), sorted by increasing start (with
end and val as tiebreakers). Note that bins may be empty (the number of
intervals indicated in the index data is zero). Note that B is determined
from the appropriate entry in the index table.
============ =========== =================================================
offset: xx xx xx xx start for interval 1
offset+4: xx xx xx xx end for interval 1
offset+8: ... (B bytes) value for interval 1
offset+8+B: xx xx xx xx start for interval 2
offset+12+B: xx xx xx xx end for interval 2
offset+16+B: ... (B bytes) value for interval 2
... ... ...
============ =========== =================================================
"""
import os.path
import sys
from bisect import (
insort,
insort_right
)
from struct import (
calcsize,
pack,
unpack
)
from warnings import warn
from bx.misc import filecache
try:
from bx.misc import seekbzip2
except ImportError:
seekbzip2 = None
try:
from bx.misc import seeklzop
except ImportError:
seeklzop = None
__all__ = ['Indexes', 'Index']
MAGIC = 0x2cff800a
VERSION = 2
# These three constants determine the structure of the default binning strategy
BIN_LEVELS = 6 # Number of levels of bins to build
BIN_FIRST_SHIFT = 17 # Number of bits for the bottom level bin
BIN_NEXT_SHIFT = 3 # Number of bits for each higher level bin
# Build offset and max size arrays for each bin level
BIN_OFFSETS = [1, 0]
BIN_OFFSETS_MAX = [(1 << BIN_FIRST_SHIFT << BIN_NEXT_SHIFT), (1 << BIN_FIRST_SHIFT)]
for i in range(BIN_LEVELS - 2):
BIN_OFFSETS.insert(0, (2 ** (3 * (i + 1))) + BIN_OFFSETS[0])
BIN_OFFSETS_MAX.insert(0, (BIN_OFFSETS_MAX[0] << BIN_NEXT_SHIFT))
# The maximum size for the top bin is actually bigger than the signed integers
# we use to store positions in the file, so we'll change it to prevent confusion
BIN_OFFSETS_MAX[0] = sys.maxsize
# Constants for the minimum and maximum size of the overall interval
MIN = 0
OLD_MAX = 512 * 1024 * 1024 # Maximum size supported by versions < 2
DEFAULT_MAX = 512 * 1024 * 1024 # Default max size to use when none is passed
MAX = 2 ** 31 # Absolute max size (limited by file format)
def offsets_for_max_size(max_size):
"""
Return the subset of offsets needed to contain intervals over (0,max_size)
"""
for i, max in enumerate(reversed(BIN_OFFSETS_MAX)):
if max_size < max:
break
else:
raise Exception("%d is larger than the maximum possible size (%d)" % (max_size, BIN_OFFSETS_MAX[0]))
return BIN_OFFSETS[(len(BIN_OFFSETS) - i - 1):]
def bin_for_range(start, end, offsets=None):
"""Find the smallest bin that can contain interval (start,end)"""
if offsets is None:
offsets = BIN_OFFSETS
start_bin, end_bin = start, max(start, end - 1)
start_bin >>= BIN_FIRST_SHIFT
end_bin >>= BIN_FIRST_SHIFT
for offset in offsets:
if start_bin == end_bin:
return offset + start_bin
else:
start_bin >>= BIN_NEXT_SHIFT
end_bin >>= BIN_NEXT_SHIFT
raise Exception("Interval (%d,%d) out of range")
class AbstractMultiIndexedAccess:
"""
Allows accessing multiple indexes / files as if they were one
"""
indexed_access_class = None
def __init__(self, filenames, index_filenames=None, keep_open=False, use_cache=False, **kwargs):
# TODO: Handle index_filenames argument
self.indexes = [
self.new_indexed_access(fname, keep_open=keep_open, use_cache=use_cache, **kwargs)
for fname in filenames]
def new_indexed_access(self, data_filename, index_filename=None, keep_open=False, **kwargs):
return self.indexed_access_class(data_filename, index_filename, keep_open, **kwargs)
def get(self, src, start, end):
return [block for block in self.get_as_iterator(src, start, end)]
def get_as_iterator(self, src, start, end):
for block, _index, _offset in self.get_as_iterator_with_index_and_offset(src, start, end):
yield block
def get_as_iterator_with_index_and_offset(self, src, start, end):
for index in self.indexes:
yield from index.get_as_iterator_with_index_and_offset(src, start, end)
def close(self):
for index in self.indexes:
index.close()
class AbstractIndexedAccess:
"""Indexed access to a data using overlap queries, requires an index file"""
def __init__(self, data_filename, index_filename=None, keep_open=False, use_cache=False, **kwargs):
self.data_kwargs = kwargs
self.data_filename = data_filename
if data_filename.endswith(".bz2"):
if seekbzip2 is None:
raise Exception("Trying to open .bz2 file but no seekbzip2 module found")
table_filename = data_filename + "t"
self.table_filename = table_filename
if not os.path.exists(table_filename):
raise Exception("Cannot find bz2t file for: " + data_filename)
self.file_type = "bz2t"
# Strip .bz2 from the filename before adding ".index"
data_filename_root = data_filename[:-4]
elif data_filename.endswith(".lzo"):
if seeklzop is None:
raise Exception("Trying to open .lzo file but no seeklzop module found")
table_filename = data_filename + "t"
self.table_filename = table_filename
if not os.path.exists(table_filename):
raise Exception("Cannot find lzot file for: " + data_filename)
self.file_type = "lzot"
# Strip .lzo from the filename before adding ".index"
data_filename_root = data_filename[:-4]
else:
self.file_type = "plain"
data_filename_root = data_filename
# Open index
if index_filename is None:
index_filename = data_filename_root + ".index"
self.indexes = Indexes(filename=index_filename)
# Use a file cache?
self.use_cache = use_cache
# Open now?
if keep_open:
self.f = self.open_data()
else:
self.f = None
def close(self):
if self.f:
self.f.close()
self.f = None
def open_data(self):
if self.file_type == "plain":
return open(self.data_filename, 'rb')
elif self.file_type == "bz2t":
f = seekbzip2.SeekableBzip2File(self.data_filename, self.table_filename)
if self.use_cache:
return filecache.FileCache(f, f.size)
else:
return f
elif self.file_type == "lzot":
if self.use_cache:
block_cache_size = 20
else:
block_cache_size = 0
f = seeklzop.SeekableLzopFile(self.data_filename,
self.table_filename,
block_cache_size=block_cache_size)
return f
def get(self, src, start, end):
return [val for val in self.get_as_iterator(src, start, end)]
def get_as_iterator(self, src, start, end):
for val, _index, _offset in self.get_as_iterator_with_index_and_offset(src, start, end):
yield val
def get_as_iterator_with_index_and_offset(self, src, start, end):
for _val_start, _val_end, val in self.indexes.find(src, start, end):
yield self.get_at_offset(val), self, val
def get_at_offset(self, offset):
if self.f:
self.f.seek(offset)
return self.read_at_current_offset(self.f, **self.data_kwargs)
else:
f = self.open_data()
try:
f.seek(offset)
return self.read_at_current_offset(f, **self.data_kwargs)
finally:
f.close()
def read_at_current_offset(self, file, **kwargs):
raise TypeError("Abstract Method")
class Indexes:
"""A set of indexes, each identified by a unique name"""
def __init__(self, filename=None):
self.indexes = dict()
if filename is not None:
self.open(filename)
def add(self, name, start, end, val, max=DEFAULT_MAX):
if name not in self.indexes:
self.indexes[name] = Index(max=max)
self.indexes[name].add(start, end, val)
def get(self, name):
if self.indexes[name] is None:
offset, value_size = self.offsets[name]
self.indexes[name] = Index(filename=self.filename, offset=offset, value_size=value_size, version=self.version)
return self.indexes[name]
def find(self, name, start, end):
if name in self.indexes:
return self.get(name).find(start, end)
else:
return []
def open(self, filename):
self.filename = filename
self.offsets = dict() # (will map key to (offset,value_size))
with open(filename, 'rb') as f:
magic, version, length = read_packed(f, ">3I")
if magic != MAGIC:
raise Exception("File does not have expected header")
if version > VERSION:
warn("File claims version %d, I don't known anything about versions beyond %d. Attempting to continue", version, VERSION)
self.version = version
for _ in range(length):
key_len = read_packed(f, ">I")
key = f.read(key_len).decode()
offset = read_packed(f, ">I")
if version == 0:
value_size = 4
else:
value_size = read_packed(f, ">I")
assert value_size % 4 == 0, "unsupported value size: %s" % value_size
self.indexes[key] = None
self.offsets[key] = (offset, value_size)
def write(self, f):
keys = sorted(self.indexes.keys())
# First determine the size of the header
base = calcsize(">3I")
for key in keys:
key = str(key)
base += calcsize(">I")
base += len(key)
base += calcsize(">2I")
# Now actually write the header
write_packed(f, ">3I", MAGIC, VERSION, len(self.indexes))
# And write the index table
for key in keys:
key = str(key)
# Write the string prefixed by its length (pascal!)
write_packed(f, ">I", len(key))
f.write(key.encode())
# Write offset
write_packed(f, ">I", base)
base += self.indexes[key].bytes_required()
# Write value size
write_packed(f, ">I", self.indexes[key].value_size)
# And finally write each index in order
for key in keys:
self.indexes[key].write(f)
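# A minimal usage sketch (not part of the original module), mirroring
# interval_index_file_tests.py; 'test.index' is a hypothetical path:
#
#     ix = Indexes()
#     ix.add('chr1', 100, 200, 0)
#     with open('test.index', 'wb') as f:
#         ix.write(f)
#     hits = Indexes('test.index').find('chr1', 150, 160)
#     # hits == [(100, 200, 0)]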
class Index:
def __init__(self, min=MIN, max=DEFAULT_MAX, filename=None, offset=0, value_size=None, version=None):
self._value_size = value_size
self.max_val = 1 # (1, rather than 0, to force value_size > 0)
if filename is None:
self.new(min, max)
else:
self.open(filename, offset, version)
def get_value_size(self):
if self._value_size is not None:
return self._value_size
else:
return round_up_to_4(bytes_of(self.max_val))
value_size = property(fget=get_value_size)
def new(self, min, max):
"""Create an empty index for intervals in the range min, max"""
# Ensure the range will fit given the shifting strategy
assert MIN <= min <= max <= MAX
self.min = min
self.max = max
# Determine offsets to use
self.offsets = offsets_for_max_size(max)
# Determine the largest bin we will actually use
self.bin_count = bin_for_range(max - 1, max, offsets=self.offsets) + 1
# Create empty bins
self.bins = [[] for i in range(self.bin_count)]
def open(self, filename, offset, version):
self.filename = filename
self.offset = offset
# Open the file and seek to where we expect our header
f = open(filename, 'rb')
f.seek(offset)
# Read min/max
min, max = read_packed(f, ">2I")
self.new(min, max)
# Decide how many levels of bins based on 'max'
if version < 2:
# Prior to version 2 all files used the bins for 512MB
self.offsets = offsets_for_max_size(OLD_MAX - 1)
else:
self.offsets = offsets_for_max_size(max)
# Read bin indexes
self.bin_offsets = []
self.bin_sizes = []
for _ in range(self.bin_count):
o, s = read_packed(f, ">2I")
self.bin_offsets.append(o)
self.bin_sizes.append(s)
# Initialize bins to None, indicating that they need to be loaded
self.bins = [None for _ in range(self.bin_count)]
def add(self, start, end, val):
"""Add the interval (start,end) with associated value val to the index"""
insort(self.bins[bin_for_range(start, end, offsets=self.offsets)], (start, end, val))
assert val >= 0
self.max_val = max(self.max_val, val)
def find(self, start, end):
rval = []
start_bin = (max(start, self.min)) >> BIN_FIRST_SHIFT
end_bin = (min(end, self.max) - 1) >> BIN_FIRST_SHIFT
for offset in self.offsets:
for i in range(start_bin + offset, end_bin + offset + 1):
if self.bins[i] is None:
self.load_bin(i)
# Iterate over bin and insert any overlapping elements into return value
for el_start, el_end, val in self.bins[i]:
if el_start < end and el_end > start:
insort_right(rval, (el_start, el_end, val))
start_bin >>= BIN_NEXT_SHIFT
end_bin >>= BIN_NEXT_SHIFT
return rval
def iterate(self):
for i in range(self.bin_count):
if self.bins[i] is None:
self.load_bin(i)
yield from self.bins[i]
def load_bin(self, index):
bin = []
if self.bin_sizes[index] == 0:
self.bins[index] = bin
return
f = open(self.filename, 'rb')
f.seek(self.bin_offsets[index])
# One big read for happy NFS
item_size = self.value_size + calcsize(">2I")
buffer = f.read(self.bin_sizes[index] * item_size)
for i in range(self.bin_sizes[index]):
start, end = unpack(">2I", buffer[i*item_size:i*item_size+8])
val = unpack_uints(buffer[i*item_size+8:(i+1)*item_size])
bin.append((start, end, val))
self.bins[index] = bin
f.close()
def write(self, f):
value_size = self.value_size
item_size = value_size + calcsize(">2I")
# Write min/max
write_packed(f, ">2I", self.min, self.max)
# Write table of bin sizes and offsets
base = f.tell() + self.bin_count * calcsize(">2I")
for bin in self.bins:
write_packed(f, ">2I", base, len(bin))
base += len(bin) * item_size
# Write contents of each bin
for bin in self.bins:
for start, end, val in bin:
write_packed(f, ">2I", start, end)
write_packed_uints(f, val, value_size)
def bytes_required(self):
item_size = self.value_size + calcsize(">2I")
rval = calcsize(">2I")
rval += self.bin_count * calcsize(">2I")
for bin in self.bins:
rval += len(bin) * item_size
return rval
def write_packed(f, pattern, *vals):
f.write(pack(pattern, *vals))
def read_packed(f, pattern):
rval = unpack(pattern, f.read(calcsize(pattern)))
if len(rval) == 1:
return rval[0]
return rval
def write_packed_uints(f, v, num_bytes):
if num_bytes < 4:
write_packed(f, ">I", v)
else:
parts = []
while num_bytes > 0:
parts.append(v & 0xFFFFFFFF)
v >>= 32
num_bytes -= 4
parts.reverse() # (write most-significant chunk first)
write_packed(f, ">%dI" % len(parts), *parts)
def unpack_uints(parts):
    chunks = len(parts) // 4
vals = unpack(">%dI" % chunks, parts)
val = vals[0]
for v in vals[1:]:
val = (val << 32) + v
return val
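# Illustrative round trip (not part of the original module): writing the
# value 1 with num_bytes=8 emits the big-endian chunks (0, 1), i.e. the
# bytes 00 00 00 00 00 00 00 01, and unpack_uints() of those 8 bytes
# recovers 1.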
def bytes_of(v):
assert v > 0
b = 0
while v > 0:
v >>= 8
b += 1
return b
def round_up_to_4(v):
if v % 4 == 0:
return v
else:
return v + 4 - (v % 4)
bx-python-0.8.13/lib/bx/interval_index_file_tests.py 0000664 0000000 0000000 00000003025 14156664651 0022536 0 ustar 00root root 0000000 0000000 import random
from tempfile import mktemp
from . import interval_index_file
from .interval_index_file import Indexes
def test_offsets():
assert interval_index_file.offsets_for_max_size(512*1024*1024 - 1) == [512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0]
def test_interval_index_file():
ix = Indexes()
chrs = []
for i in range(5):
intervals = []
name = "seq%d" % i
max = random.randint(0, interval_index_file.MAX)
# print name, "size", max
for i in range(500):
start = random.randint(0, max)
end = random.randint(0, max)
if end < start:
end, start = start, end
ix.add(name, start, end, i, max=interval_index_file.MAX)
intervals.append((start, end, i))
chrs.append(intervals)
fname = mktemp()
f = open(fname, "wb")
ix.write(f)
f.close()
del ix
ix = Indexes(fname)
for i in range(5):
intervals = chrs[i]
name = "seq%d" % i
for i in range(100):
start = random.randint(0, max)
end = random.randint(0, max)
if end < start:
end, start = start, end
query_intervals = set()
for (s, e, i) in intervals:
if e > start and s < end:
query_intervals.add((s, e, i))
result = ix.find(name, start, end)
for inter in result:
assert inter in query_intervals
def test_zero():
ix = Indexes()
ix.add("t.idx", 0, 0, 1, 123)
bx-python-0.8.13/lib/bx/intervals/ 0000775 0000000 0000000 00000000000 14156664651 0016737 5 ustar 00root root 0000000 0000000 bx-python-0.8.13/lib/bx/intervals/__init__.py 0000664 0000000 0000000 00000000331 14156664651 0021045 0 ustar 00root root 0000000 0000000 """
Tools and data structures for working with genomic intervals (or sets of
regions on a line in general) efficiently.
"""
# For compatibility with existing stuff
from bx.intervals.intersection import * # noqa: F40
bx-python-0.8.13/lib/bx/intervals/cluster.pyx 0000664 0000000 0000000 00000007223 14156664651 0021166 0 ustar 00root root 0000000 0000000 """
Kanwei Li, 2009
Inspired by previous ClusterTree
Provides a ClusterTree data structure that supports efficient finding of
clusters of intervals that are within a certain distance apart.
This clustering algorithm uses a binary tree structure. Nodes correspond to
non-overlapping intervals, where overlapping means that the distance between
two intervals is less than or equal to the maximum separation.
The tree self-balances using rotations based on the binomial sequence. Merges
among nodes are performed whenever a node is changed/added that will cause other
nodes to form a new cluster.
C source code is in src/cluster.c
"""
cdef extern from "cluster.h":
cdef struct struct_interval:
int start
int end
int id
struct_interval * next
ctypedef struct_interval interval
cdef struct struct_clusternode:
int start
int end
struct_interval *interval_head
struct_interval *interval_tail
ctypedef struct_clusternode clusternode
cdef struct struct_clustertree:
int max_dist
int min_intervals
struct_clusternode *root
ctypedef struct_clustertree clustertree
cdef struct struct_treeitr:
struct_treeitr *next
struct_clusternode *node
ctypedef struct_treeitr treeitr
clusternode* clusternode_insert(clustertree *tree, clusternode *node, int start, int end, int id)
clustertree* create_clustertree(int max_dist, int min_intervals)
treeitr* clusteritr(clustertree *tree)
void freeclusteritr(treeitr *itr)
void free_tree(clustertree *tree)
cdef class ClusterTree:
cdef clustertree *tree
cdef int mincols
cdef int minregions
def __cinit__(self, mincols, minregions):
self.tree = create_clustertree(mincols, minregions)
self.mincols = mincols
self.minregions = minregions
def __dealloc__(self):
free_tree(self.tree)
def insert(self, s, e, id):
''' Insert an interval with start, end, id as parameters'''
if s > e: raise ValueError("Interval start must be before end")
self.tree.root = clusternode_insert(self.tree, self.tree.root, s, e, id)
def getregions(self):
''' Returns a list clusters in ascending order of starting position.
Each cluster is a tuple of (start, end, [sorted ids of intervals in cluster])
tree = ClusterTree(0, 0)
Insert (6, 7, 1), (1, 2, 3), (9, 10, 2), (3, 4, 0), (3, 8, 4)
tree.getregions() returns [(1, 2, [3]), (3, 8, [0, 1, 4]), (9, 10, [2])]
'''
cdef treeitr *itr
cdef interval *ival
regions = []
itr = clusteritr(self.tree)
while (itr):
ids = []
ival = itr.node.interval_head
while (ival):
ids.append(ival.id)
ival = ival.next
regions.append( (itr.node.start, itr.node.end, sorted(ids)) )
itr = itr.next
freeclusteritr(itr)
return regions
def getlines(self):
''' Similar to getregions except it just returns a list of ids of intervals
The above example would return [3, 0, 1, 4, 2]
'''
cdef treeitr *itr
cdef interval *ival
lines = []
itr = clusteritr(self.tree)
while (itr):
ids = []
ival = itr.node.interval_head
while (ival):
ids.append(ival.id)
ival = ival.next
lines.extend(sorted(ids))
itr = itr.next
freeclusteritr(itr)
return lines
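# A minimal usage sketch (not part of the original module), following the
# example in the getregions() docstring above:
#
#     tree = ClusterTree(0, 0)
#     for start, end, id in [(6, 7, 1), (1, 2, 3), (9, 10, 2), (3, 4, 0), (3, 8, 4)]:
#         tree.insert(start, end, id)
#     tree.getregions()   # -> [(1, 2, [3]), (3, 8, [0, 1, 4]), (9, 10, [2])]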
bx-python-0.8.13/lib/bx/intervals/cluster_tests.py 0000664 0000000 0000000 00000007145 14156664651 0022223 0 ustar 00root root 0000000 0000000 import os
import sys
import unittest
try:
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
except Exception:
sys.path.insert(0, os.path.dirname(os.path.abspath(".")))
# from bx.intervals.cluster import ClusterTree
from .cluster import ClusterTree
class TestCluster(unittest.TestCase):
def setUp(self):
self.tree = ClusterTree(0, 0)
def insertpairs(self, pairs):
for i, (s, e) in enumerate(pairs):
self.tree.insert(s, e, i)
def test_merge_case(self):
pairs = [(3, 4), (6, 7), (9, 10), (1, 2), (3, 8)]
self.insertpairs(pairs)
self.assertEqual([(1, 2, [3]), (3, 8, [0, 1, 4]), (9, 10, [2])], self.tree.getregions())
def test_trivial(self):
pairs = [(1, 4), (4, 5)]
self.insertpairs(pairs)
self.assertEqual([(1, 5, [0, 1])], self.tree.getregions())
def test_easymerge(self):
pairs = [(1, 2), (4, 5), (2, 4)]
self.insertpairs(pairs)
self.assertEqual([(1, 5, [0, 1, 2])], self.tree.getregions())
def test_hardmerge(self):
pairs = [(1, 2), (8, 9), (3, 4), (5, 6), (7, 8), (1, 10)]
self.insertpairs(pairs)
self.assertEqual([(1, 10, [0, 1, 2, 3, 4, 5])], self.tree.getregions())
def test_duplicates(self):
pairs = [(1, 1), (1, 2), (3, 4), (3, 4), (1, 4)]
self.insertpairs(pairs)
self.assertEqual([(1, 4, [0, 1, 2, 3, 4])], self.tree.getregions())
def test_startbeforeend(self):
self.assertRaises(ValueError, self.tree.insert, 4, 2, 0)
def test_large_sorted(self):
upto = 100000
pairs = [(2*i + 1, 2*i + 2) for i in range(upto)]
self.insertpairs(pairs)
self.tree.insert(0, upto*3, upto)
self.assertEqual([(0, upto*3, [x for x in range(upto+1)])], self.tree.getregions())
def test_minregions(self):
self.tree = ClusterTree(0, 2)
pairs = [(3, 4), (6, 7), (9, 10), (1, 2), (3, 8)]
self.insertpairs(pairs)
self.assertEqual([(3, 8, [0, 1, 4])], self.tree.getregions())
def test_distance(self):
self.tree = ClusterTree(1, 0)
pairs = [(3, 4), (6, 7), (9, 10), (1, 2), (3, 8)]
self.insertpairs(pairs)
self.assertEqual([(1, 10, [0, 1, 2, 3, 4])], self.tree.getregions())
def test_merge_left_right(self):
pairs = [(6, 7, 1), (1, 2, 3), (9, 10, 2), (3, 4, 0), (3, 8, 4)]
for s, e, i in pairs:
self.tree.insert(s, e, i)
self.assertEqual([(1, 2, [3]), (3, 8, [0, 1, 4]), (9, 10, [2])], self.tree.getregions())
def test_larger(self):
pairs = [(1, 2), (3, 4), (5, 6), (7, 8), (9, 10), (11, 12), (13, 14), (15, 16), (17, 18), (19, 20),
(1, 3), (4, 10), (10, 15), (15, 20), (21, 22)]
self.insertpairs(pairs)
self.assertEqual([(1, 20, [x for x in range(14)]), (21, 22, [14])], self.tree.getregions())
def test_another(self):
pairs = [(3, 4, 1), (13, 14, 6), (21, 22, 14), (5, 6, 2), (4, 10, 11), (1, 2, 0), (11, 12, 5), (1, 3, 10), (7, 8, 3), (15, 16, 7), (15, 20, 13), (19, 20, 9), (10, 15, 12), (17, 18, 8), (9, 10, 4)]
# pairs = [(3, 4, 1), (13, 14, 6), (21, 22, 14), (5, 6, 2), (4, 10, 11), (1, 2, 0), (11, 12, 5), (1, 3, 10), (7, 8, 3), (15, 16, 7), (15, 20, 13), (19, 20, 9), (10, 15, 12), (9, 10, 4)]
for s, e, i in pairs:
self.tree.insert(s, e, i)
self.assertEqual([(1, 20, [x for x in range(14)]), (21, 22, [14])], self.tree.getregions())
def test_none(self):
pairs = []
self.insertpairs(pairs)
self.assertEqual([], self.tree.getregions())
if __name__ == '__main__':
unittest.main()
bx-python-0.8.13/lib/bx/intervals/intersection.pyx 0000664 0000000 0000000 00000041437 14156664651 0022220 0 ustar 00root root 0000000 0000000 """
Data structure for performing intersect queries on a set of intervals which
preserves all information about the intervals (unlike bitset projection methods).
:Authors: James Taylor (james@jamestaylor.org),
Ian Schenk (ian.schenck@gmail.com),
Brent Pedersen (bpederse@gmail.com)
"""
# Historical note:
# This module originally contained an implementation based on sorted endpoints
# and a binary search, using an idea from Scott Schwartz and Piotr Berman.
# Later an interval tree implementation was implemented by Ian for Galaxy's
# join tool (see `bx.intervals.operations.quicksect.py`). This was then
# converted to Cython by Brent, who also added support for
# upstream/downstream/neighbor queries. This was modified by James to
# handle half-open intervals strictly, to maintain sort order, and to
# implement the same interface as the original Intersecter.
#cython: cdivision=True
import operator
cdef extern from "stdlib.h":
int ceil(float f)
float log(float f)
int RAND_MAX
int rand()
int strlen(char *)
int iabs(int)
cdef inline int imax2(int a, int b):
if b > a: return b
return a
cdef inline int imax3(int a, int b, int c):
if b > a:
if c > b:
return c
return b
if a > c:
return a
return c
cdef inline int imin3(int a, int b, int c):
if b < a:
if c < b:
return c
return b
if a < c:
return a
return c
cdef inline int imin2(int a, int b):
if b < a: return b
return a
cdef float nlog = -1.0 / log(0.5)
cdef class IntervalNode:
"""
A single node of an `IntervalTree`.
    NOTE: Unless you really know what you are doing, you probably should use
`IntervalTree` rather than using this directly.
"""
cdef float priority
cdef public object interval
cdef public int start, end
cdef int minend, maxend, minstart
cdef IntervalNode cleft, cright, croot
property left_node:
def __get__(self):
return self.cleft if self.cleft is not EmptyNode else None
property right_node:
def __get__(self):
return self.cright if self.cright is not EmptyNode else None
property root_node:
def __get__(self):
return self.croot if self.croot is not EmptyNode else None
def __repr__(self):
return "IntervalNode(%i, %i)" % (self.start, self.end)
def __cinit__(IntervalNode self, int start, int end, object interval):
# Python lacks the binomial distribution, so we convert a
# uniform into a binomial because it naturally scales with
# tree size. Also, python's uniform is perfect since the
# upper limit is not inclusive, which gives us undefined here.
self.priority = ceil(nlog * log(-1.0/(1.0 * rand()/RAND_MAX - 1)))
self.start = start
self.end = end
self.interval = interval
self.maxend = end
self.minstart = start
self.minend = end
self.cleft = EmptyNode
self.cright = EmptyNode
self.croot = EmptyNode
cpdef IntervalNode insert(IntervalNode self, int start, int end, object interval):
"""
Insert a new IntervalNode into the tree of which this node is
currently the root. The return value is the new root of the tree (which
may or may not be this node!)
"""
cdef IntervalNode croot = self
# If starts are the same, decide which to add interval to based on
# end, thus maintaining sortedness relative to start/end
cdef int decision_endpoint = start
if start == self.start:
decision_endpoint = end
if decision_endpoint > self.start:
# insert to cright tree
if self.cright is not EmptyNode:
self.cright = self.cright.insert( start, end, interval )
else:
self.cright = IntervalNode( start, end, interval )
# rebalance tree
if self.priority < self.cright.priority:
croot = self.rotate_left()
else:
# insert to cleft tree
if self.cleft is not EmptyNode:
self.cleft = self.cleft.insert( start, end, interval)
else:
self.cleft = IntervalNode( start, end, interval)
# rebalance tree
if self.priority < self.cleft.priority:
croot = self.rotate_right()
croot.set_ends()
self.cleft.croot = croot
self.cright.croot = croot
return croot
cdef IntervalNode rotate_right(IntervalNode self):
cdef IntervalNode croot = self.cleft
self.cleft = self.cleft.cright
croot.cright = self
self.set_ends()
return croot
cdef IntervalNode rotate_left(IntervalNode self):
cdef IntervalNode croot = self.cright
self.cright = self.cright.cleft
croot.cleft = self
self.set_ends()
return croot
cdef inline void set_ends(IntervalNode self):
if self.cright is not EmptyNode and self.cleft is not EmptyNode:
self.maxend = imax3(self.end, self.cright.maxend, self.cleft.maxend)
self.minend = imin3(self.end, self.cright.minend, self.cleft.minend)
self.minstart = imin3(self.start, self.cright.minstart, self.cleft.minstart)
elif self.cright is not EmptyNode:
self.maxend = imax2(self.end, self.cright.maxend)
self.minend = imin2(self.end, self.cright.minend)
self.minstart = imin2(self.start, self.cright.minstart)
elif self.cleft is not EmptyNode:
self.maxend = imax2(self.end, self.cleft.maxend)
self.minend = imin2(self.end, self.cleft.minend)
self.minstart = imin2(self.start, self.cleft.minstart)
def intersect( self, int start, int end, sort=True ):
"""
        given a start and an end, return a list of features
falling within that range
"""
cdef list results = []
self._intersect( start, end, results )
return results
find = intersect
cdef void _intersect( IntervalNode self, int start, int end, list results):
# Left subtree
if self.cleft is not EmptyNode and self.cleft.maxend > start:
self.cleft._intersect( start, end, results )
# This interval
if ( self.end > start ) and ( self.start < end ):
results.append( self.interval )
# Right subtree
if self.cright is not EmptyNode and self.start < end:
self.cright._intersect( start, end, results )
cdef void _seek_left(IntervalNode self, int position, list results, int n, int max_dist):
# we know we can bail in these 2 cases.
if self.maxend + max_dist < position:
return
if self.minstart > position:
return
# the ordering of these 3 blocks makes it so the results are
        # ordered nearest to farthest from the query position
if self.cright is not EmptyNode:
self.cright._seek_left(position, results, n, max_dist)
if -1 < position - self.end < max_dist:
results.append(self.interval)
# TODO: can these conditionals be more stringent?
if self.cleft is not EmptyNode:
self.cleft._seek_left(position, results, n, max_dist)
cdef void _seek_right(IntervalNode self, int position, list results, int n, int max_dist):
# we know we can bail in these 2 cases.
if self.maxend < position: return
if self.minstart - max_dist > position: return
#print "SEEK_RIGHT:",self, self.cleft, self.maxend, self.minstart, position
# the ordering of these 3 blocks makes it so the results are
        # ordered nearest to farthest from the query position
if self.cleft is not EmptyNode:
self.cleft._seek_right(position, results, n, max_dist)
if -1 < self.start - position < max_dist:
results.append(self.interval)
if self.cright is not EmptyNode:
self.cright._seek_right(position, results, n, max_dist)
cpdef left(self, position, int n=1, int max_dist=2500):
"""
        find up to n features strictly to the left of `position`
        (i.e. features whose end is before `position`)
        position: the query position
n: the number of features to return
max_dist: the maximum distance to look before giving up.
"""
cdef list results = []
# use position - 1 because .left() assumes strictly left-of
self._seek_left( position - 1, results, n, max_dist )
if len(results) == n: return results
r = results
r.sort(key=operator.attrgetter('end'), reverse=True)
return r[:n]
cpdef right(self, position, int n=1, int max_dist=2500):
"""
find n features whose start is > `position`, i.e. features that lie
strictly to the right of `position` (an integer position)
n: the number of features to return
max_dist: the maximum distance to look before giving up.
"""
cdef list results = []
# use position + 1 because .right() assumes strictly right-of
self._seek_right(position + 1, results, n, max_dist)
if len(results) == n: return results
r = results
r.sort(key=operator.attrgetter('start'))
return r[:n]
def traverse(self, func):
self._traverse(func)
cdef void _traverse(IntervalNode self, object func):
if self.cleft is not EmptyNode: self.cleft._traverse(func)
func(self)
if self.cright is not EmptyNode: self.cright._traverse(func)
cdef IntervalNode EmptyNode = IntervalNode( 0, 0, Interval(0, 0))
## ---- Wrappers that retain the old interface -------------------------------
cdef class Interval:
"""
Basic feature, with required integer start and end properties.
Also accepts optional strand as +1 or -1 (used for up/downstream queries),
a name, and any arbitrary data is sent in on the info keyword argument
>>> from bx.intervals.intersection import Interval
>>> from collections import OrderedDict
>>> f1 = Interval(23, 36)
>>> f2 = Interval(34, 48, value=OrderedDict([('chr', 12), ('anno', 'transposon')]))
>>> f2
Interval(34, 48, value=OrderedDict([('chr', 12), ('anno', 'transposon')]))
"""
cdef public int start, end
cdef public object value, chrom, strand
def __init__(self, int start, int end, object value=None, object chrom=None, object strand=None ):
assert start <= end, "start must be less than or equal to end"
self.start = start
self.end = end
self.value = value
self.chrom = chrom
self.strand = strand
def __repr__(self):
fstr = "Interval(%d, %d" % (self.start, self.end)
if self.value is not None:
fstr += ", value=" + str(self.value)
fstr += ")"
return fstr
def __richcmp__(self, other, op):
if op == 0:
# <
return self.start < other.start or self.end < other.end
elif op == 1:
# <=
return self == other or self < other
elif op == 2:
# ==
return self.start == other.start and self.end == other.end
elif op == 3:
# !=
return self.start != other.start or self.end != other.end
elif op == 4:
# >
return self.start > other.start or self.end > other.end
elif op == 5:
# >=
return self == other or self > other
cdef class IntervalTree:
"""
Data structure for performing window intersect queries on a set of
possibly overlapping 1d intervals.
Usage
=====
Create an empty IntervalTree
>>> from bx.intervals.intersection import Interval, IntervalTree
>>> intersecter = IntervalTree()
An interval is a start and end position and a value (possibly None).
You can add any object as an interval:
>>> intersecter.insert( 0, 10, "food" )
>>> intersecter.insert( 3, 7, dict(foo='bar') )
>>> intersecter.find( 2, 5 )
['food', {'foo': 'bar'}]
If the object has start and end attributes (like the Interval class) there
are some shortcuts:
>>> intersecter = IntervalTree()
>>> intersecter.insert_interval( Interval( 0, 10 ) )
>>> intersecter.insert_interval( Interval( 3, 7 ) )
>>> intersecter.insert_interval( Interval( 3, 40 ) )
>>> intersecter.insert_interval( Interval( 13, 50 ) )
>>> intersecter.find( 30, 50 )
[Interval(3, 40), Interval(13, 50)]
>>> intersecter.find( 100, 200 )
[]
Before/after for intervals
>>> intersecter.before_interval( Interval( 10, 20 ) )
[Interval(3, 7)]
>>> intersecter.before_interval( Interval( 5, 20 ) )
[]
Upstream/downstream
>>> intersecter.upstream_of_interval(Interval(11, 12))
[Interval(0, 10)]
>>> intersecter.upstream_of_interval(Interval(11, 12, strand="-"))
[Interval(13, 50)]
>>> intersecter.upstream_of_interval(Interval(1, 2, strand="-"), num_intervals=3)
[Interval(3, 7), Interval(3, 40), Interval(13, 50)]
"""
cdef IntervalNode root
def __cinit__( self ):
self.root = None
# ---- Position based interfaces -----------------------------------------
def insert( self, int start, int end, object value=None ):
"""
Insert the interval [start,end) associated with value `value`.
"""
if self.root is None:
self.root = IntervalNode( start, end, value )
else:
self.root = self.root.insert( start, end, value )
add = insert
def find( self, start, end ):
"""
Return a sorted list of all intervals overlapping [start,end).
"""
if self.root is None:
return []
return self.root.find( start, end )
def before( self, position, num_intervals=1, max_dist=2500 ):
"""
Find `num_intervals` intervals that lie before `position` and are no
further than `max_dist` positions away
"""
if self.root is None:
return []
return self.root.left( position, num_intervals, max_dist )
def after( self, position, num_intervals=1, max_dist=2500 ):
"""
Find `num_intervals` intervals that lie after `position` and are no
further than `max_dist` positions away
"""
if self.root is None:
return []
return self.root.right( position, num_intervals, max_dist )
# ---- Interval-like object based interfaces -----------------------------
def insert_interval( self, interval ):
"""
Insert an "interval" like object (one with at least start and end
attributes)
"""
self.insert( interval.start, interval.end, interval )
add_interval = insert_interval
def before_interval( self, interval, num_intervals=1, max_dist=2500 ):
"""
Find `num_intervals` intervals that lie completely before `interval`
and are no further than `max_dist` positions away
"""
if self.root is None:
return []
return self.root.left( interval.start, num_intervals, max_dist )
def after_interval( self, interval, num_intervals=1, max_dist=2500 ):
"""
Find `num_intervals` intervals that lie completely after `interval` and
are no further than `max_dist` positions away
"""
if self.root is None:
return []
return self.root.right( interval.end, num_intervals, max_dist )
def upstream_of_interval( self, interval, num_intervals=1, max_dist=2500 ):
"""
Find `num_intervals` intervals that lie completely upstream of
`interval` and are no further than `max_dist` positions away
"""
if self.root is None:
return []
if interval.strand == -1 or interval.strand == "-":
return self.root.right( interval.end, num_intervals, max_dist )
else:
return self.root.left( interval.start, num_intervals, max_dist )
def downstream_of_interval( self, interval, num_intervals=1, max_dist=2500 ):
"""
Find `num_intervals` intervals that lie completely downstream of
`interval` and are no further than `max_dist` positions away
"""
if self.root is None:
return []
if interval.strand == -1 or interval.strand == "-":
return self.root.left( interval.start, num_intervals, max_dist )
else:
return self.root.right( interval.end, num_intervals, max_dist )
def traverse(self, fn):
"""
call fn for each element in the tree
"""
if self.root is None:
return None
return self.root.traverse(fn)
# For backward compatibility
Intersecter = IntervalTree
bx-python-0.8.13/lib/bx/intervals/intersection_tests.py 0000664 0000000 0000000 00000014371 14156664651 0023247 0 ustar 00root root 0000000 0000000 import os
import sys
import unittest
try:
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
except Exception:
sys.path.insert(0, os.path.dirname(os.path.abspath(".")))
from bx.intervals.intersection import Interval
from bx.intervals.intersection import IntervalNode
from bx.intervals.intersection import IntervalTree
class NeighborTestCase(unittest.TestCase):
def setUp(self):
iv = IntervalNode(50, 59, Interval(50, 59))
for i in range(0, 110, 10):
if i == 50:
continue
f = Interval(i, i + 9)
iv = iv.insert(f.start, f.end, f)
self.intervals = iv
def test_left(self):
iv = self.intervals
self.assertEqual(str(iv.left(60, n=2)), str([Interval(50, 59), Interval(40, 49)]))
for i in range(10, 100, 10):
r = iv.left(i, max_dist=10, n=1)
self.assertEqual(r[0].end, i - 1)
def test_toomany(self):
iv = self.intervals
self.assertEqual(len(iv.left(60, n=200)), 6)
def test_right(self):
iv = self.intervals
self.assertEqual(str(iv.left(60, n=2)), str([Interval(50, 59), Interval(40, 49)]))
def get_right_start(b10):
r = iv.right(b10+1, n=1)
assert len(r) == 1
return r[0].start
for i in range(10, 100, 10):
self.assertEqual(get_right_start(i), i + 10)
for i in range(0, 100, 10):
r = iv.right(i-1, max_dist=10, n=1)
print(r)
self.assertEqual(r[0].start, i)
class UpDownStreamTestCase(unittest.TestCase):
def setUp(self):
iv = IntervalTree()
iv.add_interval(Interval(50, 59))
for i in range(0, 110, 10):
if i == 50:
continue
f = Interval(i, i + 9)
iv.add_interval(f)
self.intervals = iv
def test_upstream(self):
iv = self.intervals
upstreams = iv.upstream_of_interval(Interval(59, 60), num_intervals=200)
for u in upstreams:
self.assertTrue(u.end < 59)
upstreams = iv.upstream_of_interval(Interval(60, 70, strand=-1), num_intervals=200)
for u in upstreams:
self.assertTrue(u.start > 70)
upstreams = iv.upstream_of_interval(Interval(58, 58, strand=-1), num_intervals=200)
for u in upstreams:
self.assertTrue(u.start > 59)
def test_downstream(self):
iv = self.intervals
downstreams = iv.downstream_of_interval(Interval(59, 60), num_intervals=200)
for d in downstreams:
self.assertTrue(d.start > 60)
downstreams = iv.downstream_of_interval(Interval(59, 60, strand=-1), num_intervals=200)
for d in downstreams:
self.assertTrue(d.start < 59)
def test_n(self):
iv = self.intervals
for i in range(0, 90, 10):
r = iv.after(i, max_dist=20, num_intervals=2)
self.assertEqual(r[0].start, i + 10)
self.assertEqual(r[1].start, i + 20)
r = iv.after_interval(Interval(i, i), max_dist=20, num_intervals=2)
self.assertEqual(r[0].start, i + 10)
self.assertEqual(r[1].start, i + 20)
class LotsaTestCase(unittest.TestCase):
""" put lotsa data in the tree and make sure it works"""
def setUp(self):
iv = IntervalNode(1, 2, Interval(1, 2))
self.max = 1000000
for i in range(0, self.max, 10):
f = Interval(i, i)
iv = iv.insert(f.start, f.end, f)
for i in range(600):
iv = iv.insert(0, 1, Interval(0, 1))
self.intervals = iv
def test_count(self):
iv = self.intervals
r = iv.right(1, n=33)
self.assertEqual(len(r), 33)
left = iv.left(1, n=33)
self.assertEqual(len(left), 1)
u = iv.right(1, n=9999)
self.assertEqual(len(u), 250)
# now increase max_dist
u = iv.right(1, n=9999, max_dist=99999)
self.assertEqual(len(u), 9999)
def test_max_dist(self):
iv = self.intervals
r = iv.right(1, max_dist=0, n=10)
self.assertEqual(len(r), 0)
for n, d in enumerate(range(10, 1000, 10)):
r = iv.right(1, max_dist=d, n=10000)
self.assertEqual(len(r), n + 1)
def test_find(self):
iv = self.intervals
path = sys.path[:]
sys.path = sys.path[2:]
random = __import__("random")
sys.path = path
for t in range(25):
start = random.randint(0, self.max - 10000)
end = start + random.randint(100, 10000)
results = iv.find(start, end)
for feat in results:
self.assertTrue(
(feat.end >= start and feat.end <= end)
or (feat.start <= end and feat.start >= start))
class IntervalTreeTest(unittest.TestCase):
def setUp(self):
iv = IntervalTree()
n = 0
for i in range(1, 1000, 80):
iv.insert(i, i + 10, dict(value=i*i))
# add is synonym for insert.
iv.add(i + 20, i + 30, dict(astr=str(i*i)))
# or insert/add an interval object with start, end attrs.
iv.insert_interval(Interval(i + 40, i + 50, value=dict(astr=str(i*i))))
iv.add_interval(Interval(i + 60, i + 70, value=dict(astr=str(i*i))))
n += 4
self.intervals = self.iv = iv
self.nintervals = n
def test_find(self):
r = self.iv.find(100, 200)
self.assertEqual(len(r), 5)
def test_traverse(self):
a = []
fn = a.append
self.iv.traverse(fn)
self.assertEqual(len(a), self.nintervals)
def test_empty(self):
iv = IntervalTree()
self.assertEqual([], iv.find(100, 300))
self.assertEqual([], iv.after(100))
self.assertEqual([], iv.before(100))
self.assertEqual([], iv.after_interval(100))
self.assertEqual([], iv.before_interval(100))
self.assertEqual([], iv.upstream_of_interval(100))
self.assertEqual([], iv.downstream_of_interval(100))
self.assertEqual(None, iv.traverse(lambda x: x.append(1)))
def test_public_interval(self):
def fn(ival):
return self.assertTrue(ival.interval)
self.iv.traverse(fn)
if __name__ == "__main__":
unittest.main()
bx-python-0.8.13/lib/bx/intervals/io.py 0000664 0000000 0000000 00000024736 14156664651 0017734 0 ustar 00root root 0000000 0000000 """
Support for reading and writing genomic intervals from delimited text files.
"""
from bx.bitset import (
BinnedBitSet,
MAX
)
from bx.tabular.io import (
ParseError,
TableReader,
TableRow,
)
class MissingFieldError(ParseError):
pass
class FieldFormatError(ParseError):
def __init__(self, *args, **kwargs):
ParseError.__init__(self, *args, **kwargs)
self.expected = kwargs.get("expected", None)
def __str__(self):
if self.expected:
return ParseError.__str__(self) + ", " + self.expected + " expected"
else:
return ParseError.__str__(self)
class StrandFormatError(ParseError):
pass
class GenomicInterval(TableRow):
"""
A genomic interval stored in a set of fields (a row of a table)
"""
def __init__(self, reader, fields, chrom_col, start_col, end_col, strand_col, default_strand, fix_strand=False):
TableRow.__init__(self, reader, fields)
self.chrom_col = chrom_col
self.start_col = start_col
self.end_col = end_col
self.strand_col = strand_col
self.nfields = nfields = len(fields)
# Parse chrom/source column
if chrom_col >= nfields:
raise MissingFieldError("No field for chrom_col (%d)" % chrom_col)
self.chrom = fields[chrom_col].strip()
# Parse start column and ensure it is an integer
if start_col >= nfields:
raise MissingFieldError("No field for start_col (%d)" % start_col)
try:
self.start = int(fields[start_col])
except ValueError as e:
raise FieldFormatError("Could not parse start_col: " + str(e), expected="integer")
# Parse end column and ensure it is an integer
if end_col >= nfields:
raise MissingFieldError("No field for end_col (%d)" % end_col)
try:
self.end = int(fields[end_col])
except ValueError as e:
raise FieldFormatError("Could not parse end_col: " + str(e), expected="integer")
# Ensure start <= end
if self.end < self.start:
raise ParseError("Start is greater than End. Interval length is < 1.")
# Parse strand and ensure it is valid
if strand_col >= nfields or strand_col < 0:
# This should probably be immutable since the fields are
# not updated when it is set
self.strand = default_strand
else:
strand = fields[strand_col]
if strand == ".":
strand = default_strand
elif strand not in ("+", "-"):
if fix_strand:
strand = "+"
else:
raise StrandFormatError("Strand must be either '+' or '-'")
self.strand = strand
def __setattr__(self, name, value):
if name == "chrom":
self.fields[self.chrom_col] = str(value)
elif name == "start":
self.fields[self.start_col] = str(value)
elif name == "end":
self.fields[self.end_col] = str(value)
elif name == "strand":
if self.strand_col < self.nfields and self.strand_col >= 0:
self.fields[self.strand_col] = str(value)
object.__setattr__(self, name, value)
def __str__(self):
return "\t".join(self.fields)
def copy(self):
return GenomicInterval(self.reader, list(self.fields), self.chrom_col, self.start_col, self.end_col, self.strand_col, self.strand)
class GenomicIntervalReader(TableReader):
"""
Reader for iterating a set of intervals in a tab separated file. Can
also parse header and comment lines if requested.
>>> from bx.tabular.io import Comment, Header
>>> r = GenomicIntervalReader( [ "#chrom\\tname\\tstart\\tend\\textra",
... "chr1\\tfoo\\t1\\t100\\txxx",
... "chr2\\tbar\\t20\\t300\\txxx",
... "#I am a comment",
... "chr2\\tbar\\t20\\t300\\txxx" ], start_col=2, end_col=3 )
>>> header = next(r)
>>> elements = list(r)
>>> elements.insert(0, header)
>>> assert isinstance(elements[0], Header)
>>> str(elements[0])
'#chrom\\tname\\tstart\\tend\\textra'
>>> assert isinstance(elements[1], GenomicInterval)
>>> print(elements[1].start, elements[1].end)
1 100
>>> str(elements[1])
'chr1\\tfoo\\t1\\t100\\txxx'
>>> elements[1].start = 30
>>> print(elements[1].start, elements[1].end)
30 100
>>> str(elements[1])
'chr1\\tfoo\\t30\\t100\\txxx'
>>> assert isinstance(elements[2], GenomicInterval)
>>> assert isinstance(elements[3], Comment)
>>> assert isinstance(elements[4], GenomicInterval)
"""
def __init__(self, input, chrom_col=0, start_col=1, end_col=2, strand_col=5,
default_strand="+", return_header=True, return_comments=True, force_header=None, fix_strand=False, comment_lines_startswith=None, allow_spaces=False):
if comment_lines_startswith is None:
comment_lines_startswith = ["#", "track "]
TableReader.__init__(self, input, return_header, return_comments, force_header, comment_lines_startswith)
self.chrom_col = chrom_col
self.start_col = start_col
self.end_col = end_col
self.strand_col = strand_col
self.default_strand = default_strand
self.fix_strand = fix_strand
self.allow_spaces = allow_spaces
def parse_row(self, line):
# Try multiple separators. First tab, our expected splitter, then
# just whitespace in the case of problematic files with space instead of
# tab separation
seps = ["\t"]
if self.allow_spaces:
seps.append(None)
for i, sep in enumerate(seps):
try:
return GenomicInterval(
self, line.split(sep), self.chrom_col, self.start_col,
self.end_col, self.strand_col, self.default_strand,
fix_strand=self.fix_strand)
except Exception as e:
# Catch and store the initial error
if i == 0:
err = e
# Ran out of separators and still have errors, raise our problem
raise err
def binned_bitsets(self, upstream_pad=0, downstream_pad=0, lens=None):
# The incoming lens dictionary is a dictionary of chromosome lengths
# which are used to initialize the bitsets.
if lens is None:
lens = {}
last_chrom = None
last_bitset = None
bitsets = dict()
for interval in self:
if isinstance(interval, GenomicInterval):
chrom = interval[self.chrom_col]
if chrom != last_chrom:
if chrom not in bitsets:
size = lens.get(chrom, MAX)
try:
bbs = BinnedBitSet(size)
except ValueError as e:
# We will only reach here when constructing this bitset from the lens dict
# since the value of MAX is always safe.
raise Exception(f"Invalid chrom length {str(size)} in 'lens' dictionary. {str(e)}")
bitsets[chrom] = bbs
last_chrom = chrom
last_bitset = bitsets[chrom]
start = max(int(interval[self.start_col]), 0)
end = min(int(interval[self.end_col]), last_bitset.size)
last_bitset.set_range(start, end-start)
return bitsets
class NiceReaderWrapper(GenomicIntervalReader):
"""
>>> from bx.tabular.io import Header
>>> r = NiceReaderWrapper(["#chrom\\tname\\tstart\\tend\\textra",
... "chr1\\tfoo\\t1\\t100\\txxx",
... "chr2\\tbar\\t20\\t300\\txxx",
... "#I am a comment",
... "chr2\\tbar\\t20\\t300\\txxx" ], start_col=2, end_col=3 )
>>> assert isinstance(next(r), Header)
>>> assert r.current_line == '#chrom\\tname\\tstart\\tend\\textra', r.current_line
>>> assert len([_ for _ in r]) == 4
"""
def __init__(self, reader, **kwargs):
GenomicIntervalReader.__init__(self, reader, **kwargs)
self.outstream = kwargs.get("outstream", None)
self.print_delegate = kwargs.get("print_delegate", None)
self.input_wrapper = iter(self.input)
self.input_iter = self.iterwrapper()
self.skipped = 0
self.skipped_lines = []
def __iter__(self):
return self
def __next__(self):
while True:
try:
nextitem = super().__next__()
return nextitem
except ParseError as e:
if self.outstream:
if self.print_delegate and callable(self.print_delegate):
self.print_delegate(self.outstream, e, self)
self.skipped += 1
# no reason to stuff an entire bad file into memory
if self.skipped < 10:
self.skipped_lines.append((self.linenum, self.current_line, str(e)))
def iterwrapper(self):
# Generator which keeps track of the current line as an object attribute.
for self.current_line in self.input_wrapper:
yield self.current_line
class BitsetSafeReaderWrapper(NiceReaderWrapper):
def __init__(self, reader, lens=None):
# This class handles any ValueError, IndexError and OverflowError exceptions that may be thrown when
# the bitsets are being created by skipping the problem lines.
# The incoming lens dictionary is a dictionary of chromosome lengths
# which are used to initialize the bitsets.
# It is assumed that the reader is an interval reader, i.e. it has chr_col, start_col, end_col and strand_col attributes.
if lens is None:
lens = {}
NiceReaderWrapper.__init__(self, reader.input, chrom_col=reader.chrom_col, start_col=reader.start_col, end_col=reader.end_col, strand_col=reader.strand_col)
self.lens = lens
def __next__(self):
while True:
rval = super().__next__()
if isinstance(rval, GenomicInterval) and rval.end > self.lens.get(rval.chrom, MAX):
self.skipped += 1
# no reason to stuff an entire bad file into memory
if self.skipped < 10:
self.skipped_lines.append((self.linenum, self.current_line, "Error in BitsetSafeReaderWrapper"))
else:
return rval
bx-python-0.8.13/lib/bx/intervals/operations/ 0000775 0000000 0000000 00000000000 14156664651 0021122 5 ustar 00root root 0000000 0000000 bx-python-0.8.13/lib/bx/intervals/operations/__init__.py 0000664 0000000 0000000 00000001571 14156664651 0023237 0 ustar 00root root 0000000 0000000 """
High level operations on genomic intervals. Most accept and produce iterables
of `bx.intervals.io.GenomicInterval` objects.
"""
BED_DEFAULT_COLS = 0, 1, 2, 5
MAX_END = 512*1024*1024
def bits_set_in_range(bits, range_start, range_end):
"""
Yield start,end tuples for each span of set bits in [range_start,range_end)
"""
end = range_start
while True:
start = bits.next_set(end)
end = min(bits.next_clear(start), range_end)
if start >= end:
break
yield start, end
def bits_clear_in_range(bits, range_start, range_end):
"""
Yield start,end tuples for each span of clear bits in [range_start,range_end)
"""
end = range_start
while True:
start = bits.next_clear(end)
if start >= range_end:
break
end = min(bits.next_set(start), range_end)
yield start, end
bx-python-0.8.13/lib/bx/intervals/operations/base_coverage.py 0000664 0000000 0000000 00000001601 14156664651 0024257 0 ustar 00root root 0000000 0000000 """
Determine the number of bases covered by a set of intervals.
"""
from bx.intervals.io import BitsetSafeReaderWrapper
from bx.intervals.operations import MAX_END
def base_coverage(reader):
# Handle any ValueError, IndexError and OverflowError exceptions that may be thrown when
# the bitsets are being created by skipping the problem lines
base_reader = BitsetSafeReaderWrapper(reader, lens={})
bitsets = base_reader.binned_bitsets()
coverage = 0
for chrom in bitsets:
try:
coverage += bitsets[chrom].count_range(0, MAX_END)
except IndexError as e:
base_reader.skipped += 1
# no reason to stuff an entire bad file into memory
if base_reader.skipped < 10:
base_reader.skipped_lines.append((base_reader.linenum, base_reader.current_line, str(e)))
continue
return coverage
bx-python-0.8.13/lib/bx/intervals/operations/complement.py 0000664 0000000 0000000 00000004321 14156664651 0023637 0 ustar 00root root 0000000 0000000 """
Complement a set of intervals.
"""
from bx.bitset import MAX
from bx.intervals.io import (
BitsetSafeReaderWrapper,
GenomicInterval
)
from bx.intervals.operations import bits_set_in_range
def complement(reader, lens):
# Handle any ValueError, IndexError and OverflowError exceptions that may be thrown when
# the bitsets are being created by skipping the problem lines
complement_reader = BitsetSafeReaderWrapper(reader, lens=lens)
bitsets = complement_reader.binned_bitsets(upstream_pad=0, downstream_pad=0, lens=lens)
# NOT them all
for key, value in bitsets.items():
value.invert()
# Read remaining intervals and subtract
for chrom in bitsets:
bitset = bitsets[chrom]
out_intervals = bits_set_in_range(bitset, 0, lens.get(chrom, MAX))
try:
# Write the intervals
for start, end in out_intervals:
fields = ["." for x in range(max(complement_reader.chrom_col, complement_reader.start_col, complement_reader.end_col)+1)]
# default the column to a + if it exists
if complement_reader.strand_col < len(fields) and complement_reader.strand_col >= 0:
fields[complement_reader.strand_col] = "+"
fields[complement_reader.chrom_col] = chrom
fields[complement_reader.start_col] = start
fields[complement_reader.end_col] = end
new_interval = GenomicInterval(complement_reader, fields, complement_reader.chrom_col, complement_reader.start_col, complement_reader.end_col, complement_reader.strand_col, "+")
yield new_interval
except IndexError as e:
complement_reader.skipped += 1
# no reason to stuff an entire bad file into memory
if complement_reader.skipped < 10:
complement_reader.skipped_lines.append((complement_reader.linenum, complement_reader.current_line, str(e)))
continue
# def main():
# # test it all out
# f1 = fileinput.FileInput("dataset_7.dat")
# g1 = GenomicIntervalReader(f1)
# for interval in complement(g1,{"chr":16000000}):
# print "\t".join(interval)
#
# if __name__ == "__main__":
# main()
bx-python-0.8.13/lib/bx/intervals/operations/concat.py 0000664 0000000 0000000 00000005052 14156664651 0022745 0 ustar 00root root 0000000 0000000 """
Concatenate sets of intervals.
Preserves format of the first input -- it is possible to concat two files that
have different column orders. Of course, the meta-data of the second will be
lost (and filled with a "."). If all of the files (GenomicIntervalReaders) are
the same format, sameformat=True will preserve all columns of the first input,
cuts extra columns on subsequent input, and pads missing columns. If
sameformat=False then extra columns are filled with ".".
"""
from bx.intervals.io import GenomicInterval
from bx.tabular.io import (
Comment,
Header,
)
def concat(readers, comments=True, header=True, sameformat=True):
# Save columns from the first input
chrom_col = readers[0].chrom_col
start_col = readers[0].start_col
end_col = readers[0].end_col
strand_col = readers[0].strand_col
nfields = None
firstdataset = True
output = False
for intervals in readers:
for interval in intervals:
if isinstance(interval, GenomicInterval):
if not nfields:
nfields = interval.nfields
out_interval = interval.copy()
if sameformat or firstdataset:
# everything except the first input has to be
# trimmed or padded to match the first input
if len(out_interval.fields) > nfields:
out_interval.fields = out_interval.fields[0:nfields]
while len(out_interval.fields) < nfields:
out_interval.fields.append(".")
output = True
yield out_interval
else:
chrom = out_interval.chrom
start = out_interval.start
end = out_interval.end
strand = out_interval.strand
out_interval.fields = ["." for col in range(nfields)]
out_interval.fields[chrom_col] = chrom
out_interval.fields[start_col] = str(start)
out_interval.fields[end_col] = str(end)
# Strand is optional, might not exist in output
if strand_col < len(out_interval.fields):
out_interval.fields[strand_col] = strand
yield out_interval
elif isinstance(interval, Header) and header:
yield interval
elif isinstance(interval, Comment) and comments:
yield interval
if output and firstdataset:
firstdataset = False
bx-python-0.8.13/lib/bx/intervals/operations/coverage.py 0000664 0000000 0000000 00000005702 14156664651 0023273 0 ustar 00root root 0000000 0000000 """
Determine amount of each interval in one set covered by the intervals of
another set. Adds two columns to the first input, giving number of bases
covered and percent coverage on the second input.
"""
from bx.intervals.io import (
BitsetSafeReaderWrapper,
GenomicInterval,
)
from bx.tabular.io import (
Comment,
Header
)
def coverage(readers, comments=True):
# The incoming lens dictionary is a dictionary of chromosome lengths which are used to initialize the bitsets.
primary = readers[0]
intersect = readers[1:]
# Handle any ValueError, IndexError and OverflowError exceptions that may be thrown when
# the bitsets are being created by skipping the problem lines
intersect[0] = BitsetSafeReaderWrapper(intersect[0], lens={})
bitsets = intersect[0].binned_bitsets()
intersect = intersect[1:]
for andset in intersect:
bitset2 = andset.binned_bitsets()
for chrom in bitsets:
if chrom not in bitset2:
continue
bitsets[chrom].ior(bitset2[chrom])
intersect = intersect[1:]
# Read remaining intervals and give coverage
for interval in primary:
if isinstance(interval, Header):
yield interval
if isinstance(interval, Comment) and comments:
yield interval
elif isinstance(interval, GenomicInterval):
chrom = interval.chrom
start = int(interval.start)
end = int(interval.end)
if start > end:
try:
# This will only work if primary is a NiceReaderWrapper
primary.skipped += 1
# no reason to stuff an entire bad file into memory
if primary.skipped < 10:
primary.skipped_lines.append((primary.linenum, primary.current_line, "Interval start after end!"))
except Exception:
pass
continue
if chrom not in bitsets:
bases_covered = 0
percent = 0.0
else:
try:
bases_covered = bitsets[chrom].count_range(start, end-start)
except IndexError as e:
try:
# This will only work if primary is a NiceReaderWrapper
primary.skipped += 1
# no reason to stuff an entire bad file into memory
if primary.skipped < 10:
primary.skipped_lines.append((primary.linenum, primary.current_line, str(e)))
except Exception:
pass
continue
if (end - start) == 0:
percent = 0
else:
percent = float(bases_covered) / float(end - start)
interval.fields.append(str(bases_covered))
interval.fields.append(str(percent))
yield interval
bx-python-0.8.13/lib/bx/intervals/operations/find_clusters.py 0000664 0000000 0000000 00000012106 14156664651 0024340 0 ustar 00root root 0000000 0000000 """
Find clusters of intervals within a set of intervals. A cluster is a
group (of size minregions) of intervals within a specific distance (of
mincols) of each other.
Returns Cluster objects, which have a chrom, start, end, and lines (a
list of linenumbers from the original file). The original can then be
ran through with the linenumbers to extract clustered regions without
disturbing original order, or the clusters may themselves be written
as intervals.
"""
import math
import random
from bx.intervals.cluster import ClusterTree
from bx.intervals.io import GenomicInterval
def find_clusters(reader, mincols=1, minregions=2):
extra = dict()
chroms = dict()
linenum = -1
for interval in reader:
linenum += 1
if not isinstance(interval, GenomicInterval):
extra[linenum] = interval
else:
if interval.chrom not in chroms:
chroms[interval.chrom] = ClusterTree(mincols, minregions)
try:
chroms[interval.chrom].insert(interval.start, interval.end, linenum)
except OverflowError as e:
try:
# This will work only if reader is a NiceReaderWrapper
reader.skipped += 1
if reader.skipped < 10:
reader.skipped_lines.append((reader.linenum, reader.current_line, str(e)))
except Exception:
pass
continue
return chroms, extra
# DEPRECATED: Use the ClusterTree in bx.intervals.cluster for this.
# It does the same thing, but is a C implementation.
class ClusterNode:
def __init__(self, start, end, linenum, mincols, minregions):
# Python's random module lacks a geometric distribution, so we derive one
# from a uniform variate to use as the treap priority; like skip-list node
# heights, it scales naturally with tree size. random.uniform(0, 1) is built
# on random.random(), which never returns 1.0, so the log() below is always
# defined. The expression simplifies to ceil(-log2(1 - u)).
self.priority = math.ceil((-1.0 / math.log(.5)) * math.log(-1.0 / (random.uniform(0, 1) - 1)))
self.start = start
self.end = end
self.left = None
self.right = None
self.lines = [linenum]
self.mincols = mincols
self.minregions = minregions
def insert(self, start, end, linenum):
if start - self.mincols > self.end:
# insert to right tree
if self.right:
self.right = self.right.insert(start, end, linenum)
else:
self.right = ClusterNode(start, end, linenum, self.mincols, self.minregions)
# rebalance tree
if self.priority < self.right.priority:
return self.rotateleft()
elif end + self.mincols < self.start:
# insert to left tree
if self.left:
self.left = self.left.insert(start, end, linenum)
else:
self.left = ClusterNode(start, end, linenum, self.mincols, self.minregions)
# rebalance tree
if self.priority < self.left.priority:
return self.rotateright()
else:
# insert here
self.start = min(self.start, start)
self.end = max(self.end, end)
self.lines.append(linenum)
# recursive call to push nodes up
if self.left:
self.left = self.left.push_up(self)
if self.right:
self.right = self.right.push_up(self)
return self
def rotateright(self):
root = self.left
self.left = self.left.right
root.right = self
return root
def rotateleft(self):
root = self.right
self.right = self.right.left
root.left = self
return root
def push_up(self, topnode):
# Note: this function does not affect heap property
# Distance method removed for inline, faster?
distance = max(self.start, topnode.start) - min(self.end, topnode.end)
if distance <= self.mincols:
topnode.start = min(self.start, topnode.start)
topnode.end = max(self.end, topnode.end)
for linenum in self.lines:
topnode.lines.append(linenum)
if self.right:
return self.right.push_up(topnode)
if self.left:
return self.left.push_up(topnode)
return None
if self.end < topnode.start and self.right:
self.right = self.right.push_up(topnode)
if self.start > topnode.end and self.left:
self.left = self.left.push_up(topnode)
return self
def getintervals(self):
if self.left:
yield from self.left.getintervals()
if len(self.lines) >= self.minregions:
yield self.start, self.end
if self.right:
yield from self.right.getintervals()
def getlines(self):
if self.left:
yield from self.left.getlines()
if len(self.lines) >= self.minregions:
yield from self.lines
if self.right:
yield from self.right.getlines()
bx-python-0.8.13/lib/bx/intervals/operations/intersect.py 0000664 0000000 0000000 00000006655 14156664651 0023510 0 ustar 00root root 0000000 0000000 """
Compute the intersection of two sets of genomic intervals, either base-by-base
or at the interval level. The returned GenomicIntervalReader will be in
the order of the first set of intervals passed in, with the corresponding
additional fields.
"""
from bx.intervals.io import (
BitsetSafeReaderWrapper,
GenomicInterval
)
from bx.intervals.operations import bits_set_in_range
from bx.tabular.io import (
Comment,
Header,
)
def intersect(readers, mincols=1, upstream_pad=0, downstream_pad=0, pieces=True, lens={}, comments=True):
# The incoming lens dictionary is a dictionary of chromosome lengths which are used to initialize the bitsets.
# Read all but first into bitsets and intersect to one
primary = readers[0]
intersect = readers[1:]
# Handle any ValueError, IndexError and OverflowError exceptions that may be thrown when
# the bitsets are being created by skipping the problem lines
intersect[0] = BitsetSafeReaderWrapper(intersect[0], lens=lens)
bitsets = intersect[0].binned_bitsets(upstream_pad=upstream_pad, downstream_pad=downstream_pad, lens=lens)
intersect = intersect[1:]
for andset in intersect:
bitset2 = andset.binned_bitsets(upstream_pad=upstream_pad, downstream_pad=downstream_pad, lens=lens)
for chrom in bitsets:
if chrom not in bitset2:
continue
bitsets[chrom].iand(bitset2[chrom])
intersect = intersect[1:]
# Read remaining intervals and intersect
for interval in primary:
if isinstance(interval, Header):
yield interval
if isinstance(interval, Comment) and comments:
yield interval
elif isinstance(interval, GenomicInterval):
chrom = interval.chrom
start = int(interval.start)
end = int(interval.end)
if chrom not in bitsets:
continue
if start > end:
try:
# This will only work if primary is a NiceReaderWrapper
primary.skipped += 1
# no reason to stuff an entire bad file into memory
if primary.skipped < 10:
primary.skipped_lines.append((primary.linenum, primary.current_line, "Interval start after end!"))
except Exception:
pass
continue
out_intervals = []
# Intersect or Overlap
try:
if bitsets[chrom].count_range(start, end-start) >= mincols:
if pieces:
out_intervals = bits_set_in_range(bitsets[chrom], start, end)
else:
out_intervals = [(start, end)]
# Write the intervals
for start, end in out_intervals:
new_interval = interval.copy()
new_interval.start = start
new_interval.end = end
yield new_interval
except IndexError as e:
try:
# This will only work if primary is a NiceReaderWrapper
primary.skipped += 1
# no reason to stuff an entire bad file into memory
if primary.skipped < 10:
primary.skipped_lines.append((primary.linenum, primary.current_line, str(e)))
except Exception:
pass
continue
bx-python-0.8.13/lib/bx/intervals/operations/join.py 0000664 0000000 0000000 00000011161 14156664651 0022433 0 ustar 00root root 0000000 0000000 """
Join two sets of intervals using their overlap as the key. The
intervals MUST be sorted by chrom (lexicographically),
start (arithmetically) and end (arithmetically). The right set is read
into an interval tree, which is then queried for each left interval.
"""
import math
from bx.intervals.io import GenomicInterval
from .quicksect import IntervalTree
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True):
# Read rightSet into memory:
rightlen = 0
leftlen = 0
rightTree = IntervalTree()
for item in rightSet:
if isinstance(item, GenomicInterval):
rightTree.insert(item, rightSet.linenum, item.fields)
if rightlen == 0:
rightlen = item.nfields
for interval in leftSet:
if leftlen == 0 and isinstance(interval, GenomicInterval):
leftlen = interval.nfields
if not isinstance(interval, GenomicInterval):
yield interval
else:
result = []
rightTree.intersect(interval, lambda node: result.append(node))
overlap_not_met = 0
for item in result:
if item.start in range(interval.start, interval.end+1) and item.end not in range(interval.start, interval.end+1):
overlap = interval.end-item.start
elif item.end in range(interval.start, interval.end+1) and item.start not in range(interval.start, interval.end+1):
overlap = item.end-interval.start
elif item.start in range(interval.start, interval.end+1) and item.end in range(interval.start, interval.end+1):
overlap = item.end-item.start
else: # the intersecting item's start and end are outside the interval range
overlap = interval.end-interval.start
if overlap < mincols:
overlap_not_met += 1
continue
outfields = list(interval)
outfields.extend(item.other)
setattr(item, "visited", True)
yield outfields
if (len(result) == 0 or overlap_not_met == len(result)) and rightfill:
outfields = list(interval)
for x in range(rightlen):
outfields.append(".")
yield outfields
if leftfill:
def report_unvisited(node, results):
if not hasattr(node, "visited"):
results.append(node)
results = []
rightTree.traverse(lambda x: report_unvisited(x, results))
for item in results:
outfields = list()
for x in range(leftlen):
outfields.append(".")
outfields.extend(item.other)
yield outfields
def interval_cmp(a, b):
interval1 = a[0]
interval2 = b[0]
if not (isinstance(interval1, GenomicInterval) and isinstance(interval2, GenomicInterval)):
return 0
# Both are intervals
if interval1.chrom == interval2.chrom:
center1 = interval1.start + ((interval1.end - interval1.start) / 2)
center2 = interval2.start + ((interval2.end - interval2.start) / 2)
return center1 - center2
else:
if interval1.chrom > interval2.chrom:
return 1
else:
return -1
return 0
def findintersect(interval, sortedlist, mincols):
# find range of intervals that intersect via a binary search
# find lower bound
x = len(sortedlist) // 2  # integer division: x is used as a list index
n = int(math.pow(2, math.ceil(math.log(len(sortedlist), 2))))
not_found = True
not_done = True
while not_found and not_done:
n = n // 2
if n == 0:
n = 1
not_done = False
if x >= len(sortedlist):
x -= n
elif x < 0:
x += n
else:
if findoverlap(sortedlist[x][0], interval) >= mincols:
not_found = False
else:
comp = interval_cmp(sortedlist[x], [interval, 0])
if comp > 0:
x -= n
else:
x += n
print("\t".join(sortedlist[x][0].fields))
print("not_found = " + str(not_found))
if not_found:
return 0, -1
lowerbound = x
upperbound = x
while (lowerbound > -1) and (findoverlap(sortedlist[lowerbound-1][0], interval) >= mincols):
lowerbound -= 1
while (upperbound+1 < len(sortedlist)) and (findoverlap(sortedlist[upperbound+1][0], interval) >= mincols):
upperbound += 1
return lowerbound, upperbound
def findoverlap(a, b):
# overlapping
if a.chrom == b.chrom:
return min(a.end, b.end) - max(a.start, b.start)
else:
return 0
bx-python-0.8.13/lib/bx/intervals/operations/merge.py 0000664 0000000 0000000 00000002666 14156664651 0022605 0 ustar 00root root 0000000 0000000 """
Merge overlapping regions in two sets of genomic intervals.
"""
from bx.intervals.io import BitsetSafeReaderWrapper
from bx.intervals.operations import (
bits_set_in_range,
MAX_END
)
# sorting could make this a less memory intensive operation(?)
def merge(interval, mincols=1):
# Handle any ValueError, IndexError and OverflowError exceptions that may be thrown when
# the bitsets are being created by skipping the problem lines
interval = BitsetSafeReaderWrapper(interval, lens={})
bitsets = interval.binned_bitsets()
if interval.header:
yield interval.header
for chrom in bitsets:
bitset = bitsets[chrom]
output = ["."] * (max(interval.chrom_col, interval.start_col, interval.end_col) + 1)
output[interval.chrom_col] = chrom
try:
for start, end in bits_set_in_range(bitset, 0, MAX_END):
output[interval.start_col] = str(start)
output[interval.end_col] = str(end)
yield output
except IndexError as e:
try:
# This will work only if interval is a NiceReaderWrapper
interval.skipped += 1
# no reason to stuff an entire bad file into memory
if interval.skipped < 10:
interval.skipped_lines.append((interval.linenum, interval.current_line, str(e)))
except Exception:
pass
continue
bx-python-0.8.13/lib/bx/intervals/operations/quicksect.py 0000664 0000000 0000000 00000013506 14156664651 0023474 0 ustar 00root root 0000000 0000000 """
Intersects ... faster. Supports the GenomicInterval datatype and multiple
chromosomes.
"""
import math
import random
try:
from time import process_time
except ImportError:
# For compatibility with Python < 3.3
from time import clock as process_time
class IntervalTree:
def __init__(self):
self.chroms = {}
def insert(self, interval, linenum=0, other=None):
chrom = interval.chrom
start = interval.start
end = interval.end
if interval.chrom in self.chroms:
self.chroms[chrom] = self.chroms[chrom].insert(start, end, linenum, other)
else:
self.chroms[chrom] = IntervalNode(start, end, linenum, other)
def intersect(self, interval, report_func):
chrom = interval.chrom
start = interval.start
end = interval.end
if chrom in self.chroms:
self.chroms[chrom].intersect(start, end, report_func)
def traverse(self, func):
for item in self.chroms.values():
item.traverse(func)
class IntervalNode:
def __init__(self, start, end, linenum=0, other=None):
# Python's random module lacks a geometric distribution, so we derive one
# from a uniform variate to use as the treap priority; like skip-list node
# heights, it scales naturally with tree size. random.uniform(0, 1) is built
# on random.random(), which never returns 1.0, so the log() below is always
# defined. The expression simplifies to ceil(-log2(1 - u)).
self.priority = math.ceil((-1.0 / math.log(.5)) * math.log(-1.0 / (random.uniform(0, 1) - 1)))
self.start = start
self.end = end
self.maxend = self.end
self.minend = self.end
self.left = None
self.right = None
self.linenum = linenum
self.other = other
def insert(self, start, end, linenum=0, other=None):
root = self
if start > self.start:
# insert to right tree
if self.right:
self.right = self.right.insert(start, end, linenum, other)
else:
self.right = IntervalNode(start, end, linenum, other)
# rebalance tree
if self.priority < self.right.priority:
root = self.rotateleft()
else:
# insert to left tree
if self.left:
self.left = self.left.insert(start, end, linenum, other)
else:
self.left = IntervalNode(start, end, linenum, other)
# rebalance tree
if self.priority < self.left.priority:
root = self.rotateright()
if root.right and root.left:
root.maxend = max(root.end, root.right.maxend, root.left.maxend)
root.minend = min(root.end, root.right.minend, root.left.minend)
elif root.right:
root.maxend = max(root.end, root.right.maxend)
root.minend = min(root.end, root.right.minend)
elif root.left:
root.maxend = max(root.end, root.left.maxend)
root.minend = min(root.end, root.left.minend)
return root
def rotateright(self):
root = self.left
self.left = self.left.right
root.right = self
if self.right and self.left:
self.maxend = max(self.end, self.right.maxend, self.left.maxend)
self.minend = min(self.end, self.right.minend, self.left.minend)
elif self.right:
self.maxend = max(self.end, self.right.maxend)
self.minend = min(self.end, self.right.minend)
elif self.left:
self.maxend = max(self.end, self.left.maxend)
self.minend = min(self.end, self.left.minend)
return root
def rotateleft(self):
root = self.right
self.right = self.right.left
root.left = self
if self.right and self.left:
self.maxend = max(self.end, self.right.maxend, self.left.maxend)
self.minend = min(self.end, self.right.minend, self.left.minend)
elif self.right:
self.maxend = max(self.end, self.right.maxend)
self.minend = min(self.end, self.right.minend)
elif self.left:
self.maxend = max(self.end, self.left.maxend)
self.minend = min(self.end, self.left.minend)
return root
def intersect(self, start, end, report_func):
if start < self.end and end > self.start:
report_func(self)
if self.left and start < self.left.maxend:
self.left.intersect(start, end, report_func)
if self.right and end > self.start:
self.right.intersect(start, end, report_func)
def traverse(self, func):
if self.left:
self.left.traverse(func)
func(self)
if self.right:
self.right.traverse(func)
def main():
test = None
intlist = []
for _ in range(20000):
start = random.randint(0, 1000000)
end = start + random.randint(1, 1000)
if test:
test = test.insert(start, end)
else:
test = IntervalNode(start, end)
intlist.append((start, end))
starttime = process_time()
for _ in range(5000):
start = random.randint(0, 10000000)
end = start + random.randint(1, 1000)
result = []
test.intersect(start, end, lambda x: result.append(x.linenum))
print("%f for tree method" % (process_time() - starttime))
starttime = process_time()
for _ in range(5000):
start = random.randint(0, 10000000)
end = start + random.randint(1, 1000)
bad_sect(intlist, start, end)
print("%f for linear (bad) method" % (process_time() - starttime))
def test_func(node):
print("[%d, %d), %d" % (node.start, node.end, node.maxend))
def bad_sect(lst, int_start, int_end):
intersection = []
for start, end in lst:
if int_start < end and int_end > start:
intersection.append((start, end))
return intersection
if __name__ == "__main__":
main()
bx-python-0.8.13/lib/bx/intervals/operations/subtract.py 0000664 0000000 0000000 00000006422 14156664651 0023327 0 ustar 00root root 0000000 0000000 #!/usr/bin/env python
"""
Subtract one set of genomic intervals from another (base-by-base or whole
intervals). The returned GenomicIntervals will be in the order
of the first set of intervals passed in, with the corresponding
meta-data.
"""
from warnings import warn
from bx.intervals.io import (
BitsetSafeReaderWrapper,
GenomicInterval
)
from bx.intervals.operations import bits_clear_in_range
from bx.tabular.io import (
Comment,
Header,
)
def subtract(readers, mincols=1, upstream_pad=0, downstream_pad=0, pieces=True, lens={}, comments=True):
# The incoming lens dictionary is a dictionary of chromosome lengths which are used to initialize the bitsets.
# Read all but first into bitsets and union to one (if confused, read DeMorgan's...)
primary = readers[0]
union = readers[1:]
# Handle any ValueError, IndexError and OverflowError exceptions that may be thrown when
# the bitsets are being created by skipping the problem lines
union[0] = BitsetSafeReaderWrapper(union[0], lens=lens)
bitsets = union[0].binned_bitsets(upstream_pad=upstream_pad, downstream_pad=downstream_pad, lens=lens)
union = union[1:]
for andset in union:
bitset2 = andset.binned_bitsets(upstream_pad=upstream_pad, downstream_pad=downstream_pad, lens=lens)
for chrom in bitset2:
if chrom not in bitsets:
bitsets[chrom] = bitset2[chrom]
else:
bitsets[chrom].ior(bitset2[chrom])
# Read remaining intervals and subtract
for interval in primary:
if isinstance(interval, Header):
yield interval
if isinstance(interval, Comment) and comments:
yield interval
elif isinstance(interval, GenomicInterval):
chrom = interval.chrom
if chrom not in bitsets:
yield interval
else:
start = int(interval.start)
end = int(interval.end)
if start > end:
warn("Interval start after end!")
out_intervals = []
# Find the intervals that meet the criteria (for the three sensible
# permutations of reverse and pieces)
try:
if bitsets[chrom].count_range(start, end-start) >= mincols:
if pieces:
out_intervals = bits_clear_in_range(bitsets[chrom], start, end)
else:
out_intervals = [(start, end)]
# Write the intervals
for start, end in out_intervals:
new_interval = interval.copy()
new_interval.start = start
new_interval.end = end
yield new_interval
except IndexError as e:
try:
# This will work only if primary is a NiceReaderWrapper
primary.skipped += 1
# no reason to stuff an entire bad file into memory
if primary.skipped < 10:
primary.skipped_lines.append((primary.linenum, primary.current_line, str(e)))
except Exception:
pass
continue
bx-python-0.8.13/lib/bx/intervals/random_intervals.py 0000664 0000000 0000000 00000022260 14156664651 0022662 0 ustar 00root root 0000000 0000000 """
Classes for generating random sets of intervals over larger regions.
"""
import bisect
from bx.bitset import BitSet
random = __import__('random')
class MaxtriesException(Exception):
pass
def throw_random_list(lengths, mask, allow_overlap=False):
rval = []
throw_random_gap_list(lengths, mask, lambda s, e: rval.append((s, e)), allow_overlap)
assert sum(b - a for a, b in rval) == sum(lengths)
return rval
def throw_random_bits(lengths, mask, allow_overlap=False):
rval = BitSet(mask.size)
throw_random_gap_list(lengths, mask, lambda s, e: rval.set_range(s, e - s), allow_overlap)
if not allow_overlap:
assert rval.count_range(0, rval.size) == sum(lengths)
return rval
def throw_random_gap_list(lengths, mask, save_interval_func, allow_overlap=False):
"""
Generates a set of non-overlapping random intervals from a length
distribution.
`lengths`: list containing the length of each interval to be generated.
We expect this to be sorted by decreasing length to minimize
the chance of failure (MaxtriesException) and for some
performance gains when allow_overlap==True and there are
duplicate lengths
`mask`: a BitSet in which set bits represent regions not to place
intervals. The size of the region is also determined from the
mask.
"""
# Use mask to find the gaps; gaps is a list of (length,start,end)
lengths = [length for length in lengths if length > 0]
min_length = min(lengths)
gaps = []
start = end = 0
while True:
start = mask.next_clear(end)
if start == mask.size:
break
end = mask.next_set(start)
if end-start >= min_length:
gaps.append((end-start, start, None))
# Sort (long regions first)
gaps.sort()
gaps.reverse()
# Throw
throw_random_private(lengths, gaps, save_interval_func, allow_overlap, three_args=False)
def throw_random_intervals(lengths, regions, save_interval_func=None, allow_overlap=False):
"""
Generates a set of non-overlapping random intervals from a length
distribution.
`lengths`: list containing the length of each interval to be generated.
We expect this to be sorted by decreasing length to minimize
the chance of failure (MaxtriesException) and for some
performance gains when allow_overlap==True and there are
duplicate lengths.
`regions`: A list of regions in which intervals can be placed. Elements
are tuples or lists of the form (start, end, ...), where ...
indicates any number of items (including zero).
`save_interval_func`: A function accepting three arguments which will be
passed the (start,stop,region) for each generated
interval, where region is an entry in the regions
list. If this is None, the generated intervals will
be returned as a list of elements copied from the
region with start and end modified.
"""
# Copy regions
regions = sorted((x[1]-x[0], x[0], x) for x in regions)
# Sort (long regions first)
regions.reverse()
# Throw
if (save_interval_func is not None):
throw_random_private(lengths, regions, save_interval_func, allow_overlap)
return
else:
intervals = []
def save_interval_func(s, e, rgn):
return intervals.append(overwrite_start_end(s, e, rgn))
throw_random_private(lengths, regions, save_interval_func, allow_overlap)
return intervals
def overwrite_start_end(s, e, rgn):
rgn = list(rgn)
rgn[0] = s
rgn[1] = e
return tuple(rgn)
def throw_random_private(lengths, regions, save_interval_func, allow_overlap=False, three_args=True):
"""
(Internal function; we expect calls only through the interface functions
above)
`lengths`: A list containing the length of each interval to be generated.
`regions`: A list of regions in which intervals can be placed, sorted by
decreasing length. Elements are triples of the form (length,
start, extra), This list CAN BE MODIFIED by this function.
`save_interval_func`: A function accepting three arguments which will be
passed the (start,stop,extra) for each generated
interval.
"""
# Implementation:
# We keep a list of the regions, sorted from largest to smallest. We then
# place each length by following steps:
# (1) construct a candidate counts array (cc array)
# (2) choose a candidate at random
# (3) find region containing that candidate
# (4) map candidate to position in that region
# (5) split region if not allowing overlaps
# (6) report placed segment
#
# The cc array is only constructed if there's a change (different length
# to place, or the region list has changed). It contains, for each
# region, the total number of number of candidate positions in regions
# *preceding* it in the region list:
#      cc[i] = sum over k in 0..(i-1) of (length[k] - L + 1)
# where L is the length being thrown.
# At the same time, we determine the total number of candidates (the total
# number of places the current length can be placed) and the index range
# of regions into which the length will fit.
#
# example:
# for L = 20
# i = 0 1 2 3 4 5 6 7 8 9
# length[i] = 96 66 56 50 48 40 29 17 11 8
# cc[i] = 0 77 124 161 192 221 242 X X X
# candidates = 252
# lo_rgn = 0
# hi_rgn = 6
#
# The candidate is chosen in (0..candidates-1). The candidate counts
# array allows us to do a binary search to locate the region that holds that
# candidate. Continuing the example above, we choose a random candidate
# s in (0..251). If s happens to be in (124..160), it will be mapped to
# region 2 at start position s-124.
#
# During the binary search, if we are looking at region 3, if s < cc[3]
# then the desired region is region 2 or lower. Otherwise it is region 3 or
# higher.
min_length = min(lengths)
prev_length = None # (force initial cc array construction)
cc = [0] * (len(regions) + len(lengths) - 1)
num_thrown = 0
for length in lengths:
# construct cc array (only needed if length has changed or region list has
# changed)
if length != prev_length:
prev_length = length
assert len(cc) >= len(regions)
candidates = 0
hi_rgn = 0
for region in regions:
rgn_len = region[0]
if rgn_len < length:
break
cc[hi_rgn] = candidates
candidates += rgn_len - length + 1
hi_rgn += 1
if candidates == 0:
raise MaxtriesException(
"No region can fit an interval of length %d (we threw %d of %d)"
% (length, num_thrown, len(lengths)))
hi_rgn -= 1
# Select a candidate
s = random.randrange(candidates)
# ..
# ..for ix in range( len( regions ) ):
# .. region = regions[ix]
# .. if ix <= hi_rgn: print "%2s: %5s %5s %5s" % ( ix, region[1], region[0], cc[ix] )
# .. else: print "%2s: %5s %5s %5s" % ( ix, region[1], region[0], "X" )
# ..print "s = %s (of %s candidates)" % ( s, candidates )
# Locate region containing that candidate, by binary search
lo = 0
hi = hi_rgn
while hi > lo:
mid = (lo + hi + 1) // 2  # (we round up to prevent infinite loop)
if s < cc[mid]:
hi = mid-1 # (s < num candidates from 0..mid-1)
else:
lo = mid # (s >= num candidates from 0..mid-1)
s -= cc[lo]
# If we are not allowing overlaps we will remove the placed interval
# from the region list
if allow_overlap:
rgn_length, rgn_start, rgn_extra = regions[lo]
else:
# Remove the chosen region and split
rgn_length, rgn_start, rgn_extra = regions.pop(lo)
rgn_end = rgn_start + rgn_length
assert s >= 0
assert rgn_start + s + length <= rgn_end, "Expected: %d + %d + %d == %d <= %d" % (rgn_start, s, length, rgn_start + s + length, rgn_end)
regions.reverse()
if s >= min_length:
bisect.insort(regions, (s, rgn_start, rgn_extra))
if s + length <= rgn_length - min_length:
bisect.insort(regions, (rgn_length - (s + length), rgn_start + s + length, rgn_extra))
regions.reverse()
prev_length = None # (force cc array construction)
# Save the new interval
if (three_args):
save_interval_func(rgn_start + s, rgn_start + s + length, rgn_extra)
else:
save_interval_func(rgn_start + s, rgn_start + s + length)
num_thrown += 1
bx-python-0.8.13/lib/bx/intseq/ 0000775 0000000 0000000 00000000000 14156664651 0016233 5 ustar 00root root 0000000 0000000 bx-python-0.8.13/lib/bx/intseq/__init__.py 0000664 0000000 0000000 00000000114 14156664651 0020340 0 ustar 00root root 0000000 0000000 """
Tools for working with strings over integer alphabets efficiently.
"""
bx-python-0.8.13/lib/bx/intseq/ngramcount.pyx 0000664 0000000 0000000 00000007347 14156664651 0021165 0 ustar 00root root 0000000 0000000 """
Tools for counting words (n-grams) in integer sequences.
"""
import numpy
cdef extern from "Python.h":
ctypedef int Py_intptr_t
# cdef extern from "numpy/npy_3kcompat.h":
# NOTE: including npy_3kcompat.h did not compile,
# so use the explicitly extracted function from here:
cdef extern from "npy_capsule_as_void_ptr.h":
void * NpyCapsule_AsVoidPtr(object) except NULL
# for PyArrayInterface:
CONTIGUOUS=0x01
FORTRAN=0x02
ALIGNED=0x100
NOTSWAPPED=0x200
WRITEABLE=0x400
ctypedef struct PyArrayInterface:
int two # contains the integer 2 as a sanity check
int nd # number of dimensions
char typekind # kind in array --- character code of typestr
int itemsize # size of each element
int flags # flags indicating how the data should be interpreted
Py_intptr_t *shape # A length-nd array of shape information
Py_intptr_t *strides # A length-nd array of stride information
void *data # A pointer to the first element of the array
def count_ngrams( object ints, int n, int radix ):
"""
    Count the number of occurrences of each possible word of length `n` in
    `ints` (which contains values from 0 up to, but not including, `radix`).
    Returns an array of length `radix` ** `n` containing the counts.
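
    Example (an illustrative sketch, not a doctest): the word (w0, w1) is
    counted in slot w0 + w1 * radix, the first letter being the least
    significant digit of the index:

        import numpy
        ints = numpy.array([0, 1, 2, 1], dtype=numpy.int32)
        counts = count_ngrams(ints, 2, 3)
        # counts[1 + 2 * 3] == 1, the one occurrence of the word (1, 2)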
"""
cdef PyArrayInterface * ints_desc
cdef PyArrayInterface * rval_desc
# Get array interface for input string and validate
ints_desc_obj = ints.__array_struct__
ints_desc = NpyCapsule_AsVoidPtr( ints_desc_obj )
assert ints_desc.two == 2, "Array interface sanity check failed, got %d" % ints_desc.two
assert ints_desc.nd == 1, "Input array must be 1d"
assert ints_desc.typekind == 'i'[0], "Input array must contain integers"
assert ints_desc.itemsize == 4, "Input array must contain 32bit integers"
assert ints_desc.flags & CONTIGUOUS > 0, "Input array must be contiguous"
assert ints_desc.flags & ALIGNED > 0, "Input array must be aligned"
assert ints_desc.flags & NOTSWAPPED > 0, "Input array must not be byteswapped"
# Create numpy array for return value, get array interface and validate
    rval = numpy.zeros( radix ** n, dtype=numpy.int32 )
    rval_desc_obj = rval.__array_struct__
    rval_desc = NpyCapsule_AsVoidPtr( rval_desc_obj )
    assert rval_desc.two == 2, "Array interface sanity check failed, got %d" % rval_desc.two
    assert rval_desc.nd == 1, "Count array must be 1d"
    assert rval_desc.typekind == 'i'[0], "Count array must contain integers"
    assert rval_desc.itemsize == 4, "Count array must contain 32bit integers"
    assert rval_desc.flags & CONTIGUOUS > 0, "Count array must be contiguous"
    assert rval_desc.flags & ALIGNED > 0, "Count array must be aligned"
    assert rval_desc.flags & NOTSWAPPED > 0, "Count array must not be byteswapped"
# Do it
_count_ngrams( ints_desc.data, ints_desc.shape[0], rval_desc.data, n, radix )
return rval
cdef _count_ngrams( int* ints, int n_ints, int* rval, int n, int radix ):
cdef int i, j, index, factor, letter
# Loop over each word in the string
    for i from 0 <= i < ( n_ints - n + 1 ):  # last word starts at n_ints - n
# Walk back to build index into count array
index = 0
factor = 1
for j from 0 <= j < n:
letter = ints[ i + j ]
if letter < 0 or letter >= radix:
# This word is bad, break out and do not increment counts
print "breaking, letter", letter
break
index = index + letter * factor
factor = factor * radix
        else:
            rval[ index ] = rval[ index ] + 1
bx-python-0.8.13/lib/bx/misc/ 0000775 0000000 0000000 00000000000 14156664651 0015663 5 ustar 00root root 0000000 0000000 bx-python-0.8.13/lib/bx/misc/__init__.py 0000664 0000000 0000000 00000000451 14156664651 0017774 0 ustar 00root root 0000000 0000000 """
Various utilities.
"""
import bz2
import gzip
def open_compressed(filename, mode='r'):
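    """
    Open `filename`, transparently decompressing it based on its extension:
    ".bz2" and ".gz" names are wrapped in `bz2.BZ2File` and `gzip.GzipFile`
    respectively; anything else is opened with plain `open`. Note that with
    the default mode 'r' the compressed wrappers read bytes, while plain
    `open` reads text.

    Usage sketch (hypothetical filename and handler):

        with open_compressed("scores.txt.gz") as f:
            for line in f:
                process(line)
    """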
if filename.endswith(".bz2"):
return bz2.BZ2File(filename, mode)
elif filename.endswith(".gz"):
return gzip.GzipFile(filename, mode)
else:
return open(filename, mode)
bx-python-0.8.13/lib/bx/misc/_seekbzip2.pyx 0000664 0000000 0000000 00000016034 14156664651 0020466 0 ustar 00root root 0000000 0000000 """
Pyrex/C extension supporting `bx.misc.seekbzip2` (wrapping the low level
functions in `micro-bunzip.c`).
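
Typical use (an illustrative sketch; `bx.misc.seekbzip2` is the intended
consumer): construct a `SeekBzip2` on a ".bz2" file, `seek()` to a saved
*bit* position marking the start of a compressed block, then read forward:

    sb = SeekBzip2("reads.txt.bz2")  # hypothetical file
    sb.seek(saved_bit_position)      # bit offset of a block boundary
    data = sb.read(8192)             # up to 8192 decompressed bytes
    sb.close()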
"""
cdef extern from "Python.h":
char * PyBytes_AsString( object )
object PyBytes_FromStringAndSize( char *, Py_ssize_t )
cdef extern from "micro-bunzip.h":
ctypedef struct bunzip_data:
int in_fd
int inbufBitCount
int inbufPos
int inbufCount
int writeCount
unsigned int writeCRC
int writeCurrent
int writeCopies
unsigned int * dbuf
unsigned int get_bits(bunzip_data *bd, char bits_wanted)
int get_next_block( bunzip_data *bd )
int read_bunzip(bunzip_data *bd, char *outbuf, int len)
int start_bunzip(bunzip_data **bdp, int in_fd, char *inbuf, int len)
int read_bunzip_to_char(bunzip_data *bd, char *outbuf, int len, int* gotcount_out, char stopchar )
cdef extern from "unistd.h":
    # Not really the full unistd.h declaration; just wide enough for our use
ctypedef unsigned long long off_t
off_t lseek( int fildes, off_t offset, int whence )
cdef extern from "stdlib.h":
void free( void *ptr )
import sys
import os
cdef class SeekBzip2:
cdef bunzip_data * bd
cdef int file_fd
cdef int at_eof
def __init__( self, filename ):
self.at_eof = 0
self.file_fd = os.open( filename, os.O_RDONLY )
# Initialize bunzip_data from the file
start_bunzip( &( self.bd ), self.file_fd, NULL, 0 )
def close( self ):
free( self.bd.dbuf )
free( self.bd )
os.close( self.file_fd )
def seek( self, unsigned long long position ):
"""
        Seek the bunzip_data to a specific position (which must correspond
        to the start of a compressed data block).
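
        Positions are measured in bits from the start of the file: for
        example (illustrative numbers), position 1234 maps to byte offset
        1234 // 8 = 154 and bit offset 1234 % 8 = 2.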
"""
cdef off_t n_byte
cdef int n_bit
# Break position into bit and byte offsets
## sys.stderr.write( "arg pos: %d\n" % position )
        n_byte = position // 8
        n_bit = position % 8
## sys.stderr.write( "byte pos: %d\n" % n_byte )
## sys.stderr.write( "bit pos: %d\n" % n_bit )
## sys.stderr.flush()
# Seek the underlying file descriptor
if ( lseek( self.file_fd, n_byte, 0 ) != n_byte ):
raise Exception( "lseek of underlying file failed" )
# Init the buffer at the right bit position
self.bd.inbufBitCount = self.bd.inbufPos = self.bd.inbufCount = 0
get_bits( self.bd, n_bit )
# This ensures that the next read call will return 0, causing the
# buffer to be re-initialized
self.bd.writeCount = -1
# Reset EOF tracking
self.at_eof = 0
def readline( self, int amount ):
cdef object rval
cdef char * p_rval
cdef int gotcount
cdef int totalcount
cdef int status
cdef int spaceleft
cdef int desired
gotcount = 0
totalcount = 0
# If already at EOF return None
if self.at_eof:
return None
chunks = []
# We have great difficulty resizing buffers, so we'll just create
# one 8k string at a time
rval = PyBytes_FromStringAndSize( NULL, 8192 )
p_rval = PyBytes_AsString( rval )
spaceleft = 8192
while amount != 0:
if amount > 0 and amount < spaceleft:
desired = amount
else:
desired = spaceleft
## sys.stderr.write( "readline, amount: %d\n" % amount )
## sys.stderr.write( "buffer: %r" % rval[:100] )
## sys.stderr.write( "\n" )
## sys.stderr.flush()
# ord( "\n" ) = 10
status = read_bunzip_to_char( self.bd, p_rval, desired, &gotcount, 10 );
## sys.stderr.write( "readline, desired: %d, gotcount: %d\n" % ( desired, gotcount ) );
## sys.stderr.write( "buffer: %r" % rval[:100] )
## sys.stderr.write( "\n" )
## sys.stderr.flush()
if status == -9:
## sys.stderr.write( "readline, STOP_CHAR\n" ); sys.stderr.flush()
# Reached the stop character (RETVAL_STOPCHAR == -9), so
# we can stop
chunks.append( rval[:8192-spaceleft+gotcount] )
break
elif status == -10:
## sys.stderr.write( "readline, BUFFER_FULL\n" ); sys.stderr.flush()
# Filled the buffer (RETVAL_BUFFER_FULL == -10), so create
# new buffer and keep going
chunks.append( rval )
amount = amount - gotcount
if amount == 0:
# Got the desired amount
break
rval = PyBytes_FromStringAndSize( NULL, 8192 )
p_rval = PyBytes_AsString( rval )
spaceleft = 8192
elif status == -8:
## sys.stderr.write( "readline, END_OF_BLOCK\n" ); sys.stderr.flush()
                # No more data in the decomp buffer (RETVAL_END_OF_BLOCK == -8)
if gotcount and p_rval[ gotcount - 1 ] == 10:
chunks.append( rval[:8192-spaceleft+gotcount] )
break
# Update buffer info
p_rval = p_rval + gotcount
spaceleft = spaceleft - gotcount
amount = amount - gotcount
# Get the next block
status = get_next_block( self.bd )
if status == -1:
# Block is end of stream block (RETVAL_LAST_BLOCK == -1)
self.at_eof = 1
chunks.append( rval[:gotcount] )
break
self.bd.writeCRC = 0xffffffff
self.bd.writeCopies = 0
else:
# Some other status
raise Exception( "read_bunzip error %d" % status )
# Return whatever we read
return "".join( chunks )
def read( self, int amount ):
cdef object rval
cdef char * p_rval
cdef int gotcount
cdef int totalcount
cdef int status
totalcount = 0
# If already at EOF return None
if self.at_eof:
return None
# Create a new python bytes string large enough to hold the result
rval = PyBytes_FromStringAndSize( NULL, amount )
p_rval = PyBytes_AsString( rval )
# Read into it
## sys.stderr.write( "read called, bd.current: %x\n" % self.bd.writeCurrent ); sys.stderr.flush()
while amount > 0:
gotcount = read_bunzip( self.bd, p_rval, amount );
if gotcount < 0:
raise Exception( "read_bunzip error %d" % gotcount )
elif gotcount == 0:
status = get_next_block( self.bd )
if status == -1:
self.at_eof = 1
break
self.bd.writeCRC = 0xffffffff
self.bd.writeCopies = 0
else:
totalcount = totalcount + gotcount
amount = amount - gotcount
p_rval = p_rval + gotcount
# Return whatever we read
return rval[:totalcount]
bx-python-0.8.13/lib/bx/misc/bgzf.pyx 0000664 0000000 0000000 00000002666 14156664651 0017367 0 ustar 00root root 0000000 0000000 """
Seekable access to BGZ files based on samtools code. Does not yet implement
complete file-like interface.
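
Usage sketch (hypothetical path; `tell()` returns an opaque BGZF virtual
file offset that can be handed back to `seek()` later):

    f = BGZFFile("example.bam", "r")
    data = f.read(16)    # decompressed bytes
    pos = f.tell()
    f.seek(pos)          # return to the same decompressed position
    f.close()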
"""
from cpython.version cimport PY_MAJOR_VERSION
ctypedef long long int64_t  # matches the signed C definition of int64_t
cdef extern from "Python.h":
char * PyBytes_AsString( object )
object PyBytes_FromStringAndSize( char *, Py_ssize_t )
cdef extern from "bgzf.h":
ctypedef struct BGZF
BGZF * bgzf_open( const char * path, const char * mode )
int bgzf_close( BGZF * fp )
int bgzf_read( BGZF * fp, void * data, int length )
int64_t bgzf_tell( BGZF * fp )
int64_t bgzf_seek( BGZF * fp, int64_t pos, int where )
cdef class BGZFFile( object ):
cdef BGZF * bgzf
def __init__( self, path, mode="r" ):
if PY_MAJOR_VERSION >= 3:
bytes_path, bytes_mode = path.encode(), mode.encode()
else:
bytes_path, bytes_mode = path, mode
self.bgzf = bgzf_open( bytes_path, bytes_mode )
if not self.bgzf:
raise IOError( "Could not open file" )
def close( self ):
if self.bgzf:
bgzf_close( self.bgzf )
def read( self, int length ):
cdef object rval
rval = PyBytes_FromStringAndSize( NULL, length )
bgzf_read( self.bgzf, PyBytes_AsString( rval ), length )
return rval
def tell( self ):
return bgzf_tell( self.bgzf )
def seek( self, int64_t pos, int where=0 ):
return bgzf_seek( self.bgzf, pos, where )
bx-python-0.8.13/lib/bx/misc/bgzf_tests.py 0000664 0000000 0000000 00000000312 14156664651 0020403 0 ustar 00root root 0000000 0000000 import bx.misc.bgzf
def test_bgzf():
f = bx.misc.bgzf.BGZFFile("test_data/bgzf_tests/test.txt.gz", "r")
assert f.read(10) == b"begin 644 "
f.seek(0)
assert f.read(10) == b"begin 644 "
bx-python-0.8.13/lib/bx/misc/binary_file.py 0000664 0000000 0000000 00000011625 14156664651 0020525 0 ustar 00root root 0000000 0000000 """
Wrappers for doing binary IO on file-like objects
"""
import struct
import sys
import numpy
def bytesify(s):
if isinstance(s, bytes):
return s
else:
return s.encode()
# Standard size:
# short is 16 bits
# int and long are 32 bits
# long long is 64 bits
class BadMagicNumber(IOError):
pass
class BinaryFileReader:
"""
Wrapper for doing binary reads on any file like object.
Currently this is not heavily optimized (it uses the `struct` module to
unpack)
"""
def __init__(self, file, magic=None, is_little_endian=False):
self.is_little_endian = is_little_endian
self.file = file
if magic is not None:
            # Attempt to read the magic number and check endianness
bytes = file.read(4)
if struct.unpack(">I", bytes)[0] == magic:
pass
elif struct.unpack("I", bytes)[0], struct.unpack("