pax_global_header00006660000000000000000000000064145356073540014526gustar00rootroot0000000000000052 comment=4061790cf4fa09749eb3e822da4528de24ab8904 dnaio-1.2.0/000077500000000000000000000000001453560735400126205ustar00rootroot00000000000000dnaio-1.2.0/.codecov.yml000066400000000000000000000002461453560735400150450ustar00rootroot00000000000000comment: off codecov: require_ci_to_pass: no coverage: precision: 1 round: down range: "90...100" status: project: yes patch: no changes: no dnaio-1.2.0/.editorconfig000066400000000000000000000001451453560735400152750ustar00rootroot00000000000000[*.{py,pyx}] charset=utf-8 end_of_line=lf insert_final_newline=true indent_style=space indent_size=4 dnaio-1.2.0/.gitattributes000066400000000000000000000000501453560735400155060ustar00rootroot00000000000000*.fastq -crlf *.fasta -crlf *.sam -crlf dnaio-1.2.0/.github/000077500000000000000000000000001453560735400141605ustar00rootroot00000000000000dnaio-1.2.0/.github/workflows/000077500000000000000000000000001453560735400162155ustar00rootroot00000000000000dnaio-1.2.0/.github/workflows/ci.yml000066400000000000000000000066601453560735400173430ustar00rootroot00000000000000name: CI on: [push, pull_request] jobs: lint: # Run for PRs only if they come from a forked repo (avoids duplicate runs) if: >- github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name timeout-minutes: 10 runs-on: ubuntu-latest strategy: matrix: python-version: ["3.10"] toxenv: [flake8, black, mypy, docs] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install tox run: python -m pip install tox - name: Run tox ${{ matrix.toxenv }} run: tox -e ${{ matrix.toxenv }} build: if: >- github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 with: fetch-depth: 0 # required for setuptools_scm - name: Build sdist and temporary wheel run: pipx run build - uses: actions/upload-artifact@v3 with: name: sdist path: dist/*.tar.gz test: if: >- github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name timeout-minutes: 10 runs-on: ${{ matrix.os }} strategy: matrix: os: [ubuntu-latest] python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] compile_flags: [""] include: - os: macos-latest python-version: "3.10" - os: windows-latest python-version: "3.10" - os: ubuntu-latest python-version: "3.10" compile_flags: "-mssse3" steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install tox run: python -m pip install tox - name: Test run: tox -e py env: CFLAGS: ${{ matrix.compile_flags }} - name: Upload coverage report uses: codecov/codecov-action@v3 wheels: if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') needs: [lint, test] timeout-minutes: 15 strategy: matrix: os: [ubuntu-latest, windows-latest, macos-latest] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v3 with: fetch-depth: 0 # required for setuptools_scm - name: Build wheels uses: pypa/cibuildwheel@v2.16.2 env: CIBW_BUILD: "cp*-manylinux_x86_64 cp3*-win_amd64 cp3*-macosx_x86_64" CIBW_SKIP: "cp37-*" - uses: actions/upload-artifact@v3 with: name: wheels path: 
wheelhouse/*.whl publish: if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') needs: [build, wheels] runs-on: ubuntu-latest steps: - uses: actions/download-artifact@v3 with: name: sdist path: dist/ - uses: actions/download-artifact@v3 with: name: wheels path: dist/ - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@v1.5.1 with: password: ${{ secrets.pypi_password }} #password: ${{ secrets.test_pypi_password }} #repository_url: https://test.pypi.org/legacy/ dnaio-1.2.0/.gitignore000066400000000000000000000002141453560735400146050ustar00rootroot00000000000000__pycache__ /.cache/ /venv/ /build/ /.pytest_cache/ /MANIFEST /dist/ /src/*/_*.c /src/*/*.so /src/*.egg-info/ /.tox/ /src/dnaio/_version.py dnaio-1.2.0/.pre-commit-config.yaml000066400000000000000000000003551453560735400171040ustar00rootroot00000000000000repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v2.3.0 hooks: - id: end-of-file-fixer - id: trailing-whitespace - repo: https://github.com/psf/black rev: 22.3.0 hooks: - id: black dnaio-1.2.0/.readthedocs.yaml000066400000000000000000000002311453560735400160430ustar00rootroot00000000000000version: 2 build: os: "ubuntu-22.04" tools: python: "3.11" python: install: - requirements: doc/requirements.txt - method: pip path: . dnaio-1.2.0/CHANGES.rst000066400000000000000000000062051453560735400144250ustar00rootroot00000000000000========= Changelog ========= v1.2.0 (2023-12-11) ------------------- * :pr:`124`: Added support for chunking FASTA reads to ``read_paired_chunks``. Previously, only FASTQ was supported. v1.1.0 (2023-11-20) ------------------- * :pr:`116`: Added experimental support for reading unaligned BAM files (single-end only at the moment). This uses a custom, highly efficient BAM parser, making dnaio faster than htslib in this particular case. v1.0.1 (2023-10-06) ------------------- * :pr:`120`: Improved type annotations. * Dropped support for Python 3.7 * Added support for Python 3.12 v1.0.0 (2023-09-06) ------------------- * :pr:`110`: Added ``id`` and ``comment`` properties to ``SequenceRecord``. v0.10.0 (2022-12-05) -------------------- * :pr:`99`: SequenceRecord initialization is now faster, which also provides a speed boost to FASTQ iteration. ``SequenceRecord.__new__`` cannot be used anymore to initialize `SequenceRecord` objects. * :pr:`96`: ``open_threads`` and ``compression_level`` are now added to `~dnaio.open` as arguments. By default dnaio now uses compression level 1 and does not utilize external programs to speed up gzip (de)compression. * :pr:`87`: `~dnaio.open` can now open more than two files. The ``file1`` and ``file2`` arguments are now deprecated. v0.9.1 (2022-08-01) ------------------- * :pr:`85`: macOS wheels are now also built as part of the release procedure. * :pr:`81`: API documentation improvements and minor code refactors for readability. v0.9.0 (2022-05-17) ------------------- * :pr:`79`: Added a `~dnaio.records_are_mates` function to be used for checking whether three or more records are mates of each other (by checking the ID). * :pr:`74`, :pr:`68`: Made FASTQ parsing faster by implementing the check for ASCII using SSE vector instructions. * :pr:`72`: Added a `tutorial `_. v0.8.0 (2022-03-26) ------------------- * Preliminary documentation is available at . * :pr:`53`: Renamed ``Sequence`` to `~dnaio.SequenceRecord`. The previous name is still available as an alias so that existing code will continue to work. * When reading a FASTQ file, there is now a check that ensures that all characters are ASCII. 
* Function ``record_names_match`` is deprecated, use `~dnaio.SequenceRecord.is_mate` instead. * Added `~dnaio.SequenceRecord.reverse_complement`. * Dropped Python 3.6 support as it is end-of-life. v0.7.1 (2022-01-26) ------------------- * :pr:`34`: Fix parsing of FASTA files that just contain a comment and no reads v0.7.0 (2022-01-17) ------------------- * @rhpvorderman contributed many performance improvements in :pr:`15`, :pr:`17`, :pr:`18`, :pr:`20`, :pr:`21`, :pr:`22`, :pr:`23`. Reading and writing FASTQ files and reading of paired-end FASTQ files was sped up significantly. For example, reading uncompressed FASTQ is 50% faster (!) than before. * :pr:`28`: Windows support added v0.6.0 (2021-09-28) ------------------- * :pr:`12`: Improve FASTQ writing speed twofold (thanks to @rhpvorderman) v0.5.2 (2021-09-07) ------------------- * :issue:`7`: Ignore a trailing "3" in the read id dnaio-1.2.0/LICENSE000066400000000000000000000020771453560735400136330ustar00rootroot00000000000000Copyright (c) 2010 Marcel Martin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. dnaio-1.2.0/README.rst000066400000000000000000000042721453560735400143140ustar00rootroot00000000000000.. image:: https://github.com/marcelm/dnaio/workflows/CI/badge.svg :alt: GitHub Actions badge .. image:: https://img.shields.io/pypi/v/dnaio.svg?branch=main :target: https://pypi.python.org/pypi/dnaio :alt: PyPI badge .. image:: https://codecov.io/gh/marcelm/dnaio/branch/master/graph/badge.svg :target: https://codecov.io/gh/marcelm/dnaio :alt: Codecov badge ===================================== dnaio processes FASTQ and FASTA files ===================================== ``dnaio`` is a Python 3.8+ library for very efficient parsing and writing of FASTQ and also FASTA files. The code was previously part of the `Cutadapt `_ tool and has been improved significantly since it has been split out. Example usage ============= The main interface is the `dnaio.open `_ function:: import dnaio with dnaio.open("reads.fastq.gz") as f: bp = 0 for record in f: bp += len(record) print(f"The input file contains {bp/1E6:.1f} Mbp") For more, see the `tutorial `_ and `API documentation `_. Installation ============ Using pip:: pip install dnaio zstandard ``zstandard`` can be omitted if support for Zstandard (``.zst``) files is not required. 
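
With ``zstandard`` installed, Zstandard-compressed files are handled just like
gzip-compressed ones. A minimal sketch (``reads.fastq.zst`` is a placeholder
file name)::

    import dnaio

    with dnaio.open("reads.fastq.zst") as f:
        n = sum(1 for _ in f)
    print(f"{n} records")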
Features and supported file types ================================= - FASTQ input and output - FASTA input and output - BAM input - Compressed input and output (``.gz``, ``.bz2``, ``.xz`` and ``.zst`` are detected automatically) - Paired-end data in two files - Interleaved paired-end data in a single file - Files with DOS/Windows linebreaks can be read - FASTQ files with a second header line (after the ``+``) are supported Limitations =========== - Multi-line FASTQ files are not supported - FASTQ and BAM parsing is the focus of this library. The FASTA parser is not as optimized Links ===== * `Documentation `_ * `Source code `_ * `Report an issue `_ * `Project page on PyPI `_ dnaio-1.2.0/doc/000077500000000000000000000000001453560735400133655ustar00rootroot00000000000000dnaio-1.2.0/doc/Makefile000066400000000000000000000011721453560735400150260ustar00rootroot00000000000000# Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = build .PHONY: help Makefile # Put it first so that "make" without argument is like "make html". html: @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) dnaio-1.2.0/doc/api.rst000066400000000000000000000042411453560735400146710ustar00rootroot00000000000000The dnaio API ============= .. module:: dnaio The open function ----------------- .. autofunction:: open The ``SequenceRecord`` class ------------------------------ .. autoclass:: dnaio.SequenceRecord :members: :special-members: __len__, __getitem__ .. automethod:: __init__(name: str, sequence: str, qualities: Optional[str] = None) Reader and writer interfaces ---------------------------- .. autoclass:: SingleEndReader :members: __iter__ .. autoclass:: PairedEndReader :members: __iter__ .. autoclass:: SingleEndWriter :members: write .. autoclass:: PairedEndWriter :members: write .. autoclass:: MultipleFileWriter :members: write, write_iterable Reader and writer classes ------------------------- The `dnaio.open` function returns an instance of one of the following classes. They can also be used directly if needed. .. autoclass:: FastaReader :show-inheritance: .. autoclass:: FastaWriter :show-inheritance: .. autoclass:: FastqReader :show-inheritance: .. autoclass:: FastqWriter :show-inheritance: .. autoclass:: BamReader :show-inheritance: .. autoclass:: TwoFilePairedEndReader :show-inheritance: .. autoclass:: TwoFilePairedEndWriter :show-inheritance: .. autoclass:: InterleavedPairedEndReader :show-inheritance: .. autoclass:: InterleavedPairedEndWriter :show-inheritance: .. autoclass:: MultipleFileReader :members: __iter__ .. autoclass:: MultipleFastaWriter :show-inheritance: .. autoclass:: MultipleFastqWriter :show-inheritance: Chunked reading of sequence records ----------------------------------- The following functions can be used to very quickly split up the input file(s) into similarly-sized chunks without actually parsing the records. The chunks can then be distributed to worker threads or subprocesses and be parsed and processed there. .. autofunction:: read_chunks .. autofunction:: read_paired_chunks Functions --------- .. autofunction:: records_are_mates Exceptions ---------- .. autoexception:: UnknownFileFormat .. 
autoexception:: FileFormatError .. autoexception:: FastaFormatError :show-inheritance: .. autoexception:: FastqFormatError :show-inheritance: dnaio-1.2.0/doc/changes.rst000066400000000000000000000000521453560735400155240ustar00rootroot00000000000000.. _changes: .. include:: ../CHANGES.rst dnaio-1.2.0/doc/conf.py000066400000000000000000000021261453560735400146650ustar00rootroot00000000000000# Sphinx configuration file import os import sys import time sys.path.insert(0, os.path.abspath("..")) project = "dnaio" copyright = f"{time.gmtime().tm_year} dnaio authors" author = "Marcel Martin" extensions = [ "sphinx.ext.autodoc", "sphinx.ext.napoleon", "sphinx_issues", ] templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = [] html_theme = "furo" html_show_sphinx = False html_title = "dnaio" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". # html_static_path = ["_static"] default_role = "obj" # (or "any") issues_uri = "https://github.com/marcelm/dnaio/issues/{issue}" issues_pr_uri = "https://github.com/marcelm/dnaio/pull/{pr}" autodoc_typehints = "description" python_use_unqualified_type_names = True dnaio-1.2.0/doc/index.rst000066400000000000000000000002341453560735400152250ustar00rootroot00000000000000dnaio ===== .. include:: ../README.rst .. toctree:: :maxdepth: 2 /tutorial /api /changes Source code dnaio-1.2.0/doc/requirements.txt000066400000000000000000000000231453560735400166440ustar00rootroot00000000000000furo sphinx_issues dnaio-1.2.0/doc/tutorial.rst000066400000000000000000000106541453560735400157700ustar00rootroot00000000000000Tutorial ======== This should get you started with using ``dnaio``. The only essential concepts to know about are the `dnaio.open` function and the `~dnaio.SequenceRecord` object. Reading ------- The main interface for reading and writing sequence files is the `dnaio.open` function. For example, this program reads in a FASTQ file and computes the total number of nucleotides it contains:: import dnaio with dnaio.open("reads.fastq.gz") as reader: bp = 0 for record in reader: bp += len(record) print(f"The input file contains {bp/1E6:.1f} Mbp") As can be seen from the ``.gz`` file extension, the input file is gzip-compressed. `dnaio.open` detects and handles this automatically by opening the file with `xopen `_. Here, the call to `dnaio.open` returns a `~dnaio.FastqReader` object. Iterating over it in the ``for`` loop results in `~dnaio.SequenceRecord` objects. Calling ``len()`` on a ``SequenceRecord`` returns the number of nucleotides in the record. A ``SequenceRecord`` has the attributes ``name``, ``sequence`` and ``qualities``. All of these are ``str`` objects. The ``qualities`` attribute is ``None`` when reading FASTA files. 
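
Each record also has an ``id`` property (the part of the name before the first
whitespace) and a ``comment`` property (everything after that whitespace, or
``None`` if there is nothing after it). A small illustration using a
hand-built record::

    import dnaio

    record = dnaio.SequenceRecord("read1 sample=A", "ACGT", "!!#B")
    print(record.id)         # read1
    print(record.comment)    # sample=A
    print(record.qualities)  # !!#B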
The following program uses the ``name`` attribute to check whether any
sequence names are duplicated in a FASTA file::

    import dnaio

    seen = set()
    with dnaio.open("sequences.fasta") as reader:
        for record in reader:
            if record.name in seen:
                print(record.name, "is duplicated")
            seen.add(record.name)


Writing
-------

To open a sequence file for writing, pass the ``mode="w"`` argument to
``dnaio.open``::

    import dnaio

    with dnaio.open("onerecord.fastq.gz", mode="w") as writer:
        writer.write(dnaio.SequenceRecord("name", "ACGT", "#B!#"))

Here, a `~dnaio.FastqWriter` object is returned by ``dnaio.open``, which has a
`~dnaio.FastqWriter.write()` method that accepts a ``SequenceRecord``.

A possibly more common use case is to read an input file, modify the reads and
write them to a new output file. The following example program shows how that
can be done. It truncates all reads in the input file to a length of 30 nt and
writes them to another file::

    import dnaio

    with dnaio.open("in.fastq.gz") as reader, dnaio.open("out.fastq.gz", mode="w") as writer:
        for record in reader:
            record = record[:30]
            writer.write(record)

This also shows that `~dnaio.SequenceRecord` objects support slicing:
``record[:30]`` returns a new ``SequenceRecord`` object with the sequence and
qualities trimmed to the first 30 characters, leaving the name unchanged.


Paired-end data
---------------

Paired-end data is supported in two forms: Two separate files or interleaved.

To read from separate files, provide two input file names to the
``dnaio.open`` function::

    import dnaio

    with dnaio.open("reads.1.fastq.gz", "reads.2.fastq.gz") as reader:
        bp = 0
        for r1, r2 in reader:
            bp += len(r1) + len(r2)
    print(f"The paired-end input contains {bp/1E6:.1f} Mbp")

Here, ``dnaio.open`` returns a `~dnaio.TwoFilePairedEndReader`. It also
supports iteration, but instead of a plain ``SequenceRecord``, it returns a
tuple of two ``SequenceRecord`` instances.

To read from interleaved paired-end data, pass ``interleaved=True`` to
``dnaio.open`` instead of a second file name::

    ...
    with dnaio.open("reads.interleaved.fastq.gz", interleaved=True) as reader:
        ...

The ``PairedEndReader`` classes check whether the input files are properly
paired, that is, whether they have the same number of reads in both inputs and
whether the read names match. For this reason, always use a single call to
``dnaio.open`` to open paired-end files (that is, avoid opening them as two
single-end files.)

To demonstrate how to write paired-end data, we show a program that reads from
a single-end FASTQ file and converts the records to simulated paired-end reads
by writing the first 30 nt to R1 and the last 30 nt to R2::

    import dnaio

    with dnaio.open("in.fastq.gz") as reader, \
            dnaio.open("out.1.fastq.gz", "out.2.fastq.gz", mode="w") as writer:
        for record in reader:
            r1 = record[:30]
            r2 = record[-30:]
            writer.write(r1, r2)

The ``writer`` in this case is a `~dnaio.TwoFilePairedEndWriter` and its
`~dnaio.TwoFilePairedEndWriter.write()` method expects two ``SequenceRecord``
arguments.
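
Interleaved paired-end output works analogously: pass ``interleaved=True``
together with ``mode="w"`` and a single output file, in which case the writer
is a `~dnaio.InterleavedPairedEndWriter`. The sketch below mirrors the
previous example (file names are placeholders)::

    import dnaio

    with dnaio.open("in.fastq.gz") as reader, \
            dnaio.open("out.interleaved.fastq.gz", interleaved=True, mode="w") as writer:
        for record in reader:
            writer.write(record[:30], record[-30:])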
dnaio-1.2.0/helpers/000077500000000000000000000000001453560735400142625ustar00rootroot00000000000000dnaio-1.2.0/helpers/generate_conversion_tables.py000066400000000000000000000034211453560735400222250ustar00rootroot00000000000000import io def nucleotide_complements_table(): # A nice list of complements can be found at: # http://www.reverse-complement.com/ambiguity.html complements = dict( A="T", C="G", G="C", T="A", a="t", c="g", g="c", t="a", U="A", u="a", # R, purine (A, G) vs Y, pyrimidine (C, T) R="Y", Y="R", r="y", y="r", # K, keto (G, T) vs A, amino (A, C) K="M", M="K", k="m", m="k", # B, not A, vs V, not T B="V", V="B", b="v", v="b", # D, not C vs H, not G D="H", H="D", d="h", h="d", # S, W and N's complements are the same. So they are not explicitly # included above ) table = [] for i in range(256): c = chr(i) if c in complements: table.append(f"'{complements[c]}'") else: table.append(i) return table def make_table(variable_name, table, columns=16): out = io.StringIO() out.write(variable_name + " = {") for i, literal in enumerate(table): if i % columns == 0: out.write("\n ") out.write(f"{literal:3}, ") out.write("\n") out.write("};\n") return out.getvalue() def main(): with open("src/dnaio/_conversions.h", "wt", encoding="utf-8") as out: out.write( "// This file is generated by generate_conversion_tables.py\n" "// Please do not edit manually.\n\n" ) out.write( make_table( "static const char NUCLEOTIDE_COMPLEMENTS[256]", nucleotide_complements_table(), ) ) if __name__ == "__main__": main() dnaio-1.2.0/pyproject.toml000066400000000000000000000037341453560735400155430ustar00rootroot00000000000000[build-system] requires = ["setuptools >= 52", "setuptools_scm >= 6.2", "Cython >= 0.29.20"] build-backend = "setuptools.build_meta" [project] name = "dnaio" authors = [ {name = "Marcel Martin", email = "marcel.martin@scilifelab.se"}, {name = "Ruben Vorderman", email = "r.h.p.vorderman@lumc.nl"} ] description = "Read and write FASTA and FASTQ files efficiently" readme = "README.rst" license = {text = "MIT"} classifiers = [ "Development Status :: 5 - Production/Stable", "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", "Programming Language :: Cython", "Programming Language :: Python :: 3", "Topic :: Scientific/Engineering :: Bio-Informatics" ] requires-python = ">3.7" dependencies = [ "xopen >= 1.4.0" ] dynamic = ["version"] [project.optional-dependencies] dev = [ "Cython", "pytest" ] [project.urls] "Homepage" = "https://dnaio.readthedocs.io/" "Changelog" = "https://dnaio.readthedocs.io/en/latest/changes.html" "Repository" = "https://github.com/marcelm/dnaio/" [tool.setuptools.exclude-package-data] dnaio = ["*.pyx"] [tool.setuptools_scm] write_to = "src/dnaio/_version.py" [tool.pytest.ini_options] testpaths = ["tests"] [tool.cibuildwheel.windows.environment] CFLAGS = "-g0 -DNDEBUG" [tool.cibuildwheel.macos.environment] CFLAGS = "-g0 -DNDEBUG" [tool.cibuildwheel.linux.environment] CFLAGS = "-g0 -DNDEBUG -mssse3" [tool.cibuildwheel] test-requires = "pytest" test-command = ["cd {project}", "pytest tests"] [[tool.cibuildwheel.overrides]] select = "*-win*" test-command = ["cd /d {project}", "pytest tests"] [tool.mypy] warn_unused_configs = true warn_redundant_casts = true warn_unused_ignores = true [tool.coverage.report] precision = 1 exclude_also = [ "def __repr__", "@overload", "if TYPE_CHECKING:", ] [tool.coverage.run] branch = true parallel = true include = [ "*/site-packages/dnaio/*", "tests/*", ] [tool.coverage.paths] source = [ "src/", "*/site-packages/", ] 
dnaio-1.2.0/setup.py000066400000000000000000000007171453560735400143370ustar00rootroot00000000000000import platform from setuptools import setup, Extension import setuptools_scm # noqa Ensure it’s installed if platform.machine() == "AMD64": # Macro is defined by default for clang and GCC on relevant targets, but # not by MSVC. DEFINE_MACROS = [("__SSE2__", 1)] else: DEFINE_MACROS = [] setup( ext_modules=[ Extension( "dnaio._core", sources=["src/dnaio/_core.pyx"], define_macros=DEFINE_MACROS ), ], ) dnaio-1.2.0/src/000077500000000000000000000000001453560735400134075ustar00rootroot00000000000000dnaio-1.2.0/src/dnaio/000077500000000000000000000000001453560735400145015ustar00rootroot00000000000000dnaio-1.2.0/src/dnaio/__init__.py000066400000000000000000000215301453560735400166130ustar00rootroot00000000000000""" Sequence I/O: Read and write FASTA and FASTQ files efficiently """ __all__ = [ "open", "BamReader", "SequenceRecord", "SingleEndReader", "PairedEndReader", "SingleEndWriter", "PairedEndWriter", "FastaReader", "FastaWriter", "FastqReader", "FastqWriter", "UnknownFileFormat", "FileFormatError", "FastaFormatError", "FastqFormatError", "InterleavedPairedEndReader", "InterleavedPairedEndWriter", "TwoFilePairedEndReader", "TwoFilePairedEndWriter", "MultipleFileReader", "MultipleFastaWriter", "MultipleFastqWriter", "read_chunks", "read_paired_chunks", "records_are_mates", "__version__", ] import functools from os import PathLike from typing import Optional, Union, BinaryIO, Literal, overload from xopen import xopen from ._core import ( SequenceRecord, ) from ._core import record_names_match # noqa: F401 # deprecated from ._core import records_are_mates from .readers import BamReader, FastaReader, FastqReader from .writers import FastaWriter, FastqWriter from .singleend import _open_single from .pairedend import ( _open_paired, TwoFilePairedEndReader, TwoFilePairedEndWriter, InterleavedPairedEndReader, InterleavedPairedEndWriter, ) from .multipleend import ( MultipleFastaWriter, MultipleFastqWriter, MultipleFileReader, MultipleFileWriter, _open_multiple, ) from .exceptions import ( UnknownFileFormat, FileFormatError, FastaFormatError, FastqFormatError, ) from .interfaces import ( SingleEndReader, PairedEndReader, SingleEndWriter, PairedEndWriter, ) from .chunks import read_chunks, read_paired_chunks from ._version import version as __version__ # Backwards compatibility alias Sequence = SequenceRecord _FileOrPath = Union[str, PathLike, BinaryIO] @overload def open( _file: _FileOrPath, *, fileformat: Optional[str] = ..., interleaved: Literal[False] = ..., mode: Literal["r"] = ..., qualities: Optional[bool] = ..., opener=..., compression_level: int = ..., open_threads: int = ..., ) -> SingleEndReader: ... @overload def open( _file1: _FileOrPath, _file2: _FileOrPath, *, fileformat: Optional[str] = ..., interleaved: Literal[False] = ..., mode: Literal["r"] = ..., qualities: Optional[bool] = ..., opener=..., compression_level: int = ..., open_threads: int = ..., ) -> PairedEndReader: ... @overload def open( _file: _FileOrPath, *, interleaved: Literal[True], fileformat: Optional[str] = ..., mode: Literal["r"] = ..., qualities: Optional[bool] = ..., opener=..., compression_level: int = ..., open_threads: int = ..., ) -> PairedEndReader: ... 
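# The remaining overloads: three or more positional files in read mode give a
# MultipleFileReader; the mode="w"/"a" variants below mirror the reader
# signatures and return the corresponding writer types.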
@overload def open( _file1: _FileOrPath, _file2: _FileOrPath, _file3: _FileOrPath, *files: _FileOrPath, fileformat: Optional[str] = ..., mode: Literal["r"] = ..., qualities: Optional[bool] = ..., opener=..., compression_level: int = ..., open_threads: int = ..., ) -> MultipleFileReader: ... @overload def open( _file: _FileOrPath, *, mode: Literal["w", "a"], fileformat: Optional[str] = ..., interleaved: Literal[False] = ..., qualities: Optional[bool] = ..., opener=..., compression_level: int = ..., open_threads: int = ..., ) -> SingleEndWriter: ... @overload def open( _file1: _FileOrPath, _file2: _FileOrPath, *, mode: Literal["w", "a"], fileformat: Optional[str] = ..., interleaved: Literal[False] = ..., qualities: Optional[bool] = ..., opener=..., compression_level: int = ..., open_threads: int = ..., ) -> PairedEndWriter: ... @overload def open( _file: _FileOrPath, *, mode: Literal["w", "a"], interleaved: Literal[True], fileformat: Optional[str] = ..., qualities: Optional[bool] = ..., opener=..., compression_level: int = ..., open_threads: int = ..., ) -> PairedEndWriter: ... @overload def open( _file1: _FileOrPath, _file2: _FileOrPath, _file3: _FileOrPath, *files: _FileOrPath, mode: Literal["w", "a"], fileformat: Optional[str] = ..., interleaved: Literal[False] = ..., qualities: Optional[bool] = ..., opener=..., compression_level: int = ..., open_threads: int = ..., ) -> MultipleFileWriter: ... def open( *files: _FileOrPath, file1: Optional[_FileOrPath] = None, file2: Optional[_FileOrPath] = None, fileformat: Optional[str] = None, interleaved: bool = False, mode: str = "r", qualities: Optional[bool] = None, opener=xopen, compression_level: int = 1, open_threads: int = 0, **_kwargs, # TODO Can we get rid of this? Only here to satisfy type checker ) -> Union[ SingleEndReader, PairedEndReader, SingleEndWriter, PairedEndWriter, MultipleFileReader, MultipleFileWriter, ]: """ Open one or more FASTQ or FASTA files for reading or writing, or open one (unaligned) BAM file for reading. Parameters: files: one or more Path or open file-like objects. One for single-end reads, two for paired-end reads etc. More than two files are also supported. At least one file is required. file1: Deprecated keyword argument for the first file. file2: Deprecated keyword argument for the second file. mode: Set to ``'r'`` for reading, ``'w'`` for writing or ``'a'`` for appending. For BAM files, only reading is supported. interleaved: If True, then there must be only one file argument that contains interleaved paired-end data. fileformat: If *None*, the file format is autodetected from the file name extension. Set to ``'fasta'``, ``'fastq'`` or ``'bam'`` to not auto-detect. qualities: When mode is ``'w'`` and fileformat is *None*, this can be set to *True* or *False* to specify whether the written sequences will have quality values. This is used in two ways: - If the output format cannot be determined (unrecognized extension etc.), no exception is raised, but FASTA or FASTQ format is chosen appropriately. - When False (no qualities available), an exception is raised when the auto-detected output format is FASTQ. opener: A function that is used to open the files if they are not already open file-like objects. By default, ``xopen`` is used, which can also open compressed file formats. open_threads: By default, dnaio opens files in the main thread. When threads is greater than 0, external processes are opened for compressing and decompressing files. This decreases wall clock time at the cost of a little extra overhead. 
This parameter does not work when a custom opener is set. compression_level: By default dnaio uses compression level 1 for writing gzipped files as this is the fastest. A higher level can be set using this parameter. This parameter does not work when a custom opener is set. """ if files and (file1 is not None) and (file2 is not None): raise ValueError( "file1 and file2 arguments cannot be used together with files specified " "as positional arguments" ) elif files and (file1 is not None): raise ValueError( "The file1 keyword argument cannot be used together with files specified " "as positional arguments" ) elif len(files) > 1 and file2 is not None: raise ValueError( "The file2 argument cannot be used together with more than one " "file specified as positional argument" ) elif file1 is not None and file2 is not None: files = (file1, file2) elif file1 is not None: files = (file1,) elif len(files) == 1 and file2 is not None: files = (files[0], file2) del file1 del file2 if len(files) > 1 and interleaved: raise ValueError("When interleaved is True, only one file must be specified.") elif mode not in ("r", "w", "a"): raise ValueError("Mode must be 'r', 'w' or 'a'") if opener == xopen: opener = functools.partial( xopen, threads=open_threads, compresslevel=compression_level ) if interleaved or len(files) == 2: return _open_paired( *files, opener=opener, fileformat=fileformat, mode=mode, qualities=qualities, ) elif len(files) > 2: return _open_multiple( *files, fileformat=fileformat, mode=mode, qualities=qualities, opener=opener ) else: return _open_single( files[0], opener=opener, fileformat=fileformat, mode=mode, qualities=qualities, ) dnaio-1.2.0/src/dnaio/_conversions.h000066400000000000000000000027411453560735400173650ustar00rootroot00000000000000// This file is generated by generate_conversion_tables.py // Please do not edit manually. static const char NUCLEOTIDE_COMPLEMENTS[256] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 'T', 'V', 'G', 'H', 69, 70, 'C', 'D', 73, 74, 'M', 76, 'K', 78, 79, 80, 81, 'Y', 83, 'A', 'A', 'B', 87, 88, 'R', 90, 91, 92, 93, 94, 95, 96, 't', 'v', 'g', 'h', 101, 102, 'c', 'd', 105, 106, 'm', 108, 'k', 110, 111, 112, 113, 'y', 115, 'a', 'a', 'b', 119, 120, 'r', 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, }; dnaio-1.2.0/src/dnaio/_core.pyi000066400000000000000000000041111453560735400163100ustar00rootroot00000000000000from typing import ( Generic, Optional, Tuple, BinaryIO, Iterator, Type, TypeVar, ByteString, ) class SequenceRecord: name: str sequence: str qualities: Optional[str] def __init__( self, name: str, sequence: str, qualities: Optional[str] = ... ) -> None: ... def __getitem__(self, s: slice) -> SequenceRecord: ... 
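    # Slicing (record[start:stop]) returns a new SequenceRecord with sequence
    # and qualities sliced accordingly; the name is kept unchanged.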
def __repr__(self) -> str: ... def __len__(self) -> int: ... def __richcmp__(self, other: SequenceRecord, op: int) -> bool: ... def qualities_as_bytes(self) -> bytes: ... def fastq_bytes(self, two_headers: bool = ...) -> bytes: ... def is_mate(self, other: SequenceRecord) -> bool: ... def reverse_complement(self) -> SequenceRecord: ... @property def id(self) -> str: ... @property def comment(self) -> Optional[str]: ... # Bytestring = Union[bytes, bytearray, memoryview]. Technically just 'bytes' is # acceptable as an alias, but even more technically this function supports all # types that implement the buffer protocol, for which there is no type yet. # See: https://github.com/python/typing/issues/593 def paired_fastq_heads( buf1: ByteString, buf2: ByteString, end1: int, end2: int ) -> Tuple[int, int]: ... def records_are_mates( __first_record: SequenceRecord, __second_record: SequenceRecord, *__other_records: SequenceRecord, ) -> bool: ... T = TypeVar("T") class FastqIter(Generic[T]): def __init__( self, file: BinaryIO, sequence_class: Type[T], buffer_size: int = ... ): ... def __iter__(self) -> Iterator[T]: ... def __next__(self) -> T: ... @property def number_of_records(self) -> int: ... class BamIter: def __init__(self, file: BinaryIO, buffer_size: int): ... def __iter__(self) -> Iterator[SequenceRecord]: ... def __next__(self) -> SequenceRecord: ... @property def header(self) -> bytes: ... @property def number_of_records(self) -> int: ... # Deprecated def record_names_match(header1: str, header2: str) -> bool: ... # Private def bytes_ascii_check(b: bytes, length: int = -1) -> bool: ... dnaio-1.2.0/src/dnaio/_core.pyx000066400000000000000000001167531453560735400163470ustar00rootroot00000000000000# cython: language_level=3, emit_code_comments=False from cpython.buffer cimport PyBUF_SIMPLE, PyObject_GetBuffer, PyBuffer_Release from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_AS_STRING, PyBytes_GET_SIZE, PyBytes_CheckExact from cpython.mem cimport PyMem_Free, PyMem_Malloc, PyMem_Realloc from cpython.unicode cimport PyUnicode_CheckExact, PyUnicode_GET_LENGTH, PyUnicode_DecodeASCII from cpython.object cimport Py_TYPE, PyTypeObject from cpython.ref cimport PyObject from cpython.tuple cimport PyTuple_GET_ITEM from libc.string cimport memcmp, memcpy, memchr, strcspn, strspn, memmove from libc.stdint cimport uint8_t, uint16_t, uint32_t, int32_t cimport cython cdef extern from "Python.h": void *PyUnicode_DATA(object o) bint PyUnicode_IS_COMPACT_ASCII(object o) object PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) cdef extern from "ascii_check.h": int string_is_ascii(char *string, size_t length) cdef extern from "_conversions.h": const char NUCLEOTIDE_COMPLEMENTS[256] cdef extern from "bam.h": void decode_bam_sequence(void *dest, void *encoded_sequence, size_t length) void decode_bam_qualities(uint8_t *dest, uint8_t *encoded_qualities, size_t length) from .exceptions import FastqFormatError from ._util import shorten def bytes_ascii_check(bytes string, Py_ssize_t length = -1): if length == -1: length = PyBytes_GET_SIZE(string) else: length = min(length, PyBytes_GET_SIZE(string)) cdef bint ascii = string_is_ascii(PyBytes_AS_STRING(string), length) return ascii def is_not_ascii_message(field, value): """ Return an error message for a non-ASCII field encountered when initializing a SequenceRecord Arguments: field: Description of the field ("name", "sequence", "qualities" or similar) in which non-ASCII characters were found value: Unicode string that was intended to be assigned to the 
field """ detail = "" try: value.encode("ascii") except UnicodeEncodeError as e: detail = ( f", but found '{value[e.start:e.end]}' at index {e.start}" ) return f"'{field}' in sequence file must be ASCII encoded{detail}" cdef class SequenceRecord: """ A named sequence with optional quality values. This typically represents a record from a FASTA or FASTQ file. The readers returned by `dnaio.open` yield objects of this type when mode is set to ``"r"`` Attributes: name (str): The read header sequence (str): The nucleotide (or amino acid) sequence qualities (str): None if no quality values are available (such as when the record comes from a FASTA file). If quality values are available, this is a string that contains the Phred-scaled qualities encoded as ASCII(qual+33) (as in FASTQ). Raises: ValueError: One of the provide attributes is not ASCII or the lengths of sequence and qualities differ """ cdef: object _name object _sequence object _qualities object _id object _comment def __init__(self, object name, object sequence, object qualities = None): if not PyUnicode_CheckExact(name): raise TypeError(f"name should be of type str, got {type(name)}") if not PyUnicode_IS_COMPACT_ASCII(name): raise ValueError(is_not_ascii_message("name", name)) if not PyUnicode_CheckExact(sequence): raise TypeError(f"sequence should be of type str, got {type(sequence)}") if not PyUnicode_IS_COMPACT_ASCII(sequence): raise ValueError(is_not_ascii_message("sequence", sequence)) if qualities is not None: if not PyUnicode_CheckExact(qualities): raise TypeError(f"qualities should be of type str, got {type(qualities)}") if not PyUnicode_IS_COMPACT_ASCII(qualities): raise ValueError(is_not_ascii_message("qualities", qualities)) if len(qualities) != len(sequence): rname = shorten(name) raise ValueError("In read named {!r}: length of quality sequence " "({}) and length of read ({}) do not match".format( rname, len(qualities), len(sequence))) self._name = name self._sequence = sequence self._qualities = qualities @property def name(self): return self._name @name.setter def name(self, name): if not PyUnicode_CheckExact(name): raise TypeError(f"name must be of type str, got {type(name)}") if not PyUnicode_IS_COMPACT_ASCII(name): raise ValueError(is_not_ascii_message("name", name)) self._name = name self._id = None self._comment = None @property def sequence(self): return self._sequence @sequence.setter def sequence(self, sequence): if not PyUnicode_CheckExact(sequence): raise TypeError(f"sequence must be of type str, got {type(sequence)}") if not PyUnicode_IS_COMPACT_ASCII(sequence): raise ValueError(is_not_ascii_message("sequence", sequence)) self._sequence = sequence @property def qualities(self): return self._qualities @qualities.setter def qualities(self, qualities): if PyUnicode_CheckExact(qualities): if not PyUnicode_IS_COMPACT_ASCII(qualities): raise ValueError(is_not_ascii_message("qualities", qualities)) elif qualities is None: pass else: raise TypeError( f"qualities must be of type str or None, " f"got {type(qualities)}." ) self._qualities = qualities @property def id(self): """ The header part before any whitespace. This is the unique identifier for the sequence. 
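        The value is computed lazily and cached on first access.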
""" cdef char *name cdef size_t name_length cdef size_t id_length # Not yet cached is None if self._id is None: name = PyUnicode_DATA(self._name) name_length = PyUnicode_GET_LENGTH(self._name) id_length = strcspn(name, "\t ") if id_length == name_length: self._id = self._name else: self._id = PyUnicode_New(id_length, 127) memcpy(PyUnicode_DATA(self._id), name, id_length) return self._id @property def comment(self): """ The header part after the first whitespace. This is usually used to store metadata. It may be empty in which case the attribute is None. """ cdef char *name cdef size_t name_length cdef size_t id_length cdef char *comment_start cdef size_t comment_length # Not yet cached is None if self._comment is None: name = PyUnicode_DATA(self._name) name_length = PyUnicode_GET_LENGTH(self._name) id_length = strcspn(name, "\t ") if id_length == name_length: self._comment = "" else: comment_start = name + id_length + 1 # Skip empty whitespace before comment comment_start = comment_start + strspn(comment_start, '\t ') comment_length = name_length - (comment_start - name) self._comment = PyUnicode_New(comment_length , 127) memcpy(PyUnicode_DATA(self._comment), comment_start, comment_length) # Empty comment is returned as None. This is not stored internally as # None, otherwise the above code would run every time the attribute # was accessed. if PyUnicode_GET_LENGTH(self._comment) == 0: return None return self._comment def __getitem__(self, key): """ Slice this SequenceRecord. If the qualities attribute is not None, it is sliced accordingly. The read name is copied unchanged. Returns: A new `SequenceRecord` object representing the sliced sequence. """ return self.__class__( self._name, self._sequence[key], self._qualities[key] if self._qualities is not None else None, ) def __repr__(self): qstr = '' if self._qualities is not None: qstr = ', qualities={!r}'.format(shorten(self._qualities)) return ''.format( shorten(self._name), shorten(self._sequence), qstr) def __len__(self): """ Returns: The number of characters in the sequence """ return len(self._sequence) def __richcmp__(self, SequenceRecord other, int op): if 2 <= op <= 3: eq = self._name == other._name and \ self._sequence == other._sequence and \ self._qualities == other._qualities if op == 2: return eq else: return not eq else: raise NotImplementedError() def __reduce__(self): return (SequenceRecord, (self._name, self._sequence, self._qualities)) def qualities_as_bytes(self): """ Return the qualities as a bytes object. This is a faster version of ``record.qualities.encode('ascii')``. """ return self._qualities.encode('ascii') def fastq_bytes(self, bint two_headers=False): """ Format this record in FASTQ format Arguments: two_headers (bool): If True, repeat the header (after the ``@``) on the third line (after the ``+``) Returns: A bytes object with the formatted record. This can be written directly to a file. """ if self._qualities is None: raise ValueError("Cannot create a FASTQ record when qualities is not set.") cdef: char *name = PyUnicode_DATA(self._name) char *sequence = PyUnicode_DATA(self._sequence) char *qualities = PyUnicode_DATA(self._qualities) size_t name_length = PyUnicode_GET_LENGTH(self._name) size_t sequence_length = PyUnicode_GET_LENGTH(self._sequence) size_t qualities_length = PyUnicode_GET_LENGTH(self._qualities) # Total size is name + sequence + qualities + 4 newlines + '+' and an # '@' to be put in front of the name. 
cdef Py_ssize_t total_size = name_length + sequence_length + qualities_length + 6 if two_headers: # We need space for the name after the +. total_size += name_length # This is the canonical way to create an uninitialized bytestring of given size cdef bytes retval = PyBytes_FromStringAndSize(NULL, total_size) cdef char *retval_ptr = PyBytes_AS_STRING(retval) # Write the sequences into the bytestring at the correct positions. cdef size_t cursor retval_ptr[0] = b"@" memcpy(retval_ptr + 1, name, name_length) cursor = name_length + 1 retval_ptr[cursor] = b"\n"; cursor += 1 memcpy(retval_ptr + cursor, sequence, sequence_length) cursor += sequence_length retval_ptr[cursor] = b"\n"; cursor += 1 retval_ptr[cursor] = b"+"; cursor += 1 if two_headers: memcpy(retval_ptr + cursor, name, name_length) cursor += name_length retval_ptr[cursor] = b"\n"; cursor += 1 memcpy(retval_ptr + cursor, qualities, qualities_length) cursor += qualities_length retval_ptr[cursor] = b"\n" return retval def fastq_bytes_two_headers(self): # Deprecated, use ``.fastq_bytes(two_headers=True)`` instead. return self.fastq_bytes(two_headers=True) def is_mate(self, SequenceRecord other): """ Check whether this instance and another are part of the same read pair Checking is done by comparing IDs. The ID is the part of the name before the first whitespace. Any 1, 2 or 3 at the end of the IDs is excluded from the check as forward reads may have a 1 appended to their ID and reverse reads a 2 etc. Args: other (SequenceRecord): The object to compare to Returns: bool: Whether this and *other* are part of the same read pair. """ cdef: char *header1_chars = PyUnicode_DATA(self._name) char *header2_chars = PyUnicode_DATA(other._name) size_t header2_length = PyUnicode_GET_LENGTH(other._name) size_t id1_length = strcspn(header1_chars, ' \t') bint id1_ends_with_number = b'1' <= header1_chars[id1_length - 1] <= b'3' return record_ids_match(header1_chars, header2_chars, id1_length, header2_length, id1_ends_with_number) def reverse_complement(self): """ Return a reverse-complemented version of this record. - The name remains unchanged. - The sequence is reverse complemented. - If quality values exist, their order is reversed. """ cdef: Py_ssize_t sequence_length = PyUnicode_GET_LENGTH(self._sequence) object reversed_sequence_obj = PyUnicode_New(sequence_length, 127) object reversed_qualities_obj char *reversed_sequence = PyUnicode_DATA(reversed_sequence_obj) char *sequence = PyUnicode_DATA(self._sequence), char *reversed_qualities char *qualities Py_ssize_t cursor, reverse_cursor unsigned char nucleotide SequenceRecord seq_record reverse_cursor = sequence_length for cursor in range(sequence_length): reverse_cursor -= 1 nucleotide = sequence[cursor] reversed_sequence[reverse_cursor] = NUCLEOTIDE_COMPLEMENTS[nucleotide] if self._qualities is not None: reverse_cursor = sequence_length reversed_qualities_obj = PyUnicode_New(sequence_length, 127) reversed_qualities = PyUnicode_DATA(reversed_qualities_obj) qualities = PyUnicode_DATA(self._qualities) for cursor in range(sequence_length): reverse_cursor -= 1 reversed_qualities[reverse_cursor] = qualities[cursor] else: reversed_qualities_obj = None seq_record = SequenceRecord.__new__(SequenceRecord) seq_record._name = self._name seq_record._sequence = reversed_sequence_obj seq_record._qualities = reversed_qualities_obj return seq_record def paired_fastq_heads(buf1, buf2, Py_ssize_t end1, Py_ssize_t end2): """ Skip forward in the two buffers by multiples of four lines. 
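    This is used when splitting paired FASTQ input into chunks (see
    `read_paired_chunks`): it finds cut points at which both buffers end on
    complete FASTQ records (four lines per record), so that the chunks of the
    two files stay in sync.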
Returns: A tuple (length1, length2) such that buf1[:length1] and buf2[:length2] contain the same number of lines (where the line number is divisible by four). """ # Acquire buffers. Cython automatically checks for errors here. cdef Py_buffer data1_buffer cdef Py_buffer data2_buffer PyObject_GetBuffer(buf1, &data1_buffer, PyBUF_SIMPLE) PyObject_GetBuffer(buf2, &data2_buffer, PyBUF_SIMPLE) cdef: Py_ssize_t linebreaks = 0 char *data1 = data1_buffer.buf char *data2 = data2_buffer.buf # The min() function ensures we do not read beyond the size of the buffer. char *data1_end = data1 + min(end1, data1_buffer.len) char *data2_end = data2 + min(end2, data2_buffer.len) char *pos1 = data1 char *pos2 = data2 char *record_start1 = data1 char *record_start2 = data2 while True: pos1 = memchr(pos1, b'\n', data1_end - pos1) if pos1 == NULL: break pos1 += 1 pos2 = memchr(pos2, b'\n', data2_end - pos2) if pos2 == NULL: break pos2 += 1 linebreaks += 1 if linebreaks == 4: linebreaks = 0 record_start1 = pos1 record_start2 = pos2 # Hit the end of the data block # This code will always be reached, so the buffers are always safely released. PyBuffer_Release(&data1_buffer) PyBuffer_Release(&data2_buffer) return record_start1 - data1, record_start2 - data2 cdef class FastqIter: """ Parse a FASTQ file and yield SequenceRecord objects Arguments: file: a file-like object, opened in binary mode (it must have a readinto method) sequence_class: A custom class to use for the returned instances (instead of SequenceRecord) buffer_size: size of the initial buffer. This is automatically grown if a FASTQ record is encountered that does not fit. Yields: The *first value* that the generator yields is a boolean indicating whether the first record in the FASTQ has a repeated header (in the third row after the ``+``). Subsequent values are SequenceRecord objects (or whichever objects sequence_class returned if specified) """ cdef: Py_ssize_t buffer_size char *buffer Py_ssize_t bytes_in_buffer type sequence_class bint use_custom_class bint extra_newline bint yielded_two_headers bint eof object file char *record_start cdef readonly Py_ssize_t number_of_records def __cinit__(self, file, sequence_class, Py_ssize_t buffer_size): self.buffer_size = buffer_size self.buffer = PyMem_Malloc(self.buffer_size) if self.buffer == NULL: raise MemoryError() self.bytes_in_buffer = 0 self.sequence_class = sequence_class self.use_custom_class = sequence_class is not SequenceRecord self.number_of_records = 0 self.extra_newline = False self.yielded_two_headers = False self.eof = False self.record_start = self.buffer self.file = file if buffer_size < 1: raise ValueError("Starting buffer size too small") def __dealloc__(self): PyMem_Free(self.buffer) cdef _read_into_buffer(self): # This function sets self.record_start at 0 and makes sure self.buffer # starts at the start of a FASTQ record. Any incomplete FASTQ remainder # of the already processed buffer is moved to the start of the buffer # and the rest of the buffer is filled up with bytes from the file. cdef char *tmp cdef Py_ssize_t remaining_bytes if self.record_start == self.buffer and self.bytes_in_buffer == self.buffer_size: # buffer too small, double it self.buffer_size *= 2 tmp = PyMem_Realloc(self.buffer, self.buffer_size) if tmp == NULL: raise MemoryError() self.buffer = tmp else: # Move the incomplete record from the end of the buffer to the beginning. remaining_bytes = self.bytes_in_buffer - (self.record_start - self.buffer) # Memmove copies safely when dest and src overlap. 
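            # (record_start points into this same buffer, so the source and
            # destination regions may overlap.)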
memmove(self.buffer, self.record_start, remaining_bytes) self.bytes_in_buffer = remaining_bytes self.record_start = self.buffer cdef Py_ssize_t empty_bytes_in_buffer = self.buffer_size - self.bytes_in_buffer cdef object filechunk = self.file.read(empty_bytes_in_buffer) if not PyBytes_CheckExact(filechunk): raise TypeError("self.file is not a binary file reader.") cdef Py_ssize_t filechunk_size = PyBytes_GET_SIZE(filechunk) if filechunk_size > empty_bytes_in_buffer: raise ValueError(f"read() returned too much data: " f"{empty_bytes_in_buffer} bytes requested, " f"{filechunk_size} bytes returned.") memcpy(self.buffer + self.bytes_in_buffer, PyBytes_AS_STRING(filechunk), filechunk_size) # Strings are tested for ASCII as FASTQ should only contain ASCII characters. if not string_is_ascii(self.buffer + self.bytes_in_buffer, filechunk_size): raise FastqFormatError( "Non-ASCII characters found in record.", None) self.bytes_in_buffer += filechunk_size if filechunk_size == 0: # End of file if self.bytes_in_buffer == 0: # EOF Reached. Stop iterating. self.eof = True elif not self.extra_newline and self.buffer[self.bytes_in_buffer - 1] != b'\n': # There is still data in the buffer and its last character is # not a newline: This is a file that is missing the final # newline. Append a newline and continue. self.buffer[self.bytes_in_buffer] = b'\n' self.bytes_in_buffer += 1 self.extra_newline = True else: # Incomplete FASTQ records are present. if self.extra_newline: # Do not report the linefeed that was added by dnaio but # was not present in the original input. self.bytes_in_buffer -= 1 record = PyUnicode_DecodeASCII(self.record_start, self.bytes_in_buffer, NULL) lines = record.count('\n') raise FastqFormatError( 'Premature end of file encountered. The incomplete final record was: ' '{!r}'.format(shorten(record, 500)), line=self.number_of_records * 4 + lines) def __iter__(self): return self def __next__(self): cdef: object ret_val SequenceRecord seq_record char *name_start char *name_end char *sequence_start char *sequence_end char *second_header_start char *second_header_end char *qualities_start char *qualities_end char *buffer_end size_t remaining_bytes Py_ssize_t name_length, sequence_length, second_header_length, qualities_length # Repeatedly attempt to parse the buffer until we have found a full record. # If an attempt fails, we read more data before retrying. while True: buffer_end = self.buffer + self.bytes_in_buffer if self.eof: raise StopIteration() ### Check for a complete record (i.e 4 newlines are present) # Use libc memchr, this optimizes looking for characters by # using 64-bit integers. See: # https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=string/memchr.c;hb=HEAD # void *memchr(const void *str, int c, size_t n) name_end = memchr(self.record_start, b'\n', (buffer_end - self.record_start)) if name_end == NULL: self._read_into_buffer() continue # self.bytes_in_buffer - sequence_start is always nonnegative: # - name_end is at most self.bytes_in_buffer - 1 # - thus sequence_start is at most self.bytes_in_buffer sequence_start = name_end + 1 sequence_end = memchr(sequence_start, b'\n', (buffer_end - sequence_start)) if sequence_end == NULL: self._read_into_buffer() continue second_header_start = sequence_end + 1 remaining_bytes = (buffer_end - second_header_start) # Usually there is no second header, so we skip the memchr call. 
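            # An empty second header shows up as a line containing just "+",
            # so the third line then starts with the two bytes "+\n".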
if remaining_bytes > 2 and memcmp(second_header_start, b"+\n", 2) == 0: second_header_end = second_header_start + 1 else: second_header_end = memchr(second_header_start, b'\n', (remaining_bytes)) if second_header_end == NULL: self._read_into_buffer() continue qualities_start = second_header_end + 1 qualities_end = memchr(qualities_start, b'\n', (buffer_end - qualities_start)) if qualities_end == NULL: self._read_into_buffer() continue if self.record_start[0] != b'@': raise FastqFormatError("Line expected to " "start with '@', but found {!r}".format(chr(self.record_start[0])), line=self.number_of_records * 4) if second_header_start[0] != b'+': raise FastqFormatError("Line expected to " "start with '+', but found {!r}".format(chr(second_header_start[0])), line=self.number_of_records * 4 + 2) name_start = self.record_start + 1 # Skip @ second_header_start += 1 # Skip + name_length = name_end - name_start sequence_length = sequence_end - sequence_start second_header_length = second_header_end - second_header_start qualities_length = qualities_end - qualities_start # Check for \r\n line-endings and compensate if (name_end - 1)[0] == b'\r': name_length -= 1 if (sequence_end - 1)[0] == b'\r': sequence_length -= 1 if (second_header_end - 1)[0] == b'\r': second_header_length -= 1 if (qualities_end - 1)[0] == b'\r': qualities_length -= 1 if second_header_length: # should be 0 when only + is present if (name_length != second_header_length or memcmp(second_header_start, name_start, second_header_length) != 0): raise FastqFormatError( "Sequence descriptions don't match ('{}' != '{}').\n" "The second sequence description must be either " "empty or equal to the first description.".format( PyUnicode_DecodeASCII(name_start, name_length, NULL), PyUnicode_DecodeASCII(second_header_start, second_header_length, NULL)), line=self.number_of_records * 4 + 2) if qualities_length != sequence_length: raise FastqFormatError( "Length of sequence and qualities differ", line=self.number_of_records * 4 + 3) if self.number_of_records == 0 and not self.yielded_two_headers: self.yielded_two_headers = True return bool(second_header_length) # first yielded value is special # Constructing objects with PyUnicode_New and memcpy bypasses some of # the checks otherwise done when using PyUnicode_DecodeLatin1 or similar name = PyUnicode_New(name_length, 127) sequence = PyUnicode_New(sequence_length, 127) qualities = PyUnicode_New(qualities_length, 127) if name == NULL or sequence == NULL or qualities == NULL: raise MemoryError() memcpy(PyUnicode_DATA(name), name_start, name_length) memcpy(PyUnicode_DATA(sequence), sequence_start, sequence_length) memcpy(PyUnicode_DATA(qualities), qualities_start, qualities_length) if self.use_custom_class: ret_val = self.sequence_class(name, sequence, qualities) else: seq_record = SequenceRecord.__new__(SequenceRecord) seq_record._name = name seq_record._sequence = sequence seq_record._qualities = qualities ret_val = seq_record # Advance record to next position self.number_of_records += 1 self.record_start = qualities_end + 1 return ret_val cdef struct BamRecordHeader: uint32_t block_size int32_t reference_id int32_t pos uint8_t l_read_name uint8_t mapq uint16_t bin uint16_t n_cigar_op uint16_t flag uint32_t l_seq int32_t next_ref_id int32_t next_pos int32_t tlen cdef class BamIter: cdef: uint8_t *record_start uint8_t *buffer_end size_t read_in_size uint8_t *read_in_buffer size_t read_in_buffer_size object file readonly object header readonly Py_ssize_t number_of_records def __dealloc__(self): 
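        # Release the C-level read-in buffer when the iterator is destroyed.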
PyMem_Free(self.read_in_buffer) def __cinit__(self, fileobj, read_in_size = 48 * 1024): if read_in_size < 4: raise ValueError(f"read_in_size must be at least 4, got " f"{read_in_size}") # Skip ahead and save the BAM header for later inspection magic_and_header_size = fileobj.read(8) if not isinstance(magic_and_header_size, bytes): raise TypeError(f"fileobj {fileobj} is not a binary IO type, " f"got {type(fileobj)}") if len(magic_and_header_size) < 8: raise EOFError("Truncated BAM file") if magic_and_header_size[:4] != b"BAM\1": raise ValueError( f"fileobj {fileobj} is not a BAM file. No BAM magic, instead " f"found {magic_and_header_size[:4]}") l_text = int.from_bytes(magic_and_header_size[4:], "little", signed=False) header = fileobj.read(l_text) if len(header) < l_text: raise EOFError("Truncated BAM file") n_ref_obj = fileobj.read(4) if len(n_ref_obj) < 4: raise EOFError("Truncated BAM file") n_ref = int.from_bytes(n_ref_obj, "little", signed=False) for i in range(n_ref): l_name_obj = fileobj.read(4) if len(l_name_obj) < 4: raise EOFError("Truncated BAM file") l_name = int.from_bytes(l_name_obj, "little", signed=False) reference_chunk_size = l_name + 4 # Include name and uint32_t of size reference_chunk = fileobj.read(reference_chunk_size) if len(reference_chunk) < reference_chunk_size: raise EOFError("Truncated BAM file") # Fileobj is now skipped ahead and at the start of the BAM records self.header = header self.read_in_size = read_in_size self.file = fileobj self.read_in_buffer = NULL self.read_in_buffer_size = 0 self.record_start = self.read_in_buffer self.buffer_end = self.record_start def __iter__(self): return self cdef _read_into_buffer(self): cdef size_t read_in_size cdef size_t leftover_size = self.buffer_end - self.record_start cdef uint32_t block_size memmove(self.read_in_buffer, self.record_start, leftover_size) self.record_start = self.read_in_buffer self.buffer_end = self.record_start + leftover_size if leftover_size >= 4: # Immediately check how much data is at least required block_size = (<uint32_t *>self.record_start)[0] read_in_size = max(block_size, self.read_in_size) else: read_in_size = self.read_in_size - leftover_size new_bytes = self.file.read(read_in_size) cdef size_t new_bytes_size = PyBytes_GET_SIZE(new_bytes) cdef uint8_t *new_bytes_buf = <uint8_t *>PyBytes_AS_STRING(new_bytes) cdef size_t new_buffer_size = leftover_size + new_bytes_size if new_buffer_size == 0: # File completely read raise StopIteration() elif new_bytes_size == 0: raise EOFError("Incomplete record at the end of file") cdef uint8_t *tmp if new_buffer_size > self.read_in_buffer_size: tmp = <uint8_t *>PyMem_Realloc(self.read_in_buffer, new_buffer_size) if tmp == NULL: raise MemoryError() self.read_in_buffer = tmp self.read_in_buffer_size = new_buffer_size memcpy(self.read_in_buffer + leftover_size, new_bytes_buf, new_bytes_size) self.record_start = self.read_in_buffer self.buffer_end = self.record_start + new_buffer_size def __next__(self): cdef: SequenceRecord seq_record uint8_t *record_start uint8_t *buffer_end uint32_t record_size uint8_t *record_end BamRecordHeader header uint8_t *bam_name_start uint32_t name_length uint8_t *bam_seq_start uint32_t seq_length uint8_t *bam_qual_start uint32_t encoded_seq_length while True: record_start = self.record_start buffer_end = self.buffer_end if record_start + 4 >= buffer_end: self._read_into_buffer() continue record_size = (<uint32_t *>record_start)[0] record_end = record_start + 4 + record_size if record_end > buffer_end: self._read_into_buffer() continue header = (<BamRecordHeader *>record_start)[0]
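# A flag value of exactly 4 means "segment unmapped" with no other SAM
# flag bits set, i.e. an unmapped single-end read. Everything else is
# rejected below.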
if header.flag != 4: raise NotImplementedError( "The BAM parser has been implemented with unmapped single " "reads in mind to support ONT sequencing input. Mapped " "BAM files and files with paired reads are not supported. " "Please use samtools fastq to make the appropriate " "conversion to FASTQ format." ) bam_name_start = record_start + sizeof(BamRecordHeader) name_length = header.l_read_name bam_seq_start = bam_name_start + name_length + header.n_cigar_op * sizeof(uint32_t) name_length -= 1 # Do not include the null byte seq_length = header.l_seq encoded_seq_length = (seq_length + 1) // 2 bam_qual_start = bam_seq_start + encoded_seq_length name = PyUnicode_New(name_length, 127) sequence = PyUnicode_New(seq_length, 127) qualities = PyUnicode_New(seq_length, 127) memcpy(PyUnicode_DATA(name), bam_name_start, name_length) decode_bam_sequence(<uint8_t *>PyUnicode_DATA(sequence), bam_seq_start, seq_length) decode_bam_qualities(<uint8_t *>PyUnicode_DATA(qualities), bam_qual_start, seq_length) seq_record = SequenceRecord.__new__(SequenceRecord) seq_record._name = name seq_record._sequence = sequence seq_record._qualities = qualities self.number_of_records += 1 self.record_start = record_end return seq_record def record_names_match(header1: str, header2: str): """ Check whether the sequence record IDs in header1 and header2 are compatible, ignoring a suffix of '1', '2' or '3'. This exception makes it possible to check some old paired-end reads that have IDs ending in '/1' and '/2'. Also, the fastq-dump tool (used for converting SRA files to FASTQ) appends '.1', '.2' and sometimes '.3' to paired-end reads if option -I is used. Deprecated, use `SequenceRecord.is_mate` instead """ cdef: char *header1_chars = NULL char *header2_chars = NULL size_t header2_length if PyUnicode_CheckExact(header1): if PyUnicode_IS_COMPACT_ASCII(header1): header1_chars = <char *>PyUnicode_DATA(header1) else: raise ValueError("header1 must be a valid ASCII-string.") else: raise TypeError(f"Header 1 is the wrong type. Expected a string, " f"got: {type(header1)}") if PyUnicode_CheckExact(header2): if PyUnicode_IS_COMPACT_ASCII(header2): header2_chars = <char *>PyUnicode_DATA(header2) header2_length = PyUnicode_GET_LENGTH(header2) else: raise ValueError("header2 must be a valid ASCII-string.") else: raise TypeError(f"Header 2 is the wrong type. Expected a string, " f"got: {type(header2)}") cdef size_t id1_length = strcspn(header1_chars, b' \t') cdef bint id1_ends_with_number = b'1' <= header1_chars[id1_length - 1] <= b'3' return record_ids_match(header1_chars, header2_chars, id1_length, header2_length, id1_ends_with_number) cdef inline bint record_ids_match(char *header1, char *header2, size_t id1_length, size_t header2_length, bint id1_ends_with_number): """ Check whether the ASCII-encoded IDs match. header1, header2: pointers to the ASCII-encoded headers id1_length: the length of header1 before the first whitespace header2_length: the full length of header2 id1_ends_with_number: whether id1 ends with a 1, 2 or 3 """ if header2_length < id1_length: return False cdef char end = header2[id1_length] if end != b'\000' and end != b' ' and end != b'\t': return False # Check if the IDs end with 1, 2 or 3. This is the read pair number # which should not be included in the comparison. cdef bint id2_ends_with_number = b'1' <= header2[id1_length - 1] <= b'3' if id1_ends_with_number and id2_ends_with_number: id1_length -= 1 # Compare the strings up to the ID end position.
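# Example: header1 "read1/1" and header2 "read1/2 extra" give an
# id1_length of 7. Both IDs end in a pair digit, so only the first six
# bytes ("read1/") take part in the comparison and the IDs match.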
return memcmp(header1, header2, id1_length) == 0 def records_are_mates(*args) -> bool: """ Check if the provided `SequenceRecord` objects are all mates of each other by comparing their record IDs. Accepts two or more `SequenceRecord` objects. This is the same as `SequenceRecord.is_mate` in the case of only two records, but allows for cases where information is split into three records or more (such as UMI, R1, R2 or index, R1, R2). If there are only two records to check, prefer `SequenceRecord.is_mate`. Example usage:: for records in zip(*all_my_fastq_readers): if not records_are_mates(*records): raise MateError(f"IDs do not match for {records}") Args: *args: two or more `~dnaio.SequenceRecord` objects Returns: True or False """ cdef Py_ssize_t args_length = len(args) if args_length < 2: raise TypeError("records_are_mates requires at least two arguments") cdef SequenceRecord first = <SequenceRecord>PyTuple_GET_ITEM(args, 0) if Py_TYPE(first) != SequenceRecord: raise TypeError(f"{first!r} is not a SequenceRecord object") cdef: object first_name_obj = first._name char *first_name = <char *>PyUnicode_DATA(first_name_obj) Py_ssize_t first_name_length = PyUnicode_GET_LENGTH(first_name_obj) Py_ssize_t id_length = strcspn(first_name, b' \t') bint id_ends_with_number = b'1' <= first_name[id_length - 1] <= b'3' SequenceRecord other object other_name_obj char *other_name Py_ssize_t other_name_length bint other_id_ends_with_number char end_char bint are_mates = True Py_ssize_t i for i in range(1, args_length): other = <SequenceRecord>PyTuple_GET_ITEM(args, i) if Py_TYPE(other) != SequenceRecord: raise TypeError(f"{other!r} is not a SequenceRecord object") other_name_obj = other._name other_name = <char *>PyUnicode_DATA(other_name_obj) other_name_length = PyUnicode_GET_LENGTH(other_name_obj) # If a match is false, are_mates will stay false regardless of any true checks afterward. are_mates &= record_ids_match(first_name, other_name, id_length, other_name_length, id_ends_with_number) return are_mates dnaio-1.2.0/src/dnaio/_util.py000066400000000000000000000003421453560735400161660ustar00rootroot00000000000000from typing import Optional def shorten(s: Optional[str], n: int = 100) -> Optional[str]: """Shorten string s to at most n characters, appending "..." if necessary.""" if s is None: return None if len(s) > n: s = s[: n - 3] + "..." return s dnaio-1.2.0/src/dnaio/_version.pyi000066400000000000000000000002241453560735400170460ustar00rootroot00000000000000# The _version.py file is generated on installation. By including this stub, # we can run mypy without having to install the package. version: str dnaio-1.2.0/src/dnaio/ascii_check.h000066400000000000000000000031641453560735400171030ustar00rootroot00000000000000#include <stdint.h> #include <stddef.h> #ifdef __SSE2__ #include "emmintrin.h" #endif #define ASCII_MASK_8BYTE 0x8080808080808080ULL #define ASCII_MASK_1BYTE 0x80 /** * @brief Check if a string of given length only contains ASCII characters. * * @param string A char pointer to the start of the string. * @param length The length of the string. This function does not check for * terminating NULL bytes. * @returns 1 if the string is ASCII-only, 0 otherwise. */ static int string_is_ascii(const char * string, size_t length) { // By performing bitwise OR on all characters in 8-byte chunks (16-byte // with SSE2) we can // determine ASCII status in a non-branching (except the loops) fashion.
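// A byte is non-ASCII exactly when its most significant bit (0x80) is
// set, so OR-ing all bytes together and masking the result with
// ASCII_MASK_8BYTE yields zero if and only if every byte was ASCII.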
uint64_t all_chars = 0; const char *cursor = string; const char *string_end_ptr = string + length; const char *string_8b_end_ptr = string_end_ptr - sizeof(uint64_t); int non_ascii_in_vec = 0; #ifdef __SSE2__ const char *string_16b_end_ptr = string_end_ptr - sizeof(__m128i); __m128i vec_all_chars = _mm_setzero_si128(); while (cursor < string_16b_end_ptr) { __m128i loaded_chars = _mm_loadu_si128((__m128i *)cursor); vec_all_chars = _mm_or_si128(loaded_chars, vec_all_chars); cursor += sizeof(__m128i); } non_ascii_in_vec = _mm_movemask_epi8(vec_all_chars); #endif while (cursor < string_8b_end_ptr) { all_chars |= *(uint64_t *)cursor; cursor += sizeof(uint64_t); } while (cursor < string_end_ptr) { all_chars |= *cursor; cursor += 1; } return !(non_ascii_in_vec + (all_chars & ASCII_MASK_8BYTE)); } dnaio-1.2.0/src/dnaio/bam.h000066400000000000000000000127401453560735400154150ustar00rootroot00000000000000#include <assert.h> #include <stdint.h> #include <stddef.h> #include <string.h> #ifdef __SSE2__ #include "emmintrin.h" #endif #ifdef __SSSE3__ #include "tmmintrin.h" #endif static void decode_bam_sequence(uint8_t *dest, const uint8_t *encoded_sequence, size_t length) { /* Reuse a trick from sam_internal.h in htslib. Have a table to lookup two characters simultaneously.*/ static const char code2base[512] = "===A=C=M=G=R=S=V=T=W=Y=H=K=D=B=N" "A=AAACAMAGARASAVATAWAYAHAKADABAN" "C=CACCCMCGCRCSCVCTCWCYCHCKCDCBCN" "M=MAMCMMMGMRMSMVMTMWMYMHMKMDMBMN" "G=GAGCGMGGGRGSGVGTGWGYGHGKGDGBGN" "R=RARCRMRGRRRSRVRTRWRYRHRKRDRBRN" "S=SASCSMSGSRSSSVSTSWSYSHSKSDSBSN" "V=VAVCVMVGVRVSVVVTVWVYVHVKVDVBVN" "T=TATCTMTGTRTSTVTTTWTYTHTKTDTBTN" "W=WAWCWMWGWRWSWVWTWWWYWHWKWDWBWN" "Y=YAYCYMYGYRYSYVYTYWYYYHYKYDYBYN" "H=HAHCHMHGHRHSHVHTHWHYHHHKHDHBHN" "K=KAKCKMKGKRKSKVKTKWKYKHKKKDKBKN" "D=DADCDMDGDRDSDVDTDWDYDHDKDDDBDN" "B=BABCBMBGBRBSBVBTBWBYBHBKBDBBBN" "N=NANCNMNGNRNSNVNTNWNYNHNKNDNBNN"; static const uint8_t *nuc_lookup = (uint8_t *)"=ACMGRSVTWYHKDBN"; const uint8_t *dest_end_ptr = dest + length; uint8_t *dest_cursor = dest; const uint8_t *encoded_cursor = encoded_sequence; #ifdef __SSSE3__ const uint8_t *dest_vec_end_ptr = dest_end_ptr - (2 * sizeof(__m128i)); __m128i first_upper_shuffle = _mm_setr_epi8( 0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7, 0xff); __m128i first_lower_shuffle = _mm_setr_epi8( 0xff, 0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7); __m128i second_upper_shuffle = _mm_setr_epi8( 8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15, 0xff); __m128i second_lower_shuffle = _mm_setr_epi8( 0xff, 8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15); __m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)nuc_lookup); /* Work on 16 encoded characters at a time, resulting in 32 decoded characters. Examples are given for 8 encoded characters A until H to keep it readable. Encoded stored as |AB|CD|EF|GH| Shuffle into |AB|00|CD|00|EF|00|GH|00| and |00|AB|00|CD|00|EF|00|GH| Shift upper to the right resulting into |0A|B0|0C|D0|0E|F0|0G|H0| and |00|AB|00|CD|00|EF|00|GH| Merge with or resulting into (X stands for garbage) |0A|XB|0C|XD|0E|XF|0G|XH| Bitwise and with 0b1111 leads to: |0A|0B|0C|0D|0E|0F|0G|0H| We can use the resulting 4-bit integers as indexes for the shuffle of the nucleotide lookup.
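Concrete example: the encoded byte 0x12 holds the 4-bit codes 1 (upper
nibble) and 2 (lower nibble); these index 'A' and 'C' in nuc_lookup, so
the byte decodes to the two bases "AC".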
*/ while (dest_cursor < dest_vec_end_ptr) { __m128i encoded = _mm_lddqu_si128((__m128i *)encoded_cursor); __m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle); __m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle); __m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4); __m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower); __m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(0b1111)); __m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes); _mm_storeu_si128((__m128i *)dest_cursor, first_nucleotides); __m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle); __m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle); __m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4); __m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower); __m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(0b1111)); __m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes); _mm_storeu_si128((__m128i *)(dest_cursor + 16), second_nucleotides); encoded_cursor += sizeof(__m128i); dest_cursor += 2 * sizeof(__m128i); } #endif /* Do two at a time until it gets to the last even address. */ const uint8_t *dest_end_ptr_twoatatime = dest + (length & (~1ULL)); while (dest_cursor < dest_end_ptr_twoatatime) { /* According to htslib, size_t cast helps the optimizer. Code confirmed to indeed run faster. */ memcpy(dest_cursor, code2base + ((size_t)*encoded_cursor * 2), 2); dest_cursor += 2; encoded_cursor += 1; } assert((dest_end_ptr - dest_cursor) < 2); if (dest_cursor != dest_end_ptr) { /* There is a single encoded nuc left */ uint8_t encoded_nucs = *encoded_cursor; uint8_t upper_nuc_index = encoded_nucs >> 4; dest_cursor[0] = nuc_lookup[upper_nuc_index]; } } static void decode_bam_qualities(uint8_t *dest, const uint8_t *encoded_qualities, size_t length) { const uint8_t *end_ptr = encoded_qualities + length; const uint8_t *cursor = encoded_qualities; uint8_t *dest_cursor = dest; #ifdef __SSE2__ const uint8_t *vec_end_ptr = end_ptr - sizeof(__m128i); while (cursor < vec_end_ptr) { __m128i quals = _mm_loadu_si128((__m128i *)cursor); __m128i phreds = _mm_add_epi8(quals, _mm_set1_epi8(33)); _mm_storeu_si128((__m128i *)dest_cursor, phreds); cursor += sizeof(__m128i); dest_cursor += sizeof(__m128i); } #endif while (cursor < end_ptr) { *dest_cursor = *cursor + 33; cursor += 1; dest_cursor += 1; } }dnaio-1.2.0/src/dnaio/chunks.py000066400000000000000000000207121453560735400163500ustar00rootroot00000000000000""" Chunked reading of FASTA and FASTQ files This can be used to very quickly split up the input file into similarly-sized chunks, without actually parsing the records. The chunks can then be distributed to worker threads or subprocesses and be parsed and processed there. """ from io import RawIOBase from typing import Optional, Iterator, Tuple from ._core import paired_fastq_heads as _paired_fastq_heads from .exceptions import FileFormatError, FastaFormatError, UnknownFileFormat def _fasta_head(buf: bytes, end: Optional[int] = None) -> int: """ Search for the end of the last complete FASTA record within buf[:end] Return an integer length such that buf[:length] contains the highest possible number of complete FASTA records.
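A small illustration (the trailing partial record is held back until more data arrives)::

    >>> _fasta_head(b">a\\nSEQ\\n>b\\nSE")
    7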
""" pos = buf.rfind(b"\n>", 0, end) if pos != -1: return pos + 1 if buf[0:1] == b">" or buf[0:1] == b"#": return 0 if len(buf) == 0: return 0 c = chr(buf[0]) raise FastaFormatError( f"FASTA file expected to start with '>', but found {repr(c)}", line=None, ) def _paired_fasta_heads( buf1: bytes, buf2: bytes, end1: int, end2: int ) -> Tuple[int, int]: """ Return positions pos1, pos2 where right1 <= end1 and right2 <= end2 such that buf1[:pos1] and buf2[:pos2] contain the same number of complete FASTA records. """ if end1 == 0 or end2 == 0: return (0, 0) if (end1 > 0 and buf1[:1] != b">") or (end2 > 0 and buf2[:1] != b">"): raise FastaFormatError("FASTA file expected to start with '>'", line=None) # Count complete records n_records1 = buf1.count(b"\n>", 0, end1) n_records2 = buf2.count(b"\n>", 0, end2) n_records = min(n_records1, n_records2) pos1 = pos2 = 0 while n_records > 0: pos1 = buf1.find(b"\n>", pos1, end1) + 1 pos2 = buf2.find(b"\n>", pos2, end2) + 1 n_records -= 1 return (pos1, pos2) def _fastq_head(buf: bytes, end: Optional[int] = None) -> int: """ Search for the end of the last complete *two* FASTQ records in buf[:end]. Two FASTQ records are required to ensure that read pairs in interleaved paired-end data are not split. """ linebreaks = buf.count(b"\n", 0, end) right = end for _ in range(linebreaks % 8 + 1): right = buf.rfind(b"\n", 0, right) # Note that this works even if linebreaks == 0: # rfind() returns -1 and adding 1 gives index 0, # which is correct. return right + 1 # type: ignore def read_chunks(f: RawIOBase, buffer_size: int = 4 * 1024**2) -> Iterator[memoryview]: """ Read chunks of complete FASTA or FASTQ records from a file. If the format is detected to be FASTQ, all chunks except possibly the last contain an even number of records such that interleaved paired-end reads remain in sync. The yielded memoryview objects are only valid for one iteration because the internal buffer is re-used in the next iteration. Arguments: f: File with FASTA or FASTQ reads; must have been opened in binary mode buffer_size: Largest allowed chunk size Yields: memoryview representing the chunk. This becomes invalid on the next iteration. Raises: ValueError: A FASTQ record was encountered that is larger than *buffer_size*. UnknownFileFormat: The file format could not be detected (the first byte must be "@", ">" or "#") """ # This buffer is re-used in each iteration. buf = bytearray(buffer_size) # Read one byte to determine file format. # If there is a comment char, we assume FASTA! start = f.readinto(memoryview(buf)[0:1]) if start == 0: # Empty file return assert start == 1 if buf[0:1] == b"@": head = _fastq_head elif buf[0:1] == b"#" or buf[0:1] == b">": head = _fasta_head else: raise UnknownFileFormat( f"Cannnot determine input file format: First character expected to be '>' or '@', " f"but found {repr(chr(buf[0]))}" ) # Layout of buf # # |-- complete records --| # +---+------------------+---------+-------+ # | | | | | # +---+------------------+---------+-------+ # ^ ^ ^ ^ ^ # 0 start end bufend len(buf) # # buf[0:start] is the 'leftover' data that could not be processed # in the previous iteration because it contained an incomplete # FASTA or FASTQ record. 
while True: if start == len(buf): raise OverflowError("FASTA/FASTQ record does not fit into buffer") bufend = f.readinto(memoryview(buf)[start:]) + start # type: ignore if start == bufend: # End of file break end = head(buf, bufend) assert end <= bufend if end > 0: yield memoryview(buf)[0:end] start = bufend - end assert start >= 0 buf[0:start] = buf[end:bufend] if start > 0: yield memoryview(buf)[0:start] def read_paired_chunks( f: RawIOBase, f2: RawIOBase, buffer_size: int = 4 * 1024**2, ) -> Iterator[Tuple[memoryview, memoryview]]: """ Read chunks of paired-end FASTA or FASTQ records from two files. A pair of chunks (memoryview objects) is yielded on each iteration, and both chunks are guaranteed to have the same number of sequences. That is, the paired-end reads will stay in sync. The memoryviews are only valid for one iteration because the internal buffer is re-used in the next iteration. This is similar to `read_chunks`, but for paired-end data. Args: f: File with R1 reads; must have been opened in binary mode f2: File with R2 reads; must have been opened in binary mode buffer_size: Largest allowed chunk size Yields: Pairs of memoryview objects. Raises: ValueError: A FASTA or FASTQ record was encountered that is larger than *buffer_size*. """ if buffer_size < 6: raise ValueError("Buffer size too small") buf1 = bytearray(buffer_size) buf2 = bytearray(buffer_size) # Read one byte from each file to determine the file format start1 = f.readinto(memoryview(buf1)[0:1]) start2 = f2.readinto(memoryview(buf2)[0:1]) if start1 == 0 and start2 == 0: return if (start1 == 0) != (start2 == 0): i = 2 if start1 == 0 else 1 raise FileFormatError( f"Paired-end reads not in sync: File with R{i} reads is empty and the other is not", line=None, ) if (buf1[:1] == b"@") != (buf2[:1] == b"@"): raise FileFormatError( "Paired-end data must be in FASTQ format when using multiple cores", line=None, ) if buf1[:1] == b"@": file_format = "FASTQ" paired_heads = _paired_fastq_heads elif buf1[:1] == b">": file_format = "FASTA" paired_heads = _paired_fasta_heads else: raise FileFormatError( "First character in input file must be '@' (FASTQ) or '>' (FASTA), " f"but found {buf1[:1]}", line=None, ) while True: if start1 == len(buf1) and start2 == len(buf2): raise ValueError( f"FASTA/FASTQ records do not fit into buffer of size {buffer_size}" ) bufend1 = f.readinto(memoryview(buf1)[start1:]) + start1 # type: ignore bufend2 = f2.readinto(memoryview(buf2)[start2:]) + start2 # type: ignore if start1 == bufend1 and start2 == bufend2: break end1, end2 = paired_heads(buf1, buf2, bufend1, bufend2) assert end1 <= bufend1 assert end2 <= bufend2 if end1 > 0 or end2 > 0 or file_format == "FASTA": yield (memoryview(buf1)[0:end1], memoryview(buf2)[0:end2]) else: assert end1 == 0 and end2 == 0 extra = "" if bufend1 == 0 or bufend2 == 0: i = 1 if bufend1 == 0 else 2 extra = f". 
File {i} ended, but more data found in the other file" raise FileFormatError( f"Premature end of paired-end input{extra}.", line=None ) start1 = bufend1 - end1 assert start1 >= 0 buf1[0:start1] = buf1[end1:bufend1] start2 = bufend2 - end2 assert start2 >= 0 buf2[0:start2] = buf2[end2:bufend2] if start1 > 0 or start2 > 0: yield (memoryview(buf1)[0:start1], memoryview(buf2)[0:start2]) dnaio-1.2.0/src/dnaio/exceptions.py000066400000000000000000000020461453560735400172360ustar00rootroot00000000000000from typing import Optional class UnknownFileFormat(Exception): """ The file format could not be automatically detected """ class FileFormatError(Exception): """ The file is not formatted correctly Attributes: line: If available, the number of the line at which the error occurred or None if not. The first line has index 0. """ format = "sequence" # Something generic that works for both FASTA and FASTQ def __init__(self, msg: str, line: Optional[int]): super().__init__(msg, line) self.message = msg self.line = line # starts at 0! def __str__(self): line = "unknown line" if self.line is None else f"line {self.line + 1}" return f"Error in {self.format} file at {line}: {self.message}" class FastqFormatError(FileFormatError): """ The FASTQ file is not formatted correctly """ format = "FASTQ" class FastaFormatError(FileFormatError): """ The FASTA file is not formatted correctly """ format = "FASTA" dnaio-1.2.0/src/dnaio/interfaces.py000066400000000000000000000060551453560735400172040ustar00rootroot00000000000000from abc import abstractmethod from contextlib import AbstractContextManager from typing import Iterable, Iterator, Tuple from dnaio import SequenceRecord class SingleEndReader(AbstractContextManager): delivers_qualities: bool number_of_records: int @abstractmethod def __iter__(self) -> Iterator[SequenceRecord]: """ Iterate over an input containing sequence records Yields: `SequenceRecord` objects Raises: `FileFormatError` if there was a parse error """ @abstractmethod def close(self) -> None: pass class PairedEndReader(AbstractContextManager): @abstractmethod def __iter__(self) -> Iterator[Tuple[SequenceRecord, SequenceRecord]]: """ Iterate over an input containing paired-end records Yields: Pairs of `SequenceRecord` objects Raises: `FileFormatError` if there was a parse error or if reads are improperly paired, that is, if there are more reads in one file than the other or if the record IDs do not match (according to `SequenceRecord.is_mate`). """ @abstractmethod def close(self) -> None: pass class SingleEndWriter(AbstractContextManager): @abstractmethod def write(self, record: SequenceRecord) -> None: """Write a `SequenceRecord` to the output.""" @abstractmethod def close(self) -> None: pass class PairedEndWriter(AbstractContextManager): @abstractmethod def write(self, record1: SequenceRecord, record2: SequenceRecord) -> None: """ Write a pair of `SequenceRecord` objects to the paired-end output. This method does not verify that both records have matching IDs because this was already done at parsing time. If it is possible that the record IDs no longer match, check that ``record1.is_mate(record2)`` returns True before calling this method. """ @abstractmethod def close(self) -> None: pass class MultipleFileWriter(AbstractContextManager): _number_of_files: int @abstractmethod def write(self, *records: SequenceRecord) -> None: """ Write N SequenceRecords to the output. N must be equal to the number of files the MultipleFileWriter was initialized with. 
This method does not check whether the records are properly paired. """ @abstractmethod def write_iterable(self, list_of_records: Iterable[Tuple[SequenceRecord, ...]]): """ Iterate over the list (or other iterable container) and write all N-tuples of SequenceRecord to disk. N must be equal to the number of files the MultipleFileWriter was initialized with. This method does not check whether the records are properly paired. This method may provide a speed boost over calling write for each tuple of SequenceRecords individually. """ @abstractmethod def close(self) -> None: pass dnaio-1.2.0/src/dnaio/multipleend.py000066400000000000000000000211521453560735400173760ustar00rootroot00000000000000import contextlib import os from os import PathLike from typing import BinaryIO, IO, Iterable, Iterator, List, Optional, Tuple, Union from xopen import xopen from ._core import SequenceRecord, records_are_mates from .exceptions import FileFormatError from .interfaces import MultipleFileWriter from .readers import FastaReader, FastqReader from .singleend import _open_single, _detect_format_from_name from .writers import FastaWriter, FastqWriter def _open_multiple( *files: Union[str, PathLike, BinaryIO], fileformat: Optional[str] = None, mode: str = "r", qualities: Optional[bool] = None, opener=xopen, ): if not files: raise ValueError("At least one file is required") if mode not in ("r", "w", "a"): raise ValueError("Mode must be one of 'r', 'w', 'a'") elif mode == "r": return MultipleFileReader(*files, fileformat=fileformat, opener=opener) elif mode == "w" and fileformat is None: # Assume mixed files will not be offered. for file in files: if isinstance(file, (str, os.PathLike)): fileformat = _detect_format_from_name(os.fspath(file)) append = mode == "a" if fileformat == "fastq" or qualities or (fileformat is None and qualities is None): return MultipleFastqWriter(*files, opener=opener, append=append) return MultipleFastaWriter(*files, opener=opener, append=append) class MultipleFileReader: """ Read multiple FASTA/FASTQ files simultaneously. Useful when additional FASTQ files with extra information are supplied (UMIs, indices etc.). While this class can be instantiated directly, the recommended way is to use `dnaio.open` with appropriate arguments. """ def __init__( self, *files: Union[str, PathLike, BinaryIO], fileformat: Optional[str] = None, opener=xopen, ): if len(files) < 1: raise ValueError("At least one file is required") self._files = files self._stack = contextlib.ExitStack() self._readers: List[Union[FastaReader, FastqReader]] = [ self._stack.enter_context( _open_single(file, opener=opener, fileformat=fileformat, mode="r") ) for file in self._files ] self.delivers_qualities: bool = self._readers[0].delivers_qualities def __repr__(self) -> str: return ( f"{self.__class__.__name__}" f"({', '.join(repr(reader) for reader in self._readers)})" ) def __iter__(self) -> Iterator[Tuple[SequenceRecord, ...]]: """ Iterate over multiple inputs containing records Yields: N-tuples of `SequenceRecord` objects where N is equal to the number of files. Raises: `FileFormatError` if there was a parse error or if reads are improperly paired, that is, if there are more reads in one file than the others or if the record IDs do not match (according to `records_are_mates`). 
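Example usage (a sketch; the file names are hypothetical)::

    with MultipleFileReader("r1.fastq", "r2.fastq", "umi.fastq") as reader:
        for r1, r2, umi in reader:
            ...  # process one triplet of records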
""" if len(self._files) == 1: yield from zip(self._readers[0]) else: for records in zip(*self._readers): if not records_are_mates(*records): raise FileFormatError( f"Records are out of sync, names " f"{', '.join(repr(r.name) for r in records)} do not match.", line=None, ) yield records # Consume one iteration to check if all the files have an equal number # of records. for reader in self._readers: try: _ = next(iter(reader)) except StopIteration: pass record_numbers = [r.number_of_records for r in self._readers] if len(set(record_numbers)) != 1: raise FileFormatError( f"Files: {', '.join(str(file) for file in self._files)} have " f"an unequal amount of reads.", line=None, ) def close(self): self._stack.close() def __enter__(self): return self def __exit__(self, *exc): self.close() class MultipleFastaWriter(MultipleFileWriter): """ Write multiple FASTA files simultaneously. While this class can be instantiated directly, the recommended way is to use `dnaio.open` with appropriate arguments. """ def __init__( self, *files: Union[str, PathLike, BinaryIO], opener=xopen, append: bool = False, ): if len(files) < 1: raise ValueError("At least one file is required") mode = "a" if append else "w" self._files = files self._number_of_files = len(files) self._stack = contextlib.ExitStack() self._writers: List[Union[FastaWriter, FastqWriter]] = [ self._stack.enter_context( _open_single( file, opener=opener, fileformat="fasta", mode=mode, qualities=False, ) ) for file in self._files ] def __repr__(self) -> str: return ( f"{self.__class__.__name__}" f"({', '.join(repr(writer) for writer in self._writers)})" ) def close(self): self._stack.close() def write(self, *records: SequenceRecord): if len(records) != self._number_of_files: raise ValueError(f"records must have length {self._number_of_files}") for record, writer in zip(records, self._writers): writer.write(record) def write_iterable(self, records_iterable: Iterable[Tuple[SequenceRecord, ...]]): for records in records_iterable: self.write(*records) def __enter__(self): return self def __exit__(self, *exc): self.close() class MultipleFastqWriter(MultipleFileWriter): """ Write multiple FASTA files simultaneously. While this class can be instantiated directly, the recommended way is to use `dnaio.open` with appropriate arguments. """ def __init__( self, *files: Union[str, PathLike, BinaryIO], opener=xopen, append: bool = False, ): if len(files) < 1: raise ValueError("At least one file is required") mode = "a" if append else "w" self._files = files self._number_of_files = len(files) self._stack = contextlib.ExitStack() self._writers: List[IO] = [ self._stack.enter_context( opener(file, mode + "b") if not hasattr(file, "write") else file ) for file in self._files ] def __repr__(self) -> str: return ( f"{self.__class__.__name__}" f"({', '.join(str(f) for f in self._files)})" ) def close(self): self._stack.close() def write(self, *records: SequenceRecord): if len(records) != self._number_of_files: raise ValueError(f"records must have length {self._number_of_files}") for record, writer in zip(records, self._writers): writer.write(record.fastq_bytes()) def write_iterable(self, records_iterable: Iterable[Tuple[SequenceRecord, ...]]): # Use faster methods for more common cases before falling back to # generic multiple files mode (which is much slower due to calling the # zip function). 
if self._number_of_files == 1: output = self._writers[0] for (record,) in records_iterable: output.write(record.fastq_bytes()) elif self._number_of_files == 2: output1 = self._writers[0] output2 = self._writers[1] for record1, record2 in records_iterable: output1.write(record1.fastq_bytes()) output2.write(record2.fastq_bytes()) elif self._number_of_files == 3: output1 = self._writers[0] output2 = self._writers[1] output3 = self._writers[2] for record1, record2, record3 in records_iterable: output1.write(record1.fastq_bytes()) output2.write(record2.fastq_bytes()) output3.write(record3.fastq_bytes()) else: # More than 3 files is quite uncommon. writers = self._writers for records in records_iterable: for record, output in zip(records, writers): output.write(record.fastq_bytes()) def __enter__(self): return self def __exit__(self, *exc): self.close() dnaio-1.2.0/src/dnaio/pairedend.py000066400000000000000000000215141453560735400170110ustar00rootroot00000000000000from contextlib import ExitStack from os import PathLike from typing import Union, BinaryIO, Optional, Iterator, Tuple from xopen import xopen from ._core import SequenceRecord from .exceptions import FileFormatError from .interfaces import PairedEndReader, PairedEndWriter from .readers import FastaReader, FastqReader from .writers import FastaWriter, FastqWriter from .singleend import _open_single def _open_paired( *files: Union[str, PathLike, BinaryIO], fileformat: Optional[str] = None, mode: str = "r", qualities: Optional[bool] = None, opener=xopen, ) -> Union[PairedEndReader, PairedEndWriter]: """ Open paired-end reads """ if len(files) == 2: if mode in "wa" and files[0] == files[1]: raise ValueError("The paired-end output files are identical") if "r" in mode: return TwoFilePairedEndReader( *files, fileformat=fileformat, opener=opener, mode=mode ) append = mode == "a" return TwoFilePairedEndWriter( *files, fileformat=fileformat, qualities=qualities, opener=opener, append=append, ) elif len(files) == 1: if "r" in mode: return InterleavedPairedEndReader( files[0], fileformat=fileformat, opener=opener, mode=mode ) append = mode == "a" return InterleavedPairedEndWriter( files[0], fileformat=fileformat, qualities=qualities, opener=opener, append=append, ) raise ValueError("_open_paired must be called with one or two files.") class TwoFilePairedEndReader(PairedEndReader): """ Read paired-end reads from two files (not interleaved) While this class can be instantiated directly, the recommended way is to use `dnaio.open` with appropriate arguments. """ paired = True def __init__( self, file1: Union[str, PathLike, BinaryIO], file2: Union[str, PathLike, BinaryIO], *, mode="r", fileformat: Optional[str] = None, opener=xopen, ): self.mode = mode with ExitStack() as stack: self.reader1 = stack.enter_context( _open_single(file1, opener=opener, fileformat=fileformat, mode=mode) ) self.reader2 = stack.enter_context( _open_single(file2, opener=opener, fileformat=fileformat, mode=mode) ) self._close = stack.pop_all().close self.delivers_qualities = self.reader1.delivers_qualities def __repr__(self) -> str: return f"{self.__class__.__name__}(file1={self.reader1}, file2={self.reader2})" def __iter__(self) -> Iterator[Tuple[SequenceRecord, SequenceRecord]]: """ Iterate over the paired reads. Each yielded item is a pair of `SequenceRecord` objects. Raises a `FileFormatError` if reads are improperly paired. """ for r1, r2 in zip(self.reader1, self.reader2): if not r1.is_mate(r2): raise FileFormatError( f"Reads are improperly paired. 
Read name '{r1.name}' " f"in file 1 does not match '{r2.name}' in file 2.", line=None, ) from None yield r1, r2 # Force consumption of another read to test if iterators are out of sync. try: next(iter(self.reader1)) except StopIteration: pass try: next(iter(self.reader2)) except StopIteration: pass if self.reader1.number_of_records < self.reader2.number_of_records: raise FileFormatError( "Reads are improperly paired. There are more reads in " "file 2 than in file 1.", line=None, ) from None if self.reader1.number_of_records > self.reader2.number_of_records: raise FileFormatError( "Reads are improperly paired. There are more reads in " "file 1 than in file 2.", line=None, ) from None def close(self) -> None: self._close() def __enter__(self): return self def __exit__(self, *exc): self.close() class InterleavedPairedEndReader(PairedEndReader): """ Read paired-end reads from an interleaved FASTQ file While this class can be instantiated directly, the recommended way is to use `dnaio.open` with appropriate arguments. """ paired = True def __init__( self, file: Union[str, PathLike, BinaryIO], *, mode="r", fileformat: Optional[str] = None, opener=xopen, ): self.mode = mode reader = _open_single(file, opener=opener, mode=mode, fileformat=fileformat) assert isinstance(reader, (FastaReader, FastqReader)) # for Mypy self.reader = reader self.delivers_qualities = self.reader.delivers_qualities def __repr__(self) -> str: return f"{self.__class__.__name__}({self.reader})" def __iter__(self) -> Iterator[Tuple[SequenceRecord, SequenceRecord]]: it = iter(self.reader) for r1 in it: try: r2 = next(it) except StopIteration: raise FileFormatError( "Interleaved input file incomplete: Last record " f"'{r1.name}' has no partner.", line=None, ) from None if not r1.is_mate(r2): raise FileFormatError( f"Reads are improperly paired. Name '{r1.name}' " f"(first) does not match '{r2.name}' (second).", line=None, ) yield r1, r2 def close(self) -> None: self.reader.close() def __enter__(self): return self def __exit__(self, *args): self.close() class TwoFilePairedEndWriter(PairedEndWriter): """ Write paired-end reads to two files (not interleaved) While this class can be instantiated directly, the recommended way is to use `dnaio.open` with appropriate arguments. """ def __init__( self, file1: Union[str, PathLike, BinaryIO], file2: Union[str, PathLike, BinaryIO], *, fileformat: Optional[str] = "fastq", qualities: Optional[bool] = None, opener=xopen, append: bool = False, ): mode = "a" if append else "w" with ExitStack() as stack: self._writer1: Union[FastaWriter, FastqWriter] self._writer2: Union[FastaWriter, FastqWriter] self._writer1 = stack.enter_context( _open_single( file1, opener=opener, fileformat=fileformat, mode=mode, qualities=qualities, ) ) self._writer2 = stack.enter_context( _open_single( file2, opener=opener, fileformat=fileformat, mode=mode, qualities=qualities, ) ) self._close = stack.pop_all().close def __repr__(self) -> str: return f"{self.__class__.__name__}({self._writer1}, {self._writer2})" def write(self, read1, read2) -> None: self._writer1.write(read1) self._writer2.write(read2) def close(self) -> None: self._close() def __enter__(self): # TODO do not allow this twice return self def __exit__(self, *args): self.close() class InterleavedPairedEndWriter(PairedEndWriter): """ Write paired-end reads to an interleaved FASTA or FASTQ file While this class can be instantiated directly, the recommended way is to use `dnaio.open` with appropriate arguments. 
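Example usage (a sketch; ``interleaved.fastq`` is a hypothetical path)::

    with InterleavedPairedEndWriter("interleaved.fastq") as writer:
        writer.write(r1, r2)  # R1 is written directly followed by R2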
""" def __init__( self, file: Union[str, PathLike, BinaryIO], *, fileformat: Optional[str] = "fastq", qualities: Optional[bool] = None, opener=xopen, append: bool = False, ): mode = "a" if append else "w" writer = _open_single( file, opener=opener, fileformat=fileformat, mode=mode, qualities=qualities ) assert isinstance(writer, (FastaWriter, FastqWriter)) # only for Mypy self._writer = writer def __repr__(self) -> str: return f"{self.__class__.__name__}({self._writer})" def write(self, read1: SequenceRecord, read2: SequenceRecord) -> None: self._writer.write(read1) self._writer.write(read2) def close(self) -> None: self._writer.close() def __enter__(self): # TODO do not allow this twice return self def __exit__(self, *args): self.close() dnaio-1.2.0/src/dnaio/py.typed000066400000000000000000000000001453560735400161660ustar00rootroot00000000000000dnaio-1.2.0/src/dnaio/readers.py000066400000000000000000000164731453560735400165130ustar00rootroot00000000000000""" Classes for reading FASTA and FASTQ files """ __all__ = ["FastaReader", "FastqReader"] import io from os import PathLike from typing import Union, BinaryIO, Optional, Iterator, List from xopen import xopen from ._core import BamIter, FastqIter, SequenceRecord from ._util import shorten as _shorten from .exceptions import FastaFormatError from .interfaces import SingleEndReader class BinaryFileReader: """ A mixin for readers that ensures that a file or a path can be passed in to the constructor. """ _close_on_exit = False paired: bool = False mode: str = "rb" def __init__( self, file: Union[PathLike, str, BinaryIO], *, opener=xopen, _close_file: Optional[bool] = None, ): """ The file is a path or a file-like object. In both cases, the file may be compressed (.gz, .bz2, .xz, .zst). """ if isinstance(file, str): self._file = opener(file, self.mode) self._close_on_exit = True elif _close_file: self._close_on_exit = True self._file = file else: self._file = file def __repr__(self) -> str: return f"{self.__class__.__name__}('{getattr(self._file, 'name', self._file)}')" def close(self) -> None: if self._close_on_exit and self._file is not None: self._file.close() self._file = None def __enter__(self): if self._file is None: raise ValueError("I/O operation on closed BinaryFileReader") return self def __exit__(self, *args): self.close() class FastaReader(BinaryFileReader, SingleEndReader): """ Reader for FASTA files While this class can be instantiated directly, the recommended way is to use `dnaio.open` with appropriate arguments. """ def __init__( self, file: Union[PathLike, str, BinaryIO], *, keep_linebreaks: bool = False, sequence_class=SequenceRecord, opener=xopen, _close_file: Optional[bool] = None, ): """ file is a path or a file-like object. In both cases, the file may be compressed (.gz, .bz2, .xz, .zst). keep_linebreaks -- whether to keep newline characters in the sequence """ super().__init__(file, opener=opener, _close_file=_close_file) self.sequence_class = sequence_class self.delivers_qualities = False self._delimiter = "\n" if keep_linebreaks else "" self.number_of_records = 0 self._file = io.TextIOWrapper(self._file) def __iter__(self) -> Iterator[SequenceRecord]: """ Iterate over the records in this FASTA file. 
""" name = None seq: List[str] = [] if self._file.closed: return for i, line in enumerate(self._file): # strip() also removes DOS line breaks line = line.strip() if not line: continue if line and line[0] == ">": if name is not None: self.number_of_records += 1 try: yield self.sequence_class(name, self._delimiter.join(seq), None) except ValueError as e: raise FastaFormatError( str(e) + " (line number refers to record after the problematic one)", line=i, ) name = line[1:] seq = [] elif line and line[0] == "#": continue elif name is not None: seq.append(line) else: raise FastaFormatError( f"Expected '>' at beginning of record, but got '{_shorten(line)}'.", line=i, ) if name is not None: self.number_of_records += 1 try: yield self.sequence_class(name, self._delimiter.join(seq), None) except ValueError as e: raise FastaFormatError(str(e), line=None) class FastqReader(BinaryFileReader, SingleEndReader): """ Reader for FASTQ files. Does not support multi-line FASTQ files. While this class can be instantiated directly, the recommended way is to use `dnaio.open` with appropriate arguments. """ def __init__( self, file: Union[PathLike, str, BinaryIO], *, sequence_class=SequenceRecord, buffer_size: int = 128 * 1024, # Buffer size used by cat, pigz etc. opener=xopen, _close_file: Optional[bool] = None, ): """ file is a filename or a file-like object. If file is a filename, then .gz files are supported. """ super().__init__(file, opener=opener, _close_file=_close_file) self.sequence_class = sequence_class self.delivers_qualities = True self.buffer_size = buffer_size try: self._iter: Iterator[SequenceRecord] = FastqIter( self._file, self.sequence_class, self.buffer_size ) except Exception: self.close() raise try: # The first value yielded by FastqIter indicates # whether the file has repeated headers th = next(self._iter) assert isinstance(th, bool) self.two_headers: bool = th except StopIteration: # Empty file self.two_headers = False self._iter = iter(()) except Exception: self.close() raise def __iter__(self) -> Iterator[SequenceRecord]: """Iterate over the records in this FASTQ file.""" return self._iter @property def number_of_records(self): try: return self._iter.number_of_records except AttributeError: return 0 class BamReader(BinaryFileReader, SingleEndReader): """ Reader for BAM files. All records in the input BAM must be unmapped single-end reads (with a flag value of 4). While this class can be instantiated directly, the recommended way is to use `dnaio.open` with appropriate arguments. """ def __init__( self, file: Union[PathLike, str, BinaryIO], *, sequence_class=SequenceRecord, buffer_size: int = 128 * 1024, # Buffer size used by cat, pigz etc. 
opener=xopen, _close_file: Optional[bool] = None, ): super().__init__(file, opener=opener, _close_file=_close_file) self.sequence_class = sequence_class self.delivers_qualities = True self.buffer_size = buffer_size try: self._iter: Iterator[SequenceRecord] = BamIter(self._file, self.buffer_size) except Exception: self.close() raise self.two_headers: bool = False def __iter__(self) -> Iterator[SequenceRecord]: """Iterate over the records in this BAM file.""" return self._iter @property def number_of_records(self): try: return self._iter.number_of_records except AttributeError: return 0 @property def header(self): try: return self._iter.header except AttributeError: return b"" dnaio-1.2.0/src/dnaio/singleend.py000066400000000000000000000113651453560735400170310ustar00rootroot00000000000000import os from typing import Optional, Union, BinaryIO, Tuple from .exceptions import UnknownFileFormat from .readers import BamReader, FastaReader, FastqReader from .writers import FastaWriter, FastqWriter def _open_single( file_or_path: Union[str, os.PathLike, BinaryIO], opener, *, fileformat: Optional[str] = None, mode: str = "r", qualities: Optional[bool] = None, ) -> Union[BamReader, FastaReader, FastaWriter, FastqReader, FastqWriter]: """ Open a single sequence file. """ if mode not in ("r", "w", "a"): raise ValueError("Mode must be 'r', 'w' or 'a'") close_file, file, path = _open_file_or_path(file_or_path, mode, opener) del file_or_path if path is not None and fileformat is None: fileformat = _detect_format_from_name(path) if fileformat is None and mode == "w" and qualities is not None: # Format not recognized, but we know whether to use a format with or without qualities fileformat = "fastq" if qualities else "fasta" if "r" in mode and fileformat is None: fileformat = _detect_format_from_content(file) if fileformat is None: name = getattr(file, "name", repr(file)) file.close() raise UnknownFileFormat( f"Could not determine whether file '{name}' is FASTA or FASTQ. The file extension " "is not available or not recognized, and the first character in the file is " "unexpected." ) if fileformat is None: assert mode == "w" extra = " because the output file name is not available" if path is None else "" file.close() raise UnknownFileFormat( "Auto-detection of the output file format (FASTA/FASTQ) failed" + extra ) if fileformat == "fastq" and mode in "wa" and qualities is False: file.close() raise ValueError( "Output format cannot be FASTQ since no quality values are available." ) if fileformat == "fasta": if "r" in mode: return FastaReader(file, _close_file=close_file) return FastaWriter(file, _close_file=close_file) elif fileformat == "fastq": if "r" in mode: return FastqReader(file, _close_file=close_file) return FastqWriter(file, _close_file=close_file) elif fileformat == "bam": if "r" in mode: return BamReader(file, _close_file=close_file) # This should not be reached raise NotImplementedError("Only reading is supported for BAM files") if close_file: file.close() raise UnknownFileFormat( f"File format '{fileformat}' is unknown (expected 'fasta' or 'fastq')." 
) def _open_file_or_path( file_or_path: Union[str, os.PathLike, BinaryIO], mode: str, opener ) -> Tuple[bool, BinaryIO, Optional[str]]: path: Optional[str] file: BinaryIO try: path = os.fspath(file_or_path) # type: ignore except TypeError: if "r" in mode and not hasattr(file_or_path, "readinto"): raise ValueError( "When passing in an open file-like object, it must have been opened in binary mode" ) file = file_or_path # type: ignore if hasattr(file, "name") and isinstance(file.name, str): path = file.name else: path = None close_file = False else: file = opener(path, mode[0] + "b") close_file = True return close_file, file, path def _detect_format_from_name(name: str) -> Optional[str]: """ name -- file name Return 'fasta', 'fastq', 'bam' or None if the format could not be detected. """ name = name.lower() for ext in (".gz", ".xz", ".bz2", ".zst"): if name.endswith(ext): name = name[: -len(ext)] break name, ext = os.path.splitext(name) if ext in [".fasta", ".fa", ".fna", ".csfasta", ".csfa"]: return "fasta" elif ext in [".fastq", ".fq"] or (ext == ".txt" and name.endswith("_sequence")): return "fastq" elif ext == ".bam": return "bam" return None def _detect_format_from_content(file: BinaryIO) -> Optional[str]: """ Return 'fasta', 'fastq', 'bam' or None """ if file.seekable(): original_position = file.tell() magic = file.read(4) file.seek(original_position) else: # We cannot always use peek() because BytesIO objects do not support it magic = file.peek(4)[0:4] # type: ignore if magic.startswith(b"@") or magic == b"": # Pretend FASTQ for empty input return "fastq" elif magic.startswith(b">") or magic.startswith(b"#"): # Some FASTA variants allow comments return "fasta" elif magic == b"BAM\1": return "bam" return None dnaio-1.2.0/src/dnaio/writers.py000066400000000000000000000116771453560735400165640ustar00rootroot00000000000000import os from os import PathLike from typing import Union, BinaryIO, Optional from xopen import xopen from . import SequenceRecord from .interfaces import SingleEndWriter class FileWriter: """ A mix-in that manages opening and closing and provides a context manager """ _file: BinaryIO def __init__( self, file: Union[PathLike, str, BinaryIO], *, opener=xopen, _close_file: Optional[bool] = None, ): try: os.fspath(file) # type: ignore except TypeError: # Assume it’s an open file-like object self._file = file # type: ignore self._close_on_exit = bool(_close_file) else: self._file = opener(file, "wb") self._close_on_exit = True def __repr__(self) -> str: return f"{self.__class__.__name__}('{getattr(self._file, 'name', self._file)}')" def close(self) -> None: if self._close_on_exit: self._file.close() def __enter__(self): if self._file.closed: raise ValueError("I/O operation on closed file") return self def __exit__(self, *args): self.close() class FastaWriter(FileWriter, SingleEndWriter): """ Write FASTA-formatted sequences to a file While this class can be instantiated directly, the recommended way is to use `dnaio.open` with appropriate arguments unless you need to set the line_length argument.
Arguments: line_length: Wrap sequence lines after this many characters (None disables wrapping) """ def __init__( self, file: Union[PathLike, str, BinaryIO], *, line_length: Optional[int] = None, opener=xopen, _close_file: Optional[bool] = None, ): super().__init__(file, opener=opener, _close_file=_close_file) self.line_length = line_length if line_length != 0 else None def __repr__(self) -> str: return f"FastaWriter('{getattr(self._file, 'name', self._file)}')" def write(self, name_or_record, sequence: Optional[str] = None): """Write a record to the FASTA file. If only one parameter (name_or_record) is given, it must have attributes .name and .sequence, which are then used. Otherwise, the first parameter must be the name and the second the sequence. The effect is that you can write this: writer.write("name", "ACCAT") or writer.write(SequenceRecord("name", "ACCAT")) """ if sequence is None: name = name_or_record.name sequence = name_or_record.sequence else: name = name_or_record if self.line_length is not None: self._file.write((">" + name + "\n").encode("ascii")) s = [] for i in range(0, len(sequence), self.line_length): s.append(sequence[i : i + self.line_length] + "\n") self._file.write("".join(s).encode("ascii")) else: text = ">" + name + "\n" + sequence + "\n" self._file.write(text.encode("ascii")) class FastqWriter(FileWriter, SingleEndWriter): """ Write records in FASTQ format While this class can be instantiated directly, the recommended way is to use `dnaio.open` with appropriate arguments unless you need to set two_headers to True. Arguments: two_headers: If True, the header is repeated on the third line of each record after the "+". """ file_mode = "wb" def __init__( self, file: Union[PathLike, str, BinaryIO], *, two_headers: bool = False, opener=xopen, _close_file: Optional[bool] = None, ): super().__init__(file, opener=opener, _close_file=_close_file) self._two_headers = two_headers # setattr avoids a complaint from Mypy setattr( self, "write", self._write_two_headers if self._two_headers else self._write ) def __repr__(self) -> str: return f"FastqWriter('{getattr(self._file, 'name', self._file)}')" def write(self, record: SequenceRecord) -> None: """ Write a record to the FASTQ file. """ # The 'write' attribute is overwritten in the constructor with the correct # write method (_write or _write_two_headers) assert False def _write(self, record: SequenceRecord) -> None: """ Write a record to the FASTQ file. """ self._file.write(record.fastq_bytes()) def _write_two_headers(self, record: SequenceRecord) -> None: """ Write a record to the FASTQ file, repeating the header in the third line after the "+" . 
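For example, a record named ``r1`` with sequence ``ACGT`` and qualities ``FFFF`` is written as::

    @r1
    ACGT
    +r1
    FFFF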
""" self._file.write(record.fastq_bytes(two_headers=True)) def writeseq(self, name: str, sequence: str, qualities: str) -> None: # Deprecated self._file.write(f"@{name:s}\n{sequence:s}\n+\n{qualities:s}\n".encode("ascii")) dnaio-1.2.0/tests/000077500000000000000000000000001453560735400137625ustar00rootroot00000000000000dnaio-1.2.0/tests/data/000077500000000000000000000000001453560735400146735ustar00rootroot00000000000000dnaio-1.2.0/tests/data/dos.fastq000066400000000000000000000004371453560735400165240ustar00rootroot00000000000000@prefix:1_13_573/1 CGTCCGAANTAGCTACCACCCTGATTAGACAAAT + )3%)&&&&!.1&(6:<'67..*,:75)'77&&&5 @prefix:1_13_1259/1 AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT + ;<:&:A;A!9<<<,7:<=3=;:<&70<,=: dnaio-1.2.0/tests/data/interleaved.fastq000066400000000000000000000003251453560735400202350ustar00rootroot00000000000000@read1/1 some text TTATTTGTCTCCAGC + ##HHHHHHHHHHHHH @read1/2 other text GCTGGAGACAAATAA + HHHHHHHHHHHHHHH @read3/1 CCAACTTGATATTAATAACA + HHHHHHHHHHHHHHHHHHHH @read3/2 TGTTATTAATATCAAGTTGG + #HHHHHHHHHHHHHHHHHHH dnaio-1.2.0/tests/data/missingextension000066400000000000000000000004231453560735400202230ustar00rootroot00000000000000@prefix:1_13_573/1 CGTCCGAANTAGCTACCACCCTGATTAGACAAAT + )3%)&&&&!.1&(6:<'67..*,:75)'77&&&5 @prefix:1_13_1259/1 AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT + ;<:&:A;A!9<<<,7:<=3=;:<&70<,=: dnaio-1.2.0/tests/data/missingextension.gz000066400000000000000000000003321453560735400206410ustar00rootroot00000000000000Osmall.fastq]1n0 @]Pn '#:ngjlE"~|nie)2o*">bfp2Dݔ/0Mo#Bz_pO|qEi撪6|ؤم0".D4RkAAR|SfW\ܖ;~Ȼm\3t@}eͥ,x:@F%dnaio-1.2.0/tests/data/paired.1.fastq000066400000000000000000000004501453560735400173350ustar00rootroot00000000000000@read1/1 some text TTATTTGTCTCCAGCTTAGACATATCGCCT + ##HHHHHHHHHHHHHHHHHHHHHHHHHHHH @read2/1 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read3/1 CCAACTTGATATTAATAACATTAGACA + HHHHHHHHHHHHHHHHHHHHHHHHHHH @read4/1 GACAGGCCGTTTGAATGTTGACGGGATGTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH dnaio-1.2.0/tests/data/paired.2.fastq000066400000000000000000000004451453560735400173420ustar00rootroot00000000000000@read1/2 other text GCTGGAGACAAATAACAGTGGAGTAGTTTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read2/2 TGTGGCCTGTTGCAGTGGAGTAACTCCAGC + ###HHHHHHHHHHHHHHHHHHHHHHHHHHH @read3/2 TGTTATTAATATCAAGTTGGCAGTG + #HHHHHHHHHHHHHHHHHHHHHHHH @read4/2 CATCCCGTCAACATTCAAACGGCCTGTCCA + HH############################ dnaio-1.2.0/tests/data/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.bam000066400000000000000000000050261453560735400304560ustar00rootroot00000000000000BCq՘]\gǟؘY%FP 2֐c̜:_3ll2ۙnV RQBRW?TzXE&7-R|9{^,yy?&Ϟs9tVXkqV7 K,Lz8%b_$LI ha Ɯ)R@!9"c̑) 5J*LP!)C fDS&Yz#X U+$s,*MLs҄p&i@#ʙ4 (5'8+(THJ2U: I: g ͑sL"]pW8gB874;{G!7nBFɋ8Lc1pY{Wz8 1QKiɋ4f"%V22I4ssLjfr޸;AQ9}=jm|. V,çӃ=A(`qxZDe֩b/Jcfri'eLI4BӋ/{2KrWQ^F=,RDe=84S^ç0+5qe<~*i/"2PR93p3W%!RWTQ}8f<|P\9 3B2LJo$q#'8fU>3?=3k 1~SmO l4qn,Zȃ4~!sza ƙ˧56N1*46??*a'F>\3spԌQ̉aǚ VBqP6RP`18CԈEeaa('4\5ƢmaՄ" ssb2͈yQդ8j7Rpje\hht8 9iU[_R]<5-LTh9M)ܲΠPn:n6KV6(v? 
YXjn^,,ޙ CXn[^[ 6^H, "|+0\"p'E)Í'w޾c"w($1| @TYW \3sΛp2ڎ~yy 7yڙq /)\WfqK W_Ii sM3쿖daې2 pۛAç?g0\C&ʞH|fv³7x?7sѧ O}2I|&,rOhiR[d| A|Rm*f^a{Yd 7 d o 2{CDDEDDDDDDDDDDEDDCDDDDD?BBD9 X0:i:1 X1:i:0 MD:Z:72G28 PG:Z:MarkDuplicates RG:Z:1 XG:i:0 AM:i:37 NM:i:1 SM:i:37 XM:i:1 XO:i:0 XT:A:U HWI-D00119:50:H7AP8ADXX:1:2105:7076:23015 163 chrM 1 60 101M = 59 159 GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTATGCACGCGATAGCATTGCGAGACGCTGG @@CFFFDFGFHHHJIIJIJIJJJJJJJIIJJJJIIJIJFIIJJJJIIIGIJJJJDHIJIIJIJJJHHGGCB>BDDDDDDDDDDDBDDEDDDDDDDDDDDDD X0:i:1 X1:i:0 MD:Z:72G28 PG:Z:MarkDuplicates RG:Z:1 XG:i:0 AM:i:37 NM:i:1 SM:i:37 XM:i:1 XO:i:0 XT:A:U project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.unmapped.bam000066400000000000000000000050561453560735400322120ustar00rootroot00000000000000dnaio-1.2.0/tests/dataBCq՘]\gǟؘY%FP 2֐c̜:_3ll2ۙnV RQBRW?TzXE&7-R|9{^,yy?&Ϟs9tVXkqV7 K,Lz8%b_$LI ha Ɯ)R@!9"c̑) 5J*LP!)C fDS&Yz#X U+$s,*MLs҄p&i@#ʙ4 (5'8+(THJ2U: I: g ͑sL"]pW8gB874;{G!7nBFɋ8Lc1pY{Wz8 1QKiɋ4f"%V22I4ssLjfr޸;AQ9}=jm|. V,çӃ=A(`qxZDe֩b/Jcfri'eLI4BӋ/{2KrWQ^F=,RDe=84S^ç0+5qe<~*i/"2PR93p3W%!RWTQ}8f<|P\9 3B2LJo$q#'8fU>3?=3k 1~SmO l4qn,Zȃ4~!sza ƙ˧56N1*46??*a'F>\3spԌQ̉aǚ VBqP6RP`18CԈEeaa('4\5ƢmaՄ" ssb2͈yQդ8j7Rpje\hht8 9iU[_R]<5-LTh9M)ܲΠPn:n6KV6(v? YXjn^,,ޙ CXn[^[ 6^H, "|+0\"p'E)Í'w޾c"w($1| @TYW \3sΛp2ڎ~yy 7yڙq /)\WfqK W_Ii sM3쿖daې2 pۛAç?g0\C&ʞH|fv³7x?7sѧ O}2I|&,rOhiR[d| A|Rm*f^a{Yd 7 d o 2{CDDEDDDDDDDDDDEDDCDDDDD?BBD9 X0:i:1 X1:i:0 MD:Z:72G28 PG:Z:MarkDuplicates RG:Z:1 XG:i:0 AM:i:37 NM:i:1 SM:i:37 XM:i:1 XO:i:0 XT:A:U HWI-D00119:50:H7AP8ADXX:1:2105:7076:23015 4 chrM 1 60 101M = 59 159 GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTATGCACGCGATAGCATTGCGAGACGCTGG @@CFFFDFGFHHHJIIJIJIJJJJJJJIIJJJJIIJIJFIIJJJJIIIGIJJJJDHIJIIJIJJJHHGGCB>BDDDDDDDDDDDBDDEDDDDDDDDDDDDD X0:i:1 X1:i:0 MD:Z:72G28 PG:Z:MarkDuplicates RG:Z:1 XG:i:0 AM:i:37 NM:i:1 SM:i:37 XM:i:1 XO:i:0 XT:A:U dnaio-1.2.0/tests/data/simple.fasta000066400000000000000000000000661453560735400172060ustar00rootroot00000000000000>first_sequence SEQUENCE1 >second_sequence SEQUEN CE2 dnaio-1.2.0/tests/data/simple.fasta.bz2000066400000000000000000000001251453560735400176760ustar00rootroot00000000000000BZh91AY&SYf π0 *! 
TPM T$zM芯rH ` ьwKI.p dnaio-1.2.0/tests/data/simple.fasta.gz000066400000000000000000000001111453560735400176140ustar00rootroot00000000000000f0[simple.fastaK,*./N-,MKN v usv5+NMKArv5E|6dnaio-1.2.0/tests/data/simple.fasta.xz000066400000000000000000000001541453560735400176440ustar00rootroot000000000000007zXZִF!t/5-]'V$@T-ZMaFy38Db`2fI6ԇ}YZdnaio-1.2.0/tests/data/simple.fastq000066400000000000000000000001151453560735400172210ustar00rootroot00000000000000@first_sequence SEQUENCE1 + :6;;8<=:< @second_sequence SEQUENCE2 + 83/J z)„)?hdnaio-1.2.0/tests/data/simple.fastq.gz000066400000000000000000000001351453560735400176420ustar00rootroot00000000000000'[simple.fastqsH,*./N-,MKN v usv52r(NMKTfTfalcooaf>Mdnaio-1.2.0/tests/data/simple.fastq.xz000066400000000000000000000002001453560735400176540ustar00rootroot000000000000007zXZִF!t/LA] 'V$@ƹY#ܿX,lj&4ߵx+OMᄎL%v^.4QmTQuzX !]M'}YZdnaio-1.2.0/tests/data/simple.unaligned.bam000066400000000000000000000003061453560735400206110ustar00rootroot00000000000000BCsreb``pp 23 *+/*IMrrtr *-I*fP ``a`GVf&18v C@{#B>K2R< &I4:hA'$ՂP8LL-A5DP\K[[ 8p6BCdnaio-1.2.0/tests/data/simplebamnoextension000066400000000000000000000003061453560735400210600ustar00rootroot00000000000000BCsreb``pp 23 *+/*IMrrtr *-I*fP ``a`GVf&18v C@{#B>K2R< &I4:hA'$ՂP8LL-A5DP\K[[ 8p6BCdnaio-1.2.0/tests/data/small.fastq000066400000000000000000000004231453560735400170420ustar00rootroot00000000000000@prefix:1_13_573/1 CGTCCGAANTAGCTACCACCCTGATTAGACAAAT + )3%)&&&&!.1&(6:<'67..*,:75)'77&&&5 @prefix:1_13_1259/1 AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT + ;<:&:A;A!9<<<,7:<=3=;:<&70<,=: dnaio-1.2.0/tests/data/with_comment.fasta000066400000000000000000000001201453560735400204010ustar00rootroot00000000000000# a comment # another one >first_sequence SEQUENCE1 >second_sequence SEQUEN CE2 dnaio-1.2.0/tests/data/withplus.fastq000066400000000000000000000001541453560735400176120ustar00rootroot00000000000000@first_sequence SEQUENCE1 +this is different :6;;8<=:< @second_sequence SEQUENCE2 +also different 831\n") == 0 assert _fasta_head(b">1\n3") == 0 assert _fasta_head(b">1\n3\n") == 0 assert _fasta_head(b">1\n3\n>") == 5 assert _fasta_head(b">1\n3\n>6") == 5 assert _fasta_head(b">1\n3\n>6\n") == 5 assert _fasta_head(b">1\n3\n>6\n8") == 5 assert _fasta_head(b">1\n3\n>6\n8\n") == 5 assert _fasta_head(b">1\n3\n>6\n8\n0") == 5 assert _fasta_head(b">1\n3\n>6\n8\n0\n") == 5 assert _fasta_head(b">1\n3\n>6\n8\n0\n>") == 12 def test_fasta_head_with_comment(): assert _fasta_head(b"#") == 0 assert _fasta_head(b"#\n") == 0 assert _fasta_head(b"#\n>") == 2 assert _fasta_head(b"#\n>3") == 2 assert _fasta_head(b"#\n>3\n") == 2 assert _fasta_head(b"#\n>3\n5") == 2 assert _fasta_head(b"#\n>3\n5\n") == 2 assert _fasta_head(b"#\n>3\n5\n>") == 7 def test_paired_fasta_heads(): def pheads(buf1, buf2): return _paired_fasta_heads(buf1, buf2, len(buf1), len(buf2)) assert pheads(b"", b"") == (0, 0) assert pheads(b">r", b">r") == (0, 0) assert pheads(b">r\nA\n>s", b">r") == (0, 0) assert pheads(b">r\nA\n>s", b">r\nCT\n>s") == (5, 6) assert pheads(b">r\nA\n>s\nG\n>t\n", b">r\nCT\n>s") == (5, 6) buf1 = ( textwrap.dedent( """ >1 a b >2 c >3 uv """ ) .strip() .encode() ) buf2 = ( textwrap.dedent( """ >1 def >2 gh i >3 """ ) .strip() .encode() ) assert pheads(buf1, buf2) == ( len(b">1\na\nb\n>2\nc\n"), len(b">1\ndef\n>2\ngh\ni\n"), ) def test_paired_fastq_heads(): buf1 = b"first\nsecond\nthird\nfourth\nfifth" buf2 = b"a\nb\nc\nd\ne\nf\ng" assert paired_fastq_heads(buf1, buf2, len(buf1), len(buf2)) == ( len(b"first\nsecond\nthird\nfourth\n"), len(b"a\nb\nc\nd\n"), ) 
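    # Note (inferred from the assertions in this test): the two returned
    # offsets delimit the longest prefixes of buf1 and buf2 that contain the
    # same number of complete lines, rounded down to a multiple of four, so
    # that both chunks end on whole FASTQ records. For the buffers above that
    # is four lines each:
    #
    #     buf1[:26] == b"first\nsecond\nthird\nfourth\n"
    #     buf2[:8] == b"a\nb\nc\nd\n"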
assert paired_fastq_heads(b"abc", b"def", 3, 3) == (0, 0) assert paired_fastq_heads(b"abc\n", b"def", 4, 3) == (0, 0) assert paired_fastq_heads(b"abc", b"def\n", 3, 4) == (0, 0) assert paired_fastq_heads(b"\n\n\n\n", b"\n\n\n\n", 4, 4) == (4, 4) def test_fastq_head(): assert _fastq_head(b"") == 0 assert _fastq_head(b"A\n") == 0 assert _fastq_head(b"A\nB") == 0 assert _fastq_head(b"A\nB\n") == 0 assert _fastq_head(b"A\nB\nC") == 0 assert _fastq_head(b"A\nB\nC\n") == 0 assert _fastq_head(b"A\nB\nC\nD") == 0 assert _fastq_head(b"A\nB\nC\nD\n") == 0 assert _fastq_head(b"A\nB\nC\nD\nE") == 0 assert _fastq_head(b"A\nB\nC\nD\nE\n") == 0 assert _fastq_head(b"A\nB\nC\nD\nE\nF") == 0 assert _fastq_head(b"A\nB\nC\nD\nE\nF\n") == 0 assert _fastq_head(b"A\nB\nC\nD\nE\nF\nG") == 0 assert _fastq_head(b"A\nB\nC\nD\nE\nF\nG\n") == 0 assert _fastq_head(b"A\nB\nC\nD\nE\nF\nG\nH") == 0 assert _fastq_head(b"A\nB\nC\nD\nE\nF\nG\nH\n") == 16 assert _fastq_head(b"A\nB\nC\nD\nE\nF\nG\nH\nI") == 16 assert _fastq_head(b"A\nB\nC\nD\nE\nF\nG\nH\nI\n") == 16 def test_read_paired_chunks_fastq(): with open("tests/data/paired.1.fastq", "rb") as f1: with open("tests/data/paired.2.fastq", "rb") as f2: for c1, c2 in read_paired_chunks(f1, f2, buffer_size=128): print(c1, c2) def test_paired_chunks_fasta(tmp_path): for i in (1, 2): with dnaio.open(f"tests/data/paired.{i}.fastq") as infile: with dnaio.open(tmp_path / f"{i}.fasta", mode="w") as outfile: for record in infile: record.qualities = None outfile.write(record) with open(tmp_path / "1.fasta", "rb") as r1: with open(tmp_path / "2.fasta", "rb") as r2: for c1, c2 in read_paired_chunks(r1, r2, buffer_size=128): print(c1.tobytes(), c2.tobytes()) def test_paired_chunks_different_number_of_records(): record = b"@r\nAA\n+\n##\n" buf1 = record buf2 = record * 3 it = read_paired_chunks(BytesIO(buf1), BytesIO(buf2), 16) assert next(it) == (record, record) with raises(FileFormatError) as error: next(it) error.match("more data found in the other file") def test_read_chunks(): for data in [b"@r1\nACG\n+\nHHH\n", b">r1\nACGACGACG\n"]: assert [m.tobytes() for m in read_chunks(BytesIO(data))] == [data] # Buffer too small with raises(OverflowError): list(read_chunks(BytesIO(data), buffer_size=4)) def test_read_chunks_empty(): assert list(read_chunks(BytesIO(b""))) == [] def test_invalid_file_format(): with raises(UnknownFileFormat): list(read_chunks(BytesIO(b"invalid format"))) dnaio-1.2.0/tests/test_internal.py000066400000000000000000000676261453560735400172300ustar00rootroot00000000000000import gzip import io import os import shutil import subprocess import sys from io import BytesIO from pathlib import Path from tempfile import mkdtemp from textwrap import dedent import pytest from pytest import raises, mark import dnaio from dnaio import ( BamReader, FileFormatError, FastaFormatError, FastqFormatError, FastaReader, FastqReader, InterleavedPairedEndReader, FastaWriter, FastqWriter, InterleavedPairedEndWriter, TwoFilePairedEndReader, records_are_mates, record_names_match, SequenceRecord, ) from dnaio.writers import FileWriter from dnaio.readers import BinaryFileReader from dnaio._core import bytes_ascii_check TEST_DATA = Path(__file__).parent / "data" SIMPLE_FASTQ = str(TEST_DATA / "simple.fastq") # files tests/data/simple.fast{q,a} simple_fastq = [ SequenceRecord("first_sequence", "SEQUENCE1", ":6;;8<=:<"), SequenceRecord("second_sequence", "SEQUENCE2", "83 None: fasta = BytesIO(b">first_sequence\nSEQUENCE1\n>second_sequence\nSEQUENCE2\n") reads = list(FastaReader(fasta)) assert reads 
== simple_fasta def test_with_comments(self) -> None: fasta = BytesIO( dedent( """ # a comment # another one >first_sequence SEQUENCE1 >second_sequence SEQUENCE2 """ ).encode() ) reads = list(FastaReader(fasta)) assert reads == simple_fasta def test_wrong_format(self) -> None: fasta = BytesIO( dedent( """# a comment # another one unexpected >first_sequence SEQUENCE1 >second_sequence SEQUENCE2 """ ).encode() ) with raises(FastaFormatError) as info: list(FastaReader(fasta)) assert info.value.line == 2 def test_fastareader_keeplinebreaks(self) -> None: with FastaReader("tests/data/simple.fasta", keep_linebreaks=True) as f: reads = list(f) assert reads[0] == simple_fasta[0] assert reads[1].sequence == "SEQUEN\nCE2" def test_context_manager(self) -> None: filename = "tests/data/simple.fasta" with open(filename, "rb") as f: assert not f.closed with dnaio.open(f) as inner_f: list(inner_f) assert not f.closed assert f.closed with FastaReader(filename) as sr: tmp_sr = sr assert not sr._file.closed _ = list(sr) assert not sr._file.closed assert tmp_sr._file is None # Open it a second time with FastaReader(filename): pass class TestFastqReader: def test_fastqreader(self) -> None: with FastqReader(SIMPLE_FASTQ) as f: reads = list(f) assert reads == simple_fastq @mark.parametrize("buffer_size", [1, 2, 3, 5, 7, 10, 20]) def test_fastqreader_buffersize(self, buffer_size) -> None: with FastqReader("tests/data/simple.fastq", buffer_size=buffer_size) as f: reads = list(f) assert reads == simple_fastq def test_fastqreader_buffersize_too_small(self) -> None: with raises(ValueError) as e: with FastqReader("tests/data/simple.fastq", buffer_size=0) as f: _ = list(f) # pragma: no cover assert "buffer size too small" in e.value.args[0] def test_fastqreader_dos(self) -> None: # DOS line breaks with open("tests/data/dos.fastq", "rb") as f: assert b"\r\n" in f.read() with FastqReader("tests/data/dos.fastq") as f: dos_reads = list(f) with FastqReader("tests/data/small.fastq") as f: unix_reads = list(f) assert dos_reads == unix_reads def test_fastq_wrongformat(self) -> None: with raises(FastqFormatError) as info: with FastqReader("tests/data/withplus.fastq") as f: list(f) # pragma: no cover assert info.value.line == 2 def test_empty_fastq(self) -> None: with FastqReader(BytesIO(b"")) as fq: assert list(fq) == [] @mark.parametrize( "s,line", [ (b"@", 0), (b"@r", 0), (b"@r1", 0), (b"@r1\n", 1), (b"@r1\nA", 1), (b"@r1\nAC", 1), (b"@r1\nACG", 1), (b"@r1\nACG\n", 2), (b"@r1\nACG\n+", 2), (b"@r1\nACG\n+\n", 3), (b"@r1\nACG\n+\nH", 3), (b"@r1\nACG\n+\nHH", 3), (b"@r1\nACG\n+\nHHH\n@", 4), (b"@r1\nACG\n+\nHHH\n@r", 4), (b"@r1\nACG\n+\nHHH\n@r2", 4), (b"@r1\nACG\n+\nHHH\n@r2\n", 5), (b"@r1\nACG\n+\nHHH\n@r2\nT", 5), (b"@r1\nACG\n+\nHHH\n@r2\nT\n", 6), (b"@r1\nACG\n+\nHHH\n@r2\nT\n+", 6), (b"@r1\nACG\n+\nHHH\n@r2\nT\n+\n", 7), ], ) def test_fastq_incomplete(self, s, line) -> None: fastq = BytesIO(s) with raises(FastqFormatError) as info: with FastqReader(fastq) as fq: list(fq) assert info.value.line == line def test_half_record_line_numbers(self) -> None: fastq = BytesIO(b"@r\nACG\n+\nHH\n") # Choose the buffer size such that only parts of the record fit # We want to ensure that the line number is reset properly # after the record has been half-parsed buffer_size = len("@r\nACG\n+\n") with raises(FastqFormatError) as info: with FastqReader(fastq, buffer_size=buffer_size) as fq: list(fq) # pragma: no cover assert "Length of sequence and qualities differ" in info.value.message assert info.value.line == 3 @mark.parametrize( 
"s,line", [ (b"@r1\nACG\n+\nH#HH\n@r2\nT\n+\nH\n", 3), (b"@r1\nACG\n+\n#H\n@r2\nT\n+\nH\n", 3), (b"@r1\nACG\n+\nHHH\n@r2\nT\n+\nHH\n", 7), (b"@r1\nACG\n+\nHHH\n@r2\nT\n+\n\n", 7), ], ) def test_differing_lengths(self, s, line) -> None: fastq = BytesIO(s) with raises(FastqFormatError) as info: with FastqReader(fastq) as fq: list(fq) assert info.value.line == line def test_missing_final_newline(self) -> None: # Files with a missing final newline are currently allowed fastq = BytesIO(b"@r1\nA\n+\nH") with dnaio.open(fastq) as f: records = list(f) assert records == [SequenceRecord("r1", "A", "H")] def test_non_ascii_in_record(self) -> None: # \xc4 -> Ä fastq = BytesIO(b"@r1\n\xc4\n+\nH") with pytest.raises(FastqFormatError) as e: with dnaio.open(fastq) as f: list(f) e.match("Non-ASCII") def test_not_opened_as_binary(self) -> None: filename = "tests/data/simple.fastq" with open(filename, "rt") as f: with raises(ValueError): list(dnaio.open(f)) # type: ignore def test_context_manager(self) -> None: filename = "tests/data/simple.fastq" with open(filename, "rb") as f: assert not f.closed reader = dnaio.open(f) assert isinstance(reader, FastqReader) _ = list(reader) assert not f.closed assert f.closed with FastqReader(filename) as sr: tmp_sr = sr assert not sr._file.closed _ = list(sr) assert not sr._file.closed assert tmp_sr._file is None def test_two_header_detection(self) -> None: fastq = BytesIO(b"@r1\nACG\n+r1\nHHH\n@r2\nT\n+r2\n#\n") with FastqReader(fastq) as fq: assert fq.two_headers list(fq) fastq = BytesIO(b"@r1\nACG\n+\nHHH\n@r2\nT\n+r2\n#\n") with FastqReader(fastq) as fq: assert not fq.two_headers list(fq) def test_second_header_not_equal(self) -> None: fastq = BytesIO(b"@r1\nACG\n+xy\nXXX\n") with raises(FastqFormatError) as info: with FastqReader(fastq) as fq: list(fq) # pragma: no cover assert "Sequence descriptions don't match" in info.value.message class TestOpen: def setup_method(self): self._tmpdir = mkdtemp() def teardown_method(self): shutil.rmtree(self._tmpdir) def test_sequence_reader(self) -> None: # test the autodetection with dnaio.open("tests/data/simple.fastq") as f: reads = list(f) assert reads == simple_fastq with dnaio.open("tests/data/simple.fasta") as f: reads = list(f) assert reads == simple_fasta with open("tests/data/simple.fastq", "rb") as f: reads = list(dnaio.open(f)) assert reads == simple_fastq # make the name attribute unavailable with open("tests/data/simple.fastq", "rb") as f: data = f.read() bio = BytesIO(data) reads = list(dnaio.open(bio)) assert reads == simple_fastq with open("tests/data/simple.fasta", "rb") as f: data = f.read() bio = BytesIO(data) reads = list(dnaio.open(bio)) assert reads == simple_fasta def test_autodetect_fasta_format(self, tmpdir) -> None: path = str(tmpdir.join("tmp.fasta")) with dnaio.open(path, mode="w") as f: assert isinstance(f, FastaWriter) for seq in simple_fastq: f.write(seq) with dnaio.open(path) as f: records = list(f) assert records == simple_fasta def test_write_qualities_to_fasta(self) -> None: path = os.path.join(self._tmpdir, "tmp.fasta") with dnaio.open(path, mode="w", qualities=True) as f: assert isinstance(f, FastaWriter) for seq in simple_fastq: f.write(seq) with dnaio.open(path) as f: assert list(f) == simple_fasta def test_autodetect_fastq_format(self) -> None: path = os.path.join(self._tmpdir, "tmp.fastq") with dnaio.open(path, mode="w") as f: assert isinstance(f, FastqWriter) for seq in simple_fastq: f.write(seq) with dnaio.open(path) as f: assert list(f) == simple_fastq def 
test_autodetect_fastq_weird_name(self) -> None: path = os.path.join(self._tmpdir, "tmp.fastq.gz") with dnaio.open(path, mode="w") as f: assert isinstance(f, FastqWriter) for seq in simple_fastq: f.write(seq) weird_path = os.path.join(self._tmpdir, "tmp.weird.gz") os.rename(path, weird_path) with dnaio.open(weird_path) as f: assert list(f) == simple_fastq def test_fastq_qualities_missing(self) -> None: path = os.path.join(self._tmpdir, "tmp.fastq") with raises(ValueError): with dnaio.open(path, mode="w", qualities=False): pass # pragma: no cover class TestInterleavedReader: def test(self) -> None: expected = [ ( SequenceRecord( "read1/1 some text", "TTATTTGTCTCCAGC", "##HHHHHHHHHHHHH" ), SequenceRecord( "read1/2 other text", "GCTGGAGACAAATAA", "HHHHHHHHHHHHHHH" ), ), ( SequenceRecord( "read3/1", "CCAACTTGATATTAATAACA", "HHHHHHHHHHHHHHHHHHHH" ), SequenceRecord( "read3/2", "TGTTATTAATATCAAGTTGG", "#HHHHHHHHHHHHHHHHHHH" ), ), ] with InterleavedPairedEndReader("tests/data/interleaved.fastq") as isr: reads = list(isr) assert reads == expected with dnaio.open("tests/data/interleaved.fastq", interleaved=True) as f: reads = list(f) assert reads == expected def test_missing_partner(self) -> None: s = BytesIO(b"@r1\nACG\n+\nHHH\n") with raises(FileFormatError) as info: with InterleavedPairedEndReader(s) as isr: list(isr) assert "Interleaved input file incomplete" in info.value.message def test_incorrectly_paired(self) -> None: s = BytesIO(b"@r1/1\nACG\n+\nHHH\n@wrong_name\nTTT\n+\nHHH\n") with raises(FileFormatError) as info: with InterleavedPairedEndReader(s) as isr: list(isr) assert "Reads are improperly paired" in info.value.message class TestFastaWriter: def setup_method(self): self._tmpdir = mkdtemp() self.path = os.path.join(self._tmpdir, "tmp.fasta") def teardown_method(self): shutil.rmtree(self._tmpdir) def test(self) -> None: with FastaWriter(self.path) as fw: fw.write("name", "CCATA") fw.write("name2", "HELLO") assert fw._file.closed with open(self.path) as t: assert t.read() == ">name\nCCATA\n>name2\nHELLO\n" def test_linelength(self) -> None: with FastaWriter(self.path, line_length=3) as fw: fw.write("r1", "ACG") fw.write("r2", "CCAT") fw.write("r3", "TACCAG") assert fw._file.closed with open(self.path) as t: d = t.read() assert d == ">r1\nACG\n>r2\nCCA\nT\n>r3\nTAC\nCAG\n" def test_write_sequence_object(self) -> None: with FastaWriter(self.path) as fw: fw.write(SequenceRecord("name", "CCATA")) fw.write(SequenceRecord("name2", "HELLO")) assert fw._file.closed with open(self.path) as t: assert t.read() == ">name\nCCATA\n>name2\nHELLO\n" def test_write_to_file_like_object(self) -> None: bio = BytesIO() with FastaWriter(bio) as fw: fw.write(SequenceRecord("name", "CCATA")) fw.write(SequenceRecord("name2", "HELLO")) assert bio.getvalue() == b">name\nCCATA\n>name2\nHELLO\n" assert not bio.closed assert not fw._file.closed def test_write_zero_length_sequence_record(self) -> None: bio = BytesIO() with FastaWriter(bio) as fw: fw.write(SequenceRecord("name", "")) assert bio.getvalue() == b">name\n\n", "{!r}".format(bio.getvalue()) class TestFastqWriter: def setup_method(self): self._tmpdir = mkdtemp() self.path = os.path.join(self._tmpdir, "tmp.fastq") def teardown_method(self): shutil.rmtree(self._tmpdir) def test(self) -> None: with FastqWriter(self.path) as fq: fq.writeseq("name", "CCATA", "!#!#!") fq.writeseq("name2", "HELLO", "&&&!&&") assert fq._file.closed with open(self.path) as t: assert t.read() == "@name\nCCATA\n+\n!#!#!\n@name2\nHELLO\n+\n&&&!&&\n" def test_twoheaders(self) -> None: 
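        # two_headers=True makes the writer repeat the record name after the
        # "+" separator (via SequenceRecord.fastq_bytes(two_headers=True)),
        # which is what the expected string below checks.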
with FastqWriter(self.path, two_headers=True) as fq: fq.write(SequenceRecord("name", "CCATA", "!#!#!")) fq.write(SequenceRecord("name2", "HELLO", "&&&!&")) assert fq._file.closed with open(self.path) as t: assert ( t.read() == "@name\nCCATA\n+name\n!#!#!\n@name2\nHELLO\n+name2\n&&&!&\n" ) def test_write_to_file_like_object(self) -> None: bio = BytesIO() with FastqWriter(bio) as fq: fq.writeseq("name", "CCATA", "!#!#!") fq.writeseq("name2", "HELLO", "&&&!&&") assert bio.getvalue() == b"@name\nCCATA\n+\n!#!#!\n@name2\nHELLO\n+\n&&&!&&\n" class TestInterleavedWriter: def test(self) -> None: reads = [ ( SequenceRecord("A/1 comment", "TTA", "##H"), SequenceRecord("A/2 comment", "GCT", "HH#"), ), (SequenceRecord("B/1", "CC", "HH"), SequenceRecord("B/2", "TG", "#H")), ] bio = BytesIO() with InterleavedPairedEndWriter(bio) as writer: for read1, read2 in reads: writer.write(read1, read2) assert bio.getvalue() == ( b"@A/1 comment\nTTA\n+\n##H\n" b"@A/2 comment\nGCT\n+\nHH#\n" b"@B/1\nCC\n+\nHH\n" b"@B/2\nTG\n+\n#H\n" ) class TestPairedSequenceReader: def test_read(self) -> None: s1 = BytesIO(b"@r1\nACG\n+\nHHH\n") s2 = BytesIO(b"@r2\nGTT\n+\n858\n") with TwoFilePairedEndReader(s1, s2) as psr: assert [ ( SequenceRecord("r1", "ACG", "HHH"), SequenceRecord("r2", "GTT", "858"), ), ] == list(psr) def test_record_names_match(self) -> None: match = record_names_match assert match("abc", "abc") assert match("abc def", "abc") assert match("abc def", "abc ghi") assert match("abc", "abc ghi") assert not match("abc", "xyz") assert match("abc\tdef", "abc") assert match("abc\tdef", "abc\tghi") assert match("abc somecomment\tanothercomment", "abc andanothercomment\tbla") assert match("abc\tcomments comments", "abc\tothers others") assert match("abc\tdef", "abc def") def test_record_names_match_with_ignored_trailing_12(self) -> None: match = record_names_match assert match("abc/1", "abc/2") assert match("abc.1", "abc.2") assert match("abc1", "abc2") assert match("abc2", "abc1") assert match("abc1 def", "abc1 ghi") assert match("abc1 def", "abc2 ghi") assert match("abc2 def", "abc1 ghi") assert not match("abc1", "abc4") assert not match("abc1", "abc") assert not match("abc", "abc1") assert not match("abc", "abc2") def test_record_names_match_with_ignored_trailing_123(self) -> None: match = record_names_match assert match("abc/1", "abc/3") assert match("abc.1 def", "abc.3 ghi") assert match("abc.3 def", "abc.1 ghi") def test_missing_partner1(self) -> None: s1 = BytesIO(b"") s2 = BytesIO(b"@r1\nACG\n+\nHHH\n") with raises(FileFormatError) as info: with TwoFilePairedEndReader(s1, s2) as psr: list(psr) assert "There are more reads in file 2 than in file 1" in info.value.message def test_missing_partner2(self) -> None: s1 = BytesIO(b"@r1\nACG\n+\nHHH\n") s2 = BytesIO(b"") with raises(FileFormatError) as info: with TwoFilePairedEndReader(s1, s2) as psr: list(psr) assert "There are more reads in file 1 than in file 2" in info.value.message def test_empty_sequences_do_not_stop_iteration(self) -> None: s1 = BytesIO(b"@r1_1\nACG\n+\nHHH\n@r2_1\nACG\n+\nHHH\n@r3_2\nACG\n+\nHHH\n") s2 = BytesIO(b"@r1_1\nACG\n+\nHHH\n@r2_2\n\n+\n\n@r3_2\nACG\n+\nHHH\n") # Second sequence for s2 is empty but valid. Should not lead to a stop of iteration. 
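        # The middle record of s2 is b"@r2_2\n\n+\n\n": a name line, an empty
        # sequence line, the "+" separator and an empty quality line. That is
        # well-formed FASTQ, so all three pairs must still be yielded.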
with TwoFilePairedEndReader(s1, s2) as psr: seqs = list(psr) print(seqs) assert len(seqs) == 3 def test_incorrectly_paired(self) -> None: s1 = BytesIO(b"@r1/1\nACG\n+\nHHH\n") s2 = BytesIO(b"@wrong_name\nTTT\n+\nHHH\n") with raises(FileFormatError) as info: with TwoFilePairedEndReader(s1, s2) as psr: list(psr) assert "Reads are improperly paired" in info.value.message @mark.parametrize( "path", [ os.path.join("tests", "data", "simple.fastq"), os.path.join("tests", "data", "dos.fastq"), os.path.join("tests", "data", "simple.fasta"), os.path.join("tests", "data", "with_comment.fasta"), ], ) def test_read_stdin(path) -> None: # Get number of records in the input file with dnaio.open(path) as f: expected = len(list(f)) # Use piping from a separate subprocess to force the input file name to be unavailable cmd = "type" if sys.platform == "win32" else "cat" with subprocess.Popen( [cmd, path], stdout=subprocess.PIPE, shell=sys.platform == "win32" ) as cat: with subprocess.Popen( [sys.executable, "tests/read_from_stdin.py"], stdin=cat.stdout, stdout=subprocess.PIPE, ) as py: assert cat.stdout is not None cat.stdout.close() # Check that the read_from_stdin.py script prints the correct number of records assert str(expected) == py.communicate()[0].decode().strip() def test_file_writer(tmp_path) -> None: path = tmp_path / "out.txt" fw = FileWriter(path) repr(fw) fw.close() assert path.exists() with raises(ValueError) as e: with fw: pass # pragma: no coverage assert "operation on closed file" in e.value.args[0] def test_binary_file_reader() -> None: bfr = BinaryFileReader("tests/data/simple.fasta") repr(bfr) bfr.close() with raises(ValueError) as e: with bfr: pass # pragma: no coverage assert "operation on closed" in e.value.args[0] def test_fasta_writer_repr(tmp_path) -> None: with FastaWriter(tmp_path / "out.fasta") as fw: repr(fw) def test_fastq_writer_repr(tmp_path) -> None: with FastqWriter(tmp_path / "out.fastq") as fw: repr(fw) class TestAsciiCheck: ASCII_STRING = ( "In het Nederlands komen bijzondere leestekens niet vaak voor.".encode("ascii") ) # NON-ASCII from the German wikipedia. NON_ASCII_STRING = ( "In späterer Zeit trat Umlaut sehr häufig analogisch ein.".encode("latin-1") ) def test_ascii(self) -> None: assert bytes_ascii_check(self.ASCII_STRING) def test_ascii_all_chars(self) -> None: assert bytes_ascii_check(bytes(range(128))) assert not bytes_ascii_check(bytes(range(129))) def test_non_ascii(self) -> None: assert not bytes_ascii_check(self.NON_ASCII_STRING) def test_non_ascii_lengths(self) -> None: # Make sure that the function finds the non-ascii byte correctly for # all lengths. non_ascii_char = "é".encode("latin-1") for i in range(len(self.ASCII_STRING)): test_string = self.ASCII_STRING[:i] + non_ascii_char assert not bytes_ascii_check(test_string) def test_ascii_lengths(self) -> None: # Make sure the ascii check is correct even though there are non-ASCII # bytes directly behind the search space. # This ensures there is no overshoot where the algorithm checks bytes # after the search space. 
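        # bytes_ascii_check accepts an optional second argument that restricts
        # the check to the start of the buffer; the loop below passes i - 1 so
        # that the eight appended non-ASCII bytes fall outside the inspected
        # range.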
non_ascii_char = "é".encode("latin-1") for i in range(1, len(self.ASCII_STRING) + 1): test_string = self.ASCII_STRING[:i] + (non_ascii_char * 8) assert bytes_ascii_check(test_string, i - 1) class TestRecordsAreMates: def test_records_are_mates(self) -> None: assert records_are_mates( SequenceRecord("same_name1 some_comment", "A", "H"), SequenceRecord("same_name2 other_comment", "A", "H"), SequenceRecord("same_name3", "A", "H"), ) @pytest.mark.parametrize("number_of_mates", list(range(2, 11))) def test_lots_of_records_are_mates(self, number_of_mates) -> None: mates = [SequenceRecord("name", "A", "H") for _ in range(number_of_mates)] assert records_are_mates(*mates) def test_records_are_not_mates(self) -> None: assert not records_are_mates( SequenceRecord("same_name1 some_comment", "A", "H"), SequenceRecord("same_name2 other_comment", "A", "H"), SequenceRecord("shame_name3 different_comment", "A", "H"), ) def test_records_are_mates_zero_arguments(self) -> None: with pytest.raises(TypeError) as error: records_are_mates() # type: ignore error.match("records_are_mates requires at least two arguments") def test_records_are_mates_one_argument(self) -> None: with pytest.raises(TypeError) as error: records_are_mates(SequenceRecord("A", "A", "A")) # type: ignore error.match("records_are_mates requires at least two arguments") class TestBamReader: bam_file = ( TEST_DATA / "project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878" ".bwa.markDuplicates.unmapped.bam" ) raw_bam_bytes = gzip.decompress(bam_file.read_bytes()) complete_record_with_header = raw_bam_bytes[:6661] complete_header = complete_record_with_header[:6359] def test_parse_bam(self): with dnaio.open(self.bam_file) as reader: records = list(reader) assert len(records) == 3 assert reader.number_of_records == 3 assert records[0].name == "HWI-D00119:50:H7AP8ADXX:1:1104:8519:18990" assert records[0].sequence == ( "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCT" "GGGGGGTATGCACGCGATAGCATTGCGAGACGCTGG" ) assert records[0].qualities == ( "CCCFFFFFHFFHHJIJJIJGGJJJJJJJJJJJJJIGHIIEHIJJJJJJIJJJJIBGGIIIHIIII" "HHHHDD;9CCDEDDDDDDDDDDEDDDDDDDDDDDDD" ) assert records[1].name == "HWI-D00119:50:H7AP8ADXX:1:2104:18479:82511" assert records[1].sequence == ( "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCT" "GGGGGGTATGCACGCGATAGCATTGCGAGACGCTGG" ) assert records[1].qualities == ( "CCCFFFFFHFFHHJJJJIJJJJIIJJJJJGIJJJJGIJJJJJJJJGJIJJIJJJGHIJJJJJJJI" "HHHHDD@>CDDEDDDDDDDDDDEDDCDDDDD?BBD9" ) assert records[2].name == "HWI-D00119:50:H7AP8ADXX:1:2105:7076:23015" assert records[2].sequence == ( "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCT" "GGGGGGTATGCACGCGATAGCATTGCGAGACGCTGG" ) assert records[2].qualities == ( "@@CFFFDFGFHHHJIIJIJIJJJJJJJIIJJJJIIJIJFIIJJJJIIIGIJJJJDHIJIIJIJJJ" "HHGGCB>BDDDDDDDDDDDBDDEDDDDDDDDDDDDD" ) def test_parse_header(self): header = ( Path(__file__).parent / "data" / "project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa" ".markDuplicates.header.sam" ) header_bytes = header.read_bytes() with dnaio.open(self.bam_file) as bam: assert bam.header == header_bytes @pytest.mark.parametrize( "end", range(len(complete_header) + 1, len(complete_record_with_header)) ) def test_truncated_record(self, end: int): file = io.BytesIO(self.complete_record_with_header[:end]) with pytest.raises(EOFError) as e: list(BamReader(file)) e.match("Incomplete record at the end of file") @pytest.mark.parametrize("end", [3, 5, 2000, 6000]) def test_truncated_header(self, end): file = 
io.BytesIO(self.complete_record_with_header[:end]) with pytest.raises(EOFError) as e: list(BamReader(file)) e.match("Truncated BAM file") def test_bam_parser_not_binary_error(self): file = io.StringIO( "Don't be too proud of this technological terror you have constructed." ) with pytest.raises(TypeError) as error: BamReader(file) error.match("binary IO") @pytest.mark.parametrize("buffersize", [4, 8, 10, 20, 40]) def test_small_buffersize(self, buffersize): reader = BamReader(str(self.bam_file), buffer_size=buffersize) assert len(list(reader)) == 3 def test_error_on_mapped_bam(self): bam = TEST_DATA / ( "project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878" ".bwa.markDuplicates.bam" ) reader = BamReader(str(bam)) it = iter(reader) with pytest.raises(NotImplementedError) as error: next(it) assert error.match("unmapped single reads") dnaio-1.2.0/tests/test_multiple.py000066400000000000000000000077301453560735400172350ustar00rootroot00000000000000import io import itertools import os from pathlib import Path import dnaio from dnaio import SequenceRecord, _open_multiple import pytest @pytest.mark.parametrize( ["fileformat", "number_of_files"], itertools.product(("fasta", "fastq"), (1, 2, 3, 4)), ) def test_read_files(fileformat, number_of_files): file = Path(__file__).parent / "data" / ("simple." + fileformat) files = [file] * number_of_files with _open_multiple(*files) as multiple_reader: for records in multiple_reader: pass assert len(records) == number_of_files assert isinstance(records, tuple) @pytest.mark.parametrize( "kwargs", [ dict(mode="w", fileformat="fasta"), dict(mode="r"), dict(mode="w", fileformat="fastq"), ], ) def test_open_no_file_error(kwargs): with pytest.raises(ValueError): _open_multiple(**kwargs) def test_open_multiple_unsupported_mode(): with pytest.raises(ValueError) as error: _open_multiple(os.devnull, mode="X") error.match("one of 'r', 'w', 'a'") @pytest.mark.parametrize( ["number_of_files", "content"], itertools.product( (1, 2, 3, 4), (">my_fasta\nAGCTAGA\n", "@my_fastq\nAGC\n+\nHHH\n") ), ) def test_multiple_binary_read(number_of_files, content): files = [io.BytesIO(content.encode("ascii")) for _ in range(number_of_files)] with _open_multiple(*files) as reader: for records_tup in reader: pass @pytest.mark.parametrize( ["number_of_files", "fileformat"], itertools.product((1, 2, 3, 4), ("fastq", "fasta")), ) def test_multiple_binary_write(number_of_files, fileformat): files = [io.BytesIO() for _ in range(number_of_files)] records = [SequenceRecord("A", "A", "A") for _ in range(number_of_files)] with _open_multiple(*files, mode="w", fileformat=fileformat) as writer: writer.write(*records) @pytest.mark.parametrize( ["number_of_files", "fileformat"], itertools.product((1, 2, 3, 4), ("fastq", "fasta")), ) def test_multiple_write_too_many(number_of_files, fileformat): files = [io.BytesIO() for _ in range(number_of_files)] records = [SequenceRecord("A", "A", "A") for _ in range(number_of_files + 1)] with _open_multiple(*files, mode="w", fileformat=fileformat) as writer: with pytest.raises(ValueError) as error: writer.write(*records) error.match(str(number_of_files)) @pytest.mark.parametrize( ["number_of_files", "fileformat"], itertools.product((1, 2, 3, 4), ("fastq", "fasta")), ) def test_multiple_write_iterable(number_of_files, fileformat): files = [io.BytesIO() for _ in range(number_of_files)] records = [SequenceRecord("A", "A", "A") for _ in range(number_of_files)] records_list = [records, records, records] with _open_multiple(*files, mode="w", fileformat=fileformat) as 
writer: writer.write_iterable(records_list) @pytest.mark.parametrize("number_of_files", (2, 3, 4)) def test_multiple_read_unmatched_names(number_of_files): record1_content = b"@my_fastq\nAGC\n+\nHHH\n" record2_content = b"@my_fasterq\nAGC\n+\nHHH\n" files = ( io.BytesIO(record1_content), *(io.BytesIO(record2_content) for _ in range(number_of_files - 1)), ) with _open_multiple(*files) as reader: with pytest.raises(dnaio.FileFormatError) as error: for records in reader: pass # pragma: no coverage error.match("do not match") @pytest.mark.parametrize("number_of_files", (2, 3, 4)) def test_multiple_read_out_of_sync(number_of_files): record1_content = b"@my_fastq\nAGC\n+\nHHH\n" record2_content = b"@my_fastq\nAGC\n+\nHHH\n@my_secondfastq\nAGC\n+\nHHH\n" files = ( io.BytesIO(record1_content), *(io.BytesIO(record2_content) for _ in range(number_of_files - 1)), ) with _open_multiple(*files) as reader: with pytest.raises(dnaio.FileFormatError) as error: for records in reader: pass error.match("unequal amount") dnaio-1.2.0/tests/test_open.py000066400000000000000000000333221453560735400163370ustar00rootroot00000000000000import os from pathlib import Path import pytest from xopen import xopen import dnaio from dnaio import FileFormatError, UnknownFileFormat @pytest.fixture(params=["", ".gz", ".bz2", ".xz"]) def extension(request): return request.param @pytest.fixture(params=["fasta", "fastq"]) def fileformat(request): return request.param SIMPLE_RECORDS = { "fasta": [ dnaio.SequenceRecord("first_sequence", "SEQUENCE1"), dnaio.SequenceRecord("second_sequence", "SEQUENCE2"), ], "fastq": [ dnaio.SequenceRecord("first_sequence", "SEQUENCE1", ":6;;8<=:<"), dnaio.SequenceRecord("second_sequence", "SEQUENCE2", "83 str: if fileformat == "fastq": return "@{}\n{}\n+\n{}\n".format(record.name, record.sequence, record.qualities) else: return ">{}\n{}\n".format(record.name, record.sequence) def formatted_sequences(records, fileformat) -> str: return "".join(formatted_sequence(record, fileformat) for record in records) def test_formatted_sequence() -> None: s = dnaio.SequenceRecord("s1", "ACGT", "HHHH") assert ">s1\nACGT\n" == formatted_sequence(s, "fasta") assert "@s1\nACGT\n+\nHHHH\n" == formatted_sequence(s, "fastq") def test_version() -> None: _ = dnaio.__version__ def test_open_nonexistent(tmp_path) -> None: with pytest.raises(FileNotFoundError): with dnaio.open(tmp_path / "nonexistent"): pass # pragma: no cover def test_open_empty_file_with_unrecognized_extension(tmp_path) -> None: path = tmp_path / "unrecognized-extension.tmp" path.touch() with dnaio.open(path) as f: records = list(f) assert records == [] def test_fileformat_error(tmp_path) -> None: with open(tmp_path / "file.fastq", mode="w") as f: print("this is not a FASTQ file", file=f) with pytest.raises(FileFormatError) as e: with dnaio.open(tmp_path / "file.fastq") as f: _ = list(f) # pragma: no cover assert "at line 2" in str(e.value) # Premature end of file def test_write_unknown_file_format(tmp_path) -> None: with pytest.raises(UnknownFileFormat): with dnaio.open(tmp_path / "out.txt", mode="w") as f: f.write(dnaio.SequenceRecord("name", "ACG", "###")) # pragma: no cover def test_read_unknown_file_format(tmp_path) -> None: with open(tmp_path / "file.txt", mode="w") as f: print("text file", file=f) with pytest.raises(UnknownFileFormat): with dnaio.open(tmp_path / "file.txt") as f: _ = list(f) # pragma: no cover def test_invalid_format(tmp_path) -> None: with pytest.raises(UnknownFileFormat): with dnaio.open(tmp_path / "out.txt", mode="w", 
fileformat="foo"): pass # pragma: no cover def test_write_qualities_to_file_without_fastq_extension(tmp_path) -> None: with dnaio.open(tmp_path / "out.txt", mode="w", qualities=True) as f: f.write(dnaio.SequenceRecord("name", "ACG", "###")) with dnaio.open(tmp_path / "out.txt", mode="w", qualities=False) as f: f.write(dnaio.SequenceRecord("name", "ACG", None)) def test_read(fileformat, extension) -> None: with dnaio.open("tests/data/simple." + fileformat + extension) as f: records = list(f) assert records == SIMPLE_RECORDS[fileformat] def test_read_pathlib_path(fileformat, extension) -> None: path = Path("tests/data/simple." + fileformat + extension) with dnaio.open(path) as f: records = list(f) assert records == SIMPLE_RECORDS[fileformat] def test_read_opener(fileformat, extension) -> None: def my_opener(path, mode): import io if fileformat == "fasta": data = b">read\nACG\n" else: data = b"@read\nACG\n+\nHHH\n" return io.BytesIO(data) with dnaio.open( "totally-ignored-filename." + fileformat + extension, opener=my_opener ) as f: records = list(f) assert len(records) == 1 assert records[0].name == "read" assert records[0].sequence == "ACG" def test_read_paired_fasta() -> None: path = "tests/data/simple.fasta" with dnaio.open(path, path) as f: list(f) @pytest.mark.parametrize("interleaved", [False, True]) def test_paired_opener(fileformat, extension, interleaved) -> None: def my_opener(_path, _mode): import io if fileformat == "fasta": data = b">read\nACG\n" else: data = b"@read\nACG\n+\nHHH\n" return io.BytesIO(data + data) path1 = "ignored-filename." + fileformat + extension path2 = "also-ignored-filename." + fileformat + extension if interleaved: with dnaio.open(path1, path2, opener=my_opener) as f: records = list(f) expected = 2 else: with dnaio.open(path1, interleaved=True, opener=my_opener) as f: records = list(f) expected = 1 assert len(records) == expected assert records[0][0].name == "read" assert records[0][0].sequence == "ACG" assert records[0][1].name == "read" assert records[0][1].sequence == "ACG" def test_detect_fastq_from_content() -> None: """FASTQ file that is not named .fastq""" with dnaio.open("tests/data/missingextension") as f: record = next(iter(f)) assert record.name == "prefix:1_13_573/1" def test_detect_compressed_fastq_from_content() -> None: """Compressed FASTQ file that is not named .fastq.gz""" with dnaio.open("tests/data/missingextension.gz") as f: record = next(iter(f)) assert record.name == "prefix:1_13_573/1" def test_detect_bam_from_content() -> None: with dnaio.open("tests/data/simplebamnoextension") as f: record = next(iter(f)) assert record.name == "Myheader" def test_detect_bam_from_filename() -> None: with dnaio.open("tests/data/simple.unaligned.bam") as f: record = next(iter(f)) assert record.name == "Myheader" def test_write(tmp_path, extension) -> None: out_fastq = tmp_path / ("out.fastq" + extension) with dnaio.open(str(out_fastq), mode="w") as f: f.write(dnaio.SequenceRecord("name", "ACGT", "HHHH")) with xopen(out_fastq) as f: assert f.read() == "@name\nACGT\n+\nHHHH\n" def test_write_with_xopen(tmp_path, fileformat, extension) -> None: s = dnaio.SequenceRecord("name", "ACGT", "HHHH") out_fastq = tmp_path / ("out." 
+ fileformat + extension) with xopen(out_fastq, "wb") as outer_f: with dnaio.open(outer_f, mode="w", fileformat=fileformat) as f: f.write(s) with xopen(out_fastq) as f: if fileformat == "fasta": assert f.read() == ">name\nACGT\n" else: assert f.read() == "@name\nACGT\n+\nHHHH\n" def test_write_str_path(tmp_path, fileformat, extension) -> None: s1 = dnaio.SequenceRecord("s1", "ACGT", "HHHH") path = str(tmp_path / ("out." + fileformat + extension)) with dnaio.open(path, mode="w") as f: f.write(s1) if fileformat == "fasta": expected = b">s1\nACGT\n" else: expected = b"@s1\nACGT\n+\nHHHH\n" with xopen(path, "rb") as f: assert f.read() == expected def test_write_paired_same_path(tmp_path) -> None: path1 = tmp_path / "same.fastq" path2 = tmp_path / "same.fastq" with pytest.raises(ValueError): with dnaio.open(path1, path2, mode="w"): pass # pragma: no cover def test_write_paired(tmp_path, fileformat, extension) -> None: r1 = [ dnaio.SequenceRecord("s1", "ACGT", "HHHH"), dnaio.SequenceRecord("s2", "CGCA", "8383"), ] r2 = [ dnaio.SequenceRecord("t1", "TCGT", "5HHH"), dnaio.SequenceRecord("t2", "TGCA", "5383"), ] path1 = tmp_path / ("out.1." + fileformat + extension) path2 = tmp_path / ("out.2." + fileformat + extension) with dnaio.open(path1, path2, fileformat=fileformat, mode="w") as f: f.write(r1[0], r2[0]) f.write(r1[1], r2[1]) with xopen(path1) as f: assert formatted_sequences(r1, fileformat) == f.read() with xopen(path2) as f: assert formatted_sequences(r2, fileformat) == f.read() def test_write_interleaved(tmp_path, fileformat, extension) -> None: r1 = [ dnaio.SequenceRecord("s1", "ACGT", "HHHH"), dnaio.SequenceRecord("s2", "CGCA", "8383"), ] r2 = [ dnaio.SequenceRecord("t1", "TCGT", "5HHH"), dnaio.SequenceRecord("t2", "TGCA", "5383"), ] path = tmp_path / ("out.interleaved." + fileformat + extension) with dnaio.open(path, interleaved=True, fileformat=fileformat, mode="w") as f: f.write(r1[0], r2[0]) f.write(r1[1], r2[1]) expected = [r1[0], r2[0], r1[1], r2[1]] with xopen(path) as f: assert formatted_sequences(expected, fileformat) == f.read() def test_append(tmp_path, fileformat, extension) -> None: s1 = dnaio.SequenceRecord("s1", "ACGT", "HHHH") s2 = dnaio.SequenceRecord("s2", "CGCA", "8383") path = tmp_path / ("out." + fileformat + extension) with dnaio.open(path, mode="w") as f: f.write(s1) with dnaio.open(path, mode="a") as f: f.write(s2) with xopen(path) as f: assert formatted_sequences([s1, s2], fileformat) == f.read() def test_append_paired(tmp_path, fileformat, extension) -> None: r1 = [ dnaio.SequenceRecord("s1", "ACGT", "HHHH"), dnaio.SequenceRecord("s2", "CGCA", "8383"), ] r2 = [ dnaio.SequenceRecord("t1", "TCGT", "5HHH"), dnaio.SequenceRecord("t2", "TGCA", "5383"), ] path1 = tmp_path / ("out.1." + fileformat + extension) path2 = tmp_path / ("out.2." + fileformat + extension) with dnaio.open(path1, path2, fileformat=fileformat, mode="w") as f: f.write(r1[0], r2[0]) with dnaio.open(path1, path2, fileformat=fileformat, mode="a") as f: f.write(r1[1], r2[1]) with xopen(path1) as f: assert formatted_sequences(r1, fileformat) == f.read() with xopen(path2) as f: assert formatted_sequences(r2, fileformat) == f.read() def test_append_interleaved(tmp_path, fileformat, extension) -> None: r1 = [ dnaio.SequenceRecord("s1", "ACGT", "HHHH"), dnaio.SequenceRecord("s2", "CGCA", "8383"), ] r2 = [ dnaio.SequenceRecord("t1", "TCGT", "5HHH"), dnaio.SequenceRecord("t2", "TGCA", "5383"), ] path = tmp_path / ("out.interleaved." 
+ fileformat + extension) with dnaio.open(path, interleaved=True, fileformat=fileformat, mode="w") as f: f.write(r1[0], r2[0]) with dnaio.open(path, interleaved=True, fileformat=fileformat, mode="a") as f: f.write(r1[1], r2[1]) expected = [r1[0], r2[0], r1[1], r2[1]] with xopen(path) as f: assert formatted_sequences(expected, fileformat) == f.read() def make_random_fasta(path, n_records) -> None: from random import choice with xopen(path, "w") as f: for i in range(n_records): name = "sequence_{}".format(i) sequence = "".join(choice("ACGT") for _ in range(300)) print(">", name, "\n", sequence, sep="", file=f) def test_islice_gzip_does_not_fail(tmp_path) -> None: path = tmp_path / "file.fasta.gz" make_random_fasta(path, 100) f = dnaio.open(path) next(iter(f)) f.close() def test_unsupported_mode() -> None: with pytest.raises(ValueError) as error: _ = dnaio.open(os.devnull, mode="x") # type: ignore error.match("Mode must be") def test_no_file2_with_multiple_args() -> None: with pytest.raises(ValueError) as error: _ = dnaio.open(os.devnull, os.devnull, file2=os.devnull) # type: ignore error.match("as positional argument") error.match("file2") def test_no_multiple_files_interleaved() -> None: with pytest.raises(ValueError) as error: _ = dnaio.open(os.devnull, os.devnull, interleaved=True) # type: ignore error.match("interleaved") error.match("one file") @pytest.mark.parametrize( ["mode", "expected_class"], [("r", dnaio.PairedEndReader), ("w", dnaio.PairedEndWriter)], ) def test_paired_open_with_multiple_args( tmp_path, fileformat, mode, expected_class ) -> None: path = tmp_path / "file" path2 = tmp_path / "file2" path.touch() path2.touch() with dnaio.open(path, path2, fileformat=fileformat, mode=mode) as f: assert isinstance(f, expected_class) @pytest.mark.parametrize( ["kwargs", "expected_class"], [ ({}, dnaio.multipleend.MultipleFileReader), ({"mode": "w"}, dnaio.multipleend.MultipleFastqWriter), ({"mode": "w", "fileformat": "fastq"}, dnaio.multipleend.MultipleFastqWriter), ({"mode": "w", "fileformat": "fasta"}, dnaio.multipleend.MultipleFastaWriter), ], ) def test_multiple_open_fastq(kwargs, expected_class) -> None: with dnaio.open(os.devnull, os.devnull, os.devnull, **kwargs) as f: assert isinstance(f, expected_class) def test_deprecated_file1_file2_keyword_arguments(tmp_path): path = Path("tests/data/simple.fasta") expected = SIMPLE_RECORDS["fasta"] with dnaio.open(file1=path) as f: records = list(f) assert records == expected with dnaio.open(path, file2=path) as f: records = list(f) assert records == list(zip(expected, expected)) with dnaio.open(file1=path, file2=path) as f: records = list(f) assert records == list(zip(expected, expected)) def test_positional_with_file1(): with pytest.raises(ValueError) as error: with dnaio.open("in.fastq", file1="in2.fastq"): pass # pragma: no cover error.match("file1 keyword argument cannot be used together") def test_positional_with_file1_and_file2(): with pytest.raises(ValueError) as error: with dnaio.open("in.fastq", file1="in2.fastq", file2="in3.fastq"): pass # pragma: no cover error.match("cannot be used together") dnaio-1.2.0/tests/test_records.py000066400000000000000000000122351453560735400170370ustar00rootroot00000000000000import pytest from dnaio import SequenceRecord class TestSequenceRecord: def test_too_many_qualities(self): with pytest.raises(ValueError): SequenceRecord(name="name", sequence="ACGT", qualities="#####") def test_fastq_bytes(self): assert ( SequenceRecord("name", "ACGT", "====").fastq_bytes() == b"@name\nACGT\n+\n====\n" ) def 
test_fastq_bytes_two_headers(self): assert ( SequenceRecord("name", "ACGT", "====").fastq_bytes(two_headers=True) == b"@name\nACGT\n+name\n====\n" ) def test_is_mate_succes(self): assert SequenceRecord("name1", "A", "=").is_mate( SequenceRecord("name2", "GC", "FF") ) def test_reverse_complement(self): assert SequenceRecord( "name1", "ACGTUMRWSYKVHDBNacgtumrwsykvhdbn", "/AAAA/6E/EEEEEEEEEEEE/EEEEA///E/", ).reverse_complement() == SequenceRecord( "name1", "nvhdbmrswykaacgtNVHDBMRSWYKAACGT", "/E///AEEEE/EEEEEEEEEEEE/E6/AAAA/", ) def test_reverse_complement_none_qualities(self): assert SequenceRecord( "name1", "GATTACA", None ).reverse_complement() == SequenceRecord("name1", "TGTAATC", None) def test_init_name_bad(self): with pytest.raises(ValueError) as error: SequenceRecord("nąme1", "A", "=") error.match("ASCII") def test_init_name_none(self): with pytest.raises(TypeError) as error: SequenceRecord(None, "A", "=") error.match("str") def test_init_sequence_bad(self): with pytest.raises(ValueError) as error: SequenceRecord("name1", "Ä", "=") error.match("ASCII") def test_init_sequence_none(self): with pytest.raises(TypeError) as error: SequenceRecord("name1", None, "=") error.match("str") def test_init_qualities_bad(self): with pytest.raises(ValueError) as error: SequenceRecord("name1", "A", "ä") error.match("ASCII") def test_init_qualities_none(self): seq = SequenceRecord("name1", "A", None) assert seq.qualities is None def test_set_name_bad(self): seq = SequenceRecord("name1", "A", "=") with pytest.raises(ValueError) as error: seq.name = "näme1" error.match("ASCII") def test_set_name_none(self): seq = SequenceRecord("name1", "A", "=") with pytest.raises(TypeError) as error: seq.name = None error.match("str") def test_set_sequence_bad(self): seq = SequenceRecord("name1", "A", "=") with pytest.raises(ValueError) as error: seq.sequence = "Ä" error.match("ASCII") def test_set_sequence_none(self): seq = SequenceRecord("name1", "A", "=") with pytest.raises(TypeError) as error: seq.sequence = None error.match("str") def test_set_qualities_bad(self): seq = SequenceRecord("name1", "A", "=") with pytest.raises(ValueError) as error: seq.qualities = "Ä" error.match("ASCII") def test_set_qualities_none(self): seq = SequenceRecord("name1", "A", "=") seq.qualities = None assert seq.qualities is None def test_set_id(self): seq = SequenceRecord("name", "A", "=") with pytest.raises(AttributeError): seq.id = "Obi-Wan" def test_set_comment(self): seq = SequenceRecord("name", "A", "=") with pytest.raises(AttributeError): seq.comment = "Hello there!" @pytest.mark.parametrize( ["record", "expected"], [ (SequenceRecord("name", "A", "="), None), (SequenceRecord("name ", "A", "="), None), (SequenceRecord("name ", "A", "="), None), (SequenceRecord("name", "A", "="), None), (SequenceRecord("AotC I hate sand!", "A", "="), "I hate sand!"), ( SequenceRecord("Givemesome space", "A", "="), "space", ), ], ) def test_get_comment(self, record, expected): assert record.comment == expected @pytest.mark.parametrize( ["record", "expected"], [ (SequenceRecord("name", "A", "="), "name"), (SequenceRecord("name ", "A", "="), "name"), (SequenceRecord("name ", "A", "="), "name"), (SequenceRecord("name", "A", "="), "name"), (SequenceRecord("AotC I hate sand!", "A", "="), "AotC"), ], ) def test_get_id(self, record, expected): assert record.id == expected def test_reset_id_and_comment_on_name_update(self): record = SequenceRecord("Obi-Wan: don't try it!", "", "") assert record.id == "Obi-Wan:" assert record.comment == "don't try it!" 
record.name = "Anakin: you underestimate my power!" assert record.id == "Anakin:" assert record.comment == "you underestimate my power!" def test_legacy_sequence(): from dnaio import Sequence s = Sequence("name", "ACGT", "####") assert isinstance(s, SequenceRecord) dnaio-1.2.0/tests/test_util.py000066400000000000000000000003131453560735400163450ustar00rootroot00000000000000from dnaio._util import shorten def test_shorten(): assert shorten(None) is None assert shorten("hello too long", 5) == "he..." assert shorten("hello not too long") == "hello not too long" dnaio-1.2.0/tox.ini000066400000000000000000000022111453560735400141270ustar00rootroot00000000000000[tox] envlist = flake8,black,mypy,docs,py38,py39,py310,py311,py312 isolated_build = True [testenv] deps = pytest coverage commands = coverage run -m pytest coverage combine coverage xml coverage report setenv = PYTHONDEVMODE = 1 [testenv:flake8] basepython = python3.10 deps = flake8 commands = flake8 src/ tests/ [testenv:black] basepython = python3.10 deps = black==22.3.0 skip_install = true commands = black --check src/ tests/ doc/ helpers/ setup.py [testenv:mypy] basepython = python3.10 deps = mypy pytest commands = mypy src/ tests/ [testenv:asan] setenv= PYTHONDEVMODE=1 PYTHONMALLOC=malloc CFLAGS=-lasan -fsanitize=address -fno-omit-frame-pointer allowlist_externals=bash commands= bash -c 'export LD_PRELOAD=$(gcc -print-file-name=libasan.so) && printenv LD_PRELOAD && python -c "import dnaio" && pytest tests' [testenv:docs] basepython = python3.10 changedir = doc deps = -r doc/requirements.txt commands = sphinx-build -W -d {envtmpdir}/doctrees . {envtmpdir}/html [flake8] max-line-length = 99 max-complexity = 15 # E203 (whitespace before ':') must be ignored for Black extend-ignore = E203
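# ---------------------------------------------------------------------------
# Illustrative round-trip sketch (an appended example with hypothetical file
# names, not a file from the archive): the paired-end dnaio.open API as
# exercised by tests/test_open.py above.
import os
import tempfile

import dnaio


def roundtrip_paired(path1: str, path2: str) -> None:
    r1 = dnaio.SequenceRecord("s1/1", "ACGT", "HHHH")
    r2 = dnaio.SequenceRecord("s1/2", "TGCA", "5383")
    # Two positional paths select the two-file paired-end writer ...
    with dnaio.open(path1, path2, mode="w") as writer:
        writer.write(r1, r2)
    # ... and, for reading, the paired-end reader, which yields mate tuples.
    with dnaio.open(path1, path2) as reader:
        for rec1, rec2 in reader:
            assert rec1.is_mate(rec2)


with tempfile.TemporaryDirectory() as tmp:
    roundtrip_paired(os.path.join(tmp, "r.1.fastq"), os.path.join(tmp, "r.2.fastq"))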