pax_global_header00006660000000000000000000000064141742016340014514gustar00rootroot0000000000000052 comment=a868b91f17e3da3e86b3cdac237ecfa67afb3174 dnaio-0.7.1/000077500000000000000000000000001417420163400126135ustar00rootroot00000000000000dnaio-0.7.1/.codecov.yml000066400000000000000000000002461417420163400150400ustar00rootroot00000000000000comment: off codecov: require_ci_to_pass: no coverage: precision: 1 round: down range: "90...100" status: project: yes patch: no changes: no dnaio-0.7.1/.editorconfig000066400000000000000000000001451417420163400152700ustar00rootroot00000000000000[*.{py,pyx}] charset=utf-8 end_of_line=lf insert_final_newline=true indent_style=space indent_size=4 dnaio-0.7.1/.gitattributes000066400000000000000000000000341417420163400155030ustar00rootroot00000000000000*.fastq -crlf *.fasta -crlf dnaio-0.7.1/.github/000077500000000000000000000000001417420163400141535ustar00rootroot00000000000000dnaio-0.7.1/.github/workflows/000077500000000000000000000000001417420163400162105ustar00rootroot00000000000000dnaio-0.7.1/.github/workflows/ci.yml000066400000000000000000000055471417420163400173410ustar00rootroot00000000000000name: CI on: [push, pull_request] jobs: lint: timeout-minutes: 10 runs-on: ubuntu-latest strategy: matrix: python-version: [3.7] toxenv: [flake8, mypy] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install tox run: python -m pip install tox - name: Run tox ${{ matrix.toxenv }} run: tox -e ${{ matrix.toxenv }} build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 with: fetch-depth: 0 # required for setuptools_scm - name: Build sdist and temporary wheel run: pipx run build - uses: actions/upload-artifact@v2 with: name: sdist path: dist/*.tar.gz test: timeout-minutes: 10 runs-on: ${{ matrix.os }} strategy: matrix: os: [ubuntu-latest] python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"] include: - os: macos-latest python-version: 3.8 - os: windows-latest python-version: 3.8 steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install tox run: python -m pip install tox - name: Test run: tox -e py - name: Upload coverage report uses: codecov/codecov-action@v1 wheels: if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') needs: [lint, test] timeout-minutes: 15 strategy: matrix: os: [ubuntu-20.04, windows-2019] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 with: fetch-depth: 0 # required for setuptools_scm - name: Build wheels uses: pypa/cibuildwheel@v2.1.2 env: CIBW_BUILD: "cp*-manylinux_x86_64 cp3*-win_amd64" CIBW_ENVIRONMENT: "CFLAGS=-g0" CIBW_TEST_REQUIRES: "pytest" CIBW_TEST_COMMAND_LINUX: "cd {project} && pytest tests" CIBW_TEST_COMMAND_WINDOWS: "cd /d {project} && pytest tests" - uses: actions/upload-artifact@v2 with: name: wheels path: wheelhouse/*.whl publish: if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') needs: [build, wheels] runs-on: ubuntu-latest steps: - uses: actions/download-artifact@v2 with: name: sdist path: dist/ - uses: actions/download-artifact@v2 with: name: wheels path: dist/ - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@v1.4.2 with: user: __token__ password: ${{ secrets.pypi_password }} #password: ${{ secrets.test_pypi_password }} #repository_url: https://test.pypi.org/legacy/ dnaio-0.7.1/.gitignore000066400000000000000000000002141417420163400146000ustar00rootroot00000000000000__pycache__ /.cache/ /venv/ /build/ /.pytest_cache/ /MANIFEST /dist/ /src/*/_*.c /src/*/*.so /src/*.egg-info/ /.tox/ /src/dnaio/_version.py dnaio-0.7.1/CHANGES.rst000066400000000000000000000012721417420163400144170ustar00rootroot00000000000000========= Changelog ========= v0.7.1 (2022-01-26) ------------------- * PR #34: Fix parsing of FASTA files that just contain a comment and no reads v0.7.0 (2022-01-17) ------------------- * @rhpvorderman contributed many performance improvements in PR #15, #17, #18, #20, #21, #22, #23. Reading and writing FASTQ files and reading of paired-end FASTQ files was sped up significantly. For example, reading uncompressed FASTQ is 50% faster (!) than before. * PR #28: Windows support added v0.6.0 (2021-09-28) ------------------- * PR #12: Improve FASTQ writing speed twofold (thanks to @rhpvorderman) v0.5.2 (2021-09-07) ------------------- * Issue #7: Ignore a trailing "3" in the read id dnaio-0.7.1/LICENSE000066400000000000000000000021041417420163400136150ustar00rootroot00000000000000Copyright (c) 2010-2018 Marcel Martin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. dnaio-0.7.1/MANIFEST.in000066400000000000000000000001261417420163400143500ustar00rootroot00000000000000include README.md include LICENSE include src/dnaio/*.c include src/dnaio/_version.py dnaio-0.7.1/NOTES.md000066400000000000000000000030141417420163400140230ustar00rootroot00000000000000- compressed - paired-end - interleaved - chunked - FASTA, FASTQ - BAM? import dnaio with dnaio.open('input.fastq.gz') as f: for record in f: print(record....) with dnaio.open('input.1.fastq.gz', 'input.2.fastq.gz') as f: for record in f: print(record....) Use cases - open FASTQ from path - open FASTA from path - open compressed FASTA or FASTQ (.gz, .bz2, .xz) - open paired-end data - open interleaved data - open file-like object (such as sys.stdin) - use custom sequence record class - autodetect file format from contents - write FASTQ - write FASTA - read FASTQ/FASTA chunks (multiple records) Issues - Binary vs text - Should SequenceRecord be immutable? TODO - Sequence.name should be Sequence.description or so (reserve .name for the part before the first space) - optimize writing - Documentation - Line endings - second header FASTQ chunks - need an index attribute - need a line_number attribute # API ## Advertised - dnaio.open - Sequence(Record) - possibly SequencePair/PairedSequence? ## Reader - FastqReader - FastaReader - PairedSequenceReader -> rename to PairedFastqReader? - InterleavedSequenceReader -> rename to InterleavedFastqReader ## Writing class FastqWriter class FastaWriter class PairedSequenceWriter class InterleavedSequenceWriter ## Chunking def find_fasta_record_end(buf, end): def find_fastq_record_end(buf, end=None): def read_chunks_from_file(f, buffer_size=4*1024**2): def read_paired_chunks(f, f2, buffer_size=4*1024**2): head fastq_head two_fastq_heads dnaio-0.7.1/README.md000066400000000000000000000027151417420163400140770ustar00rootroot00000000000000![CI](https://github.com/marcelm/dnaio/workflows/CI/badge.svg) [![PyPI](https://img.shields.io/pypi/v/dnaio.svg?branch=master)](https://pypi.python.org/pypi/dnaio) [![Codecov](https://codecov.io/gh/marcelm/dnaio/branch/master/graph/badge.svg)](https://codecov.io/gh/marcelm/dnaio) # dnaio parses FASTQ and FASTA `dnaio` is a Python 3.6+ library for fast parsing of FASTQ and also FASTA files. The code was previously part of the [Cutadapt](https://cutadapt.readthedocs.io/) tool and has been improved since it has been split out. ## Example usage The main interface is the `dnaio.open` function: import dnaio with dnaio.open('reads.fastq.gz') as f: bp = 0 for record in f: bp += len(record) print(f'The input file contains {bp/1E6:.1f} Mbp') ## Features and supported file types - FASTQ input and output - FASTA input and output - Compressed input and output (`.gz`, `.bz2` and `.xz`, detected automatically) - Paired-end data in two files - Interleaved paired-end data in a single file - Files with DOS/Windows linebreaks can be read - FASTQ files with a second header line (after the `+`) are supported # Limitations - Multi-line FASTQ files are not supported. - FASTQ parsing is the focus of this library. The FASTA parser is not as optimized. # Links * [Source code](https://github.com/marcelm/dnaio/) * [Report an issue](https://github.com/marcelm/dnaio/issues) * [Project page on PyPI](https://pypi.python.org/pypi/dnaio/) dnaio-0.7.1/pyproject.toml000066400000000000000000000003501417420163400155250ustar00rootroot00000000000000[build-system] requires = ["setuptools >= 45", "wheel", "setuptools_scm >= 6.2", "Cython >= 0.29.20"] build-backend = "setuptools.build_meta" [black.tool] line-length = 100 [tool.setuptools_scm] write_to = "src/dnaio/_version.py" dnaio-0.7.1/setup.cfg000066400000000000000000000014551417420163400144410ustar00rootroot00000000000000[metadata] name = dnaio author = Marcel Martin author_email = marcel.martin@scilifelab.se url = https://github.com/marcelm/dnaio/ description = Read and write FASTA and FASTQ files efficiently long_description = file: README.md long_description_content_type = text/markdown license = MIT classifiers = Development Status :: 5 - Production/Stable Intended Audience :: Science/Research License :: OSI Approved :: MIT License Programming Language :: Cython Programming Language :: Python :: 3 Topic :: Scientific/Engineering :: Bio-Informatics [options] python_requires = >=3.6 package_dir = =src packages = find: install_requires = xopen >= 1.4.0 [options.packages.find] where = src [options.package_data] * = py.typed, *.pyi [options.extras_require] dev = Cython pytest dnaio-0.7.1/setup.py000066400000000000000000000004111417420163400143210ustar00rootroot00000000000000from setuptools import setup, Extension from Cython.Build import cythonize import setuptools_scm # noqa Ensure it’s installed setup( ext_modules=cythonize( [ Extension("dnaio._core", sources=["src/dnaio/_core.pyx"]), ] ), ) dnaio-0.7.1/src/000077500000000000000000000000001417420163400134025ustar00rootroot00000000000000dnaio-0.7.1/src/dnaio/000077500000000000000000000000001417420163400144745ustar00rootroot00000000000000dnaio-0.7.1/src/dnaio/__init__.py000066400000000000000000000103531417420163400166070ustar00rootroot00000000000000""" Sequence I/O: Read and write FASTA and FASTQ files efficiently """ __all__ = [ 'open', 'Sequence', 'SingleEndReader', 'PairedEndReader', 'SingleEndWriter', 'PairedEndWriter', 'FastaReader', 'FastaWriter', 'FastqReader', 'FastqWriter', 'UnknownFileFormat', 'FileFormatError', 'FastaFormatError', 'FastqFormatError', 'InterleavedPairedEndReader', 'InterleavedPairedEndWriter', 'TwoFilePairedEndReader', 'TwoFilePairedEndWriter', 'read_chunks', 'read_paired_chunks', 'record_names_match', '__version__', ] from os import PathLike from typing import Optional, Union, BinaryIO from xopen import xopen from ._core import Sequence, record_names_match from .readers import FastaReader, FastqReader from .writers import FastaWriter, FastqWriter from .singleend import _open_single from .pairedend import ( TwoFilePairedEndReader, TwoFilePairedEndWriter, InterleavedPairedEndReader, InterleavedPairedEndWriter, ) from .exceptions import UnknownFileFormat, FileFormatError, FastaFormatError, FastqFormatError from .interfaces import SingleEndReader, PairedEndReader, SingleEndWriter, PairedEndWriter from .chunks import read_chunks, read_paired_chunks from ._version import version as __version__ def open( file1: Union[str, PathLike, BinaryIO], *, file2: Optional[Union[str, PathLike, BinaryIO]] = None, fileformat: Optional[str] = None, interleaved: bool = False, mode: str = "r", qualities: Optional[bool] = None, opener=xopen ) -> Union[ SingleEndReader, PairedEndReader, SingleEndWriter, PairedEndWriter, ]: """ Open sequence files in FASTA or FASTQ format for reading or writing. Parameters: file1: file2: Paths to regular or compressed files or file-like objects (as str or as pathlib.Path). Use only file1 if data is single-end. If sequences are paired, use also file2. mode: Either ``'r'`` for reading, ``'w'`` for writing or ``'a'`` for appending. interleaved: If True, then file1 contains interleaved paired-end data. file2 must be None in this case. fileformat: If *None*, the file format is autodetected from the file name extension. Set to ``'fasta'`` or ``'fastq'`` to not auto-detect. qualities: When mode is ``'w'`` and fileformat is *None*, this can be set to *True* or *False* to specify whether the written sequences will have quality values. This is is used in two ways: - If the output format cannot be determined (unrecognized extension etc), no exception is raised, but fasta or fastq format is chosen appropriately. - When False (no qualities available), an exception is raised when the auto-detected output format is FASTQ. opener: A function that is used to open file1 and file2 if they are not already open file-like objects. By default, ``xopen`` is used, which can also open compressed file formats. Return: An instance of one of the ...Reader or ...Writer classes """ if mode not in ("r", "w", "a"): raise ValueError("Mode must be 'r', 'w' or 'a'") if interleaved and file2 is not None: raise ValueError("When interleaved is set, file2 must be None") if file2 is not None: if mode in "wa" and file1 == file2: raise ValueError("The paired-end output files are identical") if mode == "r": return TwoFilePairedEndReader(file1, file2, fileformat=fileformat, opener=opener) append = mode == "a" return TwoFilePairedEndWriter( file1, file2, fileformat=fileformat, qualities=qualities, opener=opener, append=append ) if interleaved: if mode == "r": return InterleavedPairedEndReader(file1, fileformat=fileformat, opener=opener) append = mode == "a" return InterleavedPairedEndWriter( file1, fileformat=fileformat, qualities=qualities, opener=opener, append=append) # The multi-file options have been dealt with, delegate rest to the # single-file function. return _open_single( file1, opener=opener, fileformat=fileformat, mode=mode, qualities=qualities) dnaio-0.7.1/src/dnaio/_core.pyi000066400000000000000000000016271417420163400163140ustar00rootroot00000000000000from typing import Optional, Tuple, Union, BinaryIO, Iterator class Sequence: name: str sequence: str qualities: Optional[str] def __init__(self, name: str, sequence: str, qualities: Optional[str] = ...) -> None: ... def __getitem__(self, s: slice) -> Sequence: ... def __repr__(self) -> str: ... def __len__(self) -> int: ... def __richcmp__(self, other: Sequence, op: int) -> bool: ... def qualities_as_bytes(self) -> bytes: ... def fastq_bytes(self) -> bytes: ... def fastq_bytes_two_headers(self) -> bytes: ... def paired_fastq_heads(buf1: Union[bytes,bytearray], buf2: Union[bytes,bytearray], end1: int, end2: int) -> Tuple[int, int]: ... # TODO Sequence should be sequence_class, first yielded value is a bool def fastq_iter(file: BinaryIO, sequence_class, buffer_size: int) -> Iterator[Sequence]: ... def record_names_match(header1: str, header2: str) -> bool: ... dnaio-0.7.1/src/dnaio/_core.pyx000066400000000000000000000467621417420163400163440ustar00rootroot00000000000000# cython: language_level=3, emit_code_comments=False from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_AS_STRING, PyBytes_GET_SIZE from cpython.unicode cimport PyUnicode_DecodeLatin1 from libc.string cimport strncmp, memcmp, memcpy, memchr, strcspn from cpython.unicode cimport PyUnicode_GET_LENGTH cimport cython cdef extern from *: unsigned char * PyUnicode_1BYTE_DATA(object o) int PyUnicode_KIND(object o) int PyUnicode_1BYTE_KIND from .exceptions import FastqFormatError from ._util import shorten cdef class Sequence: """ A sequencing read with read name/id and (optional) qualities If qualities are available, they are as For a Sequence a FASTA file record containing a read in a FASTA or FASTQ file. For FASTA, the qualities attribute is None. For FASTQ, qualities is a string and it contains the qualities encoded as ASCII(qual+33). Attributes: name (str): The read description sequence (str): qualities (str): """ cdef: public str name public str sequence public str qualities def __cinit__(self, str name, str sequence, str qualities=None): """Set qualities to None if there are no quality values""" self.name = name self.sequence = sequence self.qualities = qualities def __init__(self, str name, str sequence, str qualities = None): # __cinit__ is called first and sets all the variables. if qualities is not None and len(qualities) != len(sequence): rname = shorten(name) raise ValueError("In read named {!r}: length of quality sequence " "({}) and length of read ({}) do not match".format( rname, len(qualities), len(sequence))) def __getitem__(self, key): """ Slice this Sequence. If the qualities attribute is not None, it is sliced accordingly. The read name is copied unchanged. Returns: A new Sequence object with a sliced sequence. """ return self.__class__( self.name, self.sequence[key], self.qualities[key] if self.qualities is not None else None) def __repr__(self): qstr = '' if self.qualities is not None: qstr = ', qualities={!r}'.format(shorten(self.qualities)) return ''.format( shorten(self.name), shorten(self.sequence), qstr) def __len__(self): """ Returns: The number of characters in this sequence """ return len(self.sequence) def __richcmp__(self, other, int op): if 2 <= op <= 3: eq = self.name == other.name and \ self.sequence == other.sequence and \ self.qualities == other.qualities if op == 2: return eq else: return not eq else: raise NotImplementedError() def __reduce__(self): return (Sequence, (self.name, self.sequence, self.qualities)) def qualities_as_bytes(self): """Return the qualities as a bytes object. This is a faster version of qualities.encode('ascii').""" return self.qualities.encode('ascii') def fastq_bytes(self, two_headers = False): """Return the entire FASTQ record as bytes which can be written into a file. Optionally the header (after the @) can be repeated on the third line (after the +), when two_headers is enabled.""" cdef: char * name char * sequence char * qualities Py_ssize_t name_length Py_ssize_t sequence_length Py_ssize_t qualities_length if PyUnicode_KIND(self.name) == PyUnicode_1BYTE_KIND: name = PyUnicode_1BYTE_DATA(self.name) name_length = PyUnicode_GET_LENGTH(self.name) else: # Allow non-ASCII in name name_bytes = self.name.encode('latin-1') name = PyBytes_AS_STRING(name_bytes) name_length = PyBytes_GET_SIZE(name_bytes) if PyUnicode_KIND(self.sequence) == PyUnicode_1BYTE_KIND: sequence = PyUnicode_1BYTE_DATA(self.sequence) sequence_length = PyUnicode_GET_LENGTH(self.sequence) else: # Don't allow non-ASCII in sequence and qualities sequence_bytes = self.sequence.encode('ascii') sequence = PyBytes_AS_STRING(sequence_bytes) sequence_length = PyBytes_GET_SIZE(sequence_bytes) if PyUnicode_KIND(self.qualities) == PyUnicode_1BYTE_KIND: qualities = PyUnicode_1BYTE_DATA(self.qualities) qualities_length = PyUnicode_GET_LENGTH(self.qualities) else: qualities_bytes = self.qualities.encode('ascii') qualities = PyBytes_AS_STRING(qualities_bytes) qualities_length = PyBytes_GET_SIZE(qualities_bytes) return create_fastq_record(name, sequence, qualities, name_length, sequence_length, qualities_length, two_headers) def fastq_bytes_two_headers(self): """ Return this record in FASTQ format as a bytes object where the header (after the @) is repeated on the third line. """ return self.fastq_bytes(two_headers=True) cdef bytes create_fastq_record(char * name, char * sequence, char * qualities, Py_ssize_t name_length, Py_ssize_t sequence_length, Py_ssize_t qualities_length, bint two_headers = False): # Total size is name + sequence + qualities + 4 newlines + '+' and an # '@' to be put in front of the name. cdef Py_ssize_t total_size = name_length + sequence_length + qualities_length + 6 if two_headers: # We need space for the name after the +. total_size += name_length # This is the canonical way to create an uninitialized bytestring of given size cdef bytes retval = PyBytes_FromStringAndSize(NULL, total_size) cdef char * retval_ptr = PyBytes_AS_STRING(retval) # Write the sequences into the bytestring at the correct positions. cdef size_t cursor retval_ptr[0] = b"@" memcpy(retval_ptr + 1, name, name_length) cursor = name_length + 1 retval_ptr[cursor] = b"\n"; cursor += 1 memcpy(retval_ptr + cursor, sequence, sequence_length) cursor += sequence_length retval_ptr[cursor] = b"\n"; cursor += 1 retval_ptr[cursor] = b"+"; cursor += 1 if two_headers: memcpy(retval_ptr + cursor, name, name_length) cursor += name_length retval_ptr[cursor] = b"\n"; cursor += 1 memcpy(retval_ptr + cursor, qualities, qualities_length) cursor += qualities_length retval_ptr[cursor] = b"\n" return retval # It would be nice to be able to have the first parameter be an # unsigned char[:] (memory view), but this fails with a BufferError # when a bytes object is passed in. # See ctypedef fused bytes_or_bytearray: bytes bytearray def paired_fastq_heads(bytes_or_bytearray buf1, bytes_or_bytearray buf2, Py_ssize_t end1, Py_ssize_t end2): """ Skip forward in the two buffers by multiples of four lines. Return a tuple (length1, length2) such that buf1[:length1] and buf2[:length2] contain the same number of lines (where the line number is divisible by four). """ cdef: Py_ssize_t pos1 = 0, pos2 = 0 Py_ssize_t linebreaks = 0 unsigned char* data1 = buf1 unsigned char* data2 = buf2 Py_ssize_t record_start1 = 0 Py_ssize_t record_start2 = 0 while True: while pos1 < end1 and data1[pos1] != b'\n': pos1 += 1 if pos1 == end1: break pos1 += 1 while pos2 < end2 and data2[pos2] != b'\n': pos2 += 1 if pos2 == end2: break pos2 += 1 linebreaks += 1 if linebreaks == 4: linebreaks = 0 record_start1 = pos1 record_start2 = pos2 # Hit the end of the data block return record_start1, record_start2 def fastq_iter(file, sequence_class, Py_ssize_t buffer_size): """ Parse a FASTQ file and yield Sequence objects The *first value* that the generator yields is a boolean indicating whether the first record in the FASTQ has a repeated header (in the third row after the ``+``). file -- a file-like object, opened in binary mode (it must have a readinto method) buffer_size -- size of the initial buffer. This is automatically grown if a FASTQ record is encountered that does not fit. """ cdef: bytearray buf = bytearray(buffer_size) char[:] buf_view = buf char* c_buf = buf str name str sequence str qualities Py_ssize_t last_read_position = 0 Py_ssize_t record_start = 0 Py_ssize_t bufstart, bufend, name_start, name_end, name_length Py_ssize_t sequence_start, sequence_end, sequence_length Py_ssize_t second_header_start, second_header_end, second_header_length Py_ssize_t qualities_start, qualities_end, qualities_length char *name_end_ptr char *sequence_end_ptr char *second_header_end_ptr char *qualities_end_ptr bint custom_class = sequence_class is not Sequence Py_ssize_t n_records = 0 bint extra_newline = False if buffer_size < 1: raise ValueError("Starting buffer size too small") # buf is a byte buffer that is re-used in each iteration. Its layout is: # # |-- complete records --| # +---+------------------+---------+-------+ # | | | | | # +---+------------------+---------+-------+ # ^ ^ ^ ^ ^ # 0 bufstart end bufend len(buf) # # buf[0:bufstart] is the 'leftover' data that could not be processed # in the previous iteration because it contained an incomplete # FASTQ record. readinto = file.readinto bufstart = 0 # The input file is processed in chunks that each fit into buf while True: assert bufstart < len(buf_view) bufend = readinto(buf_view[bufstart:]) + bufstart if bufstart == bufend: # End of file if bufstart > 0 and buf_view[bufstart-1] != b'\n': # There is still data in the buffer and its last character is # not a newline: This is a file that is missing the final # newline. Append a newline and continue. buf_view[bufstart] = b'\n' bufstart += 1 bufend += 1 extra_newline = True elif last_read_position > record_start: # Incomplete FASTQ records are present. if extra_newline: # Do not report the linefeed that was added by dnaio but # was not present in the original input. last_read_position -= 1 lines = buf[record_start:last_read_position].count(b'\n') raise FastqFormatError( 'Premature end of file encountered. The incomplete final record was: ' '{!r}'.format( shorten(buf[record_start:last_read_position].decode('latin-1'), 500)), line=n_records * 4 + lines) else: # EOF Reached. Stop iterating. return # Parse all complete FASTQ records in this chunk record_start = 0 while True: ### Check for a complete record (i.e 4 newlines are present) # Use libc memchr, this optimizes looking for characters by # using 64-bit integers. See: # https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=string/memchr.c;hb=HEAD # void *memchr(const void *str, int c, size_t n) name_end_ptr = memchr(c_buf + record_start, b'\n', (bufend - record_start)) if name_end_ptr == NULL: break # bufend - sequence_start is always nonnegative: # - name_end is at most bufend - 1 # - thus sequence_start is at most bufend name_end = name_end_ptr - c_buf sequence_start = name_end + 1 sequence_end_ptr = memchr(c_buf + sequence_start, b'\n', (bufend - sequence_start)) if sequence_end_ptr == NULL: break sequence_end = sequence_end_ptr - c_buf second_header_start = sequence_end + 1 second_header_end_ptr = memchr(c_buf + second_header_start, b'\n', (bufend - second_header_start)) if second_header_end_ptr == NULL: break second_header_end = second_header_end_ptr - c_buf qualities_start = second_header_end + 1 qualities_end_ptr = memchr(c_buf + qualities_start, b'\n', (bufend - qualities_start)) if qualities_end_ptr == NULL: break qualities_end = qualities_end_ptr - c_buf if c_buf[record_start] != b'@': raise FastqFormatError("Line expected to " "start with '@', but found {!r}".format(chr(c_buf[record_start])), line=n_records * 4) if c_buf[second_header_start] != b'+': raise FastqFormatError("Line expected to " "start with '+', but found {!r}".format(chr(c_buf[second_header_start])), line=n_records * 4 + 2) name_start = record_start + 1 # Skip @ second_header_start += 1 # Skip + name_length = name_end - name_start sequence_length = sequence_end - sequence_start second_header_length = second_header_end - second_header_start qualities_length = qualities_end - qualities_start # Check for \r\n line-endings and compensate if c_buf[name_end - 1] == b'\r': name_length -= 1 if c_buf[sequence_end - 1] == b'\r': sequence_length -= 1 if c_buf[second_header_end - 1] == b'\r': second_header_length -= 1 if c_buf[qualities_end - 1] == b'\r': qualities_length -= 1 if second_header_length: # should be 0 when only + is present if (name_length != second_header_length or strncmp(c_buf+second_header_start, c_buf + name_start, second_header_length) != 0): raise FastqFormatError( "Sequence descriptions don't match ('{}' != '{}').\n" "The second sequence description must be either " "empty or equal to the first description.".format( c_buf[name_start:name_end].decode('latin-1'), c_buf[second_header_start:second_header_end] .decode('latin-1')), line=n_records * 4 + 2) if qualities_length != sequence_length: raise FastqFormatError( "Length of sequence and qualities differ", line=n_records * 4 + 3) ### Copy record into python variables # PyUnicode_DecodeLatin1 is 50% faster than PyUnicode_DecodeASCII. # This is because PyUnicode_DecodeLatin1 is an alias for # _PyUnicode_FromUCS1. Which directly copies the bytes into a # string object after some checks. With PyUnicode_DecodeASCII, # there is an extra check whether characters exceed 128. name = PyUnicode_DecodeLatin1(c_buf + name_start, name_length, 'strict') sequence = PyUnicode_DecodeLatin1(c_buf + sequence_start, sequence_length, 'strict') qualities = PyUnicode_DecodeLatin1(c_buf + qualities_start, qualities_length, 'strict') if n_records == 0: yield bool(second_header_length) # first yielded value is special if custom_class: yield sequence_class(name, sequence, qualities) else: yield Sequence.__new__(Sequence, name, sequence, qualities) ### Advance record to next position n_records += 1 record_start = qualities_end + 1 # bufend reached last_read_position = bufend if record_start == 0 and bufend == len(buf): # buffer too small, double it buffer_size *= 2 prev_buf = buf buf = bytearray(buffer_size) buf[0:bufend] = prev_buf del prev_buf bufstart = bufend buf_view = buf c_buf = buf else: bufstart = bufend - record_start buf[0:bufstart] = buf[record_start:bufend] def record_names_match(header1: str, header2: str): """ Check whether the sequence record ids id1 and id2 are compatible, ignoring a suffix of '1', '2' or '3'. This exception allows to check some old paired-end reads that have IDs ending in '/1' and '/2'. Also, the fastq-dump tool (used for converting SRA files to FASTQ) appends '.1', '.2' and sometimes '.3' to paired-end reads if option -I is used. """ if ( PyUnicode_KIND(header1) != PyUnicode_1BYTE_KIND or PyUnicode_KIND(header2) != PyUnicode_1BYTE_KIND ): # Fall back to slower code path. name1 = header1.split(maxsplit=1)[0] name2 = header2.split(maxsplit=1)[0] if name1 and name2 and name1[-1] in '123' and name2[-1] in '123': return name1[:-1] == name2[:-1] return name1 == name2 # Do not call .encode functions but use the unicode pointer inside the # python object directly, provided it is in 1-byte encoding, so we can # find the spaces and tabs easily. cdef char * header1_chars = PyUnicode_1BYTE_DATA(header1) cdef char * header2_chars = PyUnicode_1BYTE_DATA(header2) cdef size_t header1_length = PyUnicode_GET_LENGTH(header1) return record_ids_match(header1_chars, header2_chars, header1_length) cdef bint record_ids_match(char *header1, char *header2, size_t header1_length): """ Check whether the ASCII-encoded IDs match. Only header1_length is needed. """ # Only the read ID is of interest. # Find the first tab or space, if not present, strcspn will return the # position of the terminating NULL byte. (I.e. the length). # Header1 is not searched because we can reuse the end of ID position of # header2 as header1's ID should end at the same position. cdef size_t id2_length = strcspn(header2, b' \t') if header1_length < id2_length: return False cdef char end = header1[id2_length] if end != b'\000' and end != b' ' and end != b'\t': return False # Check if the IDs end with 1, 2 or 3. This is the read pair number # which should not be included in the comparison. cdef bint id1endswithnumber = b'1' <= header1[id2_length - 1] <= b'3' cdef bint id2endswithnumber = b'1' <= header2[id2_length - 1] <= b'3' if id1endswithnumber and id2endswithnumber: id2_length -= 1 # Compare the strings up to the ID end position. return memcmp(header1, header2, id2_length) == 0 dnaio-0.7.1/src/dnaio/_util.py000066400000000000000000000014511417420163400161630ustar00rootroot00000000000000import pathlib def _is_path(obj: object) -> bool: """ Return whether the given object looks like a path (str, pathlib.Path or pathlib2.Path) """ # TODO # pytest uses pathlib2.Path objects on Python 3.5 for its tmp_path fixture. # On Python 3.6+, this function can be replaced with isinstance(obj, os.PathLike) import sys if "pathlib2" in sys.modules: import pathlib2 # type: ignore path_classes = [str, pathlib.Path, pathlib2.Path] else: path_classes = [str, pathlib.Path] return isinstance(obj, tuple(path_classes)) def shorten(s: str, n: int = 100) -> str: """Shorten string s to at most n characters, appending "..." if necessary.""" if s is None: return None if len(s) > n: s = s[:n-3] + '...' return s dnaio-0.7.1/src/dnaio/_version.pyi000066400000000000000000000002241417420163400170410ustar00rootroot00000000000000# The _version.py file is generated on installation. By including this stub, # we can run mypy without having to install the package. version: str dnaio-0.7.1/src/dnaio/chunks.py000066400000000000000000000116461417420163400163510ustar00rootroot00000000000000"""Chunked reading of FASTA and FASTQ files""" from io import RawIOBase from typing import Optional, Iterator, Tuple from ._core import paired_fastq_heads as _paired_fastq_heads from .exceptions import FileFormatError, FastaFormatError, UnknownFileFormat def _fasta_head(buf: bytes, end: Optional[int] = None) -> int: """ Search for the end of the last complete FASTA record within buf[:end] Return an integer length such that buf[:length] contains the highest possible number of complete FASTA records. """ pos = buf.rfind(b'\n>', 0, end) if pos != -1: return pos + 1 if buf[0:1] == b'>' or buf[0:1] == b'#': return 0 if len(buf) == 0: return 0 c = chr(buf[0]) raise FastaFormatError( f"FASTA file expected to start with '>', but found {repr(c)}", line=None, ) def _fastq_head(buf: bytes, end: Optional[int] = None) -> int: """ Search for the end of the last complete *two* FASTQ records in buf[:end]. Two FASTQ records are required to ensure that read pairs in interleaved paired-end data are not split. """ linebreaks = buf.count(b'\n', 0, end) right = end for _ in range(linebreaks % 8 + 1): right = buf.rfind(b'\n', 0, right) # Note that this works even if linebreaks == 0: # rfind() returns -1 and adding 1 gives index 0, # which is correct. return right + 1 # type: ignore def read_chunks(f: RawIOBase, buffer_size: int = 4 * 1024**2) -> Iterator[memoryview]: """ Read a chunk of complete FASTA or FASTQ records from a file. The size of a chunk is at most buffer_size. f needs to be a file opened in binary mode. The yielded memoryview objects become invalid on the next iteration. """ # This buffer is re-used in each iteration. buf = bytearray(buffer_size) # Read one byte to determine file format. # If there is a comment char, we assume FASTA! start = f.readinto(memoryview(buf)[0:1]) if start == 0: # Empty file return assert start == 1 if buf[0:1] == b'@': head = _fastq_head elif buf[0:1] == b'#' or buf[0:1] == b'>': head = _fasta_head else: raise UnknownFileFormat( f"Cannnot determine input file format: First character expected to be '>' or '@', " f"but found {repr(chr(buf[0]))}" ) # Layout of buf # # |-- complete records --| # +---+------------------+---------+-------+ # | | | | | # +---+------------------+---------+-------+ # ^ ^ ^ ^ ^ # 0 start end bufend len(buf) # # buf[0:start] is the 'leftover' data that could not be processed # in the previous iteration because it contained an incomplete # FASTA or FASTQ record. while True: if start == len(buf): raise OverflowError('FASTA/FASTQ record does not fit into buffer') bufend = f.readinto(memoryview(buf)[start:]) + start # type: ignore if start == bufend: # End of file break end = head(buf, bufend) assert end <= bufend if end > 0: yield memoryview(buf)[0:end] start = bufend - end assert start >= 0 buf[0:start] = buf[end:bufend] if start > 0: yield memoryview(buf)[0:start] def read_paired_chunks( f: RawIOBase, f2: RawIOBase, buffer_size: int = 4 * 1024**2, ) -> Iterator[Tuple[memoryview, memoryview]]: if buffer_size < 1: raise ValueError("Buffer size too small") buf1 = bytearray(buffer_size) buf2 = bytearray(buffer_size) # Read one byte to make sure we are processing FASTQ start1 = f.readinto(memoryview(buf1)[0:1]) # type: ignore start2 = f2.readinto(memoryview(buf2)[0:1]) # type: ignore if (start1 == 1 and buf1[0:1] != b'@') or (start2 == 1 and buf2[0:1] != b'@'): raise FileFormatError( "Paired-end data must be in FASTQ format when using multiple cores", line=None) while True: if start1 == len(buf1) or start2 == len(buf2): raise ValueError("FASTQ record does not fit into buffer") bufend1 = f.readinto(memoryview(buf1)[start1:]) + start1 # type: ignore bufend2 = f2.readinto(memoryview(buf2)[start2:]) + start2 # type: ignore if start1 == bufend1 and start2 == bufend2: break end1, end2 = _paired_fastq_heads(buf1, buf2, bufend1, bufend2) assert end1 <= bufend1 assert end2 <= bufend2 if end1 > 0 or end2 > 0: yield (memoryview(buf1)[0:end1], memoryview(buf2)[0:end2]) start1 = bufend1 - end1 assert start1 >= 0 buf1[0:start1] = buf1[end1:bufend1] start2 = bufend2 - end2 assert start2 >= 0 buf2[0:start2] = buf2[end2:bufend2] if start1 > 0 or start2 > 0: yield (memoryview(buf1)[0:start1], memoryview(buf2)[0:start2]) dnaio-0.7.1/src/dnaio/exceptions.py000066400000000000000000000016031417420163400172270ustar00rootroot00000000000000from typing import Optional class FileFormatError(Exception): """ The file is not formatted correctly """ format = 'sequence' # Something generic that works for both FASTA and FASTQ def __init__(self, msg: str, line: Optional[int]): super().__init__(msg, line) self.message = msg self.line = line # starts at 0! def __str__(self): line = "unknown line" if self.line is None else f"line {self.line + 1}" return f"Error in {self.format} file at {line}: {self.message}" class FastqFormatError(FileFormatError): """ The FASTQ file is not formatted correctly """ format = "FASTQ" class FastaFormatError(FileFormatError): """ The FASTA file is not formatted correctly """ format = "FASTA" class UnknownFileFormat(Exception): """ The file format could not be automatically detected """ dnaio-0.7.1/src/dnaio/interfaces.py000066400000000000000000000010761417420163400171750ustar00rootroot00000000000000from abc import ABC, abstractmethod from typing import Iterator, Tuple from dnaio import Sequence class SingleEndReader(ABC): @abstractmethod def __iter__(self) -> Iterator[Sequence]: pass class PairedEndReader(ABC): @abstractmethod def __iter__(self) -> Iterator[Tuple[Sequence, Sequence]]: pass class SingleEndWriter(ABC): @abstractmethod def write(self, record: Sequence) -> None: pass class PairedEndWriter(ABC): @abstractmethod def write(self, record1: Sequence, record2: Sequence) -> None: pass dnaio-0.7.1/src/dnaio/pairedend.py000066400000000000000000000164371417420163400170140ustar00rootroot00000000000000import itertools from contextlib import ExitStack from os import PathLike from typing import Union, BinaryIO, Optional, Iterator, Tuple from xopen import xopen from ._core import Sequence, record_names_match from .exceptions import FileFormatError from .interfaces import PairedEndReader, PairedEndWriter from .readers import FastaReader, FastqReader from .writers import FastaWriter, FastqWriter from .singleend import _open_single class TwoFilePairedEndReader(PairedEndReader): """ Read paired-end reads from two files. Wraps two BinaryFileReader instances, making sure that reads are properly paired. """ paired = True def __init__( self, file1: Union[str, PathLike, BinaryIO], file2: Union[str, PathLike, BinaryIO], *, fileformat: Optional[str] = None, opener=xopen, ): with ExitStack() as stack: self.reader1 = stack.enter_context( _open_single(file1, opener=opener, fileformat=fileformat) ) self.reader2 = stack.enter_context( _open_single(file2, opener=opener, fileformat=fileformat) ) self._close = stack.pop_all().close self.delivers_qualities = self.reader1.delivers_qualities def __repr__(self) -> str: return f"{self.__class__.__name__}(file1={self.reader1}, file2={self.reader2})" def __iter__(self) -> Iterator[Tuple[Sequence, Sequence]]: """ Iterate over the paired reads. Each item is a pair of Sequence objects. """ # Avoid usage of zip() below since it will consume one item too many, # when one of the iterators is exhausted. zip in python 3.10 has a # 'strict' keyword that can be used to prevent this and throw an error, # but it will take a long time for 3.10 or higher to be available on # everyone's machine. # Instead use zip_longest from itertools. This yields None if one of # the iterators is exhausted. Checking for None identity is fast. # So we can quickly check if the iterator is still yielding. # This is faster than implementing a while loop with next calls, # which requires expensive function lookups. for r1, r2 in itertools.zip_longest(self.reader1, self.reader2): if r1 is None: raise FileFormatError( "Reads are improperly paired. There are more reads in " "file 2 than in file 1.", line=None, ) from None if r2 is None: raise FileFormatError( "Reads are improperly paired. There are more reads in " "file 1 than in file 2.", line=None, ) from None if not record_names_match(r1.name, r2.name): raise FileFormatError( f"Reads are improperly paired. Read name '{r1.name}' " f"in file 1 does not match '{r2.name}' in file 2.", line=None, ) from None yield (r1, r2) def close(self) -> None: self._close() def __enter__(self): return self def __exit__(self, *exc): self.close() class InterleavedPairedEndReader(PairedEndReader): """ Read paired-end reads from an interleaved FASTQ file. """ paired = True def __init__( self, file: Union[str, PathLike, BinaryIO], *, fileformat: Optional[str] = None, opener=xopen, ): reader = _open_single(file, opener=opener, fileformat=fileformat) assert isinstance(reader, (FastaReader, FastqReader)) # for Mypy self.reader = reader self.delivers_qualities = self.reader.delivers_qualities def __repr__(self) -> str: return f"{self.__class__.__name__}({self.reader})" def __iter__(self) -> Iterator[Tuple[Sequence, Sequence]]: it = iter(self.reader) for r1 in it: try: r2 = next(it) except StopIteration: raise FileFormatError( "Interleaved input file incomplete: Last record " f"'{r1.name}' has no partner.", line=None, ) from None if not record_names_match(r1.name, r2.name): raise FileFormatError( f"Reads are improperly paired. Name '{r1.name}' " f"(first) does not match '{r2.name}' (second).", line=None, ) yield (r1, r2) def close(self) -> None: self.reader.close() def __enter__(self): return self def __exit__(self, *args): self.close() class TwoFilePairedEndWriter(PairedEndWriter): def __init__( self, file1: Union[str, PathLike, BinaryIO], file2: Union[str, PathLike, BinaryIO], *, fileformat: Optional[str] = "fastq", qualities: Optional[bool] = None, opener=xopen, append: bool = False, ): mode = "a" if append else "w" with ExitStack() as stack: self._writer1: Union[FastaWriter, FastqWriter] self._writer2: Union[FastaWriter, FastqWriter] self._writer1 = stack.enter_context( _open_single( file1, opener=opener, fileformat=fileformat, mode=mode, qualities=qualities, ) ) self._writer2 = stack.enter_context( _open_single( file2, opener=opener, fileformat=fileformat, mode=mode, qualities=qualities, ) ) self._close = stack.pop_all().close def __repr__(self) -> str: return f"{self.__class__.__name__}({self._writer1}, {self._writer2})" def write(self, read1, read2) -> None: self._writer1.write(read1) self._writer2.write(read2) def close(self) -> None: self._close() def __enter__(self): # TODO do not allow this twice return self def __exit__(self, *args): self.close() class InterleavedPairedEndWriter(PairedEndWriter): """ Write paired-end reads to an interleaved FASTA or FASTQ file """ def __init__( self, file: Union[str, PathLike, BinaryIO], *, fileformat: Optional[str] = "fastq", qualities: Optional[bool] = None, opener=xopen, append: bool = False, ): mode = "a" if append else "w" writer = _open_single( file, opener=opener, fileformat=fileformat, mode=mode, qualities=qualities ) assert isinstance(writer, (FastaWriter, FastqWriter)) # only for Mypy self._writer = writer def __repr__(self) -> str: return f"{self.__class__.__name__}({self._writer})" def write(self, read1: Sequence, read2: Sequence) -> None: self._writer.write(read1) self._writer.write(read2) def close(self) -> None: self._writer.close() def __enter__(self): # TODO do not allow this twice return self def __exit__(self, *args): self.close() dnaio-0.7.1/src/dnaio/py.typed000066400000000000000000000000001417420163400161610ustar00rootroot00000000000000dnaio-0.7.1/src/dnaio/readers.py000066400000000000000000000112051417420163400164720ustar00rootroot00000000000000""" Classes for reading FASTA and FASTQ files """ __all__ = ['FastaReader', 'FastqReader'] import io from typing import Union, BinaryIO, Optional, Iterator, List from xopen import xopen from ._core import fastq_iter as _fastq_iter, Sequence from ._util import shorten as _shorten from .exceptions import FastaFormatError from .interfaces import SingleEndReader class BinaryFileReader: """ A mixin for readers that ensures that a file or a path can be passed in to the constructor. """ _close_on_exit = False paired: bool = False mode: str = 'rb' def __init__( self, file: Union[str, BinaryIO], *, opener=xopen, _close_file: Optional[bool] = None ): """ The file is a path or a file-like object. In both cases, the file may be compressed (.gz, .bz2, .xz). """ if isinstance(file, str): self._file = opener(file, self.mode) self._close_on_exit = True elif _close_file: self._close_on_exit = True self._file = file else: self._file = file def __repr__(self) -> str: return f"{self.__class__.__name__}('{getattr(self._file, 'name', self._file)}')" def close(self) -> None: if self._close_on_exit and self._file is not None: self._file.close() self._file = None def __enter__(self): if self._file is None: raise ValueError("I/O operation on closed BinaryFileReader") return self def __exit__(self, *args): self.close() class FastaReader(BinaryFileReader, SingleEndReader): """ Reader for FASTA files. """ def __init__( self, file: Union[str, BinaryIO], *, keep_linebreaks: bool = False, sequence_class=Sequence, opener=xopen, _close_file: Optional[bool] = None, ): """ file is a path or a file-like object. In both cases, the file may be compressed (.gz, .bz2, .xz). keep_linebreaks -- whether to keep newline characters in the sequence """ super().__init__(file, opener=opener, _close_file=_close_file) self.sequence_class = sequence_class self.delivers_qualities = False self._delimiter = '\n' if keep_linebreaks else '' def __iter__(self) -> Iterator[Sequence]: """ Read next entry from the file (single entry at a time). """ name = None seq: List[str] = [] f = io.TextIOWrapper(self._file) for i, line in enumerate(f): # strip() also removes DOS line breaks line = line.strip() if not line: continue if line and line[0] == '>': if name is not None: yield self.sequence_class(name, self._delimiter.join(seq), None) name = line[1:] seq = [] elif line and line[0] == '#': continue elif name is not None: seq.append(line) else: raise FastaFormatError( f"Expected '>' at beginning of record, but got '{_shorten(line)}'.", line=i) if name is not None: yield self.sequence_class(name, self._delimiter.join(seq), None) # Prevent TextIOWrapper from closing the underlying file f.detach() class FastqReader(BinaryFileReader, SingleEndReader): """ Reader for FASTQ files. Does not support multi-line FASTQ files. """ def __init__( self, file: Union[str, BinaryIO], *, sequence_class=Sequence, buffer_size: int = 128 * 1024, # Buffer size used by cat, pigz etc. opener=xopen, _close_file: Optional[bool] = None, ): """ file is a filename or a file-like object. If file is a filename, then .gz files are supported. """ super().__init__(file, opener=opener, _close_file=_close_file) self.sequence_class = sequence_class self.delivers_qualities = True self.buffer_size = buffer_size # The first value yielded by _fastq_iter indicates # whether the file has repeated headers self._iter = _fastq_iter(self._file, self.sequence_class, self.buffer_size) try: th = next(self._iter) assert isinstance(th, bool) self.two_headers: bool = th except StopIteration: # Empty file self.two_headers = False self._iter = iter(()) except Exception: self.close() raise def __iter__(self) -> Iterator[Sequence]: return self._iter dnaio-0.7.1/src/dnaio/singleend.py000066400000000000000000000104151417420163400170170ustar00rootroot00000000000000import functools import os from typing import Optional, Union, BinaryIO from .exceptions import UnknownFileFormat from .readers import FastaReader, FastqReader from .writers import FastaWriter, FastqWriter from ._util import _is_path def _open_single( file_or_path: Union[str, os.PathLike, BinaryIO], opener, *, fileformat: Optional[str] = None, mode: str = "r", qualities: Optional[bool] = None, ) -> Union[FastaReader, FastaWriter, FastqReader, FastqWriter]: """ Open a single sequence file. See description of open() above. """ if mode not in ("r", "w", "a"): raise ValueError("Mode must be 'r', 'w' or 'a'") path: Optional[str] if _is_path(file_or_path): path = os.fspath(file_or_path) # type: ignore file = opener(path, mode + "b") close_file = True else: if mode == "r" and not hasattr(file_or_path, "readinto"): raise ValueError( "When passing in an open file-like object, it must have been opened in binary mode" ) file = file_or_path if hasattr(file, "name") and isinstance(file.name, str): path = file.name else: path = None close_file = False fastq_handler = FastqReader if mode == "r" else FastqWriter fasta_handler = FastaReader if mode == "r" else FastaWriter handlers = { "fastq": functools.partial(fastq_handler, _close_file=close_file), "fasta": functools.partial(fasta_handler, _close_file=close_file), } if fileformat: try: handler = handlers[fileformat.lower()] except KeyError: file.close() raise UnknownFileFormat( f"File format '{fileformat}' is unknown (expected 'fasta' or 'fastq')." ) return handler(file) if path is not None: fileformat = _detect_format_from_name(path) if fileformat is None and mode == "w" and qualities is not None: # Format not recognized, but we know whether to use a format with or without qualities fileformat = "fastq" if qualities else "fasta" if mode == "r" and fileformat is None: fileformat = _detect_format_from_content(file) if fileformat is None: name = getattr(file, "name", repr(file)) file.close() raise UnknownFileFormat( f"Could not determine whether file '{name}' is FASTA or FASTQ. The file extension " "is not available or not recognized, and the first character in the file is " "unexpected." ) if fileformat is None: assert mode == "w" extra = " because the output file name is not available" if path is None else "" file.close() raise UnknownFileFormat( "Auto-detection of the output file format (FASTA/FASTQ) failed" + extra ) if fileformat == "fastq" and mode in "wa" and qualities is False: file.close() raise ValueError( "Output format cannot be FASTQ since no quality values are available." ) return handlers[fileformat](file) def _detect_format_from_name(name: str) -> Optional[str]: """ name -- file name Return 'fasta', 'fastq' or None if the format could not be detected. """ name = name.lower() for ext in (".gz", ".xz", ".bz2"): if name.endswith(ext): name = name[: -len(ext)] break name, ext = os.path.splitext(name) if ext in [".fasta", ".fa", ".fna", ".csfasta", ".csfa"]: return "fasta" elif ext in [".fastq", ".fq"] or (ext == ".txt" and name.endswith("_sequence")): return "fastq" return None def _detect_format_from_content(file: BinaryIO) -> Optional[str]: """ Return 'fasta', 'fastq' or None """ if file.seekable(): original_position = file.tell() first_char = file.read(1) file.seek(original_position) else: # We cannot always use peek() because BytesIO objects do not suppert it first_char = file.peek(1)[0:1] # type: ignore formats = { b"@": "fastq", b">": "fasta", b"#": "fasta", # Some FASTA variants allow comments b"": "fastq", # Pretend FASTQ for empty input } return formats.get(first_char, None) dnaio-0.7.1/src/dnaio/writers.py000066400000000000000000000104451417420163400165510ustar00rootroot00000000000000from os import PathLike from typing import Union, BinaryIO, Optional from xopen import xopen from . import Sequence from ._util import _is_path from .interfaces import SingleEndWriter class FileWriter: def __init__( self, file: Union[PathLike, str, BinaryIO], *, opener=xopen, _close_file: Optional[bool] = None, ): if _is_path(file): self._file = opener(file, "wb") self._close_on_exit = True else: self._file = file self._close_on_exit = bool(_close_file) def __repr__(self) -> str: return f"{self.__class__.__name__}('{getattr(self._file, 'name', self._file)}')" def close(self) -> None: if self._close_on_exit: self._file.close() def __enter__(self): if self._file.closed: raise ValueError("I/O operation on closed file") return self def __exit__(self, *args): self.close() class FastaWriter(FileWriter, SingleEndWriter): """ Write FASTA-formatted sequences to a file. """ def __init__( self, file: Union[PathLike, str, BinaryIO], *, line_length: Optional[int] = None, opener=xopen, _close_file: Optional[bool] = None, ): """ If line_length is not None, the lines will be wrapped after line_length characters. """ super().__init__(file, opener=opener, _close_file=_close_file) self.line_length = line_length if line_length != 0 else None def __repr__(self) -> str: return f"FastaWriter('{getattr(self._file, 'name', self._file)}')" def write(self, name_or_record, sequence: Optional[str] = None): """Write an entry to the the FASTA file. If only one parameter (name_or_record) is given, it must have attributes .name and .sequence, which are then used. Otherwise, the first parameter must be the name and the second the sequence. The effect is that you can write this: writer.write("name", "ACCAT") or writer.write(Sequence("name", "ACCAT")) """ if sequence is None: name = name_or_record.name sequence = name_or_record.sequence else: name = name_or_record if self.line_length is not None: self._file.write(('>' + name + '\n').encode('ascii')) s = [] for i in range(0, len(sequence), self.line_length): s.append(sequence[i:i + self.line_length] + '\n') self._file.write(''.join(s).encode('ascii')) else: text = '>' + name + '\n' + sequence + '\n' self._file.write(text.encode('ascii')) class FastqWriter(FileWriter, SingleEndWriter): """ Write sequences with qualities in FASTQ format. FASTQ files are formatted like this:: @read name AACCGGTT + FF,:F,,F """ file_mode = 'wb' def __init__( self, file: Union[PathLike, str, BinaryIO], *, two_headers: bool = False, opener=xopen, _close_file: Optional[bool] = None, ): super().__init__(file, opener=opener, _close_file=_close_file) self._two_headers = two_headers # setattr avoids a complaint from Mypy setattr(self, "write", self._write_two_headers if self._two_headers else self._write) def __repr__(self) -> str: return f"FastqWriter('{getattr(self._file, 'name', self._file)}')" def write(self, record: Sequence) -> None: """ Dummy method to make it possible to instantiate this class. The correct write method is assigned in the constructor. """ assert False def _write(self, record: Sequence) -> None: """ Write a Sequence record to the FASTQ file. """ self._file.write(record.fastq_bytes()) def _write_two_headers(self, record: Sequence) -> None: """ Write a Sequence record to the FASTQ file, repeating the header in the third line after the "+" . """ self._file.write(record.fastq_bytes_two_headers()) def writeseq(self, name: str, sequence: str, qualities: str) -> None: self._file.write(f"@{name:s}\n{sequence:s}\n+\n{qualities:s}\n".encode("ascii")) dnaio-0.7.1/tests/000077500000000000000000000000001417420163400137555ustar00rootroot00000000000000dnaio-0.7.1/tests/data/000077500000000000000000000000001417420163400146665ustar00rootroot00000000000000dnaio-0.7.1/tests/data/dos.fastq000066400000000000000000000004371417420163400165170ustar00rootroot00000000000000@prefix:1_13_573/1 CGTCCGAANTAGCTACCACCCTGATTAGACAAAT + )3%)&&&&!.1&(6:<'67..*,:75)'77&&&5 @prefix:1_13_1259/1 AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT + ;<:&:A;A!9<<<,7:<=3=;:<&70<,=: dnaio-0.7.1/tests/data/interleaved.fastq000066400000000000000000000003251417420163400202300ustar00rootroot00000000000000@read1/1 some text TTATTTGTCTCCAGC + ##HHHHHHHHHHHHH @read1/2 other text GCTGGAGACAAATAA + HHHHHHHHHHHHHHH @read3/1 CCAACTTGATATTAATAACA + HHHHHHHHHHHHHHHHHHHH @read3/2 TGTTATTAATATCAAGTTGG + #HHHHHHHHHHHHHHHHHHH dnaio-0.7.1/tests/data/missingextension000066400000000000000000000004231417420163400202160ustar00rootroot00000000000000@prefix:1_13_573/1 CGTCCGAANTAGCTACCACCCTGATTAGACAAAT + )3%)&&&&!.1&(6:<'67..*,:75)'77&&&5 @prefix:1_13_1259/1 AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT + ;<:&:A;A!9<<<,7:<=3=;:<&70<,=: dnaio-0.7.1/tests/data/missingextension.gz000066400000000000000000000003321417420163400206340ustar00rootroot00000000000000Osmall.fastq]1n0 @]Pn '#:ngjlE"~|nie)2o*">bfp2Dݔ/0Mo#Bz_pO|qEi撪6|ؤم0".D4RkAAR|SfW\ܖ;~Ȼm\3t@}eͥ,x:@F%dnaio-0.7.1/tests/data/paired.1.fastq000066400000000000000000000004501417420163400173300ustar00rootroot00000000000000@read1/1 some text TTATTTGTCTCCAGCTTAGACATATCGCCT + ##HHHHHHHHHHHHHHHHHHHHHHHHHHHH @read2/1 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read3/1 CCAACTTGATATTAATAACATTAGACA + HHHHHHHHHHHHHHHHHHHHHHHHHHH @read4/1 GACAGGCCGTTTGAATGTTGACGGGATGTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH dnaio-0.7.1/tests/data/paired.2.fastq000066400000000000000000000004451417420163400173350ustar00rootroot00000000000000@read1/2 other text GCTGGAGACAAATAACAGTGGAGTAGTTTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read2/2 TGTGGCCTGTTGCAGTGGAGTAACTCCAGC + ###HHHHHHHHHHHHHHHHHHHHHHHHHHH @read3/2 TGTTATTAATATCAAGTTGGCAGTG + #HHHHHHHHHHHHHHHHHHHHHHHH @read4/2 CATCCCGTCAACATTCAAACGGCCTGTCCA + HH############################ dnaio-0.7.1/tests/data/simple.fasta000066400000000000000000000000661417420163400172010ustar00rootroot00000000000000>first_sequence SEQUENCE1 >second_sequence SEQUEN CE2 dnaio-0.7.1/tests/data/simple.fasta.bz2000066400000000000000000000001251417420163400176710ustar00rootroot00000000000000BZh91AY&SYf π0 *! TPM T$zM芯rH ` ьwKI.p dnaio-0.7.1/tests/data/simple.fasta.gz000066400000000000000000000001111417420163400176070ustar00rootroot00000000000000f0[simple.fastaK,*./N-,MKN v usv5+NMKArv5E|6dnaio-0.7.1/tests/data/simple.fasta.xz000066400000000000000000000001541417420163400176370ustar00rootroot000000000000007zXZִF!t/5-]'V$@T-ZMaFy38Db`2fI6ԇ}YZdnaio-0.7.1/tests/data/simple.fastq000066400000000000000000000001151417420163400172140ustar00rootroot00000000000000@first_sequence SEQUENCE1 + :6;;8<=:< @second_sequence SEQUENCE2 + 83/J z)„)?hdnaio-0.7.1/tests/data/simple.fastq.gz000066400000000000000000000001351417420163400176350ustar00rootroot00000000000000'[simple.fastqsH,*./N-,MKN v usv52r(NMKTfTfalcooaf>Mdnaio-0.7.1/tests/data/simple.fastq.xz000066400000000000000000000002001417420163400176470ustar00rootroot000000000000007zXZִF!t/LA] 'V$@ƹY#ܿX,lj&4ߵx+OMᄎL%v^.4QmTQuzX !]M'}YZdnaio-0.7.1/tests/data/small.fastq000066400000000000000000000004231417420163400170350ustar00rootroot00000000000000@prefix:1_13_573/1 CGTCCGAANTAGCTACCACCCTGATTAGACAAAT + )3%)&&&&!.1&(6:<'67..*,:75)'77&&&5 @prefix:1_13_1259/1 AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT + ;<:&:A;A!9<<<,7:<=3=;:<&70<,=: dnaio-0.7.1/tests/data/with_comment.fasta000066400000000000000000000001201417420163400203740ustar00rootroot00000000000000# a comment # another one >first_sequence SEQUENCE1 >second_sequence SEQUEN CE2 dnaio-0.7.1/tests/data/withplus.fastq000066400000000000000000000001541417420163400176050ustar00rootroot00000000000000@first_sequence SEQUENCE1 +this is different :6;;8<=:< @second_sequence SEQUENCE2 +also different 831\n') == 0 assert _fasta_head(b'>1\n3') == 0 assert _fasta_head(b'>1\n3\n') == 0 assert _fasta_head(b'>1\n3\n>') == 5 assert _fasta_head(b'>1\n3\n>6') == 5 assert _fasta_head(b'>1\n3\n>6\n') == 5 assert _fasta_head(b'>1\n3\n>6\n8') == 5 assert _fasta_head(b'>1\n3\n>6\n8\n') == 5 assert _fasta_head(b'>1\n3\n>6\n8\n0') == 5 assert _fasta_head(b'>1\n3\n>6\n8\n0\n') == 5 assert _fasta_head(b'>1\n3\n>6\n8\n0\n>') == 12 def test_fasta_head_with_comment(): assert _fasta_head(b'#') == 0 assert _fasta_head(b'#\n') == 0 assert _fasta_head(b'#\n>') == 2 assert _fasta_head(b'#\n>3') == 2 assert _fasta_head(b'#\n>3\n') == 2 assert _fasta_head(b'#\n>3\n5') == 2 assert _fasta_head(b'#\n>3\n5\n') == 2 assert _fasta_head(b'#\n>3\n5\n>') == 7 def test_paired_fastq_heads(): buf1 = b'first\nsecond\nthird\nfourth\nfifth' buf2 = b'a\nb\nc\nd\ne\nf\ng' assert paired_fastq_heads(buf1, buf2, len(buf1), len(buf2)) == ( len(b'first\nsecond\nthird\nfourth\n'), len(b'a\nb\nc\nd\n')) assert paired_fastq_heads(b'abc', b'def', 3, 3) == (0, 0) assert paired_fastq_heads(b'abc\n', b'def', 4, 3) == (0, 0) assert paired_fastq_heads(b'abc', b'def\n', 3, 4) == (0, 0) assert paired_fastq_heads(b'\n\n\n\n', b'\n\n\n\n', 4, 4) == (4, 4) def test_fastq_head(): assert _fastq_head(b'') == 0 assert _fastq_head(b'A\n') == 0 assert _fastq_head(b'A\nB') == 0 assert _fastq_head(b'A\nB\n') == 0 assert _fastq_head(b'A\nB\nC') == 0 assert _fastq_head(b'A\nB\nC\n') == 0 assert _fastq_head(b'A\nB\nC\nD') == 0 assert _fastq_head(b'A\nB\nC\nD\n') == 0 assert _fastq_head(b'A\nB\nC\nD\nE') == 0 assert _fastq_head(b'A\nB\nC\nD\nE\n') == 0 assert _fastq_head(b'A\nB\nC\nD\nE\nF') == 0 assert _fastq_head(b'A\nB\nC\nD\nE\nF\n') == 0 assert _fastq_head(b'A\nB\nC\nD\nE\nF\nG') == 0 assert _fastq_head(b'A\nB\nC\nD\nE\nF\nG\n') == 0 assert _fastq_head(b'A\nB\nC\nD\nE\nF\nG\nH') == 0 assert _fastq_head(b'A\nB\nC\nD\nE\nF\nG\nH\n') == 16 assert _fastq_head(b'A\nB\nC\nD\nE\nF\nG\nH\nI') == 16 assert _fastq_head(b'A\nB\nC\nD\nE\nF\nG\nH\nI\n') == 16 def test_read_paired_chunks(): with open('tests/data/paired.1.fastq', 'rb') as f1: with open('tests/data/paired.2.fastq', 'rb') as f2: for c1, c2 in read_paired_chunks(f1, f2, buffer_size=128): print(c1, c2) def test_read_chunks(): for data in [b'@r1\nACG\n+\nHHH\n', b'>r1\nACGACGACG\n']: assert [m.tobytes() for m in read_chunks(BytesIO(data))] == [data] # Buffer too small with raises(OverflowError): list(read_chunks(BytesIO(data), buffer_size=4)) def test_read_chunks_empty(): assert list(read_chunks(BytesIO(b''))) == [] dnaio-0.7.1/tests/test_internal.py000066400000000000000000000502141417420163400172040ustar00rootroot00000000000000import os import shutil import subprocess import sys from io import BytesIO from tempfile import mkdtemp from textwrap import dedent from pytest import raises, mark import dnaio from dnaio import ( FileFormatError, FastaFormatError, FastqFormatError, FastaReader, FastqReader, InterleavedPairedEndReader, FastaWriter, FastqWriter, InterleavedPairedEndWriter, TwoFilePairedEndReader, ) from dnaio import record_names_match, Sequence from dnaio.writers import FileWriter from dnaio.readers import BinaryFileReader # files tests/data/simple.fast{q,a} simple_fastq = [ Sequence("first_sequence", "SEQUENCE1", ":6;;8<=:<"), Sequence("second_sequence", "SEQUENCE2", "83first_sequence\nSEQUENCE1\n>second_sequence\nSEQUENCE2\n") reads = list(FastaReader(fasta)) assert reads == simple_fasta def test_with_comments(self): fasta = BytesIO(dedent( """ # a comment # another one >first_sequence SEQUENCE1 >second_sequence SEQUENCE2 """).encode()) reads = list(FastaReader(fasta)) assert reads == simple_fasta def test_wrong_format(self): fasta = BytesIO(dedent( """# a comment # another one unexpected >first_sequence SEQUENCE1 >second_sequence SEQUENCE2 """).encode()) with raises(FastaFormatError) as info: list(FastaReader(fasta)) assert info.value.line == 2 def test_fastareader_keeplinebreaks(self): with FastaReader("tests/data/simple.fasta", keep_linebreaks=True) as f: reads = list(f) assert reads[0] == simple_fasta[0] assert reads[1].sequence == 'SEQUEN\nCE2' def test_context_manager(self): filename = "tests/data/simple.fasta" with open(filename, 'rb') as f: assert not f.closed _ = list(dnaio.open(f)) assert not f.closed assert f.closed with FastaReader(filename) as sr: tmp_sr = sr assert not sr._file.closed _ = list(sr) assert not sr._file.closed assert tmp_sr._file is None # Open it a second time with FastaReader(filename): pass class TestFastqReader: def test_fastqreader(self): with FastqReader("tests/data/simple.fastq") as f: reads = list(f) assert reads == simple_fastq @mark.parametrize("buffer_size", [1, 2, 3, 5, 7, 10, 20]) def test_fastqreader_buffersize(self, buffer_size): with FastqReader("tests/data/simple.fastq", buffer_size=buffer_size) as f: reads = list(f) assert reads == simple_fastq def test_fastqreader_buffersize_too_small(self): with raises(ValueError) as e: with FastqReader("tests/data/simple.fastq", buffer_size=0) as f: _ = list(f) # pragma: no cover assert "buffer size too small" in e.value.args[0] def test_fastqreader_dos(self): # DOS line breaks with open('tests/data/dos.fastq', 'rb') as f: assert b'\r\n' in f.read() with FastqReader("tests/data/dos.fastq") as f: dos_reads = list(f) with FastqReader("tests/data/small.fastq") as f: unix_reads = list(f) assert dos_reads == unix_reads def test_fastq_wrongformat(self): with raises(FastqFormatError) as info: with FastqReader("tests/data/withplus.fastq") as f: list(f) # pragma: no cover assert info.value.line == 2 def test_empty_fastq(self): with FastqReader(BytesIO(b'')) as fq: assert list(fq) == [] @mark.parametrize('s,line', [ (b'@', 0), (b'@r', 0), (b'@r1', 0), (b'@r1\n', 1), (b'@r1\nA', 1), (b'@r1\nAC', 1), (b'@r1\nACG', 1), (b'@r1\nACG\n', 2), (b'@r1\nACG\n+', 2), (b'@r1\nACG\n+\n', 3), (b'@r1\nACG\n+\nH', 3), (b'@r1\nACG\n+\nHH', 3), (b'@r1\nACG\n+\nHHH\n@', 4), (b'@r1\nACG\n+\nHHH\n@r', 4), (b'@r1\nACG\n+\nHHH\n@r2', 4), (b'@r1\nACG\n+\nHHH\n@r2\n', 5), (b'@r1\nACG\n+\nHHH\n@r2\nT', 5), (b'@r1\nACG\n+\nHHH\n@r2\nT\n', 6), (b'@r1\nACG\n+\nHHH\n@r2\nT\n+', 6), (b'@r1\nACG\n+\nHHH\n@r2\nT\n+\n', 7), ]) def test_fastq_incomplete(self, s, line): fastq = BytesIO(s) with raises(FastqFormatError) as info: with FastqReader(fastq) as fq: list(fq) assert info.value.line == line def test_half_record_line_numbers(self): fastq = BytesIO(b'@r\nACG\n+\nHH\n') # Choose the buffer size such that only parts of the record fit # We want to ensure that the line number is reset properly # after the record has been half-parsed buffer_size = len('@r\nACG\n+\n') with raises(FastqFormatError) as info: with FastqReader(fastq, buffer_size=buffer_size) as fq: list(fq) # pragma: no cover assert 'Length of sequence and qualities differ' in info.value.message assert info.value.line == 3 @mark.parametrize('s,line', [ (b'@r1\nACG\n+\nH#HH\n@r2\nT\n+\nH\n', 3), (b'@r1\nACG\n+\n#H\n@r2\nT\n+\nH\n', 3), (b'@r1\nACG\n+\nHHH\n@r2\nT\n+\nHH\n', 7), (b'@r1\nACG\n+\nHHH\n@r2\nT\n+\n\n', 7), ]) def test_differing_lengths(self, s, line): fastq = BytesIO(s) with raises(FastqFormatError) as info: with FastqReader(fastq) as fq: list(fq) assert info.value.line == line def test_missing_final_newline(self): # Files with a missing final newline are currently allowed fastq = BytesIO(b'@r1\nA\n+\nH') with dnaio.open(fastq) as f: records = list(f) assert records == [Sequence('r1', 'A', 'H')] def test_not_opened_as_binary(self): filename = 'tests/data/simple.fastq' with open(filename, 'rt') as f: with raises(ValueError): list(dnaio.open(f)) def test_context_manager(self): filename = "tests/data/simple.fastq" with open(filename, 'rb') as f: assert not f.closed _ = list(dnaio.open(f)) assert not f.closed assert f.closed with FastqReader(filename) as sr: tmp_sr = sr assert not sr._file.closed _ = list(sr) assert not sr._file.closed assert tmp_sr._file is None def test_two_header_detection(self): fastq = BytesIO(b'@r1\nACG\n+r1\nHHH\n@r2\nT\n+r2\n#\n') with FastqReader(fastq) as fq: assert fq.two_headers list(fq) fastq = BytesIO(b'@r1\nACG\n+\nHHH\n@r2\nT\n+r2\n#\n') with FastqReader(fastq) as fq: assert not fq.two_headers list(fq) def test_second_header_not_equal(self): fastq = BytesIO(b'@r1\nACG\n+xy\nXXX\n') with raises(FastqFormatError) as info: with FastqReader(fastq) as fq: list(fq) # pragma: no cover assert "Sequence descriptions don't match" in info.value.message class TestOpen: def setup(self): self._tmpdir = mkdtemp() def teardown(self): shutil.rmtree(self._tmpdir) def test_sequence_reader(self): # test the autodetection with dnaio.open("tests/data/simple.fastq") as f: reads = list(f) assert reads == simple_fastq with dnaio.open("tests/data/simple.fasta") as f: reads = list(f) assert reads == simple_fasta with open("tests/data/simple.fastq", 'rb') as f: reads = list(dnaio.open(f)) assert reads == simple_fastq # make the name attribute unavailable with open("tests/data/simple.fastq", 'rb') as f: data = f.read() bio = BytesIO(data) reads = list(dnaio.open(bio)) assert reads == simple_fastq with open("tests/data/simple.fasta", 'rb') as f: data = f.read() bio = BytesIO(data) reads = list(dnaio.open(bio)) assert reads == simple_fasta def test_autodetect_fasta_format(self, tmpdir): path = str(tmpdir.join('tmp.fasta')) with dnaio.open(path, mode='w') as f: assert isinstance(f, FastaWriter) for seq in simple_fastq: f.write(seq) with dnaio.open(path) as f: records = list(f) assert records == simple_fasta def test_write_qualities_to_fasta(self): path = os.path.join(self._tmpdir, 'tmp.fasta') with dnaio.open(path, mode='w', qualities=True) as f: assert isinstance(f, FastaWriter) for seq in simple_fastq: f.write(seq) with dnaio.open(path) as f: assert list(f) == simple_fasta def test_autodetect_fastq_format(self): path = os.path.join(self._tmpdir, 'tmp.fastq') with dnaio.open(path, mode='w') as f: assert isinstance(f, FastqWriter) for seq in simple_fastq: f.write(seq) with dnaio.open(path) as f: assert list(f) == simple_fastq def test_autodetect_fastq_weird_name(self): path = os.path.join(self._tmpdir, 'tmp.fastq.gz') with dnaio.open(path, mode='w') as f: assert isinstance(f, FastqWriter) for seq in simple_fastq: f.write(seq) weird_path = os.path.join(self._tmpdir, 'tmp.weird.gz') os.rename(path, weird_path) with dnaio.open(weird_path) as f: assert list(f) == simple_fastq def test_fastq_qualities_missing(self): path = os.path.join(self._tmpdir, 'tmp.fastq') with raises(ValueError): with dnaio.open(path, mode='w', qualities=False): pass class TestInterleavedReader: def test(self): expected = [ ( Sequence('read1/1 some text', 'TTATTTGTCTCCAGC', '##HHHHHHHHHHHHH'), Sequence('read1/2 other text', 'GCTGGAGACAAATAA', 'HHHHHHHHHHHHHHH') ), ( Sequence('read3/1', 'CCAACTTGATATTAATAACA', 'HHHHHHHHHHHHHHHHHHHH'), Sequence('read3/2', 'TGTTATTAATATCAAGTTGG', '#HHHHHHHHHHHHHHHHHHH') ), ] with InterleavedPairedEndReader("tests/data/interleaved.fastq") as isr: reads = list(isr) assert reads == expected with dnaio.open("tests/data/interleaved.fastq", interleaved=True) as f: reads = list(f) assert reads == expected def test_missing_partner(self): s = BytesIO(b'@r1\nACG\n+\nHHH\n') with raises(FileFormatError) as info: with InterleavedPairedEndReader(s) as isr: list(isr) assert "Interleaved input file incomplete" in info.value.message def test_incorrectly_paired(self): s = BytesIO(b'@r1/1\nACG\n+\nHHH\n@wrong_name\nTTT\n+\nHHH\n') with raises(FileFormatError) as info: with InterleavedPairedEndReader(s) as isr: list(isr) assert "Reads are improperly paired" in info.value.message class TestFastaWriter: def setup(self): self._tmpdir = mkdtemp() self.path = os.path.join(self._tmpdir, 'tmp.fasta') def teardown(self): shutil.rmtree(self._tmpdir) def test(self): with FastaWriter(self.path) as fw: fw.write("name", "CCATA") fw.write("name2", "HELLO") assert fw._file.closed with open(self.path) as t: assert t.read() == '>name\nCCATA\n>name2\nHELLO\n' def test_linelength(self): with FastaWriter(self.path, line_length=3) as fw: fw.write("r1", "ACG") fw.write("r2", "CCAT") fw.write("r3", "TACCAG") assert fw._file.closed with open(self.path) as t: d = t.read() assert d == '>r1\nACG\n>r2\nCCA\nT\n>r3\nTAC\nCAG\n' def test_write_sequence_object(self): with FastaWriter(self.path) as fw: fw.write(Sequence("name", "CCATA")) fw.write(Sequence("name2", "HELLO")) assert fw._file.closed with open(self.path) as t: assert t.read() == '>name\nCCATA\n>name2\nHELLO\n' def test_write_to_file_like_object(self): bio = BytesIO() with FastaWriter(bio) as fw: fw.write(Sequence("name", "CCATA")) fw.write(Sequence("name2", "HELLO")) assert bio.getvalue() == b'>name\nCCATA\n>name2\nHELLO\n' assert not bio.closed assert not fw._file.closed def test_write_zero_length_sequence(self): bio = BytesIO() with FastaWriter(bio) as fw: fw.write(Sequence("name", "")) assert bio.getvalue() == b'>name\n\n', '{!r}'.format(bio.getvalue()) class TestFastqWriter: def setup(self): self._tmpdir = mkdtemp() self.path = os.path.join(self._tmpdir, 'tmp.fastq') def teardown(self): shutil.rmtree(self._tmpdir) def test(self): with FastqWriter(self.path) as fq: fq.writeseq("name", "CCATA", "!#!#!") fq.writeseq("name2", "HELLO", "&&&!&&") assert fq._file.closed with open(self.path) as t: assert t.read() == '@name\nCCATA\n+\n!#!#!\n@name2\nHELLO\n+\n&&&!&&\n' def test_twoheaders(self): with FastqWriter(self.path, two_headers=True) as fq: fq.write(Sequence("name", "CCATA", "!#!#!")) fq.write(Sequence("name2", "HELLO", "&&&!&")) assert fq._file.closed with open(self.path) as t: assert t.read() == '@name\nCCATA\n+name\n!#!#!\n@name2\nHELLO\n+name2\n&&&!&\n' def test_write_to_file_like_object(self): bio = BytesIO() with FastqWriter(bio) as fq: fq.writeseq("name", "CCATA", "!#!#!") fq.writeseq("name2", "HELLO", "&&&!&&") assert bio.getvalue() == b'@name\nCCATA\n+\n!#!#!\n@name2\nHELLO\n+\n&&&!&&\n' class TestInterleavedWriter: def test(self): reads = [ ( Sequence('A/1 comment', 'TTA', '##H'), Sequence('A/2 comment', 'GCT', 'HH#') ), ( Sequence('B/1', 'CC', 'HH'), Sequence('B/2', 'TG', '#H') ), ] bio = BytesIO() with InterleavedPairedEndWriter(bio) as writer: for read1, read2 in reads: writer.write(read1, read2) assert bio.getvalue() == ( b'@A/1 comment\nTTA\n+\n##H\n' b'@A/2 comment\nGCT\n+\nHH#\n' b'@B/1\nCC\n+\nHH\n' b'@B/2\nTG\n+\n#H\n' ) class TestPairedSequenceReader: def test_read(self): s1 = BytesIO(b'@r1\nACG\n+\nHHH\n') s2 = BytesIO(b'@r2\nGTT\n+\n858\n') with TwoFilePairedEndReader(s1, s2) as psr: assert [ (Sequence("r1", "ACG", "HHH"), Sequence("r2", "GTT", "858")), ] == list(psr) def test_record_names_match(self): match = record_names_match assert match('abc', 'abc') assert match('abc def', 'abc') assert match('abc def', 'abc ghi') assert match('abc', 'abc ghi') assert not match('abc', 'xyz') assert match('abc\tdef', 'abc') assert match('abc\tdef', 'abc\tghi') assert match('abc somecomment\tanothercomment', 'abc andanothercomment\tbla') assert match('abc\tcomments comments', 'abc\tothers others') assert match('abc\tdef', 'abc def') def test_record_names_match_with_ignored_trailing_12(self): match = record_names_match assert match('abc/1', 'abc/2') assert match('abc.1', 'abc.2') assert match('abc1', 'abc2') assert match('abc2', 'abc1') assert match('abc1 def', 'abc1 ghi') assert match('abc1 def', 'abc2 ghi') assert match('abc2 def', 'abc1 ghi') assert not match('abc1', 'abc4') assert not match('abc1', 'abc') assert not match('abc', 'abc1') assert not match('abc', 'abc2') def test_record_names_match_with_ignored_trailing_123(self): match = record_names_match assert match('abc/1', 'abc/3') assert match('abc.1 def', 'abc.3 ghi') assert match('abc.3 def', 'abc.1 ghi') def test_missing_partner1(self): s1 = BytesIO(b'') s2 = BytesIO(b'@r1\nACG\n+\nHHH\n') with raises(FileFormatError) as info: with TwoFilePairedEndReader(s1, s2) as psr: list(psr) assert "There are more reads in file 2 than in file 1" in info.value.message def test_missing_partner2(self): s1 = BytesIO(b'@r1\nACG\n+\nHHH\n') s2 = BytesIO(b'') with raises(FileFormatError) as info: with TwoFilePairedEndReader(s1, s2) as psr: list(psr) assert "There are more reads in file 1 than in file 2" in info.value.message def test_empty_sequences_do_not_stop_iteration(self): s1 = BytesIO(b'@r1_1\nACG\n+\nHHH\n@r2_1\nACG\n+\nHHH\n@r3_2\nACG\n+\nHHH\n') s2 = BytesIO(b'@r1_1\nACG\n+\nHHH\n@r2_2\n\n+\n\n@r3_2\nACG\n+\nHHH\n') # Second sequence for s2 is empty but valid. Should not lead to a stop of iteration. with TwoFilePairedEndReader(s1, s2) as psr: seqs = list(psr) print(seqs) assert len(seqs) == 3 def test_incorrectly_paired(self): s1 = BytesIO(b'@r1/1\nACG\n+\nHHH\n') s2 = BytesIO(b'@wrong_name\nTTT\n+\nHHH\n') with raises(FileFormatError) as info: with TwoFilePairedEndReader(s1, s2) as psr: list(psr) assert "Reads are improperly paired" in info.value.message @mark.parametrize('path', [ os.path.join('tests', 'data', 'simple.fastq'), os.path.join('tests', 'data', 'dos.fastq'), os.path.join('tests', 'data', 'simple.fasta'), os.path.join('tests', 'data', 'with_comment.fasta'), ]) def test_read_stdin(path): # Get number of records in the input file with dnaio.open(path) as f: expected = len(list(f)) # Use piping from a separate subprocess to force the input file name to be unavailable cmd = "type" if sys.platform == "win32" else "cat" with subprocess.Popen( [cmd, path], stdout=subprocess.PIPE, shell=sys.platform == "win32" ) as cat: with subprocess.Popen( [sys.executable, 'tests/read_from_stdin.py'], stdin=cat.stdout, stdout=subprocess.PIPE ) as py: cat.stdout.close() # Check that the read_from_stdin.py script prints the correct number of records assert str(expected) == py.communicate()[0].decode().strip() def test_file_writer(tmp_path): path = tmp_path / "out.txt" fw = FileWriter(path) repr(fw) fw.close() assert path.exists() with raises(ValueError) as e: with fw: pass assert "operation on closed file" in e.value.args[0] def test_binary_file_reader(): bfr = BinaryFileReader("tests/data/simple.fasta") repr(bfr) bfr.close() with raises(ValueError) as e: with bfr: pass assert "operation on closed" in e.value.args[0] def test_fasta_writer_repr(tmp_path): with FastaWriter(tmp_path / "out.fasta") as fw: repr(fw) def test_fastq_writer_repr(tmp_path): with FastqWriter(tmp_path / "out.fastq") as fw: repr(fw) dnaio-0.7.1/tests/test_open.py000066400000000000000000000216051417420163400163330ustar00rootroot00000000000000from pathlib import Path import dnaio from xopen import xopen import pytest @pytest.fixture(params=["", ".gz", ".bz2", ".xz"]) def extension(request): return request.param @pytest.fixture(params=["fasta", "fastq"]) def fileformat(request): return request.param SIMPLE_RECORDS = { "fasta": [ dnaio.Sequence("first_sequence", "SEQUENCE1"), dnaio.Sequence("second_sequence", "SEQUENCE2"), ], "fastq": [ dnaio.Sequence("first_sequence", "SEQUENCE1", ":6;;8<=:<"), dnaio.Sequence("second_sequence", "SEQUENCE2", "83{}\n{}\n".format(record.name, record.sequence) def formatted_sequences(records, fileformat): return "".join(formatted_sequence(record, fileformat) for record in records) def test_formatted_sequence(): s = dnaio.Sequence("s1", "ACGT", "HHHH") assert ">s1\nACGT\n" == formatted_sequence(s, "fasta") assert "@s1\nACGT\n+\nHHHH\n" == formatted_sequence(s, "fastq") def test_version(): _ = dnaio.__version__ def test_open_nonexistent(tmp_path): with pytest.raises(FileNotFoundError): with dnaio.open(tmp_path / "nonexistent"): pass def test_open_empty_file_with_unrecognized_extension(tmp_path): path = tmp_path / "unrecognized-extension.tmp" path.touch() with dnaio.open(path) as f: records = list(f) assert records == [] def test_read(fileformat, extension): with dnaio.open("tests/data/simple." + fileformat + extension) as f: records = list(f) assert records == SIMPLE_RECORDS[fileformat] def test_read_pathlib_path(fileformat, extension): path = Path("tests/data/simple." + fileformat + extension) with dnaio.open(path) as f: records = list(f) assert records == SIMPLE_RECORDS[fileformat] def test_read_opener(fileformat, extension): def my_opener(path, mode): import io if fileformat == "fasta": data = b">read\nACG\n" else: data = b"@read\nACG\n+\nHHH\n" return io.BytesIO(data) with dnaio.open("totally-ignored-filename." + fileformat + extension, opener=my_opener) as f: records = list(f) assert len(records) == 1 assert records[0].name == "read" assert records[0].sequence == "ACG" @pytest.mark.parametrize("interleaved", [False, True]) def test_paired_opener(fileformat, extension, interleaved): def my_opener(_path, _mode): import io if fileformat == "fasta": data = b">read\nACG\n" else: data = b"@read\nACG\n+\nHHH\n" return io.BytesIO(data + data) path1 = "ignored-filename." + fileformat + extension path2 = "also-ignored-filename." + fileformat + extension if interleaved: with dnaio.open(path1, file2=path2, opener=my_opener) as f: records = list(f) expected = 2 else: with dnaio.open(path1, interleaved=True, opener=my_opener) as f: records = list(f) expected = 1 assert len(records) == expected assert records[0][0].name == "read" assert records[0][0].sequence == "ACG" assert records[0][1].name == "read" assert records[0][1].sequence == "ACG" def test_detect_fastq_from_content(): """FASTQ file that is not named .fastq""" with dnaio.open('tests/data/missingextension') as f: record = next(iter(f)) assert record.name == 'prefix:1_13_573/1' def test_detect_compressed_fastq_from_content(): """Compressed FASTQ file that is not named .fastq.gz""" with dnaio.open('tests/data/missingextension.gz') as f: record = next(iter(f)) assert record.name == 'prefix:1_13_573/1' def test_write(tmpdir, extension): s = dnaio.Sequence('name', 'ACGT', 'HHHH') out_fastq = tmpdir.join("out.fastq" + extension) with dnaio.open(str(out_fastq), mode='w') as f: f.write(s) with xopen(out_fastq) as f: assert f.read() == '@name\nACGT\n+\nHHHH\n' def test_write_with_xopen(tmpdir, fileformat, extension): s = dnaio.Sequence('name', 'ACGT', 'HHHH') out_fastq = str(tmpdir.join("out." + fileformat + extension)) with xopen(out_fastq, 'wb') as outer_f: with dnaio.open(outer_f, mode='w', fileformat=fileformat) as f: f.write(s) with xopen(out_fastq) as f: if fileformat == "fasta": assert f.read() == ">name\nACGT\n" else: assert f.read() == "@name\nACGT\n+\nHHHH\n" def test_write_pathlib(tmpdir, fileformat, extension): s1 = dnaio.Sequence("s1", "ACGT", "HHHH") path = Path(str(tmpdir / ("out." + fileformat + extension))) with dnaio.open(path, mode="w") as f: f.write(s1) if fileformat == "fasta": expected = b">s1\nACGT\n" else: expected = b"@s1\nACGT\n+\nHHHH\n" with xopen(path, "rb") as f: assert f.read() == expected def test_write_paired_same_path(tmpdir): path1 = str(tmpdir / "same.fastq") path2 = str(tmpdir / "same.fastq") with pytest.raises(ValueError): with dnaio.open(file1=path1, file2=path2, mode="w"): pass def test_write_paired(tmpdir, fileformat, extension): r1 = [ dnaio.Sequence("s1", "ACGT", "HHHH"), dnaio.Sequence("s2", "CGCA", "8383"), ] r2 = [ dnaio.Sequence("t1", "TCGT", "5HHH"), dnaio.Sequence("t2", "TGCA", "5383"), ] path1 = str(tmpdir / ("out.1." + fileformat + extension)) path2 = str(tmpdir / ("out.2." + fileformat + extension)) with dnaio.open(path1, file2=path2, fileformat=fileformat, mode="w") as f: f.write(r1[0], r2[0]) f.write(r1[1], r2[1]) with xopen(path1) as f: assert formatted_sequences(r1, fileformat) == f.read() with xopen(path2) as f: assert formatted_sequences(r2, fileformat) == f.read() def test_write_interleaved(tmpdir, fileformat, extension): r1 = [ dnaio.Sequence("s1", "ACGT", "HHHH"), dnaio.Sequence("s2", "CGCA", "8383"), ] r2 = [ dnaio.Sequence("t1", "TCGT", "5HHH"), dnaio.Sequence("t2", "TGCA", "5383"), ] path = str(tmpdir / ("out.interleaved." + fileformat + extension)) with dnaio.open(path, interleaved=True, fileformat=fileformat, mode="w") as f: f.write(r1[0], r2[0]) f.write(r1[1], r2[1]) expected = [r1[0], r2[0], r1[1], r2[1]] with xopen(path) as f: assert formatted_sequences(expected, fileformat) == f.read() def test_append(tmpdir, fileformat, extension): s1 = dnaio.Sequence("s1", "ACGT", "HHHH") s2 = dnaio.Sequence("s2", "CGCA", "8383") path = str(tmpdir / ("out." + fileformat + extension)) with dnaio.open(path, mode="w") as f: f.write(s1) with dnaio.open(path, mode="a") as f: f.write(s2) with xopen(path) as f: assert formatted_sequences([s1, s2], fileformat) == f.read() def test_append_paired(tmpdir, fileformat, extension): r1 = [ dnaio.Sequence("s1", "ACGT", "HHHH"), dnaio.Sequence("s2", "CGCA", "8383"), ] r2 = [ dnaio.Sequence("t1", "TCGT", "5HHH"), dnaio.Sequence("t2", "TGCA", "5383"), ] path1 = str(tmpdir / ("out.1." + fileformat + extension)) path2 = str(tmpdir / ("out.2." + fileformat + extension)) with dnaio.open(path1, file2=path2, fileformat=fileformat, mode="w") as f: f.write(r1[0], r2[0]) with dnaio.open(path1, file2=path2, fileformat=fileformat, mode="a") as f: f.write(r1[1], r2[1]) with xopen(path1) as f: assert formatted_sequences(r1, fileformat) == f.read() with xopen(path2) as f: assert formatted_sequences(r2, fileformat) == f.read() def test_append_interleaved(tmpdir, fileformat, extension): r1 = [ dnaio.Sequence("s1", "ACGT", "HHHH"), dnaio.Sequence("s2", "CGCA", "8383"), ] r2 = [ dnaio.Sequence("t1", "TCGT", "5HHH"), dnaio.Sequence("t2", "TGCA", "5383"), ] path = str(tmpdir / ("out.interleaved." + fileformat + extension)) with dnaio.open(path, interleaved=True, fileformat=fileformat, mode="w") as f: f.write(r1[0], r2[0]) with dnaio.open(path, interleaved=True, fileformat=fileformat, mode="a") as f: f.write(r1[1], r2[1]) expected = [r1[0], r2[0], r1[1], r2[1]] with xopen(path) as f: assert formatted_sequences(expected, fileformat) == f.read() def make_random_fasta(path, n_records): from random import choice with xopen(path, "w") as f: for i in range(n_records): name = "sequence_{}".format(i) sequence = "".join(choice("ACGT") for _ in range(300)) print(">", name, "\n", sequence, sep="", file=f) def test_islice_gzip_does_not_fail(tmpdir): path = str(tmpdir / "file.fasta.gz") make_random_fasta(path, 100) f = dnaio.open(path) next(iter(f)) f.close() dnaio-0.7.1/tests/test_util.py000066400000000000000000000003131417420163400163400ustar00rootroot00000000000000from dnaio._util import shorten def test_shorten(): assert shorten(None) is None assert shorten("hello too long", 5) == "he..." assert shorten("hello not too long") == "hello not too long" dnaio-0.7.1/tox.ini000066400000000000000000000012121417420163400141220ustar00rootroot00000000000000[tox] envlist = flake8,mypy,py36,py37,py38,py39,py310 isolated_build = True [testenv] deps = pytest coverage commands = coverage run --concurrency=multiprocessing -m pytest --doctest-modules --pyargs tests/ coverage combine coverage report setenv = PYTHONDEVMODE = 1 [testenv:flake8] basepython = python3.7 deps = flake8 commands = flake8 src/ tests/ [testenv:mypy] basepython = python3.7 deps = mypy commands = mypy src/ [coverage:run] branch = True parallel = True include = */site-packages/dnaio/* tests/* [coverage:paths] source = src/ */site-packages/ [flake8] max-line-length = 110 max-complexity = 15