pax_global_header00006660000000000000000000000064135611041750014515gustar00rootroot0000000000000052 comment=24f94975f2ac09f682a5486cc08fda4e6727fbcc dnaio-0.4.1/000077500000000000000000000000001356110417500126115ustar00rootroot00000000000000dnaio-0.4.1/.editorconfig000066400000000000000000000001451356110417500152660ustar00rootroot00000000000000[*.{py,pyx}] charset=utf-8 end_of_line=lf insert_final_newline=true indent_style=space indent_size=4 dnaio-0.4.1/.gitattributes000066400000000000000000000000341356110417500155010ustar00rootroot00000000000000*.fastq -crlf *.fasta -crlf dnaio-0.4.1/.gitignore000066400000000000000000000002141356110417500145760ustar00rootroot00000000000000__pycache__ /.cache/ /venv/ /build/ /.pytest_cache/ /MANIFEST /dist/ /src/*/_*.c /src/*/*.so /src/*.egg-info/ /.tox/ /src/dnaio/_version.py dnaio-0.4.1/.travis.yml000066400000000000000000000016521356110417500147260ustar00rootroot00000000000000language: python dist: xenial cache: directories: - $HOME/.cache/pip python: - "3.4" - "3.5" - "3.6" - "3.7" - "3.8" - "nightly" install: - pip install --upgrade coverage codecov - pip install .[dev] script: - coverage run -m pytest after_success: - coverage combine - codecov env: global: # - TWINE_REPOSITORY_URL=https://test.pypi.org/legacy/ - TWINE_USERNAME=marcelm # TWINE_PASSWORD is set in Travis settings jobs: include: - stage: deploy services: - docker python: "3.6" install: python3 -m pip install Cython twine if: tag IS present script: - | python3 setup.py sdist ./buildwheels.sh ls -l dist/ python3 -m twine upload dist/* - name: flake8 python: "3.6" install: python3 -m pip install flake8 script: flake8 src/ tests/ allow_failures: - python: "nightly" dnaio-0.4.1/LICENSE000066400000000000000000000021041356110417500136130ustar00rootroot00000000000000Copyright (c) 2010-2018 Marcel Martin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the 
Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. dnaio-0.4.1/MANIFEST.in000066400000000000000000000001541356110417500143470ustar00rootroot00000000000000include README.md include LICENSE include src/dnaio/*.c include versioneer.py include src/dnaio/_version.py dnaio-0.4.1/NOTES.md000066400000000000000000000030141356110417500140210ustar00rootroot00000000000000- compressed - paired-end - interleaved - chunked - FASTA, FASTQ - BAM? import dnaio with dnaio.open('input.fastq.gz') as f: for record in f: print(record....) with dnaio.open('input.1.fastq.gz', 'input.2.fastq.gz') as f: for record in f: print(record....) Use cases - open FASTQ from path - open FASTA from path - open compressed FASTA or FASTQ (.gz, .bz2, .xz) - open paired-end data - open interleaved data - open file-like object (such as sys.stdin) - use custom sequence record class - autodetect file format from contents - write FASTQ - write FASTA - read FASTQ/FASTA chunks (multiple records) Issues - Binary vs text - Should SequenceRecord be immutable? 
TODO - Sequence.name should be Sequence.description or so (reserve .name for the part before the first space) - optimize writing - Documentation - Line endings - second header FASTQ chunks - need an index attribute - need a line_number attribute # API ## Advertised - dnaio.open - Sequence(Record) - possibly SequencePair/PairedSequence? ## Reader - FastqReader - FastaReader - PairedSequenceReader -> rename to PairedFastqReader? - InterleavedSequenceReader -> rename to InterleavedFastqReader ## Writing class FastqWriter class FastaWriter class PairedSequenceWriter class InterleavedSequenceWriter ## Chunking def find_fasta_record_end(buf, end): def find_fastq_record_end(buf, end=None): def read_chunks_from_file(f, buffer_size=4*1024**2): def read_paired_chunks(f, f2, buffer_size=4*1024**2): head fastq_head two_fastq_heads dnaio-0.4.1/README.md000066400000000000000000000030241356110417500140670ustar00rootroot00000000000000[![Travis](https://travis-ci.org/marcelm/dnaio.svg?branch=master)](https://travis-ci.org/marcelm/dnaio) [![PyPI](https://img.shields.io/pypi/v/dnaio.svg?branch=master)](https://pypi.python.org/pypi/dnaio) [![Codecov](https://codecov.io/gh/marcelm/dnaio/branch/master/graph/badge.svg)](https://codecov.io/gh/marcelm/dnaio) # dnaio parses FASTQ and FASTA `dnaio` is a Python 3 library for fast parsing of FASTQ and also FASTA files. The code was previously part of the [Cutadapt](https://cutadapt.readthedocs.io/) tool and has been improved since it has been split out. 
## Example usage The main interface is the `dnaio.open` function: import dnaio with dnaio.open('reads.fastq.gz') as f: bp = 0 for record in f: bp += len(record) print(f'The input file contains {bp/1E6:.1f} Mbp') ## Features and supported file types - FASTQ input and output - FASTA input and output - Compressed input and output (`.gz`, `.bz2` and `.xz`, detected automatically) - Paired-end data in two files - Interleaved paired-end data in a single file - Files with DOS/Windows linebreaks can be read - FASTQ files with a second header line (after the `+`) are supported # Limitations - Multi-line FASTQ files are not supported. You shouldn’t use them anyway. - FASTQ parsing is the focus of this library. The FASTA parser is not as optimized. # Links * [Source code](https://github.com/marcelm/dnaio/) * [Report an issue](https://github.com/marcelm/dnaio/issues) * [Project page on PyPI](https://pypi.python.org/pypi/dnaio/) dnaio-0.4.1/buildwheels.sh000077500000000000000000000023531356110417500154620ustar00rootroot00000000000000#!/bin/bash # # Build manylinux1 wheels. Based on the example at # # # For interactive tests: # docker run -it -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /bin/bash set -xeuo pipefail # For convenience, if this script is called from outside of a docker container, # it starts a container and runs itself inside of it. if ! 
def no_cythonize(extensions, **_ignore):
    """
    Make the extensions refer to generated C/C++ files instead of Cython sources.

    For every extension, each source ending in .pyx or .py is replaced by the
    file of the same name ending in .cpp (if the extension's language is
    'c++') or .c (otherwise). Other sources are kept as they are. The
    ``sources`` lists are modified in place; nothing is returned.

    Any keyword arguments are accepted and ignored.
    """
    for module in extensions:
        rewritten = []
        for source in module.sources:
            stem, suffix = os.path.splitext(source)
            if suffix not in ('.pyx', '.py'):
                rewritten.append(source)
                continue
            rewritten.append(stem + ('.cpp' if module.language == 'c++' else '.c'))
        # Slice assignment keeps the identity of the original list object
        module.sources[:] = rewritten
class SDist(_sdist):
    """sdist command that runs cythonize() on the extensions before packaging."""

    def run(self):
        # Imported here rather than at module level, so Cython is only
        # needed when this command is actually executed.
        from Cython.Build import cythonize as regenerate_c_sources

        # Make sure the compiled Cython files in the distribution are up-to-date
        regenerate_c_sources(extensions)
        super().run()
try:
    # os.fspath exists in Python 3.6+
    from os import fspath
except ImportError:
    def fspath(path):
        """
        Fallback for os.fspath() on Python versions that do not provide it.

        Objects with a __fspath__ method are converted by calling it,
        pathlib.Path objects are converted with str(), and anything else
        is returned unchanged.
        """
        if not hasattr(path, "__fspath__"):
            # Python 3.4 and 3.5 do not support the file system path protocol,
            # so pathlib.Path has to be special-cased.
            return str(path) if isinstance(path, pathlib.Path) else path
        return path.__fspath__()
def _detect_format_from_name(name):
    """
    Guess the sequence file format from a file name.

    name -- file name. Matching is case-insensitive, and one trailing
        compression extension (.gz, .xz or .bz2) is ignored.

    Return 'fasta', 'fastq' or None if the format could not be detected.
    """
    lowered = name.lower()
    # Remove at most one compression extension
    compression = next(
        (suffix for suffix in ('.gz', '.xz', '.bz2') if lowered.endswith(suffix)), None)
    if compression is not None:
        lowered = lowered[:-len(compression)]

    stem, suffix = os.path.splitext(lowered)
    if suffix in ('.fasta', '.fa', '.fna', '.csfasta', '.csfa'):
        return 'fasta'
    if suffix in ('.fastq', '.fq'):
        return 'fastq'
    # Names of the form "*_sequence.txt" are also treated as FASTQ
    if suffix == '.txt' and stem.endswith('_sequence'):
        return 'fastq'
    return None
def _detect_format_from_content(file):
    """
    Guess the file format from the first byte of an open binary file.

    file -- file-like object opened in binary mode. It must either be
        seekable or provide a peek() method.

    For seekable files, the read position is restored to where it was
    before the call.

    Return 'fasta', 'fastq' or None if the first byte is not recognized.
    Empty input is reported as 'fastq'.
    """
    if file.seekable():
        # Restore the absolute position instead of seeking one byte backwards:
        # on empty input, nothing has been read and a relative seek(-1, 1)
        # would target a negative offset, which regular files reject with
        # an OSError.
        position = file.tell()
        first_char = file.read(1)
        file.seek(position)
    else:
        first_char = file.peek(1)[0:1]
    formats = {
        b'@': 'fastq',
        b'>': 'fasta',
        b'#': 'fasta',  # Some FASTA variants allow comments
        b'': 'fastq',  # Pretend FASTQ for empty input
    }
    return formats.get(first_char, None)
class InterleavedSequenceReader:
    """
    Reader for paired-end data stored in a single interleaved file.

    Records are taken from the underlying single-file reader two at a time
    and delivered as pairs.
    """
    paired = True

    def __init__(self, file, fileformat=None, opener=xopen):
        """
        file, fileformat and opener are passed on to _open_single(), which
        creates the underlying reader.
        """
        single_reader = _open_single(file, opener=opener, fileformat=fileformat)
        self.reader = single_reader
        self.delivers_qualities = single_reader.delivers_qualities

    def __iter__(self):
        """
        Iterate over the paired reads. Each item is a tuple of two records.

        Raise a FileFormatError if the last record has no partner or if the
        names of the two records of a pair do not match.
        """
        records = iter(self.reader)
        while True:
            try:
                first = next(records)
            except StopIteration:
                # Regular end of input: an even number of records was read
                return
            try:
                second = next(records)
            except StopIteration:
                raise FileFormatError(
                    "Interleaved input file incomplete: Last record "
                    "{!r} has no partner.".format(first.name), line=None) from None
            names_agree = _sequence_names_match(first, second)
            if not names_agree:
                raise FileFormatError(
                    "Reads are improperly paired. Name {!r} "
                    "(first) does not match {!r} (second).".format(first.name, second.name),
                    line=None)
            yield (first, second)

    def close(self):
        """Close the underlying reader."""
        self.reader.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
def paired_fastq_heads(bytes_or_bytearray buf1, bytes_or_bytearray buf2, Py_ssize_t end1, Py_ssize_t end2):
    """
    Find matching record boundaries in two buffers of FASTQ data.

    buf1, buf2 -- bytes or bytearray objects
    end1, end2 -- only buf1[:end1] and buf2[:end2] are examined. These
        values are not checked against the buffer lengths in this function.

    The two buffers are scanned in lockstep, one newline-terminated line
    at a time, until one of them has no further complete line.

    Return a tuple (length1, length2) such that buf1[:length1] and
    buf2[:length2] contain the same number of lines and that number is
    divisible by four. Both values are 0 if no group of four complete lines
    was found in both buffers.
    """
    cdef:
        Py_ssize_t pos1 = 0, pos2 = 0
        # Number of lines consumed from *both* buffers since the last boundary
        Py_ssize_t linebreaks = 0
        unsigned char* data1 = buf1
        unsigned char* data2 = buf2
        # Positions just after the most recent group of four lines
        Py_ssize_t record_start1 = 0
        Py_ssize_t record_start2 = 0

    while True:
        # Advance to the next newline in the first buffer
        while pos1 < end1 and data1[pos1] != b'\n':
            pos1 += 1
        if pos1 == end1:
            # No complete line left in the first buffer
            break
        pos1 += 1
        # Advance to the next newline in the second buffer
        while pos2 < end2 and data2[pos2] != b'\n':
            pos2 += 1
        if pos2 == end2:
            # No complete line left in the second buffer
            break
        pos2 += 1
        linebreaks += 1
        if linebreaks == 4:
            # Four lines were consumed from each buffer: remember this
            # position as the latest common boundary.
            linebreaks = 0
            record_start1 = pos1
            record_start2 = pos2

    # Hit the end of the data block
    return record_start1, record_start2
readinto = file.readinto bufstart = 0 # The input file is processed in chunks that each fit into buf while True: assert bufstart < len(buf_view) bufend = readinto(buf_view[bufstart:]) + bufstart if bufstart == bufend: # End of file if bufstart > 0 and buf_view[bufstart-1] != b'\n': # There is still data in the buffer and its last character is # not a newline: This is a file that is missing the final # newline. Append a newline and continue. buf_view[bufstart] = b'\n' bufstart += 1 bufend += 1 extra_newline = True else: break # Parse all complete FASTQ records in this chunk pos = 0 record_start = 0 while True: # Parse the name (line 0) if c_buf[pos] != b'@': raise FastqFormatError("Line expected to " "start with '@', but found {!r}".format(chr(c_buf[pos])), line=n_records * 4) pos += 1 while pos < bufend and c_buf[pos] != b'\n': pos += 1 if pos == bufend: break endskip = 1 if c_buf[pos-1] == b'\r' else 0 name_length = pos - endskip - record_start - 1 name_encoded = c_buf + record_start + 1 # .decode('latin-1') is 50% faster than .decode('ascii') name = c_buf[record_start+1:pos-endskip].decode('latin-1') pos += 1 # Parse the sequence (line 1) sequence_start = pos while pos < bufend and c_buf[pos] != b'\n': pos += 1 if pos == bufend: break endskip = 1 if c_buf[pos-1] == b'\r' else 0 sequence = c_buf[sequence_start:pos-endskip].decode('latin-1') sequence_length = pos - endskip - sequence_start pos += 1 # Parse second header (line 2) second_header_start = pos if pos == bufend: break if c_buf[pos] != b'+': raise FastqFormatError("Line expected to " "start with '+', but found {!r}".format(chr(c_buf[pos])), line=n_records * 4 + 2) pos += 1 # skip over the '+' while pos < bufend and c_buf[pos] != b'\n': pos += 1 if pos == bufend: break endskip = 1 if c_buf[pos-1] == b'\r' else 0 second_header_length = pos - endskip - second_header_start - 1 if second_header_length == 0: second_header = False else: if (name_length != second_header_length or 
def shorten(s, n=100):
    """
    Shorten string s to at most n characters, appending "..." if necessary.

    If s is longer than n, its first n-3 characters followed by "..." are
    returned. Otherwise s is returned unchanged; this includes None.
    """
    if s is None or len(s) <= n:
        return s
    return s[:n-3] + '...'
def read_paired_chunks(f, f2, buffer_size=4*1024**2):
    """
    Read chunks of complete FASTQ records from two files in lockstep.

    f, f2 -- files opened in binary mode (they need a readinto method)
    buffer_size -- size of each of the two internal buffers

    Yield tuples (chunk1, chunk2) of memoryview objects. The split points
    are determined by _paired_fastq_heads(). Data that remains in the
    buffers at the end of both files is yielded as a final pair.

    The views refer to two internal buffers that are re-used, so their
    content is overwritten on the next iteration.

    Raise a ValueError if buffer_size is less than one or if a buffer fills
    up completely with leftover data, and a FileFormatError if one of the
    files does not start with '@'.
    """
    if buffer_size < 1:
        raise ValueError("Buffer size too small")
    first_buf = bytearray(buffer_size)
    second_buf = bytearray(buffer_size)

    # Read one byte to make sure we are processing FASTQ.
    # leftoverN counts the bytes at the start of buffer N that are not
    # yet part of a yielded chunk.
    leftover1 = f.readinto(memoryview(first_buf)[0:1])
    leftover2 = f2.readinto(memoryview(second_buf)[0:1])
    first_is_not_fastq = leftover1 == 1 and first_buf[0:1] != b'@'
    second_is_not_fastq = leftover2 == 1 and second_buf[0:1] != b'@'
    if first_is_not_fastq or second_is_not_fastq:
        raise FileFormatError(
            "Paired-end data must be in FASTQ format when using multiple cores", line=None)

    while True:
        if leftover1 == len(first_buf) or leftover2 == len(second_buf):
            raise ValueError("FASTQ record does not fit into buffer")
        # Fill up both buffers behind the leftover data
        filled1 = f.readinto(memoryview(first_buf)[leftover1:]) + leftover1
        filled2 = f2.readinto(memoryview(second_buf)[leftover2:]) + leftover2
        if leftover1 == filled1 and leftover2 == filled2:
            # Neither file delivered any more data
            break

        complete1, complete2 = _paired_fastq_heads(first_buf, second_buf, filled1, filled2)
        assert complete1 <= filled1
        assert complete2 <= filled2

        if complete1 > 0 or complete2 > 0:
            yield (memoryview(first_buf)[0:complete1], memoryview(second_buf)[0:complete2])

        # Move the unprocessed tail of each buffer to its beginning
        leftover1 = filled1 - complete1
        assert leftover1 >= 0
        first_buf[0:leftover1] = first_buf[complete1:filled1]
        leftover2 = filled2 - complete2
        assert leftover2 >= 0
        second_buf[0:leftover2] = second_buf[complete2:filled2]

    if leftover1 > 0 or leftover2 > 0:
        yield (memoryview(first_buf)[0:leftover1], memoryview(second_buf)[0:leftover2])
""" if isinstance(file, str): file = opener(file, self.mode) self._close_on_exit = True elif _close_file: self._close_on_exit = True self._file = file def close(self): if self._close_on_exit and self._file is not None: self._file.close() self._file = None def __enter__(self): if self._file is None: raise ValueError("I/O operation on closed BinaryFileReader") return self def __exit__(self, *args): self.close() class FastaReader(BinaryFileReader): """ Reader for FASTA files. """ def __init__(self, file, keep_linebreaks=False, sequence_class=Sequence, opener=xopen, _close_file=None): """ file is a path or a file-like object. In both cases, the file may be compressed (.gz, .bz2, .xz). keep_linebreaks -- whether to keep newline characters in the sequence """ super().__init__(file, opener=opener, _close_file=_close_file) self.sequence_class = sequence_class self.delivers_qualities = False self._delimiter = '\n' if keep_linebreaks else '' def __iter__(self): """ Read next entry from the file (single entry at a time). """ name = None seq = [] f = io.TextIOWrapper(self._file) for i, line in enumerate(f): # strip() also removes DOS line breaks line = line.strip() if not line: continue if line and line[0] == '>': if name is not None: yield self.sequence_class(name, self._delimiter.join(seq), None) name = line[1:] seq = [] elif line and line[0] == '#': continue elif name is not None: seq.append(line) else: raise FastaFormatError( "Expected '>' at beginning of record, but got {!r}." .format(_shorten(line)), line=i) if name is not None: yield self.sequence_class(name, self._delimiter.join(seq), None) # Prevent TextIOWrapper from closing the underlying file f.detach() class FastqReader(BinaryFileReader): """ Reader for FASTQ files. Does not support multi-line FASTQ files. """ def __init__(self, file, sequence_class=Sequence, buffer_size=1048576, opener=xopen, _close_file=None): """ file is a filename or a file-like object. If file is a filename, then .gz files are supported. 
""" super().__init__(file, opener=opener, _close_file=_close_file) self.sequence_class = sequence_class self.delivers_qualities = True self.buffer_size = buffer_size # The first value yielded by _fastq_iter indicates # whether the file has repeated headers self._iter = _fastq_iter(self._file, self.sequence_class, self.buffer_size) try: self.two_headers = next(self._iter) assert self.two_headers in (True, False) except StopIteration: # Empty file self.two_headers = False self._iter = iter(()) def __iter__(self): return self._iter dnaio-0.4.1/src/dnaio/writers.py000066400000000000000000000060151356110417500165450ustar00rootroot00000000000000from xopen import xopen class FileWriter: def __init__(self, file, opener=xopen, _close_file=None): self._file = file if isinstance(file, str): self._file = opener(file, "wb") self._close_on_exit = True else: self._close_on_exit = bool(_close_file) def close(self): if self._close_on_exit: self._file.close() def __enter__(self): if self._file.closed: raise ValueError("I/O operation on closed file") return self def __exit__(self, *args): self.close() class FastaWriter(FileWriter): """ Write FASTA-formatted sequences to a file. """ def __init__(self, file, line_length=None, opener=xopen, _close_file=None): """ If line_length is not None, the lines will be wrapped after line_length characters. """ super().__init__(file, opener=opener, _close_file=_close_file) self.line_length = line_length if line_length != 0 else None def write(self, name_or_record, sequence=None): """Write an entry to the the FASTA file. If only one parameter (name_or_record) is given, it must have attributes .name and .sequence, which are then used. Otherwise, the first parameter must be the name and the second the sequence. 
The effect is that you can write this: writer.write("name", "ACCAT") or writer.write(Sequence("name", "ACCAT")) """ if sequence is None: name = name_or_record.name sequence = name_or_record.sequence else: name = name_or_record if self.line_length is not None: self._file.write(('>' + name + '\n').encode('ascii')) s = [] for i in range(0, len(sequence), self.line_length): s.append(sequence[i:i + self.line_length] + '\n') self._file.write(''.join(s).encode('ascii')) else: s = '>' + name + '\n' + sequence + '\n' self._file.write(s.encode('ascii')) class FastqWriter(FileWriter): """ Write sequences with qualities in FASTQ format. FASTQ files are formatted like this: @read name SEQUENCE + QUALITIS """ file_mode = 'wb' def __init__(self, file, two_headers=False, opener=xopen, _close_file=None): super().__init__(file, opener=opener, _close_file=_close_file) self._two_headers = two_headers def write(self, record): """ Write a Sequence record to the FASTQ file. The record object must have attributes .name, .sequence and .qualities. 
""" name2 = record.name if self._two_headers else '' s = ('@' + record.name + '\n' + record.sequence + '\n+' + name2 + '\n' + record.qualities + '\n') self._file.write(s.encode('ascii')) def writeseq(self, name, sequence, qualities): self._file.write("@{0:s}\n{1:s}\n+\n{2:s}\n".format( name, sequence, qualities).encode('ascii')) dnaio-0.4.1/tests/000077500000000000000000000000001356110417500137535ustar00rootroot00000000000000dnaio-0.4.1/tests/data/000077500000000000000000000000001356110417500146645ustar00rootroot00000000000000dnaio-0.4.1/tests/data/dos.fastq000066400000000000000000000004371356110417500165150ustar00rootroot00000000000000@prefix:1_13_573/1 CGTCCGAANTAGCTACCACCCTGATTAGACAAAT + )3%)&&&&!.1&(6:<'67..*,:75)'77&&&5 @prefix:1_13_1259/1 AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT + ;<:&:A;A!9<<<,7:<=3=;:<&70<,=: dnaio-0.4.1/tests/data/interleaved.fastq000066400000000000000000000003251356110417500202260ustar00rootroot00000000000000@read1/1 some text TTATTTGTCTCCAGC + ##HHHHHHHHHHHHH @read1/2 other text GCTGGAGACAAATAA + HHHHHHHHHHHHHHH @read3/1 CCAACTTGATATTAATAACA + HHHHHHHHHHHHHHHHHHHH @read3/2 TGTTATTAATATCAAGTTGG + #HHHHHHHHHHHHHHHHHHH dnaio-0.4.1/tests/data/missingextension000066400000000000000000000004231356110417500202140ustar00rootroot00000000000000@prefix:1_13_573/1 CGTCCGAANTAGCTACCACCCTGATTAGACAAAT + )3%)&&&&!.1&(6:<'67..*,:75)'77&&&5 @prefix:1_13_1259/1 AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT + ;<:&:A;A!9<<<,7:<=3=;:<&70<,=: dnaio-0.4.1/tests/data/missingextension.gz000066400000000000000000000003321356110417500206320ustar00rootroot00000000000000Osmall.fastq]1n0 @]Pn '#:ngjlE"~|nie)2o*">bfp2Dݔ/0Mo#Bz_pO|qEi撪6|ؤم0".D4RkAAR|SfW\ܖ;~Ȼm\3t@}eͥ,x:@F%dnaio-0.4.1/tests/data/paired.1.fastq000066400000000000000000000004501356110417500173260ustar00rootroot00000000000000@read1/1 some text TTATTTGTCTCCAGCTTAGACATATCGCCT + ##HHHHHHHHHHHHHHHHHHHHHHHHHHHH @read2/1 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read3/1 CCAACTTGATATTAATAACATTAGACA + 
HHHHHHHHHHHHHHHHHHHHHHHHHHH @read4/1 GACAGGCCGTTTGAATGTTGACGGGATGTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH dnaio-0.4.1/tests/data/paired.2.fastq000066400000000000000000000004451356110417500173330ustar00rootroot00000000000000@read1/2 other text GCTGGAGACAAATAACAGTGGAGTAGTTTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read2/2 TGTGGCCTGTTGCAGTGGAGTAACTCCAGC + ###HHHHHHHHHHHHHHHHHHHHHHHHHHH @read3/2 TGTTATTAATATCAAGTTGGCAGTG + #HHHHHHHHHHHHHHHHHHHHHHHH @read4/2 CATCCCGTCAACATTCAAACGGCCTGTCCA + HH############################ dnaio-0.4.1/tests/data/simple.fasta000066400000000000000000000000661356110417500171770ustar00rootroot00000000000000>first_sequence SEQUENCE1 >second_sequence SEQUEN CE2 dnaio-0.4.1/tests/data/simple.fasta.bz2000066400000000000000000000001251356110417500176670ustar00rootroot00000000000000BZh91AY&SYf π0 *! TPM T$zM芯rH ` ьwKI.p dnaio-0.4.1/tests/data/simple.fasta.gz000066400000000000000000000001111356110417500176050ustar00rootroot00000000000000f0[simple.fastaK,*./N-,MKN v usv5+NMKArv5E|6dnaio-0.4.1/tests/data/simple.fasta.xz000066400000000000000000000001541356110417500176350ustar00rootroot000000000000007zXZִF!t/5-]'V$@T-ZMaFy38Db`2fI6ԇ}YZdnaio-0.4.1/tests/data/simple.fastq000066400000000000000000000001151356110417500172120ustar00rootroot00000000000000@first_sequence SEQUENCE1 + :6;;8<=:< @second_sequence SEQUENCE2 + 83/J z)„)?hdnaio-0.4.1/tests/data/simple.fastq.gz000066400000000000000000000001351356110417500176330ustar00rootroot00000000000000'[simple.fastqsH,*./N-,MKN v usv52r(NMKTfTfalcooaf>Mdnaio-0.4.1/tests/data/simple.fastq.xz000066400000000000000000000002001356110417500176450ustar00rootroot000000000000007zXZִF!t/LA] 'V$@ƹY#ܿX,lj&4ߵx+OMᄎL%v^.4QmTQuzX !]M'}YZdnaio-0.4.1/tests/data/small.fastq000066400000000000000000000004231356110417500170330ustar00rootroot00000000000000@prefix:1_13_573/1 CGTCCGAANTAGCTACCACCCTGATTAGACAAAT + )3%)&&&&!.1&(6:<'67..*,:75)'77&&&5 @prefix:1_13_1259/1 AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT + ;<:&:A;A!9<<<,7:<=3=;:<&70<,=: 
dnaio-0.4.1/tests/data/with_comment.fasta000066400000000000000000000001201356110417500203720ustar00rootroot00000000000000# a comment # another one >first_sequence SEQUENCE1 >second_sequence SEQUEN CE2 dnaio-0.4.1/tests/data/withplus.fastq000066400000000000000000000001541356110417500176030ustar00rootroot00000000000000@first_sequence SEQUENCE1 +this is different :6;;8<=:< @second_sequence SEQUENCE2 +also different 83r1\nACGACGACG\n']: assert [m.tobytes() for m in read_chunks(BytesIO(data))] == [data] # Buffer too small with raises(OverflowError): list(read_chunks(BytesIO(data), buffer_size=4)) def test_read_chunks_empty(): assert list(read_chunks(BytesIO(b''))) == [] dnaio-0.4.1/tests/test_internal.py000066400000000000000000000425611356110417500172100ustar00rootroot00000000000000import os import shutil import subprocess import sys from io import BytesIO from tempfile import mkdtemp from textwrap import dedent from pytest import raises, mark import dnaio from dnaio import ( FileFormatError, FastaFormatError, FastqFormatError, FastaReader, FastqReader, InterleavedSequenceReader, FastaWriter, FastqWriter, InterleavedSequenceWriter, PairedSequenceReader) from dnaio import _sequence_names_match, Sequence # files tests/data/simple.fast{q,a} simple_fastq = [ Sequence("first_sequence", "SEQUENCE1", ":6;;8<=:<"), Sequence("second_sequence", "SEQUENCE2", "83first_sequence\nSEQUENCE1\n>second_sequence\nSEQUENCE2\n") reads = list(FastaReader(fasta)) assert reads == simple_fasta def test_with_comments(self): fasta = BytesIO(dedent( """ # a comment # another one >first_sequence SEQUENCE1 >second_sequence SEQUENCE2 """).encode()) reads = list(FastaReader(fasta)) assert reads == simple_fasta def test_wrong_format(self): fasta = BytesIO(dedent( """# a comment # another one unexpected >first_sequence SEQUENCE1 >second_sequence SEQUENCE2 """).encode()) with raises(FastaFormatError) as info: list(FastaReader(fasta)) assert info.value.line == 2 def test_fastareader_keeplinebreaks(self): 
with FastaReader("tests/data/simple.fasta", keep_linebreaks=True) as f: reads = list(f) assert reads[0] == simple_fasta[0] assert reads[1].sequence == 'SEQUEN\nCE2' def test_context_manager(self): filename = "tests/data/simple.fasta" with open(filename, 'rb') as f: assert not f.closed _ = list(dnaio.open(f)) assert not f.closed assert f.closed with FastaReader(filename) as sr: tmp_sr = sr assert not sr._file.closed _ = list(sr) assert not sr._file.closed assert tmp_sr._file is None # Open it a second time with FastaReader(filename): pass class TestFastqReader: def test_fastqreader(self): with FastqReader("tests/data/simple.fastq") as f: reads = list(f) assert reads == simple_fastq @mark.parametrize("buffer_size", [1, 2, 3, 5, 7, 10, 20]) def test_fastqreader_buffersize(self, buffer_size): with FastqReader("tests/data/simple.fastq", buffer_size=buffer_size) as f: reads = list(f) assert reads == simple_fastq def test_fastqreader_buffersize_too_small(self): with raises(ValueError): with FastqReader("tests/data/simple.fastq", buffer_size=0) as f: _ = list(f) # pragma: no cover def test_fastqreader_dos(self): # DOS line breaks with open('tests/data/dos.fastq', 'rb') as f: assert b'\r\n' in f.read() with FastqReader("tests/data/dos.fastq") as f: dos_reads = list(f) with FastqReader("tests/data/small.fastq") as f: unix_reads = list(f) assert dos_reads == unix_reads def test_fastq_wrongformat(self): with raises(FastqFormatError) as info: with FastqReader("tests/data/withplus.fastq") as f: list(f) # pragma: no cover assert info.value.line == 2 def test_empty_fastq(self): with FastqReader(BytesIO(b'')) as fq: assert list(fq) == [] @mark.parametrize('s,line', [ (b'@', 0), (b'@r', 0), (b'@r1', 0), (b'@r1\n', 1), (b'@r1\nA', 1), (b'@r1\nAC', 1), (b'@r1\nACG', 1), (b'@r1\nACG\n', 2), (b'@r1\nACG\n+', 2), (b'@r1\nACG\n+\n', 3), (b'@r1\nACG\n+\nH', 3), (b'@r1\nACG\n+\nHH', 3), (b'@r1\nACG\n+\nHHH\n@', 4), (b'@r1\nACG\n+\nHHH\n@r', 4), (b'@r1\nACG\n+\nHHH\n@r2', 4), 
(b'@r1\nACG\n+\nHHH\n@r2\n', 5), (b'@r1\nACG\n+\nHHH\n@r2\nT', 5), (b'@r1\nACG\n+\nHHH\n@r2\nT\n', 6), (b'@r1\nACG\n+\nHHH\n@r2\nT\n+', 6), (b'@r1\nACG\n+\nHHH\n@r2\nT\n+\n', 7), ]) def test_fastq_incomplete(self, s, line): fastq = BytesIO(s) with raises(FastqFormatError) as info: with FastqReader(fastq) as fq: list(fq) assert info.value.line == line def test_half_record_line_numbers(self): fastq = BytesIO(b'@r\nACG\n+\nHH\n') # Choose the buffer size such that only parts of the record fit # We want to ensure that the line number is reset properly # after the record has been half-parsed buffer_size = len('@r\nACG\n+\n') with raises(FastqFormatError) as info: with FastqReader(fastq, buffer_size=buffer_size) as fq: list(fq) # pragma: no cover assert 'Length of sequence and qualities differ' in info.value.message assert info.value.line == 3 @mark.parametrize('s,line', [ (b'@r1\nACG\n+\nH#HH\n@r2\nT\n+\nH\n', 3), (b'@r1\nACG\n+\n#H\n@r2\nT\n+\nH\n', 3), (b'@r1\nACG\n+\nHHH\n@r2\nT\n+\nHH\n', 7), (b'@r1\nACG\n+\nHHH\n@r2\nT\n+\n\n', 7), ]) def test_differing_lengths(self, s, line): fastq = BytesIO(s) with raises(FastqFormatError) as info: with FastqReader(fastq) as fq: list(fq) assert info.value.line == line def test_missing_final_newline(self): # Files with a missing final newline are currently allowed fastq = BytesIO(b'@r1\nA\n+\nH') with dnaio.open(fastq) as f: records = list(f) assert records == [Sequence('r1', 'A', 'H')] def test_not_opened_as_binary(self): filename = 'tests/data/simple.fastq' with open(filename, 'rt') as f: with raises(ValueError): list(dnaio.open(f)) def test_context_manager(self): filename = "tests/data/simple.fastq" with open(filename, 'rb') as f: assert not f.closed _ = list(dnaio.open(f)) assert not f.closed assert f.closed with FastqReader(filename) as sr: tmp_sr = sr assert not sr._file.closed _ = list(sr) assert not sr._file.closed assert tmp_sr._file is None def test_two_header_detection(self): fastq = 
BytesIO(b'@r1\nACG\n+r1\nHHH\n@r2\nT\n+r2\n#\n') with FastqReader(fastq) as fq: assert fq.two_headers list(fq) fastq = BytesIO(b'@r1\nACG\n+\nHHH\n@r2\nT\n+r2\n#\n') with FastqReader(fastq) as fq: assert not fq.two_headers list(fq) def test_second_header_not_equal(self): fastq = BytesIO(b'@r1\nACG\n+xy\n') with raises(FastqFormatError) as info: with FastqReader(fastq) as fq: list(fq) # pragma: no cover assert "Sequence descriptions don't match" in info.value.message class TestOpen: def setup(self): self._tmpdir = mkdtemp() def teardown(self): shutil.rmtree(self._tmpdir) def test_sequence_reader(self): # test the autodetection with dnaio.open("tests/data/simple.fastq") as f: reads = list(f) assert reads == simple_fastq with dnaio.open("tests/data/simple.fasta") as f: reads = list(f) assert reads == simple_fasta with open("tests/data/simple.fastq", 'rb') as f: reads = list(dnaio.open(f)) assert reads == simple_fastq # make the name attribute unavailable f = BytesIO(open("tests/data/simple.fastq", 'rb').read()) reads = list(dnaio.open(f)) assert reads == simple_fastq f = BytesIO(open("tests/data/simple.fasta", 'rb').read()) reads = list(dnaio.open(f)) assert reads == simple_fasta def test_autodetect_fasta_format(self, tmpdir): path = str(tmpdir.join('tmp.fasta')) with dnaio.open(path, mode='w') as f: assert isinstance(f, FastaWriter) for seq in simple_fastq: f.write(seq) with dnaio.open(path) as f: records = list(f) assert records == simple_fasta def test_write_qualities_to_fasta(self): path = os.path.join(self._tmpdir, 'tmp.fasta') with dnaio.open(path, mode='w', qualities=True) as f: assert isinstance(f, FastaWriter) for seq in simple_fastq: f.write(seq) with dnaio.open(path) as f: assert list(f) == simple_fasta def test_autodetect_fastq_format(self): path = os.path.join(self._tmpdir, 'tmp.fastq') with dnaio.open(path, mode='w') as f: assert isinstance(f, FastqWriter) for seq in simple_fastq: f.write(seq) with dnaio.open(path) as f: assert list(f) == simple_fastq 
def test_autodetect_fastq_weird_name(self): path = os.path.join(self._tmpdir, 'tmp.fastq.gz') with dnaio.open(path, mode='w') as f: assert isinstance(f, FastqWriter) for seq in simple_fastq: f.write(seq) weird_path = os.path.join(self._tmpdir, 'tmp.weird.gz') os.rename(path, weird_path) with dnaio.open(weird_path) as f: assert list(f) == simple_fastq def test_fastq_qualities_missing(self): path = os.path.join(self._tmpdir, 'tmp.fastq') with raises(ValueError): dnaio.open(path, mode='w', qualities=False) class TestInterleavedReader: def test(self): expected = [ ( Sequence('read1/1 some text', 'TTATTTGTCTCCAGC', '##HHHHHHHHHHHHH'), Sequence('read1/2 other text', 'GCTGGAGACAAATAA', 'HHHHHHHHHHHHHHH') ), ( Sequence('read3/1', 'CCAACTTGATATTAATAACA', 'HHHHHHHHHHHHHHHHHHHH'), Sequence('read3/2', 'TGTTATTAATATCAAGTTGG', '#HHHHHHHHHHHHHHHHHHH') ), ] with InterleavedSequenceReader("tests/data/interleaved.fastq") as isr: reads = list(isr) for (r1, r2), (e1, e2) in zip(reads, expected): print(r1, r2, e1, e2) assert reads == expected with dnaio.open("tests/data/interleaved.fastq", interleaved=True) as f: reads = list(f) assert reads == expected def test_missing_partner(self): s = BytesIO(b'@r1\nACG\n+\nHHH\n') with raises(FileFormatError) as info: with InterleavedSequenceReader(s) as isr: list(isr) assert "Interleaved input file incomplete" in info.value.message def test_incorrectly_paired(self): s = BytesIO(b'@r1/1\nACG\n+\nHHH\n@wrong_name\nTTT\n+\nHHH\n') with raises(FileFormatError) as info: with InterleavedSequenceReader(s) as isr: list(isr) assert "Reads are improperly paired" in info.value.message class TestFastaWriter: def setup(self): self._tmpdir = mkdtemp() self.path = os.path.join(self._tmpdir, 'tmp.fasta') def teardown(self): shutil.rmtree(self._tmpdir) def test(self): with FastaWriter(self.path) as fw: fw.write("name", "CCATA") fw.write("name2", "HELLO") assert fw._file.closed with open(self.path) as t: assert t.read() == '>name\nCCATA\n>name2\nHELLO\n' def 
test_linelength(self): with FastaWriter(self.path, line_length=3) as fw: fw.write("r1", "ACG") fw.write("r2", "CCAT") fw.write("r3", "TACCAG") assert fw._file.closed with open(self.path) as t: d = t.read() assert d == '>r1\nACG\n>r2\nCCA\nT\n>r3\nTAC\nCAG\n' def test_write_sequence_object(self): with FastaWriter(self.path) as fw: fw.write(Sequence("name", "CCATA")) fw.write(Sequence("name2", "HELLO")) assert fw._file.closed with open(self.path) as t: assert t.read() == '>name\nCCATA\n>name2\nHELLO\n' def test_write_to_file_like_object(self): bio = BytesIO() with FastaWriter(bio) as fw: fw.write(Sequence("name", "CCATA")) fw.write(Sequence("name2", "HELLO")) assert bio.getvalue() == b'>name\nCCATA\n>name2\nHELLO\n' assert not bio.closed assert not fw._file.closed def test_write_zero_length_sequence(self): bio = BytesIO() with FastaWriter(bio) as fw: fw.write(Sequence("name", "")) assert bio.getvalue() == b'>name\n\n', '{!r}'.format(bio.getvalue()) class TestFastqWriter: def setup(self): self._tmpdir = mkdtemp() self.path = os.path.join(self._tmpdir, 'tmp.fastq') def teardown(self): shutil.rmtree(self._tmpdir) def test(self): with FastqWriter(self.path) as fq: fq.writeseq("name", "CCATA", "!#!#!") fq.writeseq("name2", "HELLO", "&&&!&&") assert fq._file.closed with open(self.path) as t: assert t.read() == '@name\nCCATA\n+\n!#!#!\n@name2\nHELLO\n+\n&&&!&&\n' def test_twoheaders(self): with FastqWriter(self.path, two_headers=True) as fq: fq.write(Sequence("name", "CCATA", "!#!#!")) fq.write(Sequence("name2", "HELLO", "&&&!&")) assert fq._file.closed with open(self.path) as t: assert t.read() == '@name\nCCATA\n+name\n!#!#!\n@name2\nHELLO\n+name2\n&&&!&\n' def test_write_to_file_like_object(self): bio = BytesIO() with FastqWriter(bio) as fq: fq.writeseq("name", "CCATA", "!#!#!") fq.writeseq("name2", "HELLO", "&&&!&&") assert bio.getvalue() == b'@name\nCCATA\n+\n!#!#!\n@name2\nHELLO\n+\n&&&!&&\n' class TestInterleavedWriter: def test(self): reads = [ ( Sequence('A/1 
comment', 'TTA', '##H'), Sequence('A/2 comment', 'GCT', 'HH#') ), ( Sequence('B/1', 'CC', 'HH'), Sequence('B/2', 'TG', '#H') ), ] bio = BytesIO() with InterleavedSequenceWriter(bio) as writer: for read1, read2 in reads: writer.write(read1, read2) assert bio.getvalue() == ( b'@A/1 comment\nTTA\n+\n##H\n' b'@A/2 comment\nGCT\n+\nHH#\n' b'@B/1\nCC\n+\nHH\n' b'@B/2\nTG\n+\n#H\n' ) class TestPairedSequenceReader: def test_read(self): s1 = BytesIO(b'@r1\nACG\n+\nHHH\n') s2 = BytesIO(b'@r2\nGTT\n+\n858\n') with PairedSequenceReader(s1, s2) as psr: assert [ (Sequence("r1", "ACG", "HHH"), Sequence("r2", "GTT", "858")), ] == list(psr) def test_sequence_names_match(self): def match(name1, name2): seq1 = Sequence(name1, 'ACGT') seq2 = Sequence(name2, 'AACC') return _sequence_names_match(seq1, seq2) assert match('abc', 'abc') assert match('abc/1', 'abc/2') assert match('abc.1', 'abc.2') assert match('abc1', 'abc2') assert not match('abc', 'xyz') def test_missing_partner1(self): s1 = BytesIO(b'') s2 = BytesIO(b'@r1\nACG\n+\nHHH\n') with raises(FileFormatError) as info: with PairedSequenceReader(s1, s2) as psr: list(psr) assert "There are more reads in file 2 than in file 1" in info.value.message def test_missing_partner2(self): s1 = BytesIO(b'@r1\nACG\n+\nHHH\n') s2 = BytesIO(b'') with raises(FileFormatError) as info: with PairedSequenceReader(s1, s2) as psr: list(psr) assert "There are more reads in file 1 than in file 2" in info.value.message def test_incorrectly_paired(self): s1 = BytesIO(b'@r1/1\nACG\n+\nHHH\n') s2 = BytesIO(b'@wrong_name\nTTT\n+\nHHH\n') with raises(FileFormatError) as info: with PairedSequenceReader(s1, s2) as psr: list(psr) assert "Reads are improperly paired" in info.value.message @mark.parametrize('path', [ 'tests/data/simple.fastq', 'tests/data/dos.fastq', 'tests/data/simple.fasta', 'tests/data/with_comment.fasta', ]) def test_read_stdin(path): # Get number of records in the input file with dnaio.open(path) as f: expected = len(list(f)) # Use 'cat' to 
simulate that no file name is available for stdin of the subprocess cat = subprocess.Popen(['cat', path], stdout=subprocess.PIPE) py = subprocess.Popen( [sys.executable, 'tests/read_from_stdin.py'], stdin=cat.stdout, stdout=subprocess.PIPE) cat.stdout.close() # Check that the read_from_stdin.py script prints the correct number of records assert str(expected) == py.communicate()[0].decode().strip() dnaio-0.4.1/tests/test_open.py000066400000000000000000000210261356110417500163260ustar00rootroot00000000000000from pathlib import Path import dnaio from xopen import xopen import pytest @pytest.fixture(params=["", ".gz", ".bz2", ".xz"]) def extension(request): return request.param @pytest.fixture(params=["fasta", "fastq"]) def fileformat(request): return request.param SIMPLE_RECORDS = { "fasta": [ dnaio.Sequence("first_sequence", "SEQUENCE1"), dnaio.Sequence("second_sequence", "SEQUENCE2"), ], "fastq": [ dnaio.Sequence("first_sequence", "SEQUENCE1", ":6;;8<=:<"), dnaio.Sequence("second_sequence", "SEQUENCE2", "83{}\n{}\n".format(record.name, record.sequence) def formatted_sequences(records, fileformat): return "".join(formatted_sequence(record, fileformat) for record in records) def test_formatted_sequence(): s = dnaio.Sequence("s1", "ACGT", "HHHH") assert ">s1\nACGT\n" == formatted_sequence(s, "fasta") assert "@s1\nACGT\n+\nHHHH\n" == formatted_sequence(s, "fastq") def test_version(): _ = dnaio.__version__ def test_read(fileformat, extension): with dnaio.open("tests/data/simple." + fileformat + extension) as f: records = list(f) assert records == SIMPLE_RECORDS[fileformat] def test_read_pathlib_path(fileformat, extension): path = Path("tests/data/simple." 
+ fileformat + extension) with dnaio.open(path) as f: records = list(f) assert records == SIMPLE_RECORDS[fileformat] def test_read_opener(fileformat, extension): def my_opener(path, mode): import io if fileformat == "fasta": data = b">read\nACG\n" else: data = b"@read\nACG\n+\nHHH\n" return io.BytesIO(data) with dnaio.open("totally-ignored-filename." + fileformat + extension, opener=my_opener) as f: records = list(f) assert len(records) == 1 assert records[0].name == "read" assert records[0].sequence == "ACG" @pytest.mark.parametrize("interleaved", [False, True]) def test_paired_opener(fileformat, extension, interleaved): def my_opener(_path, _mode): import io if fileformat == "fasta": data = b">read\nACG\n" else: data = b"@read\nACG\n+\nHHH\n" return io.BytesIO(data + data) path1 = "ignored-filename." + fileformat + extension path2 = "also-ignored-filename." + fileformat + extension if interleaved: with dnaio.open(path1, file2=path2, opener=my_opener) as f: records = list(f) expected = 2 else: with dnaio.open(path1, interleaved=True, opener=my_opener) as f: records = list(f) expected = 1 assert len(records) == expected assert records[0][0].name == "read" assert records[0][0].sequence == "ACG" assert records[0][1].name == "read" assert records[0][1].sequence == "ACG" def test_detect_fastq_from_content(): """FASTQ file that is not named .fastq""" with dnaio.open('tests/data/missingextension') as f: record = next(iter(f)) assert record.name == 'prefix:1_13_573/1' def test_detect_compressed_fastq_from_content(): """Compressed FASTQ file that is not named .fastq.gz""" with dnaio.open('tests/data/missingextension.gz') as f: record = next(iter(f)) assert record.name == 'prefix:1_13_573/1' def test_write(tmpdir, extension): s = dnaio.Sequence('name', 'ACGT', 'HHHH') out_fastq = tmpdir.join("out.fastq" + extension) with dnaio.open(str(out_fastq), mode='w') as f: f.write(s) with xopen(out_fastq) as f: assert f.read() == '@name\nACGT\n+\nHHHH\n' def 
test_write_with_xopen(tmpdir, fileformat, extension): s = dnaio.Sequence('name', 'ACGT', 'HHHH') out_fastq = str(tmpdir.join("out." + fileformat + extension)) with xopen(out_fastq, 'wb') as outer_f: with dnaio.open(outer_f, mode='w', fileformat=fileformat) as f: f.write(s) with xopen(out_fastq) as f: if fileformat == "fasta": assert f.read() == ">name\nACGT\n" else: assert f.read() == "@name\nACGT\n+\nHHHH\n" def test_write_pathlib(tmpdir, fileformat, extension): s1 = dnaio.Sequence("s1", "ACGT", "HHHH") path = Path(str(tmpdir / ("out." + fileformat + extension))) with dnaio.open(path, mode="w") as f: f.write(s1) if fileformat == "fasta": expected = b">s1\nACGT\n" else: expected = b"@s1\nACGT\n+\nHHHH\n" with xopen(path, "rb") as f: assert f.read() == expected def test_write_paired_same_path(tmpdir): path1 = str(tmpdir / "same.fastq") path2 = str(tmpdir / "same.fastq") with pytest.raises(ValueError): with dnaio.open(file1=path1, file2=path2, mode="w"): pass def test_write_paired(tmpdir, fileformat, extension): r1 = [ dnaio.Sequence("s1", "ACGT", "HHHH"), dnaio.Sequence("s2", "CGCA", "8383"), ] r2 = [ dnaio.Sequence("t1", "TCGT", "5HHH"), dnaio.Sequence("t2", "TGCA", "5383"), ] path1 = str(tmpdir / ("out.1." + fileformat + extension)) path2 = str(tmpdir / ("out.2." + fileformat + extension)) with dnaio.open(path1, file2=path2, fileformat=fileformat, mode="w") as f: f.write(r1[0], r2[0]) f.write(r1[1], r2[1]) with xopen(path1) as f: assert formatted_sequences(r1, fileformat) == f.read() with xopen(path2) as f: assert formatted_sequences(r2, fileformat) == f.read() def test_write_interleaved(tmpdir, fileformat, extension): r1 = [ dnaio.Sequence("s1", "ACGT", "HHHH"), dnaio.Sequence("s2", "CGCA", "8383"), ] r2 = [ dnaio.Sequence("t1", "TCGT", "5HHH"), dnaio.Sequence("t2", "TGCA", "5383"), ] path = str(tmpdir / ("out.interleaved." 
+ fileformat + extension)) with dnaio.open(path, interleaved=True, fileformat=fileformat, mode="w") as f: f.write(r1[0], r2[0]) f.write(r1[1], r2[1]) expected = [r1[0], r2[0], r1[1], r2[1]] with xopen(path) as f: assert formatted_sequences(expected, fileformat) == f.read() def test_append(tmpdir, fileformat, extension): s1 = dnaio.Sequence("s1", "ACGT", "HHHH") s2 = dnaio.Sequence("s2", "CGCA", "8383") path = str(tmpdir / ("out." + fileformat + extension)) with dnaio.open(path, mode="w") as f: f.write(s1) with dnaio.open(path, mode="a") as f: f.write(s2) with xopen(path) as f: assert formatted_sequences([s1, s2], fileformat) == f.read() def test_append_paired(tmpdir, fileformat, extension): r1 = [ dnaio.Sequence("s1", "ACGT", "HHHH"), dnaio.Sequence("s2", "CGCA", "8383"), ] r2 = [ dnaio.Sequence("t1", "TCGT", "5HHH"), dnaio.Sequence("t2", "TGCA", "5383"), ] path1 = str(tmpdir / ("out.1." + fileformat + extension)) path2 = str(tmpdir / ("out.2." + fileformat + extension)) with dnaio.open(path1, file2=path2, fileformat=fileformat, mode="w") as f: f.write(r1[0], r2[0]) with dnaio.open(path1, file2=path2, fileformat=fileformat, mode="a") as f: f.write(r1[1], r2[1]) with xopen(path1) as f: assert formatted_sequences(r1, fileformat) == f.read() with xopen(path2) as f: assert formatted_sequences(r2, fileformat) == f.read() def test_append_interleaved(tmpdir, fileformat, extension): r1 = [ dnaio.Sequence("s1", "ACGT", "HHHH"), dnaio.Sequence("s2", "CGCA", "8383"), ] r2 = [ dnaio.Sequence("t1", "TCGT", "5HHH"), dnaio.Sequence("t2", "TGCA", "5383"), ] path = str(tmpdir / ("out.interleaved." 
+ fileformat + extension)) with dnaio.open(path, interleaved=True, fileformat=fileformat, mode="w") as f: f.write(r1[0], r2[0]) with dnaio.open(path, interleaved=True, fileformat=fileformat, mode="a") as f: f.write(r1[1], r2[1]) expected = [r1[0], r2[0], r1[1], r2[1]] with xopen(path) as f: assert formatted_sequences(expected, fileformat) == f.read() def make_random_fasta(path, n_records): from random import choice with xopen(path, "w") as f: for i in range(n_records): name = "sequence_{}".format(i) sequence = "".join(choice("ACGT") for _ in range(300)) print(">", name, "\n", sequence, sep="", file=f) def test_islice_gzip_does_not_fail(tmpdir): path = str(tmpdir / "file.fasta.gz") make_random_fasta(path, 100) f = dnaio.open(path) next(iter(f)) f.close() dnaio-0.4.1/tox.ini000066400000000000000000000007751356110417500141350ustar00rootroot00000000000000[tox] envlist = flake8,py34,py35,py36,py37,py38 [testenv] deps = pytest coverage commands = coverage run --concurrency=multiprocessing -m pytest --doctest-modules --pyargs tests/ coverage combine coverage report [testenv:flake8] basepython = python3.6 deps = flake8 commands = flake8 src/ tests/ [coverage:run] parallel = True include = */site-packages/dnaio/* tests/* [coverage:paths] source = src/ */site-packages/ [flake8] max-line-length = 110 max-complexity = 15