pax_global_header00006660000000000000000000000064132053376310014515gustar00rootroot0000000000000052 comment=fc2ef93bb367aebf62f9105cbb8af9464fc6adfe xopen-0.3.2/000077500000000000000000000000001320533763100126505ustar00rootroot00000000000000xopen-0.3.2/.gitignore000066400000000000000000000000541320533763100146370ustar00rootroot00000000000000__pycache__/ *.pyc *.egg-info *~ .tox venv/ xopen-0.3.2/.travis.yml000066400000000000000000000003001320533763100147520ustar00rootroot00000000000000sudo: false language: python cache: directories: - $HOME/.cache/pip python: - "2.7" - "3.3" - "3.4" - "3.5" - "3.6" install: - pip install . script: - nosetests -P tests xopen-0.3.2/LICENSE000066400000000000000000000020711320533763100136550ustar00rootroot00000000000000Copyright (c) 2010-2016 Marcel Martin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. xopen-0.3.2/README.rst000066400000000000000000000044001320533763100143350ustar00rootroot00000000000000.. image:: https://travis-ci.org/marcelm/xopen.svg?branch=master :target: https://travis-ci.org/marcelm/xopen .. image:: https://img.shields.io/pypi/v/xopen.svg?branch=master :target: https://pypi.python.org/pypi/xopen ===== xopen ===== This small Python module provides an ``xopen`` function that works like the built-in ``open`` function, but can also deal with compressed files. Supported compression formats are gzip, bzip2 and xz. They are automatically recognized by their file extensions `.gz`, `.bz2` or `.xz`. The focus is on being as efficient as possible on all supported Python versions. For example, simply using ``gzip.open`` is very slow in older Pythons, and it is a lot faster to use a ``gzip`` subprocess. For writing to gzip files, ``xopen`` uses ``pigz`` when available. This module has originally been developed as part of the `cutadapt tool `_ that is used in bioinformatics to manipulate sequencing data. It has been in successful use within that software for a few years. ``xopen`` is compatible with Python 2.7, 3.3, 3.4, 3.5 and 3.6. Usage ----- Open a file for reading:: from xopen import xopen with xopen('file.txt.xz') as f: content = f.read() Or without context manager:: from xopen import xopen f = xopen('file.txt.xz') content = f.read() f.close() Open a file for writing:: from xopen import xopen with xopen('file.txt.gz', mode='w') as f: f.write('Hello') Credits ------- The name ``xopen`` was taken from the C function of the same name in the `utils.h file which is part of BWA `_. Kyle Beauchamp has contributed support for appending to files. Some ideas were taken from the `canopener project `_. If you also want to open S3 files, you may want to use that module instead. Author ------ Marcel Martin (`@marcelm_ on Twitter `_) Links ----- * `Source code `_ * `Report an issue `_ * `Project page on PyPI (Python package index) `_ xopen-0.3.2/setup.py000066400000000000000000000016401320533763100143630ustar00rootroot00000000000000import sys from setuptools import setup if sys.version_info < (2, 7): sys.stdout.write("At least Python 2.7 is required.\n") sys.exit(1) with open('README.rst') as f: long_description = f.read() if sys.version_info < (3, ): requires = ['bz2file'] else: requires = [] setup( name='xopen', version='0.3.2', author='Marcel Martin', author_email='mail@marcelm.net', url='https://github.com/marcelm/xopen/', description='Open compressed files transparently', long_description=long_description, license='MIT', py_modules=['xopen'], install_requires=requires, classifiers=[ "Development Status :: 4 - Beta", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", ] ) xopen-0.3.2/tests/000077500000000000000000000000001320533763100140125ustar00rootroot00000000000000xopen-0.3.2/tests/file.txt000066400000000000000000000000461320533763100154720ustar00rootroot00000000000000Testing, testing ... The second line. xopen-0.3.2/tests/file.txt.bz2000066400000000000000000000001661320533763100161710ustar00rootroot00000000000000BZh91AY&SYӀ@ 1MTikt%B"(HN|BZh91AY&SYsS@e 1ē& 7"(H9xopen-0.3.2/tests/file.txt.gz000066400000000000000000000000651320533763100161120ustar00rootroot00000000000000ȵW I-.KQ(0B2RSRr2Rs&xopen-0.3.2/tests/file.txt.xz000066400000000000000000000001401320533763100161250ustar00rootroot000000000000007zXZִF!t/%Testing, testing ... The second line. ]ݜa>&+N}YZxopen-0.3.2/tests/hello.gz000066400000000000000000000000311320533763100154510ustar00rootroot00000000000000ZH6xopen-0.3.2/tests/test_xopen.py000066400000000000000000000121451320533763100165570ustar00rootroot00000000000000# coding: utf-8 from __future__ import print_function, division, absolute_import import gzip import os import random import sys import signal from contextlib import contextmanager from nose.tools import raises from xopen import xopen, PipedGzipReader base = "tests/file.txt" files = [ base + ext for ext in ['', '.gz', '.bz2' ] ] try: import lzma files.append(base + '.xz') except ImportError: lzma = None try: import bz2 except ImportError: bz2 = None major, minor = sys.version_info[0:2] @contextmanager def temporary_path(name): directory = os.path.join(os.path.dirname(__file__), 'testtmp') if not os.path.isdir(directory): os.mkdir(directory) path = os.path.join(directory, name) yield path os.remove(path) def test_xopen_text(): for name in files: with xopen(name, 'rt') as f: lines = list(f) assert len(lines) == 2 assert lines[1] == 'The second line.\n', name def test_xopen_binary(): for name in files: with xopen(name, 'rb') as f: lines = list(f) assert len(lines) == 2 assert lines[1] == b'The second line.\n', name def test_no_context_manager_text(): for name in files: f = xopen(name, 'rt') lines = list(f) assert len(lines) == 2 assert lines[1] == 'The second line.\n', name f.close() assert f.closed def test_no_context_manager_binary(): for name in files: f = xopen(name, 'rb') lines = list(f) assert len(lines) == 2 assert lines[1] == b'The second line.\n', name f.close() assert f.closed @raises(IOError) def test_nonexisting_file(): with xopen('this-file-does-not-exist') as f: pass @raises(IOError) def test_nonexisting_file_gz(): with xopen('this-file-does-not-exist.gz') as f: pass @raises(IOError) def test_nonexisting_file_bz2(): with xopen('this-file-does-not-exist.bz2') as f: pass if lzma: @raises(IOError) def test_nonexisting_file_xz(): with xopen('this-file-does-not-exist.xz') as f: pass @raises(IOError) def test_write_to_nonexisting_dir(): with xopen('this/path/does/not/exist/file.txt', 'w') as f: pass @raises(IOError) def test_write_to_nonexisting_dir_gz(): with xopen('this/path/does/not/exist/file.gz', 'w') as f: pass @raises(IOError) def test_write_to_nonexisting_dir_bz2(): with xopen('this/path/does/not/exist/file.bz2', 'w') as f: pass if lzma: @raises(IOError) def test_write_to_nonexisting_dir(): with xopen('this/path/does/not/exist/file.xz', 'w') as f: pass def test_append(): cases = ["", ".gz"] if bz2 and sys.version_info > (3,): # BZ2 does NOT support append in Py 2. cases.append(".bz2") if lzma: cases.append(".xz") for ext in cases: # On Py3, need to send BYTES, not unicode. Let's do it for all. text = "AB".encode("utf-8") reference = text + text with temporary_path('truncated.fastq' + ext) as path: try: os.unlink(path) except OSError: pass with xopen(path, 'ab') as f: f.write(text) with xopen(path, 'ab') as f: f.write(text) with xopen(path, 'r') as f: for appended in f: pass try: reference = reference.decode("utf-8") except AttributeError: pass assert appended == reference def test_append_text(): cases = ["", ".gz"] if bz2 and sys.version_info > (3,): # BZ2 does NOT support append in Py 2. cases.append(".bz2") if lzma: cases.append(".xz") for ext in cases: # BZ2 does NOT support append text = "AB" reference = text + text with temporary_path('truncated.fastq' + ext) as path: try: os.unlink(path) except OSError: pass with xopen(path, 'at') as f: f.write(text) with xopen(path, 'at') as f: f.write(text) with xopen(path, 'rt') as f: for appended in f: pass assert appended == reference def create_truncated_file(path): # Random text random_text = ''.join(random.choice('ABCDEFGHIJKLMNOPQRSTUVWXYZ') for _ in range(1024)) # Make the text a lot bigger in order to ensure that it is larger than the # pipe buffer size. random_text *= 1024 # 1MB with xopen(path, 'w') as f: f.write(random_text) with open(path, 'a') as f: f.truncate(os.stat(path).st_size - 10) class TookTooLongError(Exception): pass class timeout: # copied from https://stackoverflow.com/a/22348885/715090 def __init__(self, seconds=1): self.seconds = seconds def handle_timeout(self, signum, frame): raise TookTooLongError() def __enter__(self): signal.signal(signal.SIGALRM, self.handle_timeout) signal.alarm(self.seconds) def __exit__(self, type, value, traceback): signal.alarm(0) if sys.version_info[:2] != (3, 3): @raises(EOFError, IOError) def test_truncated_gz(): with temporary_path('truncated.gz') as path: create_truncated_file(path) with timeout(seconds=2): f = xopen(path, 'r') f.read() f.close() @raises(EOFError, IOError) def test_truncated_gz_iter(): with temporary_path('truncated.gz') as path: create_truncated_file(path) with timeout(seconds=2): f = xopen(path, 'r') for line in f: pass f.close() def test_bare_read_from_gz(): with xopen('tests/hello.gz', 'rt') as f: assert f.read() == 'hello' def test_read_piped_gzip(): with PipedGzipReader('tests/hello.gz', 'rt') as f: assert f.read() == 'hello' xopen-0.3.2/tox.ini000066400000000000000000000001361320533763100141630ustar00rootroot00000000000000[tox] envlist = py27,py33,py34,py35,py36 [testenv] deps = nose commands = nosetests -P tests xopen-0.3.2/xopen.py000066400000000000000000000154161320533763100143620ustar00rootroot00000000000000""" Open compressed files transparently. """ from __future__ import print_function, division, absolute_import import gzip import sys import io import os import time from subprocess import Popen, PIPE __version__ = '0.3.2' _PY3 = sys.version > '3' if not _PY3: import bz2file as bz2 else: try: import bz2 except ImportError: bz2 = None try: import lzma except ImportError: lzma = None if _PY3: basestring = str class Closing(object): """ Inherit from this class and implement a close() method to offer context manager functionality. """ def __enter__(self): return self def __exit__(self, *exc_info): self.close() def __del__(self): try: self.close() except: pass class PipedGzipWriter(Closing): """ Write gzip-compressed files by running an external gzip or pigz process and piping into it. On Python 2, this is faster than using gzip.open(). On Python 3, it allows to run the compression in a separate process and can therefore also be faster. """ def __init__(self, path, mode='wt'): if mode not in ('w', 'wt', 'wb', 'a', 'at', 'ab'): raise ValueError("Mode is '{0}', but it must be 'w', 'wt', 'wb', 'a', 'at' or 'ab'".format(mode)) self.outfile = open(path, mode) self.devnull = open(os.devnull, mode) self.closed = False self.name = path # Setting close_fds to True in the Popen arguments is necessary due to # . kwargs = dict(stdin=PIPE, stdout=self.outfile, stderr=self.devnull, close_fds=True) try: self.process = Popen(['pigz'], **kwargs) self.program = 'pigz' except OSError as e: # pigz not found, try regular gzip try: self.process = Popen(['gzip'], **kwargs) self.program = 'gzip' except (IOError, OSError) as e: self.outfile.close() self.devnull.close() raise except IOError as e: self.outfile.close() self.devnull.close() raise if _PY3 and 'b' not in mode: self._file = io.TextIOWrapper(self.process.stdin) else: self._file = self.process.stdin def write(self, arg): self._file.write(arg) def close(self): self.closed = True self._file.close() retcode = self.process.wait() self.outfile.close() self.devnull.close() if retcode != 0: raise IOError("Output {0} process terminated with exit code {1}".format(self.program, retcode)) class PipedGzipReader(Closing): def __init__(self, path, mode='r'): if mode not in ('r', 'rt', 'rb'): raise ValueError("Mode is '{0}', but it must be 'r', 'rt' or 'rb'".format(mode)) self.process = Popen(['gzip', '-cd', path], stdout=PIPE, stderr=PIPE) self.name = path if _PY3 and not 'b' in mode: self._file = io.TextIOWrapper(self.process.stdout) else: self._file = self.process.stdout if _PY3: self._stderr = io.TextIOWrapper(self.process.stderr) else: self._stderr = self.process.stderr self.closed = False # Give gzip a little bit of time to report any errors (such as # a non-existing file) time.sleep(0.01) self._raise_if_error() def close(self): self.closed = True retcode = self.process.poll() if retcode is None: # still running self.process.terminate() self._raise_if_error() def __iter__(self): for line in self._file: yield line self.process.wait() self._raise_if_error() def _raise_if_error(self): """ Raise IOError if process is not running anymore and the exit code is nonzero. """ retcode = self.process.poll() if retcode is not None and retcode != 0: message = self._stderr.read().strip() raise IOError(message) def read(self, *args): data = self._file.read(*args) if len(args) == 0 or args[0] <= 0: # wait for process to terminate until we check the exit code self.process.wait() self._raise_if_error() return data if bz2 is not None: class ClosingBZ2File(bz2.BZ2File, Closing): """ A better BZ2File that supports the context manager protocol. This is relevant only for Python 2.6. """ def xopen(filename, mode='r', compresslevel=6): """ Replacement for the "open" function that can also open files that have been compressed with gzip, bzip2 or xz. If the filename is '-', standard output (mode 'w') or input (mode 'r') is returned. If the filename ends with .gz, the file is opened with a pipe to the gzip program. If that does not work, then gzip.open() is used (the gzip module is slower than the pipe to the gzip program). If the filename ends with .bz2, it's opened as a bz2.BZ2File. Otherwise, the regular open() is used. mode can be: 'rt', 'rb', 'at', 'ab', 'wt', or 'wb' Instead of 'rt', 'wt' and 'at', 'r', 'w' and 'a' can be used as abbreviations. In Python 2, the 't' and 'b' characters are ignored. Append mode ('a', 'at', 'ab') is unavailable with BZ2 compression and will raise an error. compresslevel is the gzip compression level. It is not used for bz2 and xz. """ if mode in ('r', 'w', 'a'): mode += 't' if mode not in ('rt', 'rb', 'wt', 'wb', 'at', 'ab'): raise ValueError("mode '{0}' not supported".format(mode)) if not _PY3: mode = mode[0] if not isinstance(filename, basestring): raise ValueError("the filename must be a string") # standard input and standard output handling if filename == '-': return dict( r=sys.stdin, rt=sys.stdin, rb=sys.stdin.buffer, w=sys.stdout, wt=sys.stdout, wb=sys.stdout.buffer)[mode] if filename.endswith('.bz2'): if bz2 is None: raise ImportError("Cannot open bz2 files: The bz2 module is not available") if _PY3: return bz2.open(filename, mode) else: if mode[0] == 'a': raise ValueError("mode '{0}' not supported with BZ2 compression".format(mode)) if sys.version_info[:2] <= (2, 6): return ClosingBZ2File(filename, mode) else: return bz2.BZ2File(filename, mode) elif filename.endswith('.xz'): if lzma is None: raise ImportError("Cannot open xz files: The lzma module is not available (use Python 3.3 or newer)") return lzma.open(filename, mode) elif filename.endswith('.gz'): if _PY3 and 'r' in mode: return gzip.open(filename, mode) if sys.version_info[:2] == (2, 7): buffered_reader = io.BufferedReader buffered_writer = io.BufferedWriter else: buffered_reader = lambda x: x buffered_writer = lambda x: x if 'r' in mode: try: return PipedGzipReader(filename, mode) except OSError: # gzip not installed return buffered_reader(gzip.open(filename, mode)) else: try: return PipedGzipWriter(filename, mode) except OSError: return buffered_writer(gzip.open(filename, mode, compresslevel=compresslevel)) else: # Python 2.6 and 2.7 have io.open, which we could use to make the returned # object consistent with the one returned in Python 3, but reading a file # with io.open() is 100 times slower (!) on Python 2.6, and still about # three times slower on Python 2.7 (tested with "for _ in io.open(path): pass") return open(filename, mode)