indexed_gzip-1.6.4/0000755000175000017500000000000014133320576013525 5ustar nileshnileshindexed_gzip-1.6.4/indexed_gzip/0000755000175000017500000000000014133320576016176 5ustar nileshnileshindexed_gzip-1.6.4/indexed_gzip/zran.h0000644000175000017500000003643214133320576017331 0ustar nileshnilesh#ifndef __ZRAN_H__ #define __ZRAN_H__ /* * The zran module is an adaptation of the zran example, written by Mark * Alder, which ships with the zlib source code. It allows the creation * of an index into a compressed file, which is used to improve the speed * of random seek/read access to the uncompressed data. */ #include #include #define PY_SSIZE_T_CLEAN #include struct _zran_index; struct _zran_point; typedef struct _zran_index zran_index_t; typedef struct _zran_point zran_point_t; /* * These values may be passed in as flags to the zran_init function. * They are specified as bit-masks, rather than bit locations. */ enum { ZRAN_AUTO_BUILD = 1, ZRAN_SKIP_CRC_CHECK = 2, }; /* * Struct representing the index. None of the fields in this struct * should ever need to be accessed or modified directly. */ struct _zran_index { /* * Handle to the compressed file. */ FILE *fd; /* * Handle to the compressed file object. */ PyObject *f; /* * Size of the compressed file. This * is calculated in zran_init. */ uint64_t compressed_size; /* * Size of the uncompressed data. This is * only updated when it becomes known. */ uint64_t uncompressed_size; /* * Spacing size in bytes, relative to the * uncompressed data stream, between adjacent * index points. */ uint32_t spacing; /* * Number of bytes of uncompressed data to store * for each index point. This must be a minimum * of 32768 bytes. */ uint32_t window_size; /* * Base2 logarithm of the window size - it * is needed to initialise zlib inflation. */ uint32_t log_window_size; /* * Size, in bytes, of buffer used to store * compressed data read from disk. */ uint32_t readbuf_size; /* * Number of index points that have been created. 
*/ uint32_t npoints; /* * Number of index points that can be stored - * i.e. the amount allocated to the "list" field. */ uint32_t size; /* * List of index points. */ zran_point_t *list; /* * Most recently requested seek/read * location into the uncompressed data * stream - this is used to keep track * of where the calling code thinks it * is in the (uncompressed) file. */ uint64_t uncmp_seek_offset; /* * Flags passed to zran_init */ uint16_t flags; /* * All of the fields after this point are used * by the internal _zran_inflate function. */ /* * Reference to a file input * buffer of size readbuf_size. */ uint8_t *readbuf; /* * An offset into readbuf. */ uint32_t readbuf_offset; /* * The current end of the readbuf contents. */ uint32_t readbuf_end; /* * Current offsets into the uncompressed and * compressed data streams. */ uint64_t inflate_cmp_offset; uint64_t inflate_uncmp_offset; /* * Uncompressed offset at the point that the * last GZIP stream ended. This is updated as * more data is read and uncompressed, and * used to determine whether the CRC/size * check for the current stream has already * been performed. */ uint64_t last_stream_ended; /* * CRC-32 checksum and size (number of * bytes, modulo 2^32) of the uncompressed * data in the current gzip stream, not * used if ZRAN_SKIP_CRC_CHECK is active. * The CRC and size are incrementally * calculated as data is read in. When the * end of a gzip stream is reached, the * calculated CRC and size are compared * against the CRC and size in the gzip * footer, and an error is returned if * they don't match. */ uint32_t stream_crc32; uint32_t stream_size; uint8_t validating; }; /* * Struct representing a single seek point in the index. */ struct _zran_point { /* * Location of this point in the compressed data * stream. This is the location of the first full * byte of compressed data - if the compressed * and uncompressed locations are not byte-aligned, * the bits field below specifies the bit offset. 
*/ uint64_t cmp_offset; /* * Corresponding location of this point * in the uncompressed data stream. */ uint64_t uncmp_offset; /* * If this point is not byte-aligned, this specifies * the number of bits, in the compressed stream, * back from cmp_offset, that the uncompressed data * starts. */ uint8_t bits; /* * Chunk of uncompressed data preceeding this point. * This is required to initialise decompression from * this point onwards. */ uint8_t *data; }; /* * Initialise a zran_index_t struct for use with the given file. * * Passing in 0 for the spacing, window_size and readbuf_size arguments * will result in the following values being used: * * spacing: 1048576 * window_size: 32768 * readbuf_size: 16384 * * The read buffer must be at least the maximum expectedd size of a GZIP * header. GZIP headers have a minimum size of 10 bytes, but there is no upper * bound on their size, so using a very small read buffer would be unwise. In * the case of concatenated GZIP streams, the read buffer must be at least big * enough to accommodate a GZIP footer of one stream, padding bytes in between * two streams, and the GZIP header of the next stream. * * The flags argument is a bit mask used to control the following options: * * ZRAN_AUTO_BUILD: Build the index automatically on demand. * * ZRAN_SKIP_CRC_CHECK: Do not perform a CRC32 and file size check * when the end of a GZIP stream is reached. * This flag is automatically set when an index * is imported from file using zran_import_index. */ int zran_init( zran_index_t *index, /* The index */ FILE *fd, /* Open handle to the compressed file */ PyObject *f, /* Open handle to the compressed file object */ uint32_t spacing, /* Distance in bytes between index seek points */ uint32_t window_size, /* Number of uncompressed bytes to store with each point */ uint32_t readbuf_size, /* Number of bytes to read at a time */ uint16_t flags /* Flags controlling index behaviour */ ); /* * Frees the memory use by the given index. 
The zran_index_t struct * itself is not freed. */ void zran_free( zran_index_t *index /* The index */ ); /* * Return codes for zran_build_index. */ enum { ZRAN_BUILD_INDEX_OK = 0, ZRAN_BUILD_INDEX_FAIL = -1, ZRAN_BUILD_INDEX_CRC_ERROR = -2, }; /* * (Re-)Builds the index to cover the given range, which must be * specified relative to the compressed data stream. Pass in 0 * for both offsets to re-build the full index. * * Returns ZRAN_BUILD_INDEX_OK on success, ZRAN_BUILD_INDEX_CRC_ERROR * if a CRC error is detected in a GZIP stream, or ZRAN_BUILD_INDEX_FAIL * if some other type of error occurs. */ int zran_build_index( zran_index_t *index, /* The index */ uint64_t from, /* Build the index from this point */ uint64_t until /* Build the index to this point */ ); /* Return codes for zran_seek. */ enum { ZRAN_SEEK_CRC_ERROR = -2, ZRAN_SEEK_FAIL = -1, ZRAN_SEEK_OK = 0, ZRAN_SEEK_NOT_COVERED = 1, ZRAN_SEEK_EOF = 2, ZRAN_SEEK_INDEX_NOT_BUILT = 3 }; /* * Seek to the specified offset in the uncompressed data stream. * If the index does not currently cover the offset, and it was * created with the ZRAN_AUTO_BUILD flag, the index is expanded * to cover the offset. * * Seeking from the end of the uncompressed stream (using SEEK_END) * is only possible if the index fully covers the file. * * Returns: * - ZRAN_SEEK_OK for success. * * - ZRAN_SEEK_INDEX_NOT_BUILT if SEEK_END is used, and the index * does not fully cover the file. * * - ZRAN_SEEK_NOT_COVERED to indicate that the index does not * cover the requested offset (will never happen if * ZRAN_AUTO_BUILD is active). * * - ZRAN_SEEK_EOF to indicate that the requested offset * is past the end of the uncompressed stream. * * - ZRAN_SEEK_CRC_ERROR to indicate that the CRC or file size * stored in the footer of a GZIP stream does not match the * data. * * - ZRAN_SEEK_FAIL to indicate failure of some sort. 
*/ int zran_seek( zran_index_t *index, /* The index */ int64_t offset, /* Uncompressed offset to seek to */ uint8_t whence, /* SEEK_SET, SEEK_CUR, or SEEK_END */ zran_point_t **point /* Optional place to store corresponding zran_point_t */ ); /* * Returns the current seek location in the uncompressed data stream * (just returns zran_index_t.uncmp_seek_offset). */ uint64_t zran_tell( zran_index_t *index /* The index */ ); /* Return codes for zran_read. */ enum { ZRAN_READ_NOT_COVERED = -1, ZRAN_READ_EOF = -2, ZRAN_READ_FAIL = -3, ZRAN_READ_CRC_ERROR = -4 }; /* * Read len bytes from the current location in the uncompressed * data stream, storing them in buf. If the index was created with * the ZRAN_AUTO_BUILD flag, it is expanded as needed. * * Returns: * - Number of bytes read for success, or one of the following codes, * all of which are negative. * * - ZRAN_READ_NOT_COVERED to indicate that the index does not * cover the requested region (will never happen if * ZRAN_AUTO_BUILD is active). * * - ZRAN_READ_EOF to indicate that the read could not be completed * because the current uncompressed seek point is at EOF. * * - ZRAN_SEEK_CRC_ERROR to indicate that the CRC or file size * stored in the footer of a GZIP stream does not match the * data. * * - ZRAN_READ_FAIL to indicate that the read failed for some reason. */ int64_t zran_read( zran_index_t *index, /* The index */ void *buf, /* Buffer to store len bytes */ uint64_t len /* Number of bytes to read */ ); /* * Identifier and version number for index files created by zran_export_index, * defined in zran.c. */ extern const char ZRAN_INDEX_FILE_ID[]; extern const uint8_t ZRAN_INDEX_FILE_VERSION; /* Return codes for zran_export_index. */ enum { ZRAN_EXPORT_OK = 0, ZRAN_EXPORT_WRITE_ERROR = -1 }; /* * Export current index data to given file. This exported file later can be * used to rebuild index without needing to going through the file again. * * See zran_import_index for importing. 
* * A zran index file is a binary file which has the following header * structure. All fields are assumed to be stored with little-endian * ordering: * * | Offset | Length | Description | * | 0 | 5 | File header (ascii, GZIDX) | * | 5 | 1 | Version (uint8, currently 1) | * | 6 | 1 | Reserved (uint8, currently must be 0) | * | 7 | 8 | Compressed file size (uint64) | * | 15 | 8 | Uncompressed file size (uint64) | * | 23 | 4 | Index point spacing (uint32) | * | 27 | 4 | Index window size W (uint32) | * | 31 | 4 | Number of index points (uint32) | * * The header is followed by the offsets for each index point: * * | Offset | Length | Description | * | 0 | 8 | Compressed offset for point 0 (uint64) | * | 8 | 8 | Uncompressed offset for point 0 (uint64) | * | 16 | 1 | Bit offset for point 0 (uint8) | * | 17 | 1 | Data flag - 1 if point has window data, | * | | | 0 otherwise (uint8, added in file format | * | | | version 1) | * | ... | ... | ... | * | N*18 | 8 | Compressed offset for point N (uint64) | * | ... | ... | ... | * * Finally the window data for all index points that have data is * concatenated (W represents the index window size): * * | Offset | Length | Description | * | 0 | W | Window data for first index point with data | * | ... | ... | ... | * | N*W | W | Window data for Nth index point with data | * * Returns: * - ZRAN_EXPORT_OK for success. * * - ZRAN_EXPORT_WRITE_ERROR to indicate an error from writing to underlying * file. */ int zran_export_index( zran_index_t *index, /* The index */ FILE *fd, /* Open handle to export file */ PyObject *f /* Open handle to export file object */ ); /* Return codes for zran_import_index. */ enum { ZRAN_IMPORT_OK = 0, ZRAN_IMPORT_FAIL = -1, ZRAN_IMPORT_EOF = -2, ZRAN_IMPORT_READ_ERROR = -3, ZRAN_IMPORT_INCONSISTENT = -4, ZRAN_IMPORT_MEMORY_ERROR = -5, ZRAN_IMPORT_UNKNOWN_FORMAT = -6, ZRAN_IMPORT_UNSUPPORTED_VERSION = -7 }; /* * Import current index from the given file. 
index must have been initialized * by zran_init function before calling this function, as it is not supported * importing into an uninitialised zran_index_t struct. Existing index will be * overwritten including spacing and window_size values, whereas values of * readbuf_size and flags will be kept. * * Updating an index file is not supported currently. To update an index file, * first import it, create new checkpoints, and then export it again. * * CRC validation of uncompressed data from an imported index is not currently * supported - this function will enable the ZRAN_SKIP_CRC_CHECK flag on the * given zran_index_t struct. * * See zran_export_index for exporting. * * Returns: * - ZRAN_IMPORT_OK for success. * * - ZRAN_IMPORT_FAIL general errors. * * - ZRAN_IMPORT_EOF to indicate unexpected end-of-file. * * - ZRAN_IMPORT_READ_ERROR to indicate error while reading file. * * - ZRAN_IMPORT_OVERFLOW to indicate overflow while reading compressed and * uncompressed size attributes. This shouldn't be a problem for x64 * processors. * * - ZRAN_IMPORT_INCONSISTENT to indicate compressed size, or uncompressed * size if known, of the index file is inconsistent with the loaded * compressed file. * * - ZRAN_IMPORT_MEMORY_ERROR to indicate failure to allocate memory for new * index. This typically result from out-of-memory. * * - ZRAN_IMPORT_UNKNOWN_FORMAT to indicate given file is of unknown format. * * - ZRAN_IMPORT_UNSUPPORTED_VERSION to indicate that the file has a version * which is too new for this version of indexed_gzip to parse. */ int zran_import_index( zran_index_t *index, /* The index */ FILE *fd, /* Open handle to import file */ PyObject *f /* Open handle to import file object */ ); #endif /* __ZRAN_H__ */ indexed_gzip-1.6.4/indexed_gzip/zran.pxd0000644000175000017500000000600614133320576017667 0ustar nileshnilesh# # Cython declaration for the zran library. 
# from libc.stdio cimport FILE from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t, int64_t from posix.types cimport off_t from cpython.ref cimport PyObject cdef extern from "zran.h": ctypedef struct zran_index_t: FILE *fd; PyObject *f; size_t compressed_size; size_t uncompressed_size; uint32_t spacing; uint32_t window_size; uint32_t readbuf_size; uint32_t npoints; zran_point_t *list; ctypedef struct zran_point_t: uint64_t cmp_offset; uint64_t uncmp_offset; uint8_t bits; uint8_t *data; enum: # flags for zran_init ZRAN_AUTO_BUILD = 1, ZRAN_SKIP_CRC_CHECK = 2, # return codes for zran_build_index ZRAN_BUILD_INDEX_OK = 0, ZRAN_BUILD_INDEX_FAIL = -1, ZRAN_BUILD_INDEX_CRC_ERROR = -2, # return codes for zran_seek ZRAN_SEEK_CRC_ERROR = -2, ZRAN_SEEK_FAIL = -1, ZRAN_SEEK_OK = 0, ZRAN_SEEK_NOT_COVERED = 1, ZRAN_SEEK_EOF = 2, ZRAN_SEEK_INDEX_NOT_BUILT = 3, # return codes for zran_read ZRAN_READ_NOT_COVERED = -1, ZRAN_READ_EOF = -2, ZRAN_READ_FAIL = -3, ZRAN_READ_CRC_ERROR = -4, # return codes for zran_export_index ZRAN_EXPORT_OK = 0, ZRAN_EXPORT_WRITE_ERROR = -1, # return codes for zran_import_index ZRAN_IMPORT_OK = 0, ZRAN_IMPORT_FAIL = -1, ZRAN_IMPORT_EOF = -2, ZRAN_IMPORT_READ_ERROR = -3, ZRAN_IMPORT_INCONSISTENT = -4, ZRAN_IMPORT_MEMORY_ERROR = -5, ZRAN_IMPORT_UNKNOWN_FORMAT = -6, ZRAN_IMPORT_UNSUPPORTED_VERSION = -7 int zran_init(zran_index_t *index, FILE *fd, PyObject *f, uint32_t spacing, uint32_t window_size, uint32_t readbuf_size, uint16_t flags) void zran_free(zran_index_t *index) int zran_build_index(zran_index_t *index, uint64_t from_, uint64_t until) nogil; uint64_t zran_tell(zran_index_t *index); int zran_seek(zran_index_t *index, int64_t offset, uint8_t whence, zran_point_t **point) nogil; int64_t zran_read(zran_index_t *index, void *buf, uint64_t len) nogil; int zran_export_index(zran_index_t *index, FILE *fd, PyObject *f); int zran_import_index(zran_index_t *index, FILE *fd, PyObject *f); 
indexed_gzip-1.6.4/indexed_gzip/zran_file_util.pxd0000644000175000017500000000221214133320576021716 0ustar nileshnilesh# # Cython declaration for the zran_file_util library. # from libc.stdio cimport FILE from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t, int64_t from posix.types cimport off_t from cpython.ref cimport PyObject cdef extern from "zran_file_util.h": size_t _fread_python(void *ptr, size_t size, size_t nmemb, PyObject *f) int64_t _ftell_python(PyObject *f) int _fseek_python(PyObject *f, int64_t offset, int whence) int _feof_python(PyObject *f, size_t f_ret) int _ferror_python(PyObject *f) int _fflush_python(PyObject *f) size_t _fwrite_python(const void *ptr, size_t size, size_t nmemb, PyObject *f) int _getc_python(PyObject *f) int ferror_(FILE *fd, PyObject *f) int fseek_(FILE *fd, PyObject *f, int64_t offset, int whence) int64_t ftell_(FILE *fd, PyObject *f) size_t fread_(void *ptr, size_t size, size_t nmemb, FILE *fd, PyObject *f) int feof_(FILE *fd, PyObject *f, size_t f_ret) int fflush_(FILE *fd, PyObject *f) size_t fwrite_(const void *ptr, size_t size, size_t nmemb, FILE *fd, PyObject *f) int getc_(FILE *fd, PyObject *f) indexed_gzip-1.6.4/indexed_gzip/zran_file_util.h0000644000175000017500000000502714133320576021361 0ustar nileshnilesh#ifndef __ZRAN_FILE_UTIL_H__ #define __ZRAN_FILE_UTIL_H__ /* * File utilities used to manipulate either * Python file-like objects or file descriptors. */ #include #include #define PY_SSIZE_T_CLEAN #include /* * Implements a method analogous to fread that is performed on Python * file-like objects. */ size_t _fread_python(void *ptr, size_t size, size_t nmemb, PyObject *f); /* * Implements a method analogous to ftell that is performed on Python * file-like objects. */ int64_t _ftell_python(PyObject *f); /* * Implements a method analogous to fseek that is performed on Python * file-like objects. 
*/ int _fseek_python(PyObject *f, int64_t offset, int whence); /* * Implements a method analogous to feof that is performed on Python file-like * objects. */ int _feof_python(PyObject *f, size_t f_ret); /* * Implements a method analogous to ferror that is performed on Python * file-like objects. */ int _ferror_python(PyObject *f); /* * Implements a method analogous to fflush that is performed on Python * file-like objects. */ int _fflush_python(PyObject *f); /* * Implements a method analogous to fwrite that is performed on Python * file-like objects. */ size_t _fwrite_python(const void *ptr, size_t size, size_t nmemb, PyObject *f); /* * Implements a method analogous to getc that is performed on Python * file-like objects. */ int _getc_python(PyObject *f); /* * Calls ferror on fd if specified, otherwise the Python-specific method on f. */ int ferror_(FILE *fd, PyObject *f); /* * Calls fseek on fd if specified, otherwise the Python-specific method on f. */ int fseek_(FILE *fd, PyObject *f, int64_t offset, int whence); /* * Calls ftell on fd if specified, otherwise the Python-specific method on f. */ int64_t ftell_(FILE *fd, PyObject *f); /* * Calls fread on fd if specified, otherwise the Python-specific method on f. */ size_t fread_(void *ptr, size_t size, size_t nmemb, FILE *fd, PyObject *f); /* * Calls feof on fd if specified, otherwise the Python-specific method on f. */ int feof_(FILE *fd, PyObject *f, size_t f_ret); /* * Calls fflush on fd if specified, otherwise the Python-specific method on f. */ int fflush_(FILE *fd, PyObject *f); /* * Calls fwrite on fd if specified, otherwise the Python-specific method on f. */ size_t fwrite_( const void *ptr, size_t size, size_t nmemb, FILE *fd, PyObject *f); /* * Calls getc on fd if specified, otherwise the Python-specific method on f. 
*/ int getc_(FILE *fd, PyObject *f); #endif /* __ZRAN_FILE_UTIL_H__ */ indexed_gzip-1.6.4/indexed_gzip/zran_file_util.c0000644000175000017500000001621614133320576021356 0ustar nileshnilesh/* * zran_file_util.c - file utilities used in zran.c to manipulate either * Python file-like objects or file descriptors. * * Functions which interact with Python file-likes will acquire and release * the GIL as needeed. */ #include #include #include #include #include #define PY_SSIZE_T_CLEAN #include #include "zran_file_util.h" #ifdef _WIN32 #define FSEEK _fseeki64 #define FTELL _ftelli64 #include "windows.h" #include "io.h" #else #define FSEEK fseeko #define FTELL ftello #endif /* * The zran functions are typically called with the GIL released. These * macros are used to temporarily (re-)acquire and release the GIL when * interacting with Python file-like objects. */ #define _ZRAN_FILE_UTIL_ACQUIRE_GIL \ PyGILState_STATE s; \ s = PyGILState_Ensure(); #define _ZRAN_FILE_UTIL_RELEASE_GIL \ PyGILState_Release(s); /* * Implements a method analogous to fread that is performed on Python * file-like objects. */ size_t _fread_python(void *ptr, size_t size, size_t nmemb, PyObject *f) { PyObject *data = NULL; char *buf; Py_ssize_t len; _ZRAN_FILE_UTIL_ACQUIRE_GIL if ((data = PyObject_CallMethod(f, "read", "(n)", size * nmemb)) == NULL) goto fail; if ((buf = PyBytes_AsString(data)) == NULL) goto fail; if ((len = PyBytes_Size(data)) == -1) goto fail; memmove(ptr, buf, (size_t) len); Py_DECREF(data); _ZRAN_FILE_UTIL_RELEASE_GIL return (size_t) len / size; fail: Py_XDECREF(data); _ZRAN_FILE_UTIL_RELEASE_GIL return 0; } /* * Implements a method analogous to ftell that is performed on Python * file-like objects. 
*/ int64_t _ftell_python(PyObject *f) { PyObject *data = NULL; int64_t result; _ZRAN_FILE_UTIL_ACQUIRE_GIL data = PyObject_CallMethod(f, "tell", NULL); if (data == NULL) goto fail; result = PyLong_AsLong(data); if (result == -1 && PyErr_Occurred()) goto fail; Py_DECREF(data); _ZRAN_FILE_UTIL_RELEASE_GIL return result; fail: Py_XDECREF(data); _ZRAN_FILE_UTIL_RELEASE_GIL return -1; } /* * Implements a method analogous to fseek that is performed on Python * file-like objects. */ int _fseek_python(PyObject *f, int64_t offset, int whence) { PyObject *data = NULL; _ZRAN_FILE_UTIL_ACQUIRE_GIL /* * The seek method returns type long, which has * different sizes on different platforms */ if (sizeof(long) == 8) data = PyObject_CallMethod(f, "seek", "(l,i)", offset, whence); else if (sizeof(long long) == 8) data = PyObject_CallMethod(f, "seek", "(L,i)", offset, whence); else goto fail; if (data == NULL) goto fail; Py_DECREF(data); _ZRAN_FILE_UTIL_RELEASE_GIL return 0; fail: Py_XDECREF(data); _ZRAN_FILE_UTIL_RELEASE_GIL return -1; } /* * Implements a method analogous to feof that is performed on Python file-like * objects. If f_ret, the number of bytes returned by the last read, is zero, * then we're at EOF. */ int _feof_python(PyObject *f, size_t f_ret) { return f_ret == 0; } /* * Implements a method analogous to ferror that is performed on Python * file-like objects. */ int _ferror_python(PyObject *f) { PyObject *result; _ZRAN_FILE_UTIL_ACQUIRE_GIL result = PyErr_Occurred(); _ZRAN_FILE_UTIL_RELEASE_GIL if (result != NULL) return 1; else return 0; } /* * Implements a method analogous to fflush that is performed on Python * file-like objects. 
*/ int _fflush_python(PyObject *f) { PyObject *data = NULL; _ZRAN_FILE_UTIL_ACQUIRE_GIL if ((data = PyObject_CallMethod(f, "flush", NULL)) == NULL) goto fail; Py_DECREF(data); _ZRAN_FILE_UTIL_RELEASE_GIL return 0; fail: Py_XDECREF(data); _ZRAN_FILE_UTIL_RELEASE_GIL return -1; } /* * Implements a method analogous to fwrite that is performed on Python * file-like objects. */ size_t _fwrite_python(const void *ptr, size_t size, size_t nmemb, PyObject *f) { PyObject *input = NULL; PyObject *data = NULL; long len; _ZRAN_FILE_UTIL_ACQUIRE_GIL if ((input = PyBytes_FromStringAndSize(ptr, size * nmemb)) == NULL) goto fail; if ((data = PyObject_CallMethod(f, "write", "(O)", input)) == NULL) goto fail; #if PY_MAJOR_VERSION >= 3 if ((len = PyLong_AsLong(data)) == -1 && PyErr_Occurred()) goto fail; #else // In Python 2, a file object's write() method does not return the number // of bytes written, so let's just assume that everything has been written // properly. len = size * nmemb; #endif Py_DECREF(input); Py_DECREF(data); _ZRAN_FILE_UTIL_RELEASE_GIL return (size_t) len / size; fail: Py_XDECREF(input); Py_XDECREF(data); _ZRAN_FILE_UTIL_RELEASE_GIL return 0; } /* * Implements a method analogous to getc that is performed on Python file-like * objects. */ int _getc_python(PyObject *f) { char buf; if (_fread_python(&buf, 1, 1, f) == 0) { // Reached EOF, or an error (in which case the error indicator is set). // Either way, we should return -1. return -1; } return buf; } /* * Calls ferror on fd if specified, otherwise the Python-specific method on f. */ int ferror_(FILE *fd, PyObject *f) { return fd != NULL ? ferror(fd) : _ferror_python(f); } /* * Calls fseek on fd if specified, otherwise the Python-specific method on f. */ int fseek_(FILE *fd, PyObject *f, int64_t offset, int whence) { return fd != NULL ? FSEEK(fd, offset, whence) : _fseek_python(f, offset, whence); } /* * Calls ftell on fd if specified, otherwise the Python-specific method on f. 
*/ int64_t ftell_(FILE *fd, PyObject *f) { return fd != NULL ? FTELL(fd) : _ftell_python(f); } /* * Calls fread on fd if specified, otherwise the Python-specific method on f. */ size_t fread_(void *ptr, size_t size, size_t nmemb, FILE *fd, PyObject *f) { return fd != NULL ? fread(ptr, size, nmemb, fd) : _fread_python(ptr, size, nmemb, f); } /* * Calls feof on fd if specified, otherwise the Python-specific method on f. * If fd is not specified, requires f_ret, the number of bytes read on the last * read, to determine if the file is at EOF. */ int feof_(FILE *fd, PyObject *f, size_t f_ret) { return fd != NULL ? feof(fd): _feof_python(f, f_ret); } /* * Calls fflush on fd if specified, otherwise the Python-specific method on f. */ int fflush_(FILE *fd, PyObject *f) { return fd != NULL ? fflush(fd): _fflush_python(f); } /* * Calls fwrite on fd if specified, otherwise the Python-specific method on f. */ size_t fwrite_(const void *ptr, size_t size, size_t nmemb, FILE *fd, PyObject *f) { return fd != NULL ? fwrite(ptr, size, nmemb, fd) : _fwrite_python(ptr, size, nmemb, f); } /* * Calls getc on fd if specified, otherwise the Python-specific method on f. */ int getc_(FILE *fd, PyObject *f) { return fd != NULL ? 
getc(fd): _getc_python(f); } indexed_gzip-1.6.4/indexed_gzip/tests/0000755000175000017500000000000014133320576017340 5ustar nileshnileshindexed_gzip-1.6.4/indexed_gzip/tests/conftest.py0000644000175000017500000000506014133320576021540 0ustar nileshnilesh#!/usr/bin/env python # # conftest.py - # # Author: Paul McCarthy # import os import os.path as op import numpy as np import pytest def pytest_addoption(parser): parser.addoption('--nelems', type=str, action='store', default='rnd_16777217', help='Number of uint64 elements for test data') parser.addoption('--concat', action='store_true', help='Generate test data made of ' 'concatenated GZIP streams') parser.addoption('--use_mmap', action='store_true', help='Use mmap for read buffer instead of main memory') parser.addoption('--seed', type=int, help='Seed for random number generator') parser.addoption('--testfile', action='store', help='Name of test file') parser.addoption('--niters', type=int, action='store', default=1000, help='Number of inputs for tests which ' 'use a random set of inputs') @pytest.fixture def nelems(request): val = request.config.getoption('--nelems') if val.startswith('rnd_'): if hasattr(nelems, 'val'): val = nelems.val else: # val +/- 20% val = int(val.split('_')[1]) var = (np.random.random() - 0.5) * val * 0.4 val = round(val + var) nelems.val = val return int(val) @pytest.fixture def niters(request): return request.config.getoption('--niters') @pytest.fixture def concat(request): return request.config.getoption('--concat') @pytest.fixture def use_mmap(request): return request.config.getoption('--use_mmap') @pytest.fixture def seed(request): seed = request.config.getoption('--seed') if seed is None: seed = np.random.randint(2 ** 30) np.random.seed(seed) print('Seed for random number generator: {}'.format(seed)) return seed @pytest.fixture def testfile(request, nelems, concat): from indexed_gzip.tests import gen_test_data filename = request.config.getoption('--testfile') if filename is None: 
filename = op.join(os.getcwd(), 'ctest_zran_{}_{}.gz'.format(nelems, concat)) if not op.exists(filename): gen_test_data(filename, nelems, concat) return filename indexed_gzip-1.6.4/indexed_gzip/tests/test_indexed_gzip_threading.py0000644000175000017500000000430214133320576025446 0ustar nileshnilesh#!/usr/bin/env python # # test_indexed_gzip_threading.py - # # Author: Paul McCarthy # from __future__ import print_function from __future__ import division import sys import threading import numpy as np import pytest import indexed_gzip as igzip from . import check_data_valid pytestmark = pytest.mark.indexed_gzip_test def test_IndexedGzipFile_open_close(testfile, nelems, concat): _test_IndexedGzipFile_open_close(testfile, False) def test_IndexedGzipFile_open_close_drop_handles(testfile, nelems, concat): _test_IndexedGzipFile_open_close(testfile, True) @pytest.mark.slow_test def test_IndexedGzipFile_pread_threaded(testfile, nelems, concat): _test_IndexedGzipFile_pread_threaded(testfile, nelems, False) @pytest.mark.slow_test def test_IndexedGzipFile_pread_threaded_drop_handles(testfile, nelems, concat): _test_IndexedGzipFile_pread_threaded(testfile, nelems, True) def _test_IndexedGzipFile_open_close(testfile, drop): f = igzip.IndexedGzipFile(filename=testfile, drop_handles=drop) f.seek(10) f.read(10) f.close() def _test_IndexedGzipFile_pread_threaded(testfile, nelems, drop): filesize = nelems * 8 indexSpacing = max(524288, filesize // 2000) with igzip.IndexedGzipFile(filename=testfile, spacing=indexSpacing, drop_handles=drop) as f: readelems = 50 readsize = readelems * 8 nthreads = 100 allreads = [] def do_pread(nbytes, offset): data = f.pread(nbytes, int(offset * 8)) allreads.append((offset, data)) offsets = np.linspace(0, nelems - readelems, nthreads, dtype=np.uint64) threads = [threading.Thread(target=do_pread, args=(readsize, o)) for o in offsets] [t.start() for t in threads] [t.join() for t in threads] assert len(allreads) == nthreads for offset, data in allreads: 
assert len(data) == readsize data = np.ndarray(shape=readelems, dtype=np.uint64, buffer=data) assert check_data_valid(data, offset, offset + readelems) indexed_gzip-1.6.4/indexed_gzip/tests/ctest_zran.pyx0000644000175000017500000013655514133320576022275 0ustar nileshnilesh# # Tests for the zran module. # # Author: Paul McCarthy # from __future__ import print_function from __future__ import division import os import os.path as op import itertools as it import subprocess as sp import sys import time import gzip import shutil import random import hashlib import tempfile import threading import contextlib import numpy as np # The stdlib resource module is only # available on unix-like platforms. try: import resource except ImportError: resource = None cimport numpy as np from posix.types cimport off_t from io import BytesIO from libc.stdio cimport (SEEK_SET, SEEK_CUR, SEEK_END, FILE, fdopen, fwrite) from libc.stdint cimport int64_t from libc.string cimport memset, memcmp from cpython.exc cimport (PyErr_Clear, PyErr_SetString) from cpython.mem cimport (PyMem_Malloc, PyMem_Realloc, PyMem_Free) from cpython.ref cimport PyObject from posix.mman cimport (mmap, munmap, PROT_READ, PROT_WRITE, MAP_ANON, MAP_SHARED) from . 
import poll, check_data_valid, tempdir, compress_inmem cdef extern from "sys/mman.h": cdef enum: MAP_FAILED cimport indexed_gzip.zran as zran cimport indexed_gzip.zran_file_util as zran_file_util np.import_array() cdef read_element(zran.zran_index_t *index, element, nelems, seek=True): cdef void *buffer buf = ReadBuffer(8) buffer = buf.buffer if element >= nelems: expseek = zran.ZRAN_SEEK_EOF else: expseek = zran.ZRAN_SEEK_OK if element >= nelems: exptell = (nelems * 8) else: exptell = element * 8 if seek: gotseek = zran.zran_seek(index, element * 8, SEEK_SET, NULL) gottell = zran.zran_tell(index) try: assert gotseek == expseek assert gottell == exptell except: print('expseek: {}'.format(expseek)) print('exptell: {}'.format(exptell)) print('gotseek: {}'.format(gotseek)) print('gottell: {}'.format(gottell)) raise if element >= nelems: expread = zran.ZRAN_READ_EOF else: expread = 8 if element >= nelems: exptell = (nelems * 8) else: exptell = (element + 1) * 8 gotread = zran.zran_read(index, buffer, 8) gottell = zran.zran_tell(index) try: assert gotread == expread assert gottell == exptell except: print('nelems: {}'.format(nelems)) print('element: {}'.format(element)) print('expread: {}'.format(expread)) print('exptell: {}'.format(exptell)) print('gotread: {}'.format(gotread)) print('gottell: {}'.format(gottell)) raise if element < nelems: pybuf = (buffer)[:8] val = np.ndarray(1, np.uint64, buffer=pybuf) return val[0] else: return None cdef class ReadBuffer: """Wrapper around a chunk of memory. .. see:: http://docs.cython.org/src/tutorial/memory_allocation.html """ cdef void *buffer """A raw chunk of bytes. """ cdef bint use_mmap """ """ cdef size_t size """ """ cdef object mmap_fd cdef object mmap_path def __cinit__(self, size_t size, use_mmap=False): """Allocate ``size`` bytes of memory. 
""" self.use_mmap = use_mmap self.mmap_fd = None self.mmap_path = None self.size = size self.buffer = NULL if not self.use_mmap: self.buffer = PyMem_Malloc(size) memset(self.buffer, 0, size); else: fd, path = tempfile.mkstemp('readbuf_mmap_{}'.format(id(self))) print('Memory-mapping {:0.2f} GB ({})'.format(size / 1073741824., path)) def initmem(): towrite = size while towrite > 0: zeros = np.zeros(min(towrite, 134217728), dtype=np.uint8) towrite -= len(zeros) os.write(fd, zeros.tostring()) th = threading.Thread(target=initmem) th.start() poll(lambda : not th.is_alive()) self.mmap_fd = fd self.mmap_path = path self.buffer = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0) if self.buffer == MAP_FAILED: raise RuntimeError('mmap fail') if not self.buffer: raise RuntimeError('ReadBuffer init fail') def resize(self, size_t size): """Re-allocate the memory to the given ``size``. """ if self.use_mmap: raise NotImplementedError('Cannot resize a memmapped array!') buf = PyMem_Realloc(self.buffer, size) if not buf: raise MemoryError('PyMem_Realloc fail') self.buffer = buf self.size = size def __dealloc__(self): """Free the mwmory. 
""" if not self.use_mmap: PyMem_Free(self.buffer) else: munmap(self.buffer, self.size) os.close( self.mmap_fd) os.remove(self.mmap_path) def error_fn(*args, **kwargs): raise Exception("Error") def return_fn(return_value): return lambda *args, **kwargs: return_value def test_fread(): """Tests Python wrapper C function for fread.""" f = BytesIO(b"abc") cdef char buf[3] elems_read = zran_file_util._fread_python(buf, 1, 3, f) assert elems_read == 3 assert f.tell() == 3 assert buf[0:3] == b"abc" assert zran_file_util._ferror_python(f) == 0 # fread error conditions: for fn in [error_fn, return_fn(None)]: f.read = fn assert zran_file_util._fread_python(buf, 1, 3, f) == 0 assert zran_file_util._ferror_python(f) == 1 PyErr_Clear() def test_ftell(): f = BytesIO(b"abc") assert zran_file_util._ftell_python(f) == 0 f.seek(2) assert zran_file_util._ftell_python(f) == 2 assert zran_file_util._ferror_python(f) == 0 # ftell error conditions for fn in [error_fn, return_fn(None)]: f.tell = fn assert zran_file_util._ftell_python(f) == -1 assert zran_file_util._ferror_python(f) == 1 PyErr_Clear() def test_fseek(): f = BytesIO(b"abc") zran_file_util._fseek_python(f, 1, SEEK_SET) assert f.tell() == 1 zran_file_util._fseek_python(f, -1, SEEK_END) assert f.tell() == 2 zran_file_util._fseek_python(f, 100, SEEK_SET) assert f.tell() == 100 assert zran_file_util._ferror_python(f) == 0 # fseek error conditions for fn in [error_fn]: f.seek = fn assert zran_file_util._fseek_python(f, 1, SEEK_SET) == -1 assert zran_file_util._ferror_python(f) == 1 PyErr_Clear() def test_feof(): f = BytesIO(b"abc") f.seek(0) # the EOF indicator shouldn't be set... assert zran_file_util._feof_python(f, 2) == 0 # ...unless f_read is zero. 
assert zran_file_util._feof_python(f, 0) == 1 assert zran_file_util._ferror_python(f) == 0 def test_ferror(): f = BytesIO(b"abc") assert zran_file_util._ferror_python(f) == 0 PyErr_SetString(ValueError, "Error") assert zran_file_util._ferror_python(f) == 1 PyErr_Clear() assert zran_file_util._ferror_python(f) == 0 def test_fflush(): f = BytesIO(b"abc") zran_file_util._fflush_python(f) assert zran_file_util._ferror_python(f) == 0 # fflush error conditions for fn in [error_fn]: f.flush = fn assert zran_file_util._fflush_python(f) == -1 assert zran_file_util._ferror_python(f) == 1 PyErr_Clear() def test_fwrite(): f = BytesIO(b"abc") cdef char* inp = 'de' elems_written = zran_file_util._fwrite_python(inp, 1, 2, f) assert elems_written == 2 assert f.tell() == 2 assert zran_file_util._ferror_python(f) == 0 f.seek(0) assert f.read() == b"dec" # fwrite error conditions # In Python 2, .write() returns None, so its return value # is ignored by _fwrite_python and can't cause an error. for fn in [error_fn, return_fn(None)] if sys.version_info[0] >= 3 else [error_fn]: f.write = fn result = zran_file_util._fwrite_python(inp, 1, 2, f) assert result == 0, result assert zran_file_util._ferror_python(f) == 1 PyErr_Clear() def test_getc(): f = BytesIO(b"dbc") assert zran_file_util._getc_python(f) == ord(b"d") assert zran_file_util._ferror_python(f) == 0 assert zran_file_util._getc_python(f) == ord(b"b") assert zran_file_util._ferror_python(f) == 0 assert zran_file_util._getc_python(f) == ord(b"c") assert zran_file_util._ferror_python(f) == 0 assert zran_file_util._getc_python(f) == -1 # reached EOF assert zran_file_util._ferror_python(f) == 0 assert zran_file_util._feof_python(f, 0) == 1 # getc error conditions for fn in [error_fn, return_fn(None)]: f.read = fn assert zran_file_util._getc_python(f) == -1 assert zran_file_util._ferror_python(f) == 1 PyErr_Clear() def test_init(testfile, no_fds): """Tests a bunch of permutations of the parameters to zran_init. 
""" spacings = [0, 16384, 32768, 65536, 524288, 1048576, 2097152, 4194304] window_sizes = [0, 8192, 16384, 32768, 65536, 131072] readbuf_sizes = [0, 8192, 16384, 24576, 32768, 65536, 131072] flags = [0, zran.ZRAN_AUTO_BUILD] cdef zran.zran_index_t index cdef FILE *cfid with open(testfile, 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') for s, w, r, f in it.product(spacings, window_sizes, readbuf_sizes, flags): result = not zran.zran_init(&index, NULL if no_fds else cfid, pyfid if no_fds else NULL, s, w, r, f) expected = True # zran_init should fail if the point spacing # is less than the window size if w == 0: w = 32768 if s == 0: s = 1048576 if r == 0: r = 16384 expected = (w >= 32768) and (s > w) assert result == expected zran.zran_free(&index) def test_init_file_modes(testfile, no_fds): modes = ['r', 'r+', 'w', 'w+', 'a', 'a+'] files = [testfile, testfile, 'dummy.gz', 'dummy.gz', 'dummy.gz', 'dummy.gz'] cdef zran.zran_index_t index cdef FILE *cfid cdef bytes bmode cdef char *cmode for filename, mode in zip(files, modes): with open(filename, mode) as pyfid: bmode = mode.encode() cmode = bmode cfid = fdopen(pyfid.fileno(), cmode) # If no_fds is set, we can't detect the mode, so reading is always allowed. 
expected = 1 if no_fds else mode == 'r' result = not zran.zran_init(&index, NULL if no_fds else cfid, pyfid if no_fds else NULL, 0, 0, 0, 0) assert result == expected zran.zran_free(&index) if filename == 'dummy.gz' and op.exists(filename): os.remove(filename) def test_no_auto_build(testfile, no_fds, nelems): cdef zran.zran_index_t index cdef void *buffer filesize = nelems * 8 indexSpacing = max(1048576, filesize // 1500) bufSize = 1048576 buf = ReadBuffer(bufSize) buffer = buf.buffer with open(testfile, 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') assert not zran.zran_init(&index, NULL if no_fds else cfid, pyfid if no_fds else NULL, indexSpacing, 32768, 131072, 0) assert zran.zran_seek(&index, 0, SEEK_SET, NULL) == zran.ZRAN_SEEK_OK assert zran.zran_tell(&index) == 0 assert zran.zran_seek(&index, 1, SEEK_SET, NULL) == zran.ZRAN_SEEK_NOT_COVERED assert zran.zran_tell(&index) == 0 gotread = zran.zran_read(&index, buffer, bufSize) gottell = zran.zran_tell(&index) if bufSize > filesize: expread = filesize else: expread = bufSize if bufSize > filesize: exptell = filesize else: exptell = bufSize try: assert gotread == expread assert gottell == exptell except: print("expread: {}".format(expread)) print("gotread: {}".format(gotread)) print("exptell: {}".format(exptell)) print("gottell: {}".format(gottell)) raise pybuf = (buffer)[:gotread] data = np.ndarray(gotread // 8, np.uint64, pybuf) assert check_data_valid(data, 0) if bufSize < filesize: assert zran.zran_read(&index, buffer, bufSize) == zran.ZRAN_READ_NOT_COVERED def test_seek_to_end(testfile, no_fds, nelems): cdef zran.zran_index_t index filesize = nelems * 8 seek_point = filesize - 1 indexSpacing = max(524288, filesize // 1500) with open(testfile, 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') assert not zran.zran_init(&index, NULL if no_fds else cfid, pyfid if no_fds else NULL, indexSpacing, 32768, 131072, zran.ZRAN_AUTO_BUILD) assert zran.zran_seek(&index, seek_point, SEEK_SET, NULL) == 0 zt = 
zran.zran_tell(&index) assert zt == seek_point zran.zran_free(&index) def test_seek_cur(testfile, no_fds, nelems): cdef zran.zran_index_t index filesize = nelems * 8 indexSpacing = max(524288, filesize // 1500) seekstep = max(1, (nelems - 1) // 500) curelem = 0; with open(testfile, 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') assert not zran.zran_init(&index, NULL if no_fds else cfid, pyfid if no_fds else NULL, indexSpacing, 32768, 131072, zran.ZRAN_AUTO_BUILD) while curelem < nelems: if (curelem + seekstep) * 8 < filesize: exp = zran.ZRAN_SEEK_OK else: exp = zran.ZRAN_SEEK_EOF out = zran.zran_seek(&index, seekstep * 8, SEEK_CUR, NULL) assert out == exp, out if exp == zran.ZRAN_SEEK_EOF: break curelem += seekstep zt = zran.zran_tell(&index) val = read_element(&index, curelem, nelems, False) assert zt == curelem * 8 assert val == curelem assert zran.zran_seek(&index, -8, SEEK_CUR, NULL) == zran.ZRAN_SEEK_OK zran.zran_free(&index) def test_seek_end(testfile, no_fds, nelems): cdef zran.zran_index_t index filesize = nelems * 8 indexSpacing = max(131072, filesize // 1500) seekstep = max(1, (nelems - 1) // 500) curelem = 0 with open(testfile, 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') assert not zran.zran_init(&index, NULL if no_fds else cfid, pyfid if no_fds else NULL, indexSpacing, 32768, 131072, zran.ZRAN_AUTO_BUILD) assert zran.zran_seek(&index, -10, SEEK_END, NULL) == zran.ZRAN_SEEK_INDEX_NOT_BUILT assert zran.zran_seek(&index, 20, SEEK_SET, NULL) == zran.ZRAN_SEEK_OK assert zran.zran_tell(&index) == 20 assert zran.zran_seek(&index, -10, SEEK_END, NULL) == zran.ZRAN_SEEK_INDEX_NOT_BUILT assert zran.zran_build_index(&index, 0, 0) == 0 assert zran.zran_seek(&index, 0, SEEK_END, NULL) == zran.ZRAN_SEEK_EOF assert zran.zran_tell(&index) == filesize assert zran.zran_seek(&index, -1, SEEK_END, NULL) == zran.ZRAN_SEEK_OK assert zran.zran_tell(&index) == filesize - 1 assert zran.zran_seek(&index, 1, SEEK_END, NULL) == zran.ZRAN_SEEK_EOF assert 
zran.zran_tell(&index) == filesize assert zran.zran_seek(&index, -filesize - 1, SEEK_END, NULL) == zran.ZRAN_SEEK_FAIL assert zran.zran_seek(&index, -filesize, SEEK_END, NULL) == zran.ZRAN_SEEK_OK assert zran.zran_tell(&index) == 0 while curelem < nelems: seekloc = filesize - ((nelems + curelem - 1) * 8) if seekloc >= 0: exp = zran.ZRAN_SEEK_EOF else: exp = zran.ZRAN_SEEK_OK assert zran.zran_seek(&index, seekloc, SEEK_END, NULL) == exp if exp == zran.ZRAN_SEEK_EOF: break curelem += seekstep zt = zran.zran_tell(&index) val = read_element(&index, curelem, nelems, False) assert zt == curelem * 8 assert val == curelem zran.zran_free(&index) def test_seek_beyond_end(testfile, no_fds, nelems): cdef zran.zran_index_t index filesize = nelems * 8 indexSpacing = max(524288, filesize // 1500) seekpoints = [filesize - 10, filesize - 2, filesize - 1, filesize, filesize + 1, filesize + 2, filesize + 10] with open(testfile, 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') assert not zran.zran_init(&index, NULL if no_fds else cfid, pyfid if no_fds else NULL, indexSpacing, 32768, 131072, zran.ZRAN_AUTO_BUILD) for sp in seekpoints: zs = zran.zran_seek(&index, sp, SEEK_SET, NULL) if sp >= filesize: expected = zran.ZRAN_SEEK_EOF else: expected = zran.ZRAN_SEEK_OK try: assert zs == expected except: print("{} != {} [sp={}, size={}]".format(zs, expected, sp, filesize)) raise zt = zran.zran_tell(&index) if sp >= filesize: expected = filesize else: expected = sp try: assert zt == expected except: print("{} != {}".format(zt, expected)) raise zran.zran_free(&index) def test_sequential_seek_to_end(testfile, no_fds, nelems, niters): cdef zran.zran_index_t index filesize = nelems * 8 seek_points = np.random.randint(0, filesize, niters, dtype=np.uint64) seek_points = np.sort(seek_points) indexSpacing = max(524288, filesize // 2000) with open(testfile, 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') assert not zran.zran_init(&index, NULL if no_fds else cfid, pyfid if no_fds else NULL, 
indexSpacing, 32768, 131072, zran.ZRAN_AUTO_BUILD) for sp in seek_points: if sp >= filesize: expseek = zran.ZRAN_SEEK_EOF exptell = filesize else: expseek = zran.ZRAN_SEEK_OK exptell = sp seek = zran.zran_seek(&index, sp, SEEK_SET, NULL) tell = zran.zran_tell(&index) try: assert seek == expseek assert tell == exptell except: print("expseek: {}".format(expseek)) print("exptell: {}".format(exptell)) print("seek: {}".format(seek)) print("tell: {}".format(tell)) raise zran.zran_free(&index) def test_random_seek(testfile, no_fds, nelems, niters, seed): cdef zran.zran_index_t index filesize = nelems * 8 seekpoints = [random.randint(0, filesize) for i in range(niters)] indexSpacing = max(524288, filesize // 1000) with open(testfile, 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') assert not zran.zran_init(&index, NULL if no_fds else cfid, pyfid if no_fds else NULL, indexSpacing, 32768, 131072, zran.ZRAN_AUTO_BUILD) for sp in seekpoints: assert zran.zran_seek(&index, sp, SEEK_SET, NULL) == 0 zt = zran.zran_tell(&index) assert zt == sp zran.zran_free(&index) def test_read_all(testfile, no_fds, nelems, use_mmap): filesize = nelems * 8 indexSpacing = max(524288, filesize // 1000) cdef zran.zran_index_t index cdef void *buffer cdef np.npy_intp nelemsp buf = ReadBuffer(filesize, use_mmap=use_mmap) buffer = buf.buffer with open(testfile, 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') assert not zran.zran_init(&index, NULL if no_fds else cfid, pyfid if no_fds else NULL, indexSpacing, 32768, 131072, zran.ZRAN_AUTO_BUILD) nbytes = zran.zran_read(&index, buffer, filesize) assert nbytes == filesize assert zran.zran_tell(&index) == nbytes zran.zran_free(&index) nelemsp = nbytes / 8. 
data = np.PyArray_SimpleNewFromData(1, &nelemsp, np.NPY_UINT64, buffer) assert check_data_valid(data, 0) def test_seek_then_read_block(testfile, no_fds, nelems, niters, seed, use_mmap): filesize = nelems * 8 indexSpacing = max(524288, filesize // 1000) buf = ReadBuffer(filesize, use_mmap=use_mmap) seekelems = np.random.randint(0, nelems - 1, niters, dtype=np.uint64) cdef zran.zran_index_t index cdef void *buffer = buf.buffer cdef np.npy_intp nelemsp with open(testfile, 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') ret = zran.zran_init(&index, NULL if no_fds else cfid, pyfid if no_fds else NULL, indexSpacing, 32768, 131072, zran.ZRAN_AUTO_BUILD) assert not ret, ret for i, se in enumerate(seekelems): if se == nelems - 1: readelems = 1 else: readelems = np.random.randint(1, nelems - se) start = time.time() print("{} / {}: reading {} elements from {} ... ".format( i, len(seekelems), readelems, se), end='') assert zran.zran_seek(&index, se * 8, SEEK_SET, NULL) == zran.ZRAN_SEEK_OK nbytes = zran.zran_read(&index, buffer, readelems * 8) try: assert nbytes == readelems * 8 assert zran.zran_tell(&index) == (se + readelems) * 8 except: print('seekelem: {}'.format(se)) print('readelems: {}'.format(readelems)) print('nbytes: {}'.format(nbytes)) print(' should be: {}'.format(readelems * 8)) print('ftell: {}'.format(zran.zran_tell(&index))) print(' should be: {}'.format((se + readelems) * 8)) raise nelemsp = nbytes / 8. 
data = np.PyArray_SimpleNewFromData(1, &nelemsp, np.NPY_UINT64, buffer) assert check_data_valid(data, se, se + readelems) end = time.time() print("{:0.2f} seconds".format(end - start)) zran.zran_free(&index) def test_random_seek_and_read(testfile, no_fds, nelems, niters, seed): cdef zran.zran_index_t index filesize = nelems * 8 seekelems = np.random.randint(0, nelems, niters) indexSpacing = max(524288, filesize // 1000) with open(testfile, 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') assert not zran.zran_init(&index, NULL if no_fds else cfid, pyfid if no_fds else NULL, indexSpacing, 32768, 131072, zran.ZRAN_AUTO_BUILD) for se in seekelems: # Should never happen if se >= nelems: expval = None else: expval = se val = read_element(&index, se, nelems, True) try: assert val == expval except: print("{} != {}".format(val, se)) raise zran.zran_free(&index) def test_read_all_sequential(testfile, no_fds, nelems): cdef zran.zran_index_t index filesize = nelems * 8 indexSpacing = max(524288, filesize // 1000) # Takes too long to read all elements seekelems = np.random.randint(0, nelems - 1, 10000, dtype=np.uint64) seekelems = np.sort(seekelems) with open(testfile, 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') assert not zran.zran_init(&index, NULL if no_fds else cfid, pyfid if no_fds else NULL, indexSpacing, 32768, 131072, zran.ZRAN_AUTO_BUILD) for se in seekelems: val = read_element(&index, se, nelems, True) try: assert val == se except: print("{} != {}".format(val, se)) print("{:x} != {:x}".format(val, se)) raise zran.zran_free(&index) def test_build_then_read(testfile, no_fds, nelems, seed, use_mmap): filesize = nelems * 8 indexSpacing = max(524288, filesize // 1000) buf = ReadBuffer(filesize, use_mmap) seekelems = np.random.randint(0, nelems - 1, 5000, dtype=np.uint64) cdef zran.zran_index_t index cdef void *buffer = buf.buffer with open(testfile, 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') assert not zran.zran_init(&index, NULL if no_fds else cfid, 
pyfid if no_fds else NULL, indexSpacing, 32768, 131072, zran.ZRAN_AUTO_BUILD) assert not zran.zran_build_index(&index, 0, 0) for se in seekelems: assert zran.zran_seek(&index, se * 8, SEEK_SET, NULL) == 0 if se == nelems - 1: readelems = 1 else: readelems = np.random.randint(1, min(nelems - se, 5000)) nbytes = zran.zran_read(&index, buffer, readelems * 8) assert nbytes == readelems * 8 assert zran.zran_tell(&index) == (se + readelems) * 8 pybuf = (buffer)[:nbytes] data = np.ndarray(nbytes // 8, np.uint64, pybuf) for i, val in enumerate(data, se): assert val == i zran.zran_free(&index) def test_readbuf_spacing_sizes(testfile, no_fds, nelems, niters, seed): cdef zran.zran_index_t index spacings = [262144, 524288, 1048576, 2097152, 4194304, 8388608] bufsizes = [16384, 65536, 131072, 262144, 524288, 1048575, 1048576, 1048577, 2097152, 4194304, 8388608] seekelems = np.random.randint(0, nelems, niters // 2) seekelems = np.concatenate((spacings, bufsizes, seekelems)) for sbi, (spacing, bufsize) in enumerate(it.product(spacings, bufsizes)): with open(testfile, 'rb') as pyfid: print('{} / {}: spacing={}, bufsize={} ... '.format( sbi, len(spacings) * len(bufsizes), spacing, bufsize), end='') cfid = fdopen(pyfid.fileno(), 'rb') assert not zran.zran_init(&index, NULL if no_fds else cfid, pyfid if no_fds else NULL, spacing, 32768, bufsize, zran.ZRAN_AUTO_BUILD) for i, se in enumerate(seekelems): # print('{} / {}: {}'.format(i, len(seekelems), se)) if se >= nelems: expval = None else: expval = se val = read_element(&index, se, nelems, seek=True) try: assert val == expval except: print('{} != {}'.format(val, expval)) raise print() zran.zran_free(&index) cdef _compare_indexes(zran.zran_index_t *index1, zran.zran_index_t *index2): """Check that two indexes are equivalent. 
""" cdef zran.zran_point_t *p1 cdef zran.zran_point_t *p2 assert index2.compressed_size == index1.compressed_size assert index2.uncompressed_size == index1.uncompressed_size assert index2.spacing == index1.spacing assert index2.window_size == index1.window_size assert index2.npoints == index1.npoints ws = index1.window_size for i in range(index1.npoints): p1 = &index1.list[i] p2 = &index2.list[i] msg = 'Error at point %d' % i assert p2.cmp_offset == p1.cmp_offset, msg assert p2.uncmp_offset == p1.uncmp_offset, msg assert p2.bits == p1.bits, msg if (not p1.data): assert p1.data == p2.data, msg else: assert not memcmp(p2.data, p1.data, ws), msg def test_export_then_import(testfile, no_fds): """Export-import round trip . Test exporting an index, then importing it back in. """ cdef zran.zran_index_t index1 cdef zran.zran_index_t index2 indexSpacing = 1048576 windowSize = 32768 readbufSize = 131072 flag = 0 with open(testfile, 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') assert not zran.zran_init(&index1, NULL if no_fds else cfid, pyfid if no_fds else NULL, indexSpacing, windowSize, readbufSize, flag) assert not zran.zran_build_index(&index1, 0, 0) with open(testfile + '.idx.tmp', 'wb') as pyexportfid: cfid = fdopen(pyexportfid.fileno(), 'ab') ret = zran.zran_export_index(&index1, NULL if no_fds else cfid, pyexportfid if no_fds else NULL) assert not ret, str(ret) with open(testfile, 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') assert not zran.zran_init(&index2, NULL if no_fds else cfid, pyfid if no_fds else NULL, indexSpacing, windowSize, readbufSize, flag) with open(testfile + '.idx.tmp', 'rb') as pyexportfid: cfid = fdopen(pyexportfid.fileno(), 'rb') ret = zran.zran_import_index(&index2, NULL if no_fds else cfid, pyexportfid if no_fds else NULL) assert not ret, str(ret) _compare_indexes(&index1, &index2) zran.zran_free(&index1) zran.zran_free(&index2) def test_export_import_no_points(no_fds): """Test exporting and importing an index which does not 
contain any seek points. """ cdef zran.zran_index_t index cdef void *buffer data = np.random.randint(1, 255, 100, dtype=np.uint8) buf = ReadBuffer(100) buffer = buf.buffer with tempdir(): with gzip.open('data.gz', 'wb') as f: f.write(data.tostring()) with open('data.gz', 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') assert zran.zran_init(&index, NULL if no_fds else cfid, pyfid if no_fds else NULL, 1048576, 32768, 131072, 0) == 0 output = zran.zran_read(&index, buffer, 100) assert output == 100, output pybuf = (buffer)[:100] assert np.all(np.frombuffer(pybuf, dtype=np.uint8) == data) with open('data.gz.index', 'wb') as pyidxfid: cidxfid = fdopen(pyidxfid.fileno(), 'wb') assert zran.zran_export_index(&index, NULL if no_fds else cidxfid, pyidxfid if no_fds else NULL) == 0 zran.zran_free(&index) with open('data.gz', 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') assert zran.zran_init(&index, NULL if no_fds else cfid, pyfid if no_fds else NULL, 1048576, 32768, 131072, 0) == 0 with open('data.gz.index', 'rb') as pyidxfid: cidxfid = fdopen(pyidxfid.fileno(), 'rb') assert zran.zran_import_index(&index, NULL if no_fds else cidxfid, pyidxfid if no_fds else NULL) == 0 assert index.npoints == 0 assert zran.zran_read(&index, buffer, 100) == 100 pybuf = (buffer)[:100] assert np.all(np.frombuffer(pybuf, dtype=np.uint8) == data) zran.zran_free(&index) def test_export_import_format_v0(): """Test index export and import on a version 0 index file. 
""" cdef zran.zran_index_t index1 cdef zran.zran_index_t index2 cdef int ret data = np.random.randint(1, 255, 1000000, dtype=np.uint8) with tempdir(): with gzip.open('data.gz', 'wb') as f: f.write(data.tostring()) with open('data.gz', 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') assert not zran.zran_init( &index1, cfid, NULL, 50000, 32768, 131072, 0) assert not zran.zran_build_index(&index1, 0, 0) _write_index_file_v0(&index1, 'data.gz.index') with open('data.gz', 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') assert not zran.zran_init( &index2, cfid, NULL, 50000, 32768, 131072, 0) with open('data.gz.index', 'rb') as pyidxfid: cidxfid = fdopen(pyidxfid.fileno(), 'rb') ret = zran.zran_import_index(&index2, cidxfid, NULL) assert ret == 0, ret _compare_indexes(&index1, &index2) zran.zran_free(&index1) zran.zran_free(&index2) cdef _write_index_file_v0(zran.zran_index_t *index, dest): """Write the given index out to a file, index file version 0 format. """ cdef zran.zran_point_t *point with open(dest, 'wb') as f: f.write(b'GZIDX\0\0') f.write(((&index.compressed_size))[:8]) f.write(((&index.uncompressed_size))[:8]) f.write(((&index.spacing))[:4]) f.write(((&index.window_size))[:4]) f.write(((&index.npoints))[:4]) for i in range(index.npoints): point = &index.list[i] f.write(((&point.cmp_offset))[:8]) f.write(((&point.uncmp_offset))[:8]) f.write(((&point.bits))[:1]) for i in range(1, index.npoints): point = &index.list[i] data = point.data[:index.window_size] f.write(data) def test_crc_validation(concat): """Basic test of CRC validation. 
""" cdef zran.zran_index_t index cdef void *buffer cdef int64_t ret # use uint32 so there are lots of zeros, # and so there is something to compress dsize = 1048576 * 10 data = np.random.randint(0, 255, dsize // 4, dtype=np.uint32) cmpdata, strmoffs = compress_inmem(data.tobytes(), concat) buf = ReadBuffer(dsize) buffer = buf.buffer f = [None] # to prevent gc def _zran_init(flags): f[0] = BytesIO(cmpdata) assert not zran.zran_init(&index, NULL, f[0], 1048576, 32768, 131072, flags) def _run_crc_tests(shouldpass, flags=zran.ZRAN_AUTO_BUILD): if shouldpass: expect_build = zran.ZRAN_BUILD_INDEX_OK expect_seek = zran.ZRAN_SEEK_OK expect_read = dsize else: expect_build = zran.ZRAN_BUILD_INDEX_CRC_ERROR expect_seek = zran.ZRAN_SEEK_CRC_ERROR expect_read = zran.ZRAN_READ_CRC_ERROR # CRC validation should occur on the first # pass through a gzip stream, regardless # of how that pass is initiated. Below we # test the most common scenarios. # Error if we try to build an index. Note # that an error here is not guaranteed, as # the _zran_expand_index might need a few # passes through the data to reach the end, # which might cause inflation to be # re-initialised, and therefore validation # to be disabled. 
It depends on the data, # and on the constants used in # _zran_estimate_offset _zran_init(flags) ret = zran.zran_build_index(&index, 0, 0) assert ret == expect_build, ret zran.zran_free(&index) # error if we try to seek _zran_init(flags) ret = zran.zran_seek(&index, dsize - 1, SEEK_SET, NULL) assert ret == expect_seek, ret zran.zran_free(&index) # error if we try to read _zran_init(flags) ret = zran.zran_read(&index, buffer, dsize) assert ret == expect_read, ret zran.zran_free(&index) if shouldpass: pybuf = (buffer)[:dsize] assert np.all(np.frombuffer(pybuf, dtype=np.uint32) == data) def wrap(val): return val % 255 # data/crc is good, all should be well _run_crc_tests(True) # corrupt the size, we should get an error cmpdata[-1] = wrap(cmpdata[-1] + 1) # corrupt size _run_crc_tests(False) # corrupt the crc, we should get an error cmpdata[-1] = wrap(cmpdata[-1] - 1) # restore size to correct value cmpdata[-5] = wrap(cmpdata[-5] + 1) # corrupt crc _run_crc_tests(False) # Corrupt a different stream, if we have more than one cmpdata[-5] = wrap(cmpdata[-5] - 1) # restore crc to correct value if len(strmoffs) > 1: for off in strmoffs[1:]: cmpdata[off-1] = wrap(cmpdata[off-1] + 1) _run_crc_tests(False) cmpdata[off-1] = wrap(cmpdata[off-1] - 1) # Disable CRC, all should be well, even with a corrupt CRC/size # First test with good data _run_crc_tests(True, zran.ZRAN_AUTO_BUILD | zran.ZRAN_SKIP_CRC_CHECK) cmpdata[-1] = wrap(cmpdata[-1] + 1) # corrupt size _run_crc_tests(True, zran.ZRAN_AUTO_BUILD | zran.ZRAN_SKIP_CRC_CHECK) cmpdata[-1] = wrap(cmpdata[-1] - 1) # restore size to correct value cmpdata[-5] = wrap(cmpdata[-5] - 1) # corrupt crc _run_crc_tests(True, zran.ZRAN_AUTO_BUILD | zran.ZRAN_SKIP_CRC_CHECK) def test_standard_usage_with_null_padding(concat): """Make sure standard usage works with files that have null-padding after the GZIP footer. 
See https://www.gnu.org/software/gzip/manual/gzip.html#Tapes """ cdef zran.zran_index_t index cdef void *buffer cdef int64_t ret # use uint32 so there are lots of zeros, # and so there is something to compress dsize = 1048576 * 10 data = np.random.randint(0, 255, dsize // 4, dtype=np.uint32) cmpdata, strmoffs = compress_inmem(data.tobytes(), concat) buf = ReadBuffer(dsize) buffer = buf.buffer f = [None] # to prevent gc # random amount of padding for each stream padding = np.random.randint(1, 100, len(strmoffs)) # new compressed data - bytearrays # are initialised to contain all 0s paddedcmpdata = bytearray(len(cmpdata) + padding.sum()) # copy old unpadded compressed data # into new padded compressed data padoff = 0 # offset into padded data last = 0 # offset to end of last copied stream in unpadded data print('Padding streams [orig size: {}] ...'.format(len(cmpdata))) for off, pad in zip(strmoffs, padding): strm = cmpdata[last:off] paddedcmpdata[padoff:padoff + len(strm)] = strm print(' Copied stream from [{} - {}] to [{} - {}] ({} ' 'padding bytes)'.format( last, off, padoff, padoff + len(strm), pad)) padoff += len(strm) + pad last = off def _zran_init(): f[0] = BytesIO(paddedcmpdata) assert not zran.zran_init(&index, NULL, f[0], 1048576, 32768, 131072, zran.ZRAN_AUTO_BUILD) _zran_init() ret = zran.zran_build_index(&index, 0, 0) assert ret == zran.ZRAN_BUILD_INDEX_OK, ret zran.zran_free(&index) _zran_init() ret = zran.zran_seek(&index, dsize - 1, SEEK_SET, NULL) assert ret == zran.ZRAN_SEEK_OK, ret zran.zran_free(&index) _zran_init() ret = zran.zran_read(&index, buffer, dsize) assert ret == dsize, ret zran.zran_free(&index) pybuf = (buffer)[:dsize] assert np.all(np.frombuffer(pybuf, dtype=np.uint32) == data) # pauldmccarthy/indexed_gzip#82 def test_inflateInit_leak_on_error(): """Make sure memory is not leaked after a successful call to inflateInit2(), but then a failure on subsequent zlib calls. 
""" cdef zran.zran_index_t index # inflateInit2 is called twice in the _zran_zlib_init_inflate function. # We can target the first call by passing a file containing random noise. # I haven't yet figured out a reliable way to target the second call. f = BytesIO(np.arange(1, 100).tobytes()) iters = np.arange(1, 10000) mem = np.zeros(10000, dtype=np.uint64) for i in iters: assert not zran.zran_init(&index, NULL, f, 1048576, 32768, 131072, zran.ZRAN_AUTO_BUILD) assert zran.zran_seek(&index, 20, SEEK_SET, NULL) == \ zran.ZRAN_SEEK_FAIL zran.zran_free(&index) if resource is not None: mem[i] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss # We expect to see some small growth in memory # usage for the first few iterations, but then # it should remain stable mem = mem[5:] assert np.all(mem == mem[0]) # pauldmccarthy/indexed_gzip#80 def test_read_eof_memmove_rotate_bug(seed): # This bug was triggered by the read buffer rotation # that takes place in zran.c::_zran_read_data_from_file, # and occurs when the file is at EOF, and the # stream->next_in pointer is ahead of index->readbuf by # less than stream->avail_in bytes. In this case, the # source and dest pointers passed to memmove are # overlapping, so the area pointed to by next_in is # modified. The bug was that, when at EOF, the # stream->next_in pointer was not being reset to point # to the beginning of readbuf, so the subsequent read # of the gzip footer in _zran_validate_stream was # reading from the wrong location. 
# # We can trigger this situation by generating a file # which has compressed file size (X * readbuf_size) + Y, # for any integer x, and for 9 <= Y < 16 cdef zran.zran_index_t index cdef FILE *cfid with tempdir(): nelems = np.random.randint(524288, 525000, 1)[0] data = np.random.random(nelems) with gzip.open('test.gz', 'wb') as f: f.write(data.tobytes()) fsize = os.stat('test.gz').st_size readbuf_size = fsize - 10 with open('test.gz', 'rb') as pyfid: cfid = fdopen(pyfid.fileno(), 'rb') assert not zran.zran_init(&index, cfid, NULL, 4194304, 32768, readbuf_size, zran.ZRAN_AUTO_BUILD) eof = nelems * 8 - 1 got = zran.zran_seek(&index, eof, SEEK_SET, NULL) assert got == zran.ZRAN_SEEK_OK, got assert zran.zran_tell(&index) == eof indexed_gzip-1.6.4/indexed_gzip/tests/test_indexed_gzip.py0000644000175000017500000001313014133320576023420 0ustar nileshnilesh#!/usr/bin/env python # # test_indexed_gzip.py - Python wrapper around ctest_indexed_gzip.pyx. # # Author: Paul McCarthy # import pytest from . 
import ctest_indexed_gzip pytestmark = pytest.mark.indexed_gzip_test def test_open_close(testfile, nelems, seed): ctest_indexed_gzip.test_open_close(testfile, nelems, seed, False) def test_open_function(testfile, nelems): ctest_indexed_gzip.test_open_function(testfile, nelems) def test_open_close_drop_handles(testfile, nelems, seed): ctest_indexed_gzip.test_open_close(testfile, nelems, seed, True) def test_open_close_ctxmanager(testfile, nelems, seed): ctest_indexed_gzip.test_open_close_ctxmanager( testfile, nelems, seed, False) def test_open_close_ctxmanager_drop_handles(testfile, nelems, seed): ctest_indexed_gzip.test_open_close_ctxmanager(testfile, nelems, seed, True) def test_open_mode(): ctest_indexed_gzip.test_open_mode() def test_atts(testfile): ctest_indexed_gzip.test_atts(testfile, False) def test_atts_drop_handles(testfile): ctest_indexed_gzip.test_atts(testfile, True) def test_init_failure_cases(concat): ctest_indexed_gzip.test_init_failure_cases(concat, False) def test_init_failure_cases_drop_handles(concat): ctest_indexed_gzip.test_init_failure_cases(concat, True) def test_init_success_cases(concat): ctest_indexed_gzip.test_init_success_cases(concat, False) def test_init_success_cases_drop_handles(concat): ctest_indexed_gzip.test_init_success_cases(concat, True) def test_accept_filename_or_fileobj(testfile, nelems): ctest_indexed_gzip.test_accept_filename_or_fileobj(testfile, nelems) def test_prioritize_fd_over_f(testfile, nelems): ctest_indexed_gzip.test_prioritize_fd_over_f(testfile, nelems) def test_create_from_open_handle(testfile, nelems, seed): ctest_indexed_gzip.test_create_from_open_handle( testfile, nelems, seed, False, False) def test_create_from_open_handle_drop_handles(testfile, nelems, seed): ctest_indexed_gzip.test_create_from_open_handle( testfile, nelems, seed, True, False) def test_create_from_file_like_obj(testfile, nelems, seed): ctest_indexed_gzip.test_create_from_open_handle( testfile, nelems, seed, False, True) def 
test_create_from_file_like_obj_drop_handles(testfile, nelems, seed): ctest_indexed_gzip.test_create_from_open_handle( testfile, nelems, seed, True, True) def test_handles_not_dropped(testfile, nelems, seed): ctest_indexed_gzip.test_handles_not_dropped(testfile, nelems, seed) def test_manual_build(): ctest_indexed_gzip.test_manual_build() def test_read_all(testfile, nelems, use_mmap): ctest_indexed_gzip.test_read_all(testfile, nelems, use_mmap, False) def test_read_all_drop_handles(testfile, nelems, use_mmap): ctest_indexed_gzip.test_read_all(testfile, nelems, use_mmap, True) def test_simple_read_with_null_padding(): ctest_indexed_gzip.test_simple_read_with_null_padding() def test_read_with_null_padding(testfile, nelems, use_mmap): ctest_indexed_gzip.test_read_with_null_padding(testfile, nelems, use_mmap) def test_read_beyond_end(concat): ctest_indexed_gzip.test_read_beyond_end(concat, False) def test_seek(concat): ctest_indexed_gzip.test_seek(concat) def test_read_beyond_end_drop_handles(concat): ctest_indexed_gzip.test_read_beyond_end(concat, True) def test_seek_and_read(testfile, nelems, niters, seed): ctest_indexed_gzip.test_seek_and_read( testfile, nelems, niters, seed, False) def test_seek_and_read_drop_handles(testfile, nelems, niters, seed): ctest_indexed_gzip.test_seek_and_read(testfile, nelems, niters, seed, True) def test_seek_and_tell(testfile, nelems, niters, seed): ctest_indexed_gzip.test_seek_and_tell( testfile, nelems, niters, seed, False) def test_seek_and_tell_drop_handles(testfile, nelems, niters, seed): ctest_indexed_gzip.test_seek_and_tell(testfile, nelems, niters, seed, True) def test_pread(): ctest_indexed_gzip.test_pread() def test_readinto(): ctest_indexed_gzip.test_readinto(False) def test_readinto_drop_handles(): ctest_indexed_gzip.test_readinto(True) def test_readline(): ctest_indexed_gzip.test_readline(False) def test_readline_drop_handles(): ctest_indexed_gzip.test_readline(True) def test_readline_sizelimit(): 
ctest_indexed_gzip.test_readline_sizelimit(False) def test_readline_sizelimit_drop_handles(): ctest_indexed_gzip.test_readline_sizelimit(True) def test_readlines(): ctest_indexed_gzip.test_readlines(False) def test_readlines_drop_handles(): ctest_indexed_gzip.test_readlines(True) def test_readlines_sizelimit(): ctest_indexed_gzip.test_readlines_sizelimit(False) def test_readlines_sizelimit_drop_handles(): ctest_indexed_gzip.test_readlines_sizelimit(True) def test_iter(): ctest_indexed_gzip.test_iter(False) def test_iter_drop_handles(): ctest_indexed_gzip.test_iter(True) @pytest.mark.slow_test def test_get_index_seek_points(): ctest_indexed_gzip.test_get_index_seek_points() def test_import_export_index(): ctest_indexed_gzip.test_import_export_index() def test_wrapper_class(): ctest_indexed_gzip.test_wrapper_class() def test_size_multiple_of_readbuf(seed): ctest_indexed_gzip.test_size_multiple_of_readbuf() @pytest.mark.slow_test def test_picklable(): ctest_indexed_gzip.test_picklable() def test_copyable(): ctest_indexed_gzip.test_copyable() @pytest.mark.slow_test def test_multiproc_serialise(): ctest_indexed_gzip.test_multiproc_serialise() @pytest.mark.slow_test def test_32bit_overflow(niters, seed): ctest_indexed_gzip.test_32bit_overflow(niters, seed) indexed_gzip-1.6.4/indexed_gzip/tests/ctest_indexed_gzip.pyx0000644000175000017500000007733214133320576023771 0ustar nileshnilesh# # Tests for the indexed_gzip module. # # Author: Paul McCarthy # from __future__ import print_function import os import os.path as op import itertools as it import functools as ft import subprocess as sp import multiprocessing as mp import copy as cp import sys import time import gzip import random import shutil import pickle import hashlib import textwrap import tempfile import contextlib import numpy as np from io import BytesIO import pytest import indexed_gzip as igzip from . import gen_test_data from . import check_data_valid from . import tempdir from . 
import compress from libc.stdio cimport (SEEK_SET, SEEK_CUR, SEEK_END) def error_fn(*args, **kwargs): raise Exception("Error") def read_element(gzf, element, seek=True): if seek: gzf.seek(int(element) * 8) bytes = gzf.read(8) val = np.ndarray(1, np.uint64, buffer=bytes) return val[0] def write_text_to_gzip_file(fname, lines): with gzip.open(fname, mode='wb') as f: for line in lines: f.write('{}\n'.format(line).encode()) def test_open_close(testfile, nelems, seed, drop): f = igzip._IndexedGzipFile(filename=testfile, drop_handles=drop) assert not f.closed element = np.random.randint(0, nelems, 1) readval = read_element(f, element) assert readval == element f.close() assert f.closed with pytest.raises(IOError): f.close() def test_open_function(testfile, nelems): f1 = None f2 = None try: f1 = igzip.IndexedGzipFile(testfile) f2 = igzip.open( testfile) element = np.random.randint(0, nelems, 1) readval1 = read_element(f1, element) readval2 = read_element(f2, element) assert readval1 == element assert readval2 == element finally: if f1 is not None: f1.close() if f2 is not None: f2.close() def test_open_close_ctxmanager(testfile, nelems, seed, drop): with igzip._IndexedGzipFile(filename=testfile, drop_handles=drop) as f: element = np.random.randint(0, nelems, 1) readval = read_element(f, element) assert readval == element assert f.closed def test_open_mode(): modes = [('r', True), ('rb', True), (None, True), ('rt', False), ('w', False), ('wt', False)] # open from file with tempdir(): with gzip.open('f.gz', 'wb') as f: f.write(b'12345') for mode, expect in modes: if expect: gzf = igzip.IndexedGzipFile('f.gz', mode=mode) assert gzf.read() == b'12345' else: with pytest.raises(ValueError): igzip.IndexedGzipFile('f.gz', mode=mode) # open from fileobj class BytesIOWithMode(BytesIO): pass # accept file-like without mode attribute modes.append(('del', True)) for mode, expect in modes: if mode == 'del' and hasattr(BytesIOWithMode, 'mode'): delattr(BytesIOWithMode, 'mode') else: 
BytesIOWithMode.mode = mode fileobj = BytesIOWithMode() with gzip.GzipFile(fileobj=fileobj, mode='wb') as f: f.write(b'12345') print(mode, expect) if expect: assert igzip.IndexedGzipFile(fileobj=fileobj).read() == b'12345' else: with pytest.raises(ValueError): igzip.IndexedGzipFile(fileobj=fileobj).read() def test_atts(testfile, drop): modes = [None, 'rb', 'r'] for m in modes: with igzip._IndexedGzipFile(filename=testfile, mode=m, drop_handles=drop) as f: assert not f.closed assert f.readable() assert f.seekable() assert not f.writable() assert f.mode == 'rb' assert f.tell() == 0 if not drop: assert f.fileobj() is not None assert f.fileno() == f.fileobj().fileno() else: with pytest.raises(igzip.NoHandleError): f.fileobj() with pytest.raises(igzip.NoHandleError): f.fileno() def test_init_failure_cases(concat, drop): with tempdir() as td: testfile = op.join(td, 'test.gz') gen_test_data(testfile, 65536, concat) # No writing with pytest.raises(ValueError): gf = igzip._IndexedGzipFile(filename=testfile, mode='w', drop_handles=drop) with pytest.raises(ValueError): gf = igzip._IndexedGzipFile(filename=testfile, mode='wb', drop_handles=drop) # No writing f = open(testfile, mode='wb') with pytest.raises(ValueError): gf = igzip._IndexedGzipFile(fileobj=f, drop_handles=drop) f.close() # No writing f = open(testfile, mode='w') with pytest.raises(ValueError): gf = igzip._IndexedGzipFile(fileobj=f, drop_handles=drop) f.close() # Need a filename or fid with pytest.raises(ValueError): f = igzip._IndexedGzipFile(drop_handles=drop) # can only specify one of filename/fid with pytest.raises(ValueError): with open(testfile, mode='rb'): f = igzip._IndexedGzipFile(filename=testfile, fileobj=f, drop_handles=drop) def test_init_success_cases(concat, drop): with tempdir() as td: testfile = op.join(td, 'test.gz') gen_test_data(testfile, 65536, concat) gf1 = igzip._IndexedGzipFile(filename=testfile, drop_handles=drop) gf2 = igzip._IndexedGzipFile(filename=testfile, mode='r', 
drop_handles=drop) gf3 = igzip._IndexedGzipFile(filename=testfile, mode='rb', drop_handles=drop) gf1.close() gf2.close() gf3.close() del gf1 del gf2 del gf3 def test_create_from_open_handle(testfile, nelems, seed, drop, file_like_object): f = open(testfile, 'rb') if file_like_object: f = BytesIO(f.read()) gzf = igzip._IndexedGzipFile(fileobj=f, drop_handles=drop) assert gzf.fileobj() is f assert not gzf.drop_handles element = np.random.randint(0, nelems, 1) readval = read_element(gzf, element) gzf.close() try: assert readval == element assert gzf.closed assert not f.closed finally: f.close() del gzf del f def test_accept_filename_or_fileobj(testfile, nelems): f = None gzf1 = None gzf2 = None gzf3 = None try: f = open(testfile, 'rb') gzf1 = igzip._IndexedGzipFile(testfile) gzf2 = igzip._IndexedGzipFile(f) gzf3 = igzip._IndexedGzipFile(fileobj=BytesIO(open(testfile, 'rb').read())) element = np.random.randint(0, nelems, 1) readval1 = read_element(gzf1, element) readval2 = read_element(gzf2, element) readval3 = read_element(gzf3, element) assert readval1 == element assert readval2 == element assert readval3 == element finally: if gzf3 is not None: gzf3.close() if gzf2 is not None: gzf2.close() if gzf1 is not None: gzf1.close() if f is not None: f .close() del f del gzf1 del gzf2 del gzf3 def test_prioritize_fd_over_f(testfile, nelems): """When a fileobj with an associated fileno is passed to IndexedGzipFile, the fileobj's file descriptor (fd) should be utilized by zran.c instead of the file-like object specified by fileobj (f). """ if sys.version_info[0] < 3: # We can't set the .read attribute in Python 2 # because it's read-only, so skip it. return f = None gzf = None try: f = open(testfile, 'rb') f.read = error_fn # If the file-like object were directly used by zran.c, reading would raise an error. 
gzf = igzip._IndexedGzipFile(fileobj=f) element = np.random.randint(0, nelems, 1) readval = read_element(gzf, element) assert readval == element finally: if gzf is not None: gzf.close() if f is not None: f .close() del f del gzf def test_handles_not_dropped(testfile, nelems, seed): # When drop_handles is False with igzip._IndexedGzipFile(filename=testfile, drop_handles=False) as f: fid = f.fileobj() assert fid is not None # Check that the file object # doesn't change across reads for i in range(5): element = np.random.randint(0, nelems, 1) readval = read_element(f, element) assert readval == element assert f.fileobj() is fid # Also when given an open stream with open(testfile, 'rb') as f: with igzip._IndexedGzipFile(fileobj=f) as gzf: assert gzf.fileobj() is f for i in range(5): element = np.random.randint(0, nelems, 1) readval = read_element(gzf, element) assert readval == element assert gzf.fileobj() is f def test_manual_build(): with tempdir() as td: nelems = 65536 fname = op.join(td, 'test.gz') gen_test_data(fname, nelems, False) with igzip._IndexedGzipFile(fname, auto_build=False) as f: # Seeking to 0 should work, but # anywhere else should fail f.seek(0) for off in [1, 2, 20, 200]: with pytest.raises(igzip.NotCoveredError): f.seek(off) # Reading from beginning should work readval = read_element(f, 0, seek=False) assert readval == 0 # but subsequent reads should fail # (n.b. 
this might change in the future) with pytest.raises(igzip.NotCoveredError): readval = read_element(f, 1, seek=False) # Seek should still fail even after read with pytest.raises(igzip.NotCoveredError): f.seek(8) # But reading from beginning should still work f.seek(0) readval = read_element(f, 0, seek=False) assert readval == 0 # But after building the index, # seeking and reading should work f.build_full_index() for i in range(5): element = np.random.randint(0, nelems, 1) readval = read_element(f, element) assert readval == element def test_read_all(testfile, nelems, use_mmap, drop): if use_mmap: pytest.skip('skipping test_read_all test as ' 'it will require too much memory') with igzip._IndexedGzipFile(filename=testfile, drop_handles=drop) as f: data = f.read(nelems * 8) data = np.ndarray(shape=nelems, dtype=np.uint64, buffer=data) # Check that every value is valid assert check_data_valid(data, 0) def test_simple_read_with_null_padding(): fileobj = BytesIO() with gzip.GzipFile(fileobj=fileobj, mode='wb') as f: f.write(b"hello world") fileobj.write(b"\0" * 100) with igzip._IndexedGzipFile(fileobj=fileobj) as f: assert f.read() == b"hello world" f.seek(3) assert f.read() == b"lo world" f.seek(20) assert f.read() == b"" def test_read_with_null_padding(testfile, nelems, use_mmap): if use_mmap: pytest.skip('skipping test_read_with_null_padding test ' 'as it will require too much memory') fileobj = BytesIO(open(testfile, "rb").read() + b"\0" * 100) with igzip._IndexedGzipFile(fileobj=fileobj) as f: data = f.read(nelems * 8) # Read a bit further so we reach the zero-padded area. # This line should not throw an exception. 
f.read(1) data = np.ndarray(shape=nelems, dtype=np.uint64, buffer=data) # Check that every value is valid assert check_data_valid(data, 0) def test_read_beyond_end(concat, drop): with tempdir() as tdir: nelems = 65536 testfile = op.join(tdir, 'test.gz') gen_test_data(testfile, nelems, concat) with igzip._IndexedGzipFile(filename=testfile, readall_buf_size=1024, drop_handles=drop) as f: # Try with a specific number of bytes data1 = f.read(nelems * 8 + 10) # And also with unspecified numbytes f.seek(0) data2 = f.read() data1 = np.ndarray(shape=nelems, dtype=np.uint64, buffer=data1) data2 = np.ndarray(shape=nelems, dtype=np.uint64, buffer=data2) assert check_data_valid(data1, 0) assert check_data_valid(data2, 0) def test_seek(concat): with tempdir() as tdir: nelems = 262144 # == 2MB testfile = op.join(tdir, 'test.gz') gen_test_data(testfile, nelems, concat) results = [] with igzip._IndexedGzipFile(testfile, spacing=131072) as f: results.append((f.read(8), 0)) f.seek(24, SEEK_SET) results.append((f.read(8), 3)) f.seek(-16, SEEK_CUR) results.append((f.read(8), 2)) f.seek(16, SEEK_CUR) results.append((f.read(8), 5)) # SEEK_END only works when index is built with pytest.raises(ValueError): f.seek(-100, SEEK_END) f.build_full_index() f.seek(-800, SEEK_END) results.append((f.read(8), 262044)) f.seek(-3200, SEEK_END) results.append((f.read(8), 261744)) for data, expected in results: val = np.frombuffer(data, dtype=np.uint64) assert val == expected def test_seek_and_read(testfile, nelems, niters, seed, drop): with igzip._IndexedGzipFile(filename=testfile, drop_handles=drop) as f: # Pick some random elements and make # sure their values are all right seekelems = np.random.randint(0, nelems, niters) for i, testval in enumerate(seekelems): readval = read_element(f, testval) ft = f.tell() assert ft == (testval + 1) * 8 assert readval == testval def test_seek_and_tell(testfile, nelems, niters, seed, drop): filesize = nelems * 8 with igzip._IndexedGzipFile(filename=testfile, 
drop_handles=drop) as f: # Pick some random seek positions # and make sure that seek and tell # return their location correctly seeklocs = np.random.randint(0, filesize, niters) for seekloc in seeklocs: st = f.seek(seekloc) ft = f.tell() assert ft == seekloc assert st == seekloc # Also test that seeking beyond # EOF is clamped to EOF eofseeks = [filesize, filesize + 1, filesize + 2, filesize + 3, filesize + 4, filesize + 1000, filesize * 1000] for es in eofseeks: assert f.seek(es) == filesize assert f.tell() == filesize def test_pread(): with tempdir() as td: nelems = 1024 testfile = op.join(td, 'test.gz') gen_test_data(testfile, nelems, False) with igzip.IndexedGzipFile(testfile) as f: for i in range(20): off = np.random.randint(0, nelems, 1)[0] data = f.pread(8, off * 8) val = np.frombuffer(data, dtype=np.uint64) assert val[0] == off def test_readinto(drop): lines = textwrap.dedent(""" line 1 line 2 this is line 3 line the fourth here is the fifth line """).strip().split('\n') def line_offset(idx): return sum([len(l) for l in lines[:idx]]) + idx with tempdir() as td: testfile = op.join(td, 'test.gz') write_text_to_gzip_file(testfile, lines) with igzip._IndexedGzipFile(filename=testfile, drop_handles=drop) as f: # read first line into a byte array buf = bytearray(len(lines[0])) f.seek(0) assert f.readinto(buf) == len(lines[0]) assert buf.decode() == lines[0] # read first line into memoryvew buf = memoryview(bytearray(len(lines[0]))) f.seek(0) assert f.readinto(buf) == len(lines[0]) assert buf.tobytes().decode() == lines[0] # read an arbitrary line offset = line_offset(2) buf = bytearray(len(lines[2])) f.seek(offset) assert f.readinto(buf) == len(lines[2]) assert buf.decode() == lines[2] # read the end line, sans-newline offset = line_offset(len(lines) - 1) buf = bytearray(len(lines[-1])) f.seek(offset) assert f.readinto(buf) == len(lines[-1]) assert buf.decode() == lines[-1] # read the end line, with newline buf = bytearray(len(lines[-1]) + 1) f.seek(offset) 
assert f.readinto(buf) == len(lines[-1]) + 1 assert buf.decode() == lines[-1] + '\n' # read the end line with a bigger buffer buf = bytearray(len(lines[-1]) + 10) f.seek(offset) assert f.readinto(buf) == len(lines[-1]) + 1 assert buf.decode() == lines[-1] + '\n' + (b'\0' * 9).decode() # start at EOF, and try to read something filelen = sum([len(l) for l in lines]) + len(lines) f.seek(filelen) buf = bytearray([99 for i in range(len(buf))]) assert f.readinto(buf) == 0 assert all([b == chr(99) for b in buf.decode()]) def test_readline(drop): lines = textwrap.dedent(""" this is some text split across several lines how creative """).strip().split('\n') with tempdir() as td: fname = op.join(td, 'test.gz') write_text_to_gzip_file(fname, lines) with igzip._IndexedGzipFile(fname, drop_handles=drop) as f: seekpos = 0 for line in lines: assert f.readline() == (line + '\n').encode() seekpos += len(line) + 1 assert f.tell() == seekpos # Should return empty string after EOF assert f.readline() == b'' f.seek(0) assert f.readline(0) == b'' def test_readline_sizelimit(drop): lines = ['line one', 'line two'] with tempdir() as td: fname = op.join(td, 'test.gz') write_text_to_gzip_file(fname, lines) with igzip._IndexedGzipFile(fname, drop_handles=drop) as f: # limit to one character before the end of the first line l = f.readline(len(lines[0]) - 1) assert l == (lines[0][:-1]).encode() # limit to the last character of the first line f.seek(0) l = f.readline(len(lines[0]) - 1) assert l == (lines[0][:-1]).encode() # limit to the newline at the end of the first line f.seek(0) l = f.readline(len(lines[0]) + 1) assert l == (lines[0] + '\n').encode() # limit to the first character after the first line f.seek(0) l = f.readline(len(lines[0]) + 2) assert l == (lines[0] + '\n').encode() def test_readlines(drop): lines = textwrap.dedent(""" this is some more text split across several lines super imaginative test data """).strip().split('\n') with tempdir() as td: fname = op.join(td, 'test.gz') 
write_text_to_gzip_file(fname, lines) with igzip._IndexedGzipFile(fname, drop_handles=drop) as f: gotlines = f.readlines() assert len(lines) == len(gotlines) for expl, gotl in zip(lines, gotlines): assert (expl + '\n').encode() == gotl assert f.read() == b'' def test_readlines_sizelimit(drop): lines = ['line one', 'line two'] data = '\n'.join(lines) + '\n' with tempdir() as td: fname = op.join(td, 'test.gz') write_text_to_gzip_file(fname, lines) limits = range(len(data) + 2) with igzip._IndexedGzipFile(fname, drop_handles=drop) as f: for lim in limits: f.seek(0) gotlines = f.readlines(lim) # Expect the first line if lim < len(lines[0]) + 1: assert len(gotlines) == 1 assert gotlines[0] == (lines[0] + '\n').encode() # Expect both lines else: assert len(gotlines) == 2 assert gotlines[0] == (lines[0] + '\n').encode() assert gotlines[1] == (lines[1] + '\n').encode() def test_iter(drop): lines = textwrap.dedent(""" this is even more text that is split across several lines the creativity involved in generating this test data is unparalleled """).strip().split('\n') with tempdir() as td: fname = op.join(td, 'test.gz') write_text_to_gzip_file(fname, lines) with igzip._IndexedGzipFile(fname, drop_handles=drop) as f: for i, gotline in enumerate(f): assert (lines[i] + '\n').encode() == gotline with pytest.raises(StopIteration): next(f) def test_get_index_seek_points(): with tempdir() as td: fname = op.join(td, 'test.gz') spacing = 1048576 # make a test file data = np.arange(spacing, dtype=np.uint64) with gzip.open(fname, 'wb') as f: f.write(data.tostring()) # check points before and after index creation with igzip._IndexedGzipFile(fname, spacing=spacing) as f: assert not list(f.seek_points()) f.build_full_index() expected_number_of_seek_points = 1 + int(data.nbytes / spacing) seek_points = list(f.seek_points()) assert len(seek_points) == expected_number_of_seek_points # check monotonic growth uncmp_offsets = [point[0] for point in seek_points] assert sorted(uncmp_offsets) == 
uncmp_offsets def test_import_export_index(): with tempdir() as td: fname = op.join(td, 'test.gz') idxfname = op.join(td, 'test.gzidx') # make a test file data = np.arange(65536, dtype=np.uint64) with gzip.open(fname, 'wb') as f: f.write(data.tostring()) # generate an index file with igzip._IndexedGzipFile(fname) as f: f.build_full_index() f.export_index(idxfname) # Check that index file works via __init__ with igzip._IndexedGzipFile(fname, index_file=idxfname) as f: f.seek(65535 * 8) val = np.frombuffer(f.read(8), dtype=np.uint64) assert val[0] == 65535 # Check that index file works via import_index with igzip._IndexedGzipFile(fname) as f: f.import_index(idxfname) f.seek(65535 * 8) val = np.frombuffer(f.read(8), dtype=np.uint64) assert val[0] == 65535 # generate an index file from open file handle with igzip._IndexedGzipFile(fname) as f: f.build_full_index() # Should raise if wrong permissions with pytest.raises(ValueError): with open(idxfname, 'rb') as idxf: f.export_index(fileobj=idxf) with open(idxfname, 'wb') as idxf: f.export_index(fileobj=idxf) # Check that we can read it back with igzip._IndexedGzipFile(fname) as f: # Should raise if wrong permissions # (append, so existing contents are # not overwritten) with pytest.raises(ValueError): with open(idxfname, 'ab') as idxf: f.import_index(fileobj=idxf) with open(idxfname, 'rb') as idxf: f.import_index(fileobj=idxf) f.seek(65535 * 8) val = np.frombuffer(f.read(8), dtype=np.uint64) assert val[0] == 65535 # Test exporting to / importing from a file-like object idxf = BytesIO() with igzip._IndexedGzipFile(fname) as f: f.export_index(fileobj=idxf) idxf.seek(0) with igzip._IndexedGzipFile(fname) as f: f.import_index(fileobj=idxf) f.seek(65535 * 8) val = np.frombuffer(f.read(8), dtype=np.uint64) assert val[0] == 65535 def test_wrapper_class(): with tempdir() as td: fname = op.join(td, 'test.gz') idxfname = op.join(td, 'test.gzidx') data = np.arange(65536, dtype=np.uint64) with gzip.open(fname, 'wb') as f: 
f.write(data.tostring()) with igzip.IndexedGzipFile(fname, drop_handles=False) as f: assert f.fileno() == f.fileobj().fileno() assert not f.drop_handles f.build_full_index() f.export_index(idxfname) f.import_index(idxfname) def gcd(num): if num <= 3: return 1 candidates = list(range(int(np.ceil(np.sqrt(num))), 2, -2)) candidates.extend((2, 1)) for divisor in candidates: if num % divisor == 0: return divisor return 1 def test_size_multiple_of_readbuf(): fname = 'test.gz' with tempdir(): while True: data = np.random.randint(1, 1000, 100000, dtype=np.uint32) with gzip.open(fname, 'wb') as f: f.write(data.tobytes()) del f f = None # we need a file size that is divisible # by the minimum readbuf size fsize = op.getsize(fname) if gcd(fsize) >= 128: break # readbuf size == file size bufsz = fsize with igzip.IndexedGzipFile(fname, readbuf_size=bufsz) as f: assert f.seek(fsize) == fsize del f f = None with igzip.IndexedGzipFile(fname, readbuf_size=bufsz) as f: read = np.ndarray(shape=100000, dtype=np.uint32, buffer=f.read()) assert np.all(read == data) del f f = None # Use a buf size that is a divisor of the file size bufsz = gcd(fsize) with igzip.IndexedGzipFile(fname, readbuf_size=bufsz) as f: assert f.seek(fsize) == fsize del f f = None with igzip.IndexedGzipFile(fname, readbuf_size=bufsz) as f: read = np.ndarray(shape=100000, dtype=np.uint32, buffer=f.read()) assert np.all(read == data) del f f = None def test_picklable(): # default behaviour is for drop_handles=True, # which means that an IndexedGzipFile object # should be picklable/serialisable fname = 'test.gz' with tempdir(): data = np.random.randint(1, 1000, (10000, 10000), dtype=np.uint32) with open(fname+'.bin', 'wb') as f: f.write(data.tobytes()) compress(fname+'.bin', fname) del f gzf = igzip.IndexedGzipFile(fname) first50MB = gzf.read(1048576 * 50) gzf.seek(gzf.tell()) pickled = pickle.dumps(gzf) second50MB = gzf.read(1048576 * 50) gzf.seek(gzf.tell()) gzf.close() del gzf gzf = pickle.loads(pickled) assert 
gzf.tell() == 1048576 * 50 assert gzf.read(1048576 * 50) == second50MB gzf.seek(0) assert gzf.read(1048576 * 50) == first50MB gzf.close() del gzf # if drop_handles=False, no pickle with tempdir(): data = np.random.randint(1, 1000, 50000, dtype=np.uint32) with gzip.open(fname, 'wb') as f: f.write(data.tobytes()) del f gzf = igzip.IndexedGzipFile(fname, drop_handles=False) with pytest.raises(pickle.PicklingError): pickled = pickle.dumps(gzf) gzf.close() del gzf def test_copyable(): fname = 'test.gz' with tempdir(): data = np.random.randint(1, 1000, (10000, 10000), dtype=np.uint32) with open(fname+'.bin', 'wb') as f: f.write(data.tobytes()) compress(fname+'.bin', fname) del f gzf = igzip.IndexedGzipFile(fname) gzf_copy = cp.deepcopy(gzf) first50MB = gzf.read(1048576 * 50) gzf.seek(gzf.tell()) gzf_copy2 = cp.deepcopy(gzf) second50MB = gzf.read(1048576 * 50) gzf.seek(gzf.tell()) gzf.close() del gzf assert gzf_copy.tell() == 0 assert gzf_copy2.tell() == 1048576 * 50 assert gzf_copy.read(1048576 * 50) == first50MB assert gzf_copy2.read(1048576 * 50) == second50MB gzf_copy2.seek(0) assert gzf_copy2.read(1048576 * 50) == first50MB gzf_copy.close() gzf_copy2.close() del gzf_copy del gzf_copy2 with tempdir(): data = np.random.randint(1, 1000, 50000, dtype=np.uint32) with gzip.open(fname, 'wb') as f: f.write(data.tobytes()) del f # if drop_handles=False, no copy gzf = igzip.IndexedGzipFile(fname, drop_handles=False) with pytest.raises(pickle.PicklingError): gzf_copy = cp.deepcopy(gzf) gzf.close() del gzf # If passed an open filehandle, no copy with open(fname, 'rb') as fobj: gzf = igzip.IndexedGzipFile(fileobj=fobj) with pytest.raises(pickle.PicklingError): gzf_copy = cp.deepcopy(gzf) gzf.close() del gzf del fobj def _mpfunc(gzf, size, offset): gzf.seek(offset) bytes = gzf.read(size) val = np.ndarray(int(size / 4), np.uint32, buffer=bytes) gzf.close() del gzf return val.sum() def test_multiproc_serialise(): fname = 'test.gz' with tempdir(): data = np.arange(10000000, 
dtype=np.uint32) with gzip.open(fname, 'wb') as f: f.write(data.tobytes()) del f gzf = igzip.IndexedGzipFile(fname) size = len(data) / 16 offsets = np.arange(0, len(data), size) func = ft.partial(_mpfunc, gzf, size * 4) pool = mp.Pool(8) results = pool.map(func, offsets * 4) pool.close() pool.join() gzf.close() del gzf del pool expected = [data[off:off+size].sum() for off in offsets] assert results == expected def test_32bit_overflow(niters, seed): with tempdir(): block = 2 ** 24 # 128MB nelems = block * 48 # 6GB data = np.ones(block, dtype=np.uint64).tobytes() with gzip.open('test.gz', 'wb') as f: for i in range(48): print('Generated to {}...'.format(block * i)) f.write(data) with igzip._IndexedGzipFile(filename='test.gz') as f: seekelems = np.random.randint(0, nelems, niters) for i, testval in enumerate(seekelems): readval = read_element(f, testval) ft = f.tell() assert ft == int(testval + 1) * 8 assert readval == 1 indexed_gzip-1.6.4/indexed_gzip/tests/__main__.py0000644000175000017500000000127014133320576021432 0ustar nileshnilesh#!/usr/bin/env python # # Author: Paul McCarthy # """ Run indexed_gzip unit tests. Requires pytest to be installed. Works around a problem with pytest not discovring/running conftest.py files when running tests that have been installed into the environment via the --pyargs option. 
https://github.com/pytest-dev/pytest/issues/1596 https://stackoverflow.com/questions/41270604/using-command-line-parameters-with-pytest-pyargs/43747114#43747114 """ import os.path as op import sys def main(): import pytest testdir = op.abspath(op.dirname(__file__)) sys.exit(pytest.main([testdir] + sys.argv[1:])) if __name__ == '__main__': main() indexed_gzip-1.6.4/indexed_gzip/tests/benchmark.py0000644000175000017500000001046514133320576021652 0ustar nileshnilesh#!/usr/bin/env python # # benchmark.py - benchmark indexed_gzip # # Author: Paul McCarthy # from __future__ import print_function import os import os.path as op import sys import gzip import time import shutil import hashlib import tempfile import argparse import contextlib import numpy as np import indexed_gzip as igzip @contextlib.contextmanager def tempdir(): testdir = tempfile.mkdtemp() prevdir = os.getcwd() try: os.chdir(testdir) yield testdir finally: os.chdir(prevdir) shutil.rmtree(testdir) def size(filename): with open(filename, 'rb') as f: f.seek(-1, 2) return f.tell() def gen_file(fname, nbytes): nelems = int(nbytes / 4) data = np.random.randint(0, 2 ** 32, nelems, dtype=np.uint32) # zero out 10% so there is something to compress zeros = np.random.randint(0, nelems, int(nelems / 10.0)) data[zeros] = 0 data = data.tostring() # write 1GB max at a time - the gzip # module doesn't like writing >= 4GB # in one go. 
chunksize = 1073741824 while len(data) > 0: chunk = data[:chunksize] data = data[chunksize:] with gzip.open(fname, 'ab') as outf: outf.write(chunk) def benchmark_file(fobj, seeks, lens, update): start = time.time() hashobj = hashlib.md5() for i, (s, l) in enumerate(zip(seeks, lens)): fobj.seek(s) data = fobj.read(l) hashobj.update(data) update(i) update(len(seeks)) end = time.time() elapsed = end - start return str(hashobj.hexdigest()), elapsed def benchmark(filename, nseeks): nbytes = size(filename) seeks = np.linspace(0, nbytes, nseeks, dtype=np.int) lens = np.random.randint(1048576, 16777216, nseeks) np.random.shuffle(seeks) names = [ 'GzipFile', 'IndexedGzipFile(drop_handles=True)', 'IndexedGzipFile(drop_handles=False)' ] namelen = max([len(n) for n in names]) namefmt = '{{:<{}s}}'.format(namelen) fobjs = [ lambda : gzip.GzipFile( filename, 'rb'), lambda : igzip.IndexedGzipFile(filename, drop_handles=True), lambda : igzip.IndexedGzipFile(filename, drop_handles=False), ] for name, fobj in zip(names, fobjs): def update(i): print('\r{} {:6.2f}%'.format( namefmt.format(name), 100.0 * i / len(seeks)), end='') sys.stdout.flush() with fobj() as f: md5, time = benchmark_file(f, seeks, lens, update) print(' {} {:0.0f}s'.format(md5, time)) if __name__ == '__main__': parser = argparse.ArgumentParser('indexe_gzip benchmark') parser.add_argument('-b', '--bytes', type=int, help='Uncompressed size of test file in bytes. 
' 'Ignored if a --file is specified', default=16777216) parser.add_argument('-s', '--seeks', type=int, help='Number of random seeks', default=1000) parser.add_argument('-f', '--file', type=str, help='Test file (default: generate one)') parser.add_argument('-r', '--randomseed', type=int, help='Seed for random number generator') namespace = parser.parse_args() if namespace.randomseed is not None: np.random.seed(namespace.seed) if namespace.file is not None: namespace.file = op.abspath(namespace.file) with tempdir(): if namespace.file is None: print('Generating test data ({:0.2f}MB)...'.format( namespace.bytes / 1048576.), end='') sys.stdout.flush() namespace.file = 'test.gz' gen_file(namespace.file, namespace.bytes) print(' {:0.2f}MB compressed'.format( size(namespace.file) / 1048576.0)) if namespace.randomseed is not None: np.random.seed(namespace.seed) benchmark(namespace.file, namespace.seeks) indexed_gzip-1.6.4/indexed_gzip/tests/test_zran.py0000644000175000017500000001023314133320576021722 0ustar nileshnilesh#!/usr/bin/env python # # test_zran.py - Python wrapper around ctest_zran.pyx. # # Author: Paul McCarthy # from __future__ import print_function import sys if not sys.platform.startswith("win"): # Run these tests only on POSIX systems import pytest import numpy as np from . 
    # Thin pytest wrappers around the Cython test implementations in
    # ctest_zran - each wrapper simply forwards its pytest fixtures.
    # Mark every test in this module so it can be selected with
    # "pytest -m zran_test".
    pytestmark = pytest.mark.zran_test

    # Tests of the internal file-like wrapper functions
    # (fread/ftell/fseek/...) - these take no fixtures.
    def test_fread():
        ctest_zran.test_fread()

    def test_ftell():
        ctest_zran.test_ftell()

    def test_fseek():
        ctest_zran.test_fseek()

    def test_feof():
        ctest_zran.test_feof()

    def test_ferror():
        ctest_zran.test_ferror()

    def test_fflush():
        ctest_zran.test_fflush()

    def test_fwrite():
        ctest_zran.test_fwrite()

    def test_getc():
        ctest_zran.test_getc()

    # zran index tests - each is run both with and
    # without file descriptors (no_fds).
    def test_init(testfile):
        for no_fds in (True, False):
            ctest_zran.test_init(testfile, no_fds)

    def test_init_file_modes(testfile):
        for no_fds in (True, False):
            ctest_zran.test_init_file_modes(testfile, no_fds)

    def test_no_auto_build(testfile, nelems):
        for no_fds in (True, False):
            ctest_zran.test_no_auto_build(testfile, no_fds, nelems)

    def test_seek_to_end(testfile, nelems):
        for no_fds in (True, False):
            ctest_zran.test_seek_to_end(testfile, no_fds, nelems)

    def test_seek_cur(testfile, nelems):
        for no_fds in (True, False):
            ctest_zran.test_seek_cur(testfile, no_fds, nelems)

    def test_seek_end(testfile, nelems):
        for no_fds in (True, False):
            ctest_zran.test_seek_end(testfile, no_fds, nelems)

    def test_seek_beyond_end(testfile, nelems):
        for no_fds in (True, False):
            ctest_zran.test_seek_beyond_end(testfile, no_fds, nelems)

    def test_sequential_seek_to_end(testfile, nelems, niters):
        for no_fds in (True, False):
            ctest_zran.test_sequential_seek_to_end(testfile, no_fds,
                                                   nelems, niters)

    def test_random_seek(testfile, nelems, niters, seed):
        for no_fds in (True, False):
            ctest_zran.test_random_seek(testfile, no_fds, nelems,
                                        niters, seed)

    def test_read_all(testfile, nelems, use_mmap):
        for no_fds in (True, False):
            ctest_zran.test_read_all(testfile, no_fds, nelems, use_mmap)

    @pytest.mark.slow_test
    def test_seek_then_read_block(testfile, nelems, niters, seed, use_mmap):
        for no_fds in (True, False):
            ctest_zran.test_seek_then_read_block(
                testfile, no_fds, nelems, niters, seed, use_mmap
            )

    def test_random_seek_and_read(testfile, nelems, niters, seed):
        for no_fds in (True, False):
            ctest_zran.test_random_seek_and_read(testfile, no_fds,
                                                 nelems, niters, seed)

    @pytest.mark.slow_test
    def test_read_all_sequential(testfile, nelems):
        for no_fds in (True, False):
            ctest_zran.test_read_all_sequential(testfile, no_fds, nelems)

    @pytest.mark.slow_test
    def test_build_then_read(testfile, nelems, seed, use_mmap):
        for no_fds in (True, False):
            ctest_zran.test_build_then_read(testfile, no_fds, nelems,
                                            seed, use_mmap)

    @pytest.mark.slow_test
    def test_readbuf_spacing_sizes(testfile, nelems, niters, seed):
        for no_fds in (True, False):
            ctest_zran.test_readbuf_spacing_sizes(
                testfile, no_fds, nelems, niters, seed
            )

    def test_export_then_import(testfile):
        for no_fds in (True, False):
            ctest_zran.test_export_then_import(testfile, no_fds)

    def test_export_import_no_points():
        for no_fds in (True, False):
            ctest_zran.test_export_import_no_points(no_fds)

    def test_export_import_format_v0():
        ctest_zran.test_export_import_format_v0()

    def test_crc_validation(concat):
        ctest_zran.test_crc_validation(concat)

    def test_standard_usage_with_null_padding(concat):
        ctest_zran.test_standard_usage_with_null_padding(concat)

    def test_inflateInit_leak_on_error():
        ctest_zran.test_inflateInit_leak_on_error()

    def test_read_eof_memmove_rotate_bug(seed):
        ctest_zran.test_read_eof_memmove_rotate_bug(seed)
indexed_gzip-1.6.4/indexed_gzip/tests/__init__.py0000644000175000017500000001433514133320576021457 0ustar nileshnilesh#!/usr/bin/env python
#
# __init__.py -
#
# Author: Paul McCarthy
#
import io
import os
import sys
import time
import gzip
import shutil
import tempfile
import threading
import subprocess as sp
import multiprocessing as mp

import numpy as np


def tempdir():
    """Returns a context manager which creates and returns a temporary
    directory, and then deletes it on exit.
""" class ctx(object): def __enter__(self): self.prevdir = os.getcwd() self.tempdir = tempfile.mkdtemp() os.chdir(self.tempdir) return self.tempdir def __exit__(self, *a, **kwa): os.chdir(self.prevdir) shutil.rmtree(self.tempdir) return ctx() def poll(until): """Waits until ``until`` returns ``True``, printing out a message every minute. """ start = time.time() while not until(): time.sleep(0.5) cur = time.time() elapsed = int(round(cur - start)) if int(elapsed) % 60 == 0: print('Waiting ({:0.2f} minutes)'.format(elapsed / 60.0)) def compress(infile, outfile, buflen=-1): """Use gzip to compress the data in infile, saving it to outfile. If buflen == -1, we compress all of the data at once. Otherwise we compress chunks, creating a concatenated gzip stream. """ def compress_with_gzip_module(): print('Compressing data using python gzip module ...', outfile) with open(infile, 'rb') as inf: while True: data = inf.read(buflen) if len(data) == 0: break with open(outfile, 'ab') as outf: gzip.GzipFile(fileobj=outf).write(data) def compress_with_gzip_command(): with open(infile, 'rb') as inf, open(outfile, 'wb') as outf: # If buflen == -1, do a single call if buflen == -1: print('Compressing data with a single ' 'call to gzip ...', outfile) sp.Popen(['gzip', '-c'], stdin=inf, stdout=outf).wait() # Otherwise chunk the call else: print('Compressing data with multiple ' 'calls to gzip ...', outfile) nbytes = 0 chunk = inf.read(buflen) while len(chunk) != 0: proc = sp.Popen(['gzip', '-c'], stdin=sp.PIPE, stdout=outf) proc.communicate(chunk) nbytes += len(chunk) if (nbytes / buflen) % 10 == 0: print('Compressed to {}...'.format(nbytes)) chunk = inf.read(buflen) # Use python gzip module on windows, # can't assume gzip exists if sys.platform.startswith("win"): target = compress_with_gzip_module # If not windows, assume that gzip command # exists, and use it, because the python # gzip module is super-slow. 
else: target = compress_with_gzip_command cmpThread = threading.Thread(target=target) cmpThread.start() poll(lambda : not cmpThread.is_alive()) def compress_inmem(data, concat): """Compress the given data (assumed to be bytes) and return a bytearray containing the compressed data (including gzip header and footer). Also returns offsets for the end of each separate stream. """ f = io.BytesIO() if concat: chunksize = len(data) // 10 else: chunksize = len(data) offsets = [] compressed = 0 print('Generating compressed data {}, concat: {})'.format( len(data), concat)) while compressed < len(data): start = len(f.getvalue()) chunk = data[compressed:compressed + chunksize] with gzip.GzipFile(mode='ab', fileobj=f) as gzf: gzf.write(chunk) end = len(f.getvalue()) print(' Wrote stream to {} - {} [{} bytes] ...'.format( start, end, end - start)) offsets.append(end) compressed += chunksize print(' Final size: {}'.format(len(f.getvalue()))) f.seek(0) return bytearray(f.read()), offsets def gen_test_data(filename, nelems, concat): """Make some data to test with. """ start = time.time() # The file just contains a sequentially # increasing list of numbers print('Generating test data ({} elems, {} bytes -> {})'.format( nelems, nelems * 8, filename)) # Generate the data as a numpy memmap array. 
# Allocate at most 128MB at a time toWrite = nelems offset = 0 writeBlockSize = min(16777216, nelems) datafile = '{}_temp'.format(filename) open(datafile, 'wb+').close() data = np.memmap(datafile, dtype=np.uint64, shape=nelems) idx = 0 while toWrite > 0: if idx % 10 == 0: print('Generated to {}...'.format(offset)) thisWrite = min(writeBlockSize, toWrite) vals = np.arange(offset, offset + thisWrite, dtype=np.uint64) data[offset:offset + thisWrite] = vals toWrite -= thisWrite offset += thisWrite idx += 1 data.flush() if not concat: maxBufSize = -1 else: maxBufSize = 8 * min(16777216, nelems // 50) compress(datafile, filename, maxBufSize) end = time.time() del data os.remove(datafile) print('Done in {:0.2f} seconds'.format(end - start)) def _check_chunk(args): s, e, test_data = args valid = np.arange(s, e, dtype=np.uint64) return np.all(test_data == valid) def check_data_valid(data, startval, endval=None): if endval is None: endval = len(data) chunksize = 10000000 startval = int(startval) endval = int(endval) offsets = np.arange(0, len(data), chunksize) args = [] result = True for offset in offsets: s = startval + offset e = min(s + chunksize, endval) nelems = e - s test_chunk = data[offset:offset + nelems] args.append((s, e, test_chunk)) pool = mp.Pool() result = all(pool.map(_check_chunk, args)) pool.terminate() return result indexed_gzip-1.6.4/indexed_gzip/tests/test_nibabel_integration.py0000644000175000017500000000633014133320576024752 0ustar nileshnilesh#!/usr/bin/env python import os.path as op import functools as ft import shutil import pytest nib = pytest.importorskip("nibabel") import numpy as np import indexed_gzip as igzip from indexed_gzip.tests import tempdir pytestmark = pytest.mark.nibabel_test @ft.total_ordering class Version(object): def __init__(self, vstr): self.v = [int(v) for v in vstr.split('.')] def __eq__(self, other): return other.v == self.v def __lt__(self, other): for sv, ov in zip(self.v, other.v): if sv > ov: return False if sv < ov: 
return True return False nibver = Version(nib.__version__) if nibver >= Version('2.1.0'): from nibabel.filebasedimages import ImageFileError else: from nibabel.spatialimages import ImageFileError def create_random_image(shape, fname): data = np.random.random(shape).astype(np.float32) aff = np.eye(4) nib.Nifti1Image(data, aff, None).to_filename(fname) return data def load_image(fname): basename = op.basename(fname)[:-7] # nibabel pre-2.1 is not indexed_gzip-aware if nibver <= Version('2.1.0'): fobj = igzip.IndexedGzipFile(fname) fmap = nib.Nifti1Image.make_file_map() fmap[basename].fileobj = fobj image = nib.Nifti1Image.from_file_map(fmap) # nibabel 2.2.x, we have to set keep_file_open='auto' # to get it to use indexed_gzip elif Version('2.2.0') <= nibver < Version('2.3.0'): image = nib.load(fname, keep_file_open='auto') # nibabel >= 2.3.x uses indexed_gzip automatically else: image = nib.load(fname) return image def test_nibabel_integration(): with tempdir(): data = create_random_image((50, 50, 50, 50), 'image.nii.gz') image = load_image('image.nii.gz') idata = np.asanyarray(image.dataobj) assert np.all(np.isclose(data, idata)) assert not image.in_memory if nibver < Version('2.2.0'): assert isinstance(image.file_map['image'].fileobj, igzip.IndexedGzipFile) else: assert isinstance(image.dataobj._opener.fobj, igzip.IndexedGzipFile) # https://github.com/pauldmccarthy/indexed_gzip/issues/40 def test_readdata_twice(): with tempdir(): # the bug only occurs on relatively small images, # where the full index comprises only one or two # index points data = create_random_image((10, 10, 10, 10), 'image.nii.gz') image = load_image('image.nii.gz') d1 = np.asanyarray(image.dataobj) d2 = np.asanyarray(image.dataobj) assert np.all(np.isclose(data, d1)) assert np.all(np.isclose(data, d2)) # https://github.com/pauldmccarthy/indexed_gzip/pull/45 def test_bad_image_error(): if nibver < Version('2.3.0'): return with tempdir(): create_random_image((10, 10, 10, 10), 'image.nii.gz') 
shutil.move('image.nii.gz', 'image.nii') with pytest.raises(ImageFileError): nib.load('image.nii') create_random_image((10, 10, 10, 10), 'image.nii') shutil.move('image.nii', 'image.nii.gz') with pytest.raises(ImageFileError): nib.load('image.nii.gz') indexed_gzip-1.6.4/indexed_gzip/indexed_gzip.pyx0000644000175000017500000011316414133320576021417 0ustar nileshnilesh# cython: binding=True,embedsignature=True # # The IndexedGzipFile class. # """This module provides the :class:`IndexedGzipFile` class, a drop-in replacement for the built-in ``gzip.GzipFile`` class, for faster read-only random access to gzip files. """ from libc.stdio cimport (SEEK_SET, SEEK_CUR, SEEK_END, FILE, fopen, fdopen, fclose) from libc.stdint cimport (uint8_t, uint32_t, uint64_t, int64_t) from cpython.mem cimport (PyMem_Malloc, PyMem_Realloc, PyMem_Free) from cpython.buffer cimport (PyObject_GetBuffer, PyBuffer_Release, PyBUF_ANY_CONTIGUOUS, PyBUF_SIMPLE) from cpython.ref cimport PyObject cimport indexed_gzip.zran as zran import io import os import os.path as op import pickle import logging import warnings import threading import contextlib builtin_open = open """Reference to the built-in open function, which is otherwise masked by our open function below. When support for Python 2.7 is dropped, the ``builtins`` module can be used instead. """ log = logging.getLogger(__name__) def open(filename=None, fileobj=None, *args, **kwargs): """Create and return an ``IndexedGzipFile``. :arg filename: File name or open file handle. :arg fileobj: Open file handle. See the ``IndexedGzipFile`` class for details on the other arguments. """ return IndexedGzipFile(filename, fileobj, **kwargs) class IndexedGzipFile(io.BufferedReader): """The ``IndexedGzipFile`` class allows for fast random access of a gzip file by using the ``zran`` library to build and maintain an index of seek points into the file. ``IndexedGzipFile`` is an ``io.BufferedReader`` which wraps an :class:`_IndexedGzipFile` instance. 
    By accessing the ``_IndexedGzipFile`` instance through an
    ``io.BufferedReader``, read performance is improved through buffering,
    and access to the I/O methods is made thread-safe.

    A :meth:`pread` method is also implemented, as it is not implemented by
    the ``io.BufferedReader``.
    """


    def __init__(self, *args, **kwargs):
        """Create an ``IndexedGzipFile``. The file may be specified either
        with an open file handle (``fileobj``), or with a ``filename``. If
        the former, the file must have been opened in ``'rb'`` mode.

        .. note:: The ``auto_build`` behaviour only takes place on calls to
                  :meth:`seek`.

        :arg filename:         File name or open file handle.

        :arg fileobj:          Open file handle.

        :arg mode:             Opening mode. Must be either ``'r'`` or
                               ``'rb``.

        :arg auto_build:       If ``True`` (the default), the index is
                               automatically built on calls to :meth:`seek`.

        :arg skip_crc_check:   Defaults to ``False``. If ``True``, CRC/size
                               validation of the uncompressed data is not
                               performed.

        :arg spacing:          Number of bytes between index seek points.

        :arg window_size:      Number of bytes of uncompressed data stored
                               with each seek point.

        :arg readbuf_size:     Size of buffer in bytes for storing
                               compressed data read in from the file.

        :arg readall_buf_size: Size of buffer in bytes used by :meth:`read`
                               when reading until EOF.

        :arg drop_handles:     Has no effect if an open ``fid`` is
                               specified, rather than a ``filename``.  If
                               ``True`` (the default), a handle to the file
                               is opened and closed on every access.
                               Otherwise the file is opened at ``__cinit__``,
                               and kept open until this ``_IndexedGzipFile``
                               is destroyed.

        :arg index_file:       Pre-generated index for this ``gz`` file -
                               if provided, passed through to
                               :meth:`import_index`.

        :arg buffer_size:      Optional, must be passed as a keyword
                               argument. Passed through to
                               ``io.BufferedReader.__init__``. If not
                               provided, a default value of 1048576 is used.
        """

        buffer_size        = kwargs.pop('buffer_size', 1048576)
        fobj               = _IndexedGzipFile(*args, **kwargs)
        self.__file_lock   = threading.RLock()
        self.__igz_fobj    = fobj
        self.__buffer_size = buffer_size

        # Expose the relevant methods/attributes of the
        # underlying _IndexedGzipFile on this wrapper
        self.build_full_index = fobj.build_full_index
        self.import_index     = fobj.import_index
        self.export_index     = fobj.export_index
        self.fileobj          = fobj.fileobj
        self.drop_handles     = fobj.drop_handles
        self.seek_points      = fobj.seek_points

        super(IndexedGzipFile, self).__init__(fobj, buffer_size)


    def pread(self, nbytes, offset):
        """Seeks to ``offset``, then reads and returns up to ``nbytes``. The
        calls to seek and read are protected by a ``threading.RLock``.
        """
        with self.__file_lock:
            self.seek(offset)
            return self.read(nbytes)


    def __reduce__(self):
        """Used to pickle an ``IndexedGzipFile``.

        Returns a tuple containing:
          - a reference to the ``unpickle`` function
          - a tuple containing a "state" object, which can be passed
            to ``unpickle``.
        """

        fobj = self.__igz_fobj

        # Pickling is only possible when this object owns
        # (and can re-open) the underlying file by name
        if (not fobj.drop_handles) or (not fobj.own_file):
            raise pickle.PicklingError(
                'Cannot pickle IndexedGzipFile that has been created '
                'with an open file object, or that has been created '
                'with drop_handles=False')

        # export and serialise the index if
        # any index points have been created.
        # The index data is serialised as a
        # bytes object.
        if fobj.npoints == 0:
            index = None
        else:
            index = io.BytesIO()
            self.export_index(fileobj=index)
            index = index.getvalue()

        state = {
            'filename'         : fobj.filename,
            'auto_build'       : fobj.auto_build,
            'spacing'          : fobj.spacing,
            'window_size'      : fobj.window_size,
            'readbuf_size'     : fobj.readbuf_size,
            'readall_buf_size' : fobj.readall_buf_size,
            'buffer_size'      : self.__buffer_size,
            'tell'             : self.tell(),
            'index'            : index}

        return (unpickle, (state, ))


cdef class _IndexedGzipFile:
    """The ``_IndexedGzipFile`` class allows for fast random access of a gzip
    file by using the ``zran`` library to build and maintain an index of seek
    points into the file.

    ..
note:: The :meth:`seek` and :meth:`read` methods release the GIL while calling ``zran`` functions, but the ``_IndexedGzipFile`` is *not* thread-safe. Use the ``IndexedGzipFile`` class (i.e. without the leading underscore) if you need thread-safety. """ cdef zran.zran_index_t index """A reference to the ``zran_index`` struct. """ cdef readonly uint32_t spacing """Number of bytes between index seek points. """ cdef readonly uint32_t window_size """Number of bytes of uncompressed data stored with each seek point.""" cdef readonly uint32_t readbuf_size """Size of buffer in bytes for storing compressed data read in from the file. """ cdef readonly unsigned int readall_buf_size """Size of buffer in bytes used by :meth:`read` when reading until EOF. """ cdef readonly bint auto_build """Flag which is set to ``True`` if the file index is built automatically on seeks/reads. """ cdef readonly bint skip_crc_check """Flag which is set to ``True`` if CRC/size validation of uncompressed data is disabled. """ cdef readonly object filename """String containing path of file being indexed. Used to release and reopen file handles between seeks and reads. Set to ``None`` if file handle is passed. """ cdef readonly bint own_file """Flag which tracks whether this ``_IndexedGzipFile`` has opened its own file handle, or was given one. """ cdef readonly bint drop_handles """Copy of the ``drop_handles`` flag as passed to :meth:`__cinit__`. """ cdef object pyfid """A reference to the python file handle. """ cdef bint finalized """Flag which is set to ``True`` if the ``_IndexedGzipFile`` has been closed. Further operations will fail if ``True``. """ def __init__(self, filename=None, fileobj=None, mode=None, auto_build=True, spacing=4194304, window_size=32768, readbuf_size=1048576, readall_buf_size=16777216, drop_handles=True, index_file=None, skip_crc_check=False): """Create an ``_IndexedGzipFile``. 
The file may be specified either with an open file handle (``fileobj``), or with a ``filename``. If the former, the file is assumed have been opened for reading in binary mode. .. note:: The ``auto_build`` behaviour only takes place on calls to :meth:`seek`. :arg filename: File name or open file handle. :arg fileobj: Open file handle. :arg mode: Opening mode. Must be either ``'r'`` or ``'rb``. :arg auto_build: If ``True`` (the default), the index is automatically built on calls to :meth:`seek`. :arg skip_crc_check: Defaults to ``False``. If ``True``, CRC/size validation of the uncompressed data is not performed. Automatically enabled if an ``index_file`` is provided, or if :meth:`import_index` is called. :arg spacing: Number of bytes between index seek points. :arg window_size: Number of bytes of uncompressed data stored with each seek point. :arg readbuf_size: Size of buffer in bytes for storing compressed data read in from the file. :arg readall_buf_size: Size of buffer in bytes used by :meth:`read` when reading until EOF. :arg drop_handles: Has no effect if an open ``fid`` is specified, rather than a ``filename``. If ``True`` (the default), a handle to the file is opened and closed on every access. Otherwise the file is opened at ``__cinit__``, and kept open until this ``_IndexedGzipFile`` is destroyed. :arg index_file: Pre-generated index for this ``gz`` file - if provided, passed through to :meth:`import_index`. 
""" cdef FILE *fd = NULL if (fileobj is None and filename is None) or \ (fileobj is not None and filename is not None): raise ValueError('One of fileobj or filename must be specified') # filename can be either a # name or a file object if hasattr(filename, 'read'): fileobj = filename filename = None if fileobj is not None and \ getattr(fileobj, 'mode', 'rb') not in (None, 'r', 'rb'): raise ValueError('Invalid mode - fileobj must be opened ' 'in read-only binary ("rb") mode') if (fileobj is None) and (mode not in (None, 'r', 'rb')): raise ValueError('Invalid mode ({}), must be ' '"r" or "rb"'.format(mode)) # If __file_handle is called on a file # that doesn't exist, it passes the # path directly to fopen, which causes # a segmentation fault on linux. So # let's check before that happens. if (filename is not None) and (not op.isfile(filename)): raise ValueError('File {} does not exist'.format(filename)) mode = 'rb' own_file = fileobj is None # if file is specified with an open # file handle, drop_handles is ignored if fileobj is not None: drop_handles = False # if not drop_handles, we open a # file handle and keep it open for # the lifetime of this object. 
if not drop_handles: if fileobj is None: fileobj = builtin_open(filename, mode) try: fd = fdopen(fileobj.fileno(), 'rb') except io.UnsupportedOperation: fd = NULL self.spacing = spacing self.window_size = window_size self.readbuf_size = readbuf_size self.readall_buf_size = readall_buf_size self.auto_build = auto_build self.skip_crc_check = skip_crc_check self.drop_handles = drop_handles self.filename = filename self.own_file = own_file self.pyfid = fileobj flags = 0 if auto_build: flags |= zran.ZRAN_AUTO_BUILD if skip_crc_check: flags |= zran.ZRAN_SKIP_CRC_CHECK # Set index.fd here just for the initial # call, as __file_handle may otherwise # manipulate it incorrectly self.index.fd = fd with self.__file_handle(): if zran.zran_init(index=&self.index, fd=self.index.fd, f=fileobj, spacing=spacing, window_size=window_size, readbuf_size=readbuf_size, flags=flags): raise ZranError('zran_init returned error (file: ' '{})'.format(self.errname)) log.debug('%s.__init__(%s, %s, %s, %s, %s, %s, %s)', type(self).__name__, fileobj, filename, auto_build, spacing, window_size, readbuf_size, drop_handles) if index_file is not None: self.import_index(index_file) def __file_handle(self): """This method is used as a context manager whenever access to the underlying file stream is required. It makes sure that ``index.fd`` field is set appropriately, opening/closing the file handle as necessary (depending on the value of :attr:`drop_handles`). """ # Errors occur with Python 2.7 and # Cython < 0.26 when decorating # cdef-class methods. This workaround # can be removed when you are happy # dropping support for cython < 0.26. @contextlib.contextmanager def proxy(): # If a file handle already exists, # return it. This clause makes this # context manager reentrant. if self.index.fd is not NULL: yield # If a file-like object exists (without an associated # file descriptor, since self.index.fd is NULL), # also return it. 
elif self.pyfid is not None: yield # otherwise we open a new # file handle on each access else: try: self.index.fd = fopen(self.filename.encode(), 'rb') yield finally: fclose(self.index.fd) self.index.fd = NULL return proxy() def seek_points(self): """Return the seek point locations that currently exist in the index. Yields a sequence of tuples, with each tuple containing the uncompressed and compressed offsets for one seek point in the index. """ for i in range(self.index.npoints): point = self.index.list[i] yield (point.uncmp_offset, point.cmp_offset) def fileno(self): """Calls ``fileno`` on the underlying file object. Raises a :exc:`NoHandleError` if ``drop_handles is True``. """ if self.drop_handles: raise NoHandleError() return self.pyfid.fileno() def fileobj(self): """Returns a reference to the python file object. Raises a :exc:`NoHandleError` if ``drop_handles is True``. """ if self.drop_handles: raise NoHandleError() return self.pyfid @property def errname(self): """Used in exception messages. Returns the file name associated with this ``_IndexedGzipFile``, or ``'n/a'`` if a file name cannot be identified. """ if self.filename is not None: return self.filename if self.pyfid is not None: if getattr(self.pyfid, 'name', None) is not None: return self.pyfid.name return 'n/a' @property def npoints(self): """Returns the number of index points that have been created. """ return self.index.npoints @property def mode(self): """Returns the mode that this file was opened in. Currently always returns ``'rb'``. """ return 'rb' def close(self): """Closes this ``_IndexedGzipFile``. 
""" if self.closed: raise IOError('_IndexedGzipFile is already closed ' '(file: {})'.format(self.errname)) if self.own_file and self.pyfid is not None: self.pyfid.close() elif self.own_file and self.index.fd is not NULL: fclose(self.index.fd) zran.zran_free(&self.index) self.index.f = NULL self.index.fd = NULL self.filename = None self.pyfid = None self.finalized = True if log is not None: log.debug('%s.close()', type(self).__name__) @property def closed(self): """Returns ``True`` if this ``_IndexedGzipFile`` is closed, ``False`` otherwise. """ return self.finalized def readable(self): """Returns ``True`` if this ``_IndexedGzipFile`` is readable, ``False`` otherwise. """ return not self.closed def writable(self): """Currently always returns ``False`` - the ``_IndexedGzipFile`` does not support writing yet. """ return False def seekable(self): """Returns ``True`` if this ``_IndexedGzipFile`` supports seeking, ``False`` otherwise. """ return not self.closed def tell(self): """Returns the current seek offset into the uncompressed data stream. """ return zran.zran_tell(&self.index) def __enter__(self): """Returns this ``_IndexedGzipFile``. """ return self def __exit__(self, *args): """Calls close on this ``_IndexedGzipFile``. """ if not self.closed: self.close() def __dealloc__(self): """Frees the memory used by this ``_IndexedGzipFile``. If a file name was passed to :meth:`__cinit__`, the file handle is closed. """ if not self.closed: self.close() def build_full_index(self): """Re-builds the full file index. """ with self.__file_handle(): ret = zran.zran_build_index(&self.index, 0, 0) if ret != zran.ZRAN_BUILD_INDEX_OK: raise ZranError('zran_build_index returned error: {} (file: {})' .format(ZRAN_ERRORS.ZRAN_BUILD[ret], self.errname)) log.debug('%s.build_full_index()', type(self).__name__) def seek(self, offset, whence=SEEK_SET): """Seeks to the specified position in the uncompressed data stream. 
If this ``_IndexedGzipFile`` was created with ``auto_build=False``, and the requested offset is not covered by the index, a :exc:`NotCoveredError` is raised. :arg offset: Desired seek offset into the uncompressed data :arg whence: Either ``SEEK_SET``, ``SEEK_CUR``, or ``SEEK_END``. If not one of these, a :exc:`ValueError` is raised. :returns: The final seek location into the uncompressed stream. .. note:: This method releases the GIL while ``zran_seek`` is running. """ cdef int ret cdef int64_t off = offset cdef uint8_t c_whence = whence cdef zran.zran_index_t *index = &self.index if whence not in (SEEK_SET, SEEK_CUR, SEEK_END): raise ValueError('Invalid value for whence: {}'.format(whence)) with self.__file_handle(), nogil: ret = zran.zran_seek(index, off, c_whence, NULL) if ret == zran.ZRAN_SEEK_NOT_COVERED: raise NotCoveredError('Index does not cover ' 'offset {}'.format(offset)) elif ret == zran.ZRAN_SEEK_INDEX_NOT_BUILT: raise NotCoveredError('Index must be completely built ' 'in order to seek from SEEK_END') elif ret == zran.ZRAN_SEEK_CRC_ERROR: raise CrcError('CRC/size validation failed - the ' 'GZIP data might be corrupt (file: ' '{})'.format(self.errname)) elif ret not in (zran.ZRAN_SEEK_OK, zran.ZRAN_SEEK_EOF): raise ZranError('zran_seek returned error: {} (file: {})' .format(ZRAN_ERRORS.ZRAN_SEEK[ret], self.errname)) offset = self.tell() log.debug('%s.seek(%s)', type(self).__name__, offset) return offset def read(self, nbytes=-1): """Reads up to ``nbytes`` bytes from the uncompressed data stream. If ``nbytes < 0`` the stream is read until EOF. If the stream is already at EOF, ``b''`` is returned. .. note:: This method releases the GIL while ``zran_read`` is running. 
""" if nbytes == 0: return bytes() elif nbytes < 0: buf = ReadBuffer(self.readall_buf_size) else: buf = ReadBuffer(nbytes) cdef zran.zran_index_t *index = &self.index cdef size_t nread = 0 cdef uint64_t bufsz = buf.size cdef size_t offset = 0 cdef void *buffer cdef int64_t ret with self.__file_handle(): # Read until EOF or enough # bytes have been read while True: # read some bytes into the correct # buffer location buffer = buf.buffer + offset with nogil: ret = zran.zran_read(index, buffer, bufsz) # No bytes were read, and there are # no more bytes to read. This will # happen when the seek point was at # or beyond EOF when zran_read was # called if ret == zran.ZRAN_READ_EOF: break # This will happen if the current # seek point is not covered by the # index, and auto-build is disabled elif ret == zran.ZRAN_READ_NOT_COVERED: raise NotCoveredError('Index does not cover ' 'current offset') # CRC or size check failed - data # might be corrupt elif ret == zran.ZRAN_READ_CRC_ERROR: raise CrcError('CRC/size validation failed - the ' 'GZIP data might be corrupt (file: ' '{})'.format(self.errname)) # Unknown error elif ret < 0: raise ZranError('zran_read returned error: {} (file: ' '{})'.format(ZRAN_ERRORS.ZRAN_READ[ret], self.errname)) nread += ret offset += ret # If we requested a specific number of # bytes, zran_read will have returned # them all (or all until EOF), so we're # finished if nbytes > 0: break # Otherwise if reading until EOF, check # and increase the buffer size if necessary if (nread + self.readall_buf_size) > buf.size: buf.resize(buf.size + self.readall_buf_size) offset = nread buf.resize(nread) pybuf = (buf.buffer)[:nread] log.debug('%s.read(%s)', type(self).__name__, len(pybuf)) return pybuf def readinto(self, buf): """Reads up to ``len(buf)`` bytes directly into ``buf``, which is assumed to be a mutable ``bytes``-like object (e.g. a ``memoryview`` or ``bytearray``. 
""" cdef zran.zran_index_t *index = &self.index cdef uint64_t bufsz = len(buf) cdef Py_buffer pbuf cdef void *vbuf cdef int64_t ret # Create a Py_Buffer which allows # us to access the memory managed # by the provided buf PyObject_GetBuffer(buf, &pbuf, PyBUF_SIMPLE | PyBUF_ANY_CONTIGUOUS) # read some bytes try: vbuf = pbuf.buf with self.__file_handle(), nogil: ret = zran.zran_read(index, vbuf, bufsz) # release the py_buffer finally: PyBuffer_Release(&pbuf) # see how the read went if ret == zran.ZRAN_READ_FAIL: raise ZranError('zran_read returned error: {} (file: {})' .format(ZRAN_ERRORS.ZRAN_READ[ret], self.errname)) # This will happen if the current # seek point is not covered by the # index, and auto-build is disabled elif ret == zran.ZRAN_READ_NOT_COVERED: raise NotCoveredError('Index does not cover current offset') # No bytes were read, and there are # no more bytes to read. This will # happen when the seek point was at # or beyond EOF when zran_read was # called elif ret == zran.ZRAN_READ_EOF: return 0 # Return the number of bytes that # were read else: return ret def pread(self, nbytes, offset): """Seeks to the specified ``offset``, then reads and returns ``nbytes``. See :meth:`seek` and :meth:`read`. """ with self.__file_handle(): self.seek(offset) return self.read(nbytes) def readline(self, size=-1): """Read and return up to the next ``'\n'`` character (up to at most ``size`` bytes, if ``size >= 0``) from the uncompressed data stream. If the end of the stream has been reached, ``b''`` is returned. """ if size == 0: return bytes() linebuf = b'' startpos = self.tell() bufsz = 1024 # Read in chunks of [bufsz] bytes at a time with self.__file_handle(): while True: buf = self.read(bufsz) lineidx = buf.find(b'\n') haveline = lineidx >= 0 eof = len(buf) == 0 # Are we at EOF? Nothing more to do if eof: break # Have we found a line? 
Discard # everything that comes after it if haveline: linebuf = linebuf + buf[:lineidx + 1] # If we've found a line, and are # not size-limiting, we're done if haveline and size < 0: break # If we're size limiting, and have # read in enough bytes, we're done if size >= 0 and len(linebuf) > size: linebuf = linebuf[:size] break # Rewind the seek location # to the finishing point self.seek(startpos + len(linebuf)) return linebuf def readlines(self, hint=-1): """Reads and returns a list of lines from the uncompressed data. If ``hint`` is provided, lines will be read until the total size of all lines exceeds ``hint`` in bytes. """ totalsize = 0 lines = [] with self.__file_handle(): while True: line = self.readline() if line == b'': break lines.append(line) totalsize += len(line) if hint >= 0 and totalsize > hint: break return lines def __iter__(self): """Returns this ``_IndexedGzipFile`` which can be iterated over to return lines (separated by ``'\n'``) in the uncompressed stream. """ return self def __next__(self): """Returns the next line from the uncompressed stream. Raises :exc:`StopIteration` when there are no lines left. """ line = self.readline() if line == b'': raise StopIteration() else: return line def write(self, *args, **kwargs): """Currently raises a :exc:`NotImplementedError`.""" raise NotImplementedError('_IndexedGzipFile does not support writing') def flush(self): """Currently does nothing. """ pass def export_index(self, filename=None, fileobj=None): """Export index data to the given file. Either ``filename`` or ``fileobj`` should be specified, but not both. ``fileobj`` should be opened in 'wb' mode. :arg filename: Name of the file. :arg fileobj: Open file handle. 
""" if filename is None and fileobj is None: raise ValueError('One of filename or fileobj must be specified') if filename is not None and fileobj is not None: raise ValueError( 'Only one of filename or fileobj must be specified') if filename is not None: fileobj = builtin_open(filename, 'wb') close_file = True else: close_file = False if getattr(fileobj, 'mode', 'wb') != 'wb': raise ValueError( 'File should be opened in writeable binary mode.') try: # Pass both the Python file object and # file descriptor (if this is an actual # file) to the zran_export_index function try: fd = fdopen(fileobj.fileno(), 'wb') except io.UnsupportedOperation: fd = NULL ret = zran.zran_export_index(&self.index, fd, fileobj) if ret != zran.ZRAN_EXPORT_OK: raise ZranError('export_index returned error: {} (file: ' '{})'.format(ZRAN_ERRORS.ZRAN_EXPORT[ret], self.errname)) finally: if close_file: fileobj.close() log.debug('%s.export_index(%s, %s)', type(self).__name__, filename, fileobj) def import_index(self, filename=None, fileobj=None): """Import index data from the given file. Either ``filename`` or ``fileobj`` should be specified, but not both. ``fileobj`` should be opened in 'rb' mode. :arg filename: Name of the file. :arg fileobj: Open file handle. 
""" if filename is None and fileobj is None: raise ValueError('One of filename or fileobj must be specified') if filename is not None and fileobj is not None: raise ValueError( 'Only one of filename or fileobj must be specified') if filename is not None: fileobj = builtin_open(filename, 'rb') close_file = True else: close_file = False if getattr(fileobj, 'mode', 'rb') != 'rb': raise ValueError( 'File should be opened read-only binary mode.') try: # Pass both the Python file object and # file descriptor (if this is an actual # file) to the zran_import_index function try: fd = fdopen(fileobj.fileno(), 'rb') except io.UnsupportedOperation: fd = NULL ret = zran.zran_import_index(&self.index, fd, fileobj) if ret != zran.ZRAN_IMPORT_OK: raise ZranError('import_index returned error: {} (file: ' '{})'.format(ZRAN_ERRORS.ZRAN_IMPORT[ret], self.errname)) self.skip_crc_check = True finally: if close_file: fileobj.close() log.debug('%s.import_index(%s, %s)', type(self).__name__, filename, fileobj) cdef class ReadBuffer: """Wrapper around a chunk of memory. .. see:: http://docs.cython.org/src/tutorial/memory_allocation.html """ cdef void *buffer """A raw chunk of bytes. """ cdef size_t size; """Size of the buffer. """ def __cinit__(self, size_t size): """Allocate ``size`` bytes of memory. """ self.size = size self.buffer = PyMem_Malloc(size) if not self.buffer: raise MemoryError('PyMem_Malloc fail') log.debug('ReadBuffer.__cinit__(%s)', size) def resize(self, size_t size): """Re-allocate the memory to the given ``size``. """ if size == self.size: return buf = PyMem_Realloc(self.buffer, size) if not buf: raise MemoryError('PyMem_Realloc fail') log.debug('ReadBuffer.resize(%s)', size) self.size = size self.buffer = buf def __dealloc__(self): """Free the mwmory. """ PyMem_Free(self.buffer) log.debug('ReadBuffer.__dealloc__()') def unpickle(state): """Create a new ``IndexedGzipFile`` from a pickled state. 
:arg state: State of a pickled object, as returned by the ``IndexedGzipFile.__reduce__`` method. :returns: A new ``IndexedGzipFile`` object. """ tell = state.pop('tell') index = state.pop('index') gzobj = IndexedGzipFile(**state) if index is not None: gzobj.import_index(fileobj=io.BytesIO(index)) gzobj.seek(tell) return gzobj class NotCoveredError(ValueError): """Exception raised by the :class:`_IndexedGzipFile` when an attempt is made to seek to/read from a location that is not covered by the index. If the ``_IndexedGzipFile`` was created with ``auto_build=True``, this error will only occur on attempts to call the ``seek`` method with ``whence=SEEK_END``, where the index has not been completely built. """ pass class ZranError(IOError): """Exception raised by the :class:`_IndexedGzipFile` when the ``zran`` library signals an error. """ pass class CrcError(OSError): """Exception raised by the :class:`_IndexedGzipFile` when a CRC/size validation check fails, which suggests that the GZIP data might be corrupt. """ pass class NoHandleError(ValueError): """Exception raised by the :class:`_IndexedGzipFile` when ``drop_handles is True`` and an attempt is made to access the underlying file object. """ class ZRAN_ERRORS(object): """Contains text versions of all error codes emitted by zran.c. 
""" ZRAN_BUILD = { zran.ZRAN_BUILD_INDEX_FAIL : 'ZRAN_BUILD_INDEX_FAIL', zran.ZRAN_BUILD_INDEX_CRC_ERROR : 'ZRAN_BUILD_INDEX_CRC_ERROR' } ZRAN_SEEK = { zran.ZRAN_SEEK_CRC_ERROR : 'ZRAN_SEEK_CRC_ERROR', zran.ZRAN_SEEK_FAIL : 'ZRAN_SEEK_FAIL', zran.ZRAN_SEEK_NOT_COVERED : 'ZRAN_SEEK_NOT_COVERED', zran.ZRAN_SEEK_EOF : 'ZRAN_SEEK_EOF', zran.ZRAN_SEEK_INDEX_NOT_BUILT : 'ZRAN_SEEK_INDEX_NOT_BUILT' } ZRAN_READ = { zran.ZRAN_READ_NOT_COVERED : 'ZRAN_READ_NOT_COVERED', zran.ZRAN_READ_EOF : 'ZRAN_READ_EOF', zran.ZRAN_READ_FAIL : 'ZRAN_READ_FAIL', zran.ZRAN_READ_CRC_ERROR : 'ZRAN_READ_CRC_ERROR' } ZRAN_EXPORT = { zran.ZRAN_EXPORT_WRITE_ERROR : 'ZRAN_EXPORT_WRITE_ERROR' } ZRAN_IMPORT = { zran.ZRAN_IMPORT_OK : 'ZRAN_IMPORT_OK', zran.ZRAN_IMPORT_FAIL : 'ZRAN_IMPORT_FAIL', zran.ZRAN_IMPORT_EOF : 'ZRAN_IMPORT_EOF', zran.ZRAN_IMPORT_READ_ERROR : 'ZRAN_IMPORT_READ_ERROR', zran.ZRAN_IMPORT_INCONSISTENT : 'ZRAN_IMPORT_INCONSISTENT', zran.ZRAN_IMPORT_MEMORY_ERROR : 'ZRAN_IMPORT_MEMORY_ERROR', zran.ZRAN_IMPORT_UNKNOWN_FORMAT : 'ZRAN_IMPORT_UNKNOWN_FORMAT', zran.ZRAN_IMPORT_UNSUPPORTED_VERSION : 'ZRAN_IMPORT_UNSUPPORTED_VERSION' } indexed_gzip-1.6.4/indexed_gzip/zran.c0000644000175000017500000032724014133320576017324 0ustar nileshnilesh/* * zran.c - indexed access to gzip files. * * See zran.h for documentation. * * This module was originally based on the zran example, written by Mark * Alder, which ships with the zlib source code. */ #include #include #include #include #include #include "zlib.h" #define PY_SSIZE_T_CLEAN #include #ifdef _WIN32 #include "windows.h" #include "io.h" static int is_readonly(FILE *fd, PyObject *f) { /* Can't find a way to do this correctly under Windows and the check is not required anyway since the underlying Python module checks it already */ return 1; } #else #include /* Check if file is read-only */ static int is_readonly(FILE *fd, PyObject *f) { /* Skip the test for python file-likes */ return fd != NULL ? 
(fcntl(fileno(fd), F_GETFL) & O_ACCMODE) == O_RDONLY : 1; } static uint32_t max(uint32_t a, uint32_t b) { if (a > b) return a; else return b; } #endif #include "zran.h" #include "zran_file_util.h" #ifdef NO_C99 static double round(double val) { return floor(val + 0.5); } #endif /* * Turn this on to make noise. * * #define ZRAN_VERBOSE */ //#define ZRAN_VERBOSE #ifdef ZRAN_VERBOSE #define zran_log(...) fprintf(stderr, __VA_ARGS__) #else #define zran_log(...) #endif /* * Identifier and version number for index files created by zran_export_index. */ const char ZRAN_INDEX_FILE_ID[] = {'G', 'Z', 'I', 'D', 'X'}; const uint8_t ZRAN_INDEX_FILE_VERSION = 1; /* * Discards all points in the index which come after the specfiied * compressed offset. * * Returns 0 on success, non-0 on failure. */ static int _zran_invalidate_index( zran_index_t *index, /* The index */ uint64_t from /* Offset into the compressed data */ ); /* * Expands the capacity of the memory used to store the index ilst. * * Returns 0 on success, non-0 on failure. */ static int _zran_expand_point_list( zran_index_t *index /* The index */ ); /* * Reduces the capacity of the memory used to store the index list, so that it * is only as big as necessary. * * Returns 0 on success, non-0 on failure. */ static int _zran_free_unused( zran_index_t *index /* The index */ ); /* * Returns the current limit of the index, i.e. how much of the file is covered * by the index. */ static uint64_t _zran_index_limit( zran_index_t *index, /* The index */ uint8_t compressed /* Pass in non-0 to get the compressed stream limit, or 0 for the uncompressed limit. */ ); /* Return codes for _zran_get_point_at */ int ZRAN_GET_POINT_CRC_ERROR = -2; int ZRAN_GET_POINT_FAIL = -1; int ZRAN_GET_POINT_OK = 0; int ZRAN_GET_POINT_NOT_COVERED = 1; int ZRAN_GET_POINT_EOF = 2; /* * Searches for the zran_point which preceeds the given offset. 
The offset * may be specified as being relative to the start of the compressed data, * or the uncompressed data. * * Returns: * * - ZRAN_GET_POINT_OK on success. * * - ZRAN_GET_POINT_NOT_COVERED if the index does not yet cover the * specified offset. * - ZRAN_GET_POINT_EOF if the specified offset is at or beyond the end of * the file. */ static int _zran_get_point_at( zran_index_t *index, /* The index */ uint64_t offset, /* The desired offset into the compressed or uncompressed data stream */ uint8_t compressed, /* Pass in 0 or non-0 to indicate that the offset is relative to the uncompressed or compressed data streams, respectively. */ zran_point_t **point /* If an index point corresponding to the specified offset is identified, this pointer will be updated to point to it. */ ); /* * If the index has been created without the ZRAN_AUTO_BUILD flag, this * function is identical to the _zran_get_point_at function. * * If the index has been created with the ZRAN_AUTO_BUILD flag, and the * requested offset is beyond the current range of the index, the index will * be expanded to encompass it. * * The input arguments and return values are identical to the * _zran_get_point_at function, however: * * - if the index has been initialised with the ZRAN_AUTO_BUILD flag, this * function will never return ZRAN_GET_POINT_NOT_COVERED. * * - If a CRC validation error occurs while the index is being expanded, * ZRAN_GET_POINT_CRC_ERROR is returned. */ static int _zran_get_point_with_expand( zran_index_t *index, /* The index */ uint64_t offset, /* Desired offset */ uint8_t compressed, /* Compressed or uncompressed offset */ zran_point_t **point /* Place to store the identified point */ ); /* * Estimate an offset in the compressed / uncompressed data stream * corresponding to the given offset, which is specified in the uncompressed / * compressed data stream. 
If the given offset is specified relative to the * compressed data stream, the returned value is a location in the * uncompressed data stream which approximately corresponds to the given * offset. * * This function is used by the _zran_get_point_with_expand function, if the * index has been created with the ZRAN_AUTO_BUILD flag, to determine how far * the index needs to be expanded to cover a requested offset that is not yet * covered. */ static uint64_t _zran_estimate_offset( zran_index_t *index, /* The index */ uint64_t offset, /* The offset for which a corresponding offset is to be estimated. */ uint8_t compressed /* Pass in 0 or non-0 to indicate that the given offset is specified relative to the uncompressed or compressed stream, respectively. */ ); /* * Used by _zran_inflate. Initialises zlib to start decompressing/inflating * from either: * * - the current seek location in the compressed data, or * - from a location denoted by a specific index point * * If an index point is provided, the function will seek to the specified * compressed data offset before initialising zlib. * * Otherwise (no index point), inflation is initialised at the current seek * location in the input data, and a GZIP header is expected at that location. * * The index->readbuf and readbuf_size, and the z_stream->avail_in, avail_out, * next_in and next_out fields must all be set before this function is called. * * Returns the number of bytes over the input data that were read (which could * be 0), or a negative value on failure. */ static int _zran_init_zlib_inflate( zran_index_t *index, /* The index */ z_stream *stream, /* Pointer to a z_stream struct */ zran_point_t *point /* Pass in NULL to initialise for inflation from the current location in the input file. Or pass a pointer to the index point corresponding to the location to start from. */ ); /* * Return codes for _zran_expand_index. These are currently * assumed to have identical values to the ZRAN_BUILD_INDEX * return codes. 
*/ int ZRAN_EXPAND_INDEX_OK = 0; int ZRAN_EXPAND_INDEX_FAIL = -1; int ZRAN_EXPAND_INDEX_CRC_ERROR = -2; /* * Expands the index from its current end-point until the given offset (which * must be specified relative to the compressed data stream). * * The index is expanded so that the last point comes after the given offset. * If the specified offset is past the last point in the index, a call to this * function is guaranteed to create at least one more index point. If there * is already an index point which comes after the offset, this function does * nothing, and return a success code. * * We require at least one point to be created, because we want index points * to be located at compression block boundaries, but in some data there may * be a long distance between block boundaries (longer than the desired index * point spacing). * * Returns 0 on success. If a CRC check fails, returns * ZRAN_EXPAND_INDEX_CRC_ERROR. For other types of failure, returns * ZRAN_EXPAND_INDEX_FAIL. */ static int _zran_expand_index( zran_index_t *index, /* The index */ uint64_t until /* Expand the index to this point */ ); /* * Adds a new point to the end of the index. */ static int _zran_add_point( zran_index_t *index, /* The index */ uint8_t bits, /* If the compressed and uncompressed offsets are not byte-aligned, this is the number of bits in the compressed data, before the cmp_offset, where the point is located. */ uint64_t cmp_offset, /* Offset into the compressed data. */ uint64_t uncmp_offset, /* Offset into the uncompressed data. */ uint32_t data_offset, /* Offset into the data pointer specifying the point at which the uncompressed data associated with this point begins - see _zran_expand_index. It is assumed that the uncompressed data wraps around this offset. */ uint32_t data_size, /* Number of bytes in data */ uint8_t *data /* Pointer to data_size bytes of uncompressed data preceeding this index point. 
*/ ); /* _zran_read_data return codes */ int ZRAN_READ_DATA_EOF = -1; int ZRAN_READ_DATA_ERROR = -2; /* * This function is a sub-function of _zran_inflate, used to read data from * the input file to be passed to zlib:inflate for decompression. * * Up to index->readbuf_size bytes are read from the input file into * index->readbuf, and the z_stream counters/pointers are updated accordingly. * * On success, returns 0. * * If there are no more bytes left to read from the input file (i.e. we are at * EOF), ZRAN_READ_DATA_EOF is returned. If an error occurs, returns * ZRAN_READ_DATA_ERROR. */ static int _zran_read_data_from_file( zran_index_t *index, /* The index */ z_stream *stream, /* The z_stream struct */ uint64_t cmp_offset, /* Current offset in the compressed data */ uint64_t uncmp_offset, /* Current offset in the uncompressed data */ uint32_t need_atleast /* Skip read if the read buffer already has this many bytes */ ); /* _zran_find_next_stream return codes */ int ZRAN_FIND_STREAM_ERROR = -2; int ZRAN_FIND_STREAM_NOT_FOUND = -1; /* * This function is a sub-function of _zran_inflate, used to search for a new * GZIP stream in a series of concatenated streams. It searches through the * compressed data (pointed to by stream->next_in) to find the location of the * next compressed stream. * * If a new stream was found, the z_stream struct is re-initialised to * decompress data from the new stream, using _zran_init_zlib_inflate. In * this case, the value returned by that function is returned. * * Otherwise (if a compressed stream was not found), this function returns * ZRAN_FIND_STREAM_NOT_FOUND. * * The number of bytes that were skipped over before the new stream was found * is added to the provided offset pointer. * * If an error occurs, ZRAN_FIND_STREAM_ERROR is returned. 
*/ static int _zran_find_next_stream( zran_index_t *index, /* The index */ z_stream *stream, /* The z_stream struct */ int *offset /* Used to store the number of bytes skipped over */ ); /* _zran_validate_stream return codes */ int ZRAN_VALIDATE_STREAM_ERROR = -2; int ZRAN_VALIDATE_STREAM_INVALID = -1; /* * This function is a sub-function of _zran_inflate, called when the end of a * gzip stream is reached. It reads the CRC32 and uncompressed file size from * the end of the stream, and compares them to the CRC32 and size that was * incrementally calculated by _zran_inflate (which are stored in * index->stream_crc32 and index->stream_size), * * The number of bytes that were read before the new stream was found is * added to the provided offset pointer. * * If ZRAN_SKIP_CRC_CHECK is active, this function returns immediately without * doing anything. * * If an error occurs, ZRAN_VALIDATE_STREAM_ERROR is returned. */ static int _zran_validate_stream( zran_index_t *index, /* The index */ z_stream *stream, /* The z_stream struct */ int *offset /* Used to store the number of bytes skipped over */ ); /* _zran_inflate return codes */ int ZRAN_INFLATE_CRC_ERROR = -6; int ZRAN_INFLATE_ERROR = -5; int ZRAN_INFLATE_NOT_COVERED = -4; int ZRAN_INFLATE_OUTPUT_FULL = -3; int ZRAN_INFLATE_BLOCK_BOUNDARY = -2; int ZRAN_INFLATE_EOF = -1; int ZRAN_INFLATE_OK = 0; /* * _zran_inflate input flags. * Bit position, as a power of 2 */ uint32_t ZRAN_INFLATE_INIT_Z_STREAM = 1; uint32_t ZRAN_INFLATE_FREE_Z_STREAM = 2; uint32_t ZRAN_INFLATE_INIT_READBUF = 4; uint32_t ZRAN_INFLATE_FREE_READBUF = 8; uint32_t ZRAN_INFLATE_USE_OFFSET = 16; uint32_t ZRAN_INFLATE_CLEAR_READBUF_OFFSETS = 32; uint32_t ZRAN_INFLATE_STOP_AT_BLOCK = 64; /* Macros used by _zran_inflate for testing flags. 
*/ #define inflate_init_stream( flags) ((flags & ZRAN_INFLATE_INIT_Z_STREAM) > 0) #define inflate_free_stream( flags) ((flags & ZRAN_INFLATE_FREE_Z_STREAM) > 0) #define inflate_init_readbuf( flags) ((flags & ZRAN_INFLATE_INIT_READBUF) > 0) #define inflate_free_readbuf( flags) ((flags & ZRAN_INFLATE_FREE_READBUF) > 0) #define inflate_use_offset( flags) ((flags & ZRAN_INFLATE_USE_OFFSET) > 0) #define inflate_stop_at_block(flags) ((flags & ZRAN_INFLATE_STOP_AT_BLOCK) > 0) #define inflate_clear_readbuf_offsets(flags) \ ((flags & ZRAN_INFLATE_CLEAR_READBUF_OFFSETS) > 0) /* * Inflate (decompress) the specified number of bytes, or until the next * Z_BLOCK/Z_STREAM_END is reached. * * This is a complicated function which implements the core decompression * routine, and is used by both _zran_expand_index, and zran_read. It reads * compressed data from the file, starting from the specified compressed * offset, inflates (a.k.a. decompresses) it, and copies the decompressed * data to the provided output buffer. * * This function is complicated because it is used in three different * situations: * - When generating the index (by zran_expand_index) * - When starting from an index seek point and discarding compressed data * to find a requested seek location (by zran_read) * - When actually reading and decompressing data (by zran_read). * * This function may be used in a re-entrant or non-re-entrant manner, * depending on the flags which are used. In the latter (more likely) case, * various pieces of information representing the current inflation state are * stored in fields of the zran_index_t struct. * * Specifically, this function does the following: * * 1. Figures out the starting offsets into the compressed/uncompressed * streams. If the ZRAN_INFLATE_USE_OFFSET flag is active, the index * point preceeding the specified offset is used as the starting point. * If there is no such point, ZRAN_INFLATE_NOT_COVERED is returned. 
If * ZRAN_INFLATE_USE_OFFSET is not active, index->inflate_cmp_offset and * index->inflate_uncmp_offset are used as the starting point. * * 2. Create a read buffer, if ZRAN_INFLATE_INIT_READBUF is active. A * reference to the read buffer is stored at index->readbuf. If * ZRAN_INFLATE_INIT_READBUF is not set, the function assumes that the * read buffer already exists. * * 3. If the ZRAN_INFLATE_CLEAR_READBUF_OFFSETS flag is active, the read * buffer offset (index->readbuf_offset) and length (index->readbuf_end) * fields are both set to 0. Otherwise, the function assumes that the * current offset/length values are valid. * 4. Initialises the z_stream struct, if ZRAN_INFLATE_INIT_Z_STREAM is * active. Otherwise, the function assumes that the z_stream struct is * already initialised and ready to be used. * * 5. Read some compressed data from the file into the read buffer as needed. * * 6. Pass that data to the zlib inflate function, and store the resulting * uncompressed data in the provided data buffer. If the end of a GZIP * stream is reached for the first time, it is validated against the * CRC/file size stored in the GZIP footer (unless ZRAN_SKIP_CRC_CHECK * is active). * * 7. Repeat steps 5 and 6 until one of the following is true: * * - The requested number of bytes have been read * * - The output buffer is full * * - End of file is reached * * - ZRAN_INFLATE_STOP_AT_BLOCK is active, and a block is reached * * 8. If ZRAN_INFLATE_FREE_READBUF is active, the file read buffer is * de-allocated. * * 9. If ZRAN_INFLATE_FREE_Z_STREAM is active, the memory used by the * z_stream struct is de-allocated (via the zlib inflateEnd function). * * The control flags can be a combination (bitwise OR) of the following: * * - ZRAN_INFLATE_INIT_Z_STREAM: Initialise the z_stream struct * before inflation. * * - ZRAN_INFLATE_FREE_Z_STREAM: Clean up the z_stream struct * after inflation. * * - ZRAN_INFLATE_INIT_READBUF: Allocate a read buffer before * inflation. 
* * - ZRAN_INFLATE_FREE_READBUF: Free the read buffer after * inflation. * * - ZRAN_INFLATE_USE_OFFSET: If set, use the provided offset * parameter; otherwise, use the * offsets stored in the index * struct. * * - ZRAN_INFLATE_CLEAR_READBUF_OFFSETS: If set, clear the read buffer * offset/length stored in the index * struct, otherwise assume that they * are valid. * * - ZRAN_INFLATE_STOP_AT_BLOCK: If set, stop inflation when a * deflate block boundary is reached * (the Z_BLOCK flag is passed to the * zlib inflate function). Otherwise, * inflation will continue until one * of the conditions in step 7, above, * are met. * * This function returns one of the following codes. Furthermore, if an error * did not occur (i.e. anything but ZRAN_INFLATE_ERROR or * ZRAN_INFLATE_NOT_COVERED was returned), the total_consumed and total_output * parameters are respectively updated to contain the total number of * compressed bytes that were read from the file, and total number of * decompressed bytes that were copied to the data buffer. * * - ZRAN_INFLATE_OK: Inflation was successful and the requested * number of bytes were copied to the provided * data buffer. * * - ZRAN_INFLATE_NOT_COVERED: The requested compressed data offset is not * covered by the index. * * - ZRAN_INFLATE_OUTPUT_FULL: The provided data buffer has been filled. * * - ZRAN_INFLATE_BLOCK_BOUNDARY: A deflate block boundary was encountered. * This will only be returned if the * ZRAN_INFLATE_STOP_AT_BLOCK flag is active. * * - ZRAN_INFLATE_EOF: The end of file has been reached. * * - ZRAN_INFLATE_CRC_ERROR: The CRC or uncompressed data size in the * GZIP footer does not match the CRC/size * that was calculated. * * - ZRAN_INFLATE_ERROR: A critical error has occurred. */ static int _zran_inflate( zran_index_t *index, /* Pointer to the index. */ z_stream *strm, /* Pointer to a z_stream struct. */ uint64_t offset, /* Compressed data offset to start inflation from. */ uint16_t flags, /* Control flags. 
*/ uint32_t *total_consumed, /* Pointer which is updated to contain the total number of bytes that were read from the input file. */ uint32_t *total_output, /* Pointer which is updated to contain the total number of bytes that were inflated, and stored in data. */ uint32_t len, /* Maximum number of bytes to inflate. May be 0. */ uint8_t *data, /* Place to store the inflated bytes. */ int add_stream_points /* Add index points at the beginning of every gzip stream, including the first one at the beginning of the input file */ ); /* Initialise a zran_index_t struct for use with the given GZIP file. */ int zran_init(zran_index_t *index, FILE *fd, PyObject *f, uint32_t spacing, uint32_t window_size, uint32_t readbuf_size, uint16_t flags) { zran_point_t *point_list = NULL; int64_t compressed_size; zran_log("zran_init(%u, %u, %u, %u)\n", spacing, window_size, readbuf_size, flags); if (spacing == 0) spacing = 1048576; if (window_size == 0) window_size = 32768; if (readbuf_size == 0) readbuf_size = 16384; /* * The zlib manual specifies that a window * size of 32KB is 'always enough' to * initialise inflation/deflation with a * set dictionary. Less than that is not * guaranteed to be enough. */ if (window_size < 32768) goto fail; /* * Small read-buffers make code complicated. * The absolute minimum we need is enough to * store a GZIP footer, null padding bytes at * the end of a stream, and the subsequent * GZIP header. There are no bounds on the * number of padding bytes, or the size of a * GZIP header, so this constraint is * arbitrary (but should be good enough). */ if (readbuf_size < 128) goto fail; /* * window_size bytes of uncompressed data are * stored with each seek point in the index. * So it's a bit silly to have the distance * between consecutive points less than the * window size. 
*/ if (spacing <= window_size) goto fail; /* The file must be opened in read-only mode */ if (!is_readonly(fd, f)) goto fail; /* * Calculate the size of the compressed file */ if (fseek_(fd, f, 0, SEEK_END) != 0) goto fail; compressed_size = ftell_(fd, f); if (compressed_size < 0) goto fail; if (fseek_(fd, f, 0, SEEK_SET) != 0) goto fail; /* * Allocate some initial space * for the index point list */ point_list = calloc(1, sizeof(zran_point_t) * 8); if (point_list == NULL) { goto fail; } /* initialise the index struct */ index->fd = fd; index->f = f; index->flags = flags; index->compressed_size = compressed_size; index->uncompressed_size = 0; index->spacing = spacing; index->window_size = window_size; index->log_window_size = (int)round(log10(window_size) / log10(2)); index->readbuf_size = readbuf_size; index->readbuf_offset = 0; index->readbuf_end = 0; index->readbuf = NULL; index->npoints = 0; index->size = 8; index->uncmp_seek_offset = 0; index->inflate_cmp_offset = 0; index->inflate_uncmp_offset = 0; index->validating = 0; index->last_stream_ended = 0; index->stream_size = 0; index->stream_crc32 = 0; index->list = point_list; return 0; fail: free(point_list); return -1; } /* Returns the compressed or uncompressed index limit. */ uint64_t _zran_index_limit(zran_index_t *index, uint8_t compressed) { if (index->npoints == 0) return 0; if (compressed) return index->list[index->npoints - 1].cmp_offset; else return index->list[index->npoints - 1].uncmp_offset; } /* Expands the memory used to store the index points. */ int _zran_expand_point_list(zran_index_t *index) { zran_point_t *new_list; uint32_t new_size = index->size * 2; zran_log("_zran_expand_point_list(%i -> %i)\n", index->size, new_size); new_list = realloc(index->list, sizeof(zran_point_t) * new_size); if (new_list == NULL) { /* old list is still valid */ return -1; } index->list = new_list; index->size = new_size; return 0; } /* Frees any unused memory allocated for index storage. 
*/
int _zran_free_unused(zran_index_t *index) {

    zran_point_t *new_list;
    size_t        new_size;

    zran_log("_zran_free_unused\n");

    /*
     * Shrink the point list to the number of points
     * currently in use, but keep a minimum capacity
     * of 8 points (the initial allocation size).
     */
    if (index->npoints < 8) new_size = 8;
    else                    new_size = index->npoints;

    new_list = realloc(index->list, sizeof(zran_point_t) * new_size);

    /* realloc failed - the old list is still valid */
    if (new_list == NULL) {
        return -1;
    }

    index->list = new_list;
    index->size = new_size;

    return 0;
}


/* Deallocate memory used by a zran_index_t struct. */
void zran_free(zran_index_t *index) {

    uint32_t      i;
    zran_point_t *pt;

    zran_log("zran_free\n");

    /* Free the uncompressed data window owned by each point. */
    for (i = 0; i < index->npoints; i++) {
        pt = &(index->list[i]);

        /*
         * points at compression stream boundaries
         * have no data associated with them
         */
        if (pt->data != NULL) {
            free(pt->data);
        }
    }

    free(index->list);

    /*
     * Reset all fields to a safe state. NOTE: the
     * file handles (fd/f) are not closed here -
     * they are only set to NULL.
     */
    index->fd                = NULL;
    index->f                 = NULL;
    index->spacing           = 0;
    index->window_size       = 0;
    index->readbuf_size      = 0;
    index->npoints           = 0;
    index->size              = 0;
    index->list              = NULL;
    index->uncmp_seek_offset = 0;
}


/* Discard all points in the index after the specified compressed offset. */
int _zran_invalidate_index(zran_index_t *index, uint64_t from) {

    uint64_t      i;
    zran_point_t *p;

    if (index->npoints == 0)
        return 0;

    /* Find the first point at or after the given offset. */
    for (i = 0; i < index->npoints; i++) {

        p = &(index->list[i]);

        if (p->cmp_offset >= from)
            break;
    }

    /*
     * The index doesn't cover
     * the requested offset
     */
    if (i == index->npoints)
        return 0;

    /*
     * The point immediately preceding the offset is
     * discarded as well (hence i - 1, and dropping
     * everything when i <= 1).
     *
     * NOTE(review): the data buffers (p->data) of the
     * discarded points do not appear to be freed here,
     * only the list storage is shrunk below - possible
     * leak on index rebuild; confirm against
     * _zran_add_point / zran_free.
     */
    if (i <= 1) index->npoints = 0;
    else        index->npoints = i - 1;

    /* Release list storage that is no longer needed. */
    return _zran_free_unused(index);
}


/* (Re-)Builds the full index. */
int zran_build_index(zran_index_t *index, uint64_t from, uint64_t until) {

    /* Drop any points at/after "from" - they will be re-created. */
    if (_zran_invalidate_index(index, from) != 0)
        return ZRAN_BUILD_INDEX_FAIL;

    /* until == 0 means "to the end of the compressed file". */
    if (until == 0)
      until = index->compressed_size;

    return _zran_expand_index(index, until);
}


/* Searches for and returns the index at the specified offset.
*/
int _zran_get_point_at(
    zran_index_t  *index,
    uint64_t       offset,
    uint8_t        compressed,
    zran_point_t **point) {

    uint64_t      cmp_max;
    uint64_t      uncmp_max;
    zran_point_t *last;
    zran_point_t *prev;
    zran_point_t *curr;
    uint8_t       bit;
    uint32_t      i;

    *point = NULL;

    /*
     * Bad input - past the end of the compressed or
     * uncompressed streams (if the latter is known).
     */
    if (compressed && offset >= index->compressed_size)
        goto eof;

    if (!compressed && index->uncompressed_size > 0 &&
        offset >= index->uncompressed_size)
        goto eof;

    /* An empty index cannot cover any offset. */
    if (index->npoints == 0)
        goto not_covered;

    zran_log("_zran_get_point_at(%llu, c=%u)\n", offset, compressed);

    /*
     * Figure out how much of the compressed
     * and uncompressed data the index currently
     * covers - the offsets of the last point
     * in the index.
     */
    last      = &(index->list[index->npoints - 1]);
    uncmp_max = last->uncmp_offset;
    cmp_max   = last->cmp_offset;

    if ( compressed && offset > cmp_max)   goto not_covered;
    if (!compressed && offset > uncmp_max) goto not_covered;

    /*
     * We should have an index point
     * which corresponds to this offset,
     * so let's search for it. Linear scan
     * from the second point; "prev" tracks
     * the last point not after the offset.
     */
    prev = index->list;
    for (i = 1; i < index->npoints; i++) {

        curr = &(index->list[i]);

        if (compressed) {

            /*
             * Adjust the offset for non
             * byte-aligned seek points.
             */
            if (curr->bits > 0) bit = 1;
            else                bit = 0;

            if (curr->cmp_offset > offset + bit)
                break;
        }
        else {
            if (curr->uncmp_offset > offset)
                break;
        }

        prev = curr;
    }

    /* prev is the closest point which precedes the offset. */
    *point = prev;
    return ZRAN_GET_POINT_OK;

not_covered:
    *point = NULL;
    return ZRAN_GET_POINT_NOT_COVERED;

eof:
    *point = NULL;
    return ZRAN_GET_POINT_EOF;
}


/*
 * Get the index point corresponding to the given offset, expanding
 * the index as needed if ZRAN_AUTO_BUILD is active.
*/ int _zran_get_point_with_expand(zran_index_t *index, uint64_t offset, uint8_t compressed, zran_point_t **point) { int result; uint64_t expand; uint64_t limit; zran_log("_zran_get_point_with_expand(%llu, %u, autobuild=%u)\n", offset, compressed, index->flags & ZRAN_AUTO_BUILD); /* * See if there is an index point that * covers the specified offset. If there's * not, we're going to expand the index * until there is. */ result = _zran_get_point_at(index, offset, compressed, point); /* * Don't expand the index if * auto_build is not active */ if ((index->flags & ZRAN_AUTO_BUILD) == 0) { return result; } while (result == ZRAN_GET_POINT_NOT_COVERED) { /* * If result == ZRAN_GET_POINT_NOT_COVERED, * get_point says that an index point for * this offset doesn't yet exist. So let's * expand the index. * * Guess how far we need to expand the index, * and expand it by that much. */ if (compressed == 0) expand = _zran_estimate_offset(index, offset, 0); else expand = offset; /* * If _zran_estimate_offset was unable to * estimate a sensible compressed offset * (i.e. smaller or at the current index * extent), we force it past the limit, * so that the expand_index function will * create at least one point. */ limit = _zran_index_limit(index, 1); if (expand <= limit) expand = limit + 10; zran_log("Estimated mapping from uncompresseed offset " "%lu into compressed data: %lu\n", offset, expand); /* * Expand the index */ result = _zran_expand_index(index, expand); if (result == ZRAN_EXPAND_INDEX_CRC_ERROR) { goto crcerror; } else if (result != 0) { goto fail; } /* * Index has been expanded, so * there should now be a point * which covers the requested * offset. */ result = _zran_get_point_at(index, offset, compressed, point); /* * If we've made it to EOF, return * a ref to the eof point. 
*/ if (result == ZRAN_GET_POINT_EOF) { *point = &index->list[index->npoints - 1]; if (offset < index->uncompressed_size) { result = ZRAN_GET_POINT_OK; } } } return result; crcerror: return ZRAN_GET_POINT_CRC_ERROR; fail: return ZRAN_GET_POINT_FAIL; } /* * Given an offset in one stream, estimates the corresponding offset into the * other stream. */ uint64_t _zran_estimate_offset( zran_index_t *index, uint64_t offset, uint8_t compressed) { zran_point_t *last; uint64_t estimate; /* * The first index in the list maps * indices 0 and 0, which won't help * us here. So we need at least two * index points. */ if (index->npoints <= 1) last = NULL; else last = &(index->list[index->npoints - 1]); /* * We have no reference. At least two * index points need to have been created. * The assumed correspondences between * the compressed streams are arbitrary. */ if (last == NULL) { if (compressed) estimate = offset * 2.0; else estimate = offset * 0.8; } /* * I'm just assuming a roughly linear correspondence * between the compressed/uncompressed data streams. */ else if (compressed) { estimate = round(offset * ((float)last->uncmp_offset / last->cmp_offset)); } else { estimate = round(offset * ((float)last->cmp_offset / last->uncmp_offset)); } zran_log("_zran_estimate_offset(%llu, %u) = %llu\n", offset, compressed, estimate); return estimate; } /* Add a new point to the index. 
 */
int _zran_add_point(zran_index_t *index,
                    uint8_t       bits,
                    uint64_t      cmp_offset,
                    uint64_t      uncmp_offset,
                    uint32_t      data_offset,
                    uint32_t      data_size,
                    uint8_t      *data) {

    /*
     * Appends a point at (cmp_offset, uncmp_offset), copying
     * window_size bytes of uncompressed data ending at data_offset
     * out of the circular buffer "data" (of size data_size).
     * data == NULL means a stream-boundary point with no window.
     * Returns 0 on success, -1 on failure.
     */

    uint8_t      *point_data = NULL;
    zran_point_t *next       = NULL;

#ifdef ZRAN_VERBOSE
    zran_log("_zran_add_point(%i, c=%lld + %i, u=%lld, data=%u / %u)\n",
             index->npoints,
             cmp_offset,
             bits > 0,
             uncmp_offset,
             data_offset,
             data_size);

    if (data != NULL)
        zran_log("Window data: [%02x %02x %02x %02x ...]\n",
                 data[(data_offset - index->window_size + 0) % data_size],
                 data[(data_offset - index->window_size + 1) % data_size],
                 data[(data_offset - index->window_size + 2) % data_size],
                 data[(data_offset - index->window_size + 3) % data_size]);
#endif

    /* if list is full, make it bigger */
    if (index->npoints == index->size) {
        if (_zran_expand_point_list(index) != 0) {
            goto fail;
        }
    }

    /*
     * Allocate memory to store the
     * uncompressed data (the "window")
     * associated with this point. Index
     * points corresponding to the beginning
     * of a gzip stream (including at start
     * of file) do not have any window data
     * associated with them.
     */
    if (data == NULL) {
        point_data = NULL;
    }
    else {
        point_data = calloc(1, index->window_size);
        if (point_data == NULL)
            goto fail;
    }

    next               = &(index->list[index->npoints]);
    next->bits         = bits;
    next->cmp_offset   = cmp_offset;
    next->uncmp_offset = uncmp_offset;
    next->data         = point_data;

    /*
     * The uncompressed data may not start at
     * the beginning of the data pointer, but
     * rather from an arbitrary point. So we
     * copy the beginning of the window from
     * the end of data, and the end of the
     * window from the beginning of data. Does
     * that make sense?
     */
    if (data != NULL) {

        /* Window is fully contained, ending at data_offset. */
        if (data_offset >= index->window_size) {
            memcpy(point_data,
                   data + (data_offset - index->window_size),
                   index->window_size);
            zran_log("Copy %u bytes from %u to %u\n",
                     index->window_size,
                     data_offset - index->window_size,
                     data_offset);
        }
        /* Window wraps around the end of the circular buffer. */
        else {
            memcpy(point_data,
                   data + (data_size - (index->window_size - data_offset)),
                   (index->window_size - data_offset));
            memcpy(point_data + (index->window_size - data_offset),
                   data,
                   data_offset);
            zran_log("Copy %u bytes from %u to %u, %u bytes from %u to %u\n",
                     (index->window_size - data_offset),
                     (data_size - (index->window_size - data_offset)),
                     data_size,
                     data_offset,
                     0,
                     data_offset);
        }
    }

    index->npoints++;

    return 0;

fail:
    /* free(NULL) is a no-op, so this is safe on every failure path. */
    free(point_data);
    return -1;
}


/* Initialise the given z_stream struct for decompression/inflation. */
int _zran_init_zlib_inflate(zran_index_t *index,
                            z_stream     *strm,
                            zran_point_t *point) {

    /*
     * point == NULL: initialise from the current file position,
     * expecting (and skipping) a gzip header. point != NULL: seek
     * to the point and prime zlib from its stored window.
     * Returns the number of compressed bytes read over (>= 0), or
     * -1 on failure.
     */

    int           ret;
    int           window;
    int64_t       seek_loc;
    unsigned long bytes_read;

    bytes_read = strm->avail_in;
    window     = index->log_window_size;

    strm->zalloc = Z_NULL;
    strm->zfree  = Z_NULL;
    strm->opaque = Z_NULL;

    /*
     * If we're starting from the the current location in
     * the compressed data, we assume that there is a gzip
     * header present. Initialise inflation, then read
     * past the header.
     * Below, we will re-initialise for raw inflation.
     */
    if (point == NULL) {

        zran_log("_zran_init_zlib_inflate from current "
                 "seek location (expecting GZIP header)\n");

        /* window + 32 enables gzip/zlib header auto-detection. */
        if (inflateInit2(strm, window + 32) != Z_OK) {
            goto fail;
        }
        /* Z_BLOCK stops inflation immediately after the header. */
        if (inflate(strm, Z_BLOCK) != Z_OK) {
            goto fail_free_strm;
        }
        if (inflateEnd(strm) != Z_OK) {
            goto fail;
        }
    }

    /*
     * If starting from an index point, seek to the
     * required location in the compressed data stream.
     *
     * The compressed offset for index points correspond
     * to the first full byte of compressed data. So if
     * the index point is not byte-aligned (bits > 0), we
     * need to seek to the previous byte, and tell zlib
     * about it (via the inflatePrime call below).
     */
    else {
        seek_loc = point->cmp_offset - (point->bits > 0);

        /*
         * NOTE(review): "%li" with uint64_t arguments is a
         * format/argument mismatch on LP32/LLP64 platforms -
         * consider "%llu"; harmless where zran_log is a no-op.
         */
        zran_log("_zran_init_zlib_inflate from index point (%li, %li, %li)\n",
                 seek_loc,
                 point->cmp_offset,
                 point->uncmp_offset);

        if (fseek_(index->fd, index->f, seek_loc, SEEK_SET) != 0) {
            goto fail;
        }
    }

    /*
     * Now initialise for raw inflation. This tells zlib
     * not to expect any GZIP headers, and not to read
     * the GZIP footer (as we do our own CRC validation
     * in _zran_inflate).
     */
    if (inflateInit2(strm, -window) != Z_OK) {
        goto fail;
    }

    /*
     * If starting from an index point, initialise
     * the inflation dictionary from the uncompressed
     * data associated with the index point.
     */
    if (point != NULL && point->data != NULL) {

        /*
         * The starting index point is not byte-aligned,
         * so we'll insert the initial bits into the
         * inflate stream using inflatePrime (above,
         * we seeked one byte back to accommodate this).
         */
        if (point->bits > 0) {

            ret = getc_(index->fd, index->f);

            if (ret == -1 && ferror_(index->fd, index->f)) {
                goto fail_free_strm;
            }

            if (inflatePrime(strm,
                             point->bits, ret >> (8 - point->bits)) != Z_OK)
                goto fail_free_strm;
        }

        /*
         * Initialise the inflate stream
         * with the index point data.
         */
        if (inflateSetDictionary(strm,
                                 point->data,
                                 index->window_size) != Z_OK)
            goto fail_free_strm;
    }

    /*
     * Reset CRC/size validation counters when
     * we start reading a new gzip stream
     */
    index->validating   = (point == NULL);
    index->stream_size  = 0;
    index->stream_crc32 = 0;

    /*
     * NOTE(review): "%i" with an unsigned long argument is a
     * format/argument mismatch - consider "%lu"; harmless where
     * zran_log is a no-op.
     */
    zran_log("_zran_zlib_init_inflate: initialised, read %i bytes\n",
             bytes_read - strm->avail_in);

    /*
     * Return the number of bytes of compressed
     * data, if any that were read over
     */
    return bytes_read - strm->avail_in;

    /*
     * Something has gone wrong, but
     * inflateInit2 has been called,
     * so we need to call inflateEnd.
     * Falls through to the fail:
     * clause.
     */
fail_free_strm:
    inflateEnd(strm);

    /* Something has gone wrong */
fail:
    return -1;
}


/*
 * Read data from the GZIP file, and copy it into the read buffer for
 * decompression.
 */
static int _zran_read_data_from_file(zran_index_t *index,
                                     z_stream     *stream,
                                     uint64_t      cmp_offset,
                                     uint64_t      uncmp_offset,
                                     uint32_t      need_atleast) {

    /*
     * Tops up stream->next_in/avail_in from the file so at least
     * need_atleast bytes are available (if possible). cmp_offset /
     * uncmp_offset are used only for logging and for recording the
     * final uncompressed size at EOF. Returns 0 on success,
     * ZRAN_READ_DATA_EOF at end of file, ZRAN_READ_DATA_ERROR on
     * a read error.
     */

    size_t f_ret;

    /* Enough data already buffered - nothing to do. */
    if (stream->avail_in >= need_atleast) {
        return 0;
    }

    /*
     * If there are any unprocessed bytes
     * left over, put them at the beginning
     * of the read buffer.
     *
     * TODO: In times gone by, we would only
     * attempt to read data (and therefore
     * rotate memory here) when the read
     * buffer was empty. But now, to keep
     * the code in _zran_inflate clean-ish,
     * we do this repeatedly, even when we
     * are at EOF, to ensure that there is
     * enough data to validate one stream,
     * and find the next. We could improve
     * things here, by only rotating memory
     * here if needed.
     */
    if (stream->avail_in > 0) {
        memmove(index->readbuf, stream->next_in, stream->avail_in);
    }

    zran_log("Reading from file %llu [== %llu?] "
             "[into readbuf offset %u]\n",
             ftell_(index->fd, index->f),
             cmp_offset + stream->avail_in,
             stream->avail_in);

    /*
     * Read a block of compressed data
     * (offsetting past any left over
     * bytes that we may have copied to
     * the beginning of the read buffer
     * above).
     */
    f_ret = fread_(index->readbuf + stream->avail_in,
                   1,
                   index->readbuf_size - stream->avail_in,
                   index->fd,
                   index->f);

    if (ferror_(index->fd, index->f)) {
        goto fail;
    }

    /*
     * No bytes left to read, and there are
     * only 8 bytes left to process (size of
     * gzip footer) - we've reached EOF.
     */
    if (f_ret == 0 && stream->avail_in <= 8) {

        if (feof_(index->fd, index->f, f_ret)) {

            zran_log("End of file, stopping inflation\n");

            /*
             * Reset next_in pointer to beginning of
             * read buffer, as we rotated it above,
             * and the area that next_in was pointing
             * to may have been overwritten by memmove.
             */
            stream->next_in = index->readbuf;

            /*
             * we have uncompressed everything,
             * so we now know its size.
             */
            if (index->uncompressed_size == 0) {
                zran_log("Updating uncompressed data "
                         "size: %llu\n", uncmp_offset);
                index->uncompressed_size = uncmp_offset;
            }

            goto eof;
        }

        /*
         * Or something went wrong (this
         * should never happen if ferror
         * does the right thing).
         */
        else {
            goto fail;
        }
    }

    zran_log("Read %lu bytes from file [c=%llu, u=%llu] "
             "[%02x %02x %02x %02x ...]\n",
             f_ret, cmp_offset, uncmp_offset,
             index->readbuf[stream->avail_in],
             index->readbuf[stream->avail_in + 1],
             index->readbuf[stream->avail_in + 2],
             index->readbuf[stream->avail_in + 3]);

    /*
     * Tell zlib about the block
     * of compressed data that we
     * just read in.
     */
    index->readbuf_end = f_ret + stream->avail_in;
    stream->avail_in  += f_ret;
    stream->next_in    = index->readbuf;

    return 0;

eof:
    return ZRAN_READ_DATA_EOF;

fail:
    return ZRAN_READ_DATA_ERROR;
}


/*
 * Identify the location of the next compressed stream (if the file
 * contains concatenated streams).
 */
int _zran_find_next_stream(zran_index_t *index,
                           z_stream     *stream,
                           int          *offset) {

    /*
     * Scans the input buffer for a gzip magic number, skipping any
     * padding bytes, and re-initialises inflation at that point.
     * *offset is incremented by the number of bytes skipped/read.
     * Returns 0, ZRAN_FIND_STREAM_NOT_FOUND, or
     * ZRAN_FIND_STREAM_ERROR.
     */

    int ret;
    int found;

    /*
     * Search for the beginning of
     * the next stream. GZIP files
     * start with 0x1f8b.
     */
    found = 0;

    zran_log("Searching for a new stream [%u]\n", stream->avail_in);

    while (stream->avail_in > 0) {

        if (stream->avail_in >= 2      &&
            stream->next_in[0] == 0x1f &&
            stream->next_in[1] == 0x8b) {
            found = 1;
            break;
        }

        *offset          += 1;
        stream->next_in  += 1;
        stream->avail_in -= 1;
    }

    /*
     * No header found for
     * the next stream.
     */
    if (found == 0) {
        zran_log("Could not find another stream [%u]\n", stream->avail_in);
        goto not_found;
    }

    zran_log("New stream found, re-initialising inflation\n");

    /*
     * Re-configure for inflation
     * from the new stream.
     */
    if (inflateEnd(stream) != Z_OK) {
        goto fail;
    }

    ret = _zran_init_zlib_inflate(index, stream, NULL);
    if (ret < 0) {
        goto fail;
    }

    /* Account for the header bytes read by init_zlib_inflate. */
    *offset += ret;

    return 0;

fail:
    return ZRAN_FIND_STREAM_ERROR;

not_found:
    return ZRAN_FIND_STREAM_NOT_FOUND;
}


/* Validate the CRC32 and size of a GZIP stream.
*/ static int _zran_validate_stream(zran_index_t *index, z_stream *stream, int *offset) { uint32_t crc; uint32_t size; /* CRC validation is disabled */ if (index->flags & ZRAN_SKIP_CRC_CHECK) { return 0; } /* * A gzip stream should end with an 8 byte footer, * which contains the CRC32 of the uncompressed * data, and the uncompressed size modulo 2^32. */ if (stream->avail_in < 8) { return ZRAN_VALIDATE_STREAM_ERROR; } crc = ((stream->next_in[0] << 0) + (stream->next_in[1] << 8) + (stream->next_in[2] << 16) + (stream->next_in[3] << 24)); size = ((stream->next_in[4] << 0) + (stream->next_in[5] << 8) + (stream->next_in[6] << 16) + (stream->next_in[7] << 24)); zran_log("Validating CRC32 and size [%8x == %8x, %u == %u]\n", crc, index->stream_crc32, size, index->stream_size); stream->avail_in -= 8; stream->next_in += 8; *offset += 8; if (index->stream_crc32 != crc || index->stream_size != size) { return ZRAN_VALIDATE_STREAM_INVALID; } return 0; } /* The workhorse. Inflate/decompress data from the file. */ static int _zran_inflate(zran_index_t *index, z_stream *strm, uint64_t offset, uint16_t flags, uint32_t *total_consumed, uint32_t *total_output, uint32_t len, uint8_t *data, int add_stream_points) { /* * z_ret is for zlib/zran functions. * off is for storing offset of new * stream (from _zran_validate_stream * and _zran_find_next_stream). * return_val/error_return_val is * the return value for this function. */ int z_ret; int off; int return_val = ZRAN_INFLATE_OK; int error_return_val = ZRAN_INFLATE_ERROR; /* * Offsets into the compressed * and uncompressed data streams, * and total number of bytes * decompressed and output. */ uint64_t cmp_offset; uint64_t uncmp_offset; uint32_t _total_consumed = 0; uint32_t _total_output = 0; /* * Number of bytes input/decompressed * during a single call to zlib:inflate. */ uint32_t bytes_consumed = 0; uint32_t bytes_output = 0; /* * Index point to start from * (if ZRAN_INFLATE_USE_OFFSET * is active). 
*/ zran_point_t *start = NULL; /* * Set all zstream_t fields to 0 * if we are initialising. */ if (inflate_init_stream(flags)) { memset(strm, 0, sizeof(z_stream)); } /* * If ZRAN_INFLATE_INIT_READBUF is not set, * make sure that a read buffer exists. * * If the opposite is true, the read buffer * from a prior call has not been cleaned up. */ if ((!inflate_init_readbuf(flags) && index->readbuf == NULL) || ( inflate_init_readbuf(flags) && index->readbuf != NULL)) { goto fail; } /* * It begins... */ zran_log("_zran_inflate(%llu, block=%u, use_offset=%u, init_stream=%u,\n" " free_stream=%u, init_readbuf=%u, free_readbuf=%u,\n" " clear_offsets=%u, nbytes=%u)\n", offset, inflate_stop_at_block( flags), inflate_use_offset( flags), inflate_init_stream( flags), inflate_free_stream( flags), inflate_init_readbuf( flags), inflate_free_readbuf( flags), inflate_clear_readbuf_offsets(flags), len); /* * The compressed/uncompressed offsets are initialised in * one of three ways. If ZRAN_INFLATE_USE_OFFSET is active, * they are either: * * - Both set to 0 * * - Initialised according to an existing index * point that preceeds the requested offset. * * Otherwise, they are initialised from index->inflate_cmp_offset * and index->inflate_uncmp_offset, which are assumed to have been * set in a prior call to _zran_inflate. */ if (inflate_use_offset(flags)) { cmp_offset = 0; uncmp_offset = 0; /* * If a non-zero offset has been specified, * search the index to see if we can start * inflating from a known location. */ if (offset > 0) { /* * In order to successfully decompress * data from the current uncompressed seek * location, we need to start decompressing * from the index point which preceeds it. 
*/ z_ret = _zran_get_point_at(index, offset, 1, &start); if (z_ret == ZRAN_GET_POINT_NOT_COVERED) return ZRAN_INFLATE_NOT_COVERED; if (z_ret == ZRAN_GET_POINT_EOF) return ZRAN_INFLATE_EOF; } /* * Start inflating from the index point * corresponding to the offset (or keep * the offsets at 0 if no point was found). */ if (start != NULL) { cmp_offset = start->cmp_offset; uncmp_offset = start->uncmp_offset; } } /* * If ZRAN_INFLATE_USE_OFFSET is not active, * we initialise from offsets which were * stored on the last call to _zran_inflate. */ else { cmp_offset = index->inflate_cmp_offset; uncmp_offset = index->inflate_uncmp_offset; } zran_log("initialising to inflate from " "cmp_offset=%llu, uncmp_offset=%llu\n", cmp_offset, uncmp_offset); /* * If ZRAN_INFLATE_INIT_READBUF, * allocate memory for reading * compressed data from the file. * The buffer is attached to the * zran_index_t->readbuf pointer. */ if (inflate_init_readbuf(flags)) { index->readbuf = calloc(1, index->readbuf_size); if (index->readbuf == NULL) goto fail; } /* * If ZRAN_INFLATE_CLEAR_READBUF_OFFSETS, * we clear any stored information about * the read buffer, and start reading * from/writing to it from the beginning. */ if (inflate_clear_readbuf_offsets(flags)) { index->readbuf_offset = 0; index->readbuf_end = 0; } /* * Otherwise, assume that there is already * some input (compressed) data in the * readbuf, and that index->readbuf_offset * and index->readbuf_end were set on a * prior call. * * - readbuf_offset tells us where in * readbuf the data starts * * - readbuf_end tells us where it ends. */ else { strm->next_in = index->readbuf + index->readbuf_offset; strm->avail_in = index->readbuf_end - index->readbuf_offset; } /* * Tell zlib where to store * the uncompressed data. */ strm->avail_out = len; strm->next_out = data; /* * If ZRAN_INFLATE_INIT_Z_STREAM is active, * initialise the zlib struct for inflation. 
* * If ZRAN_INFLATE_INIT_Z_STREAM is not * active, we assume that the inflation is * already initialised. */ if (inflate_init_stream(flags)) { /* * No index point - we need to start reading * from the beginning of the input file. If * starting from an index point, the * _zran_init_zlib_inflate function will seek * to the correct location in the file for us. */ if (start == NULL) { if (fseek_(index->fd, index->f, 0, SEEK_SET) != 0) { goto fail; } /* * In this situation, _zran_init_zlib_inflate * is going to expect a GZIP header, so make * sure we have some data for it to look at. */ if (_zran_read_data_from_file(index, strm, cmp_offset, uncmp_offset, index->readbuf_size) != 0) { goto fail; } } /* * If init_zlib_inflate skips over any input data * (e.g. gzip header), it returns the number of * bytes tyhat were read */ z_ret = _zran_init_zlib_inflate(index, strm, start); if (z_ret < 0) { goto fail; } cmp_offset += z_ret; _total_consumed += z_ret; if (start == NULL && add_stream_points) { if (_zran_add_point(index, 0, cmp_offset, 0, 0, 0, NULL) != 0) { goto fail; } } } /* * Keep going until we run out of space. */ while (strm->avail_out > 0) { /* * Make sure the input buffer contains * some data to be decompressed. */ z_ret = _zran_read_data_from_file(index, strm, cmp_offset, uncmp_offset, 1); /* EOF - there is no more data left to read */ if (z_ret == ZRAN_READ_DATA_EOF) { return_val = ZRAN_INFLATE_EOF; break; } else if (z_ret != 0) { goto fail; } /* * Decompress data until there's no data * left, or we've read enough bytes */ z_ret = Z_OK; while (strm->avail_in > 0) { /* * Initialise counters to calculate * how many bytes are input/output * during this call to inflate. 
*/ bytes_consumed = strm->avail_in; bytes_output = strm->avail_out; zran_log("Before inflate - avail_in=%u, avail_out=%u, " "cmp_offset=%lu, uncmp_offset=%lu\n", strm->avail_in, strm->avail_out, cmp_offset, uncmp_offset); /* * Inflate the block - the decompressed * data is output straight to the provided * data buffer. * * If ZRAN_INFLATE_STOP_AT_BLOCK is active, * Z_BLOCK tells inflate to stop inflating * at a compression block boundary. Otherwise, * inflate stops when it comes to the end of a * stream, or it runs out of input or output. */ if (inflate_stop_at_block(flags)) { z_ret = inflate(strm, Z_BLOCK); } else { z_ret = inflate(strm, Z_NO_FLUSH); } /* * Adjust our offsets according to what * was actually consumed/decompressed. */ bytes_consumed = bytes_consumed - strm->avail_in; bytes_output = bytes_output - strm->avail_out; cmp_offset += bytes_consumed; _total_consumed += bytes_consumed; uncmp_offset += bytes_output; _total_output += bytes_output; zran_log("After inflate - avail_in=%u, avail_out=%u, " "cmp_offset=%lu, uncmp_offset=%lu\n", strm->avail_in, strm->avail_out, cmp_offset, uncmp_offset); /* * Now we need to figure out what just happened. * * Z_BUF_ERROR indicates that the output buffer * is full; we clobber it though, as it makes the * code below a bit easier (and anyway, we can * tell if the output buffer is full by checking * strm->avail_out). */ if (z_ret == Z_BUF_ERROR) { z_ret = Z_OK; } /* * If z_ret is not Z_STREAM_END or * Z_OK, something has gone wrong. * * If the file comprises a sequence of * concatenated gzip streams, we will * encounter Z_STREAM_END before the end * of the file (where one stream ends and * the other begins). * * If at a new stream, we re-initialise * inflation on the next loop iteration. 
*/ if (z_ret != Z_OK && z_ret != Z_STREAM_END) { zran_log("zlib inflate failed (code: %i, msg: %s)\n", z_ret, strm->msg); goto fail; } /* * If we have not yet validated the current * GZIP stream, update its size and crc so * we can validate them against the size * recorded in the stream footer when we * get to it. */ if ((uncmp_offset > index->last_stream_ended) && index->validating && !(index->flags & ZRAN_SKIP_CRC_CHECK)) { index->stream_size += bytes_output; index->stream_crc32 = crc32(index->stream_crc32, strm->next_out - bytes_output, bytes_output); } /* * End of a block? If INFLATE_STOP_AT_BLOCK * is active, we want to stop at a compression * block boundary. * * If we used Z_BLOCK above, and inflate * encountered a block boundary, it indicates * this in the the strm->data_type field. */ if (inflate_stop_at_block(flags) && ((strm->data_type & 128) && !(strm->data_type & 64))) { zran_log("At block or stream boundary, " "stopping inflation\n"); return_val = ZRAN_INFLATE_BLOCK_BOUNDARY; break; } /* * We've found the end of file, or end of one * gzip stream. Validate the uncompressed * data (size/ CRC) against the gzip footer. * Then search for a new stream and, if we * find one, re-initialise inflation */ if (z_ret == Z_STREAM_END) { zran_log("End of gzip stream [%u]\n", strm->avail_in); /* * Make sure we have data in the input buffer * to read and vaildate the gzip footer and, * in case we are reading concatentaed * streams, to search for the next stream and * read its header. * * There is no way of knowing how much data we * need to read in here - there is no upper * bound on the amount of null padding bytes * that may be present in between, or at the * end of, a stream, and there is no upper * bound on the size of a gzip header. 
* So a critical assumption is made here, that * the size of the read buffer is large enough * to encompass all of: * * - the footer of a gzip stream, * - null padding after the end of a gzip * stream, and * - the header of the next gzip stream * * This assumption could be removed by changing * the way that data is loaded and buffered * from the file - e.g. we could load bytes in * one-by-one, skipping over null bytes, and * then parse the gzip header ourselves. But * this is far too much work for what is a very * edge-casey scenario. */ z_ret = _zran_read_data_from_file(index, strm, cmp_offset, uncmp_offset, index->readbuf_size); if (!((z_ret == 0) || (z_ret == ZRAN_READ_DATA_EOF))) { goto fail; } /* * If there is no more data, the footer is * missing, so the data must be corrupt. */ if (strm->avail_in < 8) { goto fail; } /* * _validate_stream reads and checks in the * gzip stream footer (the CRC32 and ISIZE * fields at the end of a gzip stream), and * _find_next_stream will skip over any * remaining bytes (e.g. null padding bytes) * until eof, or a new stream is found. * * The number of bytes that were read/ * skipped over are accumulated into off. */ off = 0; /* * If we have not yet validated this stream, * check that the CRC and uncompressed size in * the footer match what we have calculated */ if (uncmp_offset > index->last_stream_ended && index->validating) { z_ret = _zran_validate_stream(index, strm, &off); if (z_ret == ZRAN_VALIDATE_STREAM_INVALID) { error_return_val = ZRAN_INFLATE_CRC_ERROR; goto fail; } else if (z_ret != 0) { goto fail; } index->last_stream_ended = uncmp_offset; index->validating = 0; } /* Otherwise skip over the 8 byte GZIP footer */ else { strm->avail_in -= 8; strm->next_in += 8; off += 8; } /* * See if we have another concatenated gzip * stream. If we run out of input data here, * bad things will happen. Refer to the long * comment regarding the input buffer, above. 
*/ z_ret = _zran_find_next_stream(index, strm, &off); cmp_offset += off; _total_consumed += off; /* * If _zran_find_next_stream can't find * a new stream, we are either out of * compressed input data, or at eof. In * either case, break and let the outer * loop deal with it. */ if (z_ret == ZRAN_FIND_STREAM_NOT_FOUND) { break; } else if (z_ret != 0) { goto fail; } if (add_stream_points) { if (_zran_add_point(index, 0, cmp_offset, uncmp_offset, 0, 0, NULL) != 0) { goto fail; } } } /* * We've run out of space to store decompressed * data - this is the responsibility of the caller, * so bail out. */ if (strm->avail_out == 0) { zran_log("Output buffer full - stopping inflation\n"); /* * We return OUTPUT_FULL if we haven't * decompressed the requested number of * bytes, or ZRAN_INFLATE_STOP_AT_BLOCK * is active and we haven't yet found a * block. */ if (inflate_stop_at_block(flags) || _total_output < len) { return_val = ZRAN_INFLATE_OUTPUT_FULL; } break; } } /* * Some of the code above has decided that * it wants this _zran_inflate call to return. */ if (return_val != ZRAN_INFLATE_OK) { break; } } /* * If ZRAN_INFLATE_FREE_READBUF is * active, clear input buffer memory * and offsets. */ if (inflate_free_readbuf(flags)) { free(index->readbuf); index->readbuf = NULL; index->readbuf_offset = 0; index->readbuf_end = 0; } /* * Otherwise save the readbuf * offset for next time. */ else { index->readbuf_offset = index->readbuf_end - strm->avail_in; } /* * If ZRAN_INFLATE_FREE_Z_STREAM * is active, do just that. */ if (inflate_free_stream(flags)) { if (inflateEnd(strm) != Z_OK) goto fail; } /* * Update the total number of * bytes that were consumed/read */ *total_consumed = _total_consumed; *total_output = _total_output; /* * Update the compressed/uncompressed * offsets in case we need to use them * later. 
*/ index->inflate_cmp_offset = cmp_offset; index->inflate_uncmp_offset = uncmp_offset; zran_log("Inflate finished - consumed=%u, output=%u,\n" " cmp_offset=%llu, uncmp_offset=%llu \n\n", *total_consumed, *total_output, cmp_offset, uncmp_offset); /* Phew. */ return return_val; fail: if (index->readbuf != NULL) { free(index->readbuf); index->readbuf = NULL; index->readbuf_offset = 0; index->readbuf_end = 0; } return error_return_val; } /* * Expands the index to encompass the * compressed offset specified by 'until'. */ int _zran_expand_index(zran_index_t *index, uint64_t until) { /* * Used to store return code when * an error occurs. */ int error_return_val = ZRAN_EXPAND_INDEX_FAIL; /* * Used to store and check return values * from zlib and zran functions. */ int z_ret; /* Zlib stream struct */ z_stream strm; /* * Number of bytes read/decompressed * on each call to _zran_inflate. */ uint32_t bytes_consumed; uint32_t bytes_output; /* * Buffer to store uncompressed data, * size of said buffer, and current offset * into said buffef. We wrap the buffer * around to the beginning when it is * filled. * * Ideally, we only want to decompress * index->spacing bytes before creating a * new index point. But we may have to * decompress more than this before a * suitable location (a block/stream * boundary) is found, so we allocate * more space. */ uint8_t *data = NULL; uint32_t data_size = index->spacing * 4; uint32_t data_offset = 0; /* * _zran_inflate control flags. We need * to use different flags on the first * call - first_inflate is used to track * this. */ uint16_t inflate_flags; uint8_t first_inflate = 1; /* * Counters to keep track of where we are * in both the compressed and uncompressed * streams. */ uint64_t cmp_offset; uint64_t uncmp_offset; uint64_t last_uncmp_offset; /* * start is a reference to the last * point in the index when this function * is called. This is where we need * to start decompressing data from * before we can add more index points. 
* * last_created is a reference to the * most recent point that was added * to the index in this call to * _zran_expand_index. */ zran_point_t *start = NULL; zran_point_t *last_created = NULL; /* * In order to create a new index * point, we need to start reading * at the last index point, so that * we read enough data to initialise * the inflation. If we don't have * at least two points, we start * at the beginning of the file. */ start = NULL; if (index->npoints > 1) { start = &(index->list[index->npoints - 1]); /* * The index already covers the requested * offset. Nothing needs to be done. */ if (until <= start->cmp_offset) return 0; } /* * Allocate memory for the * uncompressed data buffer. */ data = calloc(1, data_size); if (data == NULL) goto fail; /* Let's do this. */ zran_log("_zran_expand_index(%llu)\n", until); /* * If the caller passed until == 0, * we force some data to be read. */ if (until == 0) { until = index->spacing; } /* * We start from the last point in * the index, or the beginning of * the file, if there are not enough * points in the index. */ if (start != NULL) { cmp_offset = start->cmp_offset; uncmp_offset = start->uncmp_offset; last_uncmp_offset = uncmp_offset; } else { cmp_offset = 0; uncmp_offset = 0; last_uncmp_offset = 0; } /* * Don't finish until we're at the end of the * file, or we've expanded the index past * the requested offset (and have created at * least one new index point - * last_created == NULL tells us whether a * point has been created). */ while ((cmp_offset < index->compressed_size) && (last_created == NULL || last_created->cmp_offset < until)) { /* * On the first call to _zran_inflate, we * tell it to initialise the zlib stream * struct, create a read buffer, and start * inflation from our starting point. 
*/ if (first_inflate) { first_inflate = 0; inflate_flags = (ZRAN_INFLATE_INIT_Z_STREAM | ZRAN_INFLATE_INIT_READBUF | ZRAN_INFLATE_USE_OFFSET | ZRAN_INFLATE_CLEAR_READBUF_OFFSETS | ZRAN_INFLATE_STOP_AT_BLOCK); } /* * On subsequent calls, we tell _zran_inflate * to just continue where it left off on the * previous call. */ else { inflate_flags = ZRAN_INFLATE_STOP_AT_BLOCK; } zran_log("Searching for next block boundary\n" " c=%llu, u=%llu,\n" " data_offset=%u, data_space=%u\n", cmp_offset, uncmp_offset, data_offset, data_size - data_offset); /* * We wrap the data buffer around to its * beginning by using some trickery with * the data_offset. By doing this, the * _zran_add_point function will be able * to retrieve the data associated with * an index point even if some of it it * is contained at the end of the data * buffer, and the rest at the beginning. */ z_ret = _zran_inflate(index, &strm, cmp_offset, inflate_flags, &bytes_consumed, &bytes_output, data_size - data_offset, data + data_offset, 1); cmp_offset += bytes_consumed; uncmp_offset += bytes_output; data_offset = (data_offset + bytes_output) % data_size; /* * update the last created offset on every iteration, * to catch any index points created by _zran_inflate */ if (index->npoints > 0) { last_created = &index->list[index->npoints - 1]; last_uncmp_offset = last_created->uncmp_offset; } /* * Has the output buffer been filled? * If so, we just continue - the * data_offset trickery means that we * can ask the _zran_inflate function * to just keep filling the buffer * until we find a block. */ if (z_ret == ZRAN_INFLATE_OUTPUT_FULL) continue; /* * If z_ret != ZRAN_INFLATE_EOF or * ZRAN_INFLATE_BLOCK_BOUNDARY, * something has gone wrong. 
*/ else if (z_ret != ZRAN_INFLATE_EOF && z_ret != ZRAN_INFLATE_BLOCK_BOUNDARY) { if (z_ret == ZRAN_INFLATE_CRC_ERROR) { error_return_val = ZRAN_EXPAND_INDEX_CRC_ERROR; } goto fail; } /* * If we're at the end of the file (z_ret * == ZRAN_INFLATE_EOF), or at a compress * block boundary, and index->spacing bytes * have passed since the last index point * that was created, we'll create a new * index point at this location. * * Note that the _zran_inflate function * also adds index points at the beginning * of the file, and at the beginning of all * other gzip streams, in the case of * concatenated streams (refer to its * add_stream_points argument). */ if (z_ret == ZRAN_INFLATE_EOF || uncmp_offset - last_uncmp_offset >= index->spacing) { if (_zran_add_point(index, strm.data_type & 7, cmp_offset, uncmp_offset, data_offset, data_size, data) != 0) { goto fail; } last_created = &index->list[index->npoints - 1]; last_uncmp_offset = uncmp_offset; } /* And if at EOF, we are done. */ if (z_ret == ZRAN_INFLATE_EOF) { break; } } /* * A final call to _zran_inflate, to clean * up read buffer and z_stream memory. */ z_ret = _zran_inflate(index, &strm, 0, (ZRAN_INFLATE_CLEAR_READBUF_OFFSETS | ZRAN_INFLATE_FREE_Z_STREAM | ZRAN_INFLATE_FREE_READBUF), &bytes_consumed, &bytes_output, 0, data, 0); if (z_ret != ZRAN_INFLATE_OK && z_ret != ZRAN_INFLATE_EOF) { if (z_ret == ZRAN_INFLATE_CRC_ERROR) { error_return_val = ZRAN_EXPAND_INDEX_CRC_ERROR; } goto fail; } /* * The index may have over-allocated * space for storing index points, so * here we free the unused memory. */ if (_zran_free_unused(index) != 0) { goto fail; } zran_log("Expansion finished (cmp_offset=%llu, last_created=%llu)\n", cmp_offset, last_created->cmp_offset); free(data); return ZRAN_EXPAND_INDEX_OK; fail: free(data); return error_return_val; } /* * Seek to the approximate location of the specified offset into * the uncompressed data stream. 
*/ int zran_seek(zran_index_t *index, int64_t offset, uint8_t whence, zran_point_t **point) { int result; zran_point_t *seek_point = NULL; zran_log("zran_seek(%lld, %i)\n", offset, whence); if (whence == SEEK_END && index->uncompressed_size == 0) { goto index_not_built; } /* * The offset passed in is signed, so * negative offsets are allowed. But * here we transform the offset to * positive, as _zran_get_point_with_expand * requires an absolute offset from the * beginning of the uncompressed stream. * * I am not currently taking into account * the overflow potential when converting * from int64 to uint64. */ /* * SEEK_END: seek relative to the * end of the uncompressed stream */ if (whence == SEEK_END) { offset += index->uncompressed_size; } /* * SEEK_CUR: seek relative to * the current file position. */ if (whence == SEEK_CUR) { offset += index->uncmp_seek_offset; } /* Bad input */ if (offset < 0) { goto fail; } /* * We implicitly allow seek(0) - if * not auto-building the index, * seek(0) would otherwwise fail. */ if (offset == 0) { index->uncmp_seek_offset = offset; } else { /* * Get the index point that * corresponds to this offset. */ result = _zran_get_point_with_expand(index, offset, 0, &seek_point); if (result == ZRAN_GET_POINT_EOF) goto eof; else if (result == ZRAN_GET_POINT_NOT_COVERED) goto not_covered; else if (result == ZRAN_GET_POINT_CRC_ERROR) goto crcerror; else if (result != ZRAN_GET_POINT_OK) goto fail; /* * transform into an offset * into the compresesd stream */ index->uncmp_seek_offset = offset; offset = seek_point->cmp_offset; /* * This index point is not byte-aligned. * Adjust the offset accordingly. */ if (seek_point->bits > 0) offset -= 1; } /* * The caller wants a ref to the * index point corresponding to * the seek location. 
*/ if (point != NULL) { *point = seek_point; } if (fseek_(index->fd, index->f, offset, SEEK_SET) != 0) goto fail; return ZRAN_SEEK_OK; crcerror: return ZRAN_SEEK_CRC_ERROR; fail: return ZRAN_SEEK_FAIL; index_not_built: return ZRAN_SEEK_INDEX_NOT_BUILT; not_covered: return ZRAN_SEEK_NOT_COVERED; eof: index->uncmp_seek_offset = index->uncompressed_size; return ZRAN_SEEK_EOF; } /* Return the current seek position in the uncompressed data stream. */ uint64_t zran_tell(zran_index_t *index) { return index->uncmp_seek_offset; } /* Read len bytes from the uncompressed data stream, storing them in buf. */ int64_t zran_read(zran_index_t *index, void *buf, uint64_t len) { /* Used to store/check return values. */ int ret; /* * Used to store error code for return * if an error occurs */ int error_return_val = ZRAN_READ_FAIL; /* * Number of bytes we try to read, and * number of bytes actually read/output * on each call to _zran_inflate. */ uint64_t bytes_to_read; uint32_t bytes_consumed; uint32_t bytes_output; /* * _zran_inflate control flags. We need * to pass different flags on thefirst * call to _zran_inflate. */ uint16_t inflate_flags; uint8_t first_inflate = 1; /* * Counters keeping track of the current * location in both the compressed and * uncompressed streams, and the total * number of bytes read. */ uint64_t uncmp_offset; uint64_t cmp_offset; uint64_t total_read; /* * Zlib stream struct and starting * index point for the read.. */ z_stream strm; zran_point_t *start = NULL; /* * Memory used to store bytes that we skip * over before reaching the appropriate * point in the uncompressed data stream. * * to_discard is used to store the number of * bytes that we want to discard on a single * call to _zran_inflate (which is limited by * the discard buffer size). * * total_discarded keeps track of the total * number of bytes discarded so far. * * discard_size is the size of the discard * * buffer. 
Ideally we will only have to * decompress (on average) spacing / 2 bytes * before reaching the seek location, but this * isn't a guarantee, so we allocate more to * reduce the number of reads that are required. */ uint8_t *discard = NULL; uint64_t to_discard = 0; uint64_t total_discarded = 0; uint64_t discard_size = index->spacing * 4; if (len == 0) return 0; if (len > INT64_MAX) goto fail; zran_log("zran_read(%llu, %lu)\n", len, index->uncmp_seek_offset); /* * Search for the index point that corresponds to * our current seek location in the uncompressed * data stream. Reading from the start of file is * always allowed, even if the index does not * contain any points. */ if (index->uncmp_seek_offset == 0) { cmp_offset = 0; uncmp_offset = 0; } else { ret = _zran_get_point_with_expand(index, index->uncmp_seek_offset, 0, &start); if (ret == ZRAN_GET_POINT_EOF) goto eof; if (ret == ZRAN_GET_POINT_NOT_COVERED) goto not_covered; else if (ret != ZRAN_GET_POINT_OK) { if (ret == ZRAN_GET_POINT_CRC_ERROR) { error_return_val = ZRAN_READ_CRC_ERROR; } goto fail; } cmp_offset = start->cmp_offset; uncmp_offset = start->uncmp_offset; } /* * We have to start decompressing from * the index point that preceeds the seek * location, so we need to skip over bytes * until we get to that location. We use * the discard buffer to store those bytes. */ discard = malloc(discard_size); if (discard == NULL) { goto fail; } /* * Inflate and discard data until we * reach the current seek location * into the uncompresesd data stream. */ first_inflate = 1; total_discarded = 0; while (uncmp_offset < index->uncmp_seek_offset) { /* * On the first call to _zran_inflate, * we tell it to initialise the z_stream, * and create a read buffer. */ if (first_inflate) { first_inflate = 0; inflate_flags = (ZRAN_INFLATE_INIT_Z_STREAM | ZRAN_INFLATE_INIT_READBUF | ZRAN_INFLATE_CLEAR_READBUF_OFFSETS | ZRAN_INFLATE_USE_OFFSET); } /* * On subsequent calls, we just tell * _zran_inflate to continue where * it left off. 
*/ else { inflate_flags = 0; } /* * Don't read past the uncompressed seek * location - at this point, we will need * to stop discarding bytes, and start * fulfilling the read request. */ to_discard = index->uncmp_seek_offset - uncmp_offset; if (to_discard > discard_size) to_discard = discard_size; zran_log("Discarding %llu bytes (%llu < %llu)\n", to_discard, uncmp_offset, index->uncmp_seek_offset); ret = _zran_inflate(index, &strm, cmp_offset, inflate_flags, &bytes_consumed, &bytes_output, to_discard, discard, 0); /* * _zran_inflate should return 0 if * it runs out of output space (which * is ok), or it has read enough bytes * (which is perfect). Any other * return code means that something * has gone wrong. */ if (ret != ZRAN_INFLATE_OUTPUT_FULL && ret != ZRAN_INFLATE_EOF && ret != ZRAN_INFLATE_OK) { if (ret == ZRAN_INFLATE_CRC_ERROR) { error_return_val = ZRAN_READ_CRC_ERROR; } goto fail; } cmp_offset += bytes_consumed; uncmp_offset += bytes_output; total_discarded += bytes_output; } /* * Sanity check - we should be at the * correct location in the uncompressed * stream. * * TODO What happens here if we are at EOF? */ if (uncmp_offset != index->uncmp_seek_offset) goto fail; zran_log("Discarded %llu bytes, ready to " "read from %llu (== %llu)\n", total_discarded, uncmp_offset, index->uncmp_seek_offset); /* * At this point, we are ready to inflate * from the uncompressed seek location. */ total_read = 0; while (total_read < len) { /* * If we started at the correct location, * the discard loop above will not have * executed, and _zran_inflate will not * have initialised itself. So we repeat * the flag control stuff here. */ if (first_inflate) { first_inflate = 0; inflate_flags = (ZRAN_INFLATE_INIT_Z_STREAM | ZRAN_INFLATE_INIT_READBUF | ZRAN_INFLATE_CLEAR_READBUF_OFFSETS | ZRAN_INFLATE_USE_OFFSET); } else { inflate_flags = 0; } /* * _zran_inflate only allows us to * read max(uint32_t) at a time. If * len is greater than this, we need * to split it into multiple calls. 
*/ bytes_to_read = len - total_read; if (bytes_to_read > 4294967295) { bytes_to_read = 4294967295; } ret = _zran_inflate(index, &strm, cmp_offset, inflate_flags, &bytes_consumed, &bytes_output, bytes_to_read, (uint8_t *)(buf) + total_read, 0); cmp_offset += bytes_consumed; uncmp_offset += bytes_output; total_read += bytes_output; if (ret == ZRAN_INFLATE_EOF) break; else if (ret == ZRAN_INFLATE_OUTPUT_FULL) { /* * We might be reading 2**32 sized chunks * of data on each call to _zran_inflate. */ if (bytes_to_read == len) { break; } } else if (ret != ZRAN_INFLATE_OK) { if (ret == ZRAN_INFLATE_CRC_ERROR) { error_return_val = ZRAN_READ_CRC_ERROR; } goto fail; } zran_log("Read %u bytes (%llu / %llu)\n", bytes_output, total_read, len); } /* * A final call to _zran_inflate, * to clean up memory */ ret = _zran_inflate(index, &strm, 0, (ZRAN_INFLATE_CLEAR_READBUF_OFFSETS | ZRAN_INFLATE_FREE_Z_STREAM | ZRAN_INFLATE_FREE_READBUF), &bytes_consumed, &bytes_output, 0, discard, 0); if (ret != ZRAN_INFLATE_OK && ret != ZRAN_INFLATE_EOF) { if (ret == ZRAN_INFLATE_CRC_ERROR) { error_return_val = ZRAN_READ_CRC_ERROR; } goto fail; } /* * Update the current uncompressed * seek position. */ index->uncmp_seek_offset += total_read; zran_log("Read succeeded - %llu bytes read [compressed offset: %ld]\n", total_read, ftell_(index->fd, index->f)); free(discard); return total_read; not_covered: return ZRAN_READ_NOT_COVERED; eof: return ZRAN_READ_EOF; fail: if (discard != NULL) free(discard); return error_return_val; } /* * Store checkpoint information from index to file fd. File should be opened * in binary write mode. */ int zran_export_index(zran_index_t *index, FILE *fd, PyObject *f) { /* * TODO: Endianness check for fwrite calls. Prefer little-endian to be * consistent with gzip library. */ /* Used for checking return value of fwrite calls. */ size_t f_ret; /* Used for iterating over elements of zran_index_t.list. 
*/ zran_point_t *point; zran_point_t *list_end; /* File flags, currently not used. Also used as a temporary variable. */ uint8_t flags = 0; zran_log("zran_export_index: (%lu, %lu, %u, %u, %u)\n", index->compressed_size, index->uncompressed_size, index->spacing, index->window_size, index->npoints); /* Write ID and version, and check for errors. */ f_ret = fwrite_(ZRAN_INDEX_FILE_ID, sizeof(ZRAN_INDEX_FILE_ID), 1, fd, f); if (ferror_(fd, f)) goto fail; if (f_ret != 1) goto fail; f_ret = fwrite_(&ZRAN_INDEX_FILE_VERSION, 1, 1, fd, f); if (ferror_(fd, f)) goto fail; if (f_ret != 1) goto fail; /* Write flags (currently unused) */ f_ret = fwrite_(&flags, 1, 1, fd, f); if (ferror_(fd, f)) goto fail; if (f_ret != 1) goto fail; /* Write compressed size, and check for errors. */ f_ret = fwrite_(&index->compressed_size, sizeof(index->compressed_size), 1, fd, f); if (ferror_(fd, f)) goto fail; if (f_ret != 1) goto fail; /* Write uncompressed size, and check for errors. */ f_ret = fwrite_(&index->uncompressed_size, sizeof(index->uncompressed_size), 1, fd, f); if (ferror_(fd, f)) goto fail; if (f_ret != 1) goto fail; /* Write spacing, and check for errors. */ f_ret = fwrite_(&index->spacing, sizeof(index->spacing), 1, fd, f); if (ferror_(fd, f)) goto fail; if (f_ret != 1) goto fail; /* Write window size, and check for errors. */ f_ret = fwrite_(&index->window_size, sizeof(index->window_size), 1, fd, f); if (ferror_(fd, f)) goto fail; if (f_ret != 1) goto fail; /* Write number of points, and check for errors. */ f_ret = fwrite_(&index->npoints, sizeof(index->npoints), 1, fd, f); if (ferror_(fd, f)) goto fail; if (f_ret != 1) goto fail; /* * We will make two passes over points list now. In the first pass, offset * mapping information of each point will be written. In the second pass, * checkpoint snapshot data will be written. This will keep offsets bundled * together, which enables user to read all offset mappings in one pass. 
*/ /* Write all points iteratively for checkpoint offset mapping. */ point = index->list; list_end = index->list + index->npoints; while (point < list_end) { /* Write compressed offset, and check for errors. */ f_ret = fwrite_(&point->cmp_offset, sizeof(point->cmp_offset), 1, fd, f); if (ferror_(fd, f)) goto fail; if (f_ret != 1) goto fail; /* Write uncompressed offset, and check for errors. */ f_ret = fwrite_(&point->uncmp_offset, sizeof(point->uncmp_offset), 1, fd, f); if (ferror_(fd, f)) goto fail; if (f_ret != 1) goto fail; /* Write bit offset, and check for errors. */ f_ret = fwrite_(&point->bits, sizeof(point->bits), 1, fd, f); if (ferror_(fd, f)) goto fail; if (f_ret != 1) goto fail; /* Write data flag, and check for errors. */ flags = (point->data != NULL) ? 1 : 0; f_ret = fwrite_(&flags, 1, 1, fd, f); if (ferror_(fd, f)) goto fail; if (f_ret != 1) goto fail; zran_log("zran_export_index: (p%lu, %lu, %lu, %u, %u)\n", (index->npoints - (list_end - point)), // point index point->cmp_offset, point->uncmp_offset, point->bits, flags); point++; } /* * Now write out the window data for every point. No data is written for * points which don't have any data (e.g. at stream boundaries). */ point = index->list; list_end = index->list + index->npoints; while (point < list_end) { if (point->data == NULL) { point++; continue; } /* Write checkpoint data, and check for errors. */ f_ret = fwrite_(point->data, index->window_size, 1, fd, f); if (ferror_(fd, f)) goto fail; if (f_ret != 1) goto fail; /* Print first and last three bytes of the checkpoint window. */ zran_log("zran_export_index: " "(%lu, [%02x %02x %02x...%02x %02x %02x])\n", (index->npoints - (list_end - point)), // point index point->data[0], point->data[1], point->data[2], point->data[index->window_size - 3], point->data[index->window_size - 2], point->data[index->window_size - 1]); /* Done with this point. Proceed to next one. 
*/ point++; } zran_log("zran_export_index: done\n"); /* * It is important to flush written file when done, since underlying file * descriptor can be closed by Python code before having a chance to flush. */ f_ret = fflush_(fd, f); if (ferror_(fd, f)) goto fail; if (f_ret != 0) goto fail; return ZRAN_EXPORT_OK; fail: return ZRAN_EXPORT_WRITE_ERROR; } /* * Load checkpoint information from file fd to index. File should be opened in * binary read mode. */ int zran_import_index(zran_index_t *index, FILE *fd, PyObject *f) { /* Used for checking return value of fread calls. */ size_t f_ret; /* Return value of function if a failure happens. */ int fail_ret; /* Used for iterating over elements of zran_index_t.list. */ uint64_t i; zran_point_t *point; zran_point_t *list_end; /* * Used to store flags for each point - allocated once we * know how many points there are. */ uint8_t *dataflags = NULL; /* * Used for checking file ID, version, and * flags at the beginning of the file. Flags * is also used as a temporary variable. */ char file_id[sizeof(ZRAN_INDEX_FILE_ID)]; uint8_t version; uint8_t flags; /* * Data fields that will be read from the file. They aren't stored directly * to index struct to keep original index in case of any failures while * reading those data. */ uint64_t compressed_size; uint64_t uncompressed_size; uint32_t spacing; uint32_t window_size; uint32_t npoints; zran_point_t *new_list = NULL; /* CRC validation is currently not possible on an imported index */ index->flags |= ZRAN_SKIP_CRC_CHECK; /* Check if file is read only. */ if (!is_readonly(fd, f)) goto fail; /* Read ID, and check for file errors and EOF. */ f_ret = fread_(file_id, sizeof(file_id), 1, fd, f); if (feof_(fd, f, f_ret)) goto eof; if (ferror_(fd, f)) goto read_error; if (f_ret != 1) goto read_error; /* Verify file ID. 
*/ if (memcmp(file_id, ZRAN_INDEX_FILE_ID, sizeof(file_id))) goto unknown_format; /* Read file format version */ f_ret = fread_(&version, 1, 1, fd, f); if (feof_(fd, f, f_ret)) goto eof; if (ferror_(fd, f)) goto read_error; if (f_ret != 1) goto read_error; /* This file is too new for us to cope */ if (version > ZRAN_INDEX_FILE_VERSION) goto unsupported_version; /* Read flags (currently unused) */ f_ret = fread_(&flags, 1, 1, fd, f); if (feof_(fd, f, f_ret)) goto eof; if (ferror_(fd, f)) goto read_error; if (f_ret != 1) goto read_error; /* Read compressed size, and check for file errors and EOF. */ f_ret = fread_(&compressed_size, sizeof(compressed_size), 1, fd, f); if (feof_(fd, f, f_ret)) goto eof; if (ferror_(fd, f)) goto read_error; if (f_ret != 1) goto read_error; /* * Compare compressed_size in the index file to the existing size in * the current index (set in zran_init), if they don't match this means * this index file is not created for this compressed file. */ if (compressed_size != index->compressed_size) goto inconsistent; if (feof_(fd, f, f_ret)) goto eof; /* Read uncompressed size, and check for file errors and EOF. */ f_ret = fread_(&uncompressed_size, sizeof(uncompressed_size), 1, fd, f); if (feof_(fd, f, f_ret)) goto eof; if (ferror_(fd, f)) goto read_error; if (f_ret != 1) goto read_error; /* * Uncompressed size may not be set in either current index or exported * file, or both. Therefore, they are compared only if it's set in both. */ if (uncompressed_size != 0 && index->uncompressed_size != 0 && index->uncompressed_size != uncompressed_size) goto inconsistent; /* Read spacing, and check for file errors and EOF. */ f_ret = fread_(&spacing, sizeof(spacing), 1, fd, f); if (feof_(fd, f, f_ret)) goto eof; if (ferror_(fd, f)) goto read_error; if (f_ret != 1) goto read_error; /* Read window size, and check for file errors and EOF. 
*/ f_ret = fread_(&window_size, sizeof(window_size), 1, fd, f); if (feof_(fd, f, f_ret)) goto eof; if (ferror_(fd, f)) goto read_error; if (f_ret != 1) goto read_error; /* * Make sanity checks for window size and spacing. These are similar to * sanity checks done in zran_init. */ if (window_size < 32768) goto fail; if (spacing < window_size) goto fail; /* Read number of points, and check for file errors and EOF. */ f_ret = fread_(&npoints, sizeof(npoints), 1, fd, f); if (feof_(fd, f, f_ret)) goto eof; if (ferror_(fd, f)) goto read_error; if (f_ret != 1) goto read_error; zran_log("zran_import_index: (%u, %lu, %lu, %u, %u, %u)\n", version, compressed_size, uncompressed_size, spacing, window_size, npoints); /* * At this step, the number of points is known. Allocate space for new list * of points. This pointer should be cleaned up before exit in case of * failure. * * The index file is allowed to contain 0 points, in which case we * initialise the point list to 8 (same as in zran_init). */ new_list = calloc(1, sizeof(zran_point_t) * max(npoints, 8)); if (new_list == NULL) goto memory_error; /* * Allocate space for the data flag for each point - whether or not * there is data associated with it */ dataflags = calloc(npoints, 1); if (dataflags == NULL) goto memory_error; /* Read new points iteratively for reading offset mapping. */ for (i = 0, point = new_list; i < npoints; i++, point++) { /* Read compressed offset, and check for errors. */ f_ret = fread_(&point->cmp_offset, sizeof(point->cmp_offset), 1, fd, f); if (feof_(fd, f, f_ret)) goto eof; if (ferror_(fd, f)) goto read_error; if (f_ret != 1) goto read_error; /* Read uncompressed offset, and check for errors. */ f_ret = fread_(&point->uncmp_offset, sizeof(point->uncmp_offset), 1, fd, f); if (feof_(fd, f, f_ret)) goto eof; if (ferror_(fd, f)) goto read_error; if (f_ret != 1) goto read_error; /* Read bit offset, and check for errors. 
*/ f_ret = fread_(&point->bits, sizeof(point->bits), 1, fd, f); if (feof_(fd, f, f_ret)) goto eof; if (ferror_(fd, f)) goto read_error; if (f_ret != 1) goto read_error; /* Read data flag (added in version 1), and check for errors. */ if (version >= 1) { f_ret = fread_(&flags, 1, 1, fd, f); if (feof_(fd, f, f_ret)) goto eof; if (ferror_(fd, f)) goto read_error; if (f_ret != 1) goto read_error; /* * The data flag determines whether or not any window data * is associated with this point. We set point->data to 1 * to indicate to the loop below that this point has data * to be loaded. */ } /* * In index file version 0, the first point * has no data, but all other points do. */ else { flags = (point == new_list) ? 0 : 1; } dataflags[i] = flags; zran_log("zran_import_index: (p%lu, %lu, %lu, %u, %u)\n", i, point->cmp_offset, point->uncmp_offset, point->bits, flags); } /* * Now loop through and load the window data for all index points. */ for (i = 0, point = new_list; i < npoints; i++, point++) { /* * There is no data associated with this point - it is either * at the beginning of the file, or on a stream boundary. */ if (dataflags[i] == 0) { continue; } /* * Allocate space for checkpoint data. These pointers in each point * should be cleaned up in case of any failures. */ point->data = calloc(1, window_size); if (point->data == NULL) goto memory_error; /* * Read checkpoint data, and check for errors. End of file can be * reached just after the last element, so it's not an error for * the last element. */ f_ret = fread_(point->data, window_size, 1, fd, f); if (feof_(fd, f, f_ret) && i < npoints - 1) goto eof; if (ferror_(fd, f)) goto read_error; if (f_ret != 1) goto read_error; /* * TODO: If there are still more data after importing is done, it * is silently ignored. It might be handled by other means. */ /* Print first and last three bytes of the checkpoint window. 
*/ zran_log("zran_import_index:" "(%lu, [%02x %02x %02x...%02x %02x %02x])\n", i, point->data[0], point->data[1], point->data[2], point->data[window_size - 3], point->data[window_size - 2], point->data[window_size - 1]); } /* There are no errors, it's safe to overwrite existing index data now. */ /* If a new uncompressed_size is read, update current index. */ if (index->uncompressed_size == 0 && uncompressed_size != 0) { index->uncompressed_size = uncompressed_size; } /* Overwrite spacing. */ if (index->spacing != spacing) { index->spacing = spacing; } /* Overwrite window size. */ if (index->window_size != window_size) { index->window_size = window_size; } /* * Now, we will release current checkpoint list of the index, and then * point to the new list. */ /* * Initialize point to the second element of the list, and list_end to the * end of the list. We initialize to the second element, because first * element does not keep checkpoint data. */ point = index->list + 1; list_end = index->list + index->npoints; while (point < list_end) { free(point->data); point++; } /* Now release the old list. */ free(index->list); /* The old list is dead, long live the new list! */ index->list = new_list; index->npoints = npoints; /* * Let's not forget to update the size as well. * If npoints is 0, the list will have been * initialised to allow space for 8 points. */ index->size = max(npoints, 8); zran_log("zran_import_index: done\n"); free(dataflags); return ZRAN_IMPORT_OK; /* For each failure case, we assign return value and then clean up. 
*/ fail: fail_ret = ZRAN_IMPORT_FAIL; goto cleanup; eof: fail_ret = ZRAN_IMPORT_EOF; goto cleanup; read_error: fail_ret = ZRAN_IMPORT_READ_ERROR; goto cleanup; inconsistent: fail_ret = ZRAN_IMPORT_INCONSISTENT; goto cleanup; memory_error: fail_ret = ZRAN_IMPORT_MEMORY_ERROR; goto cleanup; unknown_format: fail_ret = ZRAN_IMPORT_UNKNOWN_FORMAT; goto cleanup; unsupported_version: fail_ret = ZRAN_IMPORT_UNSUPPORTED_VERSION; goto cleanup; cleanup: if (new_list != NULL) { /* * Initialize point to the second element of the list, and list_end to * the end of the list. We initialize to the second element, because * first element does not keep checkpoint data. */ point = new_list + 1; list_end = new_list + npoints; /* * Release until the end of list or the first NULL data pointer, * whichever comes first. */ while (point < list_end && point->data != NULL) { free(point->data); point++; } /* Release the list itself. */ free(new_list); } if (dataflags != NULL) { free(dataflags); } return fail_ret; } indexed_gzip-1.6.4/indexed_gzip/__init__.py0000644000175000017500000000102014133320576020300 0ustar nileshnilesh#!/usr/bin/env python # # __init__.py - The indexed_gzip namespace. # """The indexed_gzip namespace. """ from .indexed_gzip import (_IndexedGzipFile, # noqa IndexedGzipFile, open, NotCoveredError, NoHandleError, ZranError) SafeIndexedGzipFile = IndexedGzipFile """Alias for ``IndexedGzipFile``, to preserve compatibility with older versions of ``nibabel``. 
""" __version__ = '1.6.4' indexed_gzip-1.6.4/.ci/0000755000175000017500000000000014133320576014176 5ustar nileshnileshindexed_gzip-1.6.4/.ci/install_32bit_dependencies.sh0000755000175000017500000000102114133320576021706 0ustar nileshnilesh#!/usr/bin/env bash apt-get install -y \ build-essential \ software-properties-common \ zlib1g \ zlib1g-dev if [ "$PYTHON_VERSION" == "2.7" ]; then PACKAGES="python-pip python-virtualenv" else PACKAGES="python3-pip python${PYTHON_VERSION}-venv" fi add-apt-repository -y ppa:deadsnakes/ppa apt-get update -y apt-get install -y \ python"$PYTHON_VERSION" \ python"$PYTHON_VERSION"-dev \ $PACKAGES indexed_gzip-1.6.4/.ci/create_test_env.sh0000755000175000017500000000144214133320576017710 0ustar nileshnilesh#!/usr/bin/env bash # Set up a virtual environment with build and # run-time dependencies for indexed_gzip. # set -e set -x envdir="$1" thisdir=$(cd $(dirname "$0") && pwd) # NUMPY= if [[ -n "$NUMPY" ]]; then NUMPY="numpy==$NUMPY.*" else NUMPY="numpy" fi # NIBABEL= if [[ -n "$NIBABEL" ]]; then NIBABEL="nibabel==$NIBABEL.*" else NIBABEL="nibabel" fi if [[ "$USING_OS_PYTHON" != "1" ]]; then pip install virtualenv fi if [[ "$PYTHON_VERSION" == "2.7" ]]; then virtualenv "$envdir" elif [[ "$USING_OS_PYTHON" == "1" ]]; then python"$PYTHON_VERSION" -m venv "$envdir" else python -m venv "$envdir" fi source $thisdir/activate_env.sh "$envdir" pip install wheel setuptools pip install cython pytest coverage pytest-cov "$NUMPY" "$NIBABEL" indexed_gzip-1.6.4/.ci/build_dev_indexed_gzip.sh0000755000175000017500000000042514133320576021224 0ustar nileshnilesh#!/usr/bin/env bash # # Build a test version of indexed_gzip. 
set -e envdir="$1" thisdir=$(cd $(dirname "$0") && pwd) source $thisdir/activate_env.sh "$envdir" # enable line tracing for cython # modules - see setup.py export INDEXED_GZIP_TESTING=1 python setup.py develop indexed_gzip-1.6.4/.ci/run_tests.sh0000755000175000017500000000211514133320576016562 0ustar nileshnilesh#!/bin/bash # # Run indexed_gzip unit tests. Assumes that # python setup.py develop has been run. set -e envdir="$1" thisdir=$(cd $(dirname "$0") && pwd) source $thisdir/activate_env.sh "$envdir" # NITERS= # (see conftest.py) if [[ -n "$NITERS" ]]; then NITERS="--niters $NITERS" fi # NELEMS= # (see conftest.py) if [[ -n "$NELEMS" ]]; then NELEMS="--nelems $NELEMS" fi # No coverage on windows, because coverage or # pytest-cov seem to have trouble with threading/ # multiproc, which causes the coverage report # generation to sporadically fail # # https://github.com/pytest-dev/pytest-cov/issues/406 if [[ "$PLATFORM" == "windows"* ]]; then EXTRA_ARGS="$EXTRA_ARGS --no-cov" fi python -m indexed_gzip.tests \ -c setup.cfg \ --cov-config=./.coveragerc \ -v -s \ -m "$TEST_SUITE" \ -k "$TEST_PATTERN" \ $NELEMS \ $NITERS \ $EXTRA_ARGS indexed_gzip-1.6.4/.ci/download_zlib.sh0000755000175000017500000000077014133320576017370 0ustar nileshnilesh#!/usr/bin/env bash # # Download zlib sources # set -e curl -o zlib.tar.gz https://www.zlib.net/zlib-1.2.11.tar.gz tar -xzf zlib.tar.gz ZLIB_HOME=$(pwd)/zlib-1.2.11 # if windows, turn /drive/path/to/zlib into # drive:/path/to/zlib. 
if [[ "$PLATFORM" == "windows"* ]]; then drive=$(echo "$ZLIB_HOME" | cut -d / -f 2) offset=$(expr ${#drive} + 2) ZLIB_HOME="${drive}:/${ZLIB_HOME:$offset}" fi echo "Setting ZLIB_HOME: $ZLIB_HOME" # used by setup.py echo "ZLIB_HOME=$ZLIB_HOME" >> "$GITHUB_ENV" indexed_gzip-1.6.4/.ci/build_wheels.sh0000755000175000017500000000245514133320576017211 0ustar nileshnilesh#!/usr/bin/env bash # Make sure cython is available on all platforms # Numpy is required to build the test modules export CIBW_BEFORE_BUILD="pip install cython numpy" # Make sure zlib headers are available on linux export CIBW_BEFORE_ALL_LINUX="yum install -y zlib-devel" # ZLIB is compiled into indexed_gzip on windwos - # see .ci/download_zlib.sh and setup.py export CIBW_ENVIRONMENT_WINDOWS="ZLIB_HOME='$ZLIB_HOME'" # Run quick test suite on built wheels. We need # cython for the Cython.Coverage plugin. export CIBW_TEST_REQUIRES="cython pytest pytest-cov coverage numpy nibabel" # Disable pypy and win+py27 builds export CIBW_SKIP="pp* cp27-win*" # Pytest makes it *very* awkward to run tests # from an installed package, and still find/ # interpret a conftest.py file correctly. Also # disabling coverage reporting, because the # .coveragerc file doesn't seem to be found # correctly. echo '#!/usr/bin/env bash' > testcmd echo 'cp $1/.coveragerc $1/setup.cfg .' 
>> testcmd echo 'python -m indexed_gzip.tests -c setup.cfg -m "not slow_test" --no-cov' >> testcmd chmod a+x testcmd export CIBW_TEST_COMMAND="bash {project}/testcmd {project}" python -m pip install cibuildwheel==1.* python -m cibuildwheel --output-dir ./dist indexed_gzip-1.6.4/.ci/build_test_wheel.sh0000755000175000017500000000035014133320576020055 0ustar nileshnilesh#!/usr/bin/env bash # # Builds a binary wheel for indexed_gzip set -e envdir="$1" thisdir=$(cd $(dirname "$0") && pwd) source $thisdir/activate_env.sh "$envdir" pip install twine python setup.py bdist_wheel twine check dist/* indexed_gzip-1.6.4/.ci/activate_env.sh0000755000175000017500000000015514133320576017206 0ustar nileshnilesh#!/usr/bin/env bash set -e envdir="$1" source "$envdir"/bin/activate || source "$envdir"/Scripts/activate indexed_gzip-1.6.4/README.md0000644000175000017500000002076614133320576015017 0ustar nileshnilesh# indexed_gzip [![PyPi version](https://img.shields.io/pypi/v/indexed_gzip.svg)](https://pypi.python.org/pypi/indexed_gzip/) [![Anaconda version](https://anaconda.org/conda-forge/indexed_gzip/badges/version.svg)](https://anaconda.org/conda-forge/indexed_gzip/)![Test status](https://github.com/pauldmccarthy/indexed_gzip/actions/workflows/master.yaml/badge.svg) *Fast random access of gzip files in Python* * [Overview](#overview) * [Installation](#installation) * [Usage](#usage) * [Using with `nibabel`](#using-with-nibabel) * [Index import/export](#index-import-export) * [Write support](#write-support) * [Performance](#performance) * [Acknowledgements](#acknowledgements) * [License](#license) ## Overview The `indexed_gzip` project is a Python extension which aims to provide a drop-in replacement for the built-in Python `gzip.GzipFile` class, the `IndexedGzipFile`. `indexed_gzip` was written to allow fast random access of compressed [NIFTI](http://nifti.nimh.nih.gov/) image files (for which GZIP is the de-facto compression standard), but will work with any GZIP file. 
`indexed_gzip` is easy to use with `nibabel` (http://nipy.org/nibabel/). The standard `gzip.GzipFile` class exposes a random access-like interface (via its `seek` and `read` methods), but every time you seek to a new point in the uncompressed data stream, the `GzipFile` instance has to start decompressing from the beginning of the file, until it reaches the requested location. An `IndexedGzipFile` instance gets around this performance limitation by building an index, which contains *seek points*, mappings between corresponding locations in the compressed and uncompressed data streams. Each seek point is accompanied by a chunk (32KB) of uncompressed data which is used to initialise the decompression algorithm, allowing us to start reading from any seek point. If the index is built with a seek point spacing of 1MB, we only have to decompress (on average) 512KB of data to read from any location in the file. ## Intended use You may find `indexed_gzip` useful if you need to read from large GZIP files. A major advantage of `indexed_gzip` is that it will work with any GZIP file. However, if you have control over the creation of your GZIP files, you may wish to consider some alternatives: * [`mgzip`](https://github.com/vinlyx/mgzip/) provides an accelerated GZIP compression and decompression library. * Compression formats other than GZIP, such as `bzip2` and `xz`, have better support for random access. 
## Installation `indexed_gzip` is available on [PyPi](https://pypi.python.org/pypi) - to install, simply type: ```sh pip install indexed_gzip ``` You can also install `indexed_gzip` from conda-forge: ```sh conda install -c conda-forge indexed_gzip ``` To compile `indexed_gzip`, make sure you have [cython](http://cython.org/) installed (and `numpy` if you want to compile the tests), and then run: ```sh python setup.py develop ``` To run the tests, type the following; you will need `numpy`, `nibabel`, `pytest`, `pytest-cov`, and `coverage` installed: ```sh python -m indexed_gzip.tests ``` ## Usage You can use the `indexed_gzip` module directly: ```python import indexed_gzip as igzip # You can create an IndexedGzipFile instance # by specifying a file name, or an open file # handle. For the latter use, the file handle # must be opened in read-only binary mode. # Write support is currently non-existent. myfile = igzip.IndexedGzipFile('big_file.gz') some_offset_into_uncompressed_data = 234195 # The index will be automatically # built on-demand when seeking. myfile.seek(some_offset_into_uncompressed_data) data = myfile.read(1048576) ``` ## Using with in-memory data You can use `indexed_gzip` with any Python file-like object. For example: ```python import io import indexed_gzip as igzip # Load some gzip data from somewhere with open('my_file.gz') as f: data = f.read() # Create an IndexedGzipFile based on the # in-memory data buffer gzf = igzip.IndexedGzipFile(fileobj=io.BytesIO(data)) uncompressed = gzf.read(1048576) ``` ## Using with `nibabel` You can use `indexed_gzip` with `nibabel`. 
`nibabel` >= 2.3.0 will automatically use `indexed_gzip` if it is present: ```python import nibabel as nib image = nib.load('big_image.nii.gz') ``` If you are using `nibabel` 2.2.x, you need to explicitly set the `keep_file_open` flag: ```python import nibabel as nib image = nib.load('big_image.nii.gz', keep_file_open='auto') ``` To use `indexed_gzip` with `nibabel` 2.1.0 or older, you need to do a little more work: ```python import nibabel as nib import indexed_gzip as igzip # Here we are using 4MB spacing between # seek points, and using a larger read # buffer (than the default size of 16KB). fobj = igzip.IndexedGzipFile( filename='big_image.nii.gz', spacing=4194304, readbuf_size=131072) # Create a nibabel image using # the existing file handle. fmap = nib.Nifti1Image.make_file_map() fmap['image'].fileobj = fobj image = nib.Nifti1Image.from_file_map(fmap) # Use the image ArrayProxy to access the # data - the index will automatically be # built as data is accessed. vol3 = image.dataobj[:, :, :, 3] ``` ## Index import/export If you have a large file, you may wish to pre-generate the index once, and save it out to an index file: ```python import indexed_gzip as igzip # Load the file, pre-generate the # index, and save it out to disk. fobj = igzip.IndexedGzipFile('big_file.gz') fobj.build_full_index() fobj.export_index('big_file.gzidx') ``` The next time you open the same file, you can load in the index: ```python import indexed_gip as igzip fobj = igzip.IndexedGzipFile('big_file.gz', index_file='big_file.gzidx') ``` ## Write support `indexed_gzip` does not currently have any support for writing. Currently if you wish to write to a file, you will need to save the file by alternate means (e.g. via `gzip` or `nibabel`), and then re-create a new `IndexedGzipFile` instance. 
For example: ```python import nibabel as nib # Load the entire image into memory image = nib.load('big_image.nii.gz') data = image.get_data() # Make changes to the data data[:, :, :, 5] *= 100 # Save the image using nibabel nib.save(data, 'big_image.nii.gz') # Re-load the image image = nib.load('big_image.nii.gz') ``` ## Performance A small [test script](indexed_gzip/tests/benchmark.py) is included with `indexed_gzip`; this script compares the performance of the `IndexedGzipFile` class with the `gzip.GzipFile` class. This script does the following: 1. Generates a test file. 2. Generates a specified number of seek locations, uniformly spaced throughout the test file. 3. Randomly shuffles these locations 4. Seeks to each location, and reads a chunk of data from the file. This plot shows the results of this test for a few compresed files of varying sizes, with 500 seeks: ![Indexed gzip performance](./performance.png) ## Acknowledgements The `indexed_gzip` project is based upon the `zran.c` example (written by Mark Alder) which ships with the [zlib](http://www.zlib.net/) source code. `indexed_gzip` was originally inspired by Zalan Rajna's (@zrajna) [zindex](https://github.com/zrajna/zindex) project: Z. Rajna, A. Keskinarkaus, V. Kiviniemi and T. Seppanen "Speeding up the file access of large compressed NIfTI neuroimaging data" Engineering in Medicine and Biology Society (EMBC), 2015 37th Annual International Conference of the IEEE, Milan, 2015, pp. 654-657. https://sourceforge.net/projects/libznzwithzindex/ Initial work on `indexed_gzip` took place at [Brainhack](http://www.brainhack.org/) Paris, at the Institut Pasteur, 24th-26th February 2016, with the support of the [FMRIB Centre](https://www.ndcn.ox.ac.uk/divisions/fmrib/), at the University of Oxford, UK. 
Many thanks to the following contributors (listed chronologically): - Zalan Rajna (@zrajna): Bug fixes (#2) - Martin Craig (@mcraig-ibme): Porting `indexed_gzip` to Windows (#3) - Chris Markiewicz (@effigies): Option to drop file handles (#6) - Omer Ozarslan (@ozars): Index import/export (#8) - @DarioDaF: Windows overflow bug (#30) - Sławomir Zborowski (@szborows): `seek_points` method (#35), README fixes (#34) - Ashwin Ramaswami (@epicfaace): Support for in-memory file objects (#55), bug fixes (#63, #64, #65). - Michał Górny (@mgorny): Remove hard dependency on `nibabel` from test suite (#78). - Alexander Gorban (@alexgorban) Fix memory leak (#82, #83). ## License `indexed_gzip` inherits the [zlib](http://www.zlib.net) license, available for perusal in the [LICENSE](LICENSE) file. indexed_gzip-1.6.4/.coveragerc0000644000175000017500000000023614133320576015647 0ustar nileshnilesh[run] plugins = Cython.Coverage concurrency = thread multiprocessing parallel = True source = indexed_gzip omit = indexed_gzip/tests/* indexed_gzip-1.6.4/AUTHOR0000644000175000017500000000031514133320576014451 0ustar nileshnileshPaul McCarthy (@pauldmccarthy) Zalan Rajna (@zrajna) Martin Craig (@mcraig-ibme) Chris Markiewicz (@effigies) Omer Ozarslan (@ozars) @DarioDaF Sławomir Zborowski (@szborows) Ashwin Ramaswami (@epicfaace) indexed_gzip-1.6.4/setup.py0000644000175000017500000001744514133320576015252 0ustar nileshnilesh#!/usr/bin/env python """Setup script for indexed_gzip. If an environment variable called `INDEXED_GZIP_TESTING` is defined, the Cython modules are compiled with line-tracing enabled, via the Cython `linetrace` directive, and the `CYTHON_TRACE_NOGIL` macro. See https://cython.readthedocs.io/en/latest/src/reference/compilation.html#compiler-directives for more details. The ZLIB_HOME environment variable can be used to compile and statically link ZLIB into the indexed_gzip shared library file. It should point to a directory which contains the ZLIB source code. 
If not provided, the ZLIB header and library files are assumed to be provided by the system. """ import sys import os import glob import functools as ft import os.path as op import shutil from setuptools import setup from setuptools import Extension from setuptools import Command # Custom 'clean' command class Clean(Command): user_options = [] def initialize_options(self): pass def finalize_options(self): pass def run(self): base = op.dirname(__file__) igzbase = op.join(base, 'indexed_gzip') shutil.rmtree(op.join(base, 'build'), ignore_errors=True) shutil.rmtree(op.join(base, 'dist'), ignore_errors=True) shutil.rmtree(op.join(base, 'indexed_gzip.egg-info'), ignore_errors=True) shutil.rmtree(op.join(base, '.eggs'), ignore_errors=True) shutil.rmtree(op.join(base, '__pycache__'), ignore_errors=True) shutil.rmtree(op.join(igzbase, '__pycache__'), ignore_errors=True) shutil.rmtree(op.join(igzbase, 'tests', '__pycache__'), ignore_errors=True) files = [ '*.so', '.coverage.*', op.join(igzbase, 'indexed_gzip.c'), op.join(igzbase, '*.pyc'), op.join(igzbase, '*.so'), op.join(igzbase, 'tests', '*.so'), op.join(igzbase, 'tests', '*.pyc'), op.join(igzbase, 'tests', 'ctest_zran.c'), op.join(igzbase, 'tests', 'ctest_indexed_gzip.c')] for f in files: for g in glob.glob(f): try: os.remove(g) except OSError: pass # Platform information python2 = sys.version_info[0] == 2 noc99 = python2 or (sys.version_info[0] == 3 and sys.version_info[1] <= 4) windows = sys.platform.startswith("win") testing = 'INDEXED_GZIP_TESTING' in os.environ # compile ZLIB source? ZLIB_HOME = os.environ.get("ZLIB_HOME", None) # Load README description readme = op.join(op.dirname(__file__), 'README.md') if python2: openreadme = ft.partial(open, readme, 'rt') else: openreadme = ft.partial(open, readme, 'rt', encoding='utf-8') with openreadme() as f: readme = f.read().strip() # If cython is present, we'll compile # the pyx files from scratch. 
Otherwise, # we'll compile the pre-generated c # files (which are assumed to be present). have_cython = True have_numpy = True try: from Cython.Build import cythonize except Exception: have_cython = False # We need numpy to compile the test modules try: import numpy as np except Exception: have_numpy = False print('indexed_gzip setup') print(' have_cython: {} (if True, modules will be cythonized, ' 'otherwise pre-cythonized C files are assumed to be ' 'present)'.format(have_cython)) print(' have_numpy: {} (if True, test modules will ' 'be compiled)'.format(have_numpy)) print(' ZLIB_HOME: {} (if set, ZLIB sources are compiled into ' 'the indexed_gzip extension)'.format(ZLIB_HOME)) print(' testing: {} (if True, code will be compiled with line ' 'tracing enabled)'.format(testing)) # compile flags include_dirs = ['indexed_gzip'] lib_dirs = [] libs = [] extra_srcs = [] extra_compile_args = [] compiler_directives = {'language_level' : 2} define_macros = [] if ZLIB_HOME is not None: include_dirs.append(ZLIB_HOME) extra_srcs.extend(glob.glob(op.join(ZLIB_HOME, '*.c'))) # If numpy is present, we need # to include the headers if have_numpy: include_dirs.append(np.get_include()) if windows: if ZLIB_HOME is None: libs.append('zlib') # For stdint.h which is not included in the old Visual C # compiler used for Python 2 if python2: include_dirs.append('compat') # Some C functions might not be present when compiling against # older versions of python if noc99: extra_compile_args += ['-DNO_C99'] # linux / macOS else: # if ZLIB_HOME is set, statically link, # rather than use system-provided zlib if ZLIB_HOME is None: libs.append('z') extra_compile_args += ['-Wall', '-Wno-unused-function'] if testing: compiler_directives['linetrace'] = True define_macros += [('CYTHON_TRACE_NOGIL', '1')] # Compile from cython files if # possible, or compile from c. 
if have_cython: pyx_ext = 'pyx' else: pyx_ext = 'c' # The indexed_gzip module igzip_ext = Extension( 'indexed_gzip.indexed_gzip', [op.join('indexed_gzip', 'indexed_gzip.{}'.format(pyx_ext)), op.join('indexed_gzip', 'zran.c'), op.join('indexed_gzip', 'zran_file_util.c')] + extra_srcs, libraries=libs, library_dirs=lib_dirs, include_dirs=include_dirs, extra_compile_args=extra_compile_args, define_macros=define_macros) # Optional test modules test_exts = [ Extension( 'indexed_gzip.tests.ctest_indexed_gzip', [op.join('indexed_gzip', 'tests', 'ctest_indexed_gzip.{}'.format(pyx_ext))], libraries=libs, library_dirs=lib_dirs, include_dirs=include_dirs, extra_compile_args=extra_compile_args, define_macros=define_macros) ] if not windows: # Uses POSIX memmap API so won't work on Windows test_exts.append(Extension( 'indexed_gzip.tests.ctest_zran', [op.join('indexed_gzip', 'tests', 'ctest_zran.{}'.format(pyx_ext)), op.join('indexed_gzip', 'zran.c'), op.join('indexed_gzip', 'zran_file_util.c')] + extra_srcs, libraries=libs, library_dirs=lib_dirs, include_dirs=include_dirs, extra_compile_args=extra_compile_args, define_macros=define_macros)) # If we have numpy, we can compile the tests if have_numpy: extensions = [igzip_ext] + test_exts else: extensions = [igzip_ext] # Cythonize if we can if have_cython: extensions = cythonize(extensions, compiler_directives=compiler_directives) # find the version number def readVersion(): version = {} initfile = op.join(op.dirname(__file__), 'indexed_gzip', '__init__.py') with open(initfile, 'rt') as f: for line in f: if line.startswith('__version__'): exec(line, version) break return version.get('__version__') setup( name='indexed_gzip', packages=['indexed_gzip', 'indexed_gzip.tests'], version=readVersion(), author='Paul McCarthy', author_email='pauldmccarthy@gmail.com', description='Fast random access of gzip files in Python', long_description=readme, long_description_content_type='text/markdown', 
url='https://github.com/pauldmccarthy/indexed_gzip', license='zlib', classifiers=[ 'Development Status :: 3 - Alpha', 'Intended Audience :: Developers', 'License :: OSI Approved :: zlib/libpng License', 'Programming Language :: C', 'Programming Language :: Cython', 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 3', 'Topic :: System :: Archiving :: Compression', ], cmdclass={'clean' : Clean}, ext_modules=extensions, tests_require=['pytest', 'numpy', 'nibabel', 'coverage', 'pytest-cov'], test_suite='tests', ) indexed_gzip-1.6.4/setup.cfg0000644000175000017500000000053114133320576015345 0ustar nileshnilesh[aliases] test=pytest [tool:pytest] testpaths = indexed_gzip/tests addopts = -v --cov=indexed_gzip markers = zran_test: Test the zran.c library indexed_gzip_test: Test the indexed_gzip library nibabel_test: Test interaction between indexed_gzip and nibabel/numpy slow_test: Test which takes a long time indexed_gzip-1.6.4/compat/0000755000175000017500000000000014133320576015010 5ustar nileshnileshindexed_gzip-1.6.4/compat/stdint.h0000644000175000017500000001706314133320576016475 0ustar nileshnilesh// ISO C9x compliant stdint.h for Microsoft Visual Studio // Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 // // Copyright (c) 2006-2008 Alexander Chemeris // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // // 3. The name of the author may be used to endorse or promote products // derived from this software without specific prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO // EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // /////////////////////////////////////////////////////////////////////////////// #ifndef _MSC_VER // [ #error "Use this header only with Microsoft Visual C++ compilers!" #endif // _MSC_VER ] #ifndef _MSC_STDINT_H_ // [ #define _MSC_STDINT_H_ #if _MSC_VER > 1000 #pragma once #endif #include // For Visual Studio 6 in C++ mode and for many Visual Studio versions when // compiling for ARM we should wrap include with 'extern "C++" {}' // or compiler give many errors like this: // error C2733: second C linkage of overloaded function 'wmemchr' not allowed #ifdef __cplusplus extern "C" { #endif # include #ifdef __cplusplus } #endif // Define _W64 macros to mark types changing their size, like intptr_t. #ifndef _W64 # if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 # define _W64 __w64 # else # define _W64 # endif #endif // 7.18.1 Integer types // 7.18.1.1 Exact-width integer types // Visual Studio 6 and Embedded Visual C++ 4 doesn't // realize that, e.g. char has the same size as __int8 // so we give up on __intX for them. 
#if (_MSC_VER < 1300) typedef signed char int8_t; typedef signed short int16_t; typedef signed int int32_t; typedef unsigned char uint8_t; typedef unsigned short uint16_t; typedef unsigned int uint32_t; #else typedef signed __int8 int8_t; typedef signed __int16 int16_t; typedef signed __int32 int32_t; typedef unsigned __int8 uint8_t; typedef unsigned __int16 uint16_t; typedef unsigned __int32 uint32_t; #endif typedef signed __int64 int64_t; typedef unsigned __int64 uint64_t; // 7.18.1.2 Minimum-width integer types typedef int8_t int_least8_t; typedef int16_t int_least16_t; typedef int32_t int_least32_t; typedef int64_t int_least64_t; typedef uint8_t uint_least8_t; typedef uint16_t uint_least16_t; typedef uint32_t uint_least32_t; typedef uint64_t uint_least64_t; // 7.18.1.3 Fastest minimum-width integer types typedef int8_t int_fast8_t; typedef int16_t int_fast16_t; typedef int32_t int_fast32_t; typedef int64_t int_fast64_t; typedef uint8_t uint_fast8_t; typedef uint16_t uint_fast16_t; typedef uint32_t uint_fast32_t; typedef uint64_t uint_fast64_t; // 7.18.1.4 Integer types capable of holding object pointers #ifdef _WIN64 // [ typedef signed __int64 intptr_t; typedef unsigned __int64 uintptr_t; #else // _WIN64 ][ typedef _W64 signed int intptr_t; typedef _W64 unsigned int uintptr_t; #endif // _WIN64 ] // 7.18.1.5 Greatest-width integer types typedef int64_t intmax_t; typedef uint64_t uintmax_t; // 7.18.2 Limits of specified-width integer types #if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 // 7.18.2.1 Limits of exact-width integer types #define INT8_MIN ((int8_t)_I8_MIN) #define INT8_MAX _I8_MAX #define INT16_MIN ((int16_t)_I16_MIN) #define INT16_MAX _I16_MAX #define INT32_MIN ((int32_t)_I32_MIN) #define INT32_MAX _I32_MAX #define INT64_MIN ((int64_t)_I64_MIN) #define INT64_MAX _I64_MAX #define UINT8_MAX _UI8_MAX #define UINT16_MAX _UI16_MAX #define UINT32_MAX _UI32_MAX #define UINT64_MAX 
_UI64_MAX // 7.18.2.2 Limits of minimum-width integer types #define INT_LEAST8_MIN INT8_MIN #define INT_LEAST8_MAX INT8_MAX #define INT_LEAST16_MIN INT16_MIN #define INT_LEAST16_MAX INT16_MAX #define INT_LEAST32_MIN INT32_MIN #define INT_LEAST32_MAX INT32_MAX #define INT_LEAST64_MIN INT64_MIN #define INT_LEAST64_MAX INT64_MAX #define UINT_LEAST8_MAX UINT8_MAX #define UINT_LEAST16_MAX UINT16_MAX #define UINT_LEAST32_MAX UINT32_MAX #define UINT_LEAST64_MAX UINT64_MAX // 7.18.2.3 Limits of fastest minimum-width integer types #define INT_FAST8_MIN INT8_MIN #define INT_FAST8_MAX INT8_MAX #define INT_FAST16_MIN INT16_MIN #define INT_FAST16_MAX INT16_MAX #define INT_FAST32_MIN INT32_MIN #define INT_FAST32_MAX INT32_MAX #define INT_FAST64_MIN INT64_MIN #define INT_FAST64_MAX INT64_MAX #define UINT_FAST8_MAX UINT8_MAX #define UINT_FAST16_MAX UINT16_MAX #define UINT_FAST32_MAX UINT32_MAX #define UINT_FAST64_MAX UINT64_MAX // 7.18.2.4 Limits of integer types capable of holding object pointers #ifdef _WIN64 // [ # define INTPTR_MIN INT64_MIN # define INTPTR_MAX INT64_MAX # define UINTPTR_MAX UINT64_MAX #else // _WIN64 ][ # define INTPTR_MIN INT32_MIN # define INTPTR_MAX INT32_MAX # define UINTPTR_MAX UINT32_MAX #endif // _WIN64 ] // 7.18.2.5 Limits of greatest-width integer types #define INTMAX_MIN INT64_MIN #define INTMAX_MAX INT64_MAX #define UINTMAX_MAX UINT64_MAX // 7.18.3 Limits of other integer types #ifdef _WIN64 // [ # define PTRDIFF_MIN _I64_MIN # define PTRDIFF_MAX _I64_MAX #else // _WIN64 ][ # define PTRDIFF_MIN _I32_MIN # define PTRDIFF_MAX _I32_MAX #endif // _WIN64 ] #define SIG_ATOMIC_MIN INT_MIN #define SIG_ATOMIC_MAX INT_MAX #ifndef SIZE_MAX // [ # ifdef _WIN64 // [ # define SIZE_MAX _UI64_MAX # else // _WIN64 ][ # define SIZE_MAX _UI32_MAX # endif // _WIN64 ] #endif // SIZE_MAX ] // WCHAR_MIN and WCHAR_MAX are also defined in #ifndef WCHAR_MIN // [ # define WCHAR_MIN 0 #endif // WCHAR_MIN ] #ifndef WCHAR_MAX // [ # define WCHAR_MAX _UI16_MAX #endif // 
WCHAR_MAX ] #define WINT_MIN 0 #define WINT_MAX _UI16_MAX #endif // __STDC_LIMIT_MACROS ] // 7.18.4 Limits of other integer types #if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 // 7.18.4.1 Macros for minimum-width integer constants #define INT8_C(val) val##i8 #define INT16_C(val) val##i16 #define INT32_C(val) val##i32 #define INT64_C(val) val##i64 #define UINT8_C(val) val##ui8 #define UINT16_C(val) val##ui16 #define UINT32_C(val) val##ui32 #define UINT64_C(val) val##ui64 // 7.18.4.2 Macros for greatest-width integer constants #define INTMAX_C INT64_C #define UINTMAX_C UINT64_C #endif // __STDC_CONSTANT_MACROS ] #endif // _MSC_STDINT_H_ ] indexed_gzip-1.6.4/LICENSE0000644000175000017500000000165114133320576014535 0ustar nileshnileshindexed_gzip: Copyright (C) 2016-2021 Paul McCarthy zlib: Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. 
indexed_gzip-1.6.4/performance.png0000644000175000017500000006200514133320576016537 0ustar nileshnileshPNG  IHDR-sBIT|d pHYsaa?i9tEXtSoftwarematplotlib version 2.1.0, http://matplotlib.org/mߕ IDATxwxU$@JHM"QB/(UcAe* !"6aAHGz %B <$3$If u9|!sc|<],x5,x5?OP1FREIg=] @!VB6%)J:Ir)LcJJ:=|z|QJz <{,Asp59wڴi#IgrrL聕^``<]F(Y5܀ Y܃gqk``9/^>L;vйs$I'NT&M<\\[ŋ$cLXs5h@sO_{Q t@V*U 4p@OrE߶mۦ_|Q6lPzԶm[kN. JZHJ 2:rH $)%%%OCUz` ,ZHIII֭}YOW=k|}}bŊ9Á)Z;vLə~鄅iƌIOVt7Dy6:$IR+VtY[hQ0 ޡXb#G8?N"ETZ5O/SD (QehР$i>}M={OW^t7=6))IfҜ9ssN?^e˖խުnݺ|.ZJ=zM7ݤcjҤIZp<`o^Kn&}m{ѤIj*;vLŋWZM6j?~իW/uQZ|?{GÇwoQR2etw멧Ξ='ѣG۫G'Nh޼yZl٣ǏOjݺ:w,,ߛ~ISLΝ;5k{2}o~͝;W۷o׹sjժe˖?+VoՆ t)(QB 4P׮]U~}pu08j1W w.m*Tܹs3=խZرc~z%&&^zzUV-/#r-z199Ygٳc]pA*UR֭Ox}ϟ?{L1115jTPɓzuq7Nrgjʔ)Zhbcc믿^<:t඾G꣏>ҥKuiURE=XuԶmۜwJJ"##siԨQ5tP:8/sY$iرO\ڵkÇgznݺi͚5㏵uVM6MVHHN+{q/ՠAlRvҲe˴b [jժΝ?իW+ @kVppvڥN .Ԅ ~^xQݻwݻըQ#ըQCNR͚5ծ];_^W TreIr ,Y~… Z8qBk֬ʕ+|r 2󌍍գ>"EA*88eD=Sھ}7nUjڵϴ{n >\]tӧuM7J*Zvu k.[|z-rʺuImܸQcƌѢE'hѢnk裏4qDկ__͚5Ӟ={;39{{9[D-[VǏ׎;bŊ ֻᆱɓ'Gjذ:_UK,ձcG:@~[,$-[VڵO9>6&&Fu7kZjz3fd=qD7N~~~jҤb u] nqE Zl+uDڶm>c-]T~3*^F_>|ԩ뮻NRpѣٳgݻ߯`5mTIIIڸqM6euETJԪU+>|XC ɴZnz .f͚Vvڥ={nݺn pm߾];v쐯omkv~_>u;V7oVƍUvmYF_|VZI&WjڴiVӦMm}} 4H 4P2eFիWy2d˶)S/3fdx]6nܨ5jh&foժSNǏW_Յ ݻ;ݼyz)͜9S}|IUPyڵkkϜ9W^yE˗/W_}]믿ɓU^=g#3fLkȐ!ZnjժwyG*UrnKJJo5ydU\YjԨܶf=szհaCdÇkJ履~s=oVof̘H:uJgv7&&F&L&Nkњ5ktIxl2p >}>{5kڷo-[h.T^] P\\^~eĮ'OҥKմi RRR/jjӦ̙>@}fΜZjӒ%K\{wuAlR3gȑ#5i$ 6L6l뙕d 4H.\P^jȑԾ}{^qүݻСC*Rz q};uꔫZl٢ &(::Zof͚:uh۶m4ie=<.$uUl߽{͛e7t $Sfe˖^/kU^qqqT=\~3[Nu]矻=TR߿ᕔ!CSbEgh.b:ut뭷*%%%P݆%KT%I .̴~%R{i߾}9ä0ƌ^Inv㔔M0A4j(JJUأG%&&jޮ_~cǎUڵ3S t, 7ܠΝ;;+V͛]5ku=aÆ%Jw{'Nモ$.j)~Xb1csmO}̙3zW]ޏЦM5i800P}$͘1#OY2GҥKeUfƍKgeʔQF.NG#Gd~7g;Yughw>]vNߤGͰ-99Y+V_] 5qDIҾ}2yڊ-y%Ivʕ+9mۦGrʊtOV%YZRhhU^zĉ:ǠAW_]k׮ .J' :I=zbbbCBSNoڴI2&MOIII.VRbbWy%TreܹSZ_~Y7o֗_~QZ p˗KJnXLJdR?D'%%RJzH-ܒgOIUmjԨ.]yRRRԿٳG;wu煮35lP~~~:z<2TJ{/m4$믿$hizVXrt֑Y%KTRti>|XaaavfUf \ۧ}*&&&kGxxoIpg^:Jf;<6ֱsXTmuy7ݝ K/c… ?< ET;{:9' /J*RJXb:wՁ+Cr9z!;&In_)RDz8>޽;GRרQtɓz'\zԩS5uLsi0sR+k6r9dЋ/g]sǎ˳+;{͚53 9K/7InW+(9vvvB~)&&F͛7W׮]UzuH"JLL^r%ehhhK?#19E#дc\_F4*?VkzG5p@g}0aqIã &)ksٷgϞ:tʕ+3fB 
=ztAAAVu믝V*Uu\?ǏW^Ѽydwܡ~#sX\| X*Wmu?_,IԳg,'txbe̖c2ͳgɓUJ[n'o/y:䥁chZ ;jȉ0ٳs(g}=mϞ=ڱcʔ)w}W~~?:g5tr8ͽ{h{YT,WP9++)JikJuj8x6oޜO]wKŋ駟v3ϨK.1bv<}Yk֭9:uo]}Ys0fS@@̙#??? 0@=z) ;, PHHItD|1 k֬رcU|y5iDg믿#Fdz-[fZzh$s].ǵ2;<~Æ >|4f,Y2jK]III]Ͻ5g?  IRf4vX-^X}-U7֊+4{lyfΜ))79*9´ef$>-]TG6īS{nڵc;V!V[|8%9߲e˜C|I9sտ[.+ ;n!!!6l|}}5p@M2E},Co|ܥK=ZRP+((貞o?CΡFRÆ uСL{~!/kժUZ*ÐCsc]:?\)7ԀTF 6mZÐ-ZkԮ][~~~:p8aR~}I;z9;wNk׮Ço߾JJJȑ#UZ<˝+Wfh[nĥڶm'HJ` ]vex}/^)Sjժ-[*!!A}&%%_u>.Rz%kue8grrV\zxX>t1[ _4jH˗/WڵuEduUZRnr}Zj9{Sf͚ڽ{>>/_z嗯ڴi0Yfrޣ9C, Pv4yd͝;WsHٳg5fDŽSNԷo_ 6,Cm6]֥WKrr^y=zTQQQ.G?O=Zuusן7xC?JuIIIZbtwHJWu֚;wy59جY7߬뮻N>coWzuy%$$^бcԷo_iԩS||>CIIxS5Rٲe;wWC 0bС:{-[;FT￵c%&&jƌyS,$$Do맱cjΜ9YN<5k())I:t𚕞#諯R^԰aC-[V֭[գGJyeذazgao^WٲeuqܹS'OtYQG'PDD*WbŊرcھ}Ξ=^{-OWpu8vbP,ctL_.T_ŋ+"""CoCy:.02e("""Ο^r` W 2"z'OԻ|AN:2hjO^ѵW޽{kܸq޽6m ZJAAA v;^~_V-UXQھ}>zXƍիu7F3<<\ ^zI_}s({==Ӛ:u͛5k*44T'NЖ-[tY=K^x[N:vzZnׯ\=J9~ZZhW]voe맟~Җ-[Txqܹ}uH[Nc(]ժU 狌T^ԤI*UJk֬ѱct 7dX JOUVմiӴyf[oU=Λk3gnݪ۷+((H{բE 簏Լys}ך4iV\~IŋWÆ M6y~+ѯ_?ըQCSNuN7pF6myUdI}駚9sΝ۷?T2eTfM~ytkԩZn~w)RDjܸ7or(C͝;Wgֶm۴ebŊرNK,ѤITLVBk۶V\YfiРAzOʕ_o/-[(11QWn 4yd}G?+<<\Æ 6,)u>LƍӟKӈ#JK1IKccc~g%v]3hfsݻezH=P=ܣ{'G7iĥOf\mիW|nO=\&S_ԡC\_;cǎnٽV~iۊ)A.^9},Vj[/>|y-[VڵӉ'gթSGT=4`*UJ0xq2eThjvQ }ܞvڊͲ)u(U\\kNRꜣ͛8UVMm۶Ր!Cr:].\XS:֞ E,k8I1%%n[UN8g*<<ܥ}ڴi"""u &!)uAڳ4ZY^96mڸ^\9;w.c.Z2ݾu,ngPX=zr W@@ۧ۷+00P [> ">ڶz6ֱsTTmuų?%UJ*z衇vZ_^ Q۶m5h C9u]pqn^e  S?|}}U\9knwwp"ujݺU']!W#W#W#^mر]PŊ_\uv?_=C{``kƌZ"ygϞ.Z#F(22ReʔQFua IDAT'\ֹ֯_|0-YkݥK Ι3G?|\Vkѣ5l03Fܹs=]Rݻ/^w}Wu޽{/\ֹ®իW+88x q{lX\|v,sm9_[T)/_^AAAU!ԩSu7|zg<~Νjժʔ)VZi߫q Qƍ5}t綯J:r䈳_:t k$)99YojԨ0u]ڰa5F*UӘ1c\[N|&M۫zjժF_϶'ZzjĈjڴBBBԩS'PlY\|y*Uʹs^œXxZh2eo/< k}W@˜9 o]-Δ)S7h4iϟODZlwO?ŋw۷V^_~Y={ԪU$I;wV˖-%I/tREGG#I9rNh-_\r:t蠳gJΝ#Gւ ?;IO?0vm>9ɖ$Z#_ԩԩ5kH6nܨkjԨQ]:w{ؑ#GwܹU~Xm۶u4fmܸQÇW~4~xg/{=znvEDDhСѼy$I'OVv+22RoK/RJ=K=ϟ?#C/P/_֍7ި!Ch̙:u>뇅9_OI Vwy3Ij۶Tzu}ڵ&O|Ý5Ȗ NwU<}sϩ}J) N-Y)ի;'OJ#___ըQùN:Ui&XB&Lp]xѥ7Tҥ{W]tq vڥϫs.w^IݻաC続5kH".衇wtI\~_^9d۷OKvϝ;u]dfӦMڼy<]RJ]#;X\҇Xžl/I$e 䘛*^}Uuɥ-+V֭[/hѢ.ۧM!t"T\Y ,p> 3wY=ӧ;'ҿ^ ͳ׭[7=.mBad%]C}*+իWWrrvl۲e>u޽{UbE>~GhРAm_+Ce8G2e$Iڶm;v(11;3L^TTտ{A$mݺ-[G} 
ԩ;vdx9s@E-)Gՙ3gt!%M*:= zƒ]1璺Zk5Ɣ4LRyI%mM?{c$Eɋz g 4*)˙ԭc%뎓4SR<7= Ee2lӵ w Eegӵ w E Zr8.ITBJU*oJpRWԂxHxFwEW`O \B@^}R c%I)VJJ˜xE>ꀫWP\fJZqIYp(1&EJO,ٛ:l0+>FZ7` 1"֎FJjZמS nv++t1L:شWyIRSp! ]#4P , fpz5ڠI R{^pn{X?e`dI-S *]W-y\) P ?=\)b,cL1fS(,k8kmS(,\ Ee2lӵ w Eegӵ w EZ1QƘ-VzN㬵zNp"Wt㒴xoN%$t1_rAP!' &^>F2/7]8X~>eCB<.zI-$X)٦)I c`:1lp\RA/BFZ7` B p*!Yne,BbtidyIRSBXA~+" aF]*We@'.FRW1UjϫA1jZFF:}"WP9T.O-2a,x5,x5,x5,x5,x5,x56cjKzTm%H:*i[k/w(242Ԡޗ4PIF6Ƽb/zPdkђ֞`c-7<vX5l].i1Hޖr;0S:7+UzS%70Pe`I-i$cZKj-Is:Oo2R`Ij+iv1fRW'MNz`T9wK)Fo~82ԡPҮ* r` iR{alKk | Z(m7KEY0ƔVF1ډ] >'ʻruVRskm1&P&cX Pf=f{l:i l"wkmi5̏*ZǗ$ti_JZU s31c:'`Yain{ic̟4Z{,756J}I[k[kϤ}͗W^?%EhyDһJj|cLck)km}I$u6Ɣe $tMiIUss1k\kkLv+ivvJ2sVju[f3cJ:$M^qIZ7^UZT Pݒ*9lC”nwkm /1fMub=ARUZ/:Jr^CoA 1% ZA~־,Ƙ I_T@ kmRZJI}$JtMZ:bJմLAIޒZkȃ5ǎ ?OƷƘI*/i^.cL(=CqIZ V҂x=YISTyq1kZ%ֆ[k7[kWuYk#%5sWbaY1, CNO$uBTBʤRdR1I=1wJ )ҍھQp-*]7Il~ UN:$M't U3YbU "_(( 鮈-wI#KjU>cuƘb2lӵuDj+#9'vo^=X'nI$()(w$nM vqƘN{n~>FQMȒZ7^UZ wriuI$,!J| ƕ uKz ^N.[kg]Ҷs@G"@>Xe$msӾ-mor`)7ϤmMN,i1NInTYҽU vfH*<v9%k]œc$E)gC&E t1Ocr1K㬵zNNz$[17G$r`UMm@IuDҍnK:r2ג>0Ɯ$1ɯ)g@IU%,))Gd1YG1:l𼤍}]X{%I1ڤl*Ƙ(IQٜ_":Ƙç%mVʃƘ1w+>3BI.g~dg88 eI.*m)|Dk-n&+Zۜy\ +ꤪ:꟪VՑ[L%%yZ?Ҡ3|++n=;ntI%R%$HrlwTՆ$2!qm-o|bXWm֌%KA ̲:7|I6EjI f-WܐOnK֒[[$汼X0Wf`]UINOIFg0m;n'6Mq%1 `IuV1ɹo?5gg0K NJ.fln =Zk_Nr$0ɭ"Hr[sV,lXUUvo%IZk;g$HYzǵ8`aj Wת\U7WBm,b0aTnk'MAL6ɛ$*Pg IDAT_dg$Y$Jͳ_*ɪqkF)W3֌d s~oKS$H$?NI~IޓVr$.jV^fdy`avJk$Q-%U6;6O]"\3wޚctxN)+`10V$OA TM`(,6TI.t-E`6&Y7Z88"`QUUkϪocOny,v $W%$OIrtCjJ:?+Zkg$w_g&։IqOrC[k&֍I9Ir`_3 >U*IKrXU\L,f`q'6]\IlJdɁv'y^U&ɃӅX_l}kXZk[lZ`? <5'G&l;l=evJz}'Tt`N$$Oi}|fr%lI$T՝fYSU$L~03|IdkYjmLV[Q<1INNhI$o]6D3um]Lf&$:avKd $\UcٿQ$3 Κ*`  ZkB`2XU}.0>fT+n{־$m1;~lSXI??G~& ZkT՟V_.`aӜ;'sULf tV2yv3S5qͪ6j{}(:'ɏ́־?'$끥. )tBu" Uuu] gQXI ΢X 5CMP`0X 5CMP`0X 5CMP`0X 5CMP`0X 5CMP`0X 5CMP`0X 5CMP`0X 5Cm^XUutU]]U_ &fגApnIrVkKU*ɕU̓. 
1|Ac9BXUUG꺪jU+PUTΪ|UZ''9vueI#ɆNV3.ɫ{QUc¸c'j ܜn!l]$Ɇ~$OJ$䟒Zt{Tߡw-0džVU$kVU'yDoL%yWmw=;9Ybš({茌Nth0X 9c1l߾34V%9Ƕr,`816 /w_ٞ^qC>qˠKi-ݾ%GװcY}ˀ+`P [nky7䓛$j{f$?.YrX Lo۔+VOHއV@e0ܴj}?oy6bgk«${ǒ$;<|ۦtvv>Dth?χk?{FV!"g m-D/LKcsY_=GVf``-^3+j o/7Z?s{FV f Z 1P iP:Pг3=pX LW󞑕* bh zD, (PUW'|еppEZZ[dݠk, KP`0X mrm"Ccɠ [mSVo:;GVf׽?Ϲ#F3sh?/7Z?*n++VguuYO߻k<:{devX=Jk32O 5k.;3#_7ڵl>Y9Rkn9rK7AY?z˅W&Z@͝}Sޝ`>`- Zv\uZv\L!0o  z 2v6vbkX}Kͮe}poϫ96۷ Rdpݴj}?/;WY~\r5{ύXKoMHI@LNYNe !CMP`0X 5CmQXUNrk,6ɺAY &` j, &` j, &` j, &` j, &` j, &` j, &` j, &`ͻnt-̾y`% o Ƽ ZkNrӠ`niUUUG꺪jU+PUTΪ|U.skYHaU$K$'^TU *KZk&0Ij!mwƼ ɓ<7zUuT-Oo`/5ol߾=;vth0X 9c1ܞ\fNTՑINNrmUuqG˞䜉p,O}&&Jr$'p|4&YVUM禸y$;6Igy&W4A0aA,sc>Z亙kFZkpcw%5o[M>aX0X 9c1OԿs}:?HrkLm` MZ۝$?VUOEnNVI~zXֶW"IJ,;ylWWem0aA,scܽXգ|jSn=7I0ɪ$_JF˜XpLFP`0X HUmkjgU} &O_UTU0fiUmjGU}VNs|U}zZ2ạ U]UϙJUQUz}1sfQUz_oSWU;qUw5Ww71zmUt# :^SU[zksUIUUsEK@T3.ݭSOJI.{ 0_NO1$g$9"'jYߘ7/'yZoqI><~O$G&ydL$7SII^UNU=<|y)sfIU%gI$kA4K &9ϜKƼ?Ƀ}"'g݁U`xY&yq41Y܅p'ڋ{Krmn?`'~[kV՝$jCo|-#ZkI>ho 6[kIZ{pk} 1'{a-(+|v9OsGMq\Zk;v$IZ@U=0IZ7 I>^IM}0IF[kwCI~Z X @U.9xXkG .X{<9ݪ$[$Wp(Ɋt squQW1Zk9&U󫓬ʾsGI>}{.Nr["c.sQVb$%IU=4ɩI.7YX ݒ.y7?pAb|}ϴ־;*ڍϵU|.fcVT՝h0Uկ $A]I}[IIPU;?>sU{nIˠI^U{|1[k7YԖx1Ƀ Uu$$9s"tX+Zk{Ūzp^;\Yh<=ɳ<+WXU׵A=+$5 W&6VUoL[k;-ɑUūϵm|.fc~|Gjy$H򅪺nIר%Gcl^9sh9{UwcspHr~kZkMwU k]2cǏ?=6U71ɯ&yLkm˄!W3S\璜8g$ٞ|(\5Ƙ,vtqsE)mLO8vtw*K->ck<&9sFoxH^UnO>,jB@T3-~~˓n &4PUoJdIwG2I~1s} $i=w$_Jw#$oߒQU|%6w vѬIg݅YRUOD$L.ےxzY?J>L&C[U[t7^xgꝿs~"Bn.[w%t龚g5;Zk/1YX HU8aKI^ZUTAڻzc&$LrT;wuUTƷNrspz 4<:ݒI5$9~)yIZko;_I^wts}cM$n5ɇ\oC҅O[kwí ~56mW1d1`0` j, &` j, `Un?]U?uMx#UY~TGf5#BUTծ>RUtmS, li}v@/tkU`UUU=znU:HrRU=j0XU IL$$OH$V4Ap~Zi_UIf& ~5Ɏ).[Idu%$?Z۝%y,`)IK7[k_m.U9Uu|UQU۫U+KUުU:^ZU۪UyaU]XU?{jyϨKjggΝZU{޵UZUշjgUV?{jU]{UuNU}ܯWՋ&Խ;E97BXUiϻ?лwꜪZ2˟dMMrI~w칽㓹vmkIw&H:<$Mjck[k7.86IHr$7)kn$WXIӫw*J)Vy]$w㓜ӽSW$Yd50L2B`cm-\Z6I7|Z޸Ò<\]UJbk$ߨ%x}kH{鶧{}kIZo|zIKIOrs$_썽g>Zޱ^UITxP$n}|g-xzIޞw􆜓 T՟$^$MS;ҭz_0IcnM=<*]wބ0VU?.0Y Zp\;^%Ik$7΍f $W«cpMx +&<~h*I.J3$L}[UϮs#ɿ$WUwI65Ifµ_;>r/=L4]=PygNx:%4׼SU\Sy_GT}XfH^~|֏Lg0hJz{&IDATi ,bڷ{_[ItU\vo^b$0X 
I>$_NsI^nzDSKZkOtw\t^dSU1]54?t/MrC_L oVZ=6'|?]'9I<.3$&y[U.+vz& YmHZ;ڏҭhUmMInKWLQ§ҭzPL1{߇ӽ$j:,}5f ;0$' 8*]t {cZ' ^.Mh}'ݝsZ7 2**32 bytes in a single call. * Support for concatenated gzip streams. * Many related bug fixes. * `zran.c` refactored so it is much cleaner and clearer. * Test suite is much more comprehensive ## 0.2 (August 31st 2016) The following changes have been made in this release: * `indexed_gzip` now releases the GIL when possible * A new `SafeIndexedGzipFile` class provides simple thread-safe file access to compressed files. * Some initial test coverage using https://travis-ci.org * Some important bug fixes. ## 0.1 (June 16th 2016) * First seemingly stable release. indexed_gzip-1.6.4/.github/0000755000175000017500000000000014133320576015065 5ustar nileshnileshindexed_gzip-1.6.4/.github/workflows/0000755000175000017500000000000014133320576017122 5ustar nileshnileshindexed_gzip-1.6.4/.github/workflows/release.yaml0000644000175000017500000000624614133320576021436 0ustar nileshnilesh# This workflow builds binary wheels for indexed_gzip, # for different platforms and different Python versions, # using cibuildwheel. It is triggered manually. 
on: workflow_dispatch defaults: run: shell: bash jobs: build_sdist: name: Build source distribution runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 name: Install Python with: python-version: "3.8" - name: Build sdist run: python setup.py sdist - uses: actions/upload-artifact@v2 with: name: sdist path: ./dist/*.tar.gz build_macos_wheels: name: Build macos wheels runs-on: macos-latest env: PLATFORM: ${{ matrix.os }} CIBW_ARCHS_MACOS: "x86_64 arm64 universal2" steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 name: Install Python with: python-version: "3.8" - name: Build wheels run: bash ./.ci/build_wheels.sh - uses: actions/upload-artifact@v2 with: name: wheels path: ./dist/*.whl build_windows_wheels: name: Build Windows ${{ matrix.arch }} wheels runs-on: windows-latest strategy: matrix: arch: ["AMD64", "x86"] env: PLATFORM: windows-latest CIBW_ARCHS_WINDOWS: ${{ matrix.arch }} steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 name: Install Python with: python-version: "3.8" - name: Download zlib run: bash ./.ci/download_zlib.sh - name: Build wheels run: bash ./.ci/build_wheels.sh - uses: actions/upload-artifact@v2 with: name: wheels path: ./dist/*.whl build_linux_wheels: # Typo left in for hilarity name: Build Linux ${{ matrix.arch }} eels runs-on: ubuntu-latest strategy: matrix: arch: ["x86_64", "i686", "aarch64"] env: PLATFORM: ubuntu-latest CIBW_ARCHS_LINUX: ${{ matrix.arch }} steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 name: Install Python with: python-version: "3.8" - name: Set up QEMU for emulated (e.g. 
ARM) builds if: ${{ matrix.arch == 'aarch64' }} uses: docker/setup-qemu-action@v1 - name: Build wheels run: bash ./.ci/build_wheels.sh - uses: actions/upload-artifact@v2 with: name: wheels path: ./dist/*.whl publish_to_pypi: name: Publish indexed_gzip to PyPi runs-on: ubuntu-latest needs: [build_sdist, build_macos_wheels, build_windows_wheels, build_linux_wheels] steps: - name: Download source archive uses: actions/download-artifact@v2 with: name: sdist path: dist/ - name: Download wheel archives uses: actions/download-artifact@v2 with: name: wheels path: dist/ - name: Publish archives to PyPI uses: pypa/gh-action-pypi-publish@v1.4.1 with: user: __token__ password: ${{ secrets.PYPI_TOKEN }} indexed_gzip-1.6.4/.github/workflows/pull_request.yaml0000644000175000017500000000477114133320576022543 0ustar nileshnilesh# This workflow runs a small set of indexed_gzip unit # tests on a range of different versions of Python. # It is run on pull requests. on: pull_request defaults: run: shell: bash jobs: test: runs-on: ${{ matrix.os }} strategy: matrix: os: ["ubuntu-latest", "macos-latest", "windows-latest"] python-version: [2.7, 3.6, 3.7, 3.8, 3.9] extra-args: ["", "--concat"] exclude: - os: windows-latest python-version: 2.7 env: PLATFORM: ${{ matrix.os }} PYTHON_VERSION: ${{ matrix.python-version }} TEST_SUITE: "not slow_test" EXTRA_ARGS: ${{ matrix.extra-args }} ENV_DIR: ./test.env steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} architecture: x64 - name: Create test environment run: bash ./.ci/create_test_env.sh "$ENV_DIR" - name: Log test environment run: | source ./.ci/activate_env.sh "$ENV_DIR" python -V pip freeze - name: Download zlib if: ${{ matrix.os == 'windows-latest' }} run: bash ./.ci/download_zlib.sh - name: Build indexed_gzip run: bash ./.ci/build_dev_indexed_gzip.sh "$ENV_DIR" - name: Run tests run: bash ./.ci/run_tests.sh "$ENV_DIR" # Quick 
tests on 32 bit platform test-32bit: runs-on: ubuntu-latest container: i386/ubuntu:18.04 strategy: matrix: python-version: [2.7, 3.8] extra-args: ["", "--concat"] env: USING_OS_PYTHON: 1 PYTHON_VERSION: ${{ matrix.python-version }} TEST_SUITE: "not slow_test" EXTRA_ARGS: ${{ matrix.extra-args }} ENV_DIR: ./test.env steps: - name: Install git run: apt-get update -y && apt-get install -y git - uses: actions/checkout@v1 - name: Install system dependencies run: bash ./.ci/install_32bit_dependencies.sh - name: Create test environment run: bash ./.ci/create_test_env.sh "$ENV_DIR" - name: Log test environment run: | source ./.ci/activate_env.sh "$ENV_DIR" python -V pip freeze - name: Build indexed_gzip run: bash ./.ci/build_dev_indexed_gzip.sh "$ENV_DIR" - name: Run tests run: bash ./.ci/run_tests.sh "$ENV_DIR" indexed_gzip-1.6.4/.github/workflows/master.yaml0000644000175000017500000002466614133320576021317 0ustar nileshnilesh# This workflow runs the full set of of indexed_gzip # unit tests on a range of different versions of Python, # and on different operating systems and architectures. # It is run on pushes to the master branch. 
on: push: branches: - master defaults: run: shell: bash jobs: # Quick tests run on all OSes/python versions test-indexed_gzip: runs-on: ${{ matrix.os }} strategy: matrix: os: ["ubuntu-latest", "macos-latest", "windows-latest"] python-version: [2.7, 3.6, 3.7, 3.8, 3.9] extra-args: ["", "--concat"] exclude: - os: windows-latest python-version: 2.7 env: PLATFORM: ${{ matrix.os }} PYTHON_VERSION: ${{ matrix.python-version }} TEST_SUITE: "not (slow_test or zran_test)" EXTRA_ARGS: ${{ matrix.extra-args }} ENV_DIR: ./test.env steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} architecture: x64 - name: Create test environment run: bash ./.ci/create_test_env.sh "$ENV_DIR" - name: Log test environment run: | source ./.ci/activate_env.sh "$ENV_DIR" python -V pip freeze - name: Download zlib if: ${{ matrix.os == 'windows-latest' }} run: bash ./.ci/download_zlib.sh - name: Build indexed_gzip run: bash ./.ci/build_dev_indexed_gzip.sh "$ENV_DIR" - name: Run tests run: bash ./.ci/run_tests.sh "$ENV_DIR" # Slow indexed_gzip tests run on all OSes test-indexed_gzip-slow: runs-on: ${{ matrix.os }} strategy: matrix: include: - os: "windows-latest" test-suite: "indexed_gzip_test and slow_test" extra-args: "" - os: "macos-latest" test-suite: "indexed_gzip_test and slow_test" extra-args: "" - os: "ubuntu-latest" test-suite: "indexed_gzip_test and slow_test" extra-args: "" nelems: "" - os: "windows-latest" test-suite: "indexed_gzip_test and slow_test" extra-args: "--concat" nelems: "" - os: "macos-latest" test-suite: "indexed_gzip_test and slow_test" extra-args: "--concat" nelems: "" - os: "ubuntu-latest" test-suite: "indexed_gzip_test and slow_test" extra-args: "--concat" nelems: "" # mmap test errors under windos (I think because # memory needs to be alliocated for the mmap, but # there's not enough RAM) - os: "macos-latest" test-suite: "indexed_gzip_test" extra-args: 
"--use_mmap" nelems: "805306368" - os: "ubuntu-latest" test-suite: "indexed_gzip_test" extra-args: "--use_mmap" nelems: "805306368" env: PLATFORM: ${{ matrix.os }} PYTHON_VERSION: 3.8 TEST_SUITE: ${{ matrix.test-suite }} EXTRA_ARGS: ${{ matrix.extra-args }} NELEMS: ${{ matrix.nelems }} ENV_DIR: ./test.env steps: - uses: actions/checkout@v2 - name: Set up Python uses: actions/setup-python@v2 with: python-version: 3.8 architecture: x64 - name: Create test environment run: bash ./.ci/create_test_env.sh "$ENV_DIR" - name: Log test environment run: | source ./.ci/activate_env.sh "$ENV_DIR" python -V pip freeze - name: Download zlib if: ${{ matrix.os == 'windows-latest' }} run: bash ./.ci/download_zlib.sh - name: Build indexed_gzip run: bash ./.ci/build_dev_indexed_gzip.sh "$ENV_DIR" - name: Run tests run: bash ./.ci/run_tests.sh "$ENV_DIR" # Nibabel integration run on all OSes, # and different versions of nibabel test-nibabel-integration: runs-on: ${{ matrix.os }} strategy: matrix: os: ["ubuntu-latest", "macos-latest", "windows-latest"] nibabel: ["2.0", "2.1", "2.2", "2.3", "2.4", "2.5", "3.0", "3.1"] env: PLATFORM: ${{ matrix.os }} PYTHON_VERSION: 3.8 TEST_SUITE: nibabel_test NIBABEL: ${{ matrix.nibabel }} # earlier versions of nibabel require # an older version of numpy NUMPY: ${{ (matrix.nibabel <= 2.3) && '1.17' || '' }} ENV_DIR: ./test.env steps: - uses: actions/checkout@v2 - name: Set up Python uses: actions/setup-python@v2 with: python-version: 3.8 architecture: x64 - name: Create test environment run: bash ./.ci/create_test_env.sh "$ENV_DIR" - name: Log test environment run: | source ./.ci/activate_env.sh "$ENV_DIR" python -V pip freeze - name: Download zlib if: ${{ matrix.os == 'windows-latest' }} run: bash ./.ci/download_zlib.sh - name: Build indexed_gzip run: bash ./.ci/build_dev_indexed_gzip.sh "$ENV_DIR" - name: Run tests run: bash ./.ci/run_tests.sh "$ENV_DIR" # zran tests run only on ubuntu test-zran: runs-on: "ubuntu-latest" strategy: matrix: include: - 
niters: "5000" nelems: "rnd_1000000" test-pattern: "not test_readbuf_spacing_sizes and not test_seek_then_read_block" extra-args: "" - niters: "5000" nelems: "rnd_1000000" test-pattern: "not test_readbuf_spacing_sizes and not test_seek_then_read_block" extra-args: "--concat" - niters: "5000" nelems: "rnd_1000000" test-pattern: "test_readbuf_spacing_sizes" extra-args: "" - niters: "5000" nelems: "rnd_1000000" test-pattern: "test_readbuf_spacing_sizes" extra-args: "--concat" - niters: "5000" nelems: "rnd_1000000" test-pattern: "test_seek_then_read_block" extra-args: "" - niters: "5000" nelems: "rnd_1000000" test-pattern: "test_seek_then_read_block" extra-args: "--concat" - niters: "5000" nelems: "rnd_2000000" test-pattern: "not test_readbuf_spacing_sizes and not test_seek_then_read_block" extra-args: "" - niters: "5000" nelems: "rnd_2000000" test-pattern: "not test_readbuf_spacing_sizes and not test_seek_then_read_block" extra-args: "--concat" - niters: "500" nelems: "rnd_2000000" test-pattern: "test_readbuf_spacing_sizes" extra-args: "" - niters: "500" nelems: "rnd_2000000" test-pattern: "test_readbuf_spacing_sizes" extra-args: "--concat" - niters: "500" nelems: "rnd_2000000" test-pattern: "test_seek_then_read_block" extra-args: "" - niters: "500" nelems: "rnd_2000000" test-pattern: "test_seek_then_read_block" extra-args: "--concat" - niters: "1000" nelems: "rnd_800000000" extra-args: "--use_mmap" test-pattern: "not test_readbuf_spacing_sizes and not test_seek_then_read_block" - niters: "1000" nelems: "rnd_800000000" extra-args: "--use_mmap --concat" test-pattern: "not test_readbuf_spacing_sizes and not test_seek_then_read_block" - niters: "50" nelems: "rnd_800000000" extra-args: "--use_mmap" test-pattern: "test_readbuf_spacing_sizes" - niters: "50" nelems: "rnd_800000000" extra-args: "--use_mmap --concat" test-pattern: "test_readbuf_spacing_sizes" - niters: "25" nelems: "rnd_800000000" extra-args: "--use_mmap" test-pattern: "test_seek_then_read_block" - niters: 
"25" nelems: "rnd_800000000" extra-args: "--use_mmap --concat" test-pattern: "test_seek_then_read_block" env: PLATFORM: ${{ matrix.os }} PYTHON_VERSION: "3.8" TEST_SUITE: "zran_test" TEST_PATTERN: ${{ matrix.test-pattern }} EXTRA_ARGS: ${{ matrix.extra-args }} NELEMS: ${{ matrix.nelems }} NITERS: ${{ matrix.niters }} ENV_DIR: ./test.env steps: - uses: actions/checkout@v2 - name: Set up Python uses: actions/setup-python@v2 with: python-version: 3.8 architecture: x64 - name: Create test environment run: bash ./.ci/create_test_env.sh "$ENV_DIR" - name: Log test environment run: | source ./.ci/activate_env.sh "$ENV_DIR" python -V pip freeze - name: Build indexed_gzip run: bash ./.ci/build_dev_indexed_gzip.sh "$ENV_DIR" - name: Run tests run: bash ./.ci/run_tests.sh "$ENV_DIR" # Tests on 32 bit platform test-32bit: runs-on: ubuntu-latest container: i386/ubuntu:18.04 strategy: matrix: python-version: [2.7, 3.8] extra-args: ["", "--concat"] env: USING_OS_PYTHON: 1 PYTHON_VERSION: ${{ matrix.python-version }} EXTRA_ARGS: ${{ matrix.extra-args }} ENV_DIR: ./test.env steps: - name: Install git run: apt-get update -y && apt-get install -y git - uses: actions/checkout@v1 - name: Install system dependencies run: bash ./.ci/install_32bit_dependencies.sh - name: Create test environment run: bash ./.ci/create_test_env.sh "$ENV_DIR" - name: Log test environment run: | source ./.ci/activate_env.sh "$ENV_DIR" python -V pip freeze - name: Build indexed_gzip run: bash ./.ci/build_dev_indexed_gzip.sh "$ENV_DIR" - name: Run tests run: bash ./.ci/run_tests.sh "$ENV_DIR" indexed_gzip-1.6.4/MANIFEST.in0000644000175000017500000000045714133320576015271 0ustar nileshnileshinclude LICENSE include README.md include conftest.py recursive-include indexed_gzip *.py recursive-include indexed_gzip *.pyx recursive-include indexed_gzip *.pxd recursive-include indexed_gzip *.c recursive-include indexed_gzip *.h