bitshuffle-0.3.5/.gitignore

## C
# Object files
*.o
*.ko
*.obj
*.elf

# Libraries
*.lib
*.a

# Shared objects (inc. Windows DLLs)
*.dll
*.so
*.so.*
*.dylib

# Executables
*.exe
*.out
*.app
*.i*86
*.x86_64
*.hex

## Python
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64
__pycache__

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# Documentation builds
doc/_build
doc/generated

## Editor files and backups.
*.swp
*.swo

# Generated files
bitshuffle/ext.c
bitshuffle/h5.c

bitshuffle-0.3.5/.travis.yml

language: python
os: linux

# To test filter plugins, need hdf5 1.8.11+, present in Trusty but not Precise.
dist: trusty
# Required to get Trusty.
#sudo: true

python:
  - "2.7"
  - "3.4"
  - "3.5"
  - "3.6"

addons:
  apt:
    packages:
      - libhdf5-serial-dev
      - hdf5-tools

install:
  - "pip install -U pip virtualenv"
  # Ensures the system hdf5 headers/libs will be used whatever its version
  - "export HDF5_DIR=/usr/lib"
  - "pip install -r requirements.txt"
  # Installing the plugin to arbitrary directory to check the install script.
  - "python setup.py install --h5plugin --h5plugin-dir ~/hdf5/lib"
  # Ensure it's installable and usable in virtualenv
  - "virtualenv ~/venv"
  - "travis_wait 30 ~/venv/bin/pip -v install --no-binary=h5py ."
  - "~/venv/bin/pip -v install nose"

# Can't be somewhere that has a 'bitshuffle' directory as nose will use that
# copy instead of installed package.
script:
  - "cd ~"
  - "nosetests -v bitshuffle"  # Test the system install
  - "venv/bin/nosetests -v bitshuffle"  # Test the virtualenv install

bitshuffle-0.3.5/LICENSE

Bitshuffle - Filter for improving compression of typed binary data.

Copyright (c) 2014 Kiyoshi Masui (kiyo@physics.ubc.ca)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
bitshuffle-0.3.5/MANIFEST.in

recursive-include src *.h *.c
recursive-include bitshuffle *.pyx
recursive-include lz4 *.h *.c
recursive-include lzf *.h *.c
include setup.cfg.example
include LICENSE
include README.rst
include requirements.txt
exclude setup.cfg

bitshuffle-0.3.5/README.rst

==========
Bitshuffle
==========

Filter for improving compression of typed binary data.

Bitshuffle is an algorithm that rearranges typed, binary data for improving
compression, as well as a python/C package that implements this algorithm
within the Numpy framework.

The library can be used alongside HDF5 to compress and decompress datasets
and is integrated through the `dynamically loaded filters`_ framework.
Bitshuffle is HDF5 filter number ``32008``.

Algorithmically, Bitshuffle is closely related to HDF5's `Shuffle filter`_
except it operates at the bit level instead of the byte level. Arranging a
typed data array into a matrix with the elements as the rows and the bits
within the elements as the columns, Bitshuffle "transposes" the matrix,
such that all the least-significant-bits are in a row, etc. This transpose
is performed within blocks of data roughly 8kB long [1]_.

This does not in itself compress data, only rearranges it for more efficient
compression. To perform the actual compression you will need a compression
library. Bitshuffle has been designed to be well matched to Marc Lehmann's
LZF_ as well as LZ4_. Note that because Bitshuffle modifies the data at the
bit level, sophisticated entropy-reducing compression libraries such as GZIP
and BZIP are unlikely to achieve significantly better compression than
simpler and faster duplicate-string-elimination algorithms such as LZF and
LZ4. Bitshuffle thus includes routines (and HDF5 filter options) to apply
LZ4 compression to each block after shuffling [2]_.

The Bitshuffle algorithm relies on neighbouring elements of a dataset being
highly correlated to improve data compression. Any correlations that span at
least 24 elements of the dataset may be exploited to improve compression.

Bitshuffle was designed with performance in mind. On most machines the time
required for Bitshuffle+LZ4 is insignificant compared to the time required
to read or write the compressed data to disk. Because it is able to exploit
the SSE and AVX instruction sets present on modern Intel and AMD processors,
on these machines compression is only marginally slower than an out-of-cache
memory copy. On modern x86 processors you can expect Bitshuffle to have a
throughput of roughly 1 byte per clock cycle, and on the Haswell generation
of Intel processors (2013) and later, you can expect up to 2 bytes per clock
cycle. In addition, Bitshuffle is parallelized using OpenMP.

As a bonus, Bitshuffle ships with a dynamically loaded version of `h5py`'s
LZF compression filter, such that the filter can be transparently used
outside of python and in command line utilities such as ``h5dump``.

.. [1] Chosen to fit comfortably within L1 cache as well as to be well
   matched to the window of the LZF compression library.

.. [2] Compared to applying Bitshuffle to the full dataset and then applying
   LZ4 compression, this has the tremendous advantage that the block is
   already in the L1 cache.
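For illustration, the whole transpose (ignoring the blocking) can be modelled
in a few lines of NumPy. The sketch below mirrors the pure-python reference
implementation used by this package's unit tests; it is a model of the
rearrangement, not the optimized blocked kernel::

    import numpy as np

    def bitshuffle_model(arr):
        """Unblocked model of the bitshuffle transpose (illustration only)."""
        n, itemsize = arr.size, arr.dtype.itemsize
        # One row per byte, one column per bit.
        bits = np.unpackbits(arr.view(np.uint8)).reshape(n * itemsize, 8)
        # Reverse each byte so column 0 holds the least significant bit,
        # then group all the bits of one element into one row.
        bits = bits[:, ::-1].reshape(n, itemsize * 8)
        # The transpose gathers bit 0 of every element, then bit 1, etc.
        out = bits.T.copy().reshape(n * itemsize, 8)[:, ::-1]
        return np.packbits(out.ravel()).view(arr.dtype)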
.. _`dynamically loaded filters`: http://www.hdfgroup.org/HDF5/doc/Advanced/DynamicallyLoadedFilters/HDF5DynamicallyLoadedFilters.pdf

.. _`Shuffle filter`: http://www.hdfgroup.org/HDF5/doc_resource/H5Shuffle_Perf.pdf

.. _LZF: http://oldhome.schmorp.de/marc/liblzf.html

.. _LZ4: https://code.google.com/p/lz4/


Applications
------------

Bitshuffle might be right for your application if:

- You need to compress typed binary data.
- Your data is arranged such that adjacent elements over the fastest varying
  index of your dataset are similar (highly correlated).
- A special case of the previous point is if you are only exercising a subset
  of the bits in your data-type, as is often true of integer data.
- You need both high compression ratios and high performance.

Comparing Bitshuffle to other compression algorithms and HDF5 filters:

- Bitshuffle is less general than many other compression algorithms. To
  achieve good compression ratios, consecutive elements of your data must be
  highly correlated.
- For the right datasets, Bitshuffle is one of the few compression algorithms
  that promises both high throughput and high compression ratios.
- Bitshuffle should have roughly the same throughput as Shuffle, but may
  obtain higher compression ratios.
- The MAFISC_ filter actually includes something similar to Bitshuffle as one
  of its prefilters. However, MAFISC's emphasis is on obtaining high
  compression ratios at all costs, sacrificing throughput.

.. _MAFISC: http://wr.informatik.uni-hamburg.de/research/projects/icomex/mafisc


Installation for Python
-----------------------

Installation requires python 2.7+ or 3.3+, HDF5 1.8.4 or later, HDF5 for
python (h5py), Numpy and Cython. Bitshuffle must be linked against the same
version of HDF5 as h5py, which in practice means h5py must be built from
source_ rather than pre-built wheels [3]_. To use the dynamically loaded HDF5
filter requires HDF5 1.8.11 or later.

To install::

    python setup.py install [--h5plugin [--h5plugin-dir=spam]]

To get finer control of installation options, including whether to compile
with OpenMP multi-threading, copy the ``setup.cfg.example`` to ``setup.cfg``
and edit the values therein.

If using the dynamically loaded HDF5 filter (which gives you access to the
Bitshuffle and LZF filters outside of python), set the environment variable
``HDF5_PLUGIN_PATH`` to the value of ``--h5plugin-dir`` or use HDF5's default
search location of ``/usr/local/hdf5/lib/plugin``.

If you get an error about missing source files when building the extensions,
try upgrading setuptools. There is a weird bug where setuptools prior to 0.7
doesn't work properly with Cython in some cases.

.. _source: http://docs.h5py.org/en/latest/build.html#source-installation

.. [3] Typically you will be able to install Bitshuffle, but there will be
   errors when creating and reading datasets.
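If setting the variable from within Python, it must be set before `h5py`
first loads the HDF5 library. A minimal sketch, using the default plugin
search location mentioned above (substitute your own ``--h5plugin-dir``)::

    import os
    os.environ["HDF5_PLUGIN_PATH"] = "/usr/local/hdf5/lib/plugin"

    import h5py  # Import h5py only after the variable is set.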
Usage from Python
-----------------

The `bitshuffle` module contains routines for shuffling and unshuffling
Numpy arrays.

If installed with the dynamically loaded filter plugins, Bitshuffle can be
used in conjunction with HDF5 both inside and outside of python, in the same
way as any other filter, simply by specifying the filter number ``32008``.
Otherwise the filter will be available only within python and only after
importing `bitshuffle.h5`.

Reading Bitshuffle encoded datasets will be transparent. The filter can be
added to new datasets either through the `h5py` low level interface or
through the convenience functions provided in `bitshuffle.h5`. See the
docstrings and unit tests for examples. For `h5py` version 2.5.0 and later
Bitshuffle can be added to new datasets through the high level interface, as
in the example below.

Example h5py
------------
::

    import h5py
    import numpy
    import bitshuffle.h5

    print(h5py.__version__)  # >= '2.5.0'

    f = h5py.File(filename, "w")

    # block_size = 0 lets Bitshuffle choose its value
    block_size = 0

    dataset = f.create_dataset(
        "data",
        (100, 100, 100),
        compression=bitshuffle.h5.H5FILTER,
        compression_opts=(block_size, bitshuffle.h5.H5_COMPRESS_LZ4),
        dtype='float32',
        )

    # create some random data
    array = numpy.random.rand(100, 100, 100)
    array = array.astype('float32')

    dataset[:] = array

    f.close()
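Reading the dataset back requires no special options: as long as the filter
is available (via the installed plugin or ``import bitshuffle.h5``),
decompression is transparent. A minimal sketch, continuing the example
above::

    import h5py
    import bitshuffle.h5  # Registers the filter; unneeded with the plugin.

    f = h5py.File(filename, "r")
    array = f["data"][:]  # Decompression happens transparently.
    f.close()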
Usage from C
------------

If you wish to use Bitshuffle in your C program and would prefer not to use
the HDF5 dynamically loaded filter, the C library in the ``src/`` directory
is self-contained and complete.

Usage from Java
---------------

You can use Bitshuffle even in Java: the routines for shuffling and
unshuffling are ported into `snappy-java`_. To use the routines, you need to
add the following dependency to your pom.xml::

    <dependency>
      <groupId>org.xerial.snappy</groupId>
      <artifactId>snappy-java</artifactId>
      <version>1.1.3-M1</version>
    </dependency>

First, import org.xerial.snappy.BitShuffle in your Java code::

    import org.xerial.snappy.BitShuffle;

Then, you use them like this::

    int[] data = new int[] {1, 3, 34, 43, 34};
    byte[] shuffledData = BitShuffle.bitShuffle(data);
    int[] result = BitShuffle.bitUnShuffleIntArray(shuffledData);

.. _`snappy-java`: https://github.com/xerial/snappy-java

Anaconda
--------

The conda package can be built via::

    conda build conda-recipe

For Best Results
----------------

Here are a few tips to help you get the most out of Bitshuffle:

- For multi-dimensional datasets, order your data such that the fastest
  varying dimension is the one over which your data is most correlated (have
  values that change the least), or fake this using chunks.
- To achieve the highest throughput, use a data type that is 64 *bytes* or
  smaller. If you have a very large compound data type, consider adding a
  dimension to your datasets instead.
- To make full use of the SSE2 instruction set, use a data type whose size
  is a multiple of 2 bytes. For the AVX2 instruction set, use a data type
  whose size is a multiple of 4 bytes.

Citing Bitshuffle
-----------------

Bitshuffle was initially described in
http://dx.doi.org/10.1016/j.ascom.2015.07.002, pre-print available at
http://arxiv.org/abs/1503.00638.

bitshuffle-0.3.5/bitshuffle/__init__.py

"""
Filter for improving compression of typed binary data.

Functions
=========

    using_NEON
    using_SSE2
    using_AVX2
    bitshuffle
    bitunshuffle
    compress_lz4
    decompress_lz4

"""

from __future__ import absolute_import

from bitshuffle.ext import (__version__, bitshuffle, bitunshuffle,
                            using_NEON, using_SSE2, using_AVX2,
                            compress_lz4, decompress_lz4)

bitshuffle-0.3.5/bitshuffle/ext.pyx

"""
Wrappers for public and private bitshuffle routines

"""

from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np

cimport numpy as np
cimport cython

np.import_array()

# Repeat each calculation this many times. For timing.
cdef int REPEATC = 1
#cdef int REPEATC = 32

REPEAT = REPEATC

cdef extern from b"bitshuffle.h":
    int bshuf_using_NEON()
    int bshuf_using_SSE2()
    int bshuf_using_AVX2()
    int bshuf_bitshuffle(void *A, void *B, int size, int elem_size,
                         int block_size)
    int bshuf_bitunshuffle(void *A, void *B, int size, int elem_size,
                           int block_size)
    int bshuf_compress_lz4_bound(int size, int elem_size, int block_size)
    int bshuf_compress_lz4(void *A, void *B, int size, int elem_size,
                           int block_size)
    int bshuf_decompress_lz4(void *A, void *B, int size, int elem_size,
                             int block_size)
    int BSHUF_VERSION_MAJOR
    int BSHUF_VERSION_MINOR
    int BSHUF_VERSION_POINT

__version__ = "%d.%d.%d" % (BSHUF_VERSION_MAJOR, BSHUF_VERSION_MINOR,
                            BSHUF_VERSION_POINT)

# Prototypes from bitshuffle.c
cdef extern int bshuf_copy(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_trans_byte_elem_scal(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_trans_byte_elem_SSE(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_trans_byte_elem_NEON(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_trans_bit_byte_scal(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_trans_bit_byte_SSE(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_trans_bit_byte_NEON(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_trans_bit_byte_AVX(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_trans_bitrow_eight(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_trans_bit_elem_AVX(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_trans_bit_elem_SSE(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_trans_bit_elem_NEON(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_trans_bit_elem_scal(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_trans_byte_bitrow_SSE(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_trans_byte_bitrow_NEON(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_trans_byte_bitrow_AVX(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_trans_byte_bitrow_scal(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_shuffle_bit_eightelem_scal(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_shuffle_bit_eightelem_SSE(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_shuffle_bit_eightelem_NEON(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_shuffle_bit_eightelem_AVX(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_untrans_bit_elem_SSE(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_untrans_bit_elem_NEON(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_untrans_bit_elem_AVX(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_untrans_bit_elem_scal(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_trans_bit_elem(void *A, void *B, int size, int elem_size)
cdef extern int bshuf_untrans_bit_elem(void *A, void *B, int size, int elem_size)

ctypedef int (*Cfptr) (void *A, void *B, int size, int elem_size)

def using_NEON():
    """Whether compiled using Arm NEON instructions."""
    if bshuf_using_NEON():
        return True
    else:
        return False

def using_SSE2():
    """Whether compiled using SSE2 instructions."""
    if bshuf_using_SSE2():
        return True
    else:
        return False

def using_AVX2():
    """Whether compiled using AVX2 instructions."""
    if bshuf_using_AVX2():
        return True
    else:
        return False
def _setup_arr(arr):
    shape = tuple(arr.shape)
    if not arr.flags['C_CONTIGUOUS']:
        msg = "Input array must be C-contiguous."
        raise ValueError(msg)
    size = arr.size
    dtype = arr.dtype
    itemsize = dtype.itemsize
    out = np.empty(shape, dtype=dtype)
    return out, size, itemsize

@cython.boundscheck(False)
@cython.wraparound(False)
cdef _wrap_C_fun(Cfptr fun, np.ndarray arr):
    """Wrap a C function with standard call signature."""
    cdef int ii, size, itemsize, count=0
    cdef np.ndarray out
    out, size, itemsize = _setup_arr(arr)

    cdef np.ndarray[dtype=np.uint8_t, ndim=1, mode="c"] arr_flat
    arr_flat = arr.view(np.uint8).ravel()
    cdef np.ndarray[dtype=np.uint8_t, ndim=1, mode="c"] out_flat
    out_flat = out.view(np.uint8).ravel()
    cdef void* arr_ptr = &arr_flat[0]
    cdef void* out_ptr = &out_flat[0]

    for ii in range(REPEATC):
        count = fun(arr_ptr, out_ptr, size, itemsize)
    if count < 0:
        msg = "Failed. Error code %d."
        excp = RuntimeError(msg % count, count)
        raise excp
    return out

def copy(np.ndarray arr not None):
    """Copies the data.

    For testing and profiling purposes.

    """
    return _wrap_C_fun(&bshuf_copy, arr)

def trans_byte_elem_scal(np.ndarray arr not None):
    """Transpose bytes within words but not bits."""
    return _wrap_C_fun(&bshuf_trans_byte_elem_scal, arr)

def trans_byte_elem_SSE(np.ndarray arr not None):
    """Transpose bytes within array elements."""
    return _wrap_C_fun(&bshuf_trans_byte_elem_SSE, arr)

def trans_byte_elem_NEON(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_trans_byte_elem_NEON, arr)

def trans_bit_byte_scal(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_trans_bit_byte_scal, arr)

def trans_bit_byte_SSE(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_trans_bit_byte_SSE, arr)

def trans_bit_byte_NEON(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_trans_bit_byte_NEON, arr)

def trans_bit_byte_AVX(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_trans_bit_byte_AVX, arr)

def trans_bitrow_eight(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_trans_bitrow_eight, arr)

def trans_bit_elem_AVX(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_trans_bit_elem_AVX, arr)

def trans_bit_elem_scal(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_trans_bit_elem_scal, arr)

def trans_bit_elem_SSE(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_trans_bit_elem_SSE, arr)

def trans_bit_elem_NEON(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_trans_bit_elem_NEON, arr)

def trans_byte_bitrow_SSE(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_trans_byte_bitrow_SSE, arr)

def trans_byte_bitrow_NEON(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_trans_byte_bitrow_NEON, arr)

def trans_byte_bitrow_AVX(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_trans_byte_bitrow_AVX, arr)

def trans_byte_bitrow_scal(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_trans_byte_bitrow_scal, arr)

def shuffle_bit_eightelem_scal(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_shuffle_bit_eightelem_scal, arr)

def shuffle_bit_eightelem_SSE(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_shuffle_bit_eightelem_SSE, arr)

def shuffle_bit_eightelem_NEON(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_shuffle_bit_eightelem_NEON, arr)

def shuffle_bit_eightelem_AVX(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_shuffle_bit_eightelem_AVX, arr)

def untrans_bit_elem_SSE(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_untrans_bit_elem_SSE, arr)

def untrans_bit_elem_NEON(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_untrans_bit_elem_NEON, arr)

def untrans_bit_elem_AVX(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_untrans_bit_elem_AVX, arr)
def untrans_bit_elem_scal(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_untrans_bit_elem_scal, arr)

def trans_bit_elem(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_trans_bit_elem, arr)

def untrans_bit_elem(np.ndarray arr not None):
    return _wrap_C_fun(&bshuf_untrans_bit_elem, arr)

@cython.boundscheck(False)
@cython.wraparound(False)
def bitshuffle(np.ndarray arr not None, int block_size=0):
    """Bitshuffle an array.

    Output array is the same shape and data type as input array but
    underlying buffer has been bitshuffled.

    Parameters
    ----------
    arr : numpy array
        Data to be processed.
    block_size : positive integer
        Block size in number of elements. By default, block size is chosen
        automatically.

    Returns
    -------
    out : numpy array
        Array with the same shape as input but underlying data has been
        bitshuffled.

    """
    cdef int ii, size, itemsize, count=0
    cdef np.ndarray out
    out, size, itemsize = _setup_arr(arr)

    cdef np.ndarray[dtype=np.uint8_t, ndim=1, mode="c"] arr_flat
    arr_flat = arr.view(np.uint8).ravel()
    cdef np.ndarray[dtype=np.uint8_t, ndim=1, mode="c"] out_flat
    out_flat = out.view(np.uint8).ravel()
    cdef void* arr_ptr = &arr_flat[0]
    cdef void* out_ptr = &out_flat[0]

    for ii in range(REPEATC):
        count = bshuf_bitshuffle(arr_ptr, out_ptr, size, itemsize,
                                 block_size)
    if count < 0:
        msg = "Failed. Error code %d."
        excp = RuntimeError(msg % count, count)
        raise excp
    return out

@cython.boundscheck(False)
@cython.wraparound(False)
def bitunshuffle(np.ndarray arr not None, int block_size=0):
    """Bitunshuffle an array.

    Output array is the same shape and data type as input array but
    underlying buffer has been un-bitshuffled.

    Parameters
    ----------
    arr : numpy array
        Data to be processed.
    block_size : positive integer
        Block size in number of elements. Must match value used for
        shuffling.

    Returns
    -------
    out : numpy array
        Array with the same shape as input but underlying data has been
        un-bitshuffled.

    """
    cdef int ii, size, itemsize, count=0
    cdef np.ndarray out
    out, size, itemsize = _setup_arr(arr)

    cdef np.ndarray[dtype=np.uint8_t, ndim=1, mode="c"] arr_flat
    arr_flat = arr.view(np.uint8).ravel()
    cdef np.ndarray[dtype=np.uint8_t, ndim=1, mode="c"] out_flat
    out_flat = out.view(np.uint8).ravel()
    cdef void* arr_ptr = &arr_flat[0]
    cdef void* out_ptr = &out_flat[0]

    for ii in range(REPEATC):
        count = bshuf_bitunshuffle(arr_ptr, out_ptr, size, itemsize,
                                   block_size)
    if count < 0:
        msg = "Failed. Error code %d."
        excp = RuntimeError(msg % count, count)
        raise excp
    return out
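# Illustrative round trip through the two functions above; this mirrors the
# circular-consistency check in tests/test_ext.py (array contents and dtype
# here are arbitrary examples):
#
#     >>> import numpy as np
#     >>> import bitshuffle
#     >>> data = np.arange(100, dtype=np.uint16)
#     >>> shuff = bitshuffle.bitshuffle(data)
#     >>> np.all(bitshuffle.bitunshuffle(shuff) == data)
#     True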
@cython.boundscheck(False)
@cython.wraparound(False)
def compress_lz4(np.ndarray arr not None, int block_size=0):
    """Bitshuffle then compress an array using LZ4.

    Parameters
    ----------
    arr : numpy array
        Data to be processed.
    block_size : positive integer
        Block size in number of elements. By default, block size is chosen
        automatically.

    Returns
    -------
    out : array with np.uint8 data type
        Buffer holding compressed data.

    """
    cdef int ii, size, itemsize, count=0
    shape = (arr.shape[i] for i in range(arr.ndim))
    if not arr.flags['C_CONTIGUOUS']:
        msg = "Input array must be C-contiguous."
        raise ValueError(msg)
    size = arr.size
    dtype = arr.dtype
    itemsize = dtype.itemsize

    max_out_size = bshuf_compress_lz4_bound(size, itemsize, block_size)

    cdef np.ndarray out
    out = np.empty(max_out_size, dtype=np.uint8)

    cdef np.ndarray[dtype=np.uint8_t, ndim=1, mode="c"] arr_flat
    arr_flat = arr.view(np.uint8).ravel()
    cdef np.ndarray[dtype=np.uint8_t, ndim=1, mode="c"] out_flat
    out_flat = out.view(np.uint8).ravel()
    cdef void* arr_ptr = &arr_flat[0]
    cdef void* out_ptr = &out_flat[0]

    for ii in range(REPEATC):
        count = bshuf_compress_lz4(arr_ptr, out_ptr, size, itemsize,
                                   block_size)
    if count < 0:
        msg = "Failed. Error code %d."
        excp = RuntimeError(msg % count, count)
        raise excp
    return out[:count]

@cython.boundscheck(False)
@cython.wraparound(False)
def decompress_lz4(np.ndarray arr not None, shape, dtype, int block_size=0):
    """Decompress a buffer using LZ4 then bitunshuffle it yielding an array.

    Parameters
    ----------
    arr : numpy array
        Input data to be decompressed.
    shape : tuple of integers
        Shape of the output (decompressed array). Must match the shape of the
        original data array before compression.
    dtype : numpy dtype
        Datatype of the output array. Must match the data type of the
        original data array before compression.
    block_size : positive integer
        Block size in number of elements. Must match value used for
        compression.

    Returns
    -------
    out : numpy array with shape *shape* and data type *dtype*
        Decompressed data.

    """
    cdef int ii, size, itemsize, count=0
    if not arr.flags['C_CONTIGUOUS']:
        msg = "Input array must be C-contiguous."
        raise ValueError(msg)
    size = np.prod(shape)
    itemsize = dtype.itemsize

    cdef np.ndarray out
    out = np.empty(tuple(shape), dtype=dtype)

    cdef np.ndarray[dtype=np.uint8_t, ndim=1, mode="c"] arr_flat
    arr_flat = arr.view(np.uint8).ravel()
    cdef np.ndarray[dtype=np.uint8_t, ndim=1, mode="c"] out_flat
    out_flat = out.view(np.uint8).ravel()
    cdef void* arr_ptr = &arr_flat[0]
    cdef void* out_ptr = &out_flat[0]

    for ii in range(REPEATC):
        count = bshuf_decompress_lz4(arr_ptr, out_ptr, size, itemsize,
                                     block_size)
    if count < 0:
        msg = "Failed. Error code %d."
        excp = RuntimeError(msg % count, count)
        raise excp
    if count != arr.size:
        msg = "Decompressed different number of bytes than input buffer size."
        msg += " Input buffer %d, decompressed %d." % (arr.size, count)
        raise RuntimeError(msg, count)
    return out
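# A minimal round trip through the two LZ4 routines above, mirroring the
# package's TestBitShuffleCircle unit test (array contents and sizes here are
# arbitrary examples):
#
#     >>> import numpy as np
#     >>> import bitshuffle
#     >>> data = np.arange(1000, dtype=np.float32)
#     >>> buf = bitshuffle.compress_lz4(data)  # 1-D uint8 buffer
#     >>> out = bitshuffle.decompress_lz4(buf, data.shape, data.dtype)
#     >>> np.all(out == data)
#     True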
bitshuffle-0.3.5/bitshuffle/h5.pyx

"""
HDF5 support for Bitshuffle.

To read a dataset that uses the Bitshuffle filter using h5py, simply import
this module (unless you have installed the Bitshuffle dynamically loaded
filter, in which case importing this module is unnecessary).

To create a new dataset that includes the Bitshuffle filter, use one of the
convenience functions provided.


Constants
=========

    H5FILTER : The Bitshuffle HDF5 filter integer identifier.
    H5_COMPRESS_LZ4 : Filter option flag for LZ4 compression.

Functions
=========

    create_dataset
    create_bitshuffle_lzf_dataset
    create_bitshuffle_compressed_dataset

Examples
========

    >>> import numpy as np
    >>> import h5py
    >>> import bitshuffle.h5

    >>> shape = (123, 456)
    >>> chunks = (10, 456)
    >>> dtype = np.float64

    >>> f = h5py.File("tmp_test.h5")
    >>> bitshuffle.h5.create_bitshuffle_compressed_dataset(
    ...     f, "some_data", shape, dtype, chunks)
    >>> f["some_data"][:] = 42

"""

from __future__ import absolute_import, division, print_function, unicode_literals

import numpy
import h5py
from h5py import h5d, h5s, h5t, h5p, filters

cimport cython

cdef extern from b"bshuf_h5filter.h":
    int bshuf_register_h5filter()
    int BSHUF_H5FILTER
    int BSHUF_H5_COMPRESS_LZ4

cdef int LZF_FILTER = 32000

H5FILTER = BSHUF_H5FILTER
H5_COMPRESS_LZ4 = BSHUF_H5_COMPRESS_LZ4

def register_h5_filter():
    ret = bshuf_register_h5filter()
    if ret < 0:
        raise RuntimeError("Failed to register bitshuffle HDF5 filter.", ret)

register_h5_filter()

def create_dataset(parent, name, shape, dtype, chunks=None, maxshape=None,
                   fillvalue=None, track_times=None,
                   filter_pipeline=(), filter_flags=None, filter_opts=None):
    """Create a dataset with an arbitrary filter pipeline.

    Return a new low-level dataset identifier.

    Much of this code is copied from h5py, but couldn't reuse much code due
    to unstable API.

    """

    if hasattr(filter_pipeline, "__getitem__"):
        filter_pipeline = list(filter_pipeline)
    else:
        filter_pipeline = [filter_pipeline]
        filter_flags = [filter_flags]
        filter_opts = [filter_opts]
    nfilters = len(filter_pipeline)
    if filter_flags is None:
        filter_flags = [None] * nfilters
    if filter_opts is None:
        filter_opts = [None] * nfilters
    if not len(filter_flags) == nfilters or not len(filter_opts) == nfilters:
        msg = "Supplied incompatible number of filters, flags, and options."
        raise ValueError(msg)

    shape = tuple(shape)
    tmp_shape = maxshape if maxshape is not None else shape

    # Validate chunk shape
    if isinstance(chunks, tuple):
        chunks_larger = (numpy.array([not i >= j for i, j in
                                      zip(tmp_shape, chunks)
                                      if i is not None])).any()
        if chunks_larger:
            errmsg = ("Chunk shape must not be greater than data shape in "
                      "any dimension. {} is not compatible with {}"
                      .format(chunks, shape))
            raise ValueError(errmsg)
    if isinstance(dtype, h5py.Datatype):
        # Named types are used as-is
        tid = dtype.id
        dtype = tid.dtype  # Following code needs this
    else:
        # Validate dtype
        dtype = numpy.dtype(dtype)
        tid = h5t.py_create(dtype, logical=1)

    if shape == ():
        if any((chunks, filter_pipeline)):
            raise TypeError("Scalar datasets don't support chunk/filter options")
        if maxshape and maxshape != ():
            raise TypeError("Scalar datasets cannot be extended")
        return h5p.create(h5p.DATASET_CREATE)

    def rq_tuple(tpl, name):
        """Check if chunks/maxshape match dataset rank"""
        if tpl in (None, True):
            return
        try:
            tpl = tuple(tpl)
        except TypeError:
            raise TypeError('"%s" argument must be None or a sequence object' % name)
        if len(tpl) != len(shape):
            raise ValueError('"%s" must have same rank as dataset shape' % name)

    rq_tuple(chunks, 'chunks')
    rq_tuple(maxshape, 'maxshape')

    if (chunks is True) or (chunks is None and filter_pipeline):
        chunks = filters.guess_chunk(shape, maxshape, dtype.itemsize)

    if maxshape is True:
        maxshape = (None,) * len(shape)

    dcpl = h5p.create(h5p.DATASET_CREATE)
    if chunks is not None:
        dcpl.set_chunk(chunks)
        dcpl.set_fill_time(h5d.FILL_TIME_ALLOC)  # prevent resize glitch

    if fillvalue is not None:
        fillvalue = numpy.array(fillvalue)
        dcpl.set_fill_value(fillvalue)

    if track_times in (True, False):
        dcpl.set_obj_track_times(track_times)
    elif track_times is not None:
        raise TypeError("track_times must be either True or False")

    for ii in range(nfilters):
        this_filter = filter_pipeline[ii]
        this_flags = filter_flags[ii]
        this_opts = filter_opts[ii]
        if this_flags is None:
            this_flags = 0
        if this_opts is None:
            this_opts = ()
        dcpl.set_filter(this_filter, this_flags, this_opts)

    if maxshape is not None:
        maxshape = tuple(m if m is not None else h5s.UNLIMITED
                         for m in maxshape)
    sid = h5s.create_simple(shape, maxshape)

    dset_id = h5d.create(parent.id, name, tid, sid, dcpl=dcpl)

    return dset_id

def create_bitshuffle_lzf_dataset(parent, name, shape, dtype, chunks=None,
                                  maxshape=None, fillvalue=None,
                                  track_times=None):
    """Create dataset with a filter pipeline including bitshuffle and LZF"""
    filter_pipeline = [H5FILTER, LZF_FILTER]
    dset_id = create_dataset(parent, name, shape, dtype, chunks=chunks,
                             filter_pipeline=filter_pipeline,
                             maxshape=maxshape, fillvalue=fillvalue,
                             track_times=track_times)
    return dset_id

def create_bitshuffle_compressed_dataset(parent, name, shape, dtype,
                                         chunks=None, maxshape=None,
                                         fillvalue=None, track_times=None):
    """Create dataset with bitshuffle+internal LZ4 compression."""
    filter_pipeline = [H5FILTER, ]
    filter_opts = [(0, H5_COMPRESS_LZ4)]
    dset_id = create_dataset(parent, name, shape, dtype, chunks=chunks,
                             filter_pipeline=filter_pipeline,
                             filter_opts=filter_opts, maxshape=maxshape,
                             fillvalue=fillvalue, track_times=track_times)
    return dset_id
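# Example of the low-level pipeline interface defined above (this mirrors
# tests/test_h5filter.py; the file name and sizes are arbitrary examples):
#
#     >>> import numpy as np
#     >>> import h5py
#     >>> from h5py import h5z
#     >>> from bitshuffle import h5
#     >>> f = h5py.File("tmp.h5")
#     >>> h5.create_dataset(f, b"range", (1000,), np.int64, (100,),
#     ...                   filter_pipeline=(32008, 32000),
#     ...                   filter_flags=(h5z.FLAG_MANDATORY,
#     ...                                 h5z.FLAG_MANDATORY))
#     >>> f["range"][:] = np.arange(1000)
#     >>> f.close()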
bitshuffle-0.3.5/bitshuffle/tests/__init__.py
(empty file)

bitshuffle-0.3.5/bitshuffle/tests/data/regression_0.1.3.h5
[Binary HDF5 regression-test file produced with Bitshuffle 0.1.3, holding
"compressed" and "origional" (sic) datasets attributed "bitshuffle; see
https://github.com/kiyo-masui/bitshuffle"; binary contents omitted.]
bitshuffle-0.3.5/bitshuffle/tests/test_ext.py
[The file header and opening lines were lost in the damaged binary block
above; the recovered text resumes inside a test class's tearDown, in the
except clause of its timing loop.]

        except RuntimeError as err:
            if (len(err.args) > 1 and (err.args[1] == -11)
                    and not ext.using_SSE2()):
                return
            if (len(err.args) > 1 and (err.args[1] == -12)
                    and not ext.using_AVX2()):
                return
            else:
                raise
        delta_t = min(delta_ts)
        size_i = self.data.size * self.data.dtype.itemsize
        size_o = out.size * out.dtype.itemsize
        size = max([size_i, size_o])
        speed = (ext.REPEAT * size / delta_t / 1024**3)  # GB/s
        if TIME:
            print("%-20s: %5.2f s/GB, %5.2f GB/s"
                  % (self.case, 1. / speed, speed))
        if not self.check is None:
            ans = self.check(self.data).view(np.uint8)
            self.assertTrue(np.all(ans == out.view(np.uint8)))
        if not self.check_data is None:
            ans = self.check_data.view(np.uint8)
            self.assertTrue(np.all(ans == out.view(np.uint8)))

    def test_00_copy(self):
        self.case = "copy"
        self.fun = ext.copy
        self.check = lambda x: x

    def test_01a_trans_byte_elem_scal_16(self):
        self.case = "byte T elem scal 16"
        self.data = self.data.view(np.int16)
        self.fun = ext.trans_byte_elem_scal
        self.check = trans_byte_elem

    def test_01b_trans_byte_elem_scal_32(self):
        self.case = "byte T elem scal 32"
        self.data = self.data.view(np.int32)
        self.fun = ext.trans_byte_elem_scal
        self.check = trans_byte_elem

    def test_01c_trans_byte_elem_scal_64(self):
        self.case = "byte T elem scal 64"
        self.data = self.data.view(np.int64)
        self.fun = ext.trans_byte_elem_scal
        self.check = trans_byte_elem

    def test_01d_trans_byte_elem_16(self):
        self.case = "byte T elem SSE 16"
        self.data = self.data.view(np.int16)
        self.fun = ext.trans_byte_elem_SSE
        self.check = trans_byte_elem

    def test_01e_trans_byte_elem_32(self):
        self.case = "byte T elem SSE 32"
        self.data = self.data.view(np.float32)
        self.fun = ext.trans_byte_elem_SSE
        self.check = trans_byte_elem

    def test_01f_trans_byte_elem_64(self):
        self.case = "byte T elem SSE 64"
        self.data = self.data.view(np.float64)
        self.fun = ext.trans_byte_elem_SSE
        self.check = trans_byte_elem

    def test_01g_trans_byte_elem_128(self):
        self.case = "byte T elem SSE 128"
        self.data = self.data.view(np.complex128)
        self.fun = ext.trans_byte_elem_SSE
        self.check = trans_byte_elem

    def test_01h_trans_byte_elem_96(self):
        self.case = "byte T elem SSE 96"
        n = self.data.size // 128 * 96
        dt = np.dtype([(str('a'), np.int32), (str('b'), np.int32),
                       (str('c'), np.int32)])
        self.data = self.data[:n].view(dt)
        self.fun = ext.trans_byte_elem_SSE
        self.check = trans_byte_elem

    def test_01i_trans_byte_elem_80(self):
        self.case = "byte T elem SSE 80"
        n = self.data.size // 128 * 80
        dt = np.dtype([(str('a'), np.int16), (str('b'), np.int16),
                       (str('c'), np.int16), (str('d'), np.int16),
                       (str('e'), np.int16)])
        self.data = self.data[:n].view(dt)
        self.fun = ext.trans_byte_elem_SSE
        self.check = trans_byte_elem

    def test_03a_trans_bit_byte(self):
        self.case = "bit T byte scal 64"
        self.data = self.data.view(np.float64)
        self.fun = ext.trans_bit_byte_scal
        self.check = trans_bit_byte

    def test_03d_trans_bit_byte_SSE(self):
        self.case = "bit T byte SSE 64"
        self.data = self.data.view(np.float64)
        self.fun = ext.trans_bit_byte_SSE
        self.check = trans_bit_byte

    def test_03f_trans_bit_byte_AVX(self):
        self.case = "bit T byte AVX 64"
        self.data = self.data.view(np.float64)
        self.fun = ext.trans_bit_byte_AVX
        self.check = trans_bit_byte

    def test_03g_trans_bit_byte_AVX_32(self):
        self.case = "bit T byte AVX 32"
        self.data = self.data.view(np.float32)
        self.fun = ext.trans_bit_byte_AVX
        self.check = trans_bit_byte

    def test_04a_trans_bit_elem_AVX(self):
        self.case = "bit T elem AVX 64"
        self.data = self.data.view(np.float64)
        self.fun = ext.trans_bit_elem_AVX
        self.check = trans_bit_elem
    def test_04b_trans_bit_elem_AVX_128(self):
        self.case = "bit T elem AVX 128"
        self.data = self.data.view(np.complex128)
        self.fun = ext.trans_bit_elem_AVX
        self.check = trans_bit_elem

    def test_04c_trans_bit_elem_AVX_32(self):
        self.case = "bit T elem AVX 32"
        self.data = self.data.view(np.float32)
        self.fun = ext.trans_bit_elem_AVX
        self.check = trans_bit_elem

    def test_04d_trans_bit_elem_AVX_16(self):
        self.case = "bit T elem AVX 16"
        self.data = self.data.view(np.int16)
        self.fun = ext.trans_bit_elem_AVX
        self.check = trans_bit_elem

    def test_04e_trans_bit_elem_64(self):
        self.case = "bit T elem scal 64"
        self.data = self.data.view(np.float64)
        self.fun = ext.trans_bit_elem_scal
        self.check = trans_bit_elem

    def test_04f_trans_bit_elem_SSE_32(self):
        self.case = "bit T elem SSE 32"
        self.data = self.data.view(np.float32)
        self.fun = ext.trans_bit_elem_SSE
        self.check = trans_bit_elem

    def test_04g_trans_bit_elem_SSE_64(self):
        self.case = "bit T elem SSE 64"
        self.data = self.data.view(np.float64)
        self.fun = ext.trans_bit_elem_SSE
        self.check = trans_bit_elem

    def test_06a_untrans_bit_elem_16(self):
        self.case = "bit U elem SSE 16"
        pre_trans = self.data.view(np.int16)
        self.data = trans_bit_elem(pre_trans)
        self.fun = ext.untrans_bit_elem_SSE
        self.check_data = pre_trans

    def test_06b_untrans_bit_elem_128(self):
        self.case = "bit U elem SSE 128"
        pre_trans = self.data.view(np.complex128)
        self.data = trans_bit_elem(pre_trans)
        self.fun = ext.untrans_bit_elem_SSE
        self.check_data = pre_trans

    def test_06c_untrans_bit_elem_32(self):
        self.case = "bit U elem SSE 32"
        pre_trans = self.data.view(np.float32)
        self.data = trans_bit_elem(pre_trans)
        self.fun = ext.untrans_bit_elem_SSE
        self.check_data = pre_trans

    def test_06d_untrans_bit_elem_32(self):
        self.case = "bit U elem AVX 32"
        pre_trans = self.data.view(np.float32)
        self.data = trans_bit_elem(pre_trans)
        self.fun = ext.untrans_bit_elem_AVX
        self.check_data = pre_trans

    def test_06e_untrans_bit_elem_64(self):
        self.case = "bit U elem SSE 64"
        pre_trans = self.data.view(np.float64)
        self.data = trans_bit_elem(pre_trans)
        self.fun = ext.untrans_bit_elem_SSE
        self.check_data = pre_trans

    def test_06f_untrans_bit_elem_64(self):
        self.case = "bit U elem AVX 64"
        pre_trans = self.data.view(np.float64)
        self.data = trans_bit_elem(pre_trans)
        self.fun = ext.untrans_bit_elem_AVX
        self.check_data = pre_trans

    def test_06g_untrans_bit_elem_64(self):
        self.case = "bit U elem scal 64"
        pre_trans = self.data.view(np.float64)
        self.data = trans_bit_elem(pre_trans)
        self.fun = ext.untrans_bit_elem_scal
        self.check_data = pre_trans

    def test_07a_trans_byte_bitrow_64(self):
        self.case = "byte T row scal 64"
        self.data = self.data.view(np.float64)
        self.fun = ext.trans_byte_bitrow_scal

    def test_07b_trans_byte_bitrow_SSE_64(self):
        self.case = "byte T row SSE 64"
        self.data = self.data.view(np.float64)
        self.fun = ext.trans_byte_bitrow_SSE
        self.check = ext.trans_byte_bitrow_scal

    def test_07c_trans_byte_bitrow_AVX_64(self):
        self.case = "byte T row AVX 64"
        self.data = self.data.view(np.float64)
        self.fun = ext.trans_byte_bitrow_AVX
        self.check = ext.trans_byte_bitrow_scal

    def test_08a_shuffle_bit_eight_scal_64(self):
        self.case = "bit S eight scal 64"
        self.data = self.data.view(np.float64)
        self.fun = ext.shuffle_bit_eightelem_scal

    def test_08b_shuffle_bit_eight_SSE_64(self):
        self.case = "bit S eight SSE 64"
        self.data = self.data.view(np.float64)
        self.fun = ext.shuffle_bit_eightelem_SSE
        self.check = ext.shuffle_bit_eightelem_scal
    def test_08c_shuffle_bit_eight_AVX_32(self):
        self.case = "bit S eight AVX 32"
        self.data = self.data.view(np.float32)
        self.fun = ext.shuffle_bit_eightelem_AVX
        self.check = ext.shuffle_bit_eightelem_scal

    def test_08d_shuffle_bit_eight_AVX_64(self):
        self.case = "bit S eight AVX 64"
        self.data = self.data.view(np.float64)
        self.fun = ext.shuffle_bit_eightelem_AVX
        self.check = ext.shuffle_bit_eightelem_scal

    def test_08e_shuffle_bit_eight_AVX_16(self):
        self.case = "bit S eight AVX 16"
        self.data = self.data.view(np.int16)
        self.fun = ext.shuffle_bit_eightelem_AVX
        self.check = ext.shuffle_bit_eightelem_scal

    def test_08f_shuffle_bit_eight_AVX_128(self):
        self.case = "bit S eight AVX 128"
        self.data = self.data.view(np.complex128)
        self.fun = ext.shuffle_bit_eightelem_AVX
        self.check = ext.shuffle_bit_eightelem_scal

    def test_09a_trans_bit_elem_scal_64(self):
        self.case = "bit T elem scal 64"
        self.data = self.data.view(np.float64)
        self.fun = ext.trans_bit_elem_scal
        self.check = trans_bit_elem

    def test_09b_trans_bit_elem_SSE_64(self):
        self.case = "bit T elem SSE 64"
        self.data = self.data.view(np.float64)
        self.fun = ext.trans_bit_elem_SSE
        self.check = trans_bit_elem

    def test_09c_trans_bit_elem_AVX_64(self):
        self.case = "bit T elem AVX 64"
        self.data = self.data.view(np.float64)
        self.fun = ext.trans_bit_elem_AVX
        self.check = trans_bit_elem

    def test_09d_untrans_bit_elem_scal_64(self):
        self.case = "bit U elem scal 64"
        pre_trans = self.data.view(np.float64)
        self.data = trans_bit_elem(pre_trans)
        self.fun = ext.untrans_bit_elem_scal
        self.check_data = pre_trans

    def test_09e_untrans_bit_elem_SSE_64(self):
        self.case = "bit U elem SSE 64"
        pre_trans = self.data.view(np.float64)
        self.data = trans_bit_elem(pre_trans)
        self.fun = ext.untrans_bit_elem_SSE
        self.check_data = pre_trans

    def test_09f_untrans_bit_elem_AVX_64(self):
        self.case = "bit U elem AVX 64"
        pre_trans = self.data.view(np.float64)
        self.data = trans_bit_elem(pre_trans)
        self.fun = ext.untrans_bit_elem_AVX
        self.check_data = pre_trans

    def test_10a_bitshuffle_64(self):
        self.case = "bitshuffle 64"
        self.data = self.data.view(np.float64)
        self.fun = lambda x: ext.bitshuffle(x, BLOCK)

    def test_10b_bitunshuffle_64(self):
        self.case = "bitunshuffle 64"
        pre_trans = self.data.view(np.float64)
        self.data = ext.bitshuffle(pre_trans, BLOCK)
        self.fun = lambda x: ext.bitunshuffle(x, BLOCK)
        self.check_data = pre_trans

    def test_10c_compress_64(self):
        self.case = "compress 64"
        self.data = self.data.view(np.float64)
        self.fun = lambda x: ext.compress_lz4(x, BLOCK)

    def test_10d_decompress_64(self):
        self.case = "decompress 64"
        pre_trans = self.data.view(np.float64)
        self.data = ext.compress_lz4(pre_trans, BLOCK)
        self.fun = lambda x: ext.decompress_lz4(x, pre_trans.shape,
                                                pre_trans.dtype, BLOCK)
        self.check_data = pre_trans


"""
Commented out to prevent nose from finding them.
class TestDevCases(unittest.TestCase):

    def deactivated_test_trans_byte_bitrow_AVX(self):
        d = np.arange(256, dtype=np.uint32)
        #d = ext.trans_bit_elem(d)
        t = ext.trans_byte_bitrow_AVX(d).view(np.uint8)
        t1 = ext.trans_byte_bitrow_SSE(d).view(np.uint8)
        t.shape = (32, 32)
        t1.shape = (32, 32)
        #print t[:20,:18]
        self.assertTrue(np.all(t == t1))

    def deactivated_test_untrans_bit_elem(self):
        d = np.arange(32, dtype=np.uint16)
        #d = random.randint(0, 2**7, 256).astype(np.uint16)
        d1 = ext.trans_bit_elem(d)
        #print d
        t = ext.untrans_bit_elem_AVX(d1)
        #t1 = ext.untrans_bit_byte_scal(d1)
        #print np.reshape(d1.view(np.uint8), (16, 4))
        #print np.reshape(t1.view(np.uint8), (2, 32))
        #print np.reshape(t2.view(np.uint8), (32, 2))
        #print np.reshape(t.view(np.uint8), (32, 2))

    def deactivated_test_trans_bit_byte(self):
        d = np.arange(16, dtype=np.uint16)
        t = ext.trans_bit_byte_scal(d)
        #print t
        t1 = trans_bit_byte(d)
        #print t1
        self.assertTrue(np.all(t == t1))

    def deactivated_test_trans_byte_bitrow_SSE(self):
        d = np.arange(256, dtype=np.uint8)
        t = ext.trans_byte_bitrow_scal(d)
        #print np.reshape(t, (32, 8))
        t1 = ext.trans_byte_bitrow_SSE(d)
        #print np.reshape(t1, (32, 8))
        self.assertTrue(np.all(t == t1))

    def deactivated_test_trans_byte_elem_SSE(self):
        d = np.empty(16, dtype=([('a', 'u4'), ('b', 'u4'), ('c', 'u4')]))
        d['a'] = np.arange(16) * 1
        d['b'] = np.arange(16) * 2
        d['c'] = np.arange(16) * 3
        #print d.dtype.itemsize
        #print np.reshape(d.view(np.uint8), (16, 12))
        t1 = ext.trans_byte_elem_SSE(d)
        #print np.reshape(t1.view(np.uint8), (12, 16))
        t0 = trans_byte_elem(d)
        #print np.reshape(t0.view(np.uint8), (12, 16))
        self.assertTrue(np.all(t0.view(np.uint8) == t1.view(np.uint8)))

    def deactivated_test_bitshuffle(self):
        d = np.arange(128, dtype=np.uint16)
        t1 = ext.bitshuffle(d)
        #print t1
        t2 = ext.bitunshuffle(t1)
        #print t2
        self.assertTrue(np.all(t2.view(np.uint8) == d.view(np.uint8)))
"""


class TestOddLengths(unittest.TestCase):

    def setUp(self):
        self.reps = 10
        self.nmax = 128 * 8
        #self.nmax = 4 * 8  # XXX
        self.fun = ext.copy
        self.check = lambda x: x

    def test_trans_bit_elem_SSE(self):
        self.fun = ext.trans_bit_elem_SSE
        self.check = trans_bit_elem

    def test_untrans_bit_elem_SSE(self):
        self.fun = lambda x: ext.untrans_bit_elem_SSE(ext.trans_bit_elem(x))
        self.check = lambda x: x

    def test_trans_bit_elem_AVX(self):
        self.fun = ext.trans_bit_elem_AVX
        self.check = trans_bit_elem

    def test_untrans_bit_elem_AVX(self):
        self.fun = lambda x: ext.untrans_bit_elem_SSE(ext.trans_bit_elem(x))
        self.check = lambda x: x

    def test_trans_bit_elem_scal(self):
        self.fun = ext.trans_bit_elem_scal
        self.check = trans_bit_elem

    def test_untrans_bit_elem_scal(self):
        self.fun = lambda x: ext.untrans_bit_elem_scal(ext.trans_bit_elem(x))
        self.check = lambda x: x

    def test_trans_byte_elem_SSE(self):
        self.fun = ext.trans_byte_elem_SSE
        self.check = trans_byte_elem

    def tearDown(self):
        try:
            for dtype in TEST_DTYPES:
                itemsize = np.dtype(dtype).itemsize
                nbyte_max = self.nmax * itemsize
                dbuf = random.randint(0, 255, nbyte_max).astype(np.uint8)
                dbuf = dbuf.view(dtype)
                for ii in range(self.reps):
                    n = random.randint(0, self.nmax // 8, 1)[0] * 8
                    data = dbuf[:n]
                    out = self.fun(data).view(np.uint8)
                    ans = self.check(data).view(np.uint8)
                    self.assertTrue(np.all(out == ans))
        except RuntimeError as err:
            if (len(err.args) > 1 and (err.args[1] == -11)
                    and not ext.using_SSE2()):
                return
            if (len(err.args) > 1 and (err.args[1] == -12)
                    and not ext.using_AVX2()):
                return
            else:
                raise
type and any length buffer.""" def test_circle(self): nmax = 100000 reps = 20 for dtype in TEST_DTYPES: itemsize = np.dtype(dtype).itemsize nbyte_max = nmax * itemsize dbuf = random.randint(0, 255, nbyte_max).astype(np.uint8) dbuf = dbuf.view(dtype) for ii in range(reps): n = random.randint(0, nmax, 1)[0] data = dbuf[:n] shuff = ext.bitshuffle(data) out = ext.bitunshuffle(shuff) self.assertTrue(out.dtype is data.dtype) self.assertTrue(np.all(data.view(np.uint8) == out.view(np.uint8))) def test_circle_with_compression(self): nmax = 100000 reps = 20 for dtype in TEST_DTYPES: itemsize = np.dtype(dtype).itemsize nbyte_max = nmax * itemsize dbuf = random.randint(0, 255, nbyte_max).astype(np.uint8) dbuf = dbuf.view(dtype) for ii in range(reps): n = random.randint(0, nmax, 1)[0] data = dbuf[:n] shuff = ext.compress_lz4(data) out = ext.decompress_lz4(shuff, data.shape, data.dtype) self.assertTrue(out.dtype is data.dtype) self.assertTrue(np.all(data.view(np.uint8) == out.view(np.uint8))) # Python implementations for checking results. def trans_byte_elem(arr): dtype = arr.dtype itemsize = dtype.itemsize in_buf = arr.flat[:].view(np.uint8) nelem = in_buf.size // itemsize in_buf.shape = (nelem, itemsize) out_buf = np.empty((itemsize, nelem), dtype=np.uint8) for ii in range(nelem): for jj in range(itemsize): out_buf[jj,ii] = in_buf[ii,jj] return out_buf.flat[:].view(dtype) def trans_bit_byte(arr): n = arr.size dtype = arr.dtype itemsize = dtype.itemsize bits = np.unpackbits(arr.view(np.uint8)) bits.shape = (n * itemsize, 8) # We have to reverse the order of the bits both for unpacking and packing, # since we want to call the least significant bit the first bit. bits = bits[:,::-1] bits_shuff = (bits.T).copy() bits_shuff.shape = (n * itemsize, 8) bits_shuff = bits_shuff[:,::-1] arr_bt = np.packbits(bits_shuff.flat[:]) return arr_bt.view(dtype) def trans_bit_elem(arr): n = arr.size dtype = arr.dtype itemsize = dtype.itemsize bits = np.unpackbits(arr.view(np.uint8)) bits.shape = (n * itemsize, 8) # We have to reverse the order of the bits both for unpacking and packing, # since we want to call the least significant bit the first bit. 
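    # For example, np.unpackbits(np.array([1], np.uint8)) gives
    # [0 0 0 0 0 0 0 1], most significant bit first, so the [:,::-1] flips
    # here put the least significant bit in column 0.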
bits = bits[:,::-1].copy() bits.shape = (n, itemsize * 8) bits_shuff = (bits.T).copy() bits_shuff.shape = (n * itemsize, 8) bits_shuff = bits_shuff[:,::-1] arr_bt = np.packbits(bits_shuff.flat[:]) return arr_bt.view(dtype) if __name__ == "__main__": unittest.main() bitshuffle-0.3.5/bitshuffle/tests/test_h5filter.py000066400000000000000000000047161337005776700223310ustar00rootroot00000000000000from __future__ import absolute_import, division, print_function, unicode_literals import unittest import os import glob import numpy as np import h5py from h5py import h5f, h5d, h5z, h5t, h5s, filters from subprocess import Popen, PIPE, STDOUT from bitshuffle import h5 os.environ["HDF5_PLUGIN_PATH"] = "" class TestFilter(unittest.TestCase): def test_filter(self): shape = (32 * 1024 + 783,) chunks = (4 * 1024 + 23,) dtype = np.int64 data = np.arange(shape[0]) fname = "tmp_test_filters.h5" f = h5py.File(fname) h5.create_dataset(f, b"range", shape, dtype, chunks, filter_pipeline=(32008, 32000), filter_flags=(h5z.FLAG_MANDATORY, h5z.FLAG_MANDATORY), filter_opts=None) f["range"][:] = data f.close() f = h5py.File(fname, 'r') d = f['range'][:] self.assertTrue(np.all(d == data)) f.close() def test_with_block_size(self): shape = (128 * 1024 + 783,) chunks = (4 * 1024 + 23,) dtype = np.int64 data = np.arange(shape[0]) fname = "tmp_test_filters.h5" f = h5py.File(fname) h5.create_dataset(f, b"range", shape, dtype, chunks, filter_pipeline=(32008, 32000), filter_flags=(h5z.FLAG_MANDATORY, h5z.FLAG_MANDATORY), filter_opts=((680,), ()), ) f["range"][:] = data f.close() #os.system('h5dump -H -p tmp_test_filters.h5') f = h5py.File(fname, 'r') d = f['range'][:] self.assertTrue(np.all(d == data)) f.close() def test_with_compression(self): shape = (128 * 1024 + 783,) chunks = (4 * 1024 + 23,) dtype = np.int64 data = np.arange(shape[0]) fname = "tmp_test_filters.h5" f = h5py.File(fname) h5.create_dataset(f, b"range", shape, dtype, chunks, filter_pipeline=(32008,), filter_flags=(h5z.FLAG_MANDATORY,), filter_opts=((0, h5.H5_COMPRESS_LZ4),), ) f["range"][:] = data f.close() #os.system('h5dump -H -p tmp_test_filters.h5') f = h5py.File(fname, 'r') d = f['range'][:] self.assertTrue(np.all(d == data)) f.close() def tearDown(self): files = glob.glob("tmp_test_*") for f in files: os.remove(f) if __name__ == "__main__": unittest.main() bitshuffle-0.3.5/bitshuffle/tests/test_h5plugin.py000066400000000000000000000046101337005776700223330ustar00rootroot00000000000000from __future__ import absolute_import, division, print_function, unicode_literals import unittest import os, os.path import glob import numpy as np import h5py from h5py import h5f, h5d, h5z, h5t, h5s, filters from subprocess import Popen, PIPE, STDOUT import bitshuffle plugin_dir = os.path.join(os.path.dirname(bitshuffle.__file__), 'plugin') os.environ["HDF5_PLUGIN_PATH"] = plugin_dir H5VERSION = h5py.h5.get_libversion() if (H5VERSION[0] < 1 or (H5VERSION[0] == 1 and (H5VERSION[1] < 8 or (H5VERSION[1] == 8 and H5VERSION[2] < 11)))): H51811P = False else: H51811P = True class TestFilterPlugins(unittest.TestCase): def test_plugins(self): if not H51811P: return shape = (32 * 1024,) chunks = (4 * 1024,) dtype = np.int64 data = np.arange(shape[0]) fname = "tmp_test_filters.h5" f = h5py.File(fname) tid = h5t.py_create(dtype, logical=1) sid = h5s.create_simple(shape, shape) # Different API's for different h5py versions. 
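        # (filters.generate_dcpl is a private h5py helper whose positional
        # signature differs between h5py releases; the TypeError fallback
        # below simply retries with the shorter argument list.)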
try: dcpl = filters.generate_dcpl(shape, dtype, chunks, None, None, None, None, None, None) except TypeError: dcpl = filters.generate_dcpl(shape, dtype, chunks, None, None, None, None, None) dcpl.set_filter(32008, h5z.FLAG_MANDATORY) dcpl.set_filter(32000, h5z.FLAG_MANDATORY) dset_id = h5d.create(f.id, b"range", tid, sid, dcpl=dcpl) dset_id.write(h5s.ALL, h5s.ALL, data) f.close() # Make sure the filters are working outside of h5py by calling h5dump h5dump = Popen(['h5dump', fname], stdout=PIPE, stderr=STDOUT) stdout, nothing = h5dump.communicate() err = h5dump.returncode self.assertEqual(err, 0) f = h5py.File(fname, 'r') d = f['range'][:] self.assertTrue(np.all(d == data)) f.close() #def test_h5py_hl(self): # if not H51811P: # return # # Does not appear to be supported by h5py. # fname = "tmp_test_h5py_hl.h5" # f = h5py.File(fname) # f.create_dataset("range", np.arange(1024, dtype=np.int64), # compression=32008) def tearDown(self): files = glob.glob("tmp_test_*") for f in files: os.remove(f) if __name__ == "__main__": unittest.main() bitshuffle-0.3.5/bitshuffle/tests/test_regression.py000066400000000000000000000016001337005776700227540ustar00rootroot00000000000000""" Test that data encoded with earlier versions can still be decoded correctly. """ from __future__ import absolute_import, division, print_function import unittest from os import path import numpy as np import h5py import bitshuffle from bitshuffle import h5 TEST_DATA_DIR = path.dirname(bitshuffle.__file__) + "/tests/data" OUT_FILE_TEMPLATE = TEST_DATA_DIR + "/regression_%s.h5" VERSIONS = ["0.1.3",] class TestAll(unittest.TestCase): def test_regression(self): for version in VERSIONS: file_name = OUT_FILE_TEMPLATE % version f = h5py.File(file_name) g_orig = f["origional"] g_comp = f["compressed"] for dset_name in g_comp.keys(): self.assertTrue(np.all(g_comp[dset_name][:] == g_orig[dset_name][:])) if __name__ == "__main__": unittest.main() bitshuffle-0.3.5/conda-recipe/000077500000000000000000000000001337005776700162225ustar00rootroot00000000000000bitshuffle-0.3.5/conda-recipe/bld.bat000066400000000000000000000001131337005776700174460ustar00rootroot00000000000000SET CONDA_HOME=%PREFIX% "%PYTHON%" setup.py install if errorlevel 1 exit 1 bitshuffle-0.3.5/conda-recipe/build.sh000066400000000000000000000001361337005776700176550ustar00rootroot00000000000000export CONDA_HOME=$PREFIX $PYTHON setup.py install # Python command to install the script bitshuffle-0.3.5/conda-recipe/meta.yaml000066400000000000000000000007671337005776700200460ustar00rootroot00000000000000package: name: bitshuffle version: 0.2.1 source: # git_url: https://github.com/kiyo-masui/bitshuffle.git # git_rev: 0.2.1 path: .. patches: - setup.py.patch requirements: build: - python - setuptools - cython - numpy - h5py - hdf5 run: - python - numpy - h5py - cython about: home: https://github.com/kiyo-masui/bitshuffle/blob/master/setup.py summary: "bitshuffle library." bitshuffle-0.3.5/conda-recipe/setup.py.patch000066400000000000000000000007671337005776700210440ustar00rootroot00000000000000--- setup.py 2016-01-19 16:56:12.954563000 +0100 +++ xxx.py 2016-01-19 16:56:00.817087000 +0100 @@ -40,8 +40,8 @@ # Copied from h5py. # TODO, figure out what the canonacal way to do this should be. 
-INCLUDE_DIRS = [] -LIBRARY_DIRS = [] +INCLUDE_DIRS = [os.environ['CONDA_HOME'] + '/include'] +LIBRARY_DIRS = [os.environ['CONDA_HOME'] + '/lib'] if sys.platform == 'darwin': # putting here both macports and homebrew paths will generate # "ld: warning: dir not found" at the linking phase bitshuffle-0.3.5/lz4/000077500000000000000000000000001337005776700144025ustar00rootroot00000000000000bitshuffle-0.3.5/lz4/LICENSE000066400000000000000000000024361337005776700154140ustar00rootroot00000000000000LZ4 Library Copyright (c) 2011-2014, Yann Collet All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.bitshuffle-0.3.5/lz4/README.md000066400000000000000000000027621337005776700156700ustar00rootroot00000000000000LZ4 - Library Files ================================ The __lib__ directory contains several files, but you don't necessarily need them all. To integrate fast LZ4 compression/decompression into your program, you basically just need "**lz4.c**" and "**lz4.h**". For more compression at the cost of compression speed (while preserving decompression speed), use **lz4hc** on top of regular lz4. `lz4hc` only provides compression functions. It also needs `lz4` to compile properly. If you want to produce files or data streams compatible with `lz4` command line utility, use **lz4frame**. This library encapsulates lz4-compressed blocks into the [official interoperable frame format]. In order to work properly, lz4frame needs lz4 and lz4hc, and also **xxhash**, which provides error detection algorithm. (_Advanced stuff_ : It's possible to hide xxhash symbols into a local namespace. This is what `liblz4` does, to avoid symbol duplication in case a user program would link to several libraries containing xxhash symbols.) A more complex "lz4frame_static.h" is also provided, although its usage is not recommended. It contains definitions which are not guaranteed to remain stable within future versions. Use for static linking ***only***. The other files are not source code. 
There are : - LICENSE : contains the BSD license text - Makefile : script to compile or install lz4 library (static or dynamic) - liblz4.pc.in : for pkg-config (make install) [official interoperable frame format]: ../lz4_Frame_format.md bitshuffle-0.3.5/lz4/lz4.c000066400000000000000000001531321337005776700152640ustar00rootroot00000000000000/* LZ4 - Fast LZ compression algorithm Copyright (C) 2011-2015, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact the author at : - LZ4 source repository : https://github.com/Cyan4973/lz4 - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c */ /************************************** * Tuning parameters **************************************/ /* * HEAPMODE : * Select how default compression functions will allocate memory for their hash table, * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()). 
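 * (With the default LZ4_MEMORY_USAGE of 14 the table is roughly 16 KB,
 * see LZ4_STREAMSIZE in lz4.h, so HEAPMODE 1 is mainly of interest on
 * targets with small stacks.)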
*/ #define HEAPMODE 0 /* * ACCELERATION_DEFAULT : * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0 */ #define ACCELERATION_DEFAULT 1 /************************************** * CPU Feature Detection **************************************/ /* * LZ4_FORCE_SW_BITCOUNT * Define this parameter if your target system or compiler does not support hardware bit count */ #if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for Windows CE does not support Hardware bit count */ # define LZ4_FORCE_SW_BITCOUNT #endif /************************************** * Includes **************************************/ #include "lz4.h" /************************************** * Compiler Options **************************************/ #ifdef _MSC_VER /* Visual Studio */ # define FORCE_INLINE static __forceinline # include # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ # pragma warning(disable : 4293) /* disable: C4293: too large shift (32-bits) */ #else # if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ # if defined(__GNUC__) || defined(__clang__) # define FORCE_INLINE static inline __attribute__((always_inline)) # else # define FORCE_INLINE static inline # endif # else # define FORCE_INLINE static # endif /* __STDC_VERSION__ */ #endif /* _MSC_VER */ /* LZ4_GCC_VERSION is defined into lz4.h */ #if (LZ4_GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) # define expect(expr,value) (__builtin_expect ((expr),(value)) ) #else # define expect(expr,value) (expr) #endif #define likely(expr) expect((expr) != 0, 1) #define unlikely(expr) expect((expr) != 0, 0) /************************************** * Memory routines **************************************/ #include /* malloc, calloc, free */ #define ALLOCATOR(n,s) calloc(n,s) #define FREEMEM free #include /* memset, memcpy */ #define MEM_INIT memset /************************************** * Basic Types **************************************/ #if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ # include typedef uint8_t BYTE; typedef uint16_t U16; typedef uint32_t U32; typedef int32_t S32; typedef uint64_t U64; #else typedef unsigned char BYTE; typedef unsigned short U16; typedef unsigned int U32; typedef signed int S32; typedef unsigned long long U64; #endif /************************************** * Reading and writing into memory **************************************/ #define STEPSIZE sizeof(size_t) static unsigned LZ4_64bits(void) { return sizeof(void*)==8; } static unsigned LZ4_isLittleEndian(void) { const union { U32 i; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ return one.c[0]; } static U16 LZ4_read16(const void* memPtr) { U16 val16; memcpy(&val16, memPtr, 2); return val16; } static U16 LZ4_readLE16(const void* memPtr) { if (LZ4_isLittleEndian()) { return LZ4_read16(memPtr); } else { const BYTE* p = (const BYTE*)memPtr; return (U16)((U16)p[0] + (p[1]<<8)); } } static void LZ4_writeLE16(void* memPtr, U16 value) { if (LZ4_isLittleEndian()) { memcpy(memPtr, &value, 2); } else { BYTE* p = (BYTE*)memPtr; p[0] = (BYTE) value; p[1] = (BYTE)(value>>8); } } static U32 LZ4_read32(const void* memPtr) { U32 val32; memcpy(&val32, memPtr, 4); return val32; } static U64 LZ4_read64(const void* memPtr) { U64 val64; memcpy(&val64, memPtr, 8); return val64; } static size_t LZ4_read_ARCH(const void* p) { if (LZ4_64bits()) return (size_t)LZ4_read64(p); else return (size_t)LZ4_read32(p); } static void LZ4_copy4(void* dstPtr, 
const void* srcPtr) { memcpy(dstPtr, srcPtr, 4); } static void LZ4_copy8(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 8); } /* customized version of memcpy, which may overwrite up to 7 bytes beyond dstEnd */ static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd) { BYTE* d = (BYTE*)dstPtr; const BYTE* s = (const BYTE*)srcPtr; BYTE* e = (BYTE*)dstEnd; do { LZ4_copy8(d,s); d+=8; s+=8; } while (d>3); # elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_ctzll((U64)val) >> 3); # else static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; # endif } else /* 32 bits */ { # if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r; _BitScanForward( &r, (U32)val ); return (int)(r>>3); # elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_ctz((U32)val) >> 3); # else static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; # endif } } else /* Big Endian CPU */ { if (LZ4_64bits()) { # if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r = 0; _BitScanReverse64( &r, val ); return (unsigned)(r>>3); # elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_clzll((U64)val) >> 3); # else unsigned r; if (!(val>>32)) { r=4; } else { r=0; val>>=32; } if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } r += (!val); return r; # endif } else /* 32 bits */ { # if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r = 0; _BitScanReverse( &r, (unsigned long)val ); return (unsigned)(r>>3); # elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_clz((U32)val) >> 3); # else unsigned r; if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } r += (!val); return r; # endif } } } static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) { const BYTE* const pStart = pIn; while (likely(pIn compression run slower on incompressible data */ /************************************** * Local Structures and types **************************************/ typedef struct { U32 hashTable[HASH_SIZE_U32]; U32 currentOffset; U32 initCheck; const BYTE* dictionary; BYTE* bufferStart; /* obsolete, used for slideInputBuffer */ U32 dictSize; } LZ4_stream_t_internal; typedef enum { notLimited = 0, limitedOutput = 1 } limitedOutput_directive; typedef enum { byPtr, byU32, byU16 } tableType_t; typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive; typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; typedef enum { full = 0, partial = 1 } earlyEnd_directive; /************************************** * Local Utils **************************************/ int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } int LZ4_sizeofState() { return LZ4_STREAMSIZE; } /******************************** * Compression functions 
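* (Most of the one-shot compressors below funnel into LZ4_compress_generic();
* since it is FORCE_INLINE, the compiler emits a specialized copy for each
* limitedOutput/tableType/dict combination it is called with.)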
********************************/ static U32 LZ4_hashSequence(U32 sequence, tableType_t const tableType) { if (tableType == byU16) return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); else return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); } static const U64 prime5bytes = 889523592379ULL; static U32 LZ4_hashSequence64(size_t sequence, tableType_t const tableType) { const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG; const U32 hashMask = (1<> (40 - hashLog)) & hashMask; } static U32 LZ4_hashSequenceT(size_t sequence, tableType_t const tableType) { if (LZ4_64bits()) return LZ4_hashSequence64(sequence, tableType); return LZ4_hashSequence((U32)sequence, tableType); } static U32 LZ4_hashPosition(const void* p, tableType_t tableType) { return LZ4_hashSequenceT(LZ4_read_ARCH(p), tableType); } static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t const tableType, const BYTE* srcBase) { switch (tableType) { case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = p; return; } case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); return; } case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); return; } } } static void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) { U32 h = LZ4_hashPosition(p, tableType); LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); } static const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) { if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; } if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } /* default, to ensure a return */ } static const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) { U32 h = LZ4_hashPosition(p, tableType); return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); } FORCE_INLINE int LZ4_compress_generic( void* const ctx, const char* const source, char* const dest, const int inputSize, const int maxOutputSize, const limitedOutput_directive outputLimited, const tableType_t tableType, const dict_directive dict, const dictIssue_directive dictIssue, const U32 acceleration) { LZ4_stream_t_internal* const dictPtr = (LZ4_stream_t_internal*)ctx; const BYTE* ip = (const BYTE*) source; const BYTE* base; const BYTE* lowLimit; const BYTE* const lowRefLimit = ip - dictPtr->dictSize; const BYTE* const dictionary = dictPtr->dictionary; const BYTE* const dictEnd = dictionary + dictPtr->dictSize; const size_t dictDelta = dictEnd - (const BYTE*)source; const BYTE* anchor = (const BYTE*) source; const BYTE* const iend = ip + inputSize; const BYTE* const mflimit = iend - MFLIMIT; const BYTE* const matchlimit = iend - LASTLITERALS; BYTE* op = (BYTE*) dest; BYTE* const olimit = op + maxOutputSize; U32 forwardH; size_t refDelta=0; /* Init conditions */ if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */ switch(dict) { case noDict: default: base = (const BYTE*)source; lowLimit = (const BYTE*)source; break; case withPrefix64k: base = (const BYTE*)source - dictPtr->currentOffset; lowLimit = (const BYTE*)source - dictPtr->dictSize; break; case usingExtDict: base = (const BYTE*)source - dictPtr->currentOffset; lowLimit = (const BYTE*)source; break; 
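    /* In every mode above, 'base' is the virtual origin of the indexing
       space: the byU32/byU16 hash tables store a position as (p - base),
       so entries remain comparable between dictionary and current input;
       'lowLimit' is the lowest address a candidate match may reference. */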
} if ((tableType == byU16) && (inputSize>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */ if (inputSize> LZ4_skipTrigger); if (unlikely(forwardIp > mflimit)) goto _last_literals; match = LZ4_getPositionOnHash(h, ctx, tableType, base); if (dict==usingExtDict) { if (match<(const BYTE*)source) { refDelta = dictDelta; lowLimit = dictionary; } else { refDelta = 0; lowLimit = (const BYTE*)source; } } forwardH = LZ4_hashPosition(forwardIp, tableType); LZ4_putPositionOnHash(ip, h, ctx, tableType, base); } while ( ((dictIssue==dictSmall) ? (match < lowRefLimit) : 0) || ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip)) || (LZ4_read32(match+refDelta) != LZ4_read32(ip)) ); } /* Catch up */ while ((ip>anchor) && (match+refDelta > lowLimit) && (unlikely(ip[-1]==match[refDelta-1]))) { ip--; match--; } { /* Encode Literal length */ unsigned litLength = (unsigned)(ip - anchor); token = op++; if ((outputLimited) && (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit))) return 0; /* Check output limit */ if (litLength>=RUN_MASK) { int len = (int)litLength-RUN_MASK; *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; *op++ = (BYTE)len; } else *token = (BYTE)(litLength< matchlimit) limit = matchlimit; matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, limit); ip += MINMATCH + matchLength; if (ip==limit) { unsigned more = LZ4_count(ip, (const BYTE*)source, matchlimit); matchLength += more; ip += more; } } else { matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); ip += MINMATCH + matchLength; } if ((outputLimited) && (unlikely(op + (1 + LASTLITERALS) + (matchLength>>8) > olimit))) return 0; /* Check output limit */ if (matchLength>=ML_MASK) { *token += ML_MASK; matchLength -= ML_MASK; for (; matchLength >= 510 ; matchLength-=510) { *op++ = 255; *op++ = 255; } if (matchLength >= 255) { matchLength-=255; *op++ = 255; } *op++ = (BYTE)matchLength; } else *token += (BYTE)(matchLength); } anchor = ip; /* Test end of chunk */ if (ip > mflimit) break; /* Fill table */ LZ4_putPosition(ip-2, ctx, tableType, base); /* Test next position */ match = LZ4_getPosition(ip, ctx, tableType, base); if (dict==usingExtDict) { if (match<(const BYTE*)source) { refDelta = dictDelta; lowLimit = dictionary; } else { refDelta = 0; lowLimit = (const BYTE*)source; } } LZ4_putPosition(ip, ctx, tableType, base); if ( ((dictIssue==dictSmall) ? (match>=lowRefLimit) : 1) && (match+MAX_DISTANCE>=ip) && (LZ4_read32(match+refDelta)==LZ4_read32(ip)) ) { token=op++; *token=0; goto _next_match; } /* Prepare next loop */ forwardH = LZ4_hashPosition(++ip, tableType); } _last_literals: /* Encode Last Literals */ { const size_t lastRun = (size_t)(iend - anchor); if ((outputLimited) && ((op - (BYTE*)dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) return 0; /* Check output limit */ if (lastRun >= RUN_MASK) { size_t accumulator = lastRun - RUN_MASK; *op++ = RUN_MASK << ML_BITS; for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; *op++ = (BYTE) accumulator; } else { *op++ = (BYTE)(lastRun<= LZ4_compressBound(inputSize)) { if (inputSize < LZ4_64Klimit) return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, byU16, noDict, noDictIssue, acceleration); else return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, LZ4_64bits() ? 
byU32 : byPtr, noDict, noDictIssue, acceleration); } else { if (inputSize < LZ4_64Klimit) return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); else return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration); } } int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) { #if (HEAPMODE) void* ctxPtr = ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ #else LZ4_stream_t ctx; void* ctxPtr = &ctx; #endif int result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration); #if (HEAPMODE) FREEMEM(ctxPtr); #endif return result; } int LZ4_compress_default(const char* source, char* dest, int inputSize, int maxOutputSize) { return LZ4_compress_fast(source, dest, inputSize, maxOutputSize, 1); } /* hidden debug function */ /* strangely enough, gcc generates faster code when this function is uncommented, even if unused */ int LZ4_compress_fast_force(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) { LZ4_stream_t ctx; LZ4_resetStream(&ctx); if (inputSize < LZ4_64Klimit) return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); else return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration); } /******************************** * destSize variant ********************************/ static int LZ4_compress_destSize_generic( void* const ctx, const char* const src, char* const dst, int* const srcSizePtr, const int targetDstSize, const tableType_t tableType) { const BYTE* ip = (const BYTE*) src; const BYTE* base = (const BYTE*) src; const BYTE* lowLimit = (const BYTE*) src; const BYTE* anchor = ip; const BYTE* const iend = ip + *srcSizePtr; const BYTE* const mflimit = iend - MFLIMIT; const BYTE* const matchlimit = iend - LASTLITERALS; BYTE* op = (BYTE*) dst; BYTE* const oend = op + targetDstSize; BYTE* const oMaxLit = op + targetDstSize - 2 /* offset */ - 8 /* because 8+MINMATCH==MFLIMIT */ - 1 /* token */; BYTE* const oMaxMatch = op + targetDstSize - (LASTLITERALS + 1 /* token */); BYTE* const oMaxSeq = oMaxLit - 1 /* token */; U32 forwardH; /* Init conditions */ if (targetDstSize < 1) return 0; /* Impossible to store anything */ if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */ if ((tableType == byU16) && (*srcSizePtr>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */ if (*srcSizePtr> LZ4_skipTrigger); if (unlikely(forwardIp > mflimit)) goto _last_literals; match = LZ4_getPositionOnHash(h, ctx, tableType, base); forwardH = LZ4_hashPosition(forwardIp, tableType); LZ4_putPositionOnHash(ip, h, ctx, tableType, base); } while ( ((tableType==byU16) ? 
0 : (match + MAX_DISTANCE < ip)) || (LZ4_read32(match) != LZ4_read32(ip)) ); } /* Catch up */ while ((ip>anchor) && (match > lowLimit) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; } { /* Encode Literal length */ unsigned litLength = (unsigned)(ip - anchor); token = op++; if (op + ((litLength+240)/255) + litLength > oMaxLit) { /* Not enough space for a last match */ op--; goto _last_literals; } if (litLength>=RUN_MASK) { unsigned len = litLength - RUN_MASK; *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; *op++ = (BYTE)len; } else *token = (BYTE)(litLength< oMaxMatch) { /* Match description too long : reduce it */ matchLength = (15-1) + (oMaxMatch-op) * 255; } //printf("offset %5i, matchLength%5i \n", (int)(ip-match), matchLength + MINMATCH); ip += MINMATCH + matchLength; if (matchLength>=ML_MASK) { *token += ML_MASK; matchLength -= ML_MASK; while (matchLength >= 255) { matchLength-=255; *op++ = 255; } *op++ = (BYTE)matchLength; } else *token += (BYTE)(matchLength); } anchor = ip; /* Test end of block */ if (ip > mflimit) break; if (op > oMaxSeq) break; /* Fill table */ LZ4_putPosition(ip-2, ctx, tableType, base); /* Test next position */ match = LZ4_getPosition(ip, ctx, tableType, base); LZ4_putPosition(ip, ctx, tableType, base); if ( (match+MAX_DISTANCE>=ip) && (LZ4_read32(match)==LZ4_read32(ip)) ) { token=op++; *token=0; goto _next_match; } /* Prepare next loop */ forwardH = LZ4_hashPosition(++ip, tableType); } _last_literals: /* Encode Last Literals */ { size_t lastRunSize = (size_t)(iend - anchor); if (op + 1 /* token */ + ((lastRunSize+240)/255) /* litLength */ + lastRunSize /* literals */ > oend) { /* adapt lastRunSize to fill 'dst' */ lastRunSize = (oend-op) - 1; lastRunSize -= (lastRunSize+240)/255; } ip = anchor + lastRunSize; if (lastRunSize >= RUN_MASK) { size_t accumulator = lastRunSize - RUN_MASK; *op++ = RUN_MASK << ML_BITS; for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; *op++ = (BYTE) accumulator; } else { *op++ = (BYTE)(lastRunSize<= LZ4_compressBound(*srcSizePtr)) /* compression success is guaranteed */ { return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1); } else { if (*srcSizePtr < LZ4_64Klimit) return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, byU16); else return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, LZ4_64bits() ? 
byU32 : byPtr); } } int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize) { #if (HEAPMODE) void* ctx = ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ #else LZ4_stream_t ctxBody; void* ctx = &ctxBody; #endif int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize); #if (HEAPMODE) FREEMEM(ctx); #endif return result; } /******************************** * Streaming functions ********************************/ LZ4_stream_t* LZ4_createStream(void) { LZ4_stream_t* lz4s = (LZ4_stream_t*)ALLOCATOR(8, LZ4_STREAMSIZE_U64); LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal)); /* A compilation error here means LZ4_STREAMSIZE is not large enough */ LZ4_resetStream(lz4s); return lz4s; } void LZ4_resetStream (LZ4_stream_t* LZ4_stream) { MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t)); } int LZ4_freeStream (LZ4_stream_t* LZ4_stream) { FREEMEM(LZ4_stream); return (0); } #define HASH_UNIT sizeof(size_t) int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) { LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict; const BYTE* p = (const BYTE*)dictionary; const BYTE* const dictEnd = p + dictSize; const BYTE* base; if ((dict->initCheck) || (dict->currentOffset > 1 GB)) /* Uninitialized structure, or reuse overflow */ LZ4_resetStream(LZ4_dict); if (dictSize < (int)HASH_UNIT) { dict->dictionary = NULL; dict->dictSize = 0; return 0; } if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB; dict->currentOffset += 64 KB; base = p - dict->currentOffset; dict->dictionary = p; dict->dictSize = (U32)(dictEnd - p); dict->currentOffset += dict->dictSize; while (p <= dictEnd-HASH_UNIT) { LZ4_putPosition(p, dict->hashTable, byU32, base); p+=3; } return dict->dictSize; } static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src) { if ((LZ4_dict->currentOffset > 0x80000000) || ((size_t)LZ4_dict->currentOffset > (size_t)src)) /* address space overflow */ { /* rescale hash table */ U32 delta = LZ4_dict->currentOffset - 64 KB; const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; int i; for (i=0; ihashTable[i] < delta) LZ4_dict->hashTable[i]=0; else LZ4_dict->hashTable[i] -= delta; } LZ4_dict->currentOffset = 64 KB; if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB; LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; } } int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) { LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_stream; const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; const BYTE* smallest = (const BYTE*) source; if (streamPtr->initCheck) return 0; /* Uninitialized structure detected */ if ((streamPtr->dictSize>0) && (smallest>dictEnd)) smallest = dictEnd; LZ4_renormDictT(streamPtr, smallest); if (acceleration < 1) acceleration = ACCELERATION_DEFAULT; /* Check overlapping input/dictionary space */ { const BYTE* sourceEnd = (const BYTE*) source + inputSize; if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) { streamPtr->dictSize = (U32)(dictEnd - sourceEnd); if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; streamPtr->dictionary = dictEnd - streamPtr->dictSize; } } /* prefix mode : source data follows dictionary */ if (dictEnd == (const BYTE*)source) { int result; if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < 
streamPtr->currentOffset)) result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, dictSmall, acceleration); else result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, noDictIssue, acceleration); streamPtr->dictSize += (U32)inputSize; streamPtr->currentOffset += (U32)inputSize; return result; } /* external dictionary mode */ { int result; if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, dictSmall, acceleration); else result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, noDictIssue, acceleration); streamPtr->dictionary = (const BYTE*)source; streamPtr->dictSize = (U32)inputSize; streamPtr->currentOffset += (U32)inputSize; return result; } } /* Hidden debug function, to force external dictionary mode */ int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int inputSize) { LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_dict; int result; const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; const BYTE* smallest = dictEnd; if (smallest > (const BYTE*) source) smallest = (const BYTE*) source; LZ4_renormDictT((LZ4_stream_t_internal*)LZ4_dict, smallest); result = LZ4_compress_generic(LZ4_dict, source, dest, inputSize, 0, notLimited, byU32, usingExtDict, noDictIssue, 1); streamPtr->dictionary = (const BYTE*)source; streamPtr->dictSize = (U32)inputSize; streamPtr->currentOffset += (U32)inputSize; return result; } int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) { LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict; const BYTE* previousDictEnd = dict->dictionary + dict->dictSize; if ((U32)dictSize > 64 KB) dictSize = 64 KB; /* useless to define a dictionary > 64 KB */ if ((U32)dictSize > dict->dictSize) dictSize = dict->dictSize; memmove(safeBuffer, previousDictEnd - dictSize, dictSize); dict->dictionary = (const BYTE*)safeBuffer; dict->dictSize = (U32)dictSize; return dictSize; } /******************************* * Decompression functions *******************************/ /* * This generic decompression function cover all use cases. * It shall be instantiated several times, using different sets of directives * Note that it is essential this generic function is really inlined, * in order to remove useless branches during compilation optimization. */ FORCE_INLINE int LZ4_decompress_generic( const char* const source, char* const dest, int inputSize, int outputSize, /* If endOnInput==endOnInputSize, this value is the max size of Output Buffer. 
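With endOnOutputSize it is instead the exact size of the original (uncompressed) data.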
*/ int endOnInput, /* endOnOutputSize, endOnInputSize */ int partialDecoding, /* full, partial */ int targetOutputSize, /* only used if partialDecoding==partial */ int dict, /* noDict, withPrefix64k, usingExtDict */ const BYTE* const lowPrefix, /* == dest if dict == noDict */ const BYTE* const dictStart, /* only if dict==usingExtDict */ const size_t dictSize /* note : = 0 if noDict */ ) { /* Local Variables */ const BYTE* ip = (const BYTE*) source; const BYTE* const iend = ip + inputSize; BYTE* op = (BYTE*) dest; BYTE* const oend = op + outputSize; BYTE* cpy; BYTE* oexit = op + targetOutputSize; const BYTE* const lowLimit = lowPrefix - dictSize; const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize; const size_t dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4}; const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3}; const int safeDecode = (endOnInput==endOnInputSize); const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); /* Special cases */ if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => decode everything */ if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 0 : -1; /* Empty output buffer */ if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1); /* Main Loop */ while (1) { unsigned token; size_t length; const BYTE* match; /* get literal length */ token = *ip++; if ((length=(token>>ML_BITS)) == RUN_MASK) { unsigned s; do { s = *ip++; length += s; } while (likely((endOnInput)?ip(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) || ((!endOnInput) && (cpy>oend-COPYLENGTH))) { if (partialDecoding) { if (cpy > oend) goto _output_error; /* Error : write attempt beyond end of output buffer */ if ((endOnInput) && (ip+length > iend)) goto _output_error; /* Error : read attempt beyond end of input buffer */ } else { if ((!endOnInput) && (cpy != oend)) goto _output_error; /* Error : block decoding must stop exactly there */ if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; /* Error : input must be consumed */ } memcpy(op, ip, length); ip += length; op += length; break; /* Necessarily EOF, due to parsing restrictions */ } LZ4_wildCopy(op, ip, cpy); ip += length; op = cpy; /* get offset */ match = cpy - LZ4_readLE16(ip); ip+=2; if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error; /* Error : offset outside destination buffer */ /* get matchlength */ length = token & ML_MASK; if (length == ML_MASK) { unsigned s; do { if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error; s = *ip++; length += s; } while (s==255); if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)op)) goto _output_error; /* overflow detection */ } length += MINMATCH; /* check external dictionary */ if ((dict==usingExtDict) && (match < lowPrefix)) { if (unlikely(op+length > oend-LASTLITERALS)) goto _output_error; /* doesn't respect parsing restriction */ if (length <= (size_t)(lowPrefix-match)) { /* match can be copied as a single segment from external dictionary */ match = dictEnd - (lowPrefix-match); memmove(op, match, length); op += length; } else { /* match encompass external dictionary and current segment */ size_t copySize = (size_t)(lowPrefix-match); memcpy(op, dictEnd - copySize, copySize); op += copySize; copySize = length - copySize; if (copySize > (size_t)(op-lowPrefix)) /* overlap within current segment */ { BYTE* const endOfMatch = op + copySize; const BYTE* copyFrom = lowPrefix; while (op < endOfMatch) *op++ = 
*copyFrom++; } else { memcpy(op, lowPrefix, copySize); op += copySize; } } continue; } /* copy repeated sequence */ cpy = op + length; if (unlikely((op-match)<8)) { const size_t dec64 = dec64table[op-match]; op[0] = match[0]; op[1] = match[1]; op[2] = match[2]; op[3] = match[3]; match += dec32table[op-match]; LZ4_copy4(op+4, match); op += 8; match -= dec64; } else { LZ4_copy8(op, match); op+=8; match+=8; } if (unlikely(cpy>oend-12)) { if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last LASTLITERALS bytes must be literals */ if (op < oend-8) { LZ4_wildCopy(op, match, oend-8); match += (oend-8) - op; op = oend-8; } while (opprefixSize = (size_t) dictSize; lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; lz4sd->externalDict = NULL; lz4sd->extDictSize = 0; return 1; } /* *_continue() : These decoding functions allow decompression of multiple blocks in "streaming" mode. Previously decoded blocks must still be available at the memory position where they were decoded. If it's not possible, save the relevant part of decoded data into a safe buffer, and indicate where it stands using LZ4_setStreamDecode() */ int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) { LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; int result; if (lz4sd->prefixEnd == (BYTE*)dest) { result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); if (result <= 0) return result; lz4sd->prefixSize += result; lz4sd->prefixEnd += result; } else { lz4sd->extDictSize = lz4sd->prefixSize; lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize); if (result <= 0) return result; lz4sd->prefixSize = result; lz4sd->prefixEnd = (BYTE*)dest + result; } return result; } int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize) { LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; int result; if (lz4sd->prefixEnd == (BYTE*)dest) { result = LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); if (result <= 0) return result; lz4sd->prefixSize += originalSize; lz4sd->prefixEnd += originalSize; } else { lz4sd->extDictSize = lz4sd->prefixSize; lz4sd->externalDict = (BYTE*)dest - lz4sd->extDictSize; result = LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize); if (result <= 0) return result; lz4sd->prefixSize = originalSize; lz4sd->prefixEnd = (BYTE*)dest + originalSize; } return result; } /* Advanced decoding functions : *_usingDict() : These decoding functions work the same as "_continue" ones, the dictionary must be explicitly provided within parameters */ FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char* dest, int compressedSize, int maxOutputSize, int safe, const char* dictStart, int dictSize) { if (dictSize==0) return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest, NULL, 0); if (dictStart+dictSize == 
dest) { if (dictSize >= (int)(64 KB - 1)) return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, withPrefix64k, (BYTE*)dest-64 KB, NULL, 0); return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest-dictSize, NULL, 0); } return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize); } int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) { return LZ4_decompress_usingDict_generic(source, dest, compressedSize, maxOutputSize, 1, dictStart, dictSize); } int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) { return LZ4_decompress_usingDict_generic(source, dest, 0, originalSize, 0, dictStart, dictSize); } /* debug function */ int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) { return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize); } /*************************************************** * Obsolete Functions ***************************************************/ /* obsolete compression functions */ int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) { return LZ4_compress_default(source, dest, inputSize, maxOutputSize); } int LZ4_compress(const char* source, char* dest, int inputSize) { return LZ4_compress_default(source, dest, inputSize, LZ4_compressBound(inputSize)); } int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); } int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); } int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, maxDstSize, 1); } int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) { return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); } /* These function names are deprecated and should no longer be used. They are only provided here for compatibility with older user programs. 
- LZ4_uncompress is totally equivalent to LZ4_decompress_fast - LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe */ int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); } int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); } /* Obsolete Streaming functions */ int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; } static void LZ4_init(LZ4_stream_t_internal* lz4ds, BYTE* base) { MEM_INIT(lz4ds, 0, LZ4_STREAMSIZE); lz4ds->bufferStart = base; } int LZ4_resetStreamState(void* state, char* inputBuffer) { if ((((size_t)state) & 3) != 0) return 1; /* Error : pointer is not aligned on 4-bytes boundary */ LZ4_init((LZ4_stream_t_internal*)state, (BYTE*)inputBuffer); return 0; } void* LZ4_create (char* inputBuffer) { void* lz4ds = ALLOCATOR(8, LZ4_STREAMSIZE_U64); LZ4_init ((LZ4_stream_t_internal*)lz4ds, (BYTE*)inputBuffer); return lz4ds; } char* LZ4_slideInputBuffer (void* LZ4_Data) { LZ4_stream_t_internal* ctx = (LZ4_stream_t_internal*)LZ4_Data; int dictSize = LZ4_saveDict((LZ4_stream_t*)LZ4_Data, (char*)ctx->bufferStart, 64 KB); return (char*)(ctx->bufferStart + dictSize); } /* Obsolete streaming decompression functions */ int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) { return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB); } int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) { return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB); } #endif /* LZ4_COMMONDEFS_ONLY */ bitshuffle-0.3.5/lz4/lz4.h000066400000000000000000000446161337005776700152770ustar00rootroot00000000000000/* LZ4 - Fast LZ compression algorithm Header File Copyright (C) 2011-2015, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
You can contact the author at : - LZ4 source repository : https://github.com/Cyan4973/lz4 - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c */ #pragma once #if defined (__cplusplus) extern "C" { #endif /* * lz4.h provides block compression functions, and gives full buffer control to programmer. * If you need to generate inter-operable compressed data (respecting LZ4 frame specification), * and can let the library handle its own memory, please use lz4frame.h instead. */ /************************************** * Version **************************************/ #define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ #define LZ4_VERSION_MINOR 7 /* for new (non-breaking) interface capabilities */ #define LZ4_VERSION_RELEASE 1 /* for tweaks, bug-fixes, or development */ #define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) int LZ4_versionNumber (void); /************************************** * Tuning parameter **************************************/ /* * LZ4_MEMORY_USAGE : * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) * Increasing memory usage improves compression ratio * Reduced memory usage can improve speed, due to cache effect * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */ #define LZ4_MEMORY_USAGE 14 /************************************** * Simple Functions **************************************/ int LZ4_compress_default(const char* source, char* dest, int sourceSize, int maxDestSize); int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize); /* LZ4_compress_default() : Compresses 'sourceSize' bytes from buffer 'source' into already allocated 'dest' buffer of size 'maxDestSize'. Compression is guaranteed to succeed if 'maxDestSize' >= LZ4_compressBound(sourceSize). It also runs faster, so it's a recommended setting. If the function cannot compress 'source' into a more limited 'dest' budget, compression stops *immediately*, and the function result is zero. As a consequence, 'dest' content is not valid. This function never writes outside 'dest' buffer, nor read outside 'source' buffer. sourceSize : Max supported value is LZ4_MAX_INPUT_VALUE maxDestSize : full or partial size of buffer 'dest' (which must be already allocated) return : the number of bytes written into buffer 'dest' (necessarily <= maxOutputSize) or 0 if compression fails LZ4_decompress_safe() : compressedSize : is the precise full size of the compressed block. maxDecompressedSize : is the size of destination buffer, which must be already allocated. return : the number of bytes decompressed into destination buffer (necessarily <= maxDecompressedSize) If destination buffer is not large enough, decoding will stop and output an error code (<0). If the source stream is detected malformed, the function will stop decoding and return a negative result. This function is protected against buffer overflow exploits, including malicious data packets. It never writes outside output buffer, nor reads outside input buffer. */ /************************************** * Advanced Functions **************************************/ #define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ #define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 
0 : (isize) + ((isize)/255) + 16) /* LZ4_compressBound() : Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible) This function is primarily useful for memory allocation purposes (destination buffer size). Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example). Note that LZ4_compress_default() compress faster when dest buffer size is >= LZ4_compressBound(srcSize) inputSize : max supported value is LZ4_MAX_INPUT_SIZE return : maximum output size in a "worst case" scenario or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE) */ int LZ4_compressBound(int inputSize); /* LZ4_compress_fast() : Same as LZ4_compress_default(), but allows to select an "acceleration" factor. The larger the acceleration value, the faster the algorithm, but also the lesser the compression. It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed. An acceleration value of "1" is the same as regular LZ4_compress_default() Values <= 0 will be replaced by ACCELERATION_DEFAULT (see lz4.c), which is 1. */ int LZ4_compress_fast (const char* source, char* dest, int sourceSize, int maxDestSize, int acceleration); /* LZ4_compress_fast_extState() : Same compression function, just using an externally allocated memory space to store compression state. Use LZ4_sizeofState() to know how much memory must be allocated, and allocate it on 8-bytes boundaries (using malloc() typically). Then, provide it as 'void* state' to compression function. */ int LZ4_sizeofState(void); int LZ4_compress_fast_extState (void* state, const char* source, char* dest, int inputSize, int maxDestSize, int acceleration); /* LZ4_compress_destSize() : Reverse the logic, by compressing as much data as possible from 'source' buffer into already allocated buffer 'dest' of size 'targetDestSize'. This function either compresses the entire 'source' content into 'dest' if it's large enough, or fill 'dest' buffer completely with as much data as possible from 'source'. *sourceSizePtr : will be modified to indicate how many bytes where read from 'source' to fill 'dest'. New value is necessarily <= old value. return : Nb bytes written into 'dest' (necessarily <= targetDestSize) or 0 if compression fails */ int LZ4_compress_destSize (const char* source, char* dest, int* sourceSizePtr, int targetDestSize); /* LZ4_decompress_fast() : originalSize : is the original and therefore uncompressed size return : the number of bytes read from the source buffer (in other words, the compressed size) If the source stream is detected malformed, the function will stop decoding and return a negative result. Destination buffer must be already allocated. Its size must be a minimum of 'originalSize' bytes. note : This function fully respect memory boundaries for properly formed compressed data. It is a bit faster than LZ4_decompress_safe(). However, it does not provide any protection against intentionally modified data stream (malicious input). Use this function in trusted environment only (data to decode comes from a trusted source). */ int LZ4_decompress_fast (const char* source, char* dest, int originalSize); /* LZ4_decompress_safe_partial() : This function decompress a compressed block of size 'compressedSize' at position 'source' into destination buffer 'dest' of size 'maxDecompressedSize'. The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached, reducing decompression time. 
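An illustrative call (placeholder names): LZ4_decompress_safe_partial(cmp, out, cmpSize, 100, outCapacity) may stop shortly after 100 bytes have been written to 'out'.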
return : the number of bytes decoded in the destination buffer (necessarily <= maxDecompressedSize) Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller. Always control how many bytes were decoded. If the source stream is detected malformed, the function will stop decoding and return a negative result. This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets */ int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize); /*********************************************** * Streaming Compression Functions ***********************************************/ #define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4) #define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U64 * sizeof(long long)) /* * LZ4_stream_t * information structure to track an LZ4 stream. * important : init this structure content before first use ! * note : only allocated directly the structure if you are statically linking LZ4 * If you are using liblz4 as a DLL, please use below construction methods instead. */ typedef struct { long long table[LZ4_STREAMSIZE_U64]; } LZ4_stream_t; /* * LZ4_resetStream * Use this function to init an allocated LZ4_stream_t structure */ void LZ4_resetStream (LZ4_stream_t* streamPtr); /* * LZ4_createStream will allocate and initialize an LZ4_stream_t structure * LZ4_freeStream releases its memory. * In the context of a DLL (liblz4), please use these methods rather than the static struct. * They are more future proof, in case of a change of LZ4_stream_t size. */ LZ4_stream_t* LZ4_createStream(void); int LZ4_freeStream (LZ4_stream_t* streamPtr); /* * LZ4_loadDict * Use this function to load a static dictionary into LZ4_stream. * Any previous data will be forgotten, only 'dictionary' will remain in memory. * Loading a size of 0 is allowed. * Return : dictionary size, in bytes (necessarily <= 64 KB) */ int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); /* * LZ4_compress_fast_continue * Compress buffer content 'src', using data from previously compressed blocks as dictionary to improve compression ratio. * Important : Previous data blocks are assumed to still be present and unmodified ! * 'dst' buffer must be already allocated. * If maxDstSize >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster. * If not, and if compressed data cannot fit into 'dst' buffer size, compression stops, and function returns a zero. 
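*
* Hedged double-buffer sketch (BLOCK_SIZE, read_block() and write_block()
* are hypothetical helpers; alternating input buffers keeps the previous
* block present and unmodified, as required above):
*
*   LZ4_stream_t* s = LZ4_createStream();
*   static char src[2][BLOCK_SIZE];
*   char dst[LZ4_COMPRESSBOUND(BLOCK_SIZE)];
*   int i = 0, n;
*   while ((n = read_block(src[i])) > 0) {
*       int c = LZ4_compress_fast_continue(s, src[i], dst, n, sizeof(dst), 1);
*       if (c <= 0) break;   // not expected with a compressBound-sized dst
*       write_block(dst, c);
*       i ^= 1;
*   }
*   LZ4_freeStream(s);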
*/ int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int maxDstSize, int acceleration); /* * LZ4_saveDict * If previously compressed data block is not guaranteed to remain available at its memory location * save it into a safer place (char* safeBuffer) * Note : you don't need to call LZ4_loadDict() afterwards, * dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue() * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error */ int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int dictSize); /************************************************ * Streaming Decompression Functions ************************************************/ #define LZ4_STREAMDECODESIZE_U64 4 #define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long)) typedef struct { unsigned long long table[LZ4_STREAMDECODESIZE_U64]; } LZ4_streamDecode_t; /* * LZ4_streamDecode_t * information structure to track an LZ4 stream. * init this structure content using LZ4_setStreamDecode or memset() before first use ! * * In the context of a DLL (liblz4) please prefer usage of construction methods below. * They are more future proof, in case of a change of LZ4_streamDecode_t size in the future. * LZ4_createStreamDecode will allocate and initialize an LZ4_streamDecode_t structure * LZ4_freeStreamDecode releases its memory. */ LZ4_streamDecode_t* LZ4_createStreamDecode(void); int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream); /* * LZ4_setStreamDecode * Use this function to instruct where to find the dictionary. * Setting a size of 0 is allowed (same effect as reset). * Return : 1 if OK, 0 if error */ int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); /* *_continue() : These decoding functions allow decompression of multiple blocks in "streaming" mode. Previously decoded blocks *must* remain available at the memory position where they were decoded (up to 64 KB) In the case of a ring buffers, decoding buffer must be either : - Exactly same size as encoding buffer, with same update rule (block boundaries at same positions) In which case, the decoding & encoding ring buffer can have any size, including very small ones ( < 64 KB). - Larger than encoding buffer, by a minimum of maxBlockSize more bytes. maxBlockSize is implementation dependent. It's the maximum size you intend to compress into a single block. In which case, encoding and decoding buffers do not need to be synchronized, and encoding ring buffer can have any size, including small ones ( < 64 KB). - _At least_ 64 KB + 8 bytes + maxBlockSize. In which case, encoding and decoding buffers do not need to be synchronized, and encoding ring buffer can have any size, including larger than decoding buffer. Whenever these conditions are not possible, save the last 64KB of decoded data into a safe buffer, and indicate where it is saved using LZ4_setStreamDecode() */ int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxDecompressedSize); int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize); /* Advanced decoding functions : *_usingDict() : These decoding functions work the same as a combination of LZ4_setStreamDecode() followed by LZ4_decompress_x_continue() They are stand-alone. They don't need nor update an LZ4_streamDecode_t structure. 
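
  A hedged sketch (dictStart/dictSize must match the dictionary given to
  LZ4_loadDict() at compression time):

    int n = LZ4_decompress_safe_usingDict(source, dest, compressedSize,
                                          maxDecompressedSize,
                                          dictStart, dictSize);
    // n < 0 signals malformed input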
*/ int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxDecompressedSize, const char* dictStart, int dictSize); int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize); /************************************** * Obsolete Functions **************************************/ /* Deprecate Warnings */ /* Should these warnings messages be a problem, it is generally possible to disable them, with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual for example. You can also define LZ4_DEPRECATE_WARNING_DEFBLOCK. */ #ifndef LZ4_DEPRECATE_WARNING_DEFBLOCK # define LZ4_DEPRECATE_WARNING_DEFBLOCK # define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) # if (LZ4_GCC_VERSION >= 405) || defined(__clang__) # define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) # elif (LZ4_GCC_VERSION >= 301) # define LZ4_DEPRECATED(message) __attribute__((deprecated)) # elif defined(_MSC_VER) # define LZ4_DEPRECATED(message) __declspec(deprecated(message)) # else # pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler") # define LZ4_DEPRECATED(message) # endif #endif /* LZ4_DEPRECATE_WARNING_DEFBLOCK */ /* Obsolete compression functions */ /* These functions are planned to start generate warnings by r131 approximately */ int LZ4_compress (const char* source, char* dest, int sourceSize); int LZ4_compress_limitedOutput (const char* source, char* dest, int sourceSize, int maxOutputSize); int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize); /* Obsolete decompression functions */ /* These function names are completely deprecated and must no longer be used. They are only provided here for compatibility with older programs. - LZ4_uncompress is the same as LZ4_decompress_fast - LZ4_uncompress_unknownOutputSize is the same as LZ4_decompress_safe These function prototypes are now disabled; uncomment them only if you really need them. 
It is highly recommended to stop using these prototypes and migrate to maintained ones */ /* int LZ4_uncompress (const char* source, char* dest, int outputSize); */ /* int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); */ /* Obsolete streaming functions; use new streaming interface whenever possible */ LZ4_DEPRECATED("use LZ4_createStream() instead") void* LZ4_create (char* inputBuffer); LZ4_DEPRECATED("use LZ4_createStream() instead") int LZ4_sizeofStreamState(void); LZ4_DEPRECATED("use LZ4_resetStream() instead") int LZ4_resetStreamState(void* state, char* inputBuffer); LZ4_DEPRECATED("use LZ4_saveDict() instead") char* LZ4_slideInputBuffer (void* state); /* Obsolete streaming decoding functions */ LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize); LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize); #if defined (__cplusplus) } #endif bitshuffle-0.3.5/lzf/000077500000000000000000000000001337005776700144645ustar00rootroot00000000000000bitshuffle-0.3.5/lzf/LICENSE.txt000066400000000000000000000030361337005776700163110ustar00rootroot00000000000000Copyright Notice and Statement for LZF filter Copyright (c) 2008-2009 Andrew Collette http://h5py.alfven.org All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: a. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. b. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. c. Neither the name of the author nor the names of contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bitshuffle-0.3.5/lzf/README.txt000066400000000000000000000050671337005776700161720ustar00rootroot00000000000000=============================== LZF filter for HDF5, revision 3 =============================== The LZF filter provides high-speed compression with acceptable compression performance, resulting in much faster performance than DEFLATE, at the cost of a slightly lower compression ratio. It's appropriate for large datasets of low to moderate complexity, for which some compression is much better than none, but for which the speed of DEFLATE is unacceptable. This filter has been tested against HDF5 versions 1.6.5 through 1.8.3. 
It is released under the BSD license (see LICENSE.txt for details). Using the filter from HDF5 -------------------------- There is exactly one new public function declared in lzf_filter.h, with the following signature: int register_lzf(void) Calling this will register the filter with the HDF5 library. A non-negative return value indicates success. If the registration fails, an error is pushed onto the current error stack and a negative value is returned. It's strongly recommended to use the SHUFFLE filter with LZF, as it's cheap, supported by all current versions of HDF5, and can significantly improve the compression ratio. An example C program ("example.c") is included which demonstrates the proper use of the filter. Compiling --------- The filter consists of a single .c file and header, along with an embedded version of the LZF compression library. Since the filter is stateless, it's recommended to statically link the entire thing into your program; for example: $ gcc -O2 -lhdf5 lzf/*.c lzf_filter.c myprog.c -o myprog It can also be built as a shared library, although you will have to install the resulting library somewhere the runtime linker can find it: $ gcc -O2 -lhdf5 -fPIC -shared lzf/*.c lzf_filter.c -o liblzf_filter.so A similar procedure should be used for building C++ code. As in these examples, using option -O1 or higher is strongly recommended for increased performance. Contact ------- This filter is maintained as part of the HDF5 for Python (h5py) project. The goal of h5py is to provide access to the majority of the HDF5 C API and feature set from Python. The most recent version of h5py (1.1) includes the LZF filter by default. * Downloads and bug tracker: http://h5py.googlecode.com * Main web site and documentation: http://h5py.alfven.org * Contact email: h5py at alfven dot org History of changes ------------------ Revision 3 (6/25/09) Fix issue with changed filter struct definition under HDF5 1.8.3. Revision 2 Minor speed enhancement. Revision 1 Initial release. bitshuffle-0.3.5/lzf/README_bitshuffle.txt000066400000000000000000000003151337005776700203740ustar00rootroot00000000000000The LZF filter for HDF5 is part of the h5py project (http://h5py.alfven.org). The version included with bitshuffle is from version 2.3 of h5py with no modifications other than the addition of this README. bitshuffle-0.3.5/lzf/example.c000066400000000000000000000052661337005776700162740ustar00rootroot00000000000000/* Copyright (C) 2009 Andrew Collette http://h5py.alfven.org License: BSD (see LICENSE.txt) Example program demonstrating use of the LZF filter from C code. To compile this program: h5cc -DH5_USE_16_API lzf/*.c lzf_filter.c example.c -o example To run: $ ./example Success! $ h5ls -v test_lzf.hdf5 Opened "test_lzf.hdf5" with sec2 driver. 
dset Dataset {100/100, 100/100, 100/100} Location: 0:1:0:976 Links: 1 Modified: 2009-02-15 16:35:11 PST Chunks: {1, 100, 100} 40000 bytes Storage: 4000000 logical bytes, 174288 allocated bytes, 2295.05% utilization Filter-0: shuffle-2 OPT {4} Filter-1: lzf-32000 OPT {1, 261, 40000} Type: native float */ #include #include "hdf5.h" #include "lzf_filter.h" #define SIZE 100*100*100 #define SHAPE {100,100,100} #define CHUNKSHAPE {1,100,100} int main(){ static float data[SIZE]; static float data_out[SIZE]; const hsize_t shape[] = SHAPE; const hsize_t chunkshape[] = CHUNKSHAPE; int r, i; int return_code = 1; hid_t fid, sid, dset, plist = 0; for(i=0; i0) H5Dclose(dset); if(sid>0) H5Sclose(sid); if(plist>0) H5Pclose(plist); if(fid>0) H5Fclose(fid); return return_code; } bitshuffle-0.3.5/lzf/lzf/000077500000000000000000000000001337005776700152575ustar00rootroot00000000000000bitshuffle-0.3.5/lzf/lzf/lzf.h000066400000000000000000000104751337005776700162320ustar00rootroot00000000000000/* * Copyright (c) 2000-2008 Marc Alexander Lehmann * * Redistribution and use in source and binary forms, with or without modifica- * tion, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED * OF THE POSSIBILITY OF SUCH DAMAGE. * * Alternatively, the contents of this file may be used under the terms of * the GNU General Public License ("GPL") version 2 or any later version, * in which case the provisions of the GPL are applicable instead of * the above. If you wish to allow the use of your version of this file * only under the terms of the GPL and not to allow others to use your * version of this file under the BSD license, indicate your decision * by deleting the provisions above and replace them with the notice * and other provisions required by the GPL. If you do not delete the * provisions above, a recipient may use your version of this file under * either the BSD or the GPL. */ #ifndef LZF_H #define LZF_H /*********************************************************************** ** ** lzf -- an extremely fast/free compression/decompression-method ** http://liblzf.plan9.de/ ** ** This algorithm is believed to be patent-free. ** ***********************************************************************/ #define LZF_VERSION 0x0105 /* 1.5, API version */ /* * Compress in_len bytes stored at the memory block starting at * in_data and write the result to out_data, up to a maximum length * of out_len bytes. 
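 *
 * Hedged usage sketch (the out_len choice follows the sizing advice
 * below):
 *
 *   unsigned int n = lzf_compress(in_data, in_len, out_data, in_len - 1);
 *   if (n == 0) {
 *       // did not shrink: store in_data uncompressed, with a flag
 *   }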
* * If the output buffer is not large enough or any error occurs return 0, * otherwise return the number of bytes used, which might be considerably * more than in_len (but less than 104% of the original size), so it * makes sense to always use out_len == in_len - 1), to ensure _some_ * compression, and store the data uncompressed otherwise (with a flag, of * course. * * lzf_compress might use different algorithms on different systems and * even different runs, thus might result in different compressed strings * depending on the phase of the moon or similar factors. However, all * these strings are architecture-independent and will result in the * original data when decompressed using lzf_decompress. * * The buffers must not be overlapping. * * If the option LZF_STATE_ARG is enabled, an extra argument must be * supplied which is not reflected in this header file. Refer to lzfP.h * and lzf_c.c. * */ unsigned int lzf_compress (const void *const in_data, unsigned int in_len, void *out_data, unsigned int out_len); /* * Decompress data compressed with some version of the lzf_compress * function and stored at location in_data and length in_len. The result * will be stored at out_data up to a maximum of out_len characters. * * If the output buffer is not large enough to hold the decompressed * data, a 0 is returned and errno is set to E2BIG. Otherwise the number * of decompressed bytes (i.e. the original length of the data) is * returned. * * If an error in the compressed data is detected, a zero is returned and * errno is set to EINVAL. * * This function is very fast, about as fast as a copying loop. */ unsigned int lzf_decompress (const void *const in_data, unsigned int in_len, void *out_data, unsigned int out_len); #endif bitshuffle-0.3.5/lzf/lzf/lzfP.h000066400000000000000000000125101337005776700163420ustar00rootroot00000000000000/* * Copyright (c) 2000-2007 Marc Alexander Lehmann * * Redistribution and use in source and binary forms, with or without modifica- * tion, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED * OF THE POSSIBILITY OF SUCH DAMAGE. * * Alternatively, the contents of this file may be used under the terms of * the GNU General Public License ("GPL") version 2 or any later version, * in which case the provisions of the GPL are applicable instead of * the above. 
If you wish to allow the use of your version of this file * only under the terms of the GPL and not to allow others to use your * version of this file under the BSD license, indicate your decision * by deleting the provisions above and replace them with the notice * and other provisions required by the GPL. If you do not delete the * provisions above, a recipient may use your version of this file under * either the BSD or the GPL. */ #ifndef LZFP_h #define LZFP_h #define STANDALONE 1 /* at the moment, this is ok. */ #ifndef STANDALONE # include "lzf.h" #endif /* * Size of hashtable is (1 << HLOG) * sizeof (char *) * decompression is independent of the hash table size * the difference between 15 and 14 is very small * for small blocks (and 14 is usually a bit faster). * For a low-memory/faster configuration, use HLOG == 13; * For best compression, use 15 or 16 (or more, up to 23). */ #ifndef HLOG # define HLOG 17 /* Avoid pathological case at HLOG=16 A.C. 2/15/09 */ #endif /* * Sacrifice very little compression quality in favour of compression speed. * This gives almost the same compression as the default code, and is * (very roughly) 15% faster. This is the preferred mode of operation. */ #ifndef VERY_FAST # define VERY_FAST 1 #endif /* * Sacrifice some more compression quality in favour of compression speed. * (roughly 1-2% worse compression for large blocks and * 9-10% for small, redundant, blocks and >>20% better speed in both cases) * In short: when in need for speed, enable this for binary data, * possibly disable this for text data. */ #ifndef ULTRA_FAST # define ULTRA_FAST 1 #endif /* * Unconditionally aligning does not cost very much, so do it if unsure */ #ifndef STRICT_ALIGN # define STRICT_ALIGN !(defined(__i386) || defined (__amd64)) #endif /* * You may choose to pre-set the hash table (might be faster on some * modern cpus and large (>>64k) blocks, and also makes compression * deterministic/repeatable when the configuration otherwise is the same). */ #ifndef INIT_HTAB # define INIT_HTAB 0 #endif /* ======================================================================= Changing things below this line may break the HDF5 LZF filter. A.C. 2/15/09 ======================================================================= */ /* * Avoid assigning values to errno variable? for some embedding purposes * (linux kernel for example), this is neccessary. NOTE: this breaks * the documentation in lzf.h. */ #ifndef AVOID_ERRNO # define AVOID_ERRNO 0 #endif /* * Wether to pass the LZF_STATE variable as argument, or allocate it * on the stack. For small-stack environments, define this to 1. * NOTE: this breaks the prototype in lzf.h. */ #ifndef LZF_STATE_ARG # define LZF_STATE_ARG 0 #endif /* * Wether to add extra checks for input validity in lzf_decompress * and return EINVAL if the input stream has been corrupted. This * only shields against overflowing the input buffer and will not * detect most corrupted streams. * This check is not normally noticable on modern hardware * (<1% slowdown), but might slow down older cpus considerably. */ #ifndef CHECK_INPUT # define CHECK_INPUT 1 #endif /*****************************************************************************/ /* nothing should be changed below */ typedef unsigned char u8; typedef const u8 *LZF_STATE[1 << (HLOG)]; #if !STRICT_ALIGN /* for unaligned accesses we need a 16 bit datatype. 
*/ # include # if USHRT_MAX == 65535 typedef unsigned short u16; # elif UINT_MAX == 65535 typedef unsigned int u16; # else # undef STRICT_ALIGN # define STRICT_ALIGN 1 # endif #endif #if ULTRA_FAST # if defined(VERY_FAST) # undef VERY_FAST # endif #endif #if INIT_HTAB # ifdef __cplusplus # include # else # include # endif #endif #endif bitshuffle-0.3.5/lzf/lzf/lzf_c.c000066400000000000000000000214571337005776700165310ustar00rootroot00000000000000/* * Copyright (c) 2000-2008 Marc Alexander Lehmann * * Redistribution and use in source and binary forms, with or without modifica- * tion, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED * OF THE POSSIBILITY OF SUCH DAMAGE. * * Alternatively, the contents of this file may be used under the terms of * the GNU General Public License ("GPL") version 2 or any later version, * in which case the provisions of the GPL are applicable instead of * the above. If you wish to allow the use of your version of this file * only under the terms of the GPL and not to allow others to use your * version of this file under the BSD license, indicate your decision * by deleting the provisions above and replace them with the notice * and other provisions required by the GPL. If you do not delete the * provisions above, a recipient may use your version of this file under * either the BSD or the GPL. */ #include "lzfP.h" #define HSIZE (1 << (HLOG)) /* * don't play with this unless you benchmark! * decompression is not dependent on the hash function * the hashing function might seem strange, just believe me * it works ;) */ #ifndef FRST # define FRST(p) (((p[0]) << 8) | p[1]) # define NEXT(v,p) (((v) << 8) | p[2]) # if ULTRA_FAST # define IDX(h) ((( h >> (3*8 - HLOG)) - h ) & (HSIZE - 1)) # elif VERY_FAST # define IDX(h) ((( h >> (3*8 - HLOG)) - h*5) & (HSIZE - 1)) # else # define IDX(h) ((((h ^ (h << 5)) >> (3*8 - HLOG)) - h*5) & (HSIZE - 1)) # endif #endif /* * IDX works because it is very similar to a multiplicative hash, e.g. * ((h * 57321 >> (3*8 - HLOG)) & (HSIZE - 1)) * the latter is also quite fast on newer CPUs, and compresses similarly. 
* * the next one is also quite good, albeit slow ;) * (int)(cos(h & 0xffffff) * 1e6) */ #if 0 /* original lzv-like hash function, much worse and thus slower */ # define FRST(p) (p[0] << 5) ^ p[1] # define NEXT(v,p) ((v) << 5) ^ p[2] # define IDX(h) ((h) & (HSIZE - 1)) #endif #define MAX_LIT (1 << 5) #define MAX_OFF (1 << 13) #define MAX_REF ((1 << 8) + (1 << 3)) #if __GNUC__ >= 3 # define expect(expr,value) __builtin_expect ((expr),(value)) # define inline inline #else # define expect(expr,value) (expr) # define inline static #endif #define expect_false(expr) expect ((expr) != 0, 0) #define expect_true(expr) expect ((expr) != 0, 1) /* * compressed format * * 000LLLLL ; literal * LLLooooo oooooooo ; backref L * 111ooooo LLLLLLLL oooooooo ; backref L+7 * */ unsigned int lzf_compress (const void *const in_data, unsigned int in_len, void *out_data, unsigned int out_len #if LZF_STATE_ARG , LZF_STATE htab #endif ) { #if !LZF_STATE_ARG LZF_STATE htab; #endif const u8 **hslot; const u8 *ip = (const u8 *)in_data; u8 *op = (u8 *)out_data; const u8 *in_end = ip + in_len; u8 *out_end = op + out_len; const u8 *ref; /* off requires a type wide enough to hold a general pointer difference. * ISO C doesn't have that (size_t might not be enough and ptrdiff_t only * works for differences within a single object). We also assume that no * no bit pattern traps. Since the only platform that is both non-POSIX * and fails to support both assumptions is windows 64 bit, we make a * special workaround for it. */ #if ( defined (WIN32) && defined (_M_X64) ) || defined (_WIN64) unsigned _int64 off; /* workaround for missing POSIX compliance */ #else unsigned long off; #endif unsigned int hval; int lit; if (!in_len || !out_len) return 0; #if INIT_HTAB memset (htab, 0, sizeof (htab)); # if 0 for (hslot = htab; hslot < htab + HSIZE; hslot++) *hslot++ = ip; # endif #endif lit = 0; op++; /* start run */ hval = FRST (ip); while (ip < in_end - 2) { hval = NEXT (hval, ip); hslot = htab + IDX (hval); ref = *hslot; *hslot = ip; if (1 #if INIT_HTAB && ref < ip /* the next test will actually take care of this, but this is faster */ #endif && (off = ip - ref - 1) < MAX_OFF && ip + 4 < in_end && ref > (u8 *)in_data #if STRICT_ALIGN && ref[0] == ip[0] && ref[1] == ip[1] && ref[2] == ip[2] #else && *(u16 *)ref == *(u16 *)ip && ref[2] == ip[2] #endif ) { /* match found at *ref++ */ unsigned int len = 2; unsigned int maxlen = in_end - ip - len; maxlen = maxlen > MAX_REF ? 
MAX_REF : maxlen; if (expect_false (op + 3 + 1 >= out_end)) /* first a faster conservative test */ if (op - !lit + 3 + 1 >= out_end) /* second the exact but rare test */ return 0; op [- lit - 1] = lit - 1; /* stop run */ op -= !lit; /* undo run if length is zero */ for (;;) { if (expect_true (maxlen > 16)) { len++; if (ref [len] != ip [len]) break; len++; if (ref [len] != ip [len]) break; len++; if (ref [len] != ip [len]) break; len++; if (ref [len] != ip [len]) break; len++; if (ref [len] != ip [len]) break; len++; if (ref [len] != ip [len]) break; len++; if (ref [len] != ip [len]) break; len++; if (ref [len] != ip [len]) break; len++; if (ref [len] != ip [len]) break; len++; if (ref [len] != ip [len]) break; len++; if (ref [len] != ip [len]) break; len++; if (ref [len] != ip [len]) break; len++; if (ref [len] != ip [len]) break; len++; if (ref [len] != ip [len]) break; len++; if (ref [len] != ip [len]) break; len++; if (ref [len] != ip [len]) break; } do len++; while (len < maxlen && ref[len] == ip[len]); break; } len -= 2; /* len is now #octets - 1 */ ip++; if (len < 7) { *op++ = (off >> 8) + (len << 5); } else { *op++ = (off >> 8) + ( 7 << 5); *op++ = len - 7; } *op++ = off; lit = 0; op++; /* start run */ ip += len + 1; if (expect_false (ip >= in_end - 2)) break; #if ULTRA_FAST || VERY_FAST --ip; # if VERY_FAST && !ULTRA_FAST --ip; # endif hval = FRST (ip); hval = NEXT (hval, ip); htab[IDX (hval)] = ip; ip++; # if VERY_FAST && !ULTRA_FAST hval = NEXT (hval, ip); htab[IDX (hval)] = ip; ip++; # endif #else ip -= len + 1; do { hval = NEXT (hval, ip); htab[IDX (hval)] = ip; ip++; } while (len--); #endif } else { /* one more literal byte we must copy */ if (expect_false (op >= out_end)) return 0; lit++; *op++ = *ip++; if (expect_false (lit == MAX_LIT)) { op [- lit - 1] = lit - 1; /* stop run */ lit = 0; op++; /* start run */ } } } if (op + 3 > out_end) /* at most 3 bytes can be missing here */ return 0; while (ip < in_end) { lit++; *op++ = *ip++; if (expect_false (lit == MAX_LIT)) { op [- lit - 1] = lit - 1; /* stop run */ lit = 0; op++; /* start run */ } } op [- lit - 1] = lit - 1; /* end run */ op -= !lit; /* undo run if length is zero */ return op - (u8 *)out_data; } bitshuffle-0.3.5/lzf/lzf/lzf_d.c000066400000000000000000000105051337005776700165220ustar00rootroot00000000000000/* * Copyright (c) 2000-2007 Marc Alexander Lehmann * * Redistribution and use in source and binary forms, with or without modifica- * tion, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED * OF THE POSSIBILITY OF SUCH DAMAGE. * * Alternatively, the contents of this file may be used under the terms of * the GNU General Public License ("GPL") version 2 or any later version, * in which case the provisions of the GPL are applicable instead of * the above. If you wish to allow the use of your version of this file * only under the terms of the GPL and not to allow others to use your * version of this file under the BSD license, indicate your decision * by deleting the provisions above and replace them with the notice * and other provisions required by the GPL. If you do not delete the * provisions above, a recipient may use your version of this file under * either the BSD or the GPL. */ #include "lzfP.h" #if AVOID_ERRNO # define SET_ERRNO(n) #else # include # define SET_ERRNO(n) errno = (n) #endif /* ASM is slower than C in HDF5 tests -- A.C. 2/5/09 #ifndef __STRICT_ANSI__ #ifndef H5PY_DISABLE_LZF_ASM #if (__i386 || __amd64) && __GNUC__ >= 3 # define lzf_movsb(dst, src, len) \ asm ("rep movsb" \ : "=D" (dst), "=S" (src), "=c" (len) \ : "0" (dst), "1" (src), "2" (len)); #endif #endif #endif */ unsigned int lzf_decompress (const void *const in_data, unsigned int in_len, void *out_data, unsigned int out_len) { u8 const *ip = (const u8 *)in_data; u8 *op = (u8 *)out_data; u8 const *const in_end = ip + in_len; u8 *const out_end = op + out_len; do { unsigned int ctrl = *ip++; if (ctrl < (1 << 5)) /* literal run */ { ctrl++; if (op + ctrl > out_end) { SET_ERRNO (E2BIG); return 0; } #if CHECK_INPUT if (ip + ctrl > in_end) { SET_ERRNO (EINVAL); return 0; } #endif #ifdef lzf_movsb lzf_movsb (op, ip, ctrl); #else do *op++ = *ip++; while (--ctrl); #endif } else /* back reference */ { unsigned int len = ctrl >> 5; u8 *ref = op - ((ctrl & 0x1f) << 8) - 1; #if CHECK_INPUT if (ip >= in_end) { SET_ERRNO (EINVAL); return 0; } #endif if (len == 7) { len += *ip++; #if CHECK_INPUT if (ip >= in_end) { SET_ERRNO (EINVAL); return 0; } #endif } ref -= *ip++; if (op + len + 2 > out_end) { SET_ERRNO (E2BIG); return 0; } if (ref < (u8 *)out_data) { SET_ERRNO (EINVAL); return 0; } #ifdef lzf_movsb len += 2; lzf_movsb (op, ref, len); #else *op++ = *ref++; *op++ = *ref++; do *op++ = *ref++; while (--len); #endif } } while (ip < in_end); return op - (u8 *)out_data; } bitshuffle-0.3.5/lzf/lzf_filter.c000066400000000000000000000154201337005776700167720ustar00rootroot00000000000000/***** Preamble block ********************************************************* * * This file is part of h5py, a low-level Python interface to the HDF5 library. * * Copyright (C) 2008 Andrew Collette * http://h5py.alfven.org * License: BSD (See LICENSE.txt for full license) * * $Date$ * ****** End preamble block ****************************************************/ /* Implements an LZF filter module for HDF5, using the BSD-licensed library by Marc Alexander Lehmann (http://www.goof.com/pcg/marc/liblzf.html). No Python-specific code is used. The filter behaves like the DEFLATE filter, in that it is called for every type and space, and returns 0 if the data cannot be compressed. 
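
    A hedged embedding sketch (error handling elided; call this once,
    before creating or reading LZF-compressed datasets):

        if (register_lzf() < 0)
            fprintf(stderr, "Failed to register the LZF filter\n");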
The only public function is (int) register_lzf(void), which passes on the result from H5Zregister. */ #include #include #include #include "hdf5.h" #include "lzf/lzf.h" #include "lzf_filter.h" /* Our own versions of H5Epush_sim, as it changed in 1.8 */ #if H5_VERS_MAJOR == 1 && H5_VERS_MINOR < 7 #define PUSH_ERR(func, minor, str) H5Epush(__FILE__, func, __LINE__, H5E_PLINE, minor, str) #define H5PY_GET_FILTER H5Pget_filter_by_id #else #define PUSH_ERR(func, minor, str) H5Epush1(__FILE__, func, __LINE__, H5E_PLINE, minor, str) #define H5PY_GET_FILTER(a,b,c,d,e,f,g) H5Pget_filter_by_id2(a,b,c,d,e,f,g,NULL) #endif /* Deal with the mutiple definitions for H5Z_class_t. Note: Only HDF5 1.6 and 1.8 are supported. (1) The old class should always be used for HDF5 1.6 (2) The new class should always be used for HDF5 1.8 < 1.8.3 (3) The old class should be used for HDF5 1.8 >= 1.8.3 only if the macro H5_USE_16_API is set */ #if H5_VERS_MAJOR == 1 && H5_VERS_MINOR == 8 && (H5_VERS_RELEASE < 3 || !H5_USE_16_API) #define H5PY_H5Z_NEWCLS 1 #else #define H5PY_H5Z_NEWCLS 0 #endif size_t lzf_filter(unsigned flags, size_t cd_nelmts, const unsigned cd_values[], size_t nbytes, size_t *buf_size, void **buf); herr_t lzf_set_local(hid_t dcpl, hid_t type, hid_t space); /* Try to register the filter, passing on the HDF5 return value */ int register_lzf(void){ int retval; #if H5PY_H5Z_NEWCLS H5Z_class_t filter_class = { H5Z_CLASS_T_VERS, (H5Z_filter_t)(H5PY_FILTER_LZF), 1, 1, "lzf", NULL, (H5Z_set_local_func_t)(lzf_set_local), (H5Z_func_t)(lzf_filter) }; #else H5Z_class_t filter_class = { (H5Z_filter_t)(H5PY_FILTER_LZF), "lzf", NULL, (H5Z_set_local_func_t)(lzf_set_local), (H5Z_func_t)(lzf_filter) }; #endif retval = H5Zregister(&filter_class); if(retval<0){ PUSH_ERR("register_lzf", H5E_CANTREGISTER, "Can't register LZF filter"); } return retval; } /* Filter setup. Records the following inside the DCPL: 1. If version information is not present, set slots 0 and 1 to the filter revision and LZF API version, respectively. 2. Compute the chunk size in bytes and store it in slot 2. */ herr_t lzf_set_local(hid_t dcpl, hid_t type, hid_t space){ int ndims; int i; herr_t r; unsigned int bufsize; hsize_t chunkdims[32]; unsigned int flags; size_t nelements = 8; unsigned values[] = {0,0,0,0,0,0,0,0}; r = H5PY_GET_FILTER(dcpl, H5PY_FILTER_LZF, &flags, &nelements, values, 0, NULL); if(r<0) return -1; if(nelements < 3) nelements = 3; /* First 3 slots reserved. If any higher slots are used, preserve the contents. 
*/ /* It seems the H5Z_FLAG_REVERSE flag doesn't work here, so we have to be careful not to clobber any existing version info */ if(values[0]==0) values[0] = H5PY_FILTER_LZF_VERSION; if(values[1]==0) values[1] = LZF_VERSION; ndims = H5Pget_chunk(dcpl, 32, chunkdims); if(ndims<0) return -1; if(ndims>32){ PUSH_ERR("lzf_set_local", H5E_CALLBACK, "Chunk rank exceeds limit"); return -1; } bufsize = H5Tget_size(type); if(bufsize==0) return -1; for(i=0;i=3)&&(cd_values[2]!=0)){ outbuf_size = cd_values[2]; /* Precomputed buffer guess */ }else{ outbuf_size = (*buf_size); } #ifdef H5PY_LZF_DEBUG fprintf(stderr, "Decompress %d chunk w/buffer %d\n", nbytes, outbuf_size); #endif while(!status){ free(outbuf); outbuf = malloc(outbuf_size); if(outbuf == NULL){ PUSH_ERR("lzf_filter", H5E_CALLBACK, "Can't allocate decompression buffer"); goto failed; } status = lzf_decompress(*buf, nbytes, outbuf, outbuf_size); if(!status){ /* compression failed */ if(errno == E2BIG){ outbuf_size += (*buf_size); #ifdef H5PY_LZF_DEBUG fprintf(stderr, " Too small: %d\n", outbuf_size); #endif } else if(errno == EINVAL) { PUSH_ERR("lzf_filter", H5E_CALLBACK, "Invalid data for LZF decompression"); goto failed; } else { PUSH_ERR("lzf_filter", H5E_CALLBACK, "Unknown LZF decompression error"); goto failed; } } /* if !status */ } /* while !status */ } /* compressing vs decompressing */ if(status != 0){ free(*buf); *buf = outbuf; *buf_size = outbuf_size; return status; /* Size of compressed/decompressed data */ } failed: free(outbuf); return 0; } /* End filter function */ bitshuffle-0.3.5/lzf/lzf_filter.h000066400000000000000000000015521337005776700170000ustar00rootroot00000000000000/***** Preamble block ********************************************************* * * This file is part of h5py, a low-level Python interface to the HDF5 library. * * Copyright (C) 2008 Andrew Collette * http://h5py.alfven.org * License: BSD (See LICENSE.txt for full license) * * $Date$ * ****** End preamble block ****************************************************/ #ifndef H5PY_LZF_H #define H5PY_LZF_H #ifdef __cplusplus extern "C" { #endif /* Filter revision number, starting at 1 */ #define H5PY_FILTER_LZF_VERSION 4 /* Filter ID registered with the HDF Group as of 2/6/09. For maintenance requests, contact the filter author directly. */ #define H5PY_FILTER_LZF 32000 /* Register the filter with the library. Returns a negative value on failure, and a non-negative value on success. */ int register_lzf(void); #ifdef __cplusplus } #endif #endif bitshuffle-0.3.5/requirements.txt000066400000000000000000000001271337005776700171550ustar00rootroot00000000000000# Order matters setuptools>=0.7 Cython>=0.19 numpy>=1.6.1 h5py>=2.4.0 --no-binary=h5py bitshuffle-0.3.5/setup.cfg.example000066400000000000000000000005201337005776700171410ustar00rootroot00000000000000[install] # These control the installation of the hdf5 dynamically loaded filter plugin. h5plugin = 0 h5plugin-dir = /usr/local/hdf5/lib/plugin [build_ext] # Whether to compile with OpenMP multi-threading. Default is system dependant: # False on OSX (since the clang compiler does not yet support OpenMP) and True # otherwise. omp = 1 bitshuffle-0.3.5/setup.py000066400000000000000000000265231337005776700154130ustar00rootroot00000000000000from __future__ import absolute_import, division, print_function # I didn't import unicode_literals. They break setuptools or Cython in python # 2.7, but python 3 seems to be happy with them. 
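
# Hedged usage sketch (the --omp, --h5plugin and --h5plugin-dir options are
# defined by the build_ext/install commands below; the plugin directory shown
# is this file's default and only an example):
#
#   python setup.py build_ext --omp=1
#   python setup.py install --h5plugin \
#       --h5plugin-dir=/usr/local/hdf5/lib/plugin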
import glob
import os
from os import path
from setuptools import setup, Extension
from setuptools.command.build_ext import build_ext as build_ext_
from setuptools.command.develop import develop as develop_
from setuptools.command.install import install as install_
import shutil
import subprocess
import sys


VERSION_MAJOR = 0
VERSION_MINOR = 3
VERSION_POINT = 5

# Only unset in the 'release' branch and in tags.
VERSION_DEV = 0

VERSION = "%d.%d.%d" % (VERSION_MAJOR, VERSION_MINOR, VERSION_POINT)
if VERSION_DEV:
    VERSION = VERSION + ".dev%d" % VERSION_DEV


COMPILE_FLAGS = ['-O3', '-ffast-math', '-march=native', '-std=c99']
# Cython breaks strict aliasing rules.
COMPILE_FLAGS += ['-fno-strict-aliasing']
COMPILE_FLAGS += ['-fPIC']

COMPILE_FLAGS_MSVC = ['/Ox', '/fp:fast']

MACROS = [
    ('BSHUF_VERSION_MAJOR', VERSION_MAJOR),
    ('BSHUF_VERSION_MINOR', VERSION_MINOR),
    ('BSHUF_VERSION_POINT', VERSION_POINT),
]


H5PLUGINS_DEFAULT = '/usr/local/hdf5/lib/plugin'

# OSX's clang compiler does not support OpenMP.
if sys.platform == 'darwin':
    OMP_DEFAULT = False
else:
    OMP_DEFAULT = True

FALLBACK_CONFIG = {
    'include_dirs': [],
    'library_dirs': [],
    'libraries': [],
    'extra_compile_args': [],
    'extra_link_args': [],
}

if 'HDF5_DIR' in os.environ:
    FALLBACK_CONFIG['include_dirs'] += [os.environ['HDF5_DIR'] + '/include']  # macports
    FALLBACK_CONFIG['library_dirs'] += [os.environ['HDF5_DIR'] + '/lib']  # macports
elif sys.platform == 'darwin':
    # putting here both macports and homebrew paths will generate
    # "ld: warning: dir not found" at the linking phase
    FALLBACK_CONFIG['include_dirs'] += ['/opt/local/include']  # macports
    FALLBACK_CONFIG['library_dirs'] += ['/opt/local/lib']  # macports
    FALLBACK_CONFIG['include_dirs'] += ['/usr/local/include']  # homebrew
    FALLBACK_CONFIG['library_dirs'] += ['/usr/local/lib']  # homebrew
elif sys.platform.startswith('freebsd'):
    FALLBACK_CONFIG['include_dirs'] += ['/usr/local/include']  # homebrew
    FALLBACK_CONFIG['library_dirs'] += ['/usr/local/lib']  # homebrew

FALLBACK_CONFIG['include_dirs'] = [d for d in FALLBACK_CONFIG['include_dirs']
                                   if path.isdir(d)]
FALLBACK_CONFIG['library_dirs'] = [d for d in FALLBACK_CONFIG['library_dirs']
                                   if path.isdir(d)]

FALLBACK_CONFIG['extra_compile_args'] = ['-DH5_BUILT_AS_DYNAMIC_LIB']


def pkgconfig(*packages, **kw):
    config = kw.setdefault('config', {})
    optional_args = kw.setdefault('optional', '')
    flag_map = {'include_dirs': ['--cflags-only-I', 2],
                'library_dirs': ['--libs-only-L', 2],
                'libraries': ['--libs-only-l', 2],
                'extra_compile_args': ['--cflags-only-other', 0],
                'extra_link_args': ['--libs-only-other', 0],
                }
    for package in packages:
        try:
            subprocess.check_output(["pkg-config", package])
        except (subprocess.CalledProcessError, OSError):
            print("Can't find %s with pkg-config; falling back to "
                  "static config" % package)
            for distutils_key in flag_map:
                config.setdefault(distutils_key, []).extend(
                    FALLBACK_CONFIG[distutils_key])
            config['libraries'].append(package)
        else:
            for distutils_key, (pkg_option, n) in flag_map.items():
                items = subprocess.check_output(
                    ['pkg-config', optional_args, pkg_option, package]
                ).decode('utf8').split()
                opt = config.setdefault(distutils_key, [])
                opt.extend([i[n:] for i in items])
    return config


ext_bshuf = Extension(
    "bitshuffle.ext",
    sources=["bitshuffle/ext.pyx", "src/bitshuffle.c",
             "src/bitshuffle_core.c", "src/iochain.c", "lz4/lz4.c"],
    include_dirs=["src/", "lz4/"],
    depends=["src/bitshuffle.h", "src/bitshuffle_core.h",
             "src/iochain.h", "lz4/lz4.h"],
    libraries=[],
    define_macros=MACROS,
)

h5filter = Extension(
    "bitshuffle.h5",
sources=["bitshuffle/h5.pyx", "src/bshuf_h5filter.c", "src/bitshuffle.c", "src/bitshuffle_core.c", "src/iochain.c", "lz4/lz4.c"], depends=["src/bitshuffle.h", "src/bitshuffle_core.h", "src/iochain.h", "src/bshuf_h5filter.h", "lz4/lz4.h"], define_macros=MACROS, **pkgconfig("hdf5", config=dict( include_dirs=["src/", "lz4/"])) ) filter_plugin = Extension( "bitshuffle.plugin.libh5bshuf", sources=["src/bshuf_h5plugin.c", "src/bshuf_h5filter.c", "src/bitshuffle.c", "src/bitshuffle_core.c", "src/iochain.c", "lz4/lz4.c"], depends=["src/bitshuffle.h", "src/bitshuffle_core.h", "src/iochain.h", 'src/bshuf_h5filter.h', "lz4/lz4.h"], define_macros=MACROS, **pkgconfig("hdf5", config=dict( include_dirs=["src/", "lz4/"])) ) lzf_plugin = Extension( "bitshuffle.plugin.libh5LZF", sources=["src/lzf_h5plugin.c", "lzf/lzf_filter.c", "lzf/lzf/lzf_c.c", "lzf/lzf/lzf_d.c"], depends=["lzf/lzf_filter.h", "lzf/lzf/lzf.h", "lzf/lzf/lzfP.h"], **pkgconfig("hdf5", config=dict( include_dirs=["lzf/", "lzf/lzf/"])) ) EXTENSIONS = [ext_bshuf, h5filter] # Check for plugin hdf5 plugin support (hdf5 >= 1.8.11) HDF5_PLUGIN_SUPPORT = False CPATHS = os.environ['CPATH'].split(':') if 'CPATH' in os.environ else [] for p in ["/usr/include"] + pkgconfig("hdf5")["include_dirs"] + CPATHS: if os.path.exists(os.path.join(p, "H5PLextern.h")): HDF5_PLUGIN_SUPPORT = True if HDF5_PLUGIN_SUPPORT: EXTENSIONS.extend([filter_plugin, lzf_plugin]) class develop(develop_): def run(self): # Dummy directory for copying build plugins. if not path.isdir('bitshuffle/plugin'): os.mkdir('bitshuffle/plugin') develop_.run(self) # Custom installation to include installing dynamic filters. class install(install_): user_options = install_.user_options + [ ('h5plugin', None, 'Install HDF5 filter plugins for use outside of python.'), ('h5plugin-dir=', None, 'Where to install filter plugins. Default %s.' % H5PLUGINS_DEFAULT), ] def initialize_options(self): install_.initialize_options(self) self.h5plugin = False self.h5plugin_dir = H5PLUGINS_DEFAULT def finalize_options(self): install_.finalize_options(self) if self.h5plugin not in ('0', '1', True, False): raise ValueError("Invalid h5plugin argument. Mut be '0' or '1'.") self.h5plugin = int(self.h5plugin) self.h5plugin_dir = path.abspath(self.h5plugin_dir) def run(self): install_.run(self) if self.h5plugin: if not HDF5_PLUGIN_SUPPORT: print("HDF5 < 1.8.11, not installing filter plugins.") return plugin_build = path.join(self.build_lib, "bitshuffle", "plugin") try: os.makedirs(self.h5plugin_dir) except OSError as e: if e.args[0] == 17: # Directory already exists, this is fine. pass else: raise plugin_libs = glob.glob(path.join(plugin_build, "*")) for plugin_lib in plugin_libs: plugin_name = path.split(plugin_lib)[1] shutil.copy2(plugin_lib, path.join(self.h5plugin_dir, plugin_name)) print("Installed HDF5 filter plugins to %s" % self.h5plugin_dir) # Command line or site.cfg specification of OpenMP. class build_ext(build_ext_): user_options = build_ext_.user_options + [ ('omp=', None, "Whether to compile with OpenMP threading. Default" " on current system is %s." % str(OMP_DEFAULT)) ] boolean_options = build_ext_.boolean_options + ['omp'] def initialize_options(self): build_ext_.initialize_options(self) self.omp = OMP_DEFAULT def finalize_options(self): # For some reason this gets run twice. Careful to print messages and # add arguments only one time. build_ext_.finalize_options(self) if self.omp not in ('0', '1', True, False): raise ValueError("Invalid omp argument. 
Mut be '0' or '1'.") self.omp = int(self.omp) import numpy as np ext_bshuf.include_dirs.append(np.get_include()) # Required only by old version of setuptools < 18.0 from Cython.Build import cythonize self.extensions = cythonize(self.extensions) for ext in self.extensions: ext._needs_stub = False def build_extensions(self): c = self.compiler.compiler_type if self.omp not in ('0', '1', True, False): raise ValueError("Invalid omp argument. Mut be '0' or '1'.") self.omp = int(self.omp) if self.omp: if not hasattr(self, "_printed_omp_message"): self._printed_omp_message = True print("\n#################################") print("# Compiling with OpenMP support #") print("#################################\n") # More portable to pass -fopenmp to linker. # self.libraries += ['gomp'] if self.compiler.compiler_type == 'msvc': openmpflag = '/openmp' compileflags = COMPILE_FLAGS_MSVC else: openmpflag = '-fopenmp' compileflags = COMPILE_FLAGS for e in self.extensions: e.extra_compile_args = list(set(e.extra_compile_args).union(compileflags)) if openmpflag not in e.extra_compile_args: e.extra_compile_args += [openmpflag] if openmpflag not in e.extra_link_args: e.extra_link_args += [openmpflag] build_ext_.build_extensions(self) # Don't install numpy/cython/hdf5 if not needed for cmd in ["sdist", "clean", "--help", "--help-commands", "--version"]: if cmd in sys.argv: setup_requires = [] break else: setup_requires = ["Cython>=0.19", "numpy>=1.6.1"] with open('requirements.txt') as f: requires = f.read().splitlines() requires = [r.split()[0] for r in requires] with open('README.rst') as r: long_description = r.read() # TODO hdf5 support should be an "extra". Figure out how to set this up. setup( name='bitshuffle', version=VERSION, packages=['bitshuffle', 'bitshuffle.tests'], scripts=[], ext_modules=EXTENSIONS, cmdclass={'build_ext': build_ext, 'install': install, 'develop': develop}, setup_requires=setup_requires, install_requires=requires, # extras_require={'H5': ["h5py"]}, package_data={'': ['data/*']}, # metadata for upload to PyPI author="Kiyoshi Wesley Masui", author_email="kiyo@physics.ubc.ca", description="Bitshuffle filter for improving typed data compression.", long_description=long_description, license="MIT", url="https://github.com/kiyo-masui/bitshuffle", download_url=("https://github.com/kiyo-masui/bitshuffle/tarball/%s" % VERSION), keywords=['compression', 'hdf5', 'numpy'], ) bitshuffle-0.3.5/src/000077500000000000000000000000001337005776700144605ustar00rootroot00000000000000bitshuffle-0.3.5/src/bitshuffle.c000066400000000000000000000111151337005776700167560ustar00rootroot00000000000000/* * Bitshuffle - Filter for improving compression of typed binary data. * * Author: Kiyoshi Masui * Website: http://www.github.com/kiyo-masui/bitshuffle * Created: 2014 * * See LICENSE file for details about copyright and rights to use. * */ #include "bitshuffle.h" #include "bitshuffle_core.h" #include "bitshuffle_internals.h" #include "lz4.h" #include #include // Constants. // Use fast decompression instead of safe decompression for LZ4. #define BSHUF_LZ4_DECOMPRESS_FAST // Macros. #define CHECK_ERR_FREE_LZ(count, buf) if (count < 0) { \ free(buf); return count - 1000; } /* Bitshuffle and compress a single block. 
*/ int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \ const size_t size, const size_t elem_size) { int64_t nbytes, count; void *tmp_buf_bshuf; void *tmp_buf_lz4; size_t this_iter; const void *in; void *out; tmp_buf_bshuf = malloc(size * elem_size); if (tmp_buf_bshuf == NULL) return -1; tmp_buf_lz4 = malloc(LZ4_compressBound(size * elem_size)); if (tmp_buf_lz4 == NULL){ free(tmp_buf_bshuf); return -1; } in = ioc_get_in(C_ptr, &this_iter); ioc_set_next_in(C_ptr, &this_iter, (void*) ((char*) in + size * elem_size)); count = bshuf_trans_bit_elem(in, tmp_buf_bshuf, size, elem_size); if (count < 0) { free(tmp_buf_lz4); free(tmp_buf_bshuf); return count; } nbytes = LZ4_compress((const char*) tmp_buf_bshuf, (char*) tmp_buf_lz4, size * elem_size); free(tmp_buf_bshuf); CHECK_ERR_FREE_LZ(nbytes, tmp_buf_lz4); out = ioc_get_out(C_ptr, &this_iter); ioc_set_next_out(C_ptr, &this_iter, (void *) ((char *) out + nbytes + 4)); bshuf_write_uint32_BE(out, nbytes); memcpy((char *) out + 4, tmp_buf_lz4, nbytes); free(tmp_buf_lz4); return nbytes + 4; } /* Decompress and bitunshuffle a single block. */ int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr, const size_t size, const size_t elem_size) { int64_t nbytes, count; void *out, *tmp_buf; const void *in; size_t this_iter; int32_t nbytes_from_header; in = ioc_get_in(C_ptr, &this_iter); nbytes_from_header = bshuf_read_uint32_BE(in); ioc_set_next_in(C_ptr, &this_iter, (void*) ((char*) in + nbytes_from_header + 4)); out = ioc_get_out(C_ptr, &this_iter); ioc_set_next_out(C_ptr, &this_iter, (void *) ((char *) out + size * elem_size)); tmp_buf = malloc(size * elem_size); if (tmp_buf == NULL) return -1; #ifdef BSHUF_LZ4_DECOMPRESS_FAST nbytes = LZ4_decompress_fast((const char*) in + 4, (char*) tmp_buf, size * elem_size); CHECK_ERR_FREE_LZ(nbytes, tmp_buf); if (nbytes != nbytes_from_header) { free(tmp_buf); return -91; } #else nbytes = LZ4_decompress_safe((const char*) in + 4, (char *) tmp_buf, nbytes_from_header, size * elem_size); CHECK_ERR_FREE_LZ(nbytes, tmp_buf); if (nbytes != size * elem_size) { free(tmp_buf); return -91; } nbytes = nbytes_from_header; #endif count = bshuf_untrans_bit_elem(tmp_buf, out, size, elem_size); CHECK_ERR_FREE(count, tmp_buf); nbytes += 4; free(tmp_buf); return nbytes; } /* ---- Public functions ---- * * See header file for description and usage. * */ size_t bshuf_compress_lz4_bound(const size_t size, const size_t elem_size, size_t block_size) { size_t bound, leftover; if (block_size == 0) { block_size = bshuf_default_block_size(elem_size); } if (block_size % BSHUF_BLOCKED_MULT) return -81; // Note that each block gets a 4 byte header. // Size of full blocks. bound = (LZ4_compressBound(block_size * elem_size) + 4) * (size / block_size); // Size of partial blocks, if any. leftover = ((size % block_size) / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT; if (leftover) bound += LZ4_compressBound(leftover * elem_size) + 4; // Size of uncompressed data not fitting into any blocks. 
bound += (size % BSHUF_BLOCKED_MULT) * elem_size; return bound; } int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size, const size_t elem_size, size_t block_size) { return bshuf_blocked_wrap_fun(&bshuf_compress_lz4_block, in, out, size, elem_size, block_size); } int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size, const size_t elem_size, size_t block_size) { return bshuf_blocked_wrap_fun(&bshuf_decompress_lz4_block, in, out, size, elem_size, block_size); } bitshuffle-0.3.5/src/bitshuffle.h000066400000000000000000000072161337005776700167720ustar00rootroot00000000000000/* * Bitshuffle - Filter for improving compression of typed binary data. * * This file is part of Bitshuffle * Author: Kiyoshi Masui * Website: http://www.github.com/kiyo-masui/bitshuffle * Created: 2014 * * See LICENSE file for details about copyright and rights to use. * * * Header File * * Worker routines return an int64_t which is the number of bytes processed * if positive or an error code if negative. * * Error codes: * -1 : Failed to allocate memory. * -11 : Missing SSE. * -12 : Missing AVX. * -80 : Input size not a multiple of 8. * -81 : block_size not multiple of 8. * -91 : Decompression error, wrong number of bytes processed. * -1YYY : Error internal to compression routine with error code -YYY. */ #ifndef BITSHUFFLE_H #define BITSHUFFLE_H #include #include "bitshuffle_core.h" #ifdef __cplusplus extern "C" { #endif /* ---- bshuf_compress_lz4_bound ---- * * Bound on size of data compressed with *bshuf_compress_lz4*. * * Parameters * ---------- * size : number of elements in input * elem_size : element size of typed data * block_size : Process in blocks of this many elements. Pass 0 to * select automatically (recommended). * * Returns * ------- * Bound on compressed data size. * */ size_t bshuf_compress_lz4_bound(const size_t size, const size_t elem_size, size_t block_size); /* ---- bshuf_compress_lz4 ---- * * Bitshuffled and compress the data using LZ4. * * Transpose within elements, in blocks of data of *block_size* elements then * compress the blocks using LZ4. In the output buffer, each block is prefixed * by a 4 byte integer giving the compressed size of that block. * * Output buffer must be large enough to hold the compressed data. This could * be in principle substantially larger than the input buffer. Use the routine * *bshuf_compress_lz4_bound* to get an upper limit. * * Parameters * ---------- * in : input buffer, must be of size * elem_size bytes * out : output buffer, must be large enough to hold data. * size : number of elements in input * elem_size : element size of typed data * block_size : Process in blocks of this many elements. Pass 0 to * select automatically (recommended). * * Returns * ------- * number of bytes used in output buffer, negative error-code if failed. * */ int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size, const size_t elem_size, size_t block_size); /* ---- bshuf_decompress_lz4 ---- * * Undo compression and bitshuffling. * * Decompress data then un-bitshuffle it in blocks of *block_size* elements. * * To properly unshuffle bitshuffled data, *size*, *elem_size* and *block_size* * must patch the parameters used to compress the data. * * NOT TO BE USED WITH UNTRUSTED DATA: This routine uses the function * LZ4_decompress_fast from LZ4, which does not protect against maliciously * formed datasets. By modifying the compressed data, this function could be * coerced into leaving the boundaries of the input buffer. 
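 *
 * Hedged round-trip sketch (illustrative only; *comp* is assumed to be
 * allocated with bshuf_compress_lz4_bound(size, elem_size, 0) bytes):
 *
 *   int64_t c_size = bshuf_compress_lz4(in, comp, size, elem_size, 0);
 *   int64_t n_read = bshuf_decompress_lz4(comp, out, size, elem_size, 0);
 *   // Negative return values are the error codes listed at the top of
 *   // this header.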
* * Parameters * ---------- * in : input buffer * out : output buffer, must be of size * elem_size bytes * size : number of elements in input * elem_size : element size of typed data * block_size : Process in blocks of this many elements. Pass 0 to * select automatically (recommended). * * Returns * ------- * number of bytes consumed in *input* buffer, negative error-code if failed. * */ int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size, const size_t elem_size, size_t block_size); #ifdef __cplusplus } // extern "C" #endif #endif // BITSHUFFLE_H bitshuffle-0.3.5/src/bitshuffle_core.c000066400000000000000000001657561337005776700200130ustar00rootroot00000000000000/* * Bitshuffle - Filter for improving compression of typed binary data. * * Author: Kiyoshi Masui * Website: http://www.github.com/kiyo-masui/bitshuffle * Created: 2014 * * See LICENSE file for details about copyright and rights to use. * */ #include "bitshuffle_core.h" #include "bitshuffle_internals.h" #include #include #if defined(__AVX2__) && defined (__SSE2__) #define USEAVX2 #endif #if defined(__SSE2__) #define USESSE2 #endif #if defined(__ARM_NEON__) || (__ARM_NEON) #define USEARMNEON #endif // Conditional includes for SSE2 and AVX2. #ifdef USEAVX2 #include #elif defined USESSE2 #include #elif defined USEARMNEON #include #endif #if defined(_OPENMP) && defined(_MSC_VER) typedef int64_t omp_size_t; #else typedef size_t omp_size_t; #endif // Macros. #define CHECK_MULT_EIGHT(n) if (n % 8) return -80; #define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) /* ---- Functions indicating compile time instruction set. ---- */ int bshuf_using_NEON(void) { #ifdef USEARMNEON return 1; #else return 0; #endif } int bshuf_using_SSE2(void) { #ifdef USESSE2 return 1; #else return 0; #endif } int bshuf_using_AVX2(void) { #ifdef USEAVX2 return 1; #else return 0; #endif } /* ---- Worker code not requiring special instruction sets. ---- * * The following code does not use any x86 specific vectorized instructions * and should compile on any machine * */ /* Transpose 8x8 bit array packed into a single quadword *x*. * *t* is workspace. */ #define TRANS_BIT_8X8(x, t) { \ t = (x ^ (x >> 7)) & 0x00AA00AA00AA00AALL; \ x = x ^ t ^ (t << 7); \ t = (x ^ (x >> 14)) & 0x0000CCCC0000CCCCLL; \ x = x ^ t ^ (t << 14); \ t = (x ^ (x >> 28)) & 0x00000000F0F0F0F0LL; \ x = x ^ t ^ (t << 28); \ } /* Transpose 8x8 bit array along the diagonal from upper right to lower left */ #define TRANS_BIT_8X8_BE(x, t) { \ t = (x ^ (x >> 9)) & 0x0055005500550055LL; \ x = x ^ t ^ (t << 9); \ t = (x ^ (x >> 18)) & 0x0000333300003333LL; \ x = x ^ t ^ (t << 18); \ t = (x ^ (x >> 36)) & 0x000000000F0F0F0FLL; \ x = x ^ t ^ (t << 36); \ } /* Transpose of an array of arbitrarily typed elements. */ #define TRANS_ELEM_TYPE(in, out, lda, ldb, type_t) { \ size_t ii, jj, kk; \ const type_t* in_type = (const type_t*) in; \ type_t* out_type = (type_t*) out; \ for(ii = 0; ii + 7 < lda; ii += 8) { \ for(jj = 0; jj < ldb; jj++) { \ for(kk = 0; kk < 8; kk++) { \ out_type[jj*lda + ii + kk] = \ in_type[ii*ldb + kk * ldb + jj]; \ } \ } \ } \ for(ii = lda - lda % 8; ii < lda; ii ++) { \ for(jj = 0; jj < ldb; jj++) { \ out_type[jj*lda + ii] = in_type[ii*ldb + jj]; \ } \ } \ } /* Memory copy with bshuf call signature. For testing and profiling. 
*/ int64_t bshuf_copy(const void* in, void* out, const size_t size, const size_t elem_size) { const char* in_b = (const char*) in; char* out_b = (char*) out; memcpy(out_b, in_b, size * elem_size); return size * elem_size; } /* Transpose bytes within elements, starting partway through input. */ int64_t bshuf_trans_byte_elem_remainder(const void* in, void* out, const size_t size, const size_t elem_size, const size_t start) { size_t ii, jj, kk; const char* in_b = (const char*) in; char* out_b = (char*) out; CHECK_MULT_EIGHT(start); if (size > start) { // ii loop separated into 2 loops so the compiler can unroll // the inner one. for (ii = start; ii + 7 < size; ii += 8) { for (jj = 0; jj < elem_size; jj++) { for (kk = 0; kk < 8; kk++) { out_b[jj * size + ii + kk] = in_b[ii * elem_size + kk * elem_size + jj]; } } } for (ii = size - size % 8; ii < size; ii ++) { for (jj = 0; jj < elem_size; jj++) { out_b[jj * size + ii] = in_b[ii * elem_size + jj]; } } } return size * elem_size; } /* Transpose bytes within elements. */ int64_t bshuf_trans_byte_elem_scal(const void* in, void* out, const size_t size, const size_t elem_size) { return bshuf_trans_byte_elem_remainder(in, out, size, elem_size, 0); } /* Transpose bits within bytes. */ int64_t bshuf_trans_bit_byte_remainder(const void* in, void* out, const size_t size, const size_t elem_size, const size_t start_byte) { const uint64_t* in_b = (const uint64_t*) in; uint8_t* out_b = (uint8_t*) out; uint64_t x, t; size_t ii, kk; size_t nbyte = elem_size * size; size_t nbyte_bitrow = nbyte / 8; uint64_t e=1; const int little_endian = *(uint8_t *) &e == 1; const size_t bit_row_skip = little_endian ? nbyte_bitrow : -nbyte_bitrow; const int64_t bit_row_offset = little_endian ? 0 : 7 * nbyte_bitrow; CHECK_MULT_EIGHT(nbyte); CHECK_MULT_EIGHT(start_byte); for (ii = start_byte / 8; ii < nbyte_bitrow; ii ++) { x = in_b[ii]; if (little_endian) { TRANS_BIT_8X8(x, t); } else { TRANS_BIT_8X8_BE(x, t); } for (kk = 0; kk < 8; kk ++) { out_b[bit_row_offset + kk * bit_row_skip + ii] = x; x = x >> 8; } } return size * elem_size; } /* Transpose bits within bytes. */ int64_t bshuf_trans_bit_byte_scal(const void* in, void* out, const size_t size, const size_t elem_size) { return bshuf_trans_bit_byte_remainder(in, out, size, elem_size, 0); } /* General transpose of an array, optimized for large element sizes. */ int64_t bshuf_trans_elem(const void* in, void* out, const size_t lda, const size_t ldb, const size_t elem_size) { size_t ii, jj; const char* in_b = (const char*) in; char* out_b = (char*) out; for(ii = 0; ii < lda; ii++) { for(jj = 0; jj < ldb; jj++) { memcpy(&out_b[(jj*lda + ii) * elem_size], &in_b[(ii*ldb + jj) * elem_size], elem_size); } } return lda * ldb * elem_size; } /* Transpose rows of shuffled bits (size / 8 bytes) within groups of 8. */ int64_t bshuf_trans_bitrow_eight(const void* in, void* out, const size_t size, const size_t elem_size) { size_t nbyte_bitrow = size / 8; CHECK_MULT_EIGHT(size); return bshuf_trans_elem(in, out, 8, elem_size, nbyte_bitrow); } /* Transpose bits within elements. 
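 *
 * A sketch of the scalar data flow (no additional API): the byte transpose
 * first groups byte j of every element together; the bit transpose then
 * rearranges each 8x8 bit matrix; the bitrow pass finally regroups rows so
 * bit b of all elements is contiguous.  For eight uint8_t elements
 * in[0..7] the net effect is, informally,
 *
 *      out[b] == ((in[0] >> b) & 1) << 0 | ((in[1] >> b) & 1) << 1 | ...
 *              | ((in[7] >> b) & 1) << 7;   // for b = 0..7, little endian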
*/ int64_t bshuf_trans_bit_elem_scal(const void* in, void* out, const size_t size, const size_t elem_size) { int64_t count; void *tmp_buf; CHECK_MULT_EIGHT(size); tmp_buf = malloc(size * elem_size); if (tmp_buf == NULL) return -1; count = bshuf_trans_byte_elem_scal(in, out, size, elem_size); CHECK_ERR_FREE(count, tmp_buf); count = bshuf_trans_bit_byte_scal(out, tmp_buf, size, elem_size); CHECK_ERR_FREE(count, tmp_buf); count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size); free(tmp_buf); return count; } /* For data organized into a row for each bit (8 * elem_size rows), transpose * the bytes. */ int64_t bshuf_trans_byte_bitrow_scal(const void* in, void* out, const size_t size, const size_t elem_size) { size_t ii, jj, kk, nbyte_row; const char *in_b; char *out_b; in_b = (const char*) in; out_b = (char*) out; nbyte_row = size / 8; CHECK_MULT_EIGHT(size); for (jj = 0; jj < elem_size; jj++) { for (ii = 0; ii < nbyte_row; ii++) { for (kk = 0; kk < 8; kk++) { out_b[ii * 8 * elem_size + jj * 8 + kk] = \ in_b[(jj * 8 + kk) * nbyte_row + ii]; } } } return size * elem_size; } /* Shuffle bits within the bytes of eight element blocks. */ int64_t bshuf_shuffle_bit_eightelem_scal(const void* in, void* out, \ const size_t size, const size_t elem_size) { const char *in_b; char *out_b; uint64_t x, t; size_t ii, jj, kk; size_t nbyte, out_index; uint64_t e=1; const int little_endian = *(uint8_t *) &e == 1; const size_t elem_skip = little_endian ? elem_size : -elem_size; const uint64_t elem_offset = little_endian ? 0 : 7 * elem_size; CHECK_MULT_EIGHT(size); in_b = (const char*) in; out_b = (char*) out; nbyte = elem_size * size; for (jj = 0; jj < 8 * elem_size; jj += 8) { for (ii = 0; ii + 8 * elem_size - 1 < nbyte; ii += 8 * elem_size) { x = *((uint64_t*) &in_b[ii + jj]); if (little_endian) { TRANS_BIT_8X8(x, t); } else { TRANS_BIT_8X8_BE(x, t); } for (kk = 0; kk < 8; kk++) { out_index = ii + jj / 8 + elem_offset + kk * elem_skip; *((uint8_t*) &out_b[out_index]) = x; x = x >> 8; } } } return size * elem_size; } /* Untranspose bits within elements. */ int64_t bshuf_untrans_bit_elem_scal(const void* in, void* out, const size_t size, const size_t elem_size) { int64_t count; void *tmp_buf; CHECK_MULT_EIGHT(size); tmp_buf = malloc(size * elem_size); if (tmp_buf == NULL) return -1; count = bshuf_trans_byte_bitrow_scal(in, tmp_buf, size, elem_size); CHECK_ERR_FREE(count, tmp_buf); count = bshuf_shuffle_bit_eightelem_scal(tmp_buf, out, size, elem_size); free(tmp_buf); return count; } /* ---- Worker code that uses Arm NEON ---- * * The following code makes use of the Arm NEON instruction set. * NEON technology is the implementation of the ARM Advanced Single * Instruction Multiple Data (SIMD) extension. * The NEON unit is the component of the processor that executes SIMD instructions. * It is also called the NEON Media Processing Engine (MPE). * */ #ifdef USEARMNEON /* Transpose bytes within elements for 16 bit elements. 
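 *
 * Illustrative effect (scalar reference for the same permutation, not
 * extra API): the byte stream a0 b0 a1 b1 a2 b2 ... becomes all low bytes
 * followed by all high bytes,
 *
 *      for (i = 0; i < size; i++) {
 *          out[0*size + i] = in[2*i + 0];
 *          out[1*size + i] = in[2*i + 1];
 *      }
 *
 * realized 16 elements at a time with four rounds of vzip1q_s8/vzip2q_s8.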
*/ int64_t bshuf_trans_byte_elem_NEON_16(const void* in, void* out, const size_t size) { size_t ii; const char *in_b = (const char*) in; char *out_b = (char*) out; int8x16_t a0, b0, a1, b1; for (ii=0; ii + 15 < size; ii += 16) { a0 = vld1q_s8(in_b + 2*ii + 0*16); b0 = vld1q_s8(in_b + 2*ii + 1*16); a1 = vzip1q_s8(a0, b0); b1 = vzip2q_s8(a0, b0); a0 = vzip1q_s8(a1, b1); b0 = vzip2q_s8(a1, b1); a1 = vzip1q_s8(a0, b0); b1 = vzip2q_s8(a0, b0); a0 = vzip1q_s8(a1, b1); b0 = vzip2q_s8(a1, b1); vst1q_s8(out_b + 0*size + ii, a0); vst1q_s8(out_b + 1*size + ii, b0); } return bshuf_trans_byte_elem_remainder(in, out, size, 2, size - size % 16); } /* Transpose bytes within elements for 32 bit elements. */ int64_t bshuf_trans_byte_elem_NEON_32(const void* in, void* out, const size_t size) { size_t ii; const char *in_b; char *out_b; in_b = (const char*) in; out_b = (char*) out; int8x16_t a0, b0, c0, d0, a1, b1, c1, d1; int64x2_t a2, b2, c2, d2; for (ii=0; ii + 15 < size; ii += 16) { a0 = vld1q_s8(in_b + 4*ii + 0*16); b0 = vld1q_s8(in_b + 4*ii + 1*16); c0 = vld1q_s8(in_b + 4*ii + 2*16); d0 = vld1q_s8(in_b + 4*ii + 3*16); a1 = vzip1q_s8(a0, b0); b1 = vzip2q_s8(a0, b0); c1 = vzip1q_s8(c0, d0); d1 = vzip2q_s8(c0, d0); a0 = vzip1q_s8(a1, b1); b0 = vzip2q_s8(a1, b1); c0 = vzip1q_s8(c1, d1); d0 = vzip2q_s8(c1, d1); a1 = vzip1q_s8(a0, b0); b1 = vzip2q_s8(a0, b0); c1 = vzip1q_s8(c0, d0); d1 = vzip2q_s8(c0, d0); a2 = vzip1q_s64(vreinterpretq_s64_s8(a1), vreinterpretq_s64_s8(c1)); b2 = vzip2q_s64(vreinterpretq_s64_s8(a1), vreinterpretq_s64_s8(c1)); c2 = vzip1q_s64(vreinterpretq_s64_s8(b1), vreinterpretq_s64_s8(d1)); d2 = vzip2q_s64(vreinterpretq_s64_s8(b1), vreinterpretq_s64_s8(d1)); vst1q_s64((int64_t *) (out_b + 0*size + ii), a2); vst1q_s64((int64_t *) (out_b + 1*size + ii), b2); vst1q_s64((int64_t *) (out_b + 2*size + ii), c2); vst1q_s64((int64_t *) (out_b + 3*size + ii), d2); } return bshuf_trans_byte_elem_remainder(in, out, size, 4, size - size % 16); } /* Transpose bytes within elements for 64 bit elements. 
*/ int64_t bshuf_trans_byte_elem_NEON_64(const void* in, void* out, const size_t size) { size_t ii; const char* in_b = (const char*) in; char* out_b = (char*) out; int8x16_t a0, b0, c0, d0, e0, f0, g0, h0; int8x16_t a1, b1, c1, d1, e1, f1, g1, h1; for (ii=0; ii + 15 < size; ii += 16) { a0 = vld1q_s8(in_b + 8*ii + 0*16); b0 = vld1q_s8(in_b + 8*ii + 1*16); c0 = vld1q_s8(in_b + 8*ii + 2*16); d0 = vld1q_s8(in_b + 8*ii + 3*16); e0 = vld1q_s8(in_b + 8*ii + 4*16); f0 = vld1q_s8(in_b + 8*ii + 5*16); g0 = vld1q_s8(in_b + 8*ii + 6*16); h0 = vld1q_s8(in_b + 8*ii + 7*16); a1 = vzip1q_s8 (a0, b0); b1 = vzip2q_s8 (a0, b0); c1 = vzip1q_s8 (c0, d0); d1 = vzip2q_s8 (c0, d0); e1 = vzip1q_s8 (e0, f0); f1 = vzip2q_s8 (e0, f0); g1 = vzip1q_s8 (g0, h0); h1 = vzip2q_s8 (g0, h0); a0 = vzip1q_s8 (a1, b1); b0 = vzip2q_s8 (a1, b1); c0 = vzip1q_s8 (c1, d1); d0 = vzip2q_s8 (c1, d1); e0 = vzip1q_s8 (e1, f1); f0 = vzip2q_s8 (e1, f1); g0 = vzip1q_s8 (g1, h1); h0 = vzip2q_s8 (g1, h1); a1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (c0)); b1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (c0)); c1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (b0), vreinterpretq_s32_s8 (d0)); d1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (b0), vreinterpretq_s32_s8 (d0)); e1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (g0)); f1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (g0)); g1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (f0), vreinterpretq_s32_s8 (h0)); h1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (f0), vreinterpretq_s32_s8 (h0)); a0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (a1), vreinterpretq_s64_s8 (e1)); b0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (a1), vreinterpretq_s64_s8 (e1)); c0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (b1), vreinterpretq_s64_s8 (f1)); d0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (b1), vreinterpretq_s64_s8 (f1)); e0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (c1), vreinterpretq_s64_s8 (g1)); f0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (c1), vreinterpretq_s64_s8 (g1)); g0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (d1), vreinterpretq_s64_s8 (h1)); h0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (d1), vreinterpretq_s64_s8 (h1)); vst1q_s8(out_b + 0*size + ii, a0); vst1q_s8(out_b + 1*size + ii, b0); vst1q_s8(out_b + 2*size + ii, c0); vst1q_s8(out_b + 3*size + ii, d0); vst1q_s8(out_b + 4*size + ii, e0); vst1q_s8(out_b + 5*size + ii, f0); vst1q_s8(out_b + 6*size + ii, g0); vst1q_s8(out_b + 7*size + ii, h0); } return bshuf_trans_byte_elem_remainder(in, out, size, 8, size - size % 16); } /* Transpose bytes within elements using best NEON algorithm available. */ int64_t bshuf_trans_byte_elem_NEON(const void* in, void* out, const size_t size, const size_t elem_size) { int64_t count; // Trivial cases: power of 2 bytes. switch (elem_size) { case 1: count = bshuf_copy(in, out, size, elem_size); return count; case 2: count = bshuf_trans_byte_elem_NEON_16(in, out, size); return count; case 4: count = bshuf_trans_byte_elem_NEON_32(in, out, size); return count; case 8: count = bshuf_trans_byte_elem_NEON_64(in, out, size); return count; } // Worst case: odd number of bytes. Turns out that this is faster for // (odd * 2) byte elements as well (hence % 4). if (elem_size % 4) { count = bshuf_trans_byte_elem_scal(in, out, size, elem_size); return count; } // Multiple of power of 2: transpose hierarchically. 
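    /* Example of the decomposition (illustrative values): for elem_size ==
     * 12, nchunk_elem == 3 below and each element is viewed as three int32
     * chunks.  TRANS_ELEM_TYPE gathers chunk j of every element, the 4-byte
     * NEON kernel byte-transposes the resulting size * 3 int32 values, and
     * bshuf_trans_elem reassembles the rows so the result matches the
     * scalar byte transpose. */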
{ size_t nchunk_elem; void* tmp_buf = malloc(size * elem_size); if (tmp_buf == NULL) return -1; if ((elem_size % 8) == 0) { nchunk_elem = elem_size / 8; TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t); count = bshuf_trans_byte_elem_NEON_64(out, tmp_buf, size * nchunk_elem); bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size); } else if ((elem_size % 4) == 0) { nchunk_elem = elem_size / 4; TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t); count = bshuf_trans_byte_elem_NEON_32(out, tmp_buf, size * nchunk_elem); bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size); } else { // Not used since scalar algorithm is faster. nchunk_elem = elem_size / 2; TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t); count = bshuf_trans_byte_elem_NEON_16(out, tmp_buf, size * nchunk_elem); bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size); } free(tmp_buf); return count; } } /* Creates a mask made up of the most significant * bit of each byte of 'input' */ int32_t move_byte_mask_neon(uint8x16_t input) { return ( ((input[0] & 0x80) >> 7) | (((input[1] & 0x80) >> 7) << 1) | (((input[2] & 0x80) >> 7) << 2) | (((input[3] & 0x80) >> 7) << 3) | (((input[4] & 0x80) >> 7) << 4) | (((input[5] & 0x80) >> 7) << 5) | (((input[6] & 0x80) >> 7) << 6) | (((input[7] & 0x80) >> 7) << 7) | (((input[8] & 0x80) >> 7) << 8) | (((input[9] & 0x80) >> 7) << 9) | (((input[10] & 0x80) >> 7) << 10) | (((input[11] & 0x80) >> 7) << 11) | (((input[12] & 0x80) >> 7) << 12) | (((input[13] & 0x80) >> 7) << 13) | (((input[14] & 0x80) >> 7) << 14) | (((input[15] & 0x80) >> 7) << 15) ); } /* Transpose bits within bytes. */ int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size, const size_t elem_size) { size_t ii, kk; const char* in_b = (const char*) in; char* out_b = (char*) out; uint16_t* out_ui16; int64_t count; size_t nbyte = elem_size * size; CHECK_MULT_EIGHT(nbyte); int16x8_t xmm; int32_t bt; for (ii = 0; ii + 15 < nbyte; ii += 16) { xmm = vld1q_s16((int16_t *) (in_b + ii)); for (kk = 0; kk < 8; kk++) { bt = move_byte_mask_neon((uint8x16_t) xmm); xmm = vshlq_n_s16(xmm, 1); out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; *out_ui16 = bt; } } count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size, nbyte - nbyte % 16); return count; } /* Transpose bits within elements. */ int64_t bshuf_trans_bit_elem_NEON(const void* in, void* out, const size_t size, const size_t elem_size) { int64_t count; CHECK_MULT_EIGHT(size); void* tmp_buf = malloc(size * elem_size); if (tmp_buf == NULL) return -1; count = bshuf_trans_byte_elem_NEON(in, out, size, elem_size); CHECK_ERR_FREE(count, tmp_buf); count = bshuf_trans_bit_byte_NEON(out, tmp_buf, size, elem_size); CHECK_ERR_FREE(count, tmp_buf); count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size); free(tmp_buf); return count; } /* For data organized into a row for each bit (8 * elem_size rows), transpose * the bytes. 
*/ int64_t bshuf_trans_byte_bitrow_NEON(const void* in, void* out, const size_t size, const size_t elem_size) { size_t ii, jj; const char* in_b = (const char*) in; char* out_b = (char*) out; CHECK_MULT_EIGHT(size); size_t nrows = 8 * elem_size; size_t nbyte_row = size / 8; int8x16_t a0, b0, c0, d0, e0, f0, g0, h0; int8x16_t a1, b1, c1, d1, e1, f1, g1, h1; int64x1_t *as, *bs, *cs, *ds, *es, *fs, *gs, *hs; for (ii = 0; ii + 7 < nrows; ii += 8) { for (jj = 0; jj + 15 < nbyte_row; jj += 16) { a0 = vld1q_s8(in_b + (ii + 0)*nbyte_row + jj); b0 = vld1q_s8(in_b + (ii + 1)*nbyte_row + jj); c0 = vld1q_s8(in_b + (ii + 2)*nbyte_row + jj); d0 = vld1q_s8(in_b + (ii + 3)*nbyte_row + jj); e0 = vld1q_s8(in_b + (ii + 4)*nbyte_row + jj); f0 = vld1q_s8(in_b + (ii + 5)*nbyte_row + jj); g0 = vld1q_s8(in_b + (ii + 6)*nbyte_row + jj); h0 = vld1q_s8(in_b + (ii + 7)*nbyte_row + jj); a1 = vzip1q_s8(a0, b0); b1 = vzip1q_s8(c0, d0); c1 = vzip1q_s8(e0, f0); d1 = vzip1q_s8(g0, h0); e1 = vzip2q_s8(a0, b0); f1 = vzip2q_s8(c0, d0); g1 = vzip2q_s8(e0, f0); h1 = vzip2q_s8(g0, h0); a0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (a1), vreinterpretq_s16_s8 (b1)); b0= (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (c1), vreinterpretq_s16_s8 (d1)); c0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (a1), vreinterpretq_s16_s8 (b1)); d0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (c1), vreinterpretq_s16_s8 (d1)); e0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (e1), vreinterpretq_s16_s8 (f1)); f0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (g1), vreinterpretq_s16_s8 (h1)); g0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (e1), vreinterpretq_s16_s8 (f1)); h0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (g1), vreinterpretq_s16_s8 (h1)); a1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (b0)); b1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (b0)); c1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (c0), vreinterpretq_s32_s8 (d0)); d1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (c0), vreinterpretq_s32_s8 (d0)); e1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (f0)); f1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (f0)); g1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (g0), vreinterpretq_s32_s8 (h0)); h1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (g0), vreinterpretq_s32_s8 (h0)); as = (int64x1_t *) &a1; bs = (int64x1_t *) &b1; cs = (int64x1_t *) &c1; ds = (int64x1_t *) &d1; es = (int64x1_t *) &e1; fs = (int64x1_t *) &f1; gs = (int64x1_t *) &g1; hs = (int64x1_t *) &h1; vst1_s64((int64_t *)(out_b + (jj + 0) * nrows + ii), *as); vst1_s64((int64_t *)(out_b + (jj + 1) * nrows + ii), *(as + 1)); vst1_s64((int64_t *)(out_b + (jj + 2) * nrows + ii), *bs); vst1_s64((int64_t *)(out_b + (jj + 3) * nrows + ii), *(bs + 1)); vst1_s64((int64_t *)(out_b + (jj + 4) * nrows + ii), *cs); vst1_s64((int64_t *)(out_b + (jj + 5) * nrows + ii), *(cs + 1)); vst1_s64((int64_t *)(out_b + (jj + 6) * nrows + ii), *ds); vst1_s64((int64_t *)(out_b + (jj + 7) * nrows + ii), *(ds + 1)); vst1_s64((int64_t *)(out_b + (jj + 8) * nrows + ii), *es); vst1_s64((int64_t *)(out_b + (jj + 9) * nrows + ii), *(es + 1)); vst1_s64((int64_t *)(out_b + (jj + 10) * nrows + ii), *fs); vst1_s64((int64_t *)(out_b + (jj + 11) * nrows + ii), *(fs + 1)); vst1_s64((int64_t *)(out_b + (jj + 12) * nrows + ii), *gs); vst1_s64((int64_t *)(out_b + (jj + 13) * nrows + ii), *(gs + 1)); vst1_s64((int64_t *)(out_b + (jj + 14) * nrows + ii), *hs); vst1_s64((int64_t 
*)(out_b + (jj + 15) * nrows + ii), *(hs + 1)); } for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) { out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj]; out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj]; out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj]; out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj]; out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj]; out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj]; out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj]; out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj]; } } return size * elem_size; } /* Shuffle bits within the bytes of eight element blocks. */ int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t size, const size_t elem_size) { CHECK_MULT_EIGHT(size); // With a bit of care, this could be written such that such that it is // in_buf = out_buf safe. const char* in_b = (const char*) in; uint16_t* out_ui16 = (uint16_t*) out; size_t ii, jj, kk; size_t nbyte = elem_size * size; int16x8_t xmm; int32_t bt; if (elem_size % 2) { bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size); } else { for (ii = 0; ii + 8 * elem_size - 1 < nbyte; ii += 8 * elem_size) { for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) { xmm = vld1q_s16((int16_t *) &in_b[ii + jj]); for (kk = 0; kk < 8; kk++) { bt = move_byte_mask_neon((uint8x16_t) xmm); xmm = vshlq_n_s16(xmm, 1); size_t ind = (ii + jj / 8 + (7 - kk) * elem_size); out_ui16[ind / 2] = bt; } } } } return size * elem_size; } /* Untranspose bits within elements. */ int64_t bshuf_untrans_bit_elem_NEON(const void* in, void* out, const size_t size, const size_t elem_size) { int64_t count; CHECK_MULT_EIGHT(size); void* tmp_buf = malloc(size * elem_size); if (tmp_buf == NULL) return -1; count = bshuf_trans_byte_bitrow_NEON(in, tmp_buf, size, elem_size); CHECK_ERR_FREE(count, tmp_buf); count = bshuf_shuffle_bit_eightelem_NEON(tmp_buf, out, size, elem_size); free(tmp_buf); return count; } #else // #ifdef USEARMNEON int64_t bshuf_untrans_bit_elem_NEON(const void* in, void* out, const size_t size, const size_t elem_size) { return -13; } int64_t bshuf_trans_bit_elem_NEON(const void* in, void* out, const size_t size, const size_t elem_size) { return -13; } int64_t bshuf_trans_byte_bitrow_NEON(const void* in, void* out, const size_t size, const size_t elem_size) { return -13; } int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size, const size_t elem_size) { return -13; } int64_t bshuf_trans_byte_elem_NEON(const void* in, void* out, const size_t size, const size_t elem_size) { return -13; } int64_t bshuf_trans_byte_elem_NEON_64(const void* in, void* out, const size_t size) { return -13; } int64_t bshuf_trans_byte_elem_NEON_32(const void* in, void* out, const size_t size) { return -13; } int64_t bshuf_trans_byte_elem_NEON_16(const void* in, void* out, const size_t size) { return -13; } int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t size, const size_t elem_size) { return -13; } #endif /* ---- Worker code that uses SSE2 ---- * * The following code makes use of the SSE2 instruction set and specialized * 16 byte registers. The SSE2 instructions are present on modern x86 * processors. The first Intel processor microarchitecture supporting SSE2 was * Pentium 4 (2000). * */ #ifdef USESSE2 /* Transpose bytes within elements for 16 bit elements. 
*/ int64_t bshuf_trans_byte_elem_SSE_16(const void* in, void* out, const size_t size) { size_t ii; const char *in_b = (const char*) in; char *out_b = (char*) out; __m128i a0, b0, a1, b1; for (ii=0; ii + 15 < size; ii += 16) { a0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 0*16]); b0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 1*16]); a1 = _mm_unpacklo_epi8(a0, b0); b1 = _mm_unpackhi_epi8(a0, b0); a0 = _mm_unpacklo_epi8(a1, b1); b0 = _mm_unpackhi_epi8(a1, b1); a1 = _mm_unpacklo_epi8(a0, b0); b1 = _mm_unpackhi_epi8(a0, b0); a0 = _mm_unpacklo_epi8(a1, b1); b0 = _mm_unpackhi_epi8(a1, b1); _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0); _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0); } return bshuf_trans_byte_elem_remainder(in, out, size, 2, size - size % 16); } /* Transpose bytes within elements for 32 bit elements. */ int64_t bshuf_trans_byte_elem_SSE_32(const void* in, void* out, const size_t size) { size_t ii; const char *in_b; char *out_b; in_b = (const char*) in; out_b = (char*) out; __m128i a0, b0, c0, d0, a1, b1, c1, d1; for (ii=0; ii + 15 < size; ii += 16) { a0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 0*16]); b0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 1*16]); c0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 2*16]); d0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 3*16]); a1 = _mm_unpacklo_epi8(a0, b0); b1 = _mm_unpackhi_epi8(a0, b0); c1 = _mm_unpacklo_epi8(c0, d0); d1 = _mm_unpackhi_epi8(c0, d0); a0 = _mm_unpacklo_epi8(a1, b1); b0 = _mm_unpackhi_epi8(a1, b1); c0 = _mm_unpacklo_epi8(c1, d1); d0 = _mm_unpackhi_epi8(c1, d1); a1 = _mm_unpacklo_epi8(a0, b0); b1 = _mm_unpackhi_epi8(a0, b0); c1 = _mm_unpacklo_epi8(c0, d0); d1 = _mm_unpackhi_epi8(c0, d0); a0 = _mm_unpacklo_epi64(a1, c1); b0 = _mm_unpackhi_epi64(a1, c1); c0 = _mm_unpacklo_epi64(b1, d1); d0 = _mm_unpackhi_epi64(b1, d1); _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0); _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0); _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0); _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0); } return bshuf_trans_byte_elem_remainder(in, out, size, 4, size - size % 16); } /* Transpose bytes within elements for 64 bit elements. 
*/ int64_t bshuf_trans_byte_elem_SSE_64(const void* in, void* out, const size_t size) { size_t ii; const char* in_b = (const char*) in; char* out_b = (char*) out; __m128i a0, b0, c0, d0, e0, f0, g0, h0; __m128i a1, b1, c1, d1, e1, f1, g1, h1; for (ii=0; ii + 15 < size; ii += 16) { a0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 0*16]); b0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 1*16]); c0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 2*16]); d0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 3*16]); e0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 4*16]); f0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 5*16]); g0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 6*16]); h0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 7*16]); a1 = _mm_unpacklo_epi8(a0, b0); b1 = _mm_unpackhi_epi8(a0, b0); c1 = _mm_unpacklo_epi8(c0, d0); d1 = _mm_unpackhi_epi8(c0, d0); e1 = _mm_unpacklo_epi8(e0, f0); f1 = _mm_unpackhi_epi8(e0, f0); g1 = _mm_unpacklo_epi8(g0, h0); h1 = _mm_unpackhi_epi8(g0, h0); a0 = _mm_unpacklo_epi8(a1, b1); b0 = _mm_unpackhi_epi8(a1, b1); c0 = _mm_unpacklo_epi8(c1, d1); d0 = _mm_unpackhi_epi8(c1, d1); e0 = _mm_unpacklo_epi8(e1, f1); f0 = _mm_unpackhi_epi8(e1, f1); g0 = _mm_unpacklo_epi8(g1, h1); h0 = _mm_unpackhi_epi8(g1, h1); a1 = _mm_unpacklo_epi32(a0, c0); b1 = _mm_unpackhi_epi32(a0, c0); c1 = _mm_unpacklo_epi32(b0, d0); d1 = _mm_unpackhi_epi32(b0, d0); e1 = _mm_unpacklo_epi32(e0, g0); f1 = _mm_unpackhi_epi32(e0, g0); g1 = _mm_unpacklo_epi32(f0, h0); h1 = _mm_unpackhi_epi32(f0, h0); a0 = _mm_unpacklo_epi64(a1, e1); b0 = _mm_unpackhi_epi64(a1, e1); c0 = _mm_unpacklo_epi64(b1, f1); d0 = _mm_unpackhi_epi64(b1, f1); e0 = _mm_unpacklo_epi64(c1, g1); f0 = _mm_unpackhi_epi64(c1, g1); g0 = _mm_unpacklo_epi64(d1, h1); h0 = _mm_unpackhi_epi64(d1, h1); _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0); _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0); _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0); _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0); _mm_storeu_si128((__m128i *) &out_b[4*size + ii], e0); _mm_storeu_si128((__m128i *) &out_b[5*size + ii], f0); _mm_storeu_si128((__m128i *) &out_b[6*size + ii], g0); _mm_storeu_si128((__m128i *) &out_b[7*size + ii], h0); } return bshuf_trans_byte_elem_remainder(in, out, size, 8, size - size % 16); } /* Transpose bytes within elements using best SSE algorithm available. */ int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size, const size_t elem_size) { int64_t count; // Trivial cases: power of 2 bytes. switch (elem_size) { case 1: count = bshuf_copy(in, out, size, elem_size); return count; case 2: count = bshuf_trans_byte_elem_SSE_16(in, out, size); return count; case 4: count = bshuf_trans_byte_elem_SSE_32(in, out, size); return count; case 8: count = bshuf_trans_byte_elem_SSE_64(in, out, size); return count; } // Worst case: odd number of bytes. Turns out that this is faster for // (odd * 2) byte elements as well (hence % 4). if (elem_size % 4) { count = bshuf_trans_byte_elem_scal(in, out, size, elem_size); return count; } // Multiple of power of 2: transpose hierarchically. 
{ size_t nchunk_elem; void* tmp_buf = malloc(size * elem_size); if (tmp_buf == NULL) return -1; if ((elem_size % 8) == 0) { nchunk_elem = elem_size / 8; TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t); count = bshuf_trans_byte_elem_SSE_64(out, tmp_buf, size * nchunk_elem); bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size); } else if ((elem_size % 4) == 0) { nchunk_elem = elem_size / 4; TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t); count = bshuf_trans_byte_elem_SSE_32(out, tmp_buf, size * nchunk_elem); bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size); } else { // Not used since scalar algorithm is faster. nchunk_elem = elem_size / 2; TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t); count = bshuf_trans_byte_elem_SSE_16(out, tmp_buf, size * nchunk_elem); bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size); } free(tmp_buf); return count; } } /* Transpose bits within bytes. */ int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size, const size_t elem_size) { size_t ii, kk; const char* in_b = (const char*) in; char* out_b = (char*) out; uint16_t* out_ui16; int64_t count; size_t nbyte = elem_size * size; CHECK_MULT_EIGHT(nbyte); __m128i xmm; int32_t bt; for (ii = 0; ii + 15 < nbyte; ii += 16) { xmm = _mm_loadu_si128((__m128i *) &in_b[ii]); for (kk = 0; kk < 8; kk++) { bt = _mm_movemask_epi8(xmm); xmm = _mm_slli_epi16(xmm, 1); out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; *out_ui16 = bt; } } count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size, nbyte - nbyte % 16); return count; } /* Transpose bits within elements. */ int64_t bshuf_trans_bit_elem_SSE(const void* in, void* out, const size_t size, const size_t elem_size) { int64_t count; CHECK_MULT_EIGHT(size); void* tmp_buf = malloc(size * elem_size); if (tmp_buf == NULL) return -1; count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size); CHECK_ERR_FREE(count, tmp_buf); count = bshuf_trans_bit_byte_SSE(out, tmp_buf, size, elem_size); CHECK_ERR_FREE(count, tmp_buf); count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size); free(tmp_buf); return count; } /* For data organized into a row for each bit (8 * elem_size rows), transpose * the bytes. 
*/ int64_t bshuf_trans_byte_bitrow_SSE(const void* in, void* out, const size_t size, const size_t elem_size) { size_t ii, jj; const char* in_b = (const char*) in; char* out_b = (char*) out; CHECK_MULT_EIGHT(size); size_t nrows = 8 * elem_size; size_t nbyte_row = size / 8; __m128i a0, b0, c0, d0, e0, f0, g0, h0; __m128i a1, b1, c1, d1, e1, f1, g1, h1; __m128 *as, *bs, *cs, *ds, *es, *fs, *gs, *hs; for (ii = 0; ii + 7 < nrows; ii += 8) { for (jj = 0; jj + 15 < nbyte_row; jj += 16) { a0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 0)*nbyte_row + jj]); b0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 1)*nbyte_row + jj]); c0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 2)*nbyte_row + jj]); d0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 3)*nbyte_row + jj]); e0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 4)*nbyte_row + jj]); f0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 5)*nbyte_row + jj]); g0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 6)*nbyte_row + jj]); h0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 7)*nbyte_row + jj]); a1 = _mm_unpacklo_epi8(a0, b0); b1 = _mm_unpacklo_epi8(c0, d0); c1 = _mm_unpacklo_epi8(e0, f0); d1 = _mm_unpacklo_epi8(g0, h0); e1 = _mm_unpackhi_epi8(a0, b0); f1 = _mm_unpackhi_epi8(c0, d0); g1 = _mm_unpackhi_epi8(e0, f0); h1 = _mm_unpackhi_epi8(g0, h0); a0 = _mm_unpacklo_epi16(a1, b1); b0 = _mm_unpacklo_epi16(c1, d1); c0 = _mm_unpackhi_epi16(a1, b1); d0 = _mm_unpackhi_epi16(c1, d1); e0 = _mm_unpacklo_epi16(e1, f1); f0 = _mm_unpacklo_epi16(g1, h1); g0 = _mm_unpackhi_epi16(e1, f1); h0 = _mm_unpackhi_epi16(g1, h1); a1 = _mm_unpacklo_epi32(a0, b0); b1 = _mm_unpackhi_epi32(a0, b0); c1 = _mm_unpacklo_epi32(c0, d0); d1 = _mm_unpackhi_epi32(c0, d0); e1 = _mm_unpacklo_epi32(e0, f0); f1 = _mm_unpackhi_epi32(e0, f0); g1 = _mm_unpacklo_epi32(g0, h0); h1 = _mm_unpackhi_epi32(g0, h0); // We don't have a storeh instruction for integers, so interpret // as a float. Have a storel (_mm_storel_epi64). 
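            // Each 128-bit register now holds two adjacent 8-byte output
            // rows: the low half is written with _mm_storel_pi and the
            // high half with _mm_storeh_pi below.  The cast to __m128 only
            // reinterprets the bits; no int/float conversion takes place.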
as = (__m128 *) &a1; bs = (__m128 *) &b1; cs = (__m128 *) &c1; ds = (__m128 *) &d1; es = (__m128 *) &e1; fs = (__m128 *) &f1; gs = (__m128 *) &g1; hs = (__m128 *) &h1; _mm_storel_pi((__m64 *) &out_b[(jj + 0) * nrows + ii], *as); _mm_storel_pi((__m64 *) &out_b[(jj + 2) * nrows + ii], *bs); _mm_storel_pi((__m64 *) &out_b[(jj + 4) * nrows + ii], *cs); _mm_storel_pi((__m64 *) &out_b[(jj + 6) * nrows + ii], *ds); _mm_storel_pi((__m64 *) &out_b[(jj + 8) * nrows + ii], *es); _mm_storel_pi((__m64 *) &out_b[(jj + 10) * nrows + ii], *fs); _mm_storel_pi((__m64 *) &out_b[(jj + 12) * nrows + ii], *gs); _mm_storel_pi((__m64 *) &out_b[(jj + 14) * nrows + ii], *hs); _mm_storeh_pi((__m64 *) &out_b[(jj + 1) * nrows + ii], *as); _mm_storeh_pi((__m64 *) &out_b[(jj + 3) * nrows + ii], *bs); _mm_storeh_pi((__m64 *) &out_b[(jj + 5) * nrows + ii], *cs); _mm_storeh_pi((__m64 *) &out_b[(jj + 7) * nrows + ii], *ds); _mm_storeh_pi((__m64 *) &out_b[(jj + 9) * nrows + ii], *es); _mm_storeh_pi((__m64 *) &out_b[(jj + 11) * nrows + ii], *fs); _mm_storeh_pi((__m64 *) &out_b[(jj + 13) * nrows + ii], *gs); _mm_storeh_pi((__m64 *) &out_b[(jj + 15) * nrows + ii], *hs); } for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) { out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj]; out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj]; out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj]; out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj]; out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj]; out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj]; out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj]; out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj]; } } return size * elem_size; } /* Shuffle bits within the bytes of eight element blocks. */ int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t size, const size_t elem_size) { CHECK_MULT_EIGHT(size); // With a bit of care, this could be written such that such that it is // in_buf = out_buf safe. const char* in_b = (const char*) in; uint16_t* out_ui16 = (uint16_t*) out; size_t ii, jj, kk; size_t nbyte = elem_size * size; __m128i xmm; int32_t bt; if (elem_size % 2) { bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size); } else { for (ii = 0; ii + 8 * elem_size - 1 < nbyte; ii += 8 * elem_size) { for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) { xmm = _mm_loadu_si128((__m128i *) &in_b[ii + jj]); for (kk = 0; kk < 8; kk++) { bt = _mm_movemask_epi8(xmm); xmm = _mm_slli_epi16(xmm, 1); size_t ind = (ii + jj / 8 + (7 - kk) * elem_size); out_ui16[ind / 2] = bt; } } } } return size * elem_size; } /* Untranspose bits within elements. 
*/ int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size, const size_t elem_size) { int64_t count; CHECK_MULT_EIGHT(size); void* tmp_buf = malloc(size * elem_size); if (tmp_buf == NULL) return -1; count = bshuf_trans_byte_bitrow_SSE(in, tmp_buf, size, elem_size); CHECK_ERR_FREE(count, tmp_buf); count = bshuf_shuffle_bit_eightelem_SSE(tmp_buf, out, size, elem_size); free(tmp_buf); return count; } #else // #ifdef USESSE2 int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size, const size_t elem_size) { return -11; } int64_t bshuf_trans_bit_elem_SSE(const void* in, void* out, const size_t size, const size_t elem_size) { return -11; } int64_t bshuf_trans_byte_bitrow_SSE(const void* in, void* out, const size_t size, const size_t elem_size) { return -11; } int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size, const size_t elem_size) { return -11; } int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size, const size_t elem_size) { return -11; } int64_t bshuf_trans_byte_elem_SSE_64(const void* in, void* out, const size_t size) { return -11; } int64_t bshuf_trans_byte_elem_SSE_32(const void* in, void* out, const size_t size) { return -11; } int64_t bshuf_trans_byte_elem_SSE_16(const void* in, void* out, const size_t size) { return -11; } int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t size, const size_t elem_size) { return -11; } #endif // #ifdef USESSE2 /* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */ /* ---- Worker code that uses AVX2 ---- * * The following code makes use of the AVX2 instruction set and specialized * 32 byte registers. The AVX2 instructions are present on newer x86 * processors. The first Intel processor microarchitecture supporting AVX2 was * Haswell (2013). * */ #ifdef USEAVX2 /* Transpose bits within bytes. */ int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size, const size_t elem_size) { size_t ii, kk; const char* in_b = (const char*) in; char* out_b = (char*) out; int32_t* out_i32; size_t nbyte = elem_size * size; int64_t count; __m256i ymm; int32_t bt; for (ii = 0; ii + 31 < nbyte; ii += 32) { ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]); for (kk = 0; kk < 8; kk++) { bt = _mm256_movemask_epi8(ymm); ymm = _mm256_slli_epi16(ymm, 1); out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; *out_i32 = bt; } } count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size, nbyte - nbyte % 32); return count; } /* Transpose bits within elements. */ int64_t bshuf_trans_bit_elem_AVX(const void* in, void* out, const size_t size, const size_t elem_size) { int64_t count; CHECK_MULT_EIGHT(size); void* tmp_buf = malloc(size * elem_size); if (tmp_buf == NULL) return -1; count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size); CHECK_ERR_FREE(count, tmp_buf); count = bshuf_trans_bit_byte_AVX(out, tmp_buf, size, elem_size); CHECK_ERR_FREE(count, tmp_buf); count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size); free(tmp_buf); return count; } /* For data organized into a row for each bit (8 * elem_size rows), transpose * the bytes. 
*/ int64_t bshuf_trans_byte_bitrow_AVX(const void* in, void* out, const size_t size, const size_t elem_size) { size_t hh, ii, jj, kk, mm; const char* in_b = (const char*) in; char* out_b = (char*) out; CHECK_MULT_EIGHT(size); size_t nrows = 8 * elem_size; size_t nbyte_row = size / 8; if (elem_size % 4) return bshuf_trans_byte_bitrow_SSE(in, out, size, elem_size); __m256i ymm_0[8]; __m256i ymm_1[8]; __m256i ymm_storeage[8][4]; for (jj = 0; jj + 31 < nbyte_row; jj += 32) { for (ii = 0; ii + 3 < elem_size; ii += 4) { for (hh = 0; hh < 4; hh ++) { for (kk = 0; kk < 8; kk ++){ ymm_0[kk] = _mm256_loadu_si256((__m256i *) &in_b[ (ii * 8 + hh * 8 + kk) * nbyte_row + jj]); } for (kk = 0; kk < 4; kk ++){ ymm_1[kk] = _mm256_unpacklo_epi8(ymm_0[kk * 2], ymm_0[kk * 2 + 1]); ymm_1[kk + 4] = _mm256_unpackhi_epi8(ymm_0[kk * 2], ymm_0[kk * 2 + 1]); } for (kk = 0; kk < 2; kk ++){ for (mm = 0; mm < 2; mm ++){ ymm_0[kk * 4 + mm] = _mm256_unpacklo_epi16( ymm_1[kk * 4 + mm * 2], ymm_1[kk * 4 + mm * 2 + 1]); ymm_0[kk * 4 + mm + 2] = _mm256_unpackhi_epi16( ymm_1[kk * 4 + mm * 2], ymm_1[kk * 4 + mm * 2 + 1]); } } for (kk = 0; kk < 4; kk ++){ ymm_1[kk * 2] = _mm256_unpacklo_epi32(ymm_0[kk * 2], ymm_0[kk * 2 + 1]); ymm_1[kk * 2 + 1] = _mm256_unpackhi_epi32(ymm_0[kk * 2], ymm_0[kk * 2 + 1]); } for (kk = 0; kk < 8; kk ++){ ymm_storeage[kk][hh] = ymm_1[kk]; } } for (mm = 0; mm < 8; mm ++) { for (kk = 0; kk < 4; kk ++){ ymm_0[kk] = ymm_storeage[mm][kk]; } ymm_1[0] = _mm256_unpacklo_epi64(ymm_0[0], ymm_0[1]); ymm_1[1] = _mm256_unpacklo_epi64(ymm_0[2], ymm_0[3]); ymm_1[2] = _mm256_unpackhi_epi64(ymm_0[0], ymm_0[1]); ymm_1[3] = _mm256_unpackhi_epi64(ymm_0[2], ymm_0[3]); ymm_0[0] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 32); ymm_0[1] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 32); ymm_0[2] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 49); ymm_0[3] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 49); _mm256_storeu_si256((__m256i *) &out_b[ (jj + mm * 2 + 0 * 16) * nrows + ii * 8], ymm_0[0]); _mm256_storeu_si256((__m256i *) &out_b[ (jj + mm * 2 + 0 * 16 + 1) * nrows + ii * 8], ymm_0[1]); _mm256_storeu_si256((__m256i *) &out_b[ (jj + mm * 2 + 1 * 16) * nrows + ii * 8], ymm_0[2]); _mm256_storeu_si256((__m256i *) &out_b[ (jj + mm * 2 + 1 * 16 + 1) * nrows + ii * 8], ymm_0[3]); } } } for (ii = 0; ii < nrows; ii ++ ) { for (jj = nbyte_row - nbyte_row % 32; jj < nbyte_row; jj ++) { out_b[jj * nrows + ii] = in_b[ii * nbyte_row + jj]; } } return size * elem_size; } /* Shuffle bits within the bytes of eight element blocks. */ int64_t bshuf_shuffle_bit_eightelem_AVX(const void* in, void* out, const size_t size, const size_t elem_size) { CHECK_MULT_EIGHT(size); // With a bit of care, this could be written such that such that it is // in_buf = out_buf safe. const char* in_b = (const char*) in; char* out_b = (char*) out; size_t ii, jj, kk; size_t nbyte = elem_size * size; __m256i ymm; int32_t bt; if (elem_size % 4) { return bshuf_shuffle_bit_eightelem_SSE(in, out, size, elem_size); } else { for (jj = 0; jj + 31 < 8 * elem_size; jj += 32) { for (ii = 0; ii + 8 * elem_size - 1 < nbyte; ii += 8 * elem_size) { ymm = _mm256_loadu_si256((__m256i *) &in_b[ii + jj]); for (kk = 0; kk < 8; kk++) { bt = _mm256_movemask_epi8(ymm); ymm = _mm256_slli_epi16(ymm, 1); size_t ind = (ii + jj / 8 + (7 - kk) * elem_size); * (int32_t *) &out_b[ind] = bt; } } } } return size * elem_size; } /* Untranspose bits within elements. 
*/ int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size, const size_t elem_size) { int64_t count; CHECK_MULT_EIGHT(size); void* tmp_buf = malloc(size * elem_size); if (tmp_buf == NULL) return -1; count = bshuf_trans_byte_bitrow_AVX(in, tmp_buf, size, elem_size); CHECK_ERR_FREE(count, tmp_buf); count = bshuf_shuffle_bit_eightelem_AVX(tmp_buf, out, size, elem_size); free(tmp_buf); return count; } #else // #ifdef USEAVX2 int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size, const size_t elem_size) { return -12; } int64_t bshuf_trans_bit_elem_AVX(const void* in, void* out, const size_t size, const size_t elem_size) { return -12; } int64_t bshuf_trans_byte_bitrow_AVX(const void* in, void* out, const size_t size, const size_t elem_size) { return -12; } int64_t bshuf_shuffle_bit_eightelem_AVX(const void* in, void* out, const size_t size, const size_t elem_size) { return -12; } int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size, const size_t elem_size) { return -12; } #endif // #ifdef USEAVX2 /* ---- Drivers selecting best instruction set at compile time. ---- */ int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size, const size_t elem_size) { int64_t count; #ifdef USEAVX2 count = bshuf_trans_bit_elem_AVX(in, out, size, elem_size); #elif defined(USESSE2) count = bshuf_trans_bit_elem_SSE(in, out, size, elem_size); #elif defined(USEARMNEON) count = bshuf_trans_bit_elem_NEON(in, out, size, elem_size); #else count = bshuf_trans_bit_elem_scal(in, out, size, elem_size); #endif return count; } int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size, const size_t elem_size) { int64_t count; #ifdef USEAVX2 count = bshuf_untrans_bit_elem_AVX(in, out, size, elem_size); #elif defined(USESSE2) count = bshuf_untrans_bit_elem_SSE(in, out, size, elem_size); #elif defined(USEARMNEON) count = bshuf_untrans_bit_elem_NEON(in, out, size, elem_size); #else count = bshuf_untrans_bit_elem_scal(in, out, size, elem_size); #endif return count; } /* ---- Wrappers for implementing blocking ---- */ /* Wrap a function for processing a single block to process an entire buffer in * parallel. 
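 *
 * Partitioning sketch (numbers illustrative): a buffer of *size* elements
 * is processed as size / block_size full blocks, then one shortened block
 * of size % block_size elements rounded down to a multiple of
 * BSHUF_BLOCKED_MULT, and finally up to BSHUF_BLOCKED_MULT - 1 leftover
 * elements that are copied through unshuffled.  E.g. size = 10005,
 * block_size = 2048: four full blocks, a short block of 1808 elements,
 * and 5 elements memcpy'd verbatim.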
*/ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out, \ const size_t size, const size_t elem_size, size_t block_size) { omp_size_t ii = 0; int64_t err = 0; int64_t count, cum_count=0; size_t last_block_size; size_t leftover_bytes; size_t this_iter; char *last_in; char *last_out; ioc_chain C; ioc_init(&C, in, out); if (block_size == 0) { block_size = bshuf_default_block_size(elem_size); } if (block_size % BSHUF_BLOCKED_MULT) return -81; #if defined(_OPENMP) #pragma omp parallel for schedule(dynamic, 1) \ private(count) reduction(+ : cum_count) #endif for (ii = 0; ii < (omp_size_t)( size / block_size ); ii ++) { count = fun(&C, block_size, elem_size); if (count < 0) err = count; cum_count += count; } last_block_size = size % block_size; last_block_size = last_block_size - last_block_size % BSHUF_BLOCKED_MULT; if (last_block_size) { count = fun(&C, last_block_size, elem_size); if (count < 0) err = count; cum_count += count; } if (err < 0) return err; leftover_bytes = size % BSHUF_BLOCKED_MULT * elem_size; //this_iter; last_in = (char *) ioc_get_in(&C, &this_iter); ioc_set_next_in(&C, &this_iter, (void *) (last_in + leftover_bytes)); last_out = (char *) ioc_get_out(&C, &this_iter); ioc_set_next_out(&C, &this_iter, (void *) (last_out + leftover_bytes)); memcpy(last_out, last_in, leftover_bytes); ioc_destroy(&C); return cum_count + leftover_bytes; } /* Bitshuffle a single block. */ int64_t bshuf_bitshuffle_block(ioc_chain *C_ptr, \ const size_t size, const size_t elem_size) { size_t this_iter; const void *in; void *out; int64_t count; in = ioc_get_in(C_ptr, &this_iter); ioc_set_next_in(C_ptr, &this_iter, (void*) ((char*) in + size * elem_size)); out = ioc_get_out(C_ptr, &this_iter); ioc_set_next_out(C_ptr, &this_iter, (void *) ((char *) out + size * elem_size)); count = bshuf_trans_bit_elem(in, out, size, elem_size); return count; } /* Bitunshuffle a single block. */ int64_t bshuf_bitunshuffle_block(ioc_chain* C_ptr, \ const size_t size, const size_t elem_size) { size_t this_iter; const void *in; void *out; int64_t count; in = ioc_get_in(C_ptr, &this_iter); ioc_set_next_in(C_ptr, &this_iter, (void*) ((char*) in + size * elem_size)); out = ioc_get_out(C_ptr, &this_iter); ioc_set_next_out(C_ptr, &this_iter, (void *) ((char *) out + size * elem_size)); count = bshuf_untrans_bit_elem(in, out, size, elem_size); return count; } /* Write a 64 bit unsigned integer to a buffer in big endian order. */ void bshuf_write_uint64_BE(void* buf, uint64_t num) { int ii; uint8_t* b = (uint8_t*) buf; uint64_t pow28 = 1 << 8; for (ii = 7; ii >= 0; ii--) { b[ii] = num % pow28; num = num / pow28; } } /* Read a 64 bit unsigned integer from a buffer big endian order. */ uint64_t bshuf_read_uint64_BE(void* buf) { int ii; uint8_t* b = (uint8_t*) buf; uint64_t num = 0, pow28 = 1 << 8, cp = 1; for (ii = 7; ii >= 0; ii--) { num += b[ii] * cp; cp *= pow28; } return num; } /* Write a 32 bit unsigned integer to a buffer in big endian order. */ void bshuf_write_uint32_BE(void* buf, uint32_t num) { int ii; uint8_t* b = (uint8_t*) buf; uint32_t pow28 = 1 << 8; for (ii = 3; ii >= 0; ii--) { b[ii] = num % pow28; num = num / pow28; } } /* Read a 32 bit unsigned integer from a buffer big endian order. */ uint32_t bshuf_read_uint32_BE(const void* buf) { int ii; uint8_t* b = (uint8_t*) buf; uint32_t num = 0, pow28 = 1 << 8, cp = 1; for (ii = 3; ii >= 0; ii--) { num += b[ii] * cp; cp *= pow28; } return num; } /* ---- Public functions ---- * * See header file for description and usage. 
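 *
 * Quick usage sketch for the plain (uncompressed) shuffle; names are
 * illustrative and error handling is omitted:
 *
 *      int64_t n;
 *      n = bshuf_bitshuffle(src, shuf, size, elem_size, 0);    // shuffle
 *      n = bshuf_bitunshuffle(shuf, dst, size, elem_size, 0);  // undo
 *      // Each call returns size * elem_size on success, a negative
 *      // error code otherwise; dst ends up byte-identical to src.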
 *
 */


size_t bshuf_default_block_size(const size_t elem_size) {
    // This function needs to be absolutely stable between versions.
    // Otherwise encoded data will not be decodable.

    size_t block_size = BSHUF_TARGET_BLOCK_SIZE_B / elem_size;
    // Ensure it is a required multiple.
    block_size = (block_size / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
    return MAX(block_size, BSHUF_MIN_RECOMMEND_BLOCK);
}


int64_t bshuf_bitshuffle(const void* in, void* out, const size_t size,
        const size_t elem_size, size_t block_size) {

    return bshuf_blocked_wrap_fun(&bshuf_bitshuffle_block, in, out, size,
            elem_size, block_size);
}


int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size,
        const size_t elem_size, size_t block_size) {

    return bshuf_blocked_wrap_fun(&bshuf_bitunshuffle_block, in, out, size,
            elem_size, block_size);
}


#undef TRANS_BIT_8X8
#undef TRANS_ELEM_TYPE
#undef MAX
#undef CHECK_MULT_EIGHT
#undef CHECK_ERR_FREE

#undef USESSE2
#undef USEAVX2
bitshuffle-0.3.5/src/bitshuffle_core.h000066400000000000000000000100071337005776700177720ustar00rootroot00000000000000
/*
 * Bitshuffle - Filter for improving compression of typed binary data.
 *
 * This file is part of Bitshuffle
 * Author: Kiyoshi Masui
 * Website: http://www.github.com/kiyo-masui/bitshuffle
 * Created: 2014
 *
 * See LICENSE file for details about copyright and rights to use.
 *
 *
 * Header File
 *
 * Worker routines return an int64_t which is the number of bytes processed
 * if positive or an error code if negative.
 *
 * Error codes:
 *      -1    : Failed to allocate memory.
 *      -11   : Missing SSE.
 *      -12   : Missing AVX.
 *      -13   : Missing Arm Neon.
 *      -80   : Input size not a multiple of 8.
 *      -81   : block_size not multiple of 8.
 *      -91   : Decompression error, wrong number of bytes processed.
 *      -1YYY : Error internal to compression routine with error code -YYY.
 */


#ifndef BITSHUFFLE_CORE_H
#define BITSHUFFLE_CORE_H

// We assume GNU g++ defining `__cplusplus` has stdint.h
#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus)
#include <stdint.h>
#else
  typedef unsigned char       uint8_t;
  typedef unsigned short      uint16_t;
  typedef unsigned int        uint32_t;
  typedef   signed int        int32_t;
  typedef unsigned long long  uint64_t;
  typedef long long           int64_t;
#endif

#include <stdlib.h>

// These are usually set in the setup.py.
#ifndef BSHUF_VERSION_MAJOR
#define BSHUF_VERSION_MAJOR 0
#define BSHUF_VERSION_MINOR 3
#define BSHUF_VERSION_POINT 5
#endif

#ifdef __cplusplus
extern "C" {
#endif


/* --- bshuf_using_SSE2 ----
 *
 * Whether routines were compiled with the SSE2 instruction set.
 *
 * Returns
 * -------
 *  1 if using SSE2, 0 otherwise.
 *
 */
int bshuf_using_SSE2(void);


/* ---- bshuf_using_AVX2 ----
 *
 * Whether routines were compiled with the AVX2 instruction set.
 *
 * Returns
 * -------
 *  1 if using AVX2, 0 otherwise.
 *
 */
int bshuf_using_AVX2(void);


/* ---- bshuf_default_block_size ----
 *
 * The default block size as a function of element size.
 *
 * This is the block size used by the blocked routines (any routine
 * taking a *block_size* argument) when the block_size is not provided
 * (zero is passed).
 *
 * The results of this routine are guaranteed to be stable such that
 * shuffled/compressed data can always be decompressed.
 *
 * Parameters
 * ----------
 *  elem_size : element size of data to be shuffled/compressed.
 *
 */
size_t bshuf_default_block_size(const size_t elem_size);


/* ---- bshuf_bitshuffle ----
 *
 * Bitshuffle the data.
 *
 * Transpose the bits within elements, in blocks of *block_size*
 * elements.
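 *
 * Informally, for little-endian data the mapping within one block is:
 * output bit position b * block_size + i holds bit b of input element i,
 * so all the least significant bits come first, then the next bit, and so
 * on through all 8 * elem_size bit positions (a description of the layout,
 * not a guaranteed interface).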
* * Parameters * ---------- * in : input buffer, must be of size * elem_size bytes * out : output buffer, must be of size * elem_size bytes * size : number of elements in input * elem_size : element size of typed data * block_size : Do transpose in blocks of this many elements. Pass 0 to * select automatically (recommended). * * Returns * ------- * number of bytes processed, negative error-code if failed. * */ int64_t bshuf_bitshuffle(const void* in, void* out, const size_t size, const size_t elem_size, size_t block_size); /* ---- bshuf_bitunshuffle ---- * * Unshuffle bitshuffled data. * * Untranspose the bits within elements, in blocks of *block_size* * elements. * * To properly unshuffle bitshuffled data, *size*, *elem_size* and *block_size* * must match the parameters used to shuffle the data. * * Parameters * ---------- * in : input buffer, must be of size * elem_size bytes * out : output buffer, must be of size * elem_size bytes * size : number of elements in input * elem_size : element size of typed data * block_size : Do transpose in blocks of this many elements. Pass 0 to * select automatically (recommended). * * Returns * ------- * number of bytes processed, negative error-code if failed. * */ int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size, const size_t elem_size, size_t block_size); #ifdef __cplusplus } // extern "C" #endif #endif // BITSHUFFLE_CORE_H bitshuffle-0.3.5/src/bitshuffle_internals.h000066400000000000000000000042251337005776700210460ustar00rootroot00000000000000/* * Bitshuffle - Filter for improving compression of typed binary data. * * This file is part of Bitshuffle * Author: Kiyoshi Masui * Website: http://www.github.com/kiyo-masui/bitshuffle * Created: 2014 * * See LICENSE file for details about copyright and rights to use. */ #ifndef BITSHUFFLE_INTERNALS_H #define BITSHUFFLE_INTERNALS_H // We assume GNU g++ defining `__cplusplus` has stdint.h #if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus) #include #else typedef unsigned char uint8_t; typedef unsigned short uint16_t; typedef unsigned int uint32_t; typedef signed int int32_t; typedef unsigned long long uint64_t; typedef long long int64_t; #endif #include #include "iochain.h" // Constants. #ifndef BSHUF_MIN_RECOMMEND_BLOCK #define BSHUF_MIN_RECOMMEND_BLOCK 128 #define BSHUF_BLOCKED_MULT 8 // Block sizes must be multiple of this. #define BSHUF_TARGET_BLOCK_SIZE_B 8192 #endif // Macros. #define CHECK_ERR_FREE(count, buf) if (count < 0) { free(buf); return count; } #ifdef __cplusplus extern "C" { #endif /* ---- Utility functions for internal use only ---- */ int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size, const size_t elem_size); /* Read a 32 bit unsigned integer from a buffer big endian order. */ uint32_t bshuf_read_uint32_BE(const void* buf); /* Write a 32 bit unsigned integer to a buffer in big endian order. */ void bshuf_write_uint32_BE(void* buf, uint32_t num); int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size, const size_t elem_size); /* Function definition for worker functions that process a single block. */ typedef int64_t (*bshufBlockFunDef)(ioc_chain* C_ptr, const size_t size, const size_t elem_size); /* Wrap a function for processing a single block to process an entire buffer in * parallel. 
 */
int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
        const size_t size, const size_t elem_size, size_t block_size);

#ifdef __cplusplus
} // extern "C"
#endif

#endif  // BITSHUFFLE_INTERNALS_H
bitshuffle-0.3.5/src/bshuf_h5filter.c000066400000000000000000000150501337005776700175360ustar00rootroot00000000000000
/*
 * Bitshuffle HDF5 filter
 *
 * This file is part of Bitshuffle
 * Author: Kiyoshi Masui
 * Website: http://www.github.com/kiyo-masui/bitshuffle
 * Created: 2014
 *
 * See LICENSE file for details about copyright and rights to use.
 *
 */

#include "bitshuffle.h"
#include "bshuf_h5filter.h"


#define PUSH_ERR(func, minor, str) \
    H5Epush1(__FILE__, func, __LINE__, H5E_PLINE, minor, str)


// Prototypes from bitshuffle.c
void bshuf_write_uint64_BE(void* buf, uint64_t num);
uint64_t bshuf_read_uint64_BE(void* buf);
void bshuf_write_uint32_BE(void* buf, uint32_t num);
uint32_t bshuf_read_uint32_BE(const void* buf);


// Only called on compression, not on reverse.
herr_t bshuf_h5_set_local(hid_t dcpl, hid_t type, hid_t space){

    herr_t r;
    size_t ii;
    unsigned int elem_size;

    unsigned int flags;
    size_t nelements = 8;
    size_t nelem_max = 11;
    unsigned values[] = {0,0,0,0,0,0,0,0,0,0,0};
    unsigned tmp_values[] = {0,0,0,0,0,0,0,0};
    char msg[80];

    r = H5Pget_filter_by_id2(dcpl, BSHUF_H5FILTER, &flags, &nelements,
            tmp_values, 0, NULL, NULL);
    if(r<0) return -1;

    // First 3 slots reserved. Move any passed options to higher addresses.
    for (ii=0; ii < nelements && ii + 3 < nelem_max; ii++) {
        values[ii + 3] = tmp_values[ii];
    }

    nelements = 3 + nelements;

    values[0] = BSHUF_VERSION_MAJOR;
    values[1] = BSHUF_VERSION_MINOR;

    elem_size = H5Tget_size(type);
    if(elem_size <= 0) {
        PUSH_ERR("bshuf_h5_set_local", H5E_CALLBACK,
                 "Invalid element size.");
        return -1;
    }

    values[2] = elem_size;

    // Validate user supplied arguments.
    if (nelements > 3) {
        if (values[3] % 8 || values[3] < 0) {
            sprintf(msg, "Error in bitshuffle. Invalid block size: %d.",
                    values[3]);
            PUSH_ERR("bshuf_h5_set_local", H5E_CALLBACK, msg);
            return -1;
        }
    }
    if (nelements > 4) {
        switch (values[4]) {
            case 0:
                break;
            case BSHUF_H5_COMPRESS_LZ4:
                break;
            default:
                PUSH_ERR("bshuf_h5_set_local", H5E_CALLBACK,
                         "Invalid bitshuffle compression.");
        }
    }

    r = H5Pmodify_filter(dcpl, BSHUF_H5FILTER, flags, nelements, values);
    if(r<0) return -1;

    return 1;
}


size_t bshuf_h5_filter(unsigned int flags, size_t cd_nelmts,
           const unsigned int cd_values[], size_t nbytes,
           size_t *buf_size, void **buf) {

    size_t size, elem_size;
    int err;
    char msg[80];
    size_t block_size = 0;
    size_t buf_size_out, nbytes_uncomp, nbytes_out;
    char* in_buf = *buf;
    void *out_buf;

    if (cd_nelmts < 3) {
        PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK,
                 "Not enough parameters.");
        return 0;
    }
    elem_size = cd_values[2];

    // User specified block size.
    if (cd_nelmts > 3) block_size = cd_values[3];

    if (block_size == 0) block_size = bshuf_default_block_size(elem_size);

    // Compression in addition to bitshuffle.
    if (cd_nelmts > 4 && cd_values[4] == BSHUF_H5_COMPRESS_LZ4) {
        if (flags & H5Z_FLAG_REVERSE) {
            // The first eight bytes are the number of bytes in the output
            // buffer, big endian.
            nbytes_uncomp = bshuf_read_uint64_BE(in_buf);
            // Override the block size with the one read from the header.
            block_size = bshuf_read_uint32_BE((const char*) in_buf + 8)
                    / elem_size;
            // Skip over the header.
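            // The 12-byte header follows the HDF5 LZ4 filter convention:
            // bytes 0-7 hold the uncompressed size (big endian uint64) and
            // bytes 8-11 the block size in bytes (big endian uint32).  For
            // example (illustrative values), a 40000 byte float32 dataset
            // with 2048-element blocks starts with
            //     00 00 00 00 00 00 9c 40 | 00 00 20 00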
            in_buf += 12;
            buf_size_out = nbytes_uncomp;
        } else {
            nbytes_uncomp = nbytes;
            buf_size_out = bshuf_compress_lz4_bound(nbytes_uncomp / elem_size,
                    elem_size, block_size) + 12;
        }
    } else {
        nbytes_uncomp = nbytes;
        buf_size_out = nbytes;
    }

    // TODO: remove this restriction by memcpying the extra.
    if (nbytes_uncomp % elem_size) {
        PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK,
                 "Non-integer number of elements.");
        return 0;
    }
    size = nbytes_uncomp / elem_size;

    out_buf = malloc(buf_size_out);
    if (out_buf == NULL) {
        PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK,
                 "Could not allocate output buffer.");
        return 0;
    }

    if (cd_nelmts > 4 && cd_values[4] == BSHUF_H5_COMPRESS_LZ4) {
        if (flags & H5Z_FLAG_REVERSE) {
            // Bit unshuffle/decompress.
            err = bshuf_decompress_lz4(in_buf, out_buf, size, elem_size,
                                       block_size);
            nbytes_out = nbytes_uncomp;
        } else {
            // Bit shuffle/compress.
            // Write the header, described in
            // http://www.hdfgroup.org/services/filters/HDF5_LZ4.pdf.
            // Technically we should be using signed integers instead of
            // unsigned ones, however for valid inputs (positive numbers)
            // these have the same representation.
            bshuf_write_uint64_BE(out_buf, nbytes_uncomp);
            bshuf_write_uint32_BE((char*) out_buf + 8,
                                  block_size * elem_size);
            err = bshuf_compress_lz4(in_buf, (char*) out_buf + 12, size,
                                     elem_size, block_size);
            nbytes_out = err + 12;
        }
    } else {
        if (flags & H5Z_FLAG_REVERSE) {
            // Bit unshuffle.
            err = bshuf_bitunshuffle(in_buf, out_buf, size, elem_size,
                                     block_size);
        } else {
            // Bit shuffle.
            err = bshuf_bitshuffle(in_buf, out_buf, size, elem_size,
                                   block_size);
        }
        nbytes_out = nbytes;
    }

    //printf("nb_in %d, nb_uncomp %d, nb_out %d, buf_out %d, block %d\n",
    //nbytes, nbytes_uncomp, nbytes_out, buf_size_out, block_size);

    if (err < 0) {
        sprintf(msg, "Error in bitshuffle with error code %d.", (int) err);
        PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK, msg);
        free(out_buf);
        return 0;
    } else {
        free(*buf);
        *buf = out_buf;
        *buf_size = buf_size_out;
        return nbytes_out;
    }
}


H5Z_class_t bshuf_H5Filter[1] = {{
    H5Z_CLASS_T_VERS,
    (H5Z_filter_t)(BSHUF_H5FILTER),
    1, 1,
    "bitshuffle; see https://github.com/kiyo-masui/bitshuffle",
    NULL,
    (H5Z_set_local_func_t)(bshuf_h5_set_local),
    (H5Z_func_t)(bshuf_h5_filter)
}};


int bshuf_register_h5filter(void){

    int retval;

    retval = H5Zregister(bshuf_H5Filter);
    if(retval<0){
        PUSH_ERR("bshuf_register_h5filter", H5E_CANTREGISTER,
                 "Can't register bitshuffle filter");
    }
    return retval;
}

bitshuffle-0.3.5/src/bshuf_h5filter.h

/*
 * Bitshuffle HDF5 filter
 *
 * This file is part of Bitshuffle
 * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
 * Website: http://www.github.com/kiyo-masui/bitshuffle
 * Created: 2014
 *
 * See LICENSE file for details about copyright and rights to use.
 *
 *
 * Header File
 *
 * Filter Options
 * --------------
 * block_size (option slot 0) : integer (optional)
 *      What block size to use (in elements, not bytes). Default is 0,
 *      for which bitshuffle will pick a block size with a target of 8 kB.
 * compression (option slot 1) : 0 or BSHUF_H5_COMPRESS_LZ4
 *      Whether to apply LZ4 compression to the data after bitshuffling.
 *      This is much faster than applying compression as a second filter
 *      because it is done while the small block of data is still in the
 *      L1 cache.
 *
 * For LZ4 compression, the compressed format of the data is the same as
 * for the normal LZ4 filter described in
 * http://www.hdfgroup.org/services/filters/HDF5_LZ4.pdf.
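 *
 * A minimal usage sketch (an illustration, not part of this header):
 * enable the filter on an existing dataset creation property list `dcpl`
 * with an automatic block size and LZ4 compression:
 *
 *     unsigned int opts[2] = {0, BSHUF_H5_COMPRESS_LZ4};
 *     herr_t status = H5Pset_filter(dcpl, BSHUF_H5FILTER,
 *                                   H5Z_FLAG_MANDATORY, 2, opts);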
 *
 */

#ifndef BSHUF_H5FILTER_H
#define BSHUF_H5FILTER_H

#define H5Z_class_t_vers 2
#include "hdf5.h"

#define BSHUF_H5FILTER 32008

#define BSHUF_H5_COMPRESS_LZ4 2

extern H5Z_class_t bshuf_H5Filter[1];

/* ---- bshuf_register_h5filter ----
 *
 * Register the bitshuffle HDF5 filter within the HDF5 library.
 *
 * Call this before using the bitshuffle HDF5 filter from C unless
 * using dynamically loaded filters.
 *
 */
int bshuf_register_h5filter(void);

#endif  // BSHUF_H5FILTER_H

bitshuffle-0.3.5/src/bshuf_h5plugin.c

/*
 * Dynamically loaded filter plugin for HDF5 Bitshuffle filter.
 *
 * This file is part of Bitshuffle
 * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
 * Website: http://www.github.com/kiyo-masui/bitshuffle
 * Created: 2014
 *
 * See LICENSE file for details about copyright and rights to use.
 *
 */

#include "bshuf_h5filter.h"
#include "H5PLextern.h"

H5PL_type_t H5PLget_plugin_type(void) {return H5PL_TYPE_FILTER;}
const void* H5PLget_plugin_info(void) {return bshuf_H5Filter;}

bitshuffle-0.3.5/src/iochain.c

/*
 * IOchain - Distribute a chain of dependent IO events among threads.
 *
 * This file is part of Bitshuffle
 * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
 * Website: http://www.github.com/kiyo-masui/bitshuffle
 * Created: 2014
 *
 * See LICENSE file for details about copyright and rights to use.
 *
 */

#include <stdlib.h>
#include "iochain.h"


void ioc_init(ioc_chain *C, const void *in_ptr_0, void *out_ptr_0) {
#ifdef _OPENMP
    omp_init_lock(&C->next_lock);
    for (size_t ii = 0; ii < IOC_SIZE; ii ++) {
        omp_init_lock(&(C->in_pl[ii].lock));
        omp_init_lock(&(C->out_pl[ii].lock));
    }
#endif
    C->next = 0;
    C->in_pl[0].ptr = in_ptr_0;
    C->out_pl[0].ptr = out_ptr_0;
}


void ioc_destroy(ioc_chain *C) {
#ifdef _OPENMP
    omp_destroy_lock(&C->next_lock);
    for (size_t ii = 0; ii < IOC_SIZE; ii ++) {
        omp_destroy_lock(&(C->in_pl[ii].lock));
        omp_destroy_lock(&(C->out_pl[ii].lock));
    }
#endif
}


const void * ioc_get_in(ioc_chain *C, size_t *this_iter) {
#ifdef _OPENMP
    omp_set_lock(&C->next_lock);
    #pragma omp flush
#endif
    *this_iter = C->next;
    C->next ++;
#ifdef _OPENMP
    omp_set_lock(&(C->in_pl[*this_iter % IOC_SIZE].lock));
    omp_set_lock(&(C->in_pl[(*this_iter + 1) % IOC_SIZE].lock));
    omp_set_lock(&(C->out_pl[(*this_iter + 1) % IOC_SIZE].lock));
    omp_unset_lock(&C->next_lock);
#endif
    return C->in_pl[*this_iter % IOC_SIZE].ptr;
}


void ioc_set_next_in(ioc_chain *C, size_t* this_iter, void* in_ptr) {
    C->in_pl[(*this_iter + 1) % IOC_SIZE].ptr = in_ptr;
#ifdef _OPENMP
    omp_unset_lock(&(C->in_pl[(*this_iter + 1) % IOC_SIZE].lock));
#endif
}


void * ioc_get_out(ioc_chain *C, size_t *this_iter) {
#ifdef _OPENMP
    omp_set_lock(&(C->out_pl[(*this_iter) % IOC_SIZE].lock));
    #pragma omp flush
#endif
    void *out_ptr = C->out_pl[*this_iter % IOC_SIZE].ptr;
#ifdef _OPENMP
    omp_unset_lock(&(C->out_pl[(*this_iter) % IOC_SIZE].lock));
#endif
    return out_ptr;
}


void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr) {
    C->out_pl[(*this_iter + 1) % IOC_SIZE].ptr = out_ptr;
#ifdef _OPENMP
    omp_unset_lock(&(C->out_pl[(*this_iter + 1) % IOC_SIZE].lock));
    // The *in_pl[this_iter]* lock is released only at the end of the
    // iteration, so that a thread running ahead cannot wrap around the
    // chain and corrupt *out_pl[this_iter]*. This matters especially for
    // thread 0, iteration 0.
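    // Holding *in_pl[this_iter]* until here also throttles any thread that
    // has raced nearly IOC_SIZE iterations ahead: its ioc_get_in blocks on
    // this lock until the slot is genuinely free for reuse.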
    omp_unset_lock(&(C->in_pl[(*this_iter) % IOC_SIZE].lock));
#endif
}

bitshuffle-0.3.5/src/iochain.h

/*
 * IOchain - Distribute a chain of dependent IO events among threads.
 *
 * This file is part of Bitshuffle
 * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
 * Website: http://www.github.com/kiyo-masui/bitshuffle
 * Created: 2014
 *
 * See LICENSE file for details about copyright and rights to use.
 *
 *
 * Header File
 *
 * Similar in concept to a queue. Each task includes reading an input
 * and writing output, but the location of the input/output (the pointers)
 * depends on the previous item in the chain.
 *
 * This is designed for parallelizing blocked compression/decompression IO,
 * where the destination of a compressed block depends on the compressed size
 * of all previous blocks.
 *
 * Implemented with OpenMP locks.
 *
 *
 * Usage
 * -----
 * - Call `ioc_init` in a serial block.
 * - Each thread should create a local variable *size_t this_iter* and
 *   pass its address to all function calls. Its value will be set
 *   inside the functions and is used to identify the thread.
 * - Each thread must call each of the `ioc_get*` and `ioc_set*` methods
 *   exactly once per iteration, starting with `ioc_get_in` and ending
 *   with `ioc_set_next_out`.
 * - The order (`ioc_get_in`, `ioc_set_next_in`, *work*, `ioc_get_out`,
 *   `ioc_set_next_out`, *work*) is most efficient.
 * - `ioc_get_in` is blocked until the previous entry's
 *   `ioc_set_next_in` is called.
 * - `ioc_get_out` is blocked until the previous entry's
 *   `ioc_set_next_out` is called.
 * - There are no blocks on the very first iteration.
 * - Call `ioc_destroy` in a serial block.
 * - Safe for num_threads >= IOC_SIZE (but less efficient).
 *
 */

#ifndef IOCHAIN_H
#define IOCHAIN_H

#include <stdlib.h>

#ifdef _OPENMP
#include <omp.h>
#endif

#define IOC_SIZE 33

typedef struct ioc_ptr_and_lock {
#ifdef _OPENMP
    omp_lock_t lock;
#endif
    void *ptr;
} ptr_and_lock;

typedef struct ioc_const_ptr_and_lock {
#ifdef _OPENMP
    omp_lock_t lock;
#endif
    const void *ptr;
} const_ptr_and_lock;

typedef struct ioc_chain {
#ifdef _OPENMP
    omp_lock_t next_lock;
#endif
    size_t next;
    const_ptr_and_lock in_pl[IOC_SIZE];
    ptr_and_lock out_pl[IOC_SIZE];
} ioc_chain;

void ioc_init(ioc_chain *C, const void *in_ptr_0, void *out_ptr_0);
void ioc_destroy(ioc_chain *C);
const void * ioc_get_in(ioc_chain *C, size_t *this_iter);
void ioc_set_next_in(ioc_chain *C, size_t* this_iter, void* in_ptr);
void * ioc_get_out(ioc_chain *C, size_t *this_iter);
void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr);

#endif  // IOCHAIN_H

bitshuffle-0.3.5/src/lzf_h5plugin.c

/*
 * Dynamically loaded filter plugin for HDF5 LZF filter.
 *
 * This file is part of Bitshuffle
 * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
 * Website: http://www.github.com/kiyo-masui/bitshuffle
 * Created: 2014
 *
 * See LICENSE file for details about copyright and rights to use.
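 *
 * Note: when built as a shared library and placed on the HDF5 plugin
 * search path (see the HDF5_PLUGIN_PATH environment variable, supported
 * since HDF5 1.8.11), this plugin lets any HDF5 application read
 * LZF-compressed datasets without going through h5py.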
 *
 */

#define H5Z_class_t_vers 2

#include "lzf_filter.h"
#include "H5PLextern.h"

#include <stdlib.h>

size_t lzf_filter(unsigned flags, size_t cd_nelmts,
                  const unsigned cd_values[], size_t nbytes,
                  size_t *buf_size, void **buf);

herr_t lzf_set_local(hid_t dcpl, hid_t type, hid_t space);

H5Z_class_t lzf_H5Filter[1] = {{
    H5Z_CLASS_T_VERS,
    (H5Z_filter_t)(H5PY_FILTER_LZF),
    1, 1,
    "lzf",
    NULL,
    (H5Z_set_local_func_t)(lzf_set_local),
    (H5Z_func_t)(lzf_filter)
}};

H5PL_type_t H5PLget_plugin_type(void) {return H5PL_TYPE_FILTER;}
const void* H5PLget_plugin_info(void) {return lzf_H5Filter;}
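/* Usage sketch (illustrative; the plugin directory and file name below are
 * hypothetical): with the compiled plugin on the search path, command line
 * tools can read LZF-compressed files directly, e.g.
 *
 *     export HDF5_PLUGIN_PATH=$HOME/hdf5/lib/plugin
 *     h5dump -d /data some_lzf_compressed_file.h5
 */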