pax_global_header00006660000000000000000000000064143036117720014516gustar00rootroot0000000000000052 comment=1072e563919de0f37b44c89c91fdc21879e7b937 hickle-5.0.2/000077500000000000000000000000001430361177200127615ustar00rootroot00000000000000hickle-5.0.2/.github/000077500000000000000000000000001430361177200143215ustar00rootroot00000000000000hickle-5.0.2/.github/workflows/000077500000000000000000000000001430361177200163565ustar00rootroot00000000000000hickle-5.0.2/.github/workflows/test.yml000066400000000000000000000044071430361177200200650ustar00rootroot00000000000000name: Python package on: push: pull_request: types: [ opened, synchronize, reopened, edited ] jobs: build: if: github.repository_owner == 'telegraphic' name: ${{ matrix.os}} ${{ matrix.architecture }}, Python ${{ matrix.python_version }} runs-on: ${{ matrix.os }} strategy: matrix: architecture: [x86, x64] os: [ubuntu-latest, macos-latest, windows-latest] python-version: [3.6, 3.7, 3.8] #, 3.9] still disabled as astropy has problems exclude: - os: ubuntu-latest architecture: x86 - os: macos-latest architecture: x86 # disabled for now as python 3.9 not yet in matrix #- python-version: [3.9] # exclude python >= 3.9 cause no h5py win32 wheels # - os: windows-latest # architecture: x86 fail-fast: false steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} ${{ matrix.architecture }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} architecture: ${{ matrix.architecture }} - name: Install dependencies run: | python -m pip install --upgrade pip setuptools wheel virtualenv pip install -r requirements_test.txt pip install tox tox-gh-actions - name: Test deployment run: | check-manifest python setup.py sdist bdist_wheel twine check dist/* - name: Test package run: | tox env: TOX_H5PY_REQIREMENTS: ${{ matrix.architecture == 'x86' && '32' || ''}} PLATFORM: ${{ matrix.platform }} - name: Upload coverage if: ${{ success() && github.repository == 'telegraphic/hickle' }} uses: codecov/codecov-action@v2 with: token: ${{ secrets.CODECOV_TOKEN }} files: coverage.xml fail_ci_if_error: true verbose: true - name: Deploy package if: ${{ success() && matrix.os == 'ubuntu-latest' && github.event_name == 'push' && github.ref == 'refs/heads/master' }} env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | twine upload --skip-existing dist/* hickle-5.0.2/.gitignore000066400000000000000000000000521430361177200147460ustar00rootroot00000000000000*.pyc .pypirc build/ dist/ .DS_Store .ideahickle-5.0.2/.nojekyll000066400000000000000000000000001430361177200145770ustar00rootroot00000000000000hickle-5.0.2/.pylintrc000066400000000000000000000405431430361177200146340ustar00rootroot00000000000000[MASTER] # A comma-separated list of package or module names from where C extensions may # be loaded. Extensions are loading into the active Python interpreter and may # run arbitrary code extension-pkg-whitelist= # Add files or directories to the blacklist. They should be base names, not # paths. ignore=CVS # Add files or directories matching the regex patterns to the blacklist. The # regex matches against base names, not paths. ignore-patterns= # Python code to execute, usually for sys.path manipulation such as # pygtk.require(). #init-hook= # Use multiple processes to speed up Pylint. jobs=1 # List of plugins (as comma separated values of python modules names) to load, # usually to register additional checkers. 
load-plugins= # Pickle collected data for later comparisons. persistent=yes # Specify a configuration file. #rcfile= # When enabled, pylint would attempt to guess common misconfiguration and emit # user-friendly hints instead of false-positive error messages suggestion-mode=yes # Allow loading of arbitrary C extensions. Extensions are imported into the # active Python interpreter and may run arbitrary code. unsafe-load-any-extension=no [MESSAGES CONTROL] # Only show warnings with the listed confidence levels. Leave empty to show # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED confidence= # Disable the message, report, category or checker with the given id(s). You # can either give multiple identifiers separated by comma (,) or put this # option multiple times (only on the command line, not in the configuration # file where it should appear only once).You can also use "--disable=all" to # disable everything first and then reenable specific checks. For example, if # you want to run only the similarities checker, you can use "--disable=all # --enable=similarities". If you want to run only the classes checker, but have # no Warning level messages displayed, use"--disable=all --enable=classes # --disable=W" disable=print-statement, parameter-unpacking, unpacking-in-except, old-raise-syntax, backtick, long-suffix, old-ne-operator, old-octal-literal, import-star-module-level, non-ascii-bytes-literal, invalid-unicode-literal, raw-checker-failed, bad-inline-option, locally-disabled, locally-enabled, file-ignored, suppressed-message, useless-suppression, deprecated-pragma, apply-builtin, basestring-builtin, buffer-builtin, cmp-builtin, coerce-builtin, execfile-builtin, file-builtin, long-builtin, raw_input-builtin, reduce-builtin, standarderror-builtin, unicode-builtin, xrange-builtin, coerce-method, delslice-method, getslice-method, setslice-method, no-absolute-import, old-division, dict-iter-method, dict-view-method, next-method-called, metaclass-assignment, indexing-exception, raising-string, reload-builtin, oct-method, hex-method, nonzero-method, cmp-method, input-builtin, round-builtin, intern-builtin, unichr-builtin, map-builtin-not-iterating, zip-builtin-not-iterating, range-builtin-not-iterating, filter-builtin-not-iterating, using-cmp-argument, eq-without-hash, div-method, idiv-method, rdiv-method, exception-message-attribute, invalid-str-codec, sys-max-int, bad-python3-import, deprecated-string-function, deprecated-str-translate-call, deprecated-itertools-function, deprecated-types-field, next-method-defined, dict-items-not-iterating, dict-keys-not-iterating, dict-values-not-iterating, deprecated-operator-function, deprecated-urllib-function, xreadlines-attribute, deprecated-sys-function, exception-escape, comprehension-escape, superfluous-parens, bad-whitespace, trailing-whitespace, invalid-name # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option # multiple time (only on the command line, not in the configuration file where # it should appear only once). See also the "--disable" option for examples. enable=c-extension-no-member [REPORTS] # Python expression which should return a note less than 10 (10 is the highest # note). You have access to the variables errors warning, statement which # respectively contain the number of errors / warnings messages and the total # number of statements analyzed. This is used by the global evaluation report # (RP0004). 
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) # Template used to display messages. This is a python new-style format string # used to format the message information. See doc for all details #msg-template= # Set the output format. Available formats are text, parseable, colorized, json # and msvs (visual studio).You can also give a reporter class, eg # mypackage.mymodule.MyReporterClass. output-format=text # Tells whether to display a full report or only the messages reports=no # Activate the evaluation score. score=yes [REFACTORING] # Maximum number of nested blocks for function / method body max-nested-blocks=5 # Complete name of functions that never returns. When checking for # inconsistent-return-statements if a never returning function is called then # it will be considered as an explicit return statement and no message will be # printed. never-returning-functions=optparse.Values,sys.exit [LOGGING] # Logging modules to check that the string format arguments are in logging # function parameter format logging-modules=logging [SPELLING] # Limits count of emitted suggestions for spelling mistakes max-spelling-suggestions=4 # Spelling dictionary name. Available dictionaries: none. To make it working # install python-enchant package. spelling-dict= # List of comma separated words that should not be checked. spelling-ignore-words= # A path to a file that contains private dictionary; one word per line. spelling-private-dict-file= # Tells whether to store unknown words to indicated private dictionary in # --spelling-private-dict-file option instead of raising a message. spelling-store-unknown-words=no [MISCELLANEOUS] # List of note tags to take in consideration, separated by a comma. notes=FIXME, XXX, TODO [SIMILARITIES] # Ignore comments when computing similarities. ignore-comments=yes # Ignore docstrings when computing similarities. ignore-docstrings=yes # Ignore imports when computing similarities. ignore-imports=no # Minimum lines number of a similarity. min-similarity-lines=4 [TYPECHECK] # List of decorators that produce context managers, such as # contextlib.contextmanager. Add to this list to register other decorators that # produce valid context managers. contextmanager-decorators=contextlib.contextmanager # List of members which are set dynamically and missed by pylint inference # system, and so shouldn't trigger E1101 when accessed. Python regular # expressions are accepted. generated-members= # Tells whether missing members accessed in mixin class should be ignored. A # mixin class is detected if its name ends with "mixin" (case insensitive). ignore-mixin-members=yes # This flag controls whether pylint should warn about no-member and similar # checks whenever an opaque object is returned when inferring. The inference # can return multiple potential results while evaluating a Python object, but # some branches might not be evaluated, which results in partial inference. In # that case, it might be useful to still emit no-member and other checks for # the rest of the inferred objects. ignore-on-opaque-inference=yes # List of class names for which member attributes should not be checked (useful # for classes with dynamically set attributes). This supports the use of # qualified names. 
ignored-classes=optparse.Values,thread._local,_thread._local # List of module names for which member attributes should not be checked # (useful for modules/projects where namespaces are manipulated during runtime # and thus existing member attributes cannot be deduced by static analysis. It # supports qualified module names, as well as Unix pattern matching. ignored-modules= # Show a hint with possible names when a member name was not found. The aspect # of finding the hint is based on edit distance. missing-member-hint=yes # The minimum edit distance a name should have in order to be considered a # similar match for a missing member name. missing-member-hint-distance=1 # The total number of similar names that should be taken in consideration when # showing a hint for a missing member. missing-member-max-choices=1 [VARIABLES] # List of additional names supposed to be defined in builtins. Remember that # you should avoid to define new builtins when possible. additional-builtins= # Tells whether unused global variables should be treated as a violation. allow-global-unused-variables=yes # List of strings which can identify a callback function by name. A callback # name must start or end with one of those strings. callbacks=cb_, _cb # A regular expression matching the name of dummy variables (i.e. expectedly # not used). dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ # Argument names that match this expression will be ignored. Default to name # with leading underscore ignored-argument-names=_.*|^ignored_|^unused_ # Tells whether we should check for unused import in __init__ files. init-import=no # List of qualified module names which can have objects that can redefine # builtins. redefining-builtins-modules=six.moves,past.builtins,future.builtins,io,builtins [FORMAT] # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. expected-line-ending-format= # Regexp for a line that is allowed to be longer than the limit. ignore-long-lines=^\s*(# )??$ # Number of spaces of indent required inside a hanging or continued line. indent-after-paren=4 # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 # tab). indent-string=' ' # Maximum number of characters on a single line. max-line-length=100 # Maximum number of lines in a module max-module-lines=1000 # List of optional constructs for which whitespace checking is disabled. `dict- # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. # `trailing-comma` allows a space between comma and closing bracket: (a, ). # `empty-line` allows space-only lines. no-space-check=trailing-comma, dict-separator # Allow the body of a class to be on the same line as the declaration if body # contains single statement. single-line-class-stmt=no # Allow the body of an if to be on the same line as the test if there is no # else. single-line-if-stmt=no [BASIC] # Naming style matching correct argument names argument-naming-style=snake_case # Regular expression matching correct argument names. Overrides argument- # naming-style #argument-rgx= # Naming style matching correct attribute names attr-naming-style=snake_case # Regular expression matching correct attribute names. Overrides attr-naming- # style #attr-rgx= # Bad variable names which should always be refused, separated by a comma bad-names=foo, bar, baz, toto, tutu, tata # Naming style matching correct class attribute names class-attribute-naming-style=any # Regular expression matching correct class attribute names. 
Overrides class- # attribute-naming-style #class-attribute-rgx= # Naming style matching correct class names class-naming-style=PascalCase # Regular expression matching correct class names. Overrides class-naming-style #class-rgx= # Naming style matching correct constant names const-naming-style=UPPER_CASE # Regular expression matching correct constant names. Overrides const-naming- # style #const-rgx= # Minimum line length for functions/classes that require docstrings, shorter # ones are exempt. docstring-min-length=-1 # Naming style matching correct function names function-naming-style=snake_case # Regular expression matching correct function names. Overrides function- # naming-style #function-rgx= # Good variable names which should always be accepted, separated by a comma good-names=i, j, k, ex, Run, _ # Include a hint for the correct naming format with invalid-name include-naming-hint=no # Naming style matching correct inline iteration names inlinevar-naming-style=any # Regular expression matching correct inline iteration names. Overrides # inlinevar-naming-style #inlinevar-rgx= # Naming style matching correct method names method-naming-style=snake_case # Regular expression matching correct method names. Overrides method-naming- # style #method-rgx= # Naming style matching correct module names module-naming-style=snake_case # Regular expression matching correct module names. Overrides module-naming- # style #module-rgx= # Colon-delimited sets of names that determine each other's naming style when # the name regexes allow several styles. name-group= # Regular expression which should only match function or class names that do # not require a docstring. no-docstring-rgx=^_ # List of decorators that produce properties, such as abc.abstractproperty. Add # to this list to register other decorators that produce valid properties. property-classes=abc.abstractproperty # Naming style matching correct variable names variable-naming-style=snake_case # Regular expression matching correct variable names. Overrides variable- # naming-style #variable-rgx= [DESIGN] # Maximum number of arguments for function / method max-args=5 # Maximum number of attributes for a class (see R0902). max-attributes=7 # Maximum number of boolean expressions in a if statement max-bool-expr=5 # Maximum number of branch for function / method body max-branches=12 # Maximum number of locals for function / method body max-locals=15 # Maximum number of parents for a class (see R0901). max-parents=7 # Maximum number of public methods for a class (see R0904). max-public-methods=20 # Maximum number of return / yield for function / method body max-returns=6 # Maximum number of statements in function / method body max-statements=50 # Minimum number of public methods for a class (see R0903). min-public-methods=2 [CLASSES] # List of method names used to declare (i.e. assign) instance attributes. defining-attr-methods=__init__, __new__, setUp # List of member names, which should be excluded from the protected access # warning. exclude-protected=_asdict, _fields, _replace, _source, _make # List of valid names for the first argument in a class method. valid-classmethod-first-arg=cls # List of valid names for the first argument in a metaclass class method. valid-metaclass-classmethod-first-arg=mcs [IMPORTS] # Allow wildcard imports from modules that define __all__. allow-wildcard-with-all=no # Analyse import fallback blocks. 
This can be used to support both Python 2 and # 3 compatible code, which means that the block might have code that exists # only in one or another interpreter, leading to false positives when analysed. analyse-fallback-blocks=no # Deprecated modules which should not be used, separated by a comma deprecated-modules=regsub, TERMIOS, Bastion, rexec # Create a graph of external dependencies in the given file (report RP0402 must # not be disabled) ext-import-graph= # Create a graph of every (i.e. internal and external) dependencies in the # given file (report RP0402 must not be disabled) import-graph= # Create a graph of internal dependencies in the given file (report RP0402 must # not be disabled) int-import-graph= # Force import order to recognize a module as part of the standard # compatibility libraries. known-standard-library= # Force import order to recognize a module as part of a third party library. known-third-party=enchant [EXCEPTIONS] # Exceptions that will emit a warning when being caught. Defaults to # "Exception" overgeneral-exceptions=Exception hickle-5.0.2/CODE_OF_CONDUCT.md000066400000000000000000000062321430361177200155630ustar00rootroot00000000000000# Contributor Covenant Code of Conduct ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. ## Our Standards Examples of behavior that contributes to creating a positive environment include: * Using welcoming and inclusive language * Being respectful of differing viewpoints and experiences * Gracefully accepting constructive criticism * Focusing on what is best for the community * Showing empathy towards other community members Examples of unacceptable behavior by participants include: * The use of sexualized language or imagery and unwelcome sexual attention or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or electronic address, without explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 
## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at dan+github@thetelegraphic.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] [homepage]: http://contributor-covenant.org [version]: http://contributor-covenant.org/version/1/4/ hickle-5.0.2/CONTRIBUTING.md000066400000000000000000000040031430361177200152070ustar00rootroot00000000000000## Contributing to hickle Thanks for thinking about contributing to hickle, improvements and bugfixes are most welcome. The following is a brief set of guidelines (not rules) for contributing: * **Be nice.** Please follow the [code of conduct](https://github.com/telegraphic/hickle/blob/master/CODE_OF_CONDUCT.md). * **Squashing bugs.** If you find a bug, please [open an issue](https://github.com/telegraphic/hickle/issues), with some simple steps on how to reproduce it. Try not to make duplicate requests. * **Feature requests.** Feel free to make feature requests, also by [opening an issue](https://github.com/telegraphic/hickle/issues). Be clear about what it is and why it would be awesome. * **Pull requests.** If you add a cool feature you think would be useful broadly, please issue a pull request with some notes on what it does. * **Git comments.** Try and make these clear, even if concise. * **Major changes.** As quite a few people use this package, we have tried to maintain backwards compatibility as much as possible. As such, please open a discussion before you start your quest, to make sure the changes can be merged without upset. * **Unit tests.** If you add new functionality, please write a unit test (if you're familiar with how to). This should be places in the `./tests` directory, and will run with py.test. * **Travis-CI.** When you issue a pull request, Travis-CI will automatically run the unit tests. You can test yourself by running `cd tests; coverage run --source=hickle -m py.test`. * **Style.** Try and keep your code Py2.7 and Py3 compatible, and roughly follow [PEP8](https://www.python.org/dev/peps/pep-0008/) with [google style docstrings](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html). * **Beginners welcome.** If you're not yet comfortable with some of the fancier things mentioned above, do your best! Just package up your idea/thought/code and [open an issue](https://github.com/telegraphic/hickle/issues) with some clear details. That's about it. Happy contributing! 
hickle-5.0.2/LICENSE000066400000000000000000000021251430361177200137660ustar00rootroot00000000000000Copyright (c) 2014 Danny Price and contributors http://github.com/telegraphic/hickle Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. hickle-5.0.2/MANIFEST.in000066400000000000000000000013451430361177200145220ustar00rootroot00000000000000include LICENSE include *.md include MANIFEST.in include requirements*.txt recursive-include hickle tests * include conftest.py include tox.ini exclude docs recursive-exclude docs * exclude *.yml exclude .* exclude .nojekyll exclude .pylintrc exclude paper* recursive-exclude * __pycache__ recursive-exclude * *.py[co] *.bk *.swp recursive-exclude * old .old recursive-exclude * *.tox recursive-exclude * *.h5 recursive-exclude * *.bak recursive-exclude ci * recursive-exclude * coverage.xml exclude *_mv.txt exclude *_tox.txt recursive-exclude * .bashrcrun exclude hickle/tests/classes recursive-exclude hickle/tests/classes * exclude hickle/tests/dev_check recursive-exclude hickle/tests/dev_check * exclude hickle/tests/test_stateful.py hickle-5.0.2/README.md000066400000000000000000000500341430361177200142420ustar00rootroot00000000000000[![PyPI - Latest Release](https://img.shields.io/pypi/v/hickle.svg?logo=pypi&logoColor=white&label=PyPI)](https://pypi.python.org/pypi/hickle) [![PyPI - Python Versions](https://img.shields.io/pypi/pyversions/hickle.svg?logo=python&logoColor=white&label=Python)](https://pypi.python.org/pypi/hickle) [![CodeCov - Coverage Status](https://img.shields.io/codecov/c/github/telegraphic/hickle/master.svg?logo=codecov&logoColor=white&label=Coverage)](https://codecov.io/gh/telegraphic/hickle/branches/master) [![JOSS Status](http://joss.theoj.org/papers/0c6638f84a1a574913ed7c6dd1051847/status.svg)](http://joss.theoj.org/papers/0c6638f84a1a574913ed7c6dd1051847) Hickle ====== Hickle is an [HDF5](https://www.hdfgroup.org/solutions/hdf5/) based clone of `pickle`, with a twist: instead of serializing to a pickle file, Hickle dumps to an HDF5 file (Hierarchical Data Format). It is designed to be a "drop-in" replacement for pickle (for common data objects), but is really an amalgam of `h5py` and `pickle` with extended functionality. That is: `hickle` is a neat little way of dumping python variables to HDF5 files that can be read in most programming languages, not just Python. Hickle is fast, and allows for transparent compression of your data (LZF / GZIP). Why use Hickle? 
---------------

While `hickle` is designed to be a drop-in replacement for `pickle` (or something like `json`), it works very differently. Instead of serializing / json-izing, it instead stores the data using the excellent [h5py](https://www.h5py.org/) module.

The main reasons to use hickle are:

  1. It's faster than pickle and cPickle.
  2. It stores data in HDF5.
  3. You can easily compress your data.

The main reasons not to use hickle are:

  1. You don't want to store your data in HDF5. While hickle can serialize arbitrary python objects, this functionality is provided only for convenience, and you're probably better off just using the pickle module.
  2. You want your data in human-readable JSON/YAML, in which case you should do that instead.

So, if you want your data in HDF5, or if your pickling is taking too long, give hickle a try. Hickle is particularly good at storing large numpy arrays, thanks to `h5py` running under the hood.

Documentation
-------------

Documentation for hickle can be found at [telegraphic.github.io/hickle/](http://telegraphic.github.io/hickle/).

Usage example
-------------

Hickle is nice and easy to use, and should look very familiar to those of you who have pickled before.

In short, `hickle` provides two methods: a [hickle.load](http://telegraphic.github.io/hickle/toc.html#hickle.load) method, for loading hickle files, and a [hickle.dump](http://telegraphic.github.io/hickle/toc.html#hickle.dump) method, for dumping data into HDF5. Here's a complete example:

```python
import os
import hickle as hkl
import numpy as np

# Create a numpy array of data
array_obj = np.ones(32768, dtype='float32')

# Dump to file
hkl.dump(array_obj, 'test.hkl', mode='w')

# Dump data, with compression
hkl.dump(array_obj, 'test_gzip.hkl', mode='w', compression='gzip')

# Compare filesizes
print('uncompressed: %i bytes' % os.path.getsize('test.hkl'))
print('compressed:   %i bytes' % os.path.getsize('test_gzip.hkl'))

# Load data
array_hkl = hkl.load('test_gzip.hkl')

# Check the loaded data matches the original
assert array_hkl.dtype == array_obj.dtype
assert np.allclose(array_hkl, array_obj)
```

### HDF5 compression options

A major benefit of `hickle` over `pickle` is that it allows fancy HDF5 features to be applied, by passing keyword arguments on to `h5py`. So, you can do things like:

```python
hkl.dump(array_obj, 'test_lzf.hkl', mode='w', compression='lzf', scaleoffset=0,
         chunks=(100, 100), shuffle=True, fletcher32=True)
```

A detailed explanation of these keywords is given at http://docs.h5py.org/en/latest/high/dataset.html, but we give a quick rundown below.

In HDF5, datasets are stored as B-trees, a tree data structure that has speed benefits over contiguous blocks of data. In the B-tree, data are split into [chunks](http://docs.h5py.org/en/latest/high/dataset.html#chunked-storage), which is leveraged to allow [dataset resizing](http://docs.h5py.org/en/latest/high/dataset.html#resizable-datasets) and compression via [filter pipelines](http://docs.h5py.org/en/latest/high/dataset.html#filter-pipeline). Filters such as `shuffle` and `scaleoffset` move your data around to improve compression ratios, and `fletcher32` computes a checksum. These file-level options are abstracted away from the data model.

Dumping custom objects
----------------------

Hickle provides several options to store objects of custom python classes. Objects of classes derived from built-in classes, numpy, scipy, pandas and astropy objects will be stored using the corresponding loader provided by hickle.
Any other class will, by default, be stored as a binary pickle string. Starting with version 4.x, hickle offers the possibility to define dedicated loader functions for custom classes, and starting with hickle 5.x these can be collected in module-, package- and application-specific loader modules.

```python
class MyClass():
    def __init__(self):
        self.name = 'MyClass'
        self.value = 42
```

To create a loader for `MyClass`, the `create_MyClass_dataset` function and either the `load_MyClass` function or the `MyClassContainer` class have to be defined.

```python
import h5py
from hickle.helpers import no_compression

def create_MyClass_dataset(py_obj, h_group, name, **kwargs):
    """
    py_obj ..... the instance of MyClass to be dumped
    h_group .... the h5py.Group py_obj should be dumped into
    name ....... the name of the h5py.Dataset or h5py.Group representing py_obj
    **kwargs ... the compression keyword arguments passed to hickle.dump
    """

    # if the content of MyClass can be represented as a single matrix, vector or
    # scalar value, then create a dataset of appropriate size: either set its
    # shape and dtype parameters to the appropriate size and type, or directly
    # pass the data using the data parameter
    ds = h_group.create_dataset(name, data=py_obj.value, **kwargs)

    ## NOTE: if your class represents a scalar (using an empty tuple for shape),
    ## then kwargs have to be filtered through no_compression
    # ds = h_group.create_dataset(name, data=py_obj.value, shape=(), **no_compression(kwargs))

    # set additional attributes providing additional specialisation of content
    ds.attrs['name'] = py_obj.name

    # when done, return the new dataset object and an empty tuple or list
    return ds, ()

def load_MyClass(h_node, base_type, py_obj_type):
    """
    h_node ........ the h5py.Dataset object containing the data of the MyClass object to restore
    base_type ..... byte string naming the loader to be used for restoring the MyClass object
    py_obj_type ... MyClass class or MyClass subclass object
    """

    # py_obj_type should point to MyClass or any of its subclasses
    new_instance = py_obj_type()
    new_instance.name = h_node.attrs['name']
    new_instance.value = h_node[()]
    return new_instance
```

To dump the content of complex objects consisting of multiple sub-items which have to be stored as individual h5py.Dataset or h5py.Group objects, define `create_MyClass_dataset` using the `create_group` method instead of `create_dataset`, and define the corresponding `MyClassContainer` class.

```python
import h5py
from hickle.helpers import PyContainer

def create_MyClass_dataset(py_obj, h_group, name, **kwargs):
    """
    py_obj ..... the instance of MyClass to be dumped
    h_group .... the h5py.Group py_obj should be dumped into
    name ....... the name of the h5py.Dataset or h5py.Group representing py_obj
    **kwargs ... the compression keyword arguments passed to hickle.dump
    """

    ds = h_group.create_group(name)

    # set additional attributes providing additional specialisation of content
    ds.attrs['name'] = py_obj.name

    # when done, return the new group object and a tuple, list or generator function
    # providing for each subitem a tuple or list containing:
    # name ..... the name to be used for storing the subitem within the h5py.Group object
    # item ..... the subitem object to be stored
    # attrs .... dictionary included in attrs of the created h5py.Group or h5py.Dataset
    # kwargs ... the kwargs as passed to the create_MyClass_dataset function
    return ds, (('name', py_obj.name, {}, kwargs), ('value', py_obj.value, {'the answer': True}, kwargs))


class MyClassContainer(PyContainer):
    """
    Valid container classes must be derived from the hickle.helpers.PyContainer class
    """

    def __init__(self, h5_attrs, base_type, object_type):
        """
        h5_attrs ...... the attrs dictionary attached to the group representing MyClass
        base_type ..... byte string naming the loader to be used for restoring the MyClass object
        object_type ... MyClass class or MyClass subclass object
        """

        # the optional protected _content parameter of the PyContainer __init__
        # method can be used to change the data structure used to store
        # the subitems passed to the append method of the PyContainer class
        # per default it is set to []
        super().__init__(h5_attrs, base_type, object_type, _content=dict())

    def filter(self, h_parent):  # optional overload
        """
        generator member function which can be overloaded to reorganize the subitems
        of the h_parent h5py.Group before they are restored by hickle. Its default
        implementation simply yields from h_parent.items().
        """
        yield from super().filter(h_parent)

    def append(self, name, item, h5_attrs):  # optional overload
        """
        in case the _content parameter was explicitly set, or subitems should be
        stored in a specific order or have to be preprocessed before the next item
        is appended, then this can be done here before storing them in self._content.

        name ....... the name identifying the subitem within the parent h5py.Group
        item ....... the object representing the subitem
        h5_attrs ... attrs dictionary attached to the h5py.Dataset or h5py.Group representing item
        """
        self._content[name] = item

    def convert(self):
        """
        called by hickle when all subitems have been appended to the MyClass PyContainer.
        This method must be implemented by the MyClass PyContainer.
        """

        # self.object_type should point to MyClass or any of its subclasses
        new_instance = self.object_type()
        new_instance.__dict__.update(self._content)
        return new_instance
```

In a last step, the loader for MyClass has to be registered with hickle.
This is done by calling the `hickle.lookup.LoaderManager.register_class` method:

```python
from hickle.lookup import LoaderManager

# to register a loader for an object mapped to a h5py.Dataset use
LoaderManager.register_class(
    MyClass,                 # MyClass type object this loader handles
    b'MyClass',              # byte string representing the name of the loader
    create_MyClass_dataset,  # the create dataset function defined in the first example above
    load_MyClass,            # the load dataset function defined in the first example above
    None,                    # usually None
    True,                    # set to False to force explicit storage of MyClass instances in any case
    'custom'                 # loader is only used when custom loaders are enabled on calling hickle.dump
)

# to register a loader for an object mapped to a h5py.Group use
LoaderManager.register_class(
    MyClass,                 # MyClass type object this loader handles
    b'MyClass',              # byte string representing the name of the loader
    create_MyClass_dataset,  # the create group function defined in the second example above
    None,                    # usually None
    MyClassContainer,        # the PyContainer to be used to restore the content of MyClass
    True,                    # set to False to force explicit storage of MyClass instances in any case
    None                     # if set to None the loader is enabled unconditionally
)

# NOTE: in case the content of MyClass instances may be mapped to either a h5py.Dataset or a
# h5py.Group depending on its actual complexity, both types of loaders can be merged into a
# single one, using one common create_MyClass_dataset function and defining both the
# load_MyClass function and the MyClassContainer class
```

For complex python modules, packages and applications that define several classes to be dumped and handled by hickle, calling `hickle.lookup.LoaderManager.register_class` explicitly very quickly becomes tedious and confusing. Therefore, starting with hickle 5.x, all loaders for the classes and objects defined by your module, package or application can be collected in dedicated loader modules and installed along with your module, package or application.

For packages and application packages, the `load_MyPackage.py` loader module has to be stored within the `hickle_loaders` directory of the package directory (the first one that contains an `__init__.py` file) and should be structured as follows.

```python
from hickle.helpers import PyContainer

## define below all create_MyClass_dataset and load_MyClass functions and MyClassContainer classes
## of the loaders serving your module, package, application package or application
....

## the class_register table and the exclude_register table are required
## by hickle to properly load and apply your loaders.
## each row in the class_register table corresponds to the parameters
## of LoaderManager.register_class and has to be specified in the same order
## as above
class_register = [
    [
        MyClass,                 # MyClass type object this loader handles
        b'MyClass',              # byte string representing the name of the loader
        create_MyClass_dataset,  # the create dataset function defined in the first example above
        load_MyClass,            # the load dataset function defined in the first example above
        None,                    # usually None
        True,                    # set to False to force explicit storage of MyClass instances in any case
        'custom'                 # loader is only used when custom loaders are enabled on calling hickle.dump
    ],
    [
        MyClass,                 # MyClass type object this loader handles
        b'MyClass',              # byte string representing the name of the loader
        create_MyClass_dataset,  # the create group function defined in the second example above
        None,                    # usually None
        MyClassContainer,        # the PyContainer to be used to restore the content of MyClass
        True,                    # set to False to force explicit storage of MyClass instances in any case
        None                     # if set to None the loader is enabled unconditionally
    ]
]

# used by hickle 4.x legacy loaders and other special loaders
# usually an empty list
exclude_register = []
```

For single-file modules and application scripts, the `load_MyModule.py` or `load_MyApp.py` files have to be stored within the `hickle_loaders` directory located in the same directory as `MyModule.py` or `MyApp.py`.

For further examples of more complex loaders, and for how to store bytearrays and strings such that they can be compressed when stored, see the default loader modules in the `hickle/loaders/` directory.

### Note: storing complex objects in HDF5 file

The HDF5 file format is designed to store several big matrices, images and vectors efficiently, to attach some metadata to them, and to provide convenient access to the data through a tree structure. Unlike the python pickle format, it is not designed for efficiently mapping the in-memory object structure to a file. Therefore, mindlessly storing plenty of tiny objects and scalar values without combining them into a single dataset will cause the HDF5 file created by hickle to explode in size: file sizes of several 10 GB are likely when the equivalent pickle file would just need some 100 MB. This can be prevented by having the `create_MyClass_dataset` method combine sub-items into bigger numpy arrays or other data structures that can be mapped to `h5py.Dataset` objects, and by having the `load_MyClass` function and/or the `MyClassContainer.convert` method restore the actual structure of the sub-items on load (a minimal sketch of this is shown after the list of recent changes below).

Recent changes
--------------

* December 2021: Release of version 5, support for h5py >= 3.0 and numpy >= 1.21
* June 2020: Major refactor to version 4, and removal of support for Python 2.
* December 2018: Accepted to Journal of Open-Source Software (JOSS).
* June 2018: Major refactor and support for Python 3.
* Aug 2016: Added support for scipy sparse matrices `bsr_matrix`, `csr_matrix` and `csc_matrix`.
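As an illustration of the note on storing complex objects above, the following is a minimal sketch of this packing approach. The `MySamples` class and its `readings` attribute are hypothetical and only used for illustration; the function signatures are the same as in the loader examples above.

```python
import numpy as np

def create_MySamples_dataset(py_obj, h_group, name, **kwargs):
    """
    Pack the many small scalar readings of a (hypothetical) MySamples object
    into one array-valued h5py.Dataset instead of one tiny dataset per value.
    """
    # packing the values into a single numpy array keeps the HDF5 overhead low
    # and lets the compression kwargs passed on by hickle.dump take effect
    packed = np.asarray(py_obj.readings, dtype='float64')
    ds = h_group.create_dataset(name, data=packed, **kwargs)
    ds.attrs['n_readings'] = packed.shape[0]
    # no sub-items are dumped individually, hence the empty tuple
    return ds, ()

def load_MySamples(h_node, base_type, py_obj_type):
    """
    Restore the original list-of-floats structure on load
    (assuming MySamples() can be constructed without arguments).
    """
    new_instance = py_obj_type()
    new_instance.readings = h_node[()].tolist()
    return new_instance
```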
Performance comparison
----------------------

Hickle runs a lot faster than pickle with its default settings, and a little faster than pickle with `protocol=2` set:

```Python
In [1]: import numpy as np

In [2]: x = np.random.random((2000, 2000))

In [3]: import pickle

In [4]: f = open('foo.pkl', 'w')

In [5]: %time pickle.dump(x, f) # slow by default
CPU times: user 2 s, sys: 274 ms, total: 2.27 s
Wall time: 2.74 s

In [6]: f = open('foo.pkl', 'w')

In [7]: %time pickle.dump(x, f, protocol=2) # actually very fast
CPU times: user 18.8 ms, sys: 36 ms, total: 54.8 ms
Wall time: 55.6 ms

In [8]: import hickle

In [9]: f = open('foo.hkl', 'w')

In [10]: %time hickle.dump(x, f) # a bit faster dumping to file
CPU times: user 764 us, sys: 35.6 ms, total: 36.4 ms
Wall time: 36.2 ms
```

So if you do continue to use pickle, add the `protocol=2` keyword (thanks @mrocklin for pointing this out).

For storing python dictionaries of lists, hickle beats the python json encoder, but is slower than uJson. For a dictionary with 64 entries, each containing a 4096-length list of random numbers, the times are:

    json took 2633.263 ms
    uJson took 138.482 ms
    hickle took 232.181 ms

It should be noted that these comparisons are of course not fair: storing in HDF5 will not help you convert something into JSON, nor will it help you serialize a string. But for quick storage of the contents of a python variable, it's a pretty good option.

Installation guidelines
-----------------------

### Easy method

Install with `pip` by running `pip install hickle` from the command line.

#### Install on Windows 32 bit

Prebuilt Python wheel packages are available on PyPI up to h5py version 2.10 and Python 3.8. Any newer versions have to be built and installed manually.

1) Install h5py 2.10 with `pip` by running `pip install "h5py==2.10"` from the command line
2) Install hickle with `pip` by running `pip install hickle` from the command line

### Manual install

1. You should have Python 3.5 or above installed
2. Install hdf5 (Official page: http://www.hdfgroup.org/ftp/HDF5/current/src/unpacked/release_docs/INSTALL) (Binary Downloads: https://portal.hdfgroup.org/display/support/Downloads)
   __Note:__ On Windows 32 bit, install the prebuilt binary package for libhdf5 [1.10.4](https://portal.hdfgroup.org/display/support/HDF5+1.10.4), which is the latest version supporting 32 bit on Windows
3. Install h5py (Official page: http://docs.h5py.org/en/latest/build.html)
4. Download `hickle`:
   via terminal: git clone https://github.com/telegraphic/hickle.git
   via manual download: go to https://github.com/telegraphic/hickle and on the right hand side you will find the `Download ZIP` file
5. cd to your downloaded `hickle` directory
6. Then run the following command in the `hickle` directory: `python setup.py install`

### Optional requirements:

* dill: needed when files generated by hickle 3 and/or hickle 4 are to be loaded with hickle >= 5, and for development and testing
* astropy: needed for development and testing
* pandas: needed for development and testing

### Testing

Once installed from source, run `python setup.py test` to check it's all working.

Bugs & contributing
--------------------

Contributions and bugfixes are very welcome. Please check out our [contribution guidelines](https://github.com/telegraphic/hickle/blob/master/CONTRIBUTING.md) for more details on how to contribute to development.
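Relating back to the Testing section above: independent of the full test suite, a quick sanity check of an installed copy can be done with a small round trip through the public `dump`/`load` API shown earlier. The file name and values below are arbitrary examples.

```python
import numpy as np
import hickle as hkl

data = {'name': 'smoke test', 'values': np.arange(10)}

# write and read the data back using the documented public API
hkl.dump(data, 'smoke_test.hkl', mode='w', compression='gzip')
restored = hkl.load('smoke_test.hkl')

assert restored['name'] == data['name']
assert np.array_equal(restored['values'], data['values'])
```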
Referencing hickle ------------------ If you use `hickle` in academic research, we would be grateful if you could reference [our paper](http://joss.theoj.org/papers/0c6638f84a1a574913ed7c6dd1051847) in the [Journal of Open-Source Software (JOSS)](http://joss.theoj.org/about). ``` Price et al., (2018). Hickle: A HDF5-based python pickle replacement. Journal of Open Source Software, 3(32), 1115, https://doi.org/10.21105/joss.01115 ``` hickle-5.0.2/_config.yml000066400000000000000000000000321430361177200151030ustar00rootroot00000000000000theme: jekyll-theme-caymanhickle-5.0.2/conftest.py000066400000000000000000000521401430361177200151620ustar00rootroot00000000000000import pytest import sys import types import functools as ft import threading import os import os.path import importlib import collections import ctypes import re # list of function names which shall not be # traced when compression keyword hardening # test run is executed non_core_loader_functions = { 'type_legacy_mro', 'load_pickled_data', 'recover_custom_dataset', #'register_compact_expand', '_moc_numpy_array_object_lambda', 'fix_lambda_obj_type', 'LoaderManager.load_loader', 'RecoverGroupContainer.convert', 'NoContainer.convert', '_DictItemContainer.convert', 'ExpandReferenceContainer.convert', 'RecoverGroupContainer.filter', 'ExpandReferenceContainer.filter', 'ReferenceManager.resolve_type', 'RecoverGroupContainer._append' } def pytest_addoption(parser): """ adds enable_compression keywort to pytest commandline options for enabling h5py compression keyword hardening testing of dump functions of hikcle.loaders and hickle core loaders """ parser.addoption( "--enable-compression", action='store', nargs='?', const=6, type=int, choices=range(0,10), help="run all tests with bzip compression enabled. Optionally specify compression level 0-9 (default 6)", dest="enable_compression" ) def _get_trace_function(trace_function): """ try to get hold of FunctionType object of passed in Method, Function or callable """ while not isinstance(trace_function,(types.FunctionType,types.LambdaType,types.BuiltinFunctionType)): if isinstance(trace_function,(types.MethodType,types.BuiltinMethodType)): trace_function = getattr(trace_function,'__func__') continue if isinstance(trace_function,ft.partial): trace_function = trace_function.func continue return ( getattr(trace_function,'__call__',trace_function) if callable(trace_function) and not isinstance(trace_function,type) else None ) return trace_function # keyword arguments to yield from compression_kwargs fixture below # may in future become a list of dictionaries to be yieled for # running same test with different sets of compression keywords # (implizit parametrization of tests) _compression_args = dict( compression='gzip', compression_opts=6 ) _test_compression = None def pytest_configure(config): """ make no_compression mark available from pytest.mark. if not yet activated enable profiling of dump methods and functions and set compression_level selected on commandline if explicitly specified. 
""" global _test_compression config.addinivalue_line( "markers","no_compression: do not enforce h5py compression hardening testing" ) if _test_compression is not None: return compression_level = config.getoption("enable_compression",default=-1) if compression_level is None or compression_level < 0: return _compression_args['compression_opts'] = compression_level _test_compression = True # local handle of no_compression mark no_compression = pytest.mark.no_compression @pytest.fixture#(scope='session') def compression_kwargs(request): """ fixture providing the compressoin related keyword arguments to be passed to any test not marked with no_compression mark and expecting compression_kwargs as one of its parameters """ global _test_compression yield ( _compression_args if _test_compression else {} ) # list of distinct copyies of LoaderManager.register_class function # keys are either "::LoaderManager.register_class" or # copy of code object executed when LoaderManager.register_class method # is called _trace_register_class = {} # list of dump_functions to be traced with respect to being # passed the compression related keywords provided through compression_kwargs # fixture above. In case a call to any of these does not include at least these # keywords an AssertionError Exception is raised. _trace_functions = collections.OrderedDict() # profiling function to be called after execution of _trace_loader_funcs # below _trace_profile_call = None # index of dump_function argument in argument list of LoaderManager.register_class # method. _trace_function_argument_default = -1 def _chain_profile_call(frame,event,arg): global _trace_profile_call if _trace_profile_call: next_call = _trace_profile_call(frame,event,arg) if next_call: _trace_profile_call = next_call # argument names which correspond to argument being passed dump_function # object _trace_function_arg_names = {'dump_function'} # the pytest session tracing of proper handling of compression related # keywords is activated for _traced_session = None _loader_file_pattern = re.compile(r'^load_\w+\.py$') def pytest_sessionstart(session): """ pytest hook called at start of session. - collects all functions exported by hickle.lookup module (for now) and records inserts "::" strings into _trace_functions list for any not listed in above non_core_loader_functions - collects all dump_functions listed in class_register tables of all hickle.loaders.load_*.py modules. 
""" global _test_compression,_traced_session,_trace_register_class,_trace_functions,_trace_profile_call if _test_compression is None: pytest_configure(session.config) if not _test_compression: return None # extract all loader function from hickle.lookup lookup_module = sys.modules.get('hickle.lookup',None) if not isinstance(lookup_module,types.ModuleType): lookup_module_spec = importlib.util.find_spec("hickle.lookup") lookup_module = importlib.util.module_from_spec(lookup_module_spec) lookup_module_spec.loader.exec_module(lookup_module) register_class = lookup_module.LoaderManager.register_class register_class_code = register_class.__func__.__code__ trace_function_argument = register_class_code.co_argcount + register_class_code.co_kwonlyargcount for argid,trace_function in ( (count,varname) for count,varname in enumerate(register_class_code.co_varnames[:(register_class_code.co_argcount + register_class_code.co_kwonlyargcount)]) if varname in _trace_function_arg_names ): trace_function_argument = argid break if trace_function_argument < 0: return None _trace_function_argument_default = trace_function_argument qualname = getattr(register_class,'__qualname__',register_class.__name__) code_name = qualname if qualname.rsplit('.',1) == register_class_code.co_name else register_class_code.co_name _trace_register_class.update({"{}::{}".format(register_class_code.co_filename,code_name):trace_function_argument}) for loader_func_name,loader_func in ( (func_name,func) for name, item in lookup_module.__dict__.items() if isinstance(item,(types.FunctionType,type)) for func_name,func in ( ((name,item),) if isinstance(item,types.FunctionType) else ( ( meth_name,meth) for meth_name,meth in item.__dict__.items() if isinstance(meth,types.FunctionType) ) ) if func_name[:2] != '__' and func_name[-2:] != '__' ): loader_func = _get_trace_function(loader_func) if loader_func is not None and loader_func.__module__ == lookup_module.__name__: code = loader_func.__code__ qualname = getattr(loader_func,'__qualname__',loader_func.__name__) if qualname not in non_core_loader_functions: code_name = qualname if qualname.rsplit('.',1) == code.co_name else code.co_name _trace_functions["{}::{}".format(code.co_filename,code_name)] = (loader_func.__module__,qualname) # extract all dump functions from any known loader module hickle_loaders_path = os.path.join(os.path.dirname(lookup_module.__file__),'loaders') for loader in os.scandir(hickle_loaders_path): if not loader.is_file() or _loader_file_pattern.match(loader.name) is None: continue loader_module_name = "hickle.loaders.{}".format(loader.name.rsplit('.',1)[0]) loader_module = sys.modules.get(loader_module_name,None) if loader_module is None: loader_module_spec = importlib.util.find_spec("hickle.loaders.{}".format(loader.name.rsplit('.',1)[0])) if loader_module_spec is None: continue loader_module = importlib.util.module_from_spec(loader_module_spec) try: loader_module_spec.loader.exec_module(loader_module) except ModuleNotFoundError: continue except ImportError: if sys.version_info[0] > 3 or sys.version_info[1] > 5: raise continue class_register_table = getattr(loader_module,'class_register',()) # trace function has cls/self for dump_function in ( entry[trace_function_argument-1] for entry in class_register_table ): dump_function = _get_trace_function(dump_function) if dump_function is not None: code = dump_function.__code__ qualname = getattr(dump_function,'__qualname__',dump_function.__name__) code_name = qualname if qualname.rsplit('.',1) == code.co_name else 
code.co_name _trace_functions["{}::{}".format(code.co_filename,code_name)] = (dump_function.__module__,qualname) # activate compression related profiling _trace_profile_call = sys.getprofile() _traced_session = session sys.setprofile(_trace_loader_funcs) return None # List of test functions which are marked by no_compression mark _never_trace_compression = set() def traceback_from_frame(frame,stopafter): """ helper function used in Python >= 3.7 to beautify traceback of AssertionError exceptoin thrown by _trace_loader_funcs """ tb = types.TracebackType(None,frame,frame.f_lasti,frame.f_lineno) while frame.f_back is not stopafter.f_back: frame = frame.f_back tb = types.TracebackType(tb,frame,frame.f_lasti,frame.f_lineno) return tb def pytest_collection_finish(session): """ collect all test functions for which compression related keyword monitoring shall be disabled. """ if not sys.getprofile() == _trace_loader_funcs: return listed = set() listemodules = set() for item in session.items: func = item.getparent(pytest.Function) if func not in listed: listed.add(func) for marker in func.iter_markers(no_compression.name): never_trace_code = func.function.__code__ qualname = getattr(func.function,'__qualname__',func.function.__name__) code_name = qualname if qualname.rsplit('.',1) == never_trace_code.co_name else never_trace_code.co_name _never_trace_compression.add("{}::{}".format(never_trace_code.co_filename,code_name)) break def _trace_loader_funcs(frame,event,arg,nochain=False): """ does the actuatual profiling with respect to proper passing compression keywords to dump_functions """ global _chain_profile_call, _trace_functions,_never_trace_compression,_trace_register_class,_trace_function_argument_default try: if event not in {'call','c_call'}: return _trace_loader_funcs # check if LoaderManager.register_class has been called # if get position of dump_function argument and extract # code object for dump_function to be registered if not None code_block = frame.f_code trace_function_argument = _trace_register_class.get(code_block,None) if trace_function_argument is not None: trace_function = frame.f_locals.get(code_block.co_varnames[trace_function_argument],None) load_function = frame.f_locals.get(code_block.co_varnames[trace_function_argument+1],None) if load_function is not None: load_function = _get_trace_function(load_function) _trace_functions.pop("{}::{}".format(load_function.__code__.co_filename,load_function.__code__.co_name),None) if trace_function is None: return _trace_loader_funcs trace_function = _get_trace_function(trace_function) if trace_function is None: return _trace_loader_funcs trace_function_code = getattr(trace_function,'__code__',None) if trace_function_code is not None: # store code object corresponding to dump_function in _trace_functions list # if not yet present there. 
qualname = getattr(trace_function,'__qualname__',trace_function.__name__) code_name = qualname if qualname.rsplit('.',1) == trace_function_code.co_name else trace_function_code.co_name trace_function_code_name = "{}::{}".format(trace_function_code.co_filename,code_name) if ( trace_function_code_name not in _trace_register_class and ( trace_function_code_name not in _trace_functions or trace_function_code not in _trace_functions ) ): trace_function_spec = (trace_function.__module__,qualname) _trace_functions[trace_function_code] = trace_function_spec _trace_functions[trace_function_code_name] = trace_function_spec return _trace_loader_funcs # estimate qualname from local variable stored in frame.f_local corresponding # to frame.f_code.co_varnames[0] if any. object_self_name = frame.f_code.co_varnames[:1] if object_self_name: self = frame.f_locals.get(object_self_name[0],None) module = getattr(self,'__module__','') if isinstance(module,str) and module.split('.',1)[0] == 'hickle' and isinstance(getattr(self,'__name__',None),str): method = getattr(self,frame.f_code.co_name,None) if method is not None and getattr(method,'__code__',None) == frame.f_code: code_name = "{}::{}.{}".format( frame.f_code.co_filename, getattr(self,'__qualname__',self.__name__), frame.f_code.co_name ) else: code_name = "{}::{}".format(frame.f_code.co_filename,frame.f_code.co_name) else: code_name = "{}::{}".format(frame.f_code.co_filename,frame.f_code.co_name) else: code_name = "{}::{}".format(frame.f_code.co_filename,frame.f_code.co_name) # check if frame could encode a clall to a new incarnation of LoaderManager.register_class # method. Add its code object to the list of known incarnations and rerun above code if code_block.co_name == 'register_class': trace_function_argument = _trace_register_class.get(code_name,None) if trace_function_argument is not None: _trace_register_class[code_block] = trace_function_argument return _trace_loader_funcs(frame,event,arg,True) if ( code_block.co_filename.rsplit('/',2) == ['hickle','lookup.py'] and code_block.co_varnames > trace_function_argument and code_block.co_varnames[_trace_function_argument_default] in _trace_function_arg_names ): _trace_register_class[code_name] = _trace_function_argument_default _trace_register_class[code_block] = _trace_function_argument_default return _trace_loader_funcs(frame,event,arg,True) # frame encodes a call to any other function or method. # If the function or method is listed in _trace_functions list check # if it received the appropriate set of compresson related keywords function_object_spec = _trace_functions.get(frame.f_code,None) if function_object_spec is None: function_object_spec = _trace_functions.get(code_name,None) if function_object_spec is None: return _trace_loader_funcs _trace_functions[frame.f_code] = function_object_spec baseargs = ( (arg,frame.f_locals[arg]) for arg in frame.f_code.co_varnames[:(frame.f_code.co_argcount + frame.f_code.co_kwonlyargcount)] ) kwargs = frame.f_locals.get('kwargs',None) if kwargs is not None: fullargs = ( (name,arg) for arglist in (kwargs.items(),baseargs) for name,arg in arglist ) else: fullargs = baseargs seen_compression_args = set() for arg,value in fullargs: if arg in seen_compression_args: continue if _compression_args.get(arg,None) is not None: seen_compression_args.add(arg) if len(seen_compression_args) == len(_compression_args): return _trace_loader_funcs # keywords not passed or filtered prematurely. # walk the stack until reaching executed test function. 
# if test function is not marked with no_compression raise # AssertionError stating that dump_function did not # receive expected compression keywords defined above # For Python <= 3.6 collect all functions called between current # frame and frame of executed test function. For Python > 3.6 use # above traceback_from_frame function to build traceack showing appropriate # callstack and context excluding this function to ensure AssertionError # exception appears thrown on behlaf of function triggering call encoded by # passed frame function_object_spec = _trace_functions[frame.f_code] if _traced_session is not None: test_list = { "{}::{}".format( item.function.__code__.co_filename, getattr(item.function,'__qualname__', item.function.__name__) ):item for item in _traced_session.items } collect_call_tree = [] next_frame = frame while next_frame is not None: object_self_name = frame.f_code.co_varnames[:1] if object_self_name: self = frame.f_locals.get(object_self_name[0]) module = getattr(self,'__module__','') if ( isinstance(module,str) and module.split('.',1)[0] == 'hickle' and isinstance(getattr(self,'__name__',None),str) ): method = getattr(self,frame.f_code.co_name,None) if method is not None and getattr(method,'__code__',None) == frame.f_code: frame_name = "{}::{}".format( next_frame.f_code.co_filename, getattr(method,'__qualname__',method.__name__) ) else: frame_name = "{}::{}".format(next_frame.f_code.co_filename,next_frame.f_code.co_name) else: frame_name = "{}::{}".format(next_frame.f_code.co_filename,next_frame.f_code.co_name) else: frame_name = "{}::{}".format(next_frame.f_code.co_filename,next_frame.f_code.co_name) if frame_name in _never_trace_compression: return _trace_loader_funcs in_test = test_list.get(frame_name,None) collect_call_tree.append((next_frame.f_code.co_filename,frame_name,next_frame.f_lineno)) if in_test is not None: try: tb = traceback_from_frame(frame,next_frame) except TypeError: pass else: raise AssertionError( "'{}': compression_kwargs lost in call".format("::".join(function_object_spec)) ).with_traceback(tb) raise AssertionError( "'{}': compression_kwargs lost in call:\n\t{}\n".format( "::".join(function_object_spec), "\n\t".join("{}::{} ({})".format(*call) for call in collect_call_tree[:0:-1]) ) ) next_frame = next_frame.f_back except AssertionError as ae: # check that first entry in traceback does not refer to this function if ae.__traceback__.tb_frame.f_code == _trace_loader_funcs.__code__: ae.__traceback__ = ae.__traceback__.tb_next raise #except Exception as e: # import traceback;traceback.print_exc() # import pdb;pdb.set_trace() finally: if not nochain: _chain_profile_call(frame,event,arg) def pytest_sessionfinish(session): sys.setprofile(_trace_profile_call) hickle-5.0.2/docs/000077500000000000000000000000001430361177200137115ustar00rootroot00000000000000hickle-5.0.2/docs/Makefile000066400000000000000000000011101430361177200153420ustar00rootroot00000000000000# Minimal makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build SOURCEDIR = source BUILDDIR = build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
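# For example (illustrative), `make html` is caught by the catch-all rule below
# and expands roughly to: sphinx-build -M html "source" "build"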
%: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)hickle-5.0.2/docs/make_docs.sh000066400000000000000000000000541430361177200161710ustar00rootroot00000000000000cp ../README.md ./source/index.md make html hickle-5.0.2/docs/source/000077500000000000000000000000001430361177200152115ustar00rootroot00000000000000hickle-5.0.2/docs/source/_static/000077500000000000000000000000001430361177200166375ustar00rootroot00000000000000hickle-5.0.2/docs/source/_static/empty.txt000066400000000000000000000000001430361177200205240ustar00rootroot00000000000000hickle-5.0.2/docs/source/_templates/000077500000000000000000000000001430361177200173465ustar00rootroot00000000000000hickle-5.0.2/docs/source/_templates/empty.txt000066400000000000000000000000001430361177200212330ustar00rootroot00000000000000hickle-5.0.2/docs/source/conf.py000066400000000000000000000131331430361177200165110ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Configuration file for the Sphinx documentation builder. # # This file does only contain a selection of the most common options. For a # full list see the documentation: # http://www.sphinx-doc.org/en/master/config # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # # import os # import sys # sys.path.insert(0, os.path.abspath('.')) import recommonmark from recommonmark.transform import AutoStructify # -- Project information ----------------------------------------------------- project = u'hickle' copyright = u'2018-2020, Danny Price, Ellert van der Velden and contributors' author = u'Danny Price' # The short X.Y version version = u'' # The full version, including alpha/beta/rc tags release = u'' # -- General configuration --------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.githubpages', 'sphinx.ext.napoleon' ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_parsers = { '.md': 'recommonmark.parser.CommonMarkParser', } source_suffix = ['.rst', '.md'] #source_suffix = '.rst' # The master toctree document. master_doc = '../../README' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = [] # The name of the Pygments (syntax highlighting) style to use. pygments_style = None # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. 
# html_theme = "sphinx_rtd_theme" html_theme_path = ["_themes", ] # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # # html_theme_options = {} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] # Custom sidebar templates, must be a dictionary that maps document names # to template names. # # The default sidebars (for documents that don't match any pattern) are # defined by theme itself. Builtin themes are using these templates by # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', # 'searchbox.html']``. # # html_sidebars = {} # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. htmlhelp_basename = 'hickledoc' # -- Options for LaTeX output ------------------------------------------------ latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. # # 'preamble': '', # Latex figure (float) alignment # # 'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ (master_doc, 'hickle.tex', u'hickle Documentation', u'Danny Price', 'manual'), ] # -- Options for manual page output ------------------------------------------ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ (master_doc, 'hickle', u'hickle Documentation', [author], 1) ] # -- Options for Texinfo output ---------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ (master_doc, 'hickle', u'hickle Documentation', author, 'hickle', 'One line description of project.', 'Miscellaneous'), ] # -- Options for Epub output ------------------------------------------------- # Bibliographic Dublin Core info. epub_title = project # The unique identifier of the text. This can be a ISBN number # or the project homepage. # # epub_identifier = '' # A unique identification for the text. # # epub_uid = '' # A list of files that should not be packed into the epub file. epub_exclude_files = ['search.html'] # -- Extension configuration ------------------------------------------------- def setup(app): app.add_config_value('recommonmark_config', { 'url_resolver': lambda url: github_doc_root + url, 'auto_toc_tree_section': 'Contents', }, True) app.add_transform(AutoStructify) hickle-5.0.2/docs/source/toc.rst000066400000000000000000000020141430361177200165250ustar00rootroot00000000000000.. hickle documentation master file, created by sphinx-quickstart on Fri Dec 14 15:39:45 2018. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. Welcome to hickle's documentation! ================================== Hickle is an HDF5-based clone of `pickle`, with a twist: instead of serializing to a pickle file, Hickle dumps to an HDF5 file (Hierarchical Data Format). 
It is designed to be a "drop-in" replacement for pickle (for common data objects), but is really an amalgam of `h5py` and `dill`/`pickle` with extended functionality. That is: `hickle` is a neat little way of dumping python variables to HDF5 files that can be read in most programming languages, not just Python. Hickle is fast, and allows for transparent compression of your data (LZF / GZIP). .. toctree:: :maxdepth: 2 :caption: Contents: .. automodule:: hickle :members: load, dump Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` hickle-5.0.2/hickle/000077500000000000000000000000001430361177200142205ustar00rootroot00000000000000hickle-5.0.2/hickle/__init__.py000066400000000000000000000005201430361177200163260ustar00rootroot00000000000000# hickle imports from .__version__ import __version__ from . import hickle from .hickle import * from .fileio import ClosedFileError, FileError # All declaration __all__ = ['hickle', 'ClosedFileError', 'FileError'] __all__.extend(hickle.__all__) # Author declaration __author__ = "Danny Price, Ellert van der Velden and contributors" hickle-5.0.2/hickle/__version__.py000066400000000000000000000002761430361177200170600ustar00rootroot00000000000000# -*- coding: utf-8 -*- """ Hickle Version ============== Stores the different versions of the *hickle* package. """ # %% VERSIONS # Default/Latest/Current version __version__ = '5.0.2' hickle-5.0.2/hickle/fileio.py000066400000000000000000000175501430361177200160510ustar00rootroot00000000000000# encoding: utf-8 """ #fileio.py contains functions, classes and constants related to file management. These functions may also be used by loader modules even though currently no related use-case is known and storing dicts as independent files as requested by @gpetty in issue #133 is better handled on hdf5 or h5py level and not on hickle level. """ # %% IMPORTS # Built-in imports import re import operator import typing import types import collections import numbers import h5py as h5 import os.path as os_path from pathlib import Path class FileError(Exception): """ An exception raised if the file is fishy """ class ClosedFileError(Exception): """ An exception raised if the file is fishy """ def not_io_base_like(f,*args): """ creates function which can be used in replacement for IOBase.isreadable, IOBase.isseekable and IOBase.iswriteable methods in case f would not provide any of them. Parameters ---------- f (file or file like): file or file like object to which hickle shall dump data to. *args (tuple): list of one or more tuples containing the commands to be checked in replacement tests for IOBase.isreadable, IOBase.isseekable or IOBase.iswriteable and the arguments required to perform the tests Note: **kwargs not supported Returns ------- function to be called in replacement of any of not implemented IOBase.isreadable, IOBase.isseekable or IOBase.isreadable Example ------- if not getattr(f, 'isreadable', not_io_base_like(f, 'read', 0))(): raise ValueError("Not a readable file or file like object") """ def must_test(): if not args: return False for cmd,*call_args in args: cmd = getattr(f,cmd,None) if not cmd: return False try: cmd(*call_args) except: return False return True return must_test def file_opener(f, path, mode='r',filename = None): """ A file opener helper function with some error handling. This can open files through a file object, an h5py file, or just the filename. 
Parameters ---------- f (file, file-like, h5py.File, str, (file,str),{'file':file,'name':str} ): File to open for dumping or loading purposes. str: the path of the HDF5-file that must be used. `~h5py.Group`: the group (or file) in an open HDF5-file that must be used. file, file-like: file or like object which provides `read`, `seek`, `tell` and write methods tuple: two element tuple with the first being the file or file like object to dump to and the second the filename to be used instead of 'filename' parameter dict: dictionary with 'file' and 'name' items path (str): Path within HDF5-file or group to dump to/load from. mode (str): optional string indicating how the file shall be opened. For details see Python `open`. Note: The 'b' flag is optional as all files are a and have to be opened in binary mode. filename (str): optional The name of the file. Ignored when f is `str` or `h5py.File` object. Returns ------- tuple containing (file, path, closeflag) file (h5py.File): The h5py.File object the data is to be dumped to or loaded from path (str): Absolute path within HDF5-file or group to dump to/load from. closeflag: True .... file was opened by file_opener and must be closed by caller. False ... file shall not be closed by caller unless opened by caller Raises ------ CloseFileError: If passed h5py.File, h5py.Group or h5py.Dataset object is not accessible. This in most cases indicates that underlying HDF5.File, file or file-like object has already been closed. FileError If passed file or file-like object is not opened for reading or in addition for writing in case mode corresponds to any of 'w', 'w+', 'x', 'x+' or a. ValueError: If anything else than str, bytes or None specified for filename """ # Make sure that the given path always starts with '/' if not path.startswith('/'): path = "/%s" % path # Were we handed a file object or just a file name string? if isinstance(f, (str, Path)): return h5.File(f, mode.replace('b','')),path,True if isinstance(f, h5.Group): if not f: raise ClosedFileError( "HDF5 file {}has been closed or h5py.Group or h5py.Dataset are not accessible. " "Please pass either a filename string, a pathlib.Path, a file or file like object, " "an opened h5py.File or h5py.Group or h5py.Dataset there outof.".format( "'{}' ".format(filename) if isinstance(filename,(str,bytes)) and filename else '' ) ) base_path = f.name if not isinstance(f,h5.File): f = f.file if f.mode == 'r' and ( mode[0] != 'r' or '+' in mode[1:] ): raise FileError( "HDF5 file '{}' not opened for writing".format(f.filename)) # Since this file was already open, do not close the file afterward return f,''.join((base_path,path.rstrip('/'))),False # get the name of the file if not isinstance(filename,(str,bytes)): if filename is not None: raise ValueError("'filename' must be of type 'str' or 'bytes'") if isinstance(f,(tuple,list)) and len(f) > 1: f,filename = f[:2] elif isinstance(f,dict): f,filename = f['file'],f['name'] else: filename = getattr(f,'filename',None) if filename is None: filename = getattr(f,'name',None) if filename is None: filename = repr(f) if getattr(f,'closed',False): raise ClosedFileError( "HDF5 file {}has been closed or h5py.Group or h5py.Dataset are not accessible. " "Please pass either a filename string, a pathlib.Path, a file or file like object, " "an opened h5py.File or h5py.Group or h5py.Dataset there out of.".format( "'{}' ".format(filename) if isinstance(filename,(str,bytes)) and filename else '' ) ) # file and file-like object must be at least read and seekable. 
This means # they have as specified by IOBase provide read, seek and tell methods if ( getattr(f,'readable',not_io_base_like(f,('read',0)))() and getattr(f,'seekable',not_io_base_like(f,('seek',0),('tell',)))() ): # if file is to be opened for cration, writing or appending check if file, file-like # object is writable or at least provides write method for writing binary data if mode[0] in 'xwa' or ( '+' in mode[1:] and mode[0] == 'r' ): if not getattr(f,'writeable',not_io_base_like(f,('write',b'')))(): raise FileError( "file '{}' not writable. Please pass either a filename string, " "a pathlib.Path, a file or file like object, " "an opened h5py.File or h5py.Group or h5py.Dataset there out of.".format(filename) ) elif mode[0] != 'r': raise ValueError( "invalid file mode must be one out of 'w','w+','x','x+','r','r+','a'. " "at max including a 'b' which will be ignored" ) return h5.File( f, mode.replace('b','') if mode[0] == 'r' else mode[0], driver='fileobj', fileobj = f ), path, True raise FileError( "'file_obj' must be a valid path string, pahtlib.Path, h5py.File, h5py.Group, " "h5py.Dataset, file or file like object'" ) hickle-5.0.2/hickle/helpers.py000066400000000000000000000177031430361177200162440ustar00rootroot00000000000000# encoding: utf-8 """ #helpers.py Contains functions, classes and constants to be used by all components of hickle including the loader modules """ # %% IMPORTS # Built-in imports import collections import h5py as h5 import functools as ft # Package imports # %% EXCEPTION DEFINITIONS nobody_is_my_name = () class NotHicklable(Exception): """ object can not be mapped to proper hickle HDF5 file structure and thus shall be converted to pickle string before storing. """ class ToDoError(Exception): # pragma: no cover """ An exception raised for non-implemented functionality""" def __str__(self): return "Error: this functionality hasn't been implemented yet." # %% CLASS DEFINITIONS class PyContainer(): """ Abstract base class for all PyContainer classes acting as proxy between h5py.Group and python object represented by the content of the h5py.Group. Any container type object as well as complex objects are represented in a tree like structure in the HDF5. PyContainer type objects ensure to properly map these structure when converting it into the corresponding python object structure. Parameters ---------- h5_attrs (h5py.AttributeManager): attributes defined on h5py.Group object represented by this PyContainer base_type (bytes): the basic type used for representation in the HDF5 file object_type: type of Python object to be restored. May be used by PyContainer.convert to convert loaded Python object into final one. Attributes ---------- base_type (bytes): the basic type used for representation on the HDF5 file object_type: type of Python object to be restored. Dependent upon container may be used by PyContainer.convert to convert loaded Python object into final one. """ __slots__ = ("base_type", "object_type", "_h5_attrs", "_content","__dict__" ) def __init__(self, h5_attrs, base_type, object_type, _content = None): """ Parameters (protected): ----------------------- _content (default: list): container to be used to collect the Python objects representing the sub items or the state of the final Python object. Shall only be set by derived PyContainer classes and not be set when default list container shall be used. 
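Example (a minimal, hypothetical sketch of a derived container; the real
implementations live in the hickle loader modules):

    class MyListContainer(PyContainer):
        def convert(self):
            # object_type is the class to be restored, _content holds the
            # sub items collected through append()
            return self.object_type(self._content)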
""" # the base type used to select this PyContainer self.base_type = base_type # class of python object represented by this PyContainer self.object_type = object_type # the h5_attrs structure of the h5_group to load the object_type from # can be used by the append and convert methods to obtain more # information about the container like object to be restored self._h5_attrs = h5_attrs # intermediate list, tuple, dict, etc. used to collect and store the sub items # when calling the append method self._content = _content if _content is not None else [] def filter(self, h_parent): """ PyContainer type child classes may overload this generator function to filter and preprocess the content of h_parent h5py.Group content to ensure it can be properly processed by recursive calls to hickle._load function. Per default yields from h_parent.items(). For examples see: hickle.lookup.ExpandReferenceContainer.filter hickle.loaders.load_scipy.SparseMatrixContainer.filter """ yield from h_parent.items() def append(self, name, item, h5_attrs): """ adds the passed item to the content of this container. Parameters ---------- name (string): the name of the h5py.Dataset or h5py.Group sub item was loaded from item: the Python object of the sub item h5_attrs: attributes defined on h5py.Group or h5py.Dataset sub item was loaded from. """ self._content.append(item) def convert(self): """ creates the final object and populates it with the items stored in the _content attribute. Note: Must be implemented by the derived PyContainer child classes Returns ------- py_obj: The final Python object loaded from file """ raise NotImplementedError("convert method must be implemented") class H5NodeFilterProxy(): """ Proxy class which allows to temporarily modify the content of h5_node.attrs attribute. Original attributes of underlying h5_node are left unchanged. Parameters ---------- h5_node: node for which attributes shall be replaced by a temporary value """ __slots__ = ('_h5_node','attrs','__dict__') def __init__(self,h5_node): # the h5py.Group or h5py.Dataset the attributes should temporarily # be modified. self._h5_node = h5_node # the temporarily modified attributes structure super().__setattr__( 'attrs', collections.ChainMap({}, h5_node.attrs)) def __getattribute__(self, name): # for attrs and wrapped _h5_node return local copy. Any other request # redirect to wrapped _h5_node if name in {"attrs", "_h5_node"}: return super(H5NodeFilterProxy,self).__getattribute__(name) _h5_node = super(H5NodeFilterProxy,self).__getattribute__('_h5_node') return getattr(_h5_node, name) def __setattr__(self, name, value): # if wrapped _h5_node and attrs shall be set store value on local attributes # otherwise pass on to wrapped _h5_node if name in {'_h5_node'}: super().__setattr__(name, value) return if name in {'attrs'}: # pragma: no cover raise AttributeError('attribute is read-only') _h5_node = super().__getattribute__('_h5_node') setattr(_h5_node, name, value) def __getitem__(self, *args, **kwargs): _h5_node = super().__getattribute__('_h5_node') return _h5_node.__getitem__(*args, **kwargs) # TODO as needed add more function like __getitem__ to fully proxy h5_node # or consider using metaclass __getattribute__ for handling special methods class no_compression(dict): """ named dict comprehension which temporarily removes any compression or data filter related argument from the passed iterable. 
""" # list of keyword parameters to filter __filter_keys__ = { "compression", "shuffle", "compression_opts", "chunks", "fletcher32", "scaleoffset" } def __init__(self, mapping): super().__init__(( (key,value) for key,value in ( mapping.items() if isinstance(mapping,dict) else mapping ) if key not in no_compression.__filter_keys__ )) # %% FUNCTION DEFINITIONS def not_dumpable( py_obj, h_group, name, **kwargs): # pragma: no cover """ create_dataset method attached to loader of dummy py_object which is used to mimic PyContainer class for groups in legacy hickle 4.x file. Raises ------ RuntimeError: in any case as this function shall never be called """ raise RuntimeError("types defined by loaders not dump able") def convert_str_attr(attrs,name,*,encoding='utf8'): return attrs[name].decode(encoding) def convert_str_list_attr(attrs,name,*,encoding='utf8'): return [ value.decode(encoding) for value in attrs[name]] if h5.version.version_tuple[0] >= 3: # pragma: no cover load_str_list_attr_ascii = load_str_list_attr = h5.AttributeManager.get load_str_attr_ascii = load_str_list_attr = h5.AttributeManager.get else: # pragma: no cover load_str_list_attr_ascii = ft.partial(convert_str_list_attr,encoding='ascii') load_str_list_attr = convert_str_list_attr load_str_attr_ascii = ft.partial(convert_str_attr,encoding='ascii') load_str_attr = convert_str_attr hickle-5.0.2/hickle/hickle.py000066400000000000000000000430461430361177200160400ustar00rootroot00000000000000# encoding: utf-8 """ # hickle.py Created by Danny Price 2016-02-03. Hickle is an HDF5 based clone of Pickle. Instead of serializing to a pickle file, Hickle dumps to an HDF5 file. It is designed to be as similar to pickle in usage as possible, providing a load() and dump() function. ## Notes Hickle has two main advantages over Pickle: 1) LARGE PICKLE HANDLING. Unpickling a large pickle is slow, as the Unpickler reads the entire pickle thing and loads it into memory. In comparison, HDF5 files are designed for large datasets. Things are only loaded when accessed. 2) CROSS PLATFORM SUPPORT. Attempting to unpickle a pickle pickled on Windows on Linux and vice versa is likely to fail with errors like "Insecure string pickle". HDF5 files will load fine, as long as both machines have h5py installed. 3) pickle.dumps() and pickle.loads() functions can be mimicked by passing a BytesIO type to hickle.dump() or hickle.load() function and setting the filename parameter to a non empty string. hicklestring = BytesIO() hickle.dump(my_data,hicklestring,mode='w',filename='') loaded_data = hickle.load(hicklestring,mode='r',filename='') """ # %% IMPORTS # Built-in imports import io import sys import warnings import types import functools as ft # Package imports import pickle import h5py as h5 import numpy as np #whished it would not be necessary but sometimes garbage collector #may kick in while trying to close file. 
Causing ValueError in close #to prevent check if collectoin is necessary after flushing file import gc # hickle imports from hickle import __version__ from .helpers import ( PyContainer, NotHicklable, nobody_is_my_name, ToDoError ) from .fileio import ClosedFileError, FileError, file_opener from .lookup import ( #hkl_types_dict, hkl_container_dict, load_loader, load_legacy_loader , create_pickled_dataset, load_nothing, fix_lambda_obj_type,ReferenceManager, LoaderManager, RecoverGroupContainer, recover_custom_dataset ) # All declaration __all__ = ['dump', 'load'] # %% FUNCTION DEFINITIONS ########### # DUMPERS # ########### def _dump(py_obj, h_group, name, memo, loader,attrs={} , **kwargs): """ Dump a python object to a group within an HDF5 file. This function is called recursively by the main dump() function. Parameters ---------- py_obj (object): python object to dump. h_group (h5.File.group): group to dump data into. name (str): name of resulting hdf5 group or dataset memo (ReferenceManager): the ReferenceManager object responsible for handling all object and type memoisation related issues attrs (dict): additional attributes to be stored along with the resulting hdf5 group or hdf5 dataset kwargs (dict): keyword arguments to be passed to create_dataset function """ py_obj_id = id(py_obj) py_obj_ref = memo.get(py_obj_id,None) if py_obj_ref is not None: # py_object already dumped to hdf5 file store a reference to it instead # instead of dumping it again. # # Note: reference dataset share their base_type and py_obj_type with the # referenced h5py.Group or h5py.Dataset. On load their h5py.ref_dtype type # dtype is used to distinguish them from datasets hosting pickled data. h_link = h_group.create_dataset(name,data = py_obj_ref[0].ref,dtype = h5.ref_dtype) h_link.attrs.update(attrs) return # Check if loader has already been loaded for the provided py_obj and # retrieve the most appropriate method for creating the corresponding # representation within HDF5 file py_obj_type, (create_dataset, base_type,memoise) = loader.load_loader(py_obj.__class__) try: h_node,h_subitems = create_dataset(py_obj, h_group, name, **kwargs) except NotHicklable: h_node,h_subitems = create_pickled_dataset(py_obj, h_group, name, reason = str(NotHicklable), **kwargs) else: # store base_type and type unless py_obj had to be pickled by create_pickled_dataset memo.store_type(h_node,py_obj_type,base_type,**kwargs) # add additional attributes and prevent modification of 'type' attribute h_node.attrs.update((name,attr) for name,attr in attrs.items() if name != 'type' ) # if py_object shall be memoised to properly represent multiple references # to it in HDF5 file store it along with created h_node in the memo dictionary. # remembering the py_object along with the h_node ensures that py_object_id # which represents the memory address of py_obj refers to py_obj until the # whole structure is stored within hickle file. if memoise: memo[py_obj_id] = (h_node,py_obj) # loop through list of all sub items and recursively dump them # to HDF5 file for h_subname,py_subobj,h_subattrs,sub_kwargs in h_subitems: _dump(py_subobj,h_node,h_subname,memo,loader,h_subattrs,**sub_kwargs) def dump(py_obj, file_obj, mode='w', path='/',*,filename = None,options = {},**kwargs): """ Write a hickled representation of `py_obj` to the provided `file_obj`. Parameters ---------- py_obj (object): Python object to hickle to HDF5. file_obj (file, file-like, h5py.File, str, (file,str),{'file':file,'name':str} ): File to open for dumping or loading purposes. 
str: the path of the HDF5-file that must be used. ~h5py.Group: the group (or file) in an open HDF5-file that must be used. file, file-like: file or like object which provides `read`, `seek`, `tell` and write methods tuple: two element tuple with the first being the file or file like object to dump to and the second the filename to be used instead of 'filename' parameter dict: dictionary with 'file' and 'name' items mode (str): optional string indicating how the file shall be opened. For details see Python `open`. Note: The 'b' flag is optional as all files are and have to be opened in binary mode. path (str): optional Path within HDF5-file or group to dump to/load from. filename (str): optional The name of the file. Ignored when f is `str` or `h5py.File` object. options (dict): optional Each entry in this dict modifies how hickle dumps data to file. For example { custom = True } would enforce use of custom loaders on all classes registered with this kind of loader. { custom = False } would disable custom loaders for dumped data even if globally turned on. More options may follow. kwargs : keyword arguments Additional keyword arguments that must be provided to the :meth:`~h5py.Group.create_dataset` method. For example compression=True Raises ------ CloseFileError: If passed h5py.File, h5py.Group or h5py.Dataset object is not accessible. This in most cases indicates that underlying HDF5 was closed or if file or file or file-like object has already been closed. FileError If passed file or file-like object is not opened for reading or in addition for writing in case mode corresponds to any of 'w', 'w+', 'x', 'x+' or a. ValueError: If anything else than str, bytes or None specified for filename or for mode is anything else specified than 'w','w+','x','x+','r','r+','a' or contains any optional open flag other than 'b' """ # Open the file h5f, path, close_flag = file_opener(file_obj, path, mode,filename) try: # Log which version of python was used to generate the hickle file pv = sys.version_info py_ver = "%i.%i.%i" % (pv[0], pv[1], pv[2]) h_root_group = h5f.get(path,None) if h_root_group is None: h_root_group = h5f.create_group(path) elif h_root_group.items(): raise ValueError("Unable to create group (name already exists)") h_root_group.attrs["HICKLE_VERSION"] = __version__ h_root_group.attrs["HICKLE_PYTHON_VERSION"] = py_ver with LoaderManager.create_manager(h_root_group,False,options) as loader: with ReferenceManager.create_manager(h_root_group) as memo: _dump(py_obj, h_root_group,'data', memo ,loader,**kwargs) finally: # Flush the the h5py.File and close it if it was opened by hickle. h5f.flush() # disable python garbage collector while closing to prevent Unrecognized # typecode ValueError caused by h5py objects collected to earlay most # persistently observed with h5py 2.10 in python 3.7 on 32 bit windows gc.disable() if close_flag: h5f.close() gc.enable() ########### # LOADERS # ########### class RootContainer(PyContainer): """ PyContainer representing the whole HDF5 file """ __slots__ = () def convert(self): return self._content[0] def load(file_obj, path='/', safe=True, filename = None): """ Load the Python object stored in `file_obj` at `path` and return it. Parameters ---------- file_obj (file, file-like, h5py.File, str, (file,str),{'file':file,'name':str} ): File to open for dumping or loading purposes. str: the path of the HDF5-file that must be used. ~h5py.Group: the group (or file) in an open HDF5-file that must be used. 
file, file-like: file or like object which provides `read`, `seek`, `tell` and write methods tuple: two element tuple with the first being the file or file like object to dump to and the second the filename to be used instead of 'filename' parameter dict: dictionary with 'file' and 'name' items path (str): optional Path within HDF5-file or group to dump to/load from. safe (bool): optional Disable automatic depickling of arbitrary python objects. DO NOT set this to False unless the file is from a trusted source. (See https://docs.python.org/3/library/pickle.html for an explanation) Note: ignored when loading hickle 4.x and newer files filename (str): optional The name of the file. Ignored when f is `str` or `h5py.File` object. Returns ------- py_obj : object The unhickled Python object. Raises ------ CloseFileError: If passed h5py.File, h5py.Group or h5py.Dataset object is not accessible. This in most cases indicates that underlying HDF5 was closed or if file or file or file-like object has already been closed. FileError If passed file or file-like object is not opened for reading ValueError: If anything else than str, bytes or None specified for filename """ # Try to read the provided file_obj as a hickle file h5f, path, close_flag = file_opener(file_obj, path, 'r', filename) try: h_root_group = h5f.get(path,None) # only used by v4 if not isinstance(h_root_group,h5.Group): raise FileError("file '{}': path '{}' not existing".format(h5f.filename,path)) # Define attributes h_root_group must have v3_attrs = ['CLASS', 'VERSION', 'PYTHON_VERSION'] v4_attrs = ['HICKLE_VERSION', 'HICKLE_PYTHON_VERSION'] # Check if the proper attributes for v3 loading are available if all(map(h5f.attrs.get, v3_attrs)): # Check if group attribute 'CLASS' has value 'hickle if(h5f.attrs['CLASS'] not in ( b'hickle','hickle')): # pragma: no cover # If not, raise error raise AttributeError("HDF5-file attribute 'CLASS' does not " "have value 'hickle'!") # Obtain version with which the file was made try: major_version = int(h5f.attrs['VERSION'][0]) # If this cannot be done, then this is not a v3 file except Exception: # pragma: no cover raise Exception("This file does not appear to be a hickle v3 " "file.") # Else, if the major version is not 3, it is not a v3 file either else: if(major_version != 3): # pragma: no cover raise Exception("This file does not appear to be a hickle " "v3 file.") # Load file from hickle import legacy_v3 warnings.warn("Input argument 'file_obj' appears to be a file made" " with hickle v3. Using legacy load...") return(legacy_v3.load(file_obj, path, safe)) # Else, check if the proper attributes for v4 loading are available if all(map(h_root_group.attrs.get, v4_attrs)): # Load file py_container = RootContainer(h_root_group.attrs,b'document_root',RootContainer) pickle_loads = pickle.loads hickle_version = h_root_group.attrs["HICKLE_VERSION"].split('.') if int(hickle_version[0]) == 4: # hickle 4.x file activate if legacy load fixes for 4.x # eg. pickle of versions < 3.8 do not prevent dumping of lambda functions # even though stated otherwise in documentation. Activate workarounds # just in case issues arise. Especially as corresponding lambdas in # load_numpy are not needed anymore and thus have been removed. 
with LoaderManager.create_manager(h_root_group,True) as loader: with ReferenceManager.create_manager(h_root_group,fix_lambda_obj_type) as memo: _load(py_container, 'data',h_root_group['data'],memo,loader) #load_loader = load_legacy_loader) return py_container.convert() # 4.1.x file and newer with LoaderManager.create_manager( h_root_group,False) as loader: with ReferenceManager.create_manager(h_root_group,pickle_loads) as memo: _load(py_container, 'data',h_root_group['data'],memo,loader) #load_loader = load_loader) return py_container.convert() # Else, raise error raise FileError("HDF5-file does not have the proper attributes!") # If this fails, raise error and provide user with caught error message except Exception as error: raise ValueError("Provided argument 'file_obj' does not appear to be a valid hickle file! (%s)" % (error),error) from error finally: # Flush the h5py.File and close it lif it was opened by hickle. h5f.flush() # disable python garbage collector while closing to prevent Unrecognized # typecode ValueError caused by h5py objects collected to earlay most # persistently observed with h5py 2.10 in python 3.7 on 32 bit windows gc.disable() if close_flag: h5f.close() gc.enable() def _load(py_container, h_name, h_node,memo,loader): #load_loader = load_loader): """ Load a hickle file Recursive function to load hdf5 data into a PyContainer() Parameters ---------- py_container (PyContainer): Python container to load data into h_name (str): the name of the resulting h5py.Group or h5py.Dataset h_node (h5py.Group, h5py.Dataset): h5py.Group or h5py.Dataset to restore data from. memo (ReferenceManager): the ReferenceManager object responsible for handling all object and type memoisation related issues loader (LoaderManager): the LoaderManager object managing the loaders required to properly restore the content of h_node and append it to py_container. """ # if h_node has already been loaded cause a reference to it was encountered # earlier directly append it to its parent container and return node_ref = memo.get(h_node.id,h_node) if node_ref is not h_node: py_container.append(h_name,node_ref,h_node.attrs) return # load the type information of node. py_obj_type,base_type,is_container = memo.resolve_type(h_node) py_obj_type,(_,_,memoise) = loader.load_loader(py_obj_type,base_type=base_type) if is_container: # Either a h5py.Group representing the structure of complex objects or # a h5py.Dataset representing a h5py.Reference to the node of an object # referred to from multiple places within the object structure on dump # is to be restored. # If no appropriate PyContainer is available use RecoverGroupContainer # instead to at least recover its contained data py_container_class = loader.hkl_container_dict.get(base_type,RecoverGroupContainer) py_subcontainer = py_container_class(h_node.attrs,base_type,py_obj_type) for h_key,h_subnode in py_subcontainer.filter(h_node): _load(py_subcontainer, h_key, h_subnode, memo ,loader) # finalize sub item sub_data = py_subcontainer.convert() py_container.append(h_name,sub_data,h_node.attrs) else: # must be a dataset load it and append to parent container. 
# In case no appropriate loader could be found use recover_custom_dataset # instead to at least recover the contained data load_fn = loader.hkl_types_dict.get(base_type, recover_custom_dataset) sub_data = load_fn(h_node,base_type,py_obj_type) py_container.append(h_name,sub_data,h_node.attrs) # store loaded object for properly restoring additional references to it if memoise: memo[h_node.id] = sub_data hickle-5.0.2/hickle/legacy_v3/000077500000000000000000000000001430361177200160745ustar00rootroot00000000000000hickle-5.0.2/hickle/legacy_v3/__init__.py000066400000000000000000000001041430361177200202000ustar00rootroot00000000000000from .hickle import dump, load from .__version__ import __version__ hickle-5.0.2/hickle/legacy_v3/__version__.py000066400000000000000000000002761430361177200207340ustar00rootroot00000000000000# -*- coding: utf-8 -*- """ Hickle Version ============== Stores the different versions of the *Hickle* package. """ # %% VERSIONS # Default/Latest/Current version __version__ = '3.4.8' hickle-5.0.2/hickle/legacy_v3/helpers.py000066400000000000000000000056751430361177200201250ustar00rootroot00000000000000import re import six def get_type_and_data(h_node): """ Helper function to return the py_type and data block for a HDF node """ py_type = h_node.attrs["type"][0] data = h_node[()] # if h_node.shape == (): # data = h_node.value # else: # data = h_node[:] return py_type, data def get_type(h_node): """ Helper function to return the py_type for a HDF node """ py_type = h_node.attrs["type"][0] return py_type def sort_keys(key_list): """ Take a list of strings and sort it by integer value within string Args: key_list (list): List of keys Returns: key_list_sorted (list): List of keys, sorted by integer """ # Py3 h5py returns an irritating KeysView object # Py3 also complains about bytes and strings, convert all keys to bytes if six.PY3: key_list2 = [] for key in key_list: if isinstance(key, str): key = bytes(key, 'ascii') key_list2.append(key) key_list = key_list2 # Check which keys contain a number numbered_keys = [re.search(br'\d+', key) for key in key_list] # Sort the keys on number if they have it, or normally if not if(len(key_list) and not numbered_keys.count(None)): to_int = lambda x: int(re.search(br'\d+', x).group(0)) return(sorted(key_list, key=to_int)) else: return(sorted(key_list)) def check_is_iterable(py_obj): """ Check whether a python object is iterable. Note: this treats unicode and string as NON ITERABLE Args: py_obj: python object to test Returns: iter_ok (bool): True if item is iterable, False is item is not """ if six.PY2: string_types = (str, unicode) else: string_types = (str, bytes, bytearray) if isinstance(py_obj, string_types): return False try: iter(py_obj) return True except TypeError: return False def check_is_hashable(py_obj): """ Check if a python object is hashable Note: this function is currently not used, but is useful for future development. Args: py_obj: python object to test """ try: py_obj.__hash__() return True except TypeError: return False def check_iterable_item_type(iter_obj): """ Check if all items within an iterable are the same type. Args: iter_obj: iterable object Returns: iter_type: type of item contained within the iterable. If the iterable has many types, a boolean False is returned instead. 
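Examples (illustrative):
        check_iterable_item_type([1, 2, 3]) returns int
        check_iterable_item_type([1, 'a']) returns False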
References: http://stackoverflow.com/questions/13252333/python-check-if-all-elements-of-a-list-are-the-same-type """ iseq = iter(iter_obj) try: first_type = type(next(iseq)) except StopIteration: return False except Exception as ex: return False else: return first_type if all((type(x) is first_type) for x in iseq) else False hickle-5.0.2/hickle/legacy_v3/hickle.py000066400000000000000000000510061430361177200177070ustar00rootroot00000000000000# encoding: utf-8 """ # hickle.py Created by Danny Price 2016-02-03. Hickle is a HDF5 based clone of Pickle. Instead of serializing to a pickle file, Hickle dumps to a HDF5 file. It is designed to be as similar to pickle in usage as possible, providing a load() and dump() function. ## Notes Hickle has two main advantages over Pickle: 1) LARGE PICKLE HANDLING. Unpickling a large pickle is slow, as the Unpickler reads the entire pickle thing and loads it into memory. In comparison, HDF5 files are designed for large datasets. Things are only loaded when accessed. 2) CROSS PLATFORM SUPPORT. Attempting to unpickle a pickle pickled on Windows on Linux and vice versa is likely to fail with errors like "Insecure string pickle". HDF5 files will load fine, as long as both machines have h5py installed. """ from __future__ import absolute_import, division, print_function import sys import os from pkg_resources import get_distribution, DistributionNotFound from ast import literal_eval import numpy as np import h5py as h5 from .__version__ import __version__ from .helpers import get_type, sort_keys, check_is_iterable, check_iterable_item_type from .lookup import (types_dict, hkl_types_dict, types_not_to_sort, container_types_dict, container_key_types_dict, check_is_ndarray_like) try: from exceptions import Exception from types import NoneType except ImportError: pass # above imports will fail in python3 from six import PY2, PY3, string_types, integer_types import io # Make several aliases for Python2/Python3 compatibility if PY3: file = io.TextIOWrapper # Import dill as pickle import dill as pickle try: from pathlib import Path string_like_types = string_types + (Path,) except ImportError: # Python 2 does not have pathlib string_like_types = string_types import warnings ################## # Error handling # ################## class FileError(Exception): """ An exception raised if the file is fishy """ def __init__(self): return def __str__(self): return ("Cannot open file. Please pass either a filename " "string, a file object, or a h5py.File") class ClosedFileError(Exception): """ An exception raised if the file is fishy """ def __init__(self): return def __str__(self): return ("HDF5 file has been closed. Please pass either " "a filename string, a file object, or an open h5py.File") class NoMatchError(Exception): """ An exception raised if the object type is not understood (or supported)""" def __init__(self): return def __str__(self): return ("Error: this type of python object cannot be converted into a " "hickle.") class ToDoError(Exception): """ An exception raised for non-implemented functionality""" def __init__(self): return def __str__(self): return "Error: this functionality hasn't been implemented yet." class SerializedWarning(UserWarning): """ An object type was not understood The data will be serialized using pickle. """ pass ###################### # H5PY file wrappers # ###################### class H5GroupWrapper(h5.Group): """ Group wrapper that provides a track_times kwarg. 
track_times is a boolean flag that can be set to False, so that two files created at different times will have identical MD5 hashes. """ def create_dataset(self, *args, **kwargs): kwargs['track_times'] = getattr(self, 'track_times', True) return super(H5GroupWrapper, self).create_dataset(*args, **kwargs) def create_group(self, *args, **kwargs): group = super(H5GroupWrapper, self).create_group(*args, **kwargs) group.__class__ = H5GroupWrapper group.track_times = getattr(self, 'track_times', True) return group class H5FileWrapper(h5.File): """ Wrapper for h5py File that provides a track_times kwarg. track_times is a boolean flag that can be set to False, so that two files created at different times will have identical MD5 hashes. """ def create_dataset(self, *args, **kwargs): kwargs['track_times'] = getattr(self, 'track_times', True) return super(H5FileWrapper, self).create_dataset(*args, **kwargs) def create_group(self, *args, **kwargs): group = super(H5FileWrapper, self).create_group(*args, **kwargs) group.__class__ = H5GroupWrapper group.track_times = getattr(self, 'track_times', True) return group def file_opener(f, mode='r', track_times=True): """ A file opener helper function with some error handling. This can open files through a file object, a h5py file, or just the filename. Args: f (file, h5py.File, or string): File-identifier, e.g. filename or file object. mode (str): File open mode. Only required if opening by filename string. track_times (bool): Track time in HDF5; turn off if you want hickling at different times to produce identical files (e.g. for MD5 hash check). """ # Assume that we will have to close the file after dump or load close_flag = True # Were we handed a file object or just a file name string? if isinstance(f, (file, io.TextIOWrapper, io.BufferedWriter)): filename, mode = f.name, f.mode f.close() mode = mode.replace('b', '') h5f = h5.File(filename, mode) elif isinstance(f, string_like_types): filename = f h5f = h5.File(filename, mode) elif isinstance(f, (H5FileWrapper, h5._hl.files.File)): try: filename = f.filename except ValueError: raise ClosedFileError h5f = f # Since this file was already open, do not close the file afterward close_flag = False else: print(f.__class__) raise FileError h5f.__class__ = H5FileWrapper h5f.track_times = track_times return(h5f, close_flag) ########### # DUMPERS # ########### def _dump(py_obj, h_group, call_id=0, **kwargs): """ Dump a python object to a group within a HDF5 file. This function is called recursively by the main dump() function. Args: py_obj: python object to dump. h_group (h5.File.group): group to dump data into. call_id (int): index to identify object's relative location in the iterable. """ # Get list of dumpable dtypes dumpable_dtypes = [] for lst in [[bool, complex, bytes, float], string_types, integer_types]: dumpable_dtypes.extend(lst) # Firstly, check if item is a numpy array. If so, just dump it. if check_is_ndarray_like(py_obj): create_hkl_dataset(py_obj, h_group, call_id, **kwargs) # Next, check if item is a dict elif isinstance(py_obj, dict): create_hkl_dataset(py_obj, h_group, call_id, **kwargs) # If not, check if item is iterable elif check_is_iterable(py_obj): item_type = check_iterable_item_type(py_obj) # item_type == False implies multiple types. Create a dataset if item_type is False: h_subgroup = create_hkl_group(py_obj, h_group, call_id) for ii, py_subobj in enumerate(py_obj): _dump(py_subobj, h_subgroup, call_id=ii, **kwargs) # otherwise, subitems have same type. 
Check if subtype is an iterable # (e.g. list of lists), or not (e.g. list of ints, which should be treated # as a single dataset). else: if item_type in dumpable_dtypes: create_hkl_dataset(py_obj, h_group, call_id, **kwargs) else: h_subgroup = create_hkl_group(py_obj, h_group, call_id) for ii, py_subobj in enumerate(py_obj): _dump(py_subobj, h_subgroup, call_id=ii, **kwargs) # item is not iterable, so create a dataset for it else: create_hkl_dataset(py_obj, h_group, call_id, **kwargs) def dump(py_obj, file_obj, mode='w', track_times=True, path='/', **kwargs): """ Write a pickled representation of obj to the open file object file. Args: obj (object): python object o store in a Hickle file: file object, filename string, or h5py.File object file in which to store the object. A h5py.File or a filename is also acceptable. mode (str): optional argument, 'r' (read only), 'w' (write) or 'a' (append). Ignored if file is a file object. compression (str): optional argument. Applies compression to dataset. Options: None, gzip, lzf (+ szip, if installed) track_times (bool): optional argument. If set to False, repeated hickling will produce identical files. path (str): path within hdf5 file to save data to. Defaults to root / """ # Make sure that file is not closed unless modified # This is to avoid trying to close a file that was never opened close_flag = False try: # Open the file h5f, close_flag = file_opener(file_obj, mode, track_times) h5f.attrs["CLASS"] = b'hickle' h5f.attrs["VERSION"] = __version__ h5f.attrs["type"] = [b'hickle'] # Log which version of python was used to generate the hickle file pv = sys.version_info py_ver = "%i.%i.%i" % (pv[0], pv[1], pv[2]) h5f.attrs["PYTHON_VERSION"] = py_ver h_root_group = h5f.get(path) if h_root_group is None: h_root_group = h5f.create_group(path) h_root_group.attrs["type"] = [b'hickle'] _dump(py_obj, h_root_group, **kwargs) except NoMatchError: fname = h5f.filename h5f.close() try: os.remove(fname) except OSError: warnings.warn("Dump failed. Could not remove %s" % fname) finally: raise NoMatchError finally: # Close the file if requested. # Closing a file twice will not cause any problems if close_flag: h5f.close() def create_dataset_lookup(py_obj): """ What type of object are we trying to pickle? This is a python dictionary based equivalent of a case statement. It returns the correct helper function for a given data type. Args: py_obj: python object to look-up what function to use to dump to disk Returns: match: function that should be used to dump data to a new dataset """ t = type(py_obj) types_lookup = {dict: create_dict_dataset} types_lookup.update(types_dict) match = types_lookup.get(t, no_match) return match def create_hkl_dataset(py_obj, h_group, call_id=0, **kwargs): """ Create a dataset within the hickle HDF5 file Args: py_obj: python object to dump. h_group (h5.File.group): group to dump data into. call_id (int): index to identify object's relative location in the iterable. """ #lookup dataset creator type based on python object type create_dataset = create_dataset_lookup(py_obj) # do the creation create_dataset(py_obj, h_group, call_id, **kwargs) def create_hkl_group(py_obj, h_group, call_id=0): """ Create a new group within the hickle file Args: h_group (h5.File.group): group to dump data into. call_id (int): index to identify object's relative location in the iterable. 
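For example (illustrative), call_id=2 creates a subgroup named 'data_2' whose
'type' attribute records the string representation of type(py_obj).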
""" h_subgroup = h_group.create_group('data_%i' % call_id) h_subgroup.attrs['type'] = [str(type(py_obj)).encode('ascii', 'ignore')] return h_subgroup def create_dict_dataset(py_obj, h_group, call_id=0, **kwargs): """ Creates a data group for each key in dictionary Notes: This is a very important function which uses the recursive _dump method to build up hierarchical data models stored in the HDF5 file. As this is critical to functioning, it is kept in the main hickle.py file instead of in the loaders/ directory. Args: py_obj: python object to dump; should be dictionary h_group (h5.File.group): group to dump data into. call_id (int): index to identify object's relative location in the iterable. """ h_dictgroup = h_group.create_group('data_%i' % call_id) h_dictgroup.attrs['type'] = [str(type(py_obj)).encode('ascii', 'ignore')] for key, py_subobj in py_obj.items(): if isinstance(key, string_types): h_subgroup = h_dictgroup.create_group("%r" % (key)) else: h_subgroup = h_dictgroup.create_group(str(key)) h_subgroup.attrs["type"] = [b'dict_item'] h_subgroup.attrs["key_type"] = [str(type(key)).encode('ascii', 'ignore')] _dump(py_subobj, h_subgroup, call_id=0, **kwargs) def no_match(py_obj, h_group, call_id=0, **kwargs): """ If no match is made, raise an exception Args: py_obj: python object to dump; default if item is not matched. h_group (h5.File.group): group to dump data into. call_id (int): index to identify object's relative location in the iterable. """ pickled_obj = pickle.dumps(py_obj) d = h_group.create_dataset('data_%i' % call_id, data=[pickled_obj]) d.attrs["type"] = [b'pickle'] warnings.warn("%s type not understood, data have been serialized" % type(py_obj), SerializedWarning) ############# ## LOADERS ## ############# class PyContainer(list): """ A group-like object into which to load datasets. In order to build up a tree-like structure, we need to be able to load datasets into a container with an append() method. Python tuples and sets do not allow this. This class provides a list-like object that be converted into a list, tuple, set or dict. """ def __init__(self): super(PyContainer, self).__init__() self.container_type = None self.name = None self.key_type = None def convert(self): """ Convert from PyContainer to python core data type. Returns: self, either as a list, tuple, set or dict (or other type specified in lookup.py) """ if self.container_type in container_types_dict.keys(): convert_fn = container_types_dict[self.container_type] return convert_fn(self) if self.container_type == str(dict).encode('ascii', 'ignore'): keys = [] for item in self: key = item.name.split('/')[-1] key_type = item.key_type[0] if key_type in container_key_types_dict.keys(): to_type_fn = container_key_types_dict[key_type] key = to_type_fn(key) keys.append(key) items = [item[0] for item in self] return dict(zip(keys, items)) else: return self def no_match_load(key): """ If no match is made when loading, need to raise an exception """ raise RuntimeError("Cannot load %s data type" % key) #pass def load_dataset_lookup(key): """ What type of object are we trying to unpickle? This is a python dictionary based equivalent of a case statement. It returns the type a given 'type' keyword in the hickle file. 
Args: py_obj: python object to look-up what function to use to dump to disk Returns: match: function that should be used to dump data to a new dataset """ match = hkl_types_dict.get(key, no_match_load) return match def load(fileobj, path='/', safe=True): """ Load a hickle file and reconstruct a python object Args: fileobj: file object, h5py.File, or filename string safe (bool): Disable automatic depickling of arbitrary python objects. DO NOT set this to False unless the file is from a trusted source. (see http://www.cs.jhu.edu/~s/musings/pickle.html for an explanation) path (str): path within hdf5 file to save data to. Defaults to root / """ # Make sure that the file is not closed unless modified # This is to avoid trying to close a file that was never opened close_flag = False try: h5f, close_flag = file_opener(fileobj) h_root_group = h5f.get(path) try: assert 'CLASS' in h5f.attrs.keys() assert 'VERSION' in h5f.attrs.keys() VER = h5f.attrs['VERSION'] try: VER_MAJOR = int(VER) except ValueError: VER_MAJOR = int(VER[0]) if VER_MAJOR == 1: if PY2: warnings.warn("Hickle file versioned as V1, attempting legacy loading...") from . import hickle_legacy return hickle_legacy.load(fileobj, safe) else: raise RuntimeError("Cannot open file. This file was likely" " created with Python 2 and an old hickle version.") elif VER_MAJOR == 2: if PY2: warnings.warn("Hickle file appears to be old version (v2), attempting " "legacy loading...") from . import hickle_legacy2 return hickle_legacy2.load(fileobj, path=path, safe=safe) else: raise RuntimeError("Cannot open file. This file was likely" " created with Python 2 and an old hickle version.") # There is an unfortunate period of time where hickle 2.1.0 claims VERSION = int(3) # For backward compatibility we really need to catch this. # Actual hickle v3 files are versioned as A.B.C (e.g. 3.1.0) elif VER_MAJOR == 3 and VER == VER_MAJOR: if PY2: warnings.warn("Hickle file appears to be old version (v2.1.0), attempting " "legacy loading...") from . import hickle_legacy2 return hickle_legacy2.load(fileobj, path=path, safe=safe) else: raise RuntimeError("Cannot open file. This file was likely" " created with Python 2 and an old hickle version.") elif VER_MAJOR >= 3: py_container = PyContainer() py_container.container_type = 'hickle' py_container = _load(py_container, h_root_group) return py_container[0][0] except AssertionError: if PY2: warnings.warn("Hickle file is not versioned, attempting legacy loading...") from . import hickle_legacy return hickle_legacy.load(fileobj, safe) else: raise RuntimeError("Cannot open file. This file was likely" " created with Python 2 and an old hickle version.") finally: # Close the file if requested. # Closing a file twice will not cause any problems if close_flag: h5f.close() def load_dataset(h_node): """ Load a dataset, converting into its correct python type Args: h_node (h5py dataset): h5py dataset object to read Returns: data: reconstructed python object from loaded data """ py_type = get_type(h_node) try: load_fn = load_dataset_lookup(py_type) return load_fn(h_node) except: raise #raise RuntimeError("Hickle type %s not understood." % py_type) def _load(py_container, h_group): """ Load a hickle file Recursive funnction to load hdf5 data into a PyContainer() Args: py_container (PyContainer): Python container to load data into h_group (h5 group or dataset): h5py object, group or dataset, to spider and load all datasets. 
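    Example (sketch of how load() drives this function; mirrors the code in
    load() above):

        py_container = PyContainer()
        py_container.container_type = 'hickle'
        py_container = _load(py_container, h_root_group)
        data = py_container[0][0]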
""" group_dtype = h5._hl.group.Group dataset_dtype = h5._hl.dataset.Dataset #either a file, group, or dataset if isinstance(h_group, (H5FileWrapper, group_dtype)): py_subcontainer = PyContainer() try: py_subcontainer.container_type = bytes(h_group.attrs['type'][0]) except KeyError: raise #py_subcontainer.container_type = '' py_subcontainer.name = h_group.name if py_subcontainer.container_type == b'dict_item': py_subcontainer.key_type = h_group.attrs['key_type'] if py_subcontainer.container_type not in types_not_to_sort: h_keys = sort_keys(h_group.keys()) else: h_keys = h_group.keys() for h_name in h_keys: h_node = h_group[h_name] py_subcontainer = _load(py_subcontainer, h_node) sub_data = py_subcontainer.convert() py_container.append(sub_data) else: # must be a dataset subdata = load_dataset(h_group) py_container.append(subdata) return py_container hickle-5.0.2/hickle/legacy_v3/loaders/000077500000000000000000000000001430361177200175255ustar00rootroot00000000000000hickle-5.0.2/hickle/legacy_v3/loaders/__init__.py000066400000000000000000000000461430361177200216360ustar00rootroot00000000000000from __future__ import absolute_importhickle-5.0.2/hickle/legacy_v3/loaders/load_astropy.py000066400000000000000000000203411430361177200225770ustar00rootroot00000000000000import numpy as np from astropy.units import Quantity from astropy.coordinates import Angle, SkyCoord from astropy.constants import Constant, EMConstant from astropy.table import Table from astropy.time import Time from ..helpers import get_type_and_data import six def create_astropy_quantity(py_obj, h_group, call_id=0, **kwargs): """ dumps an astropy quantity Args: py_obj: python object to dump; should be a python type (int, float, bool etc) h_group (h5.File.group): group to dump data into. call_id (int): index to identify object's relative location in the iterable. """ # kwarg compression etc does not work on scalars d = h_group.create_dataset('data_%i' % call_id, data=py_obj.value, dtype='float64') #, **kwargs) d.attrs["type"] = [b'astropy_quantity'] if six.PY3: unit = bytes(str(py_obj.unit), 'ascii') else: unit = str(py_obj.unit) d.attrs['unit'] = [unit] def create_astropy_angle(py_obj, h_group, call_id=0, **kwargs): """ dumps an astropy quantity Args: py_obj: python object to dump; should be a python type (int, float, bool etc) h_group (h5.File.group): group to dump data into. call_id (int): index to identify object's relative location in the iterable. """ # kwarg compression etc does not work on scalars d = h_group.create_dataset('data_%i' % call_id, data=py_obj.value, dtype='float64') #, **kwargs) d.attrs["type"] = [b'astropy_angle'] if six.PY3: unit = str(py_obj.unit).encode('ascii') else: unit = str(py_obj.unit) d.attrs['unit'] = [unit] def create_astropy_skycoord(py_obj, h_group, call_id=0, **kwargs): """ dumps an astropy quantity Args: py_obj: python object to dump; should be a python type (int, float, bool etc) h_group (h5.File.group): group to dump data into. call_id (int): index to identify object's relative location in the iterable. 
""" # kwarg compression etc does not work on scalars lat = py_obj.data.lat.value lon = py_obj.data.lon.value dd = np.stack((lon, lat), axis=-1) d = h_group.create_dataset('data_%i' % call_id, data=dd, dtype='float64') #, **kwargs) d.attrs["type"] = [b'astropy_skycoord'] if six.PY3: lon_unit = str(py_obj.data.lon.unit).encode('ascii') lat_unit = str(py_obj.data.lat.unit).encode('ascii') else: lon_unit = str(py_obj.data.lon.unit) lat_unit = str(py_obj.data.lat.unit) d.attrs['lon_unit'] = [lon_unit] d.attrs['lat_unit'] = [lat_unit] def create_astropy_time(py_obj, h_group, call_id=0, **kwargs): """ dumps an astropy Time object Args: py_obj: python object to dump; should be a python type (int, float, bool etc) h_group (h5.File.group): group to dump data into. call_id (int): index to identify object's relative location in the iterable. """ # kwarg compression etc does not work on scalars data = py_obj.value dtype = str(py_obj.value.dtype) # Need to catch string times if '": # Yuck. Convert numpy._bytes -> str -> bytes return [bytes(str(item, 'utf8'), 'utf8') for item in data] if py3_str_type == b"": return [str(item, 'utf8') for item in data] else: return list(data) def load_tuple_dataset(h_node): data = load_list_dataset(h_node) return tuple(data) def load_set_dataset(h_node): data = load_list_dataset(h_node) return set(data) def load_bytes_dataset(h_node): py_type, data = get_type_and_data(h_node) return bytes(data[0]) def load_string_dataset(h_node): py_type, data = get_type_and_data(h_node) return str(data[0]) def load_unicode_dataset(h_node): py_type, data = get_type_and_data(h_node) return unicode(data[0]) def load_none_dataset(h_node): return None def load_pickled_data(h_node): py_type, data = get_type_and_data(h_node) import dill as pickle return pickle.loads(data[0]) def load_python_dtype_dataset(h_node): py_type, data = get_type_and_data(h_node) subtype = h_node.attrs["python_subdtype"] type_dict = { b"": int, b"": float, b"": bool, b"": complex, "": int, "": float, "": bool, "": complex } tcast = type_dict.get(subtype) return tcast(data) types_dict = { list: create_listlike_dataset, tuple: create_listlike_dataset, set: create_listlike_dataset, bytes: create_stringlike_dataset, str: create_stringlike_dataset, #bytearray: create_stringlike_dataset, int: create_python_dtype_dataset, float: create_python_dtype_dataset, bool: create_python_dtype_dataset, complex: create_python_dtype_dataset, type(None): create_none_dataset, } hkl_types_dict = { b"" : load_list_dataset, b"" : load_tuple_dataset, b"" : load_set_dataset, b"bytes" : load_bytes_dataset, b"python_dtype" : load_python_dtype_dataset, b"string" : load_string_dataset, b"pickle" : load_pickled_data, b"none" : load_none_dataset, } hickle-5.0.2/hickle/legacy_v3/loaders/load_scipy.py000066400000000000000000000073561430361177200222400ustar00rootroot00000000000000import six import scipy from scipy import sparse from ..helpers import get_type_and_data def check_is_scipy_sparse_array(py_obj): """ Check if a python object is a scipy sparse array Args: py_obj: python object to check whether it is a sparse array Returns is_numpy (bool): Returns True if it is a sparse array, else False if it isn't """ t_csr = type(scipy.sparse.csr_matrix([0])) t_csc = type(scipy.sparse.csc_matrix([0])) t_bsr = type(scipy.sparse.bsr_matrix([0])) is_sparse = type(py_obj) in (t_csr, t_csc, t_bsr) return is_sparse def create_sparse_dataset(py_obj, h_group, call_id=0, **kwargs): """ dumps an sparse array to h5py file Args: py_obj: python object to dump; should be a 
numpy array or np.ma.array (masked) h_group (h5.File.group): group to dump data into. call_id (int): index to identify object's relative location in the iterable. """ h_sparsegroup = h_group.create_group('data_%i' % call_id) data = h_sparsegroup.create_dataset('data', data=py_obj.data, **kwargs) indices = h_sparsegroup.create_dataset('indices', data=py_obj.indices, **kwargs) indptr = h_sparsegroup.create_dataset('indptr', data=py_obj.indptr, **kwargs) shape = h_sparsegroup.create_dataset('shape', data=py_obj.shape, **kwargs) if isinstance(py_obj, type(sparse.csr_matrix([0]))): type_str = 'csr' elif isinstance(py_obj, type(sparse.csc_matrix([0]))): type_str = 'csc' elif isinstance(py_obj, type(sparse.bsr_matrix([0]))): type_str = 'bsr' if six.PY2: h_sparsegroup.attrs["type"] = [b'%s_matrix' % type_str] data.attrs["type"] = [b"%s_matrix_data" % type_str] indices.attrs["type"] = [b"%s_matrix_indices" % type_str] indptr.attrs["type"] = [b"%s_matrix_indptr" % type_str] shape.attrs["type"] = [b"%s_matrix_shape" % type_str] else: h_sparsegroup.attrs["type"] = [bytes(str('%s_matrix' % type_str), 'ascii')] data.attrs["type"] = [bytes(str("%s_matrix_data" % type_str), 'ascii')] indices.attrs["type"] = [bytes(str("%s_matrix_indices" % type_str), 'ascii')] indptr.attrs["type"] = [bytes(str("%s_matrix_indptr" % type_str), 'ascii')] shape.attrs["type"] = [bytes(str("%s_matrix_shape" % type_str), 'ascii')] def load_sparse_matrix_data(h_node): py_type, data = get_type_and_data(h_node) h_root = h_node.parent indices = h_root.get('indices')[:] indptr = h_root.get('indptr')[:] shape = h_root.get('shape')[:] if py_type == b'csc_matrix_data': smat = sparse.csc_matrix((data, indices, indptr), dtype=data.dtype, shape=shape) elif py_type == b'csr_matrix_data': smat = sparse.csr_matrix((data, indices, indptr), dtype=data.dtype, shape=shape) elif py_type == b'bsr_matrix_data': smat = sparse.bsr_matrix((data, indices, indptr), dtype=data.dtype, shape=shape) return smat class_register = [ [scipy.sparse.csr_matrix, b'csr_matrix_data', create_sparse_dataset, load_sparse_matrix_data, False, check_is_scipy_sparse_array], [scipy.sparse.csc_matrix, b'csc_matrix_data', create_sparse_dataset, load_sparse_matrix_data, False, check_is_scipy_sparse_array], [scipy.sparse.bsr_matrix, b'bsr_matrix_data', create_sparse_dataset, load_sparse_matrix_data, False, check_is_scipy_sparse_array], ] exclude_register = [] # Need to ignore things like csc_matrix_indices which are loaded automatically for mat_type in ('csr', 'csc', 'bsr'): for attrib in ('indices', 'indptr', 'shape'): hkl_key = "%s_matrix_%s" % (mat_type, attrib) if not six.PY2: hkl_key = hkl_key.encode('ascii') exclude_register.append(hkl_key) hickle-5.0.2/hickle/legacy_v3/lookup.py000066400000000000000000000164321430361177200177650ustar00rootroot00000000000000""" #lookup.py This file contains all the mappings between hickle/HDF5 metadata and python types. There are four dictionaries and one set that are populated here: 1) types_dict types_dict: mapping between python types and dataset creation functions, e.g. types_dict = { list: create_listlike_dataset, int: create_python_dtype_dataset, np.ndarray: create_np_array_dataset } 2) hkl_types_dict hkl_types_dict: mapping between hickle metadata and dataset loading functions, e.g. hkl_types_dict = { "" : load_list_dataset, "" : load_tuple_dataset } 3) container_types_dict container_types_dict: mapping required to convert the PyContainer object in hickle.py back into the required native type. 
PyContainer is required as some iterable types are immutable (do not have an append() function). Here is an example: container_types_dict = { "": list, "": tuple } 4) container_key_types_dict container_key_types_dict: mapping specifically for converting hickled dict data back into a dictionary with the same key type. While python dictionary keys can be any hashable object, in HDF5 a unicode/string is required for a dataset name. Example: container_key_types_dict = { "": str, "": unicode } 5) types_not_to_sort type_not_to_sort is a list of hickle type attributes that may be hierarchical, but don't require sorting by integer index. ## Extending hickle to add support for other classes and types The process to add new load/dump capabilities is as follows: 1) Create a file called load_[newstuff].py in loaders/ 2) In the load_[newstuff].py file, define your create_dataset and load_dataset functions, along with all required mapping dictionaries. 3) Add an import call here, and populate the lookup dictionaries with update() calls: # Add loaders for [newstuff] try: from .loaders.load_[newstuff[ import types_dict as ns_types_dict from .loaders.load_[newstuff[ import hkl_types_dict as ns_hkl_types_dict types_dict.update(ns_types_dict) hkl_types_dict.update(ns_hkl_types_dict) ... (Add container_types_dict etc if required) except ImportError: raise """ import six from ast import literal_eval def return_first(x): """ Return first element of a list """ return x[0] def load_nothing(h_hode): pass types_dict = {} hkl_types_dict = {} types_not_to_sort = [b'dict', b'csr_matrix', b'csc_matrix', b'bsr_matrix'] container_types_dict = { b"": list, b"": tuple, b"": set, b"": list, b"": tuple, b"": set, b"csr_matrix": return_first, b"csc_matrix": return_first, b"bsr_matrix": return_first } # Technically, any hashable object can be used, for now sticking with built-in types container_key_types_dict = { b"": literal_eval, b"": float, b"": bool, b"": int, b"": complex, b"": literal_eval, b"": literal_eval, b"": float, b"": bool, b"": int, b"": complex, b"": literal_eval } if six.PY2: container_key_types_dict[b""] = literal_eval container_key_types_dict[b""] = long # Add loaders for built-in python types if six.PY2: from .loaders.load_python import types_dict as py_types_dict from .loaders.load_python import hkl_types_dict as py_hkl_types_dict else: from .loaders.load_python3 import types_dict as py_types_dict from .loaders.load_python3 import hkl_types_dict as py_hkl_types_dict types_dict.update(py_types_dict) hkl_types_dict.update(py_hkl_types_dict) # Add loaders for numpy types from .loaders.load_numpy import types_dict as np_types_dict from .loaders.load_numpy import hkl_types_dict as np_hkl_types_dict from .loaders.load_numpy import check_is_numpy_array types_dict.update(np_types_dict) hkl_types_dict.update(np_hkl_types_dict) ####################### ## ND-ARRAY checking ## ####################### ndarray_like_check_fns = [ check_is_numpy_array ] def check_is_ndarray_like(py_obj): is_ndarray_like = False for ii, check_fn in enumerate(ndarray_like_check_fns): is_ndarray_like = check_fn(py_obj) if is_ndarray_like: break return is_ndarray_like ####################### ## loading optional ## ####################### def register_class(myclass_type, hkl_str, dump_function, load_function, to_sort=True, ndarray_check_fn=None): """ Register a new hickle class. 
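    Example (illustrative; MyClass and its helper functions are placeholder
    names for a user defined dump/load pair):

        register_class(MyClass, b'myclass',
                       create_MyClass_dataset, load_MyClass_dataset,
                       to_sort=True, ndarray_check_fn=None)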
Args: myclass_type type(class): type of class dump_function (function def): function to write data to HDF5 load_function (function def): function to load data from HDF5 is_iterable (bool): Is the item iterable? hkl_str (str): String to write to HDF5 file to describe class to_sort (bool): If the item is iterable, does it require sorting? ndarray_check_fn (function def): function to use to check if """ types_dict.update({myclass_type: dump_function}) hkl_types_dict.update({hkl_str: load_function}) if to_sort == False: types_not_to_sort.append(hkl_str) if ndarray_check_fn is not None: ndarray_like_check_fns.append(ndarray_check_fn) def register_class_list(class_list): """ Register multiple classes in a list Args: class_list (list): A list, where each item is an argument to the register_class() function. Notes: This just runs the code: for item in mylist: register_class(*item) """ for class_item in class_list: register_class(*class_item) def register_class_exclude(hkl_str_to_ignore): """ Tell loading function to ignore any HDF5 dataset with attribute 'type=XYZ' Args: hkl_str_to_ignore (str): attribute type=string to ignore and exclude from loading. """ hkl_types_dict[hkl_str_to_ignore] = load_nothing def register_exclude_list(exclude_list): """ Ignore HDF5 datasets with attribute type='XYZ' from loading ArgsL exclude_list (list): List of strings, which correspond to hdf5/hickle type= attributes not to load. """ for hkl_str in exclude_list: register_class_exclude(hkl_str) ######################## ## Scipy sparse array ## ######################## try: from .loaders.load_scipy import class_register, exclude_register register_class_list(class_register) register_exclude_list(exclude_register) except ImportError: pass except NameError: pass #################### ## Astropy stuff ## #################### try: from .loaders.load_astropy import class_register register_class_list(class_register) except ImportError: pass ################## ## Pandas stuff ## ################## try: from .loaders.load_pandas import class_register register_class_list(class_register) except ImportError: pass hickle-5.0.2/hickle/loaders/000077500000000000000000000000001430361177200156515ustar00rootroot00000000000000hickle-5.0.2/hickle/loaders/__init__.py000066400000000000000000000020121430361177200177550ustar00rootroot00000000000000# names all optional loaders defined by any load_*.py file # will be extended by any optional loader managed by hickle # core engine. Names of optional_loaders must be all lower case. # Corresponding option attributes in hickle file will be all # upper case. optional_loaders = { # option loader for defining custom loader methods and # PyContainer classes. By marking them as custom option # they are only activate if specified by a call to # hickle.dump. If not specified than custom objects and # classes will simply be stored as pickle string. # The data may in this case not be recoverable if # underlying classes are not available or not compatible # any more due to disruptive changes. When dumped using # custom loader hickle at least can try to restore data # as numpy.array or python dict like structure with metadata # attached as is for further inspection. 
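    # Example (illustrative): the loader is switched on per dump call, e.g.
    #   hickle.dump(my_obj, 'my_file.hkl', 'w', options={'custom': True})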
'custom', } # prefix for optional_loaders attribute names which are all # uppercase attribute_prefix = "OPTION_" hickle-5.0.2/hickle/loaders/load_astropy.py000066400000000000000000000240071430361177200207260ustar00rootroot00000000000000# %% IMPORTS # Package imports from astropy.coordinates import Angle, SkyCoord from astropy.constants import Constant from astropy.table import Table from astropy.time import Time from astropy.units import Quantity import numpy as np # hickle imports from hickle.helpers import no_compression,load_str_list_attr_ascii,load_str_attr_ascii # %% FUNCTION DEFINITIONS def create_astropy_quantity(py_obj, h_group, name, **kwargs): """ dumps an astropy quantity Parameters ---------- py_obj: python object to dump; should be a python type h_group (h5.File.group): group to dump data into. name (str): the name of the resulting dataset kwargs (dict): keyword arguments to be passed to create_dataset function Returns ------- tuple containing h5py.Dataset representing astropy quantity and empty subitems """ d = h_group.create_dataset(name, data=py_obj.value, dtype='float64', **no_compression(kwargs)) d.attrs['unit'] = py_obj.unit.to_string().encode('ascii') return d,() def create_astropy_angle(py_obj, h_group, name, **kwargs): """ dumps an astropy angle Parameters ---------- py_obj: python object to dump; should be a python type h_group (h5.File.group): group to dump data into. name (str): the name of the resulting dataset kwargs (dict): keyword arguments to be passed to create_dataset function Returns ------- tuple containing h5py.Dataset representing astropy angle and empty subitems """ d = h_group.create_dataset(name, data=py_obj.value, dtype='float64', **no_compression(kwargs)) d.attrs['unit'] = py_obj.unit.to_string().encode('ascii') return d,() def create_astropy_skycoord(py_obj, h_group, name, **kwargs): """ dumps an astropy SkyCoord object Parameters ---------- py_obj: python object to dump; should be a python type h_group (h5.File.group): group to dump data into. name (str): the name of the resulting dataset kwargs (dict): keyword arguments to be passed to create_dataset function Returns ------- tuple containing h5py.Dataset representing astorpy SkyCoord and empty subitems """ lon = py_obj.data.lon.value lat = py_obj.data.lat.value dd = np.stack((lon, lat), axis=-1) d = h_group.create_dataset(name, data=dd, dtype='float64', **kwargs) lon_unit = py_obj.data.lon.unit.to_string().encode('ascii') lat_unit = py_obj.data.lat.unit.to_string().encode('ascii') d.attrs['lon_unit'] = lon_unit d.attrs['lat_unit'] = lat_unit return d,() def create_astropy_time(py_obj, h_group, name, **kwargs): """ dumps an astropy Time object Parameters ---------- py_obj: python object to dump; should be a python type h_group (h5.File.group): group to dump data into. name (str): the name of the resulting dataset kwargs (dict): keyword arguments to be passed to create_dataset function Returns ------- tuple containing h5py.Dataset representing astropy time and empty subitems """ # Need to catch string times, e.g. 1999-01-01T00:00:00.123 # Must be encoded into bytes. 
if 'str' in py_obj.value.dtype.name: bytes_dtype_str = py_obj.value.dtype.str.replace('= 3.x 'float': float, 'bool': bool, 'int': int, 'complex': complex, 'NoneType': lambda x : None } # %% FUNCTION DEFINITIONS def create_scalar_dataset(py_obj, h_group, name, **kwargs): """ dumps a python dtype object to h5py file Parameters ---------- py_obj (object): python object to dump; should be a scalar (int, float, bool, str, etc) h_group (h5.File.group): group to dump data into. name (str): the name of the resulting dataset kwargs (dict): keyword arguments to be passed to create_dataset function Returns ------- tuple containing corresponding h5py.Dataset and empty subitems list """ # If py_obj is an integer and cannot be stored in 64-bits, convert to str # the sign is not counted by bit_length thus any integer which has more than # 63 bits has to be converted into string if isinstance(py_obj, int) and (py_obj.bit_length() > 63):# and ( py_obj < -2**63 or py_obj >= 2**63 ) : return h_group.create_dataset(name,data = bytearray(str(py_obj), 'ascii'),**kwargs),() return h_group.create_dataset(name, data=py_obj, **no_compression(kwargs)),() def create_none_dataset(py_obj, h_group, name, **kwargs): """ Dump None type to file Parameters ---------- py_obj (NoneType): python object to dump; must be None object h_group (h5.File.Group): group to dump data into. name (str): the name of the resulting dataset kwargs (dict): keyword arguments to be passed to create_dataset function Returns ------- tuple containing corresponding empty h5py.Dataset and empty subitems list """ return h_group.create_dataset(name, shape = None,dtype = 'V1',**no_compression(kwargs)),() def check_iterable_item_type(first_item,iter_obj): """ checks if for all items of an iterable sequence (list, tuple, etc.) a least common dtype exists to which all items can be safely be casted. Parameters ---------- first_item: the first item of the iterable sequence used to initialize the dtype iter_obj: the remaining items of the iterable sequence Returns ------- the least common dtype or none if not all items can be casted """ if ( operator.length_hint(first_item) > 1 or ( operator.length_hint(first_item) == 1 and not isinstance(first_item,(str,bytes)) ) or np.ndim(first_item) != 0 ): return None dtype = np.dtype(first_item.__class__) if dtype.name == 'object' or 'str' in dtype.name or ( 'bytes' in dtype.name and len(first_item) > 1): return None for item in iter_obj: if np.ndim(item) != 0: return None common_dtype = np.result_type(np.dtype(item.__class__),dtype) if ( common_dtype.name == 'object' or 'str' in common_dtype.name or ( 'bytes' in common_dtype.name and len(item) > 1 ) ): return None if dtype != common_dtype: dtype = common_dtype return dtype def create_listlike_dataset(py_obj, h_group, name,list_len = -1,item_dtype = None, **kwargs): """ Dumper for list, set, tuple Parameters ---------- py_obj (list, set, tuple, ...): python object to dump; should be list-like h_group (h5.File.group): group to dump data into. name (str): the name of the resulting dataset kwargs (dict): keyword arguments to be passed to create_dataset function Returns ------- tuple containing h5py.Group or h5py.Dataset representing list-like object and a list of subitems to be stored within this group. 
In case of h5py.Dataset returned this list is always empty """ if isinstance(py_obj,(str,bytes)): # strings and bytes are stored as array of bytes with strings encoded # using utf8 encoding string_data = bytearray(py_obj,"utf8") if isinstance(py_obj,str) else memoryview(py_obj) string_data = np.array(string_data,copy=False) string_data.dtype = 'S1' dataset = h_group.create_dataset( name, data = string_data,shape = (1,string_data.size), **kwargs) dataset.attrs["str_type"] = py_obj.__class__.__name__.encode("ascii") return dataset,() if len(py_obj) < 1: # list-like object is empty just store empty dataset return h_group.create_dataset(name,shape=None,dtype='int',**no_compression(kwargs)),() if list_len < 0: # neither length nor dtype of items is known compute them now item_dtype = check_iterable_item_type(py_obj[0],py_obj[1:]) list_len = len(py_obj) if item_dtype or list_len < 1: # create a dataset and map all items to least common dtype shape = (list_len,) if list_len > 0 else None dataset = h_group.create_dataset(name,shape = shape,dtype = item_dtype,**kwargs) for index,item in enumerate(py_obj,0): dataset[index] = item_dtype.type(item) return dataset,() # create group and provide generator yielding all subitems to be stored within item_name = "data{:d}" def provide_listlike_items(): for index,item in enumerate(py_obj,0): yield item_name.format(index),item,{"item_index":index},kwargs h_subgroup = h_group.create_group(name) h_subgroup.attrs["num_items"] = list_len return h_subgroup,provide_listlike_items() def create_setlike_dataset(py_obj,h_group,name,**kwargs): """ Creates a dataset or group for set-like objects. Parameters ---------- py_obj (set, ...): python object to dump; should be set-like h_group (h5.File.group): group to dump data into. name (str): the name of the resulting dataset kwargs (dict): keyword arguments to be passed to create_dataset function Returns ------- tuple containing h5py.Group or h5py.Dataset representing set-like object and a list of subitems to be stored within this group. In case of h5py.Dataset this list is always empty """ # set objects do not support indexing thus determination of item dtype has to # be handled specially. Call create_listlike_dataset for proper creation # of corresponding dataset if not py_obj: # dump empty set return h_group.create_dataset( name, data = list(py_obj), shape = None, dtype = int, **no_compression(kwargs) ),() set_iter = iter(py_obj) first_item = next(set_iter) item_dtype = check_iterable_item_type(first_item,set_iter) return create_listlike_dataset( py_obj, h_group, name, list_len = len(py_obj), item_dtype = item_dtype, **kwargs ) _byte_slashes = re.compile(b'[\\/]') _str_slashes = re.compile(r'[\\/]') def create_dictlike_dataset(py_obj, h_group, name, **kwargs): """ Creates a data group for each key in dictionary Parameters ---------- py_obj (dict): python object to dump; should be dictionary h_group (h5.File.group): group to dump data into. name (str): h5 node name kwargs (dict): keyword arguments to be passed to create_dataset function Returns ------- tuple containing h5py.Group or h5py.Dataset representing dict-like object and a list of subitems to be stored within this group. 
In case of h5py.Dataset this list is always empty """ h_dictgroup = h_group.create_group(name) key_value_pair_name = "data{:d}" def package_dict_items(): """ generator yielding appropriate parameters for dumping each dict key value pair """ for idx, (key, py_subobj) in enumerate(py_obj.items()): # Obtain the raw string representation of this key key_base_type = key.__class__.__name__.encode("utf8") if isinstance(key,str): if not _str_slashes.search(key): yield r'"{}"'.format(key),py_subobj,{'key_idx':idx,'key_base_type':key_base_type},kwargs continue elif isinstance(key,bytes): if not _byte_slashes.search(key): try: h_key = key.decode("utf8") except UnicodeError: # pragma no cover pass else: yield r'b"{}"'.format(h_key),py_subobj,{'key_idx':idx,'key_base_type':key_base_type},kwargs continue elif key_base_type in dict_key_types_dict: h_key = "{!r}".format(key) if not _str_slashes.search(h_key): yield h_key,py_subobj,{'key_idx':idx,'key_base_type':key_base_type},kwargs continue sub_node_name = key_value_pair_name.format(idx) yield sub_node_name,(key,py_subobj),{'key_idx':idx,'key_base_type':b'key_value'},kwargs return h_dictgroup,package_dict_items() def load_scalar_dataset(h_node, base_type, py_obj_type): """ loads scalar dataset Parameters ---------- h_node (h5py.Dataset): the hdf5 node to load data from base_type (bytes): bytes string denoting base_type py_obj_type (type): final type of restored scalar Returns ------- resulting python object of type py_obj_type """ data = h_node[()] if h_node.size < 2 else memoryview(h_node[()]) return py_obj_type(data) if data.__class__ is not py_obj_type else data def load_none_dataset(h_node,base_type,py_obj_type): """ returns None value as represented by underlying empty dataset Parameters ---------- h_node (h5py.Dataset): the hdf5 node to load data from base_type (bytes): bytes string denoting base_type py_obj_type (NoneType): final type of restored scalar Returns ------- None """ return None def load_list_dataset(h_node,base_type,py_obj_type): """ loads any kind of list like dataset Parameters ---- h_node (h5py.Dataset): the hdf5 node to load data from base_type (bytes): bytes string denoting base_type py_obj_type (list, tuple, set, ...): final type of restored object Returns ------- resulting python object of type py_obj_type """ if h_node.shape is None: # empty list tuple or set just return new instance of py_obj_type return py_obj_type() if isinstance(py_obj_type,tuple) else py_obj_type(()) str_type = h_node.attrs.get('str_type', None) content = h_node[()] if str_type in (b'str','str'): # decode bytes representing python string before final conversion if h_node.dtype.itemsize > 1 and 'bytes' in h_node.dtype.name: # string dataset 4.0.x style convert it back to python string content = np.array(content, copy=False, dtype=str).tolist() else: # decode bytes representing python string before final conversion content = bytes(content).decode("utf8") return py_obj_type(content) if content.__class__ is not py_obj_type else content def load_hickle_4_x_string(h_node,base_type,py_obj_type): """ loads dataset representing python string stored by hickle 4.x Parameters ---------- h_node (h5py.Dataset): the hdf5 node to load data from base_type (bytes): bytes string denoting base_type py_obj_type (str): final type of restored string Returns ------- resulting python str or bytes object as specified by py_obj_type """ if not 'object' in h_node.dtype.name or h_node.attrs.get('str_type',None) is not None: return load_list_dataset(h_node,base_type,py_obj_type) 
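    # past the guard above the dataset has object dtype and no str_type
    # attribute: h5py returns its value as a python str or bytes object,
    # so only a decode or cast to py_obj_type is required below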
content = h_node[()] if py_obj_type is str: return content if isinstance(content,str) else content.decode('utf8') return py_obj_type(content) if content.__class__ is not py_obj_type else content class ListLikeContainer(PyContainer): """ PyContainer for all list like objects exempt set """ __slots__ = () # regular expression used to extract index value from name of group or dataset # representing subitem appended to the final list extract_index = re.compile(r'\d+$') # as None can and may be a valid list entry define an alternative marker for # missing items and indices def __init__(self,h5_attrs,base_type,object_type): # if number of items is defined upon group resize content to # at least match this amount of subitems num_items = h5_attrs.get('num_items',0) super(ListLikeContainer,self).__init__(h5_attrs,base_type,object_type,_content = [nobody_is_my_name] * num_items) def append(self,name,item,h5_attrs): # load item index from attributes if known else extract it from name index = h5_attrs.get("item_index",None) if index is None: index_match = self.extract_index.search(name) if index_match is None: if item is nobody_is_my_name: # dummy data injected likely by load_nothing, ignore it return raise KeyError("List like item name '{}' not understood".format(name)) index = int(index_match.group(0)) # if index exceeds capacity of extend list appropriately if len(self._content) <= index: self._content.extend([nobody_is_my_name] * ( index - len(self._content) + 1 )) if self._content[index] is not nobody_is_my_name: raise IndexError("Index {} already set".format(index)) self._content[index] = item def convert(self): return self._content if self.object_type is self._content.__class__ else self.object_type(self._content) class SetLikeContainer(PyContainer): """ PyContainer for all set like objects. 
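    Subitems are accumulated via set.add() in append(); convert() casts the
    collected set to the stored object_type whenever that differs from set.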
""" __slots__ = () def __init__(self,h5_attrs, base_type, object_type): super(SetLikeContainer,self).__init__(h5_attrs,base_type,object_type,_content=set()) def append(self,name,item,h5_attrs): self._content.add(item) def convert(self): return self._content if self._content.__class__ is self.object_type else self.object_type(self._content) class DictLikeContainer(PyContainer): """ PyContainer for all dict like objects """ __slots__ = () _swap_key_slashes = re.compile(r"\\") def append(self,name,item,h5_attrs): key_base_type = h5_attrs.get('key_base_type',b'') if key_base_type in ( b'str','str'): item = ( name[1:-1] if name[0] == '"' else self._swap_key_slashes.sub(r'/',name)[1:-1], item ) elif key_base_type in (b'bytes','bytes'): item = ( name[2:-1].encode("utf8") if name[:2] == 'b"' else self._swap_key_slashes.sub(r'/',name)[1:-1], item ) elif not key_base_type in (b'key_value','key_value'): load_key = dict_key_types_dict.get(key_base_type,None) if load_key is None: if key_base_type not in {b'tuple','tuple'}: raise ValueError("key type '{}' not understood".format(key_base_type.decode("utf8"))) import ast load_key = ast.literal_eval item = ( load_key(self._swap_key_slashes.sub(r'/',name)), item ) key_index = h5_attrs.get('key_idx',None) if key_index is None: if item[1] is nobody_is_my_name: # dummy data injected most likely by load_nothing ignore it return raise KeyError("invalid dict item key_index missing") if len(self._content) <= key_index: self._content.extend([nobody_is_my_name] * ( key_index - len(self._content) + 1)) if self._content[key_index] is not nobody_is_my_name: raise IndexError("Key index {} already set".format(key_index)) self._content[key_index] = item def convert(self): return self.object_type(self._content) # %% REGISTERS class_register = [ [list, b"list", create_listlike_dataset, load_list_dataset,ListLikeContainer], [tuple, b"tuple", create_listlike_dataset, load_list_dataset,ListLikeContainer], [dict, b"dict",create_dictlike_dataset,None,DictLikeContainer], [set, b"set", create_setlike_dataset, load_list_dataset,SetLikeContainer], [bytes, b"bytes", create_listlike_dataset, load_list_dataset], [str, b"str", create_listlike_dataset, load_list_dataset], [str, b"str", None, load_hickle_4_x_string,None,True,'hickle-4.x'], [int, b"int", create_scalar_dataset, load_scalar_dataset, None, False], [float, b"float", create_scalar_dataset, load_scalar_dataset, None, False], [complex, b"complex", create_scalar_dataset, load_scalar_dataset, None, False], [bool, b"bool", create_scalar_dataset, load_scalar_dataset, None, False], [None.__class__, b"None", create_none_dataset, load_none_dataset, None, False] ] exclude_register = [] hickle-5.0.2/hickle/loaders/load_numpy.py000066400000000000000000000251251430361177200203770ustar00rootroot00000000000000# encoding: utf-8 """ # load_numpy.py Utilities and dump / load handlers for handling numpy and scipy arrays """ # %% IMPORTS # Package imports import numpy as np import types # hickle imports from hickle.loaders.load_builtins import create_listlike_dataset,ListLikeContainer from hickle.helpers import PyContainer,no_compression # %% FUNCTION DEFINITIONS def create_np_scalar_dataset(py_obj, h_group, name, **kwargs): """ dumps an numpy.dtype object to h5py file Parameters ---------- py_obj (numpy.scalar): python object to dump; should be a numpy scalar, e.g. numpy.float16(1) h_group (h5.File.group): group to dump data into. 
name (str): the name of the resulting dataset kwargs (dict): keyword arguments to be passed to create_dataset function Returns ------- tuple containing h5py.Dataset and empty list of subitems """ d = h_group.create_dataset(name, data=py_obj, **no_compression(kwargs)) d.attrs["np_dtype"] = py_obj.dtype.str.encode("ascii") return d,() def create_np_dtype(py_obj, h_group, name, **kwargs): """ dumps an numpy dtype object to h5py file Parameters ---------- py_obj (numpy.dtype): python object to dump; should be a numpy dtype, e.g. numpy.float16 h_group (h5.File.group): group to dump data into. name (str): the name of the resulting dataset kwargs (dict): keyword arguments to be passed to create_dataset function Returns ------- tuple containing h5py.Dataset and empty list of subitems """ d = h_group.create_dataset(name, data=bytearray(py_obj.str,"ascii"), **kwargs) return d,() def create_np_array_dataset(py_obj, h_group, name, **kwargs): """ dumps an ndarray object to h5py file Parameters ---------- py_obj (numpy.ndarray): python object to dump; should be a numpy.ndarray or numpy.ma.array (masked) h_group (h5.File.group): group to dump data into. name (str): the name of the resulting dataset or group kwargs (dict): keyword arguments to be passed to create_dataset function Returns ------- tuple containing h5py.Datset and empty list of subitems or h5py.Group and iterable of subitems """ # Obtain dtype of py_obj dtype = py_obj.dtype # Check if py_obj contains strings if "str" in dtype.name: if py_obj.ndim < 1: # convert string to utf8 encoded bytearray string_data = bytearray(py_obj.item(),"utf8") if 'bytes' not in dtype.name else memoryview(py_obj.item()) string_data = np.array(string_data,copy = False) string_data.dtype = 'S1' h_node = h_group.create_dataset(name,data = string_data,shape=(1,string_data.size),**kwargs) sub_items = () else: # store content as list of strings h_node,sub_items = create_listlike_dataset(py_obj.tolist(), h_group, name, **kwargs) elif dtype.name == 'object': # If so, convert py_obj to list py_obj = py_obj.tolist() # Check if py_obj is a list if isinstance(py_obj, list): # If so, dump py_obj into the current group h_node,sub_items = create_listlike_dataset(py_obj, h_group, name, **kwargs) else: # If not, create a new group and dump py_obj into that h_node = h_group.create_group(name) sub_items = ("data",py_obj,{},kwargs), else: h_node = h_group.create_dataset( name, data=py_obj, **( no_compression(kwargs) if "bytes" in dtype.name else kwargs ) ) sub_items = () h_node.attrs['np_dtype'] = dtype.str.encode('ascii') return h_node,sub_items def create_np_masked_array_dataset(py_obj, h_group, name, **kwargs): """ dumps an numpy.ma.core.MaskedArray object to h5py file Parameters ---------- py_obj (numpy.ma.array): python object to dump; should be a numpy.ndarray or numpy.ma.array (masked) h_group (h5.File.group): group to dump data into. 
name (str): the name of the resulting dataset or group kwargs (dict): keyword arguments to be passed to create_dataset function Returns ------- tuple containing h5py.Group and subitems list representing masked array contents: """ # Obtain dtype of py_obj h_node = h_group.create_group(name) h_node.attrs['np_dtype'] = py_obj.dtype.str.encode('ascii') return h_node,(("data",py_obj.data,{},kwargs),('mask',py_obj.mask,{},kwargs)) def load_np_dtype_dataset(h_node,base_type,py_obj_type): """ restores dtype from dataset Parameters ---------- h_node (h5py.Dataset): the hdf5 node to load data from base_type (bytes): bytes string denoting base_type py_obj_type (numpy.dtype): final type of restored dtype Returns ------- resulting numpy.dtype """ return np.dtype(bytes(h_node[()])) def load_np_scalar_dataset(h_node,base_type,py_obj_type): """ restores scalar value from dataset Parameters ---------- h_node (h5py.Dataset): the hdf5 node to load data from base_type (bytes): bytes string denoting base_type py_obj_type (numpy.dtype): final type of restored dtype Returns ------- resulting numpy.scalar """ dtype = np.dtype(h_node.attrs["np_dtype"]) return dtype.type(h_node[()]) def load_ndarray_dataset(h_node,base_type,py_obj_type): """ restores ndarray like object from dataset Parameters ---------- h_node (h5py.Dataset): the hdf5 node to load data from base_type (bytes): bytes string denoting base_type py_obj_type (numpy.ndarray, numpy.ma.array, ...): final type of restored array Returns ------- resulting numpy.ndarray, numpy.ma.array """ dtype = np.dtype(h_node.attrs['np_dtype']) if "str" in dtype.name: string_data = h_node[()] if h_node.dtype.itemsize <= 1 or 'bytes' not in h_node.dtype.name: # in hickle 4.0.X numpy.ndarrays containing multiple strings are # not converted to list of string but saved as ar consequently # itemsize of dtype is > 1 string_data = bytes(string_data).decode("utf8") return np.array(string_data,copy=False,dtype=dtype) if issubclass(py_obj_type,np.matrix): return py_obj_type(data=h_node[()],dtype=dtype) # TODO how to restore other ndarray derived object_types # simply using classname for casting does not work, in # case they use the same interface like numpy.ndarray return np.array(h_node[()], dtype=dtype) def load_ndarray_masked_dataset(h_node,base_type,py_obj_type): """ restores masked array from data and mask datasets as stored by hickle version 4.0.0 Parameters ---------- h_node (h5py.Dataset): the hdf5 node to load data from base_type (bytes): bytes string denoting base_type py_obj_type (numpy.ndarray, numpy.ma.array, ...): final type of restored array Returns ------- resulting numpy.ndarray, numpy.ma.array """ masked_array = NDMaskedArrayContainer(h_node.attrs,base_type,py_obj_type) masked_array.append('data',h_node[()],h_node.attrs), mask_path = "{}_mask".format(h_node.name) h_root = h_node.parent h_node_mask = h_root.get(mask_path,None) if h_node_mask is None: raise ValueError("mask not found") masked_array.append('mask',h_node_mask,h_node_mask.attrs) return masked_array.convert() class NDArrayLikeContainer(ListLikeContainer): """ PyContainer used to restore complex ndarray from h5py.Group node """ __slots__ = () def append(self,name,item,h5_attrs): # if group contains only one item which either has been # dumped using create_pickled_dataset or its name reads # data than assume single non list-type object otherwise # pass item on to append method of ListLikeContainer if h5_attrs.get("base_type",'') == b'pickle' or name == "data": self._content = item else: 
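            # indexed subitem of a list-like ndarray payload: delegate to
            # ListLikeContainer.append which slots it in by item_index/name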
super(NDArrayLikeContainer,self).append(name,item,h5_attrs) def convert(self): data = np.array(self._content,dtype = self._h5_attrs['np_dtype']) return data if data.__class__ is self.object_type or isinstance(self.object_type,types.LambdaType) else self.object_type(data) class NDMaskedArrayContainer(PyContainer): """ PyContainer used to restore masked array stored as dedicated h5py.Group """ __slots__ = () def __init__(self,h5_attrs,base_type,object_type): super(NDMaskedArrayContainer,self).__init__(h5_attrs,base_type,object_type,_content = {}) def append(self,name,item,h5_attrs): self._content[name] = item def convert(self): dtype = self._h5_attrs['np_dtype'] data = np.ma.array(self._content['data'], mask=self._content['mask'], dtype=dtype) return data if data.__class__ is self.object_type or isinstance(self.object_type,types.LambdaType) else self.object_type(data) ##################### # Lookup dictionary # ##################### # %% REGISTERS class_register = [ [np.dtype, b"np_dtype", create_np_dtype, load_np_dtype_dataset], [np.number, b"np_scalar", create_np_scalar_dataset, load_np_scalar_dataset,None,False], # for all scalars which are not derived from numpy.number which itself is numpy.generic subclass # to properly catch and handle they will be caught by the following [np.generic, b"np_scalar", create_np_scalar_dataset, load_np_scalar_dataset,None,False], [np.ndarray, b"ndarray", create_np_array_dataset, load_ndarray_dataset,NDArrayLikeContainer], [np.ma.core.MaskedArray, b"ndarray_masked", create_np_masked_array_dataset, None,NDMaskedArrayContainer], # NOTE: The following is load only # just needed to link old ndarray_masked_data base_type to load_ndarray_masked_dataset # loader module selection will be triggered by numpy.ma.core.MaskedArray object_type anyway # but base_type is used to select proper load_function [np.ma.core.MaskedArray, b"ndarray_masked_data",None , load_ndarray_masked_dataset,None,False,'hickle-4.x'], # NOTE: numpy.matrix is obsolete and just an alias for numpy.ndarray therefore # to keep things simple numpy.matrix will be handled by same functions as # numpy.ndarray. As long as just required cause numpy.ma.core.MaskedArray # uses it for data [np.matrix, b"np_matrix", create_np_array_dataset, load_ndarray_dataset] ] exclude_register = [ (b"ndarray_masked_mask",'hickle-4.x') ] hickle-5.0.2/hickle/loaders/load_pandas.py000066400000000000000000000002041430361177200204640ustar00rootroot00000000000000import pandas as pd print("pandas",pd.__version__) # TODO: populate with classes to load class_register = [] exclude_register = [] hickle-5.0.2/hickle/loaders/load_scipy.py000066400000000000000000000101401430361177200203450ustar00rootroot00000000000000# %% IMPORTS # Package imports import pickle import numpy as np import scipy import copy from scipy import sparse # hickle imports from hickle.helpers import PyContainer,H5NodeFilterProxy from hickle.loaders.load_numpy import load_ndarray_dataset,create_np_array_dataset # %% FUNCTION DEFINITIONS def return_first(x): """ Dummy function used as place holder type in loading legacy hickle 4.x files """ raise TypeError("'return_first' not callable and deprecated. Create and use PyContainer instead.") def create_sparse_dataset(py_obj, h_group, name, **kwargs): """ dumps an sparse array to h5py file Parameters ---------- py_obj (scipy.sparse.csr_matrix,scipy.sparse.csc_matrix, scipy.sparse.bsr_matrix): python object to dump h_group (h5.File.group): group to dump data into. 
name (str): the name of the resulting dataset kwargs (dict): keyword arguments to be passed to create_dataset function Returns: Group and list of subitems to dump into """ h_sparsegroup = h_group.create_group(name) return h_sparsegroup,( ('data',py_obj.data,{},kwargs), ('indices',py_obj.indices,{},kwargs), ('indptr',py_obj.indptr,{},kwargs), ('shape',py_obj.shape,{},kwargs) ) ndarray_type_string = pickle.dumps(np.ndarray) tuple_type_string = pickle.dumps(tuple) class SparseMatrixContainer(PyContainer): """ PyContainer used to restore sparse Matrix """ # instance attribute shadowing class method of same name # points per default to shadowed method __slots__ = ('filter',) _index_name_map = { 'data':0, 'indices':1, 'indptr':2, 'shape':3 } def __init__(self,h5_attrs, base_type, object_type): super(SparseMatrixContainer,self).__init__(h5_attrs,base_type,object_type,_content = [None]*4) # in case object type is return_first (hickle 4.x file) than switch filter # to redirect loading of sub items to numpy ndarray type. Otherwise set to # PyContainer.filter method if object_type is return_first: self.filter = self._redirect_to_ndarray else: self.filter = super(SparseMatrixContainer,self).filter def _redirect_to_ndarray(self,h_parent): """ iterates through items and extracts effective object and basetype of sparse matrix from data subitem and remaps all subitems to ndarray type exempt shape which is remapped to tuple """ for name,item in h_parent.items(): item = H5NodeFilterProxy(item) if name == 'data': self.object_type = pickle.loads(item.attrs['type']) self.base_type = item.attrs['base_type'] np_dtype = item.attrs.get('np_dtype',None) if np_dtype is None: item.attrs['np_dtype'] = item.dtype.str.encode('ascii') elif name not in self._index_name_map.keys(): continue # ignore name if name == "shape": item.attrs['type'] = np.array(tuple_type_string) item.attrs['base_type'] = b'tuple' else: item.attrs['type'] = np.array(ndarray_type_string) item.attrs['base_type'] = b'ndarray' np_dtype = item.attrs.get('np_dtype',None) if np_dtype is None: item.attrs['np_dtype'] = item.dtype.str.encode('ascii') yield name,item def append(self,name,item,h5_attrs): index = self._index_name_map.get(name,None) self._content[index] = item def convert(self): return self.object_type(tuple(self._content[:3]),dtype=self._content[0].dtype,shape=self._content[3]) # %% REGISTERS class_register = [ [scipy.sparse.csr_matrix, b'csr_matrix', create_sparse_dataset, None, SparseMatrixContainer], [scipy.sparse.csc_matrix, b'csc_matrix', create_sparse_dataset, None, SparseMatrixContainer], [scipy.sparse.bsr_matrix, b'bsr_matrix', create_sparse_dataset, None, SparseMatrixContainer] ] exclude_register = [] hickle-5.0.2/hickle/lookup.py000066400000000000000000002031251430361177200161060ustar00rootroot00000000000000# encoding: utf-8 """ #lookup.py This file manages all the mappings between hickle/HDF5 metadata and python types. There are three dictionaries that are populated by the LoaderManager associated with each file created or loaded: 1) types_dict Mapping between python types and dataset and group creation functions, e.g. types_dict = { list: (create_listlike_dataset, 'list'), int: (create_python_dtype_dataset, 'int'), np.ndarray: (create_np_array_dataset, 'ndarray'), } 2) hkl_types_dict Mapping between hickle metadata and dataset loading functions, e.g. hkl_types_dict = { 'list': load_list_dataset, 'tuple': load_tuple_dataset } 3) hkl_container_dict Mapping between hickle metadata and group container classes, e.g. 
hkl_contianer_dict = { 'list': ListLikeContainer, 'tuple': TupleLikeContainer, 'dict': DictLikeContainer } ## Extending hickle to add support for other classes and types The process to add new load/dump capabilities is as follows: 1) Create a file called load_[newmodule/newpackage].py 2) In the load_[newmodule/newpackage].py file, define your create_dataset, load_dataset functions and PyContainer objects, along with the 'class_register' and 'exclude_register' tables. See the other loaders in /dist_packages/hickle/loaders directory for examples. The columns in register_class table correspond to argument list of hickle.lookup.LoaderManager.register_class method. 3) store the load_[newmodule/newpackage].py file in one of the following locations /dist_packages/hickle/loaders/ the loaders supported by hickle globally /dist_packages/hickle_loaders/ loaders installed during installation of additional single file python modules through pip or legacy os installer /dist_packages/[newpackage]/hickle_loaders/ loaders provided by [newpackage] installed through pip or legacy os installer [MyPackage]/hickle_loaders/ loaders specific for objects and classes defined by the modules of [MyPackage] package basedir([MyModule|MyApplication])/hickle_loaders loaders specific for object and classes defined by [MyModule|MyApplication].py file not part of a python package Loader for a single object or class can also be created by calling hickle.lookup.register_class prior to calling hickle.dump and hickle.load 1) create a `create_[MyClass]_dataset` function, a `load_[MyClass]_fcn` function and/or `[MyClass]Container` class for [MyClass] class to be dumped and loaded by hickle. Examples can be found in the loader modules in /dist_packages/hickle/loaders directory 2) call `hickle.lookup.LoaderManager.register_class` method as follows from your code ``` from hickle.lookup import LoaderManager LoaderManager.register_class( [MyClass],'', create_[MyClass]_dataset, load_[MyClass]_fcnt, # or None if [MyClass] is mapped to h5py.Group only [MyClass]Container , # or None if [MyClass] is mapped to h5py.Dataset only True, # if False [MyClass] object will be stored explicitly on any occurrence 'custom', # set to None to enforce unconditional use of loader ) hickle.dump('my_[MyClass]_object,'myfile.hkl','w',options={'custom':True}) new_[MyClass]_object = hickle.load('myfile.hkl') ``` """ # %% IMPORTS # Built-in imports import sys import warnings import types import re import weakref import os.path from importlib.util import find_spec, module_from_spec,spec_from_file_location,spec_from_loader from importlib import invalidate_caches # Package imports import collections import pickle import numpy as np import h5py as h5 # hickle imports from .helpers import PyContainer,not_dumpable,nobody_is_my_name,no_compression,NotHicklable from .loaders import optional_loaders, attribute_prefix if sys.version_info[:2] <= (3,5): # pragma: no cover # define ModuleNotFoundError on __builtins__ to ensure below code is working setattr(sys.modules['builtins'],'ModuleNotFoundError',getattr(sys.modules['builtins'],'ModuleNotFoundError',ImportError)) # %% GLOBALS # %% FUNCTION DEFINITIONS def load_nothing(h_node, base_type , py_obj_type): # pragma: no cover """ loads nothing """ return nobody_is_my_name def dump_nothing(py_obj, h_group, name, **kwargs): # pragma: no cover """ dumps nothing """ return nobody_is_my_name # %% CLASS DEFINITIONS class _DictItem(): # pragma: no cover """ dummy py_obj for dict_item loader """ class NodeReference(): # pragma: no 
cover """ dummy py_obj_type returned by ReferenceManager.get_type when encountering dataset of h5py.ref_dtype which expose no explicit 'type' attribute. """ class ReferenceError(Exception): # pragma: no cover """ exception thrown by ReferenceManager """ class LookupError(Exception): # pragma: no cover """ exception thrown if type lookup fails """ class SerializedWarning(UserWarning): # pragma: no cover """ An object type was not understood The data will be serialized using pickle. """ class PackageImportDropped(UserWarning): # pragma: no cover """ Package or module defining type/class was removed from sys.modules """ class MockedLambdaWarning(UserWarning): # pragma: no cover """ In Python >= 3.8 lambda function fails pickle.loads faking restoring lambda to keep legacy hickle 4.x files loading properly """ class DataRecoveredWarning(UserWarning): # pragma: no cover """ Raised when hickle does not find an appropriate loader for a specific type and thus has to fall back to '!recover!' loader recover_custom_dataset function or RecoverGroupContainer PyContainer object """ class AttemptRecoverCustom(): """ Dummy type indicating that the data of specific py_obj_type listed in the hickle_types_table could not be restored. Most likely pickle.loads encountered an ImportError/ModuleNotFoundError or AttributeError indicating that the package and/or module defining the py_obj_type is not installed or does not anymore provide the definition of py_object_type. In this case hickle tries to at least recover the data and corresponding meta data stored in h5py.Group or h5py.Dataset attributes and the base_type string indicating the loader used to dump the data. Only in case this attempt fails an exception is thrown. """ class RecoveredGroup(dict,AttemptRecoverCustom): """ dict type object representing the content and hickle meta data of a h5py.group """ __slots__ = ('attrs',) def __init__(self,*args,attrs={},**kwargs): super().__init__(*args,**kwargs) self.attrs={name:value for name,value in attrs.items() if name not in {'type'}} class RecoveredDataset(np.ndarray,AttemptRecoverCustom): """ numpy.ndarray type object representing the content and hickle meta data of a h5py.dataset """ __slots__ = ('attrs',) def __new__(cls,input_array,dtype=None,attrs={}): array_copy = np.array(input_array,dtype=dtype) obj = super().__new__( cls, shape = array_copy.shape, dtype = array_copy.dtype, buffer=array_copy, offset=0, strides=array_copy.strides, order = 'C' if array_copy.flags.c_contiguous else 'F' ) obj.attrs = {name:value for name,value in attrs.items() if name not in {'type'}} return obj def __array_finalize__(self,obj): if obj is not None: self.attrs = getattr(obj,'attrs',{}) class ManagerMeta(type): """ Metaclass for all manager classes derived from the BaseManager class. Ensures that the __managers__ class attribute of each immediate child class of BaseManager is initialized to a dictionary and ensures that it can not be overwritten by any grandchild class and down. The __managers__ attribute declared by any grandchild shadowing the __managers__ attribute of any of its ancestors is dropped without any further notice. 
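    Example (illustrative; SpecialManager is a hypothetical grandchild class):

        class ReferenceManager(BaseManager, dict):
            ...                       # receives its own __managers__ dict

        class SpecialManager(ReferenceManager):
            __managers__ = {}         # silently dropped by ManagerMeta; the
            ...                       # table of ReferenceManager is reused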
""" def __new__(cls, name, bases, namespace, **kwords): for _ in ( True for base in bases if isinstance(getattr(base,'__managers__',None),dict) ): namespace.pop('__managers__',None) break else: if not isinstance(namespace.get('__managers__',None),dict): namespace['__managers__'] = dict() if bases and not object in bases else None return super().__new__(cls,name,bases,namespace,**kwords) class BaseManager(metaclass = ManagerMeta): """ Base class providing basic management of simultaneously open managers. Must be subclassed. """ __slots__ = ('__weakref__',) __managers__ = None @classmethod def _drop_manager(cls, fileid): """ finalizer callback to properly remove a object from the .__managers__ structure when corresponding file is closed or object is garbage collected Parameters ---------- cls (BaseManager): the child class for which to drop the hdf5 file instance referenced to by the specified file id fileid (h5py.FileId): Id identifying the hdf5 file the ReferenceManager was created for """ try: manager = cls.__managers__.get(fileid,None) if manager is None: return cls.__managers__.pop(fileid,None) except: # pragma: no cover # only triggered in case of race pass @classmethod def create_manager(cls, h_node, create_entry): """ Check whether an instance already exists for the file the h_node belongs to and call create_entry if no entry yet exists. Parameters ---------- cls: the manager class to create a new instance for h_node (h5py.File, h5py.Group, h5py.Dataset): the h5py node or its h_root_group or file to create a new ReferenceManager object for. create_entry (callable): function or method which returns a new table entry. A table entry is a tuple or list with contains as its first item the newly created object. It may include further items specific to the actual BaseManager subclass. Raises ------ LookupError: if manager has already been created for h_node or its h_root_group """ manager = cls.__managers__.get(h_node.file.id,None) if manager is not None: raise LookupError( "'{}' type manager already created for file '{}'".format( cls.__name__,h_node.file.filename ) ) table = cls.__managers__[h_node.file.id] = create_entry() weakref.finalize(table[0],cls._drop_manager,h_node.file.id) return table[0] @classmethod def get_manager(cls, h_node): """ return manager responsible for the file containing h_node Parameters ---------- h_node (h5py.File, h5py.Group, h5py.Dataset): the h5py node to obtain the responsible manager for. Raises ------ LookupError: if no manager has been created yet for h_node or its h_root_group """ try: return cls.__managers__[h_node.file.id][0] except KeyError: raise ReferenceError("no managers exist for file '{}'".format(h_node.file.filename)) def __init__(self): if type.mro(self.__class__)[0] is BaseManager: raise TypeError("'BaseManager' class must be subclassed") def __enter__(self): raise NotImplementedError("'{}' type object must implement Python ContextManager protocol") def __exit__(self, exc_type, exc_value, exc_traceback, h_node=None): # remove this ReferenceManager object from the table of active ReferenceManager objects # and cleanly unlink from any h5py object instance and id references managed. Finalize # 'hickle_types_table' overlay if it was created by __init__ for hickle 4.0.X file self.__class__._drop_manager(h_node.file.id) class ReferenceManager(BaseManager, dict): """ Manages all object and type references created for basic and type special memoisation. To create a ReferenceManager call ReferenceManager.create_manager function. 
The value returned can be and shall be used within a with statement to ensure it is garbage collected before file is closed. For example: with ReferenceManager.create_manager(h_root_group) as memo: _dump(data,h_root_group,'data',memo,loader,**kwargs) with ReferenceManager.create_manager(h_root_group) as memo: _load(py_container,'data',h_root_group['data'],memo,loader) with ReferenceManager.create_manager(h_root_group,fix_lambda_obj_type) as memo: _load(py_container,'data',h_root_group['data'],memo,loader) NOTE: for creating appropriate loader object see LoaderManager """ __slots__ = ( '_py_obj_type_table', # hickle_types_table h5py.Group storing type information '_py_obj_type_link', # dictionary linking py_obj_type and representation in hickle_types_table '_base_type_link', # dictionary linking base_type string and representation in hickle_types_table '_overlay', # in memory hdf5 dummy file hosting dummy hickle_types_table for hickle 4.x files 'pickle_loads' # reference to pickle.loads method ) @staticmethod def get_root(h_node): """ returns the h_root_group the passed h_node belongs to. """ # try to resolve the 'type' attribute of the h_node entry_ref = h_node.attrs.get('type',None) if isinstance(entry_ref,h5.Reference): # return the grandparent of the referenced py_obj_type dataset as it # also the h_root_group of h_node try: entry = h_node.file.get(entry_ref,None) except ValueError: # pragma: no cover entry = None if entry is not None: return entry.parent.parent if h_node.parent == h_node.file: # h_node is either the h_root_group it self or the file node representing # the open hickle file. return h_node if isinstance(h_node,h5.Group) else h_node.file # either h_node has not yet a 'type' assigned or contains pickle string # which has implicit b'pickle' type. try to resolve h_root_group from its # parent 'type' entry if any entry_ref = h_node.parent.attrs.get('type',None) if not isinstance(entry_ref,h5.Reference): if entry_ref is None: # parent has neither a 'type' assigned return h_node if isinstance(h_node,h5.Group) else h_node.file # 'type' seems to be a byte string or string fallback to h_node.file return h_node.file try: entry = h_node.file.get(entry_ref,None) except ValueError: # pragma: no cover entry = None if entry is None: # 'type' reference seems to be stale return h_node if isinstance(h_node,h5.Group) else h_node.file # return the grand parent of the referenced py_obj_type dataset as it # is also the h_root_group of h_node return entry.parent.parent @staticmethod def _drop_overlay(h5file): """ closes in memory overlay file providing dummy 'hickle_types_table' structure for hdf5 files which were created by hickle 4.x """ h5file.close() @classmethod def create_manager(cls,h_node, pickle_loads = pickle.loads): """ creates a new ReferenceManager object for the h_root_group the h_node belongs to. Parameters ---------- h_node (h5py.Group, h5py.Dataset): the h5py node or its h_root_group to create a new ReferenceManager object for. pickle_loads (FunctionType,MethodType): method to be used to expand py_obj_type pickle strings. defaults to pickle.loads. Must be set to fix_lambda_obj_type for hickle file created by hickle 4.x. 
Raises ------ LookupError: if ReferenceManager has already been created for h_node or its h_root_group """ def create_manager(): return ( ReferenceManager(h_node,pickle_loads = pickle_loads), ReferenceManager.get_root(h_node) ) return super().create_manager(h_node,create_manager) def __init__(self, h_root_group, *args,pickle_loads = pickle.loads, **kwargs): """ constructs ReferenceManager object Parameters ---------- h_root_group (h5py.Group): see ReferenceManager.create_manager args (tuple,list): passed to dict.__init__ pickle_loads (FunctionType,MethodType): see ReferenceManager.create_manager kwargs (dict): passed to dict.__init__ Raises ------ ReferenceError: In case an error occurs while loading 'hickle_types_table' from an existing file opened for reading and writing """ super().__init__(*args,**kwargs) self._py_obj_type_link = dict() self._base_type_link = dict() self._overlay = None self.pickle_loads = pickle_loads # get the 'hickle_types_table' member of h_root_group or create it anew # in case none found. In case hdf5 file is opened for reading only # create an in memory hdf5 file (managed by hdf5 'core' driver) providing # an empty dummy hickle_types_table. This is necessary to ensue that # ReferenceManager.resolve_type works properly on hickle 4.x files which # store type information directly in h5py.Group and h5py.Datasets attrs # structure. self._py_obj_type_table = h_root_group.get('hickle_types_table',None) if self._py_obj_type_table is None: if h_root_group.file.mode == 'r+': self._py_obj_type_table = h_root_group.create_group("hickle_types_table",track_order = True) else: h5_overlay = h5.File( '{}.hover'.format(h_root_group.file.filename.rsplit('.',1)[0]), mode='w', driver='core',backing_store=False ) self._py_obj_type_table = h5_overlay.create_group("hickle_types_table",track_order = True) self._overlay = weakref.finalize(self,ReferenceManager._drop_overlay,h5_overlay) return # verify that '_py_obj_type_table' is a valid h5py.Group object if not isinstance(self._py_obj_type_table,h5.Group): raise ReferenceError("'hickle_types_table' invalid: Must be HDF5 Group entry") # if h_root_group.file was opened for writing restore '_py_obj_type_link' and # '_base_type_link' table entries from '_py_obj_type_table' to ensure when # h5py.Group and h5py.Dataset are added anew to h_root_group tree structure # their 'type' attribute is set to the correct py_obj_type reference by the # ReferenceManager.store_type method. Each of '_py_obj_type_link' and # '_base_type_link' tables can be used to properly restore the 'py_obj_type' # and 'base_type' when loading the file as well as assigning to the 'type' # attribute the appropriate 'py_obj_type' dataset reference from the # '_py_obj_type_table' when dumping data to the file. 
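        # Illustrative sketch of the layout restored below (an assumed example; the
        # b'list' base_type and the entry name '0' are placeholders, not guaranteed names):
        #
        #   h_root_group/hickle_types_table/
        #       '0'     1 x N 'S1' dataset holding pickle.dumps(py_obj_type), with
        #               attrs['base_type'] referencing the 'list' entry below
        #       'list'  empty 'S1' dataset whose name encodes the base_type string
        #
        #   _py_obj_type_link[id(py_obj_type)]   -> dataset '0'
        #   _py_obj_type_link[dataset('0').id]   -> (py_obj_type, b'list')
        #   _base_type_link[b'list']             -> dataset 'list'
        #   _base_type_link[dataset('list').id]  -> b'list'
        #
        # Nodes dumped for such objects then only carry attrs['type'] = dataset('0').ref.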
if h_root_group.file.mode != 'r+': return for _, entry in self._py_obj_type_table.items(): if entry.shape is None and entry.dtype == 'S1': base_type = entry.name.rsplit('/',1)[-1].encode('ascii') self._base_type_link[base_type] = entry self._base_type_link[entry.id] = base_type continue base_type_ref = entry.attrs.get('base_type',None) if not isinstance(base_type_ref,h5.Reference): raise ReferenceError( "inconsistent 'hickle_types_table' entries for py_obj_type '{}': " "no base_type".format(py_obj_type) ) try: base_type_entry = entry.file.get(base_type_ref,None) except ValueError: # pragma: no cover base_type_entry = None if base_type_entry is None: raise ReferenceError( "inconsistent 'hickle_types_table' entries for py_obj_type '{}': " "stale base_type".format(py_obj_type) ) base_type = self._base_type_link.get(base_type_entry.id,None) if base_type is None: base_type = base_type_entry.name.rsplit('/',1)[-1].encode('ascii') try: py_obj_type = pickle.loads(entry[()]) except (ImportError,AttributeError): py_obj_type = AttemptRecoverCustom entry_link = py_obj_type,'!recover!',base_type else: entry_link = py_obj_type,base_type self._py_obj_type_link[id(py_obj_type)] = entry self._py_obj_type_link[entry.id] = entry_link def store_type(self, h_node, py_obj_type, base_type = None, attr_name = 'type', **kwargs): """ assigns a 'py_obj_type' entry reference to the attribute specified by attr_name of h_node and creates if not present the appropriate 'hickle_types_table' entries for py_obj_type and base_type. Note ---- Storing and restoring the content of nodes containing pickle byte strings is fully managed by pickle.dumps and pickle.loads functions including selection of appropriate py_obj_type. Therefore no explicit entry for object and b'pickle' py_obj_type and base_type pairs indicating pickled content of pickled dataset are created. Parameters ---------- h_node (h5py.Group, h5py.Dataset): node the 'type' attribute a 'hickle_types_table' entry corresponding to the provided py_obj_type, base_type entry pair shall be assigned to. py_obj_type (any type or class): the type or class of the object instance represented by h_node base_type (bytes): the base-type bytes string of the loader used to create the h_node and restore an object instance form on load. If None no 'hickle_types_table' will be created for py_obj_type if not already present and a LookupError exception is raised instead. attr_name (str): the name of the attribute the type reference shall be stored to. 
Defaults to 'type' kwargs (dict): keyword arguments to be passed to h5py.Group.create_dataset function when creating the entries for py_obj_type and base_type anew Raises ------ ValueError: if base_type is not a valid bytes string LookupError: if base_type is None and no 'hickle_types_table' entry exists for py_obj_type yet """ # return immediately if py_obj_type is object as h_node contains pickled byte # string of the actual object dumped if py_obj_type is object: return # if no entry within the 'hickle_types_table' exists yet # for py_obj_type create the corresponding pickle string dataset # and store appropriate entries in the '_py_obj_type_link' table for # further use by ReferenceManager.store_type and ReferenceManager.resolve_type # methods py_obj_type_id = id(py_obj_type) entry = self._py_obj_type_link.get(py_obj_type_id,None) if entry is None: if base_type is None: raise LookupError( "no entry found for py_obj_type '{}'".format(py_obj_type.__name__) ) if not isinstance(base_type,(str,bytes)) or not base_type: raise ValueError("base_type must be non empty bytes string") type_entry = memoryview(pickle.dumps(py_obj_type)) type_entry = np.array(type_entry,copy = False) type_entry.dtype = 'S1' entry = self._py_obj_type_table.create_dataset( str(len(self._py_obj_type_table)), data=type_entry, shape=(1,type_entry.size), **kwargs ) # assign a reference to base_type entry within 'hickle_types_table' to # the 'base_type' attribute of the newly created py_obj_type entry. # if 'hickle_types_table' does not yet contain empty dataset entry for # base_type create it and store appropriate entries in the '_base_type_link' table # for further use by ReferenceManager.store_type and ReferenceManager,resolve_type # methods base_entry = self._base_type_link.get(base_type,None) if base_entry is None: base_entry = self._base_type_link[base_type] = self._py_obj_type_table.create_dataset( base_type.decode('ascii'), shape=None,dtype = 'S1', **no_compression(kwargs) ) self._base_type_link[base_entry.id] = base_type entry.attrs['base_type'] = base_entry.ref self._py_obj_type_link[py_obj_type_id] = entry self._py_obj_type_link[entry.id] = (py_obj_type,base_type) h_node.attrs[attr_name] = entry.ref def resolve_type(self,h_node,attr_name = 'type',base_type_type = 1): """ resolves the py_obj_type and base_type pair referenced to by the 'type' attribute and if present the 'base_type' attribute. Note: If the 'base_type' attribute is present it is assumed that the dataset was created by hickle 4.x version. Consequently it is assumed that the 'type' attribute contains a pickle bytes string to load the py_obj_type from instead of a reference to a 'hickle_types_table' entry representing the py_obj_type base_type pair of h_node. Note: If 'type' attribute is not present h_node represents either a h5py.Reference to the actual node of the object to be restored or contains a pickle bytes string. In either case the corresponding implicit py_obj_type base_type pair (NodeReference, b'!node-reference!') or (object,b'pickle') respective is assumed and returned. Note: If restoring 'py_object_type' from pickle string stored in type attribute or 'hickle_types_table' fails than implicit py_obj_type base_type pair (AttemptRecoverCustom,'!recover!') is returned instead of the actual 'py_obj_type' base_type pair is returned. 
The latter can be retrieved by setting 'base_type_type' to 2 in this case Parameters ---------- h_node (h5py.Group,h5py.Dataset): the node to resolve py_obj_type and base_type for using reference stored in attribute specified by attr_name attr_name (str): the name of the attribute the type reference shall be restored from. Defaults to and must be 'type' in case not a h5py.Reference. base_type_type (int): 1 (default) base_type used to select loader -1 original base_type corresponding to not understood py_obj_type of recovered h5py.Group or h5py.Dataset Returns ------- tuple containing (py_obj_type,base_type,is_container) py_obj_type: the python type of the restored object base_type: the base_type string indicating the loader to be used for properly restoring the py_obj_type instance or the base_type string is_container: boolean flag indicating whether h_node represents a h5py.Group or h5py.Reference both of which have to be handled by corresponding PyContainer type loaders or a h5py.Dataset for which the appropriate load_fn is to be called. """ # load the type attribute indicated by attr_name. If not present check if h_node # is h5py.Reference dataset or a dataset containing a pickle bytes # string. In either case assume (NodeReference,b'!node-reference!') or (object,b'pickle') # respective as (py_obj_type, base_type) pair and set is_container flag to True for # h5py.Reference and False otherwise. # # NOTE: hickle 4.x legacy file does not store 'type' attribute for h5py.Group nodes with # b'dict_item' base_type. As h5py.Groups do not have a dtype attribute the check # whether h_node.dtype equals h5py.ref_dtype will raise AttributeError. # If h_node represents a b'dict_item' than self.pickle_loads will point to # fix_lambda_obj_type below which will properly handle None value of type_ref # in any other case file is not a hickle 4.x legacy file and thus has to be # considered broken type_ref = h_node.attrs.get(attr_name,None) if not isinstance(type_ref,h5.Reference): if type_ref is None: try: metadata = h_node.dtype.metadata except (AttributeError,): pass else: if metadata is not None and issubclass(metadata.get('ref',object),h5.Reference): return NodeReference,b'!node-reference!',True return object,b'pickle',False # check if 'type' attribute of h_node contains a reference to a 'hickle_types_table' # entry. If not use pickle to restore py_object_type from the 'type' attribute value # directly if possible try: # set is_container_flag to True if h_node is h5py.Group type object and false # otherwise return self.pickle_loads(type_ref), h_node.attrs.get('base_type', b'pickle'), isinstance(h_node, h5.Group) except (ModuleNotFoundError,AttributeError): # module missing or py_object_type not provided by module return AttemptRecoverCustom,( h_node.attrs.get('base_type',b'pickle') if base_type_type == 2 else b'!recover!' ),isinstance(h_node,h5.Group) except (TypeError, pickle.UnpicklingError, EOFError): raise ReferenceError( "node '{}': '{}' attribute ('{}')invalid: not a pickle byte string".format( h_node.name,attr_name,type_ref ) ) try: entry = self._py_obj_type_table[type_ref] except (ValueError, KeyError): raise ReferenceError( "node '{}': '{}' attribute invalid: stale reference".format( h_node.name,attr_name ) ) # load (py_obj_type,base_type) pair from _py_obj_type_link for 'hickle_types_table' entry # referenced by 'type' entry. 
Create appropriate _py_obj_type_link and _base_type_link # entries if if not present for (py_obj_type,base_type) pair for further use by # ReferenceManager.store_type and ReferenceManager.resolve_type methods. type_info = self._py_obj_type_link.get(entry.id, None) if type_info is None: base_type_ref = entry.attrs.get('base_type', None) if base_type_ref is None: base_type = b'pickle' else: try: base_type_entry = self._py_obj_type_table[base_type_ref] except ( ValueError,KeyError ): # TODO should be recovered here instead? raise ReferenceError( "stale base_type reference encountered for '{}' type table entry".format( entry.name ) ) base_type = self._base_type_link.get(base_type_entry.id,None) if base_type is None: # get the relative table entry name form full path name of entry node base_type = base_type_entry.name.rsplit('/',1)[-1].encode('ASCII') self._base_type_link[base_type] = base_type_entry self._base_type_link[base_type_entry.id] = base_type try: py_obj_type = self.pickle_loads(entry[()]) except (ModuleNotFoundError,AttributeError): py_obj_type = AttemptRecoverCustom entry_link = (py_obj_type,b'!recover!',base_type) else: entry_link = (py_obj_type,base_type) self._py_obj_type_link[id(py_obj_type)] = entry type_info = self._py_obj_type_link[entry.id] = entry_link # return (py_obj_type,base_type). set is_container flag to true if # h_node is h5py.Group object and false otherwise return (type_info[0],type_info[base_type_type],isinstance(h_node,h5.Group)) def __enter__(self): if not isinstance(self._py_obj_type_table, h5.Group) or not self._py_obj_type_table: raise RuntimeError( "Stale ReferenceManager, call ReferenceManager.create_manager to create a new one" ) return self def __exit__(self, exc_type, exc_value, exc_traceback): if not isinstance(self._py_obj_type_table, h5.Group) or not self._py_obj_type_table: return # remove this ReferenceManager object from the table of active ReferenceManager objects # and cleanly unlink from any h5py object instance and id references managed. Finalize # 'hickle_types_table' overlay if it was created by __init__ for hickle 4.x file super().__exit__(exc_type, exc_value, exc_traceback, self._py_obj_type_table) self._py_obj_type_table = None self._py_obj_type_link = None self._base_type_link = None self.pickle_loads = None if self._overlay is not None: self._overlay() self._overlay = None ##################### # loading optional # ##################### _managed_by_hickle = {'hickle', ''} _custom_loader_enabled_builtins = {'__main__':('','')} class LoaderManager(BaseManager): """ Handles the file specific lookup of loader to be used to dump or load a python object of a specific type To create a LoaderManager call LoaderManager.create_manager function. 
The value returned can be and shall be used within a with statement for example as follows: with LoaderManager.create_manager(h_root_group) as loader: _dump(data,h_root_group,'data',memo,loader,**kwargs) with LoaderManager.create_manager(h_root_group,False,{'custom':true}) as loader: _load(py_container,'data',h_root_group['data'],memo,loader) with LoaderManager.create_manager(h_root_group,True) as memo: _load(py_container,'data',h_root_group['data'],memo,loader) NOTE: for creating appropriate memo object see ReferenceManager """ # Define dict of all acceptable types dependent upon loader option __py_types__ = { None: {}, 'hickle-4.x': {}, **{ option:{} for option in optional_loaders } } # Define dict of all acceptable load function dependent upon loader option __hkl_functions__ = { None: {}, 'hickle-4.x': {}, **{ option:{} for option in optional_loaders } } # Define dict of all acceptable hickle container types dependent upon loader option __hkl_container__ = { None: {}, 'hickle-4.x': {}, **{ option:{} for option in optional_loaders } } # Empty list (hashtable) of loaded loader names __loaded_loaders__ = set() @classmethod def register_class( cls, myclass_type, hkl_str, dump_function=None, load_function=None, container_class=None, memoise = True, option=None ): """ Register a new class to be recognized and dumped or restored by hickle. Parameters ---------- myclass_type (type.class): the class to register dump_fcn, load_fcn, PyContainer for hkl_str (str): String to write to HDF5 file identifying class and loader suitable for restoring the py_object described by the data stored in hickle file. NOTE: dict_item, pickle, !node-refrence!, !recover! and any other string enclosed within a pair of !! can not be modified if once registered. Strings quoted by !! must be added as global loader with option = None. dump_function (callable): callable to write data to HDF5 load_function (callable): function to load data from HDF5 container_class (PyContainer): PyContainer type proxy class to load data from HDF5 memoise (bool): True: references to the object instances of class shall be remembered during dump and load for properly resolving multiple references to the same object instance. False: every occurrence of an instance of the object has to be dumped and restored on load disregarding instances already present. option (str, None): String identifying set of loaders which shall only be used when specific feature or category is requested on top of global loaders. If None than loader is globally to be used if there is no other loader registered for myclass_type. NOTE: only strings listed in 'optional_loaders' exported by hickle.loaders.__init__ and 'hickle-4.x' are accepted. Raises ------ TypeError: loader for myclass_type may only be registered by hickle core modules not loaded from hickle/loaders/ directory, /hickle_loaders/, /hickle_loaders/ or <__main__path>/hickle_loaders/ directory by explicitly calling LoaderManager.register_class method. ValueError: if optional loader modules tries to shadow 'dict_item', 'pickle' and any loader marked as essential to proper function of hickle.dump and hickle.load by ! prefix and postfix ('!node-reference!', '!recover!'). LookupError: if optional loader denoted by option is unknown. 
Any new option must be listed in 'optional_loaders' exported by 'hickle.loaders.__init__.py' file to be recognized as valid option """ if ( myclass_type is object or isinstance( myclass_type, (types.FunctionType, types.BuiltinFunctionType, types.MethodType, types.BuiltinMethodType) ) or issubclass(myclass_type,(type,_DictItem)) ): # object, all functions, methods, class objects and the special _DictItem class # type objects are to be handled by hickle core only. dump_module = getattr(dump_function, '__module__', '').split('.', 2) load_module = getattr(load_function, '__module__', '').split('.', 2) container_module = getattr(container_class, '__module__', '').split('.', 2) if {dump_module[0], load_module[0], container_module[0]} - _managed_by_hickle: raise TypeError( "loader for '{}' type managed by hickle only".format( myclass_type.__name__ ) ) if "loaders" in {*dump_module[1:2], *load_module[1:2], *container_module[1:2]}: raise TypeError( "loader for '{}' type managed by hickle core only".format( myclass_type.__name__ ) ) if ( ( cls.__hkl_functions__[None].get(hkl_str) or cls.__hkl_container__[None].get(hkl_str) ) and ( hkl_str[:1] == hkl_str[-1:] == b'!' or hkl_str in disallow_in_option ) ): raise ValueError( "'{}' base_type may not be shadowed by loader".format(hkl_str) ) # add loader try: if dump_function is not None: cls.__py_types__[option][myclass_type] = ( dump_function, hkl_str,memoise) if load_function is not None: cls.__hkl_functions__[option][hkl_str] = load_function cls.__hkl_functions__[option][hkl_str.decode('ascii')] = load_function if container_class is not None: cls.__hkl_container__[option][hkl_str] = container_class cls.__hkl_container__[option][hkl_str.decode('ascii')] = container_class except KeyError: raise LookupError("Invalid option '{}' encountered".format(option)) @classmethod def register_class_exclude(cls, hkl_str_to_ignore, option = None): """ Tell loading function to ignore any HDF5 dataset with attribute 'type=XYZ' Parameters ---------- hkl_str_to_ignore (str): attribute type=string to ignore and exclude from loading. option (str, None): String identifying set of optional loaders from which class shall be excluded Raises ------ ValueError: class is managed by hickle core machinery and thus may not be ignored LookupError: option loader shall belong to is unknown. Any new option must be listed in 'optional_loaders' exported by 'hickle.loaders.__init__.py' file to be recognized as valid option """ if hkl_str_to_ignore[0] == hkl_str_to_ignore[-1] == b'!' or hkl_str_to_ignore in disallowed_to_ignore: raise ValueError( "excluding '{}' base_type managed by hickle core not possible".format( hkl_str_to_ignore ) ) try: cls.__hkl_functions__[option][hkl_str_to_ignore] = load_nothing cls.__hkl_container__[option][hkl_str_to_ignore] = NoContainer cls.__hkl_functions__[option][hkl_str_to_ignore.decode('ascii')] = load_nothing cls.__hkl_container__[option][hkl_str_to_ignore.decode('ascii')] = NoContainer except KeyError: raise LookupError("'{}' option unknown".format(option)) __slots__ = ( 'types_dict', 'hkl_types_dict', 'hkl_container_dict', '_mro', '_file') _option_formatter = '{}{{}}'.format(attribute_prefix) _option_parser = re.compile(r'^{}(.*)$'.format(attribute_prefix),re.I) def __init__(self, h_root_group, legacy = False, options = None): """ constructs LoaderManager object Parameters ---------- h_root_group (h5py.Group): see LoaderManager.create_manager legacy (bool): If true the file h_node belongs to is in legacy hickle 4.x format. 
Ensure lambda py_obj_type strings are loaded properly and 'hickle-4.x' type loaders are included within types_dict, 'hkl_types_dict' and 'hkl_container_dict' options (dict): optional loaders to be loaded. Each key names one loader and its value indicates whether to be used (True) or excluded (False) Raises ------ LookupError: option loader unknown """ # initialize lookup dictionaries with set of common loaders self.types_dict = collections.ChainMap(self.__class__.__py_types__[None]) self.hkl_types_dict = collections.ChainMap(self.__class__.__hkl_functions__[None]) self.hkl_container_dict = collections.ChainMap(self.__class__.__hkl_container__[None]) # Select source of optional loader flags. If option is None try to read options # from h_root_group.attrs structure. Otherwise use content of options dict store # each entry to be used within h_root_group.attrs structure or update entry there if options is None: option_items = ( match.group(1).lower() for match,on in ( ( LoaderManager._option_parser.match(name), value ) for name, value in h_root_group.attrs.items() ) if match and on ) else: def set_option_items(): for option_key,on in options.items(): if not on: continue h_root_group.attrs[LoaderManager._option_formatter.format(option_key.upper())] = on yield option_key option_items = set_option_items() # try to include loader set indicated by option_name try: for option_name in option_items: self.types_dict.maps.insert(0,self.__class__.__py_types__[option_name]) self.hkl_types_dict.maps.insert(0,self.__class__.__hkl_functions__[option_name]) self.hkl_container_dict.maps.insert(0,self.__class__.__hkl_container__[option_name]) except KeyError: raise LookupError("Option '{}' invalid".format(option_name)) # add loaders required to properly load legacy files created by hickle 4.x and # ensure that non class types are properly reported by load_loader if legacy: self._mro = type_legacy_mro self.types_dict.maps.insert(0,self.__class__.__py_types__['hickle-4.x']) self.hkl_types_dict.maps.insert(0,self.__class__.__hkl_functions__['hickle-4.x']) self.hkl_container_dict.maps.insert(0,self.__class__.__hkl_container__['hickle-4.x']) else: self._mro = type.mro self._file = h_root_group.file def load_loader(self, py_obj_type,*,base_type=None): """ Checks if given `py_obj` requires an additional loader to be handled properly and loads it if so. Parameters ---------- py_obj: the Python object to find an appropriate loader for Returns ------- tuple containing (py_obj, (create_dataset, base_type, memoise)) py_obj: the Python object the loader was requested for (create_dataset,base_type,memoise): tuple providing create_dataset function, name of base_type used to represent py_obj and the boolean memoise flag indicating whether loaded object shall be remembered for restoring further references to it or must be loaded every time encountered. Raises ------ RuntimeError: in case py object is defined by hickle core machinery. 
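        Example
        -------
        A minimal, assumed sketch (not taken verbatim from the hickle test suite): given
        the LoaderManager `loader` of an open file, and assuming the builtin loader module
        registers `list` under the b'list' base_type, the lookup behaves roughly as

            py_obj_type, (dump_fcn, base_type, memoise) = loader.load_loader(list)
            # py_obj_type is list, base_type is b'list', memoise as registered

        Types for which no loader can be found fall back to
        (object, (create_pickled_dataset, b'pickle', True)).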
""" types_dict = self.types_dict loaded_loaders = self.__class__.__loaded_loaders__ # loop over the entire mro_list of py_obj_type for mro_item in self._mro(py_obj_type): # Check if mro_item is already listed in types_dict and return if found loader_item = types_dict.get(mro_item,None) if loader_item is not None: return py_obj_type,loader_item # Obtain the package name of mro_item package_list = mro_item.__module__.split('.',2) package_file = None if package_list[0] == 'hickle': if package_list[1] != 'loaders': if base_type is not None and ( base_type in self.hkl_types_dict or base_type in self.hkl_container_dict): return py_obj_type,(not_dumpable,base_type,True) print(mro_item,package_list) raise RuntimeError( "objects defined by hickle core must be registered" " before first dump or load" ) if ( len(package_list) < 3 or not package_list[2].startswith("load_") or '.' in package_list[2][5:] ): warnings.warn( "ignoring '{!r}' dummy type not defined by loader module".format(py_obj_type), RuntimeWarning ) continue # dummy objects are not dumpable ensure that future lookups return that result loader_item = types_dict.get(mro_item,None) if loader_item is None: loader_item = types_dict[mro_item] = ( not_dumpable, b'NotHicklable',False ) # ensure module of mro_item is loaded as loader as it will contain # loader which knows how to handle group or dataset with dummy as # py_obj_type loader_name = mro_item.__module__ if loader_name in loaded_loaders: # loader already loaded as triggered by dummy abort search and return # what found so far as fallback to further bases does not make sense return py_obj_type,loader_item else: loader_name,package_file = _custom_loader_enabled_builtins.get(package_list[0],(None,'')) if loader_name is None: # construct the name of the associated loader loader_name = 'hickle.loaders.load_{:s}'.format(package_list[0]) elif not loader_name: # try to resolve module name for __main__ script and other generic modules package_module = sys.modules.get(package_list[0],None) if package_module is None: warnings.warn( "package/module '{}' defining '{}' type dropped".format( package_list[0],mro_item.__name__ ), PackageImportDropped ) continue package_file = getattr(package_module,'__file__',None) if package_file is None: package_loader = getattr(package_module,'__loader__',None) if package_loader is None: # pragma: no cover # just to secure against "very smart" tinkering # with python import machinery, no serious testable use-case known and expected continue package_spec = spec_from_loader(package_list[0],package_loader) if not getattr(package_spec,'has_location',False): continue package_file = package_spec.origin if not os.path.isabs(package_file): # pragma: no cover # not sure if this case wouldn't just be result of "very smart" tinkering # with python import machinery, no serious testable use-case known yet package_spec = find_spec(os.path.basename(package_file.rsplit('.')[0])) if not getattr(package_spec,'has_location',False): # pargma: no cover # not sure if this case wouldn't just be result of "very smart" tinkering # with python import machinery, no serious testable use-case known yet continue package_file = package_spec.origin package_list[0],allow_custom_loader = os.path.basename(package_file).rsplit('.')[0],package_list[0] loader_name = 'hickle.loaders.load_{:s}'.format(package_list[0]) _custom_loader_enabled_builtins[allow_custom_loader] = loader_name, package_file # Check if this module is already loaded if loader_name in loaded_loaders: # loader is loaded but does not 
define loader for mro_item # check next base class continue # check if loader module has already been loaded. If use that instead # of importing it anew loader = sys.modules.get(loader_name,None) if loader is None: # Try to load a loader with this name loader_spec = find_spec(loader_name) if loader_spec is None: assert isinstance(package_file,str), "package_file name for _custom_loader_enabled_builtins must be string" if not package_file: package_spec = getattr(sys.modules.get(package_list[0],None),'__spec__',None) if package_spec is None: package_spec = find_spec(package_list[0]) if not getattr(package_spec,'has_location',False): # can't resolve package or base module hosting mro_item continue package_file = package_spec.origin package_path = os.path.dirname(package_file) package_loader_path = os.path.join( package_path, "hickle_loaders", "load_{:s}.py".format(package_list[0]) ) try: fid = open(package_loader_path,'rb') except FileNotFoundError: try: package_loader_path += 'c' fid = open(package_loader_path,'rb') except FileNotFoundError: # no file for loader module found continue else: fid.close() else: fid.close() loader_spec = spec_from_file_location(loader_name,package_loader_path) # import the the loader module described by module_spec # any import errors and exceptions result at this stage from # errors inside module and not cause loader module does not # exist loader = module_from_spec(loader_spec) loader_spec.loader.exec_module(loader) sys.modules[loader_name] = loader # load all loaders defined by loader module for next_loader in loader.class_register: self.register_class(*next_loader) for drop_loader in ( loader if isinstance(loader,(list,tuple)) else (loader,None) for loader in loader.exclude_register) : self.register_class_exclude(*drop_loader) loaded_loaders.add(loader_name) # check if loader module defines a loader for base_class mro_item loader_item = types_dict.get(mro_item,None) if loader_item is None: # the new loader does not define loader for mro_item # check next base class continue # return loader for base_class mro_item return py_obj_type,loader_item # no appropriate loader found. Lower py_object_type to object and # return fallback to pickle return object,(create_pickled_dataset,b'pickle',True) @classmethod def create_manager(cls, h_node, legacy = False, options = None): """ creates an new LoaderManager object for the h_root_group the h_node belongs to. Parameters ---------- h_node (h5py.Group, h5py.Dataset): the h5py node or its h_root_group to create a new LoaderManager object for. legacy (bool): if true file h_node belongs to is in legacy hickle 4.x format ensure lambda py_obj_type strings are loaded properly and 'hickle-4.x' type loaders are included within types_dict, 'hkl_types_dict' and 'hkl_container_dict' options (dict): optional loaders to be loaded. 
Each key names one loader and its value indicates whether to be used (True) or excluded (False) Raises ------ LookupError: if ReferenceManager has already been created for h_node or its h_root_group """ def create_manager(): return (LoaderManager(h_node,legacy,options),) return super().create_manager(h_node,create_manager) def __enter__(self): if not isinstance(self._file,h5.File) or not self._file: raise RuntimeError( "Stale LoaderManager, call LoaderManager.create_manager to create a new one" ) return self def __exit__(self, exc_type, exc_value, exc_traceback): if not isinstance(self._file,h5.File) or not self._file: return super().__exit__(exc_type, exc_value, exc_traceback, self._file) self._file = None self._mro = None self.types_dict = None self.hkl_types_dict = None self.hkl_container_dict = None def type_legacy_mro(cls): """ drop in replacement of type.mro for loading legacy hickle 4.x files which were created without generalized PyContainer objects available. Consequently some h5py.Datasets and h5py.Group objects expose function objects as their py_obj_type type.mro expects classes only. Parameters ---------- cls (type): the py_obj_type/class of the object to load or dump Returns ------- mro list for cls as returned by type.mro or in case cls is a function or method a single element tuple is returned """ if isinstance( cls, (types.FunctionType,types.BuiltinFunctionType,types.MethodType,types.BuiltinMethodType) ): return (cls,) return type.mro(cls) # %% BUILTIN LOADERS (not maskable) # list of below hkl_types which may not be ignored # NOTE: types which are enclosed in !! pair are disallowed in any case disallowed_to_ignore = {b'dict_item', b'pickle' } # list of below hkl_types which may not be redefined by optional loader # NOTE: types which are enclosed in !! pair are disallowed in any case disallow_in_option = {b'pickle'} class NoContainer(PyContainer): # pragma: no cover """ load nothing container """ def convert(self): pass class _DictItemContainer(PyContainer): """ PyContainer reducing hickle version 4.x dict_item type h5py.Group to its content for inclusion within dict h5py.Group """ def convert(self): return self._content[0] LoaderManager.register_class( _DictItem, b'dict_item', dump_nothing, load_nothing, _DictItemContainer, False, 'hickle-4.x' ) class ExpandReferenceContainer(PyContainer): """ PyContainer for properly restoring additional references to an object instance shared multiple times within the dumped object structure """ def filter(self,h_parent): """ resolves the h5py.Reference link and yields the the node it refers to as sub item of h_parent so that it can be properly loaded by recursively calling hickle._load method independent whether it can be directly loaded from the memo dictionary or has to be restored from file. """ try: referred_node = h_parent.file.get(h_parent[()],None) except ( ValueError, KeyError ): # pragma no cover referred_node = None if referred_node is None: raise ReferenceError("node '{}' stale node reference".format(h_parent.name)) yield referred_node.name.rsplit('/',1)[-1], referred_node def convert(self): """ returns the object the reference was pointing to """ return self._content[0] # objects created by resolving h5py.Reference datasets are already stored inside # memo dictionary so no need to memoise them. 
LoaderManager.register_class( NodeReference, b'!node-reference!', dump_nothing, load_nothing, ExpandReferenceContainer, False ) def create_pickled_dataset(py_obj, h_group, name, reason = None, **kwargs): """ Create pickle string as object can not be mapped to any other h5py structure. Parameters ---------- py_obj: python object to dump; default if item is not matched. h_group (h5.File.group): group to dump data into. name (str): the name of the resulting dataset reason (str,None): reason why py_object has to be pickled eg. string provided by NotHicklable exception Warnings ------- SerializedWarning: issued before pickle string is created """ # for what ever reason py_obj could not be successfully reduced # ask pickle for help and report to user. reason_str = " (Reason: %s)" % (reason) if reason is not None else "" warnings.warn( "{!r} type not understood, data is serialized:{:s}".format( py_obj.__class__.__name__, reason_str ), SerializedWarning ) # store object as pickle string pickled_obj = pickle.dumps(py_obj) d = h_group.create_dataset(name, data = memoryview(pickled_obj), **kwargs) return d,() def load_pickled_data(h_node, base_type, py_obj_type): """ loade pickle string and return resulting py_obj """ try: return pickle.loads(h_node[()]) except (ImportError,AttributeError): return RecoveredDataset(h_node[()],dtype = h_node.dtype,attrs = dict(h_node.attrs)) # no dump method is registered for object as this is the default for # any unknown object and for classes, functions and methods LoaderManager.register_class(object,b'pickle',None,load_pickled_data) def recover_custom_dataset(h_node,base_type,py_obj_type): """ drop in load_fcn for any base_type no appropriate loader could be found """ manager = ReferenceManager.get_manager(h_node) _,base_type,_ = manager.resolve_type(h_node,base_type_type = -1) warnings.warn( "loader '{}' missing for '{}' type object. Data recovered ({})".format( base_type, py_obj_type.__name__ if not isinstance(py_obj_type, AttemptRecoverCustom) else None, h_node.name.rsplit('/')[-1] ), DataRecoveredWarning ) attrs = dict(h_node.attrs) attrs['base_type'] = base_type return RecoveredDataset(h_node[()],dtype=h_node.dtype,attrs=attrs) class RecoverGroupContainer(PyContainer): """ drop in PyContainer for any base_type not appropriate loader could be found """ def __init__(self,h5_attrs, base_type, object_type): super().__init__(h5_attrs, base_type, object_type,_content = {}) def filter(self,h_parent): """ switch base_type to the one loader is missing for """ warnings.warn( "loader '{}' missing for '{}' type object. 
Data recovered ({})".format( self.base_type, self.object_type.__name__ if not isinstance(self.object_type,AttemptRecoverCustom) else None, h_parent.name.rsplit('/')[-1] ), DataRecoveredWarning ) manager = ReferenceManager.get_manager(h_parent) _,self.base_type,_ = manager.resolve_type(h_parent,base_type_type = -1) yield from h_parent.items() def append(self,name,item,h5_attrs): if isinstance(item,AttemptRecoverCustom): self._content[name] = item else: self._content[name] = (item,{ key:value for key,value in h5_attrs.items() if key not in {'type'}}) def convert(self): attrs = {key:value for key,value in self._h5_attrs.items() if key not in {'type'}} attrs['base_type'] = self.base_type return RecoveredGroup(self._content,attrs=attrs) LoaderManager.register_class(AttemptRecoverCustom,b'!recover!',None,recover_custom_dataset,RecoverGroupContainer,True) def _moc_numpy_array_object_lambda(x): """ drop in replacement for lambda object types which seem not any more be accepted by pickle for Python 3.8 and onward. see fix_lambda_obj_type function below Parameters ---------- x (list): itemlist from which to return first element Returns ------- first element of provided list """ return x[0] LoaderManager.register_class( _moc_numpy_array_object_lambda, b'!moc_lambda!', dump_nothing, load_nothing, None, True, 'hickle-4.x' ) def fix_lambda_obj_type(bytes_object, *, fix_imports=True, encoding="ASCII", errors="strict"): """ drop in replacement for pickle.loads method when loading files created by hickle 4.x It captures any TypeError thrown by pickle.loads when encountering a pickle string representing a lambda function used as py_obj_type for a h5py.Dataset or h5py.Group. While in Python <3.8 pickle loads creates the lambda Python >= 3.8 throws an error when encountering such a pickle string. This is captured and _moc_numpy_array_object_lambda returned instead. Further some h5py.Group and h5py.Datasets do not provide any py_obj_type for them object is returned assuming that proper loader has been identified by other objects already """ if bytes_object is None: return object try: return pickle.loads(bytes_object, fix_imports=fix_imports, encoding=encoding, errors=errors) except TypeError: warnings.warn( "presenting '{!r}' instead of stored lambda 'type'".format( _moc_numpy_array_object_lambda ), MockedLambdaWarning ) return _moc_numpy_array_object_lambda hickle-5.0.2/hickle/tests/000077500000000000000000000000001430361177200153625ustar00rootroot00000000000000hickle-5.0.2/hickle/tests/__init__.py000066400000000000000000000000001430361177200174610ustar00rootroot00000000000000hickle-5.0.2/hickle/tests/generate_legacy_4_0_0.py000066400000000000000000000134601430361177200217370ustar00rootroot00000000000000#! /usr/bin/env python # encoding: utf-8 """ # generate_legacy_4_0_0.py Creates datastructure to be dumped to the hickle_4_0_0.hkl file. When run as script under hickle 4.0.0 or hickle 4.0.1 it will result in a valid legacy 4.0.0 file which can be used to tests that later version are still capable loading hickle 4.0.0 format files. When imported by any of the tests the method generate_py_object returns the same datastructure stored to the prior generated file. 
""" import sys sys.path.insert(0,"../..") import hickle import numpy as np import scipy import scipy.sparse import astropy import collections import os.path def generate_py_object(): """ create a data structure covering all or at least the most obvious, prominent and most likely breaking differences between hickle 4.0.0/4.0.1 version and Versions > 4.1.0 Returns: list object containing all the relevant data objects and the filename of the file the data has been stored to or shall be stored to. """ scriptdir = os.path.split(__file__)[0] some_string = "this is some string to be dumped by hickle 4.0.0" some_bytes = b"this is the same in bytes instead of utf8" some_char_list = list(some_string) some_bytes_list = list(some_bytes) some_numbers = tuple(range(50)) some_floats = tuple( float(f) for f in range(50)) mixed = list( f for f in ( some_numbers[i//2] if i & 1 else some_floats[i//2] for i in range(100) ) ) wordlist = ["hello","world","i","like","you"] byteslist = [ s.encode("ascii") for s in wordlist] mixus = [some_string,some_numbers,12,11] numpy_array = np.array([ [ 0.8918443906408066, 0.5408942506873636, 0.43463333793335346, 0.21382281373491407, 0.14580527098359963, 0.6869306139451369, 0.22954988509310692, 0.2833880251470392, 0.8811201329390297, 0.4144190218983931, 0.06595369247674943 ], [ 0.8724300029833221, 0.7173303189807705, 0.5721666862018427, 0.8535567654595188, 0.5806566016388102, 0.9921250367638187, 0.07104048226766191, 0.47131100732975095, 0.8006065068241431, 0.2804909335297441, 0.1968823602346148 ], [ 0.0515177648326276, 0.1852582437284651, 0.22016412062225577, 0.6393104121476216, 0.7751103631149562, 0.12810902186723572, 0.09634877693000932, 0.2388423061420949, 0.5730001119950099, 0.1197268172277629, 0.11539619086292308 ], [ 0.031751102230864414, 0.21672180477587166, 0.4366501648161476, 0.9549518596659471, 0.42398684476912474, 0.04490851499559967, 0.7394234049135264, 0.7378312792413693, 0.9808812550712923, 0.2488404519024885, 0.5158454824458993 ], [ 0.07550969197984403, 0.08485317435746553, 0.15760274251917195, 0.18029979414515496, 0.9501707036126847, 0.1723868250469468, 0.7951538687631865, 0.2546219217084682, 0.9116518509985955, 0.6930255788272572, 0.9082828280630456 ], [ 0.6712307672376565, 0.367223385378443, 0.9522931417348294, 0.714592360187415, 0.18334824241062575, 0.9322238504996762, 0.3594776411821822, 0.6302097368268973, 0.6281766915388312, 0.7114942437206809, 0.6977764481953693 ], [ 0.9541502922560433, 0.47788295940203784, 0.6511716236981558, 0.4079446664375711, 0.2747969334307605, 0.3571662787734283, 0.10235638316970186, 0.8567343897483571, 0.6623468654315807, 0.21377047332104315, 0.860146852430476 ] ]) mask = np.array([ [0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0], [1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1], [0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0], [1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0], [0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1], [0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1] ]) numpy_array_masked = np.ma.array(numpy_array, dtype='float32', mask=mask) plenty_dict = { "string":1, b'bytes':2, 12:3, 0.55:4, complex(1,4):5, (1,):6, tuple(mixus):7, ():8, '9':9, None:10, 'a/b':11 } odrdered_dict = collections.OrderedDict(((3, [3, 0.1]), (7, [5, 0.1]), (5, [3, 0.1]))) row = np.array([0, 0, 1, 2, 2, 2]) col = np.array([0, 2, 2, 0, 1, 2]) data = np.array([1, 2, 3, 4, 5, 6]) csr_matrix = scipy.sparse.csr_matrix((data, (row, col)), shape=(3, 3)) csc_matrix = scipy.sparse.csc_matrix((data, (row, col)), shape=(3, 3)) indptr = np.array([0, 2, 3, 6]) indices = np.array([0, 2, 2, 0, 1, 2]) data = 
np.array([1, 2, 3, 4, 5, 6]).repeat(4).reshape(6, 2, 2) bsr_matrix = scipy.sparse.bsr_matrix((data, indices, indptr), shape=(6, 6)) numpy_string = np.array(some_string) numpy_bytes = np.array(some_bytes) numpy_wordlist = np.array(wordlist) numpy_dict = np.array({}) return [ some_string , some_bytes , some_char_list , some_bytes_list , some_numbers , some_floats , mixed , wordlist , byteslist , mixus , numpy_array , mask , numpy_array_masked , plenty_dict , odrdered_dict , csr_matrix , csc_matrix , bsr_matrix , numpy_string , numpy_bytes , numpy_wordlist , numpy_dict ],os.path.join(scriptdir,"legacy_hkls","hickle_4.0.0.hkl") if __name__ == '__main__': # create the file by dumping using hickle but only if # the available hickle version is >= 4.0.0 and < 4.1.0 hickle_version = hickle.__version__.split('.') if hickle_version[0] != 4 or hickle_version[1] > 0: raise RuntimeError("Shall be run using < 4.1 only") scriptdir = os.path.split(__file__)[0] now_dumping,testfile = generate_py_object() hickle.dump(now_dumping,testfile) hickle-5.0.2/hickle/tests/hickle_loaders/000077500000000000000000000000001430361177200203325ustar00rootroot00000000000000hickle-5.0.2/hickle/tests/hickle_loaders/__init__.py000066400000000000000000000000001430361177200224310ustar00rootroot00000000000000hickle-5.0.2/hickle/tests/hickle_loaders/load_builtins.py000066400000000000000000000004371430361177200235400ustar00rootroot00000000000000def create_package_test(myclass_type,h_group,name,**kwargs): return h_group,() def load_package_test(h_node,base_type,py_obj_type): return {12:12} class_register = [ ( dict,b'dict',create_package_test,load_package_test ) ] exclude_register = [b'please_kindly_ignore_me'] hickle-5.0.2/hickle/tests/legacy_hkls/000077500000000000000000000000001430361177200176475ustar00rootroot00000000000000hickle-5.0.2/hickle/tests/legacy_hkls/generate_test_hickle.py000066400000000000000000000010601430361177200243660ustar00rootroot00000000000000""" # Generate_test_hickle.py Helper to generate test hickle files for a given hickle version Bash command to generate things: > VER=2.1.0; pip uninstall hickle -y; pip install hickle==$VER; python generate_test_hickle.py $VER """ import hickle as hkl import numpy as np import sys ver_str = sys.argv[1].replace('.', '_') fn_out = 'hickle_%s.hkl' % ver_str dd = { 'dog_breed': b'Chihuahua', 'age': 10, 'height': 1.1, 'nums': [1, 2, 3], 'narr': np.array([1, 2, 3]), } print("Dumping %s..." % fn_out) hkl.dump(dd, fn_out, path='test') hickle-5.0.2/hickle/tests/legacy_hkls/hickle_3_4_9.hkl000066400000000000000000000423701430361177200225110ustar00rootroot00000000000000HDF  D` PTREEHEAPXtestH @CLASS @VERSION 8typehickle HPYTHON_VERSION0xTREEHEAPXdata_0H 8typehickle(!GCOLhickle3.4.93.7.4 hSNODpTREE@ 0HEAPX@'dog_breed''age''height''nums''narr'SNODHhHh @typeTREE`+HEAPXdata_0HSNOD,,. 23858?@?`A089`9; @type dict_item H key_type  `# _ 8typebytesHChihuahua ?SNODP"h/TREE@1HEAPX/data_0H,. 
[binary HDF5 data omitted: remainder of the legacy test fixture hickle/tests/legacy_hkls/hickle_3_4_9.hkl and the fixture hickle-5.0.2/hickle/tests/legacy_hkls/hickle_4.0.0.hkl — archived binary file content, not human-readable]
hickle-5.0.2/hickle/tests/test_01_hickle_helpers.py000066400000000000000000000260151430361177200222600ustar00rootroot00000000000000
#! /usr/bin/env python
# encoding: utf-8
"""
# test_hickle_helpers.py

Unit tests for hickle module -- helper functions.

"""

import pytest

# %% IMPORTS
# Package imports
import numpy as np
import pickle
import operator
import numpy as np
import h5py

# hickle imports
from hickle.helpers import (
    PyContainer,H5NodeFilterProxy,no_compression,convert_str_attr,convert_str_list_attr
)
from hickle.fileio import FileError,ClosedFileError,file_opener,not_io_base_like
from py.path import local

# Set current working directory to the temporary directory
local.get_temproot().chdir()

# %% DATA DEFINITIONS

dummy_data = (1,2,3)


# %% FIXTURES

@pytest.fixture
def h5_data(request):
    """
    create dummy hdf5 test data file for testing PyContainer and
    H5NodeFilterProxy
    """

    # create file and create a dataset the attributes of which will later on be
    # modified
    import h5py as h5
    dummy_file = h5.File('hickle_helpers_{}.hdf5'.format(request.function.__name__),'w')
    filename = dummy_file.filename
    test_data = dummy_file.create_dataset("somedata",data=dummy_data,dtype='i')
    test_data.attrs['type'] = np.array(pickle.dumps(tuple))
    test_data.attrs['base_type'] = b'tuple'
    test_data.attrs['someattr'] = 12
    test_data.attrs['someother'] = 11

    # write out the file reopen it read only
    dummy_file.flush()
    dummy_file.close()
    dummy_file = h5.File(filename,'r')

    # provide the file and close afterwards
    yield dummy_file
    dummy_file.close()


@pytest.fixture
def test_file_name(request):
    yield "{}.hkl".format(request.function.__name__)


# %% FUNCTION DEFINITIONS

def test_no_compression():
    """
    test no_compression filter for temporarily hiding compression related
    kwargs from h5py.create_dataset method
    """

    # simulate kwargs without compression related
    kwargs = {'hello':1,'word':2}
    assert dict(no_compression(kwargs)) == kwargs

    # simulate kwargs including all relevant keyword arguments
    kwargs2 = dict(kwargs)
    kwargs2.update({
        "compression":True,
        "shuffle":True,
        "compression_opts":8,
        "chunks":512,
        "fletcher32":True,
        "scaleoffset":20
    })
    assert dict(no_compression(kwargs2)) == kwargs


def test_py_container(h5_data):
    """
    test abstract PyContainer base class defining container interface
    and providing default implementations for append and filter
    """

    # test default implementation of append
    container = PyContainer({},b'list',list)
    container.append('data0',1,{})
    container.append('data1','b',{})

    # ensure that default implementation of convert enforces overload by
    # derived PyContainer classes by raising NotImplementedError
    with pytest.raises(NotImplementedError):
        my_list = container.convert()

    # test default implementation of PyContainer.filter method which
    # simply shall yield from passed in iterator
    assert [ item for item in dummy_data ] == list(dummy_data)
    assert dict(container.filter(h5_data)) == {'somedata':h5_data['somedata']}


def test_H5NodeFilterProxy(h5_data):
    """
    tests H5NodeFilterProxy class.
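    (Illustrative sketch only, not part of the original test: judging from the
    checks below, the proxy is expected to keep attribute overrides in memory
    while leaving the read-only file untouched, roughly

        proxy = H5NodeFilterProxy(h5_node)
        proxy.attrs['type'] = pickle.dumps(list)   # override kept in memory only
        assert pickle.loads(proxy.attrs['type']) is list

    while item and attribute access otherwise pass through to the wrapped node.)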
This class allows to temporarily rewrite attributes of h5py.Group and h5py.Dataset nodes before being loaded by hickle._load method. """ # load data and try to directly modify 'type' and 'base_type' Attributes # which will fail cause hdf5 file is opened for read only h5_node = h5_data['somedata'] with pytest.raises(OSError): try: h5_node.attrs['type'] = pickle.dumps(list) except RuntimeError as re: raise OSError(re).with_traceback(re.__traceback__) with pytest.raises(OSError): try: h5_node.attrs['base_type'] = b'list' except RuntimeError as re: raise OSError(re).with_traceback(re.__traceback__) # verify that 'type' expands to tuple before running # the remaining tests object_type = pickle.loads(h5_node.attrs['type']) assert object_type is tuple assert object_type(h5_node[()].tolist()) == dummy_data # Wrap node by H5NodeFilterProxy and rerun the above tests # again. This time modifying Attributes shall be possible. h5_node = H5NodeFilterProxy(h5_node) h5_node.attrs['type'] = pickle.dumps(list) h5_node.attrs['base_type'] = b'list' object_type = pickle.loads(h5_node.attrs['type']) assert object_type is list # test proper pass through of item and attribute access # to wrapped h5py.Group or h5py.Dataset object respective assert object_type(h5_node[()].tolist()) == list(dummy_data) assert h5_node.shape == np.array(dummy_data).shape with pytest.raises(AttributeError,match = r"can't\s+set\s+attribute"): h5_node.dtype = np.float32 def test_not_io_base_like(test_file_name): """ test not_io_base_like function for creating replacement methods for IOBase.isreadable, IOBase.isseekable and IOBase.writeable """ with open(test_file_name,'w') as f: assert not not_io_base_like(f)() assert not not_io_base_like(f,'strange_read',0)() assert not not_io_base_like(f,'seek',0,'strange_tell')() with open(test_file_name,'r') as f: assert not_io_base_like(f,('seek',0),('tell',))() assert not not_io_base_like(f,('seek',0),('tell',()))() assert not_io_base_like(f,('read',0))() assert not not_io_base_like(f,('tell',()))() assert not_io_base_like(f,('tell',))() def test_file_opener(h5_data,test_file_name): """ test file opener function """ # check that file like object is properly initialized for writing filename = test_file_name.replace(".hkl","_{}.{}") with open(filename.format("w","hdf5"),"w") as f: with pytest.raises(FileError): h5_file,path,close_flag = file_opener(f,"root","w",filename="filename") with open(filename.format("w","hdf5"),"w+b") as f: h5_file,path,close_flag = file_opener(f,"root","w+") assert isinstance(h5_file,h5py.File) and path == "/root" and h5_file.mode == 'r+' h5_file.close() # check that file like object is properly initialized for reading with open(filename.format("w","hdf5"),"rb") as f: h5_file,path,close_flag = file_opener(f,"root","r") assert isinstance(h5_file,h5py.File) and path == "/root" and h5_file.mode == 'r' assert close_flag h5_file.close() # check that only str are accepted as filenames with pytest.raises(ValueError): h5_file,path,close_flag = file_opener(f,"root","r",filename=12) # check that tuple specifying file object and filename string is accepted h5_file,path,close_flag = file_opener((f,"not me"),"root","r") assert isinstance(h5_file,h5py.File) and path == "/root" and h5_file.mode == 'r' assert close_flag h5_file.close() # check that dict specifying file object and filename is accepted h5_file,path,close_flag = file_opener({"file":f,"name":"not me"},"root","r") assert isinstance(h5_file,h5py.File) and path == "/root" and h5_file.mode == 'r' assert close_flag h5_file.close() # 
check that file is rejected if mode used to open and mode passed to file # opener do not match with pytest.raises(FileError): h5_file,path,close_flag = file_opener({"file":f,"name":"not me"},"root","r+") with pytest.raises(FileError): h5_file,path,close_flag = file_opener({"file":f,"name":"not me"},"root","w") with pytest.raises(ValueError): h5_file,path,close_flag = file_opener({"file":f,"name":"not me"},"root","+") # check that only binary files opened for reading and writing are accepted with # mode w with open(filename.format("w","hdf5"),"w") as f: with pytest.raises(FileError): h5_file,path,close_flag = file_opener({"file":f,"name":"not me"},"root","w") # check that closed file objects are rejected with pytest.raises(ClosedFileError): h5_file,path,close_flag = file_opener(f,"root","r") # check that h5py.File object is properly initialised for writing with pytest.raises(FileError): h5_file,path,close_flag = file_opener(h5_data,"","w") with h5py.File(filename.format("w","whkl"),"w") as hdf5_file: h5_file,path,close_flag = file_opener(hdf5_file,"","w") assert isinstance(h5_file,h5py.File) and path == "/" assert h5_file.mode == 'r+' and not close_flag hdf5_group = hdf5_file.create_group("some_group") with pytest.raises(ClosedFileError): h5_file,path,close_flag = file_opener(hdf5_file,"","w") with h5py.File(filename.format("w","whkl"),"r") as hdf5_file: h5_file,path,close_flag = file_opener(hdf5_file["some_group"],'',"r") assert isinstance(h5_file,h5py.File) and path == "/some_group" assert h5_file.mode == 'r' and not close_flag # check that a new file is created for provided filename and properly initialized h5_file,path,close_flag = file_opener(filename.format("w",".hkl"),"root_group","w") assert isinstance(h5_file,h5py.File) and path == "/root_group" assert h5_file.mode == 'r+' and close_flag h5_file.close() # check that any other object not being a file like object, a h5py.File object or # a filename string triggers an FileError exception with pytest.raises(FileError): h5_file,path,close_flag = file_opener(object(),"root_group","w") def test_str_attr_converter(): """ test attribute decoder helper functions used to mimic h5py >= 3.x behaviour when h5py 2.10 is installed """ ascii_str_val = 'some ascii encoded string attr' utf8_str_val = 'some utf8 encoded string attr' some_attrs = dict( some_attr_ascii = ascii_str_val.encode('ascii'), some_attr_utf8 = utf8_str_val.encode('utf8'), some_attr_list_ascii = [ strval.encode('ascii') for strval in ascii_str_val.split(' ') ], some_attr_list_utf8 = [ strval.encode('utf8') for strval in utf8_str_val.split(' ') ] ) assert convert_str_attr(some_attrs,'some_attr_ascii',encoding='ascii') == ascii_str_val assert convert_str_attr(some_attrs,'some_attr_utf8') == utf8_str_val assert " ".join(convert_str_list_attr(some_attrs,'some_attr_list_ascii',encoding='ascii')) == ascii_str_val assert " ".join(convert_str_list_attr(some_attrs,'some_attr_list_utf8')) == utf8_str_val # %% MAIN SCRIPT if __name__ == "__main__": from _pytest.fixtures import FixtureRequest test_no_compression() for data in h5_data(FixtureRequest(test_py_container)): test_py_container(data) for data in h5_data(FixtureRequest(test_py_container)): test_H5NodeFilterProxy(data) for filename in ( ( test_file_name(request), ) for request in (FixtureRequest(test_not_io_base_like),) ): test_not_io_base_like(filename) for h5_root,filename in ( ( h5_data(request),test_file_name(request) ) for request in (FixtureRequest(test_file_opener),) ): test_file_opener(h5_root,filename) 
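    # Hedged summary, not part of the original driver: the test_file_opener run
    # above exercises the call targets accepted by file_opener, for example
    # (filenames below are purely illustrative):
    #
    #     file_opener("data.hkl","root","w")                      # filename string
    #     file_opener(open("data.hdf5","rb"),"root","r")          # binary file object
    #     file_opener((fileobj,"name"),"root","r")                # (file, name) tuple
    #     file_opener({"file":fileobj,"name":"name"},"root","r")  # dict with file and name
    #     file_opener(h5py_file_or_group,"","r")                  # h5py.File / h5py.Group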
test_str_attr_converter() hickle-5.0.2/hickle/tests/test_02_hickle_lookup.py000066400000000000000000001763151430361177200221410ustar00rootroot00000000000000#! /usr/bin/env python # encoding: utf-8 """ # test_hickle_lookup.py Unit tests for hickle module -- lookup functions. """ # %% IMPORTS import pytest import sys import shutil import types import weakref import compileall import os # Package imports import re import collections import numpy as np import h5py import pickle from importlib.util import find_spec,spec_from_loader,spec_from_file_location from importlib import reload from copy import copy import os.path from py.path import local # hickle imports from hickle.helpers import PyContainer,not_dumpable from hickle.loaders import optional_loaders, attribute_prefix import hickle.lookup as lookup # Set current working directory to the temporary directory local.get_temproot().chdir() # %% DATA DEFINITIONS dummy_data = (1,2,3) # %% FIXTURES @pytest.fixture def h5_data(request): """ create dummy hdf5 test data file for testing PyContainer, H5NodeFilterProxy and ReferenceManager. Uses name of executed test as part of filename """ dummy_file = h5py.File('hickle_lookup_{}.hdf5'.format(request.function.__name__),'w') filename = dummy_file.filename test_data = dummy_file.create_group("root_group") yield test_data if dummy_file: dummy_file.close() @pytest.fixture() def loader_table(): """ create a class_register and a exclude_register table for testing register_class and register_class_exclude functions 0: dataset only loader 1: PyContainer only loader 2: not dumped loader 3: external loader module trying to overwrite hickle core loader 4: hickle loader module trying to overload hickle core loader 3: loader defined by hickle core """ # clear loaded_loaders, types_dict, hkl_types_dict and hkl_contianer_dict # to ensure no loader preset by hickle core or hickle loader module # intervenes with test global lookup lookup.LoaderManager.__loaded_loaders__.clear() tuple( True for opt in lookup.LoaderManager.__py_types__.values() if opt.clear() ) tuple( True for opt in lookup.LoaderManager.__hkl_functions__.values() if opt.clear() ) tuple( True for opt in lookup.LoaderManager.__hkl_container__.values() if opt.clear() ) # simulate loader definitions found within loader modules def create_test_dataset(myclass_type,h_group,name,**kwargs): return h_group,() def load_test_dataset(h_node,base_type,py_obj_type): return 12 class TestContainer(PyContainer): def convert(self): return self._content[0] class NotHicklePackage(TestContainer): """ checks if container_class provided by module outside hickle package tries to define alternative loader for IteratorProxy class handled by hickle core directly """ __module__ = "nothickle.loaders.load_builtins" class HickleLoadersModule(TestContainer): """ checks if container_class provided by hickle.loaders module tries to define alternative loader for IteratorProxy class handled by hickle core directly """ __module__ = "hickle.loaders.load_builtins" class IsHickleCore(TestContainer): """ Simulates loader registered by hickle.hickle module """ __module__ = "hickle.hickle" # provide the table yield [ (int,b'int',create_test_dataset,load_test_dataset,None,False), (list,b'list',create_test_dataset,None,TestContainer,True), (tuple,b'tuple',None,load_test_dataset,TestContainer), (lookup._DictItem,b'dict_item',None,None,NotHicklePackage), (lookup._DictItem,b'pickle',None,None,HickleLoadersModule), (lookup._DictItem,b'dict_item',lookup.LoaderManager.register_class,None,IsHickleCore) 
] # cleanup and reload hickle.lookup module to reset it to its initial state # in case hickle.hickle has already been preloaded by pytest also reload it # to ensure no side effects occur during later tests lookup.LoaderManager.__loaded_loaders__.clear() tuple( True for opt in lookup.LoaderManager.__py_types__.values() if opt.clear() ) tuple( True for opt in lookup.LoaderManager.__hkl_functions__.values() if opt.clear() ) tuple( True for opt in lookup.LoaderManager.__hkl_container__.values() if opt.clear() ) reload(lookup) lookup = sys.modules[lookup.__name__] hickle_hickle = sys.modules.get("hickle.hickle",None) if hickle_hickle is not None: reload(hickle_hickle) # %% CLASS DEFINITIONS class ToBeInLoadersOrNotToBe(): """ Dummy class used to check that only loaders for Python objects are accepted by load_loader which are either declared outside hickle or are pre registered by hickle core through directly calling register_class or are declared by a load_.py module within the pickle.loaders package Also it is used in simulating reduced object tuple with all trailing None items removed """ __slots__ = () def __reduce_ex__(self,proto = pickle.DEFAULT_PROTOCOL): reduced = super(ToBeInLoadersOrNotToBe,self).__reduce_ex__(proto) for index,item in enumerate(reduced[:1:-1],0): if item is not None: return reduced[:(-index if index > 0 else None)] return reduced def __reduce__(self): reduced = super(ToBeInLoadersOrNotToBe,self).__reduce__() for index,item in enumerate(reduced[:1:-1],0): if item is not None: return reduced[:(-index if index > 0 else None)] return reduced def __eq__(self,other): return other.__class__ is self.__class__ def __ne__(self,other): return self != other class ClassToDump(): """ Primary class used to test create_pickled_dataset function """ def __init__(self,hallo,welt,with_default=1): self._data = hallo,welt,with_default def __eq__(self,other): return other.__class__ is self.__class__ and self._data == other._data def __ne__(self,other): return self != other class ClassToDumpCompact(ClassToDump): """ Class which may be handled by 'compact_expand' loader """ def __compact__(self): return self._data def __expand__(self,compact): self._data = compact class ClassToDumpCompactOff(ClassToDump): """ Class which enforces that any instance is pickled independent whether 'compact_expand' loader was selected for hickle.dump call or not """ def __compact__(self): return None class ClassToDumpCompactStrange(ClassToDump): """ Class which does not properly implement '__compact__' and '__expand__' methods recommended by compact expand protocol """ def __compact__(self): return self._data class ClassToDumpCompactStrange2(ClassToDump): """ Another class which does not properly implement '__compact__' and '__expand__' methods recommended by compact expand protocol """ def __compact__(self): return 42 class ClassToDumpCompactDataset(ClassToDump): """ Class which is to be represented by a h5py.Dataset instead of a h5py.Group in its compacted form """ def __compact__(self): return "{}|{}|{}".format(*self._data) def __expand__(self,compact): self._data = compact.split("|") self._data[2] = int(self._data[2]) self._data = (*self._data,) class SimpleClass(): """ simple class used to check that instance __dict__ is properly dumped and restored by create_pickled_dataset and PickledContainer """ def __init__(self): self.someattr = "I'm some attr" self.someother = 12 def __eq__(self,other): return other.__class__ is self.__class__ and self.__dict__ == other.__dict__ def __ne__(self,other): return self != 
other class NoExtendList(list): """ special list class used to test whether append is properly used when list like object is dumped and restored through create_pickled_dataset and PickledContainer """ def __getattribute__(self,name): if name == "extend": raise AttributeError("no extend") return super(NoExtendList,self).__getattribute__(name) # %% FUNCTION DEFINITIONS def function_to_dump(hallo,welt,with_default=1): """ non class function to be dumped and restored through create_pickled_dataset and load_pickled_data """ return hallo,welt,with_default def load_anyhing(h_node, base_type , py_obj_type): # pragma: no cover """ loads nothing """ return None def test_AttemptRecoverCustom_classes(h5_data): recovered_group = lookup.RecoveredGroup({'hello':1},attrs={'world':2,'type':42}) assert recovered_group == {'hello':1} and recovered_group.attrs == {'world':2} array_to_recover = np.random.random_sample([4,2]) dataset_to_recover = h5_data.create_dataset('to_recover',data=array_to_recover) dataset_to_recover.attrs['world'] = 2 dataset_to_recover.attrs['type'] = 42 recovered_dataset = lookup.RecoveredDataset(dataset_to_recover[()],dtype=dataset_to_recover.dtype,attrs=dataset_to_recover.attrs) assert np.allclose(recovered_dataset,array_to_recover) assert recovered_dataset.dtype == array_to_recover.dtype assert recovered_dataset.attrs == {'world':2} #recovered = lookup.recover_custom_dataset(dataset_to_recover,'unknown',dataset_to_recover.attrs['type']) #assert recovered.dtype == array_to_recover.dtype and recovered == array_to_recover #assert recovered.attrs == {'world':2} def test_LoaderManager_register_class(loader_table): """ tests the register_class method """ # try to register dataset only loader specified by loader_table # and retrieve its contents from types_dict and hkl_types_dict loader_spec = loader_table[0] lookup.LoaderManager.register_class(*loader_spec) assert lookup.LoaderManager.__py_types__[None][loader_spec[0]] == (*loader_spec[2:0:-1],loader_spec[5]) assert lookup.LoaderManager.__hkl_functions__[None][loader_spec[1]] == loader_spec[3] with pytest.raises(KeyError): lookup.LoaderManager.__hkl_container__[None][loader_spec[1]] is None # try to register PyContainer only loader specified by loader_table # and retrieve its contents from types_dict and hkl_container_dict loader_spec = loader_table[1] lookup.LoaderManager.register_class(*loader_spec) assert lookup.LoaderManager.__py_types__[None][loader_spec[0]] == (*loader_spec[2:0:-1],loader_spec[5]) with pytest.raises(KeyError): lookup.LoaderManager.__hkl_functions__[None][loader_spec[1]] is None assert lookup.LoaderManager.__hkl_container__[None][loader_spec[1]] == loader_spec[4] # try to register container without dump_function specified by # loader table and try to retrieve load_function and PyContainer from # hkl_types_dict and hkl_container_dict loader_spec = loader_table[2] lookup.LoaderManager.register_class(*loader_spec) with pytest.raises(KeyError): lookup.LoaderManager.__py_types__[None][loader_spec[0]][1] == loader_spec[1] assert lookup.LoaderManager.__hkl_functions__[None][loader_spec[1]] == loader_spec[3] assert lookup.LoaderManager.__hkl_container__[None][loader_spec[1]] == loader_spec[4] # try to register loader shadowing loader preset by hickle core # defined by external loader module loader_spec = loader_table[3] with pytest.raises(TypeError,match = r"loader\s+for\s+'\w+'\s+type\s+managed\s+by\s+hickle\s+only"): lookup.LoaderManager.register_class(*loader_spec) loader_spec = loader_table[4] # try to register loader 
shadowing loader preset by hickle core # defined by hickle loaders module with pytest.raises(TypeError,match = r"loader\s+for\s+'\w+'\s+type\s+managed\s+by\s+hickle\s+core\s+only"): lookup.LoaderManager.register_class(*loader_spec) # simulate registering loader preset by hickle core loader_spec = loader_table[5] lookup.LoaderManager.register_class(*loader_spec) loader_spec = loader_table[0] lookup.LoaderManager.__hkl_functions__[None][b'!node-reference!'] = loader_spec[3:5] with pytest.raises(ValueError): lookup.LoaderManager.register_class(loader_spec[0],b'!node-reference!',*loader_spec[2:],'custom') lookup.LoaderManager.__hkl_functions__[None].pop(b'!node-reference!') with pytest.raises(lookup.LookupError): lookup.LoaderManager.register_class(*loader_spec,'mine') def test_LoaderManager_register_class_exclude(loader_table): """ test register class exclude function """ # try to disable loading of loader preset by hickle core base_type = loader_table[5][1] lookup.LoaderManager.register_class(*loader_table[2]) lookup.LoaderManager.register_class(*loader_table[5]) with pytest.raises(ValueError,match = r"excluding\s+'.+'\s+base_type\s+managed\s+by\s+hickle\s+core\s+not\s+possible"): lookup.LoaderManager.register_class_exclude(base_type) # disable any of the other loaders base_type = loader_table[2][1] lookup.LoaderManager.register_class_exclude(base_type) with pytest.raises(lookup.LookupError): lookup.LoaderManager.register_class_exclude(base_type,'compact') def patch_importlib_util_find_spec(name,package=None): """ function used to temporarily redirect search for loaders to hickle_loader directory in test directory for testing loading of new loaders """ return find_spec("hickle.tests." + name.replace('.','_',1),package) def patch_importlib_util_find_spec_no_load_builtins(name,package=None): """ function used to temporarily redirect search for loaders to hickle_loader directory in test directory for testing loading of new loaders """ if name in {'hickle.loaders.load_builtins'}: return None return find_spec("hickle.tests." + name.replace('.','_',1),package) def patch_importlib_util_spec_from_tests_loader(name, loader, *, origin=None, is_package=None): """ function used to temporarily redirect search for loaders to hickle_loader directory in test directory for testing loading of new loaders """ name = name.replace('.','_',1) myloader = copy(sys.modules['hickle.tests'].__loader__) myloader.name = "hickle.tests." + name myloader.path = os.path.join(os.path.dirname(myloader.path),'{}.py'.format(name)) return spec_from_loader(myloader.name,myloader,origin=origin,is_package=is_package) def patch_importlib_util_spec_from_loader(name, loader, *, origin=None, is_package=None): """ function used to temporarily redirect search for loaders to hickle_loader directory in test directory for testing loading of new loaders """ return spec_from_loader("hickle.tests." + name.replace('.','_',1),loader,origin=origin,is_package=is_package) def patch_importlib_util_spec_from_file_location(name, location, *, loader=None, submodule_search_locations=None): """ function used to temporarily redirect search for loaders to hickle_loader directory in test directory for testing loading of new loaders """ return spec_from_file_location("hickle.tests." 
+ name.replace('.','_',1),location,loader=loader,submodule_search_locations =submodule_search_locations) def patch_importlib_util_find_no_spec(name,package=None): """ function used to simulate situation where no appropriate loader could be found for object """ return None def patch_importlib_util_no_spec_from_loader(name, loader, *, origin=None, is_package=None): """ function used to simulate situation where no appropriate loader could be found for object """ return None def patch_importlib_util_no_spec_from_file_location(name, location, *, loader=None, submodule_search_locations=None): """ function used to simulate situation where no appropriate loader could be found for object """ return None def patch_hide_collections_loader(name,package=None): if name in ('hickle.loaders.load_collections'): return None return find_spec(name,package) def test_LoaderManager(loader_table,h5_data): """ tests LoaderManager constructor """ manager = lookup.LoaderManager(h5_data,False) assert isinstance(manager.types_dict,collections.ChainMap) assert manager.types_dict.maps[0] is lookup.LoaderManager.__py_types__[None] assert isinstance(manager.hkl_types_dict,collections.ChainMap) assert manager.hkl_types_dict.maps[0] is lookup.LoaderManager.__hkl_functions__[None] assert isinstance(manager.hkl_container_dict,collections.ChainMap) assert manager.hkl_container_dict.maps[0] is lookup.LoaderManager.__hkl_container__[None] assert manager._mro is type.mro assert manager._file.id == h5_data.file.id manager = lookup.LoaderManager(h5_data,True) assert manager.types_dict.maps[0] is lookup.LoaderManager.__py_types__['hickle-4.x'] assert manager.types_dict.maps[1] is lookup.LoaderManager.__py_types__[None] assert manager.hkl_types_dict.maps[0] is lookup.LoaderManager.__hkl_functions__['hickle-4.x'] assert manager.hkl_types_dict.maps[1] is lookup.LoaderManager.__hkl_functions__[None] assert manager.hkl_container_dict.maps[0] is lookup.LoaderManager.__hkl_container__['hickle-4.x'] assert manager.hkl_container_dict.maps[1] is lookup.LoaderManager.__hkl_container__[None] assert manager._mro is lookup.type_legacy_mro assert manager._file.id == h5_data.file.id ###### amend ##### manager = lookup.LoaderManager(h5_data,False,{'custom':True}) assert manager.types_dict.maps[0] is lookup.LoaderManager.__py_types__['custom'] assert manager.types_dict.maps[1] is lookup.LoaderManager.__py_types__[None] assert manager.hkl_types_dict.maps[0] is lookup.LoaderManager.__hkl_functions__['custom'] assert manager.hkl_types_dict.maps[1] is lookup.LoaderManager.__hkl_functions__[None] assert manager.hkl_container_dict.maps[0] is lookup.LoaderManager.__hkl_container__['custom'] assert manager.hkl_container_dict.maps[1] is lookup.LoaderManager.__hkl_container__[None] assert manager._file.id == h5_data.file.id assert h5_data.attrs.get('{}CUSTOM'.format(attribute_prefix),None) manager = lookup.LoaderManager(h5_data,False,None) assert manager.types_dict.maps[0] is lookup.LoaderManager.__py_types__['custom'] assert manager.types_dict.maps[1] is lookup.LoaderManager.__py_types__[None] assert manager.hkl_types_dict.maps[0] is lookup.LoaderManager.__hkl_functions__['custom'] assert manager.hkl_types_dict.maps[1] is lookup.LoaderManager.__hkl_functions__[None] assert manager.hkl_container_dict.maps[0] is lookup.LoaderManager.__hkl_container__['custom'] assert manager.hkl_container_dict.maps[1] is lookup.LoaderManager.__hkl_container__[None] assert manager._file.id == h5_data.file.id h5_data.attrs.pop('{}CUSTOM'.format(attribute_prefix),None) manager = 
lookup.LoaderManager(h5_data,False,{'custom':False}) assert manager.types_dict.maps[0] is lookup.LoaderManager.__py_types__[None] assert manager.hkl_types_dict.maps[0] is lookup.LoaderManager.__hkl_functions__[None] assert manager.hkl_container_dict.maps[0] is lookup.LoaderManager.__hkl_container__[None] assert h5_data.attrs.get('{}CUSTOM'.format(attribute_prefix),h5_data) is h5_data with pytest.raises(lookup.LookupError): manager = lookup.LoaderManager(h5_data,False,{'compact':True}) def test_LoaderManager_drop_manager(h5_data): """ test static LoaderManager._drop_table method """ loader = lookup.LoaderManager(h5_data) lookup.LoaderManager.__managers__[h5_data.file.id] = (loader,) some_other_file = h5py.File('someother.hdf5','w') some_other_root = some_other_file.create_group('root') lookup.LoaderManager._drop_manager(some_other_root.file.id) lookup.LoaderManager.__managers__[some_other_file.file.id] = (lookup.LoaderManager(some_other_root),) assert lookup.LoaderManager.__managers__.get(h5_data.file.id,None) == (loader,) lookup.LoaderManager._drop_manager(h5_data.file.id) assert lookup.LoaderManager.__managers__.get(h5_data.file.id,None) is None lookup.LoaderManager._drop_manager(some_other_root.file.id) assert not lookup.LoaderManager.__managers__ some_other_file.close() def test_LoaderManager_create_manager(h5_data): """ test public static LoaderManager.create_manager function """ second_tree = h5_data.file.create_group('seondary_root') loader = lookup.LoaderManager.create_manager(h5_data) assert lookup.LoaderManager.__managers__[h5_data.file.id][0] is loader with pytest.raises(lookup.LookupError): second_table = lookup.LoaderManager.create_manager(second_tree) lookup.LoaderManager._drop_manager(h5_data.file.id) def test_LoaderManager_context(h5_data): """ test use of LoaderManager as context manager """ with lookup.LoaderManager.create_manager(h5_data) as loader: assert lookup.LoaderManager.__managers__[h5_data.file.id][0] is loader assert loader._file is None with pytest.raises(RuntimeError): with loader as loader2: pass loader.__exit__(None,None,None) def test_LoaderManager_load_loader(loader_table,h5_data,monkeypatch): """ test LoaderManager.load_loader method """ # some data to check loader for # assume loader should be load_builtins loader py_object = dict() loader_name = "hickle.loaders.load_builtins" with monkeypatch.context() as moc_import_lib: with lookup.LoaderManager.create_manager(h5_data) as loader: # hide loader from hickle.lookup.loaded_loaders and check that # fallback loader for python object is returned moc_import_lib.setattr("importlib.util.find_spec",patch_importlib_util_find_no_spec) moc_import_lib.setattr("hickle.lookup.find_spec",patch_importlib_util_find_no_spec) moc_import_lib.setattr("importlib.util.spec_from_loader",patch_importlib_util_no_spec_from_loader) moc_import_lib.setattr("hickle.lookup.spec_from_loader",patch_importlib_util_no_spec_from_loader) moc_import_lib.setattr("importlib.util.spec_from_file_location",patch_importlib_util_no_spec_from_file_location) moc_import_lib.setattr("hickle.lookup.spec_from_file_location",patch_importlib_util_no_spec_from_file_location) moc_import_lib.delitem(sys.modules,"hickle.loaders.load_builtins",raising=False) py_obj_type,nopickleloader = loader.load_loader(py_object.__class__) assert py_obj_type is object and nopickleloader == (lookup.create_pickled_dataset,b'pickle',True) lookup._custom_loader_enabled_builtins[py_obj_type.__class__.__module__] = ('','') py_obj_type,nopickleloader = 
loader.load_loader(py_object.__class__) assert py_obj_type is object and nopickleloader == (lookup.create_pickled_dataset,b'pickle',True) backup_builtins = sys.modules['builtins'] moc_import_lib.delitem(sys.modules,'builtins') with pytest.warns(lookup.PackageImportDropped):# TODO when warning is added run check for warning py_obj_type,nopickleloader = loader.load_loader(py_object.__class__) assert py_obj_type is object and nopickleloader == (lookup.create_pickled_dataset,b'pickle',True) moc_import_lib.setitem(sys.modules,'builtins',backup_builtins) # redirect load_builtins loader to tests/hickle_loader path moc_import_lib.setattr("importlib.util.spec_from_file_location",patch_importlib_util_spec_from_file_location) moc_import_lib.setattr("hickle.lookup.spec_from_file_location",patch_importlib_util_spec_from_file_location) #py_obj_type,nopickleloader = loader.load_loader(py_object.__class__) #assert py_obj_type is dict and nopickleloader == (lookup.create_pickled_dataset,b'pickle',True) moc_import_lib.setattr("importlib.util.find_spec",patch_importlib_util_find_spec) moc_import_lib.setattr("hickle.lookup.find_spec",patch_importlib_util_find_spec) moc_import_lib.setattr("importlib.util.spec_from_loader",patch_importlib_util_spec_from_loader) moc_import_lib.setattr("hickle.lookup.spec_from_loader",patch_importlib_util_spec_from_loader) # try to find appropriate loader for dict object, a mock of this # loader should be provided by hickle/tests/hickle_loaders/load_builtins # module ensure that this module is the one found by load_loader function import hickle.tests.hickle_loaders.load_builtins as load_builtins moc_import_lib.setitem(sys.modules,loader_name,load_builtins) moc_import_lib.setattr("importlib.util.spec_from_loader",patch_importlib_util_spec_from_tests_loader) moc_import_lib.setattr("hickle.lookup.spec_from_loader",patch_importlib_util_spec_from_tests_loader) py_obj_type,nopickleloader = loader.load_loader(py_object.__class__) assert py_obj_type is dict and nopickleloader == (load_builtins.create_package_test,b'dict',True) # simulate loading of package or local loader from hickle_loaders directory backup_load_builtins = sys.modules.pop('hickle.loaders.load_builtins',None) backup_py_obj_type = loader.types_dict.pop(dict,None) backup_loaded_loaders = lookup.LoaderManager.__loaded_loaders__.discard('hickle.loaders.load_builtins') moc_import_lib.setattr("importlib.util.find_spec",patch_importlib_util_find_spec_no_load_builtins) moc_import_lib.setattr("hickle.lookup.find_spec",patch_importlib_util_find_spec_no_load_builtins) py_obj_type,nopickleloader = loader.load_loader(py_object.__class__) assert py_obj_type is dict assert nopickleloader == (sys.modules['hickle.loaders.load_builtins'].create_package_test,b'dict',True) ## back to start test successful fallback to legacy .pyc in case no source is available for package sys.modules.pop('hickle.loaders.load_builtins','None') loader.types_dict.pop(dict,None) lookup.LoaderManager.__loaded_loaders__.discard('hickle.loaders.load_builtins') pyc_path = load_builtins.__file__ + 'c' if not os.path.isfile(pyc_path): compileall.compile_file(load_builtins.__file__,legacy=True) assert os.path.isfile(pyc_path) base_dir,base_name = os.path.split(load_builtins.__file__) hidden_source = os.path.join(base_dir,'.{}h'.format(base_name)) os.rename(load_builtins.__file__,hidden_source) py_obj_type,nopickleloader = loader.load_loader(py_object.__class__) assert py_obj_type is dict assert nopickleloader == 
(sys.modules['hickle.loaders.load_builtins'].create_package_test,b'dict',True) #once again just checking that if no legacy .pyc next base is tried sys.modules.pop('hickle.loaders.load_builtins','None') loader.types_dict.pop(dict,None) lookup.LoaderManager.__loaded_loaders__.discard('hickle.loaders.load_builtins') os.remove(pyc_path) py_obj_type,nopickleloader = loader.load_loader(py_object.__class__) assert py_obj_type is object assert nopickleloader == (lookup.create_pickled_dataset,b'pickle',True) os.rename(hidden_source,load_builtins.__file__) moc_import_lib.setattr("importlib.util.spec_from_loader",patch_importlib_util_spec_from_loader) moc_import_lib.setattr("hickle.lookup.spec_from_loader",patch_importlib_util_spec_from_loader) moc_import_lib.setattr("importlib.util.find_spec",patch_importlib_util_find_spec) moc_import_lib.setattr("hickle.lookup.find_spec",patch_importlib_util_find_spec) sys.modules['hickle.loaders.load_builtins'] = backup_load_builtins loader.types_dict[dict] = backup_py_obj_type # not added by missing legacy .pyc test re-add manually here lookup.LoaderManager.__loaded_loaders__.add('hickle.loaders.load_builtins') lookup._custom_loader_enabled_builtins.pop(py_obj_type.__class__.__module__,None) # preload dataset only loader and check that it can be resolved directly loader_spec = loader_table[0] lookup.LoaderManager.register_class(*loader_spec) assert loader.load_loader((12).__class__) == (loader_spec[0],(*loader_spec[2:0:-1],loader_spec[5])) # try to find appropriate loader for dict object, a mock of this # should have already been imported above assert loader.load_loader(py_object.__class__) == (dict,(load_builtins.create_package_test,b'dict',True)) # remove loader again and undo redirection again. dict should now be # processed by create_pickled_dataset moc_import_lib.delitem(sys.modules,loader_name) del lookup.LoaderManager.__py_types__[None][dict] py_obj_type,nopickleloader = loader.load_loader(py_object.__class__) assert py_obj_type is object and nopickleloader == (lookup.create_pickled_dataset,b'pickle',True) # check that load_loader prevents redefinition of loaders to be predefined by hickle core with pytest.raises( RuntimeError, match = r"objects\s+defined\s+by\s+hickle\s+core\s+must\s+be" r"\s+registered\s+before\s+first\s+dump\s+or\s+load" ): py_obj_type,nopickleloader = loader.load_loader(ToBeInLoadersOrNotToBe) # check that hickle only managed load only loaders without a registered dump function # are properly recognized and reported by LoaderManager.load_loader loader.register_class( ToBeInLoadersOrNotToBe, b's.pear', None, load_anyhing,None,True) with pytest.raises( RuntimeError, match = r"objects\s+defined\s+by\s+hickle\s+core\s+must\s+be" r"\s+registered\s+before\s+first\s+dump\s+or\s+load" ): py_obj_type,nopickleloader = loader.load_loader(ToBeInLoadersOrNotToBe) py_obj_type,(create_dataset,base_type,memoise) = loader.load_loader(ToBeInLoadersOrNotToBe,base_type=b's.pear') assert py_obj_type is ToBeInLoadersOrNotToBe assert create_dataset is lookup.not_dumpable and base_type == b's.pear' and memoise == True loader.hkl_types_dict.pop(b's.pear',None) loader.hkl_container_dict.pop(b's.pear',None) loader.types_dict.pop(ToBeInLoadersOrNotToBe,None) moc_import_lib.setattr(ToBeInLoadersOrNotToBe,'__module__','hickle.loaders') # check that load_loaders issues drop warning upon loader definitions for # dummy objects defined within hickle package but outside loaders modules with pytest.warns( RuntimeWarning, match = 
r"ignoring\s+'.+'\s+dummy\s+type\s+not\s+defined\s+by\s+loader\s+module" ): py_obj_type,nopickleloader = loader.load_loader(ToBeInLoadersOrNotToBe) assert py_obj_type is object #ToBeInLoadersOrNotToBe assert nopickleloader == (lookup.create_pickled_dataset,b'pickle',True) # check that loader definitions for dummy objects defined by loaders work as expected # by loader module moc_import_lib.setattr(ToBeInLoadersOrNotToBe,'__module__',loader_name) py_obj_type,(create_dataset,base_type,memoise) = loader.load_loader(ToBeInLoadersOrNotToBe) assert py_obj_type is ToBeInLoadersOrNotToBe and base_type == b'NotHicklable' assert create_dataset is not_dumpable assert memoise == False # remove loader_name from list of loaded loaders and check that loader is loaded anew # and that values returned for dict object correspond to loader # provided by freshly loaded loader module lookup.LoaderManager.__loaded_loaders__.remove(loader_name) py_obj_type,(create_dataset,base_type,memoise) = loader.load_loader(py_object.__class__) load_builtins_moc = sys.modules.get(loader_name,None) assert load_builtins_moc is not None loader_spec = load_builtins_moc.class_register[0] assert py_obj_type is dict and create_dataset is loader_spec[2] assert base_type is loader_spec[1] assert memoise == True # check that package path is properly resolved if package module # for which to find loader for is not found on sys. modules or # its __spec__ attribute is set to None typically on __main__ or # builtins and other c modules lookup.LoaderManager.__loaded_loaders__.remove(loader_name) backup_module = ClassToDump.__module__ moc_import_lib.setattr(ClassToDump,'__module__',re.sub(r'^\s*hickle\.','',ClassToDump.__module__)) py_obj_type,(create_dataset,base_type,memoise) = loader.load_loader(ClassToDump) assert py_obj_type is object #ClassToDump assert create_dataset is lookup.create_pickled_dataset assert base_type == b'pickle' and memoise == True ClassToDump.__module__ = backup_module moc_import_lib.setattr("hickle.lookup.find_spec",patch_hide_collections_loader) py_obj_type,(create_dataset,base_type,memoise) = loader.load_loader(collections.OrderedDict) moc_import_lib.setattr("hickle.lookup.find_spec",patch_importlib_util_find_spec) assert py_obj_type is collections.OrderedDict assert create_dataset is sys.modules[loader_name].create_package_test assert base_type == b'dict' and memoise == True def test_type_legacy_mro(): """ tests type_legacy_mro function which is used in replacement for native type.mro function when loading 4.0.0 and 4.0.1 files it handles cases where type objects passed to load_loader are functions not classes """ # check that for class object type_legacy_mro function returns # the mro list provided by type.mro unchanged assert lookup.type_legacy_mro(SimpleClass) == type.mro(SimpleClass) # check that in case function is passed as type object a tuple with # function as single element is returned assert lookup.type_legacy_mro(function_to_dump) == (function_to_dump,) def test_create_pickled_dataset(h5_data,compression_kwargs): """ tests create_pickled_dataset, load_pickled_data function and PickledContainer """ # check if create_pickled_dataset issues SerializedWarning for objects which # either do not support copy protocol py_object = ClassToDump('hello',1) pickled_py_object = pickle.dumps(py_object) data_set_name = "greetings" with pytest.warns(lookup.SerializedWarning,match = r".*type\s+not\s+understood,\s+data\s+is\s+serialized:.*") as warner: h5_node,subitems = lookup.create_pickled_dataset(py_object, 
h5_data,data_set_name,**compression_kwargs) assert isinstance(h5_node,h5py.Dataset) and not subitems and iter(subitems) assert bytes(h5_node[()]) == pickled_py_object and h5_node.name.rsplit('/',1)[-1] == data_set_name assert lookup.load_pickled_data(h5_node,b'pickle',object) == py_object backup_class_to_dump = globals()['ClassToDump'] backup_class_to_dump = globals().pop('ClassToDump',None) recovered = lookup.load_pickled_data(h5_node,b'pickle',object) assert isinstance(recovered,lookup.RecoveredDataset) assert bytes(recovered) == pickled_py_object globals()['ClassToDump'] = backup_class_to_dump def test__DictItemContainer(): """ tests _DictItemContainer class which represent dict_item group used by version 4.0.0 files to represent values of dictionary key """ container = lookup._DictItemContainer({},b'dict_item',lookup._DictItem) my_bike_lock = (1,2,3,4) container.append('my_bike_lock',my_bike_lock,{}) assert container.convert() is my_bike_lock #@pytest.mark.no_compression def test__moc_numpy_array_object_lambda(): """ test the _moc_numpy_array_object_lambda function which mimics the effect of lambda function created py pickle when expanding pickle `'type'` string set for numpy arrays containing a single object not expandable into a list. Mocking is necessary from Python 3.8.X on as it seems in Python 3.8 and onward trying to pickle a lambda now causes a TypeError whilst it seems to be silently accepted in Python < 3.8 """ data = ['hello','world'] assert lookup._moc_numpy_array_object_lambda(data) == data[0] #@pytest.mark.no_compression def test_fix_lambda_obj_type(): """ test _moc_numpy_array_object_lambda function it self. When invoked it should return the first element of the passed list """ assert lookup.fix_lambda_obj_type(None) is object picklestring = pickle.dumps(SimpleClass) assert lookup.fix_lambda_obj_type(picklestring) is SimpleClass with pytest.warns(lookup.MockedLambdaWarning): assert lookup.fix_lambda_obj_type('') is lookup._moc_numpy_array_object_lambda def test_ReferenceManager_get_root(h5_data): """ tests the static ReferenceManager._get_root method """ # create an artificial 'hickle_types_table' with some entries # and link their h5py.Reference objects to the 'type' attributes # of some data such that ReferenceManager._get_root can resolve # h5_data root_group independent which node it was passed root_group = h5_data['/root_group'] data_group = root_group.create_group('data') content = data_group.create_dataset('mydata',data=12) type_table = root_group.create_group('hickle_types_table') int_pickle_string = bytearray(pickle.dumps(int)) int_np_entry = np.array(int_pickle_string,copy=False) int_np_entry.dtype = 'S1' int_entry = type_table.create_dataset(str(len(type_table)),data = int_np_entry,shape =(1,int_np_entry.size)) int_base_type = b'int' int_base_type = type_table.create_dataset(int_base_type,shape=None,dtype="S1") int_entry.attrs['base_type'] = int_base_type.ref content.attrs['type'] = int_entry.ref # try to resolve root_group from various kinds of nodes including # root_group it self. 
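    # Layout sketch of the artificial table built above (orientation only,
    # paths assume the 'root_group' created by the h5_data fixture):
    #
    #   /root_group
    #       data/mydata             attrs['type'] -> ref to /root_group/hickle_types_table/0
    #       hickle_types_table/
    #           0                   pickled int, attrs['base_type'] -> ref to entry 'int'
    #           int                 empty 'S1' dataset naming the base type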
assert lookup.ReferenceManager.get_root(content).id == root_group.id assert lookup.ReferenceManager.get_root(root_group).id == root_group.id assert lookup.ReferenceManager.get_root(data_group).id == data_group.id # check fallbacks to passe in group or file in case resolution via # 'type' attribute reference fails list_group = data_group.create_group('somelist') some_list_item = list_group.create_dataset('0',data=13) assert lookup.ReferenceManager.get_root(some_list_item).id == some_list_item.file.id assert lookup.ReferenceManager.get_root(list_group).id == list_group.id # test indirect resolution through 'type' reference of parent group # which should have an already properly assigned 'type' attribute # unless reading hickle 4.0.X file or referred to 'hickle_types_table' entry # is missing. In both cases file shall be returned as fallback list_pickle_string = bytearray(pickle.dumps(list)) list_np_entry = np.array(list_pickle_string,copy = False) list_np_entry.dtype = 'S1' list_entry = type_table.create_dataset(str(len(type_table)),data = list_np_entry,shape=(1,list_np_entry.size)) list_base_type = b'list' list_base_type = type_table.create_dataset(list_base_type,shape=None,dtype="S1") list_entry.attrs['base_type'] = list_base_type.ref list_group.attrs['type'] = list_pickle_string assert lookup.ReferenceManager.get_root(some_list_item).id == root_group.file.id list_group.attrs['type'] = list_entry.ref assert lookup.ReferenceManager.get_root(some_list_item).id == root_group.id for_later_use = list_entry.ref list_entry = None del type_table[str(len(type_table)-2)] assert lookup.ReferenceManager.get_root(some_list_item).id == root_group.file.id assert lookup.ReferenceManager.get_root(some_list_item).id == root_group.file.id class not_a_surviver(): """does not survive pickle.dumps""" def test_ReferenceManager(h5_data): """ test for creation of ReferenceManager object (__init__) to be run before testing ReferenceManager.create_manager """ reference_manager = lookup.ReferenceManager(h5_data) type_table = h5_data['hickle_types_table'] assert isinstance(type_table,h5py.Group) reference_manager = lookup.ReferenceManager(h5_data) assert reference_manager._py_obj_type_table.id == type_table.id false_root = h5_data.file.create_group('false_root') false_root.create_dataset('hickle_types_table',data=12) with pytest.raises(lookup.ReferenceError): reference_manager = lookup.ReferenceManager(false_root) int_pickle_string = bytearray(pickle.dumps(int)) int_np_entry = np.array(int_pickle_string,copy=False) int_np_entry.dtype = 'S1' int_entry = type_table.create_dataset(str(len(type_table)),data = int_np_entry,shape =(1,int_np_entry.size)) int_base_type = b'int' int_base_type = type_table.create_dataset(int_base_type,shape=None,dtype="S1") int_entry.attrs['base_type'] = int_base_type.ref list_pickle_string = bytearray(pickle.dumps(list)) list_np_entry = np.array(list_pickle_string,copy = False) list_np_entry.dtype = 'S1' list_entry = type_table.create_dataset(str(len(type_table)),data = list_np_entry,shape=(1,list_np_entry.size)) list_base_type = b'list' list_base_type = type_table.create_dataset(list_base_type,shape=None,dtype="S1") list_entry.attrs['base_type'] = list_base_type.ref missing_pickle_string = bytearray(pickle.dumps(not_a_surviver)) missing_np_entry = np.array(missing_pickle_string,copy = False) missing_np_entry.dtype = 'S1' missing_entry = type_table.create_dataset(str(len(type_table)),data = missing_np_entry,shape=(1,missing_np_entry.size)) missing_base_type = b'lost' missing_base_type = 
type_table.create_dataset(missing_base_type,shape=None,dtype="S1") missing_entry.attrs['base_type'] = missing_base_type.ref hide_not_a_surviver = globals().pop('not_a_surviver',None) reference_manager = lookup.ReferenceManager(h5_data) globals()['not_a_surviver'] = hide_not_a_surviver assert reference_manager._py_obj_type_link[id(int)] == int_entry assert reference_manager._py_obj_type_link[int_entry.id] == (int,b'int') assert reference_manager._base_type_link[b'int'] == int_base_type assert reference_manager._base_type_link[int_base_type.id] == b'int' assert reference_manager._py_obj_type_link[id(list)] == list_entry assert reference_manager._py_obj_type_link[list_entry.id] == (list,b'list') assert reference_manager._base_type_link[b'list'] == list_base_type assert reference_manager._base_type_link[list_base_type.id] == b'list' assert reference_manager._base_type_link[b'lost'] == missing_base_type assert reference_manager._base_type_link[missing_base_type.id] == b'lost' assert reference_manager._py_obj_type_link[missing_entry.id] == (lookup.AttemptRecoverCustom,'!recover!',b'lost') backup_attr = list_entry.attrs['base_type'] list_entry.attrs.pop('base_type',None) with pytest.raises(lookup.ReferenceError): reference_manager = lookup.ReferenceManager(h5_data) list_entry.attrs['base_type']=b'list' with pytest.raises(lookup.ReferenceError): reference_manager = lookup.ReferenceManager(h5_data) stale_ref_entry = type_table.create_dataset("stale",shape=None,dtype = 'S1') list_entry.attrs['base_type']=stale_ref_entry.ref type_table.pop("stale",None) stale_ref_entry = None with pytest.raises(lookup.ReferenceError): reference_manager = lookup.ReferenceManager(h5_data) list_entry.attrs['base_type']=backup_attr old_hickle_file_root = h5_data.file.create_group('old_root') h5_data.file.flush() base_name,ext = h5_data.file.filename.rsplit('.',1) file_name = "{}_ro.{}".format(base_name,ext) data_name = h5_data.name data_file_name = h5_data.file.filename #################### NOTE ############################# # h5_data ficture is invalidated by the following line # as well as all Groups and Datasets read from it # manually reopen it and do not forget to close at # end of iths test h5_data.file.close() shutil.copy(data_file_name,file_name) read_only_handle = h5py.File(file_name,'r') h5_read_data = read_only_handle[data_name] h5_read_old = read_only_handle['old_root'] reference_manager = lookup.ReferenceManager(h5_read_old) assert isinstance(reference_manager._overlay,weakref.finalize) overlay_file = reference_manager._py_obj_type_table.file assert overlay_file.mode == 'r+' and overlay_file.driver == 'core' assert overlay_file.id != read_only_handle.id reference_manager = lookup.ReferenceManager(h5_read_data) read_only_handle.close() class SubReferenceManager(lookup.ReferenceManager): __managers__ = () assert SubReferenceManager.__managers__ is lookup.ReferenceManager.__managers__ with pytest.raises(TypeError): invalid_instance = lookup.BaseManager() class OtherManager(lookup.BaseManager): pass with pytest.raises(NotImplementedError): with OtherManager() as invalid_manager: pass def test_ReferenceManager_drop_manager(h5_data): """ test static ReferenceManager._drop_table method """ reference_manager = lookup.ReferenceManager(h5_data) lookup.ReferenceManager.__managers__[h5_data.file.id] = (reference_manager,h5_data) some_other_file = h5py.File('someother.hdf5','w') some_other_root = some_other_file.create_group('root') lookup.ReferenceManager._drop_manager(some_other_root.file.id) 
lookup.ReferenceManager.__managers__[some_other_file.file.id] = (lookup.ReferenceManager(some_other_root),some_other_root) assert lookup.ReferenceManager.__managers__.get(h5_data.file.id,None) == (reference_manager,h5_data) lookup.ReferenceManager._drop_manager(h5_data.file.id) assert lookup.ReferenceManager.__managers__.get(h5_data.file.id,None) is None lookup.ReferenceManager._drop_manager(some_other_root.file.id) assert not lookup.ReferenceManager.__managers__ some_other_file.close() def test_ReferenceManager_create_manager(h5_data): """ test public static ReferenceManager.create_manager function """ second_tree = h5_data.file.create_group('seondary_root') h5_data_table = lookup.ReferenceManager.create_manager(h5_data) assert lookup.ReferenceManager.__managers__[h5_data.file.id][0] is h5_data_table with pytest.raises(lookup.LookupError): second_table = lookup.ReferenceManager.create_manager(second_tree) lookup.ReferenceManager._drop_manager(h5_data.file.id) def test_ReferenceManager_context(h5_data): """ test use of ReferenceManager as context manager """ with lookup.ReferenceManager.create_manager(h5_data) as memo: assert lookup.ReferenceManager.__managers__[h5_data.file.id][0] is memo assert memo._py_obj_type_table is None with pytest.raises(RuntimeError): with memo as memo2: pass memo.__exit__(None,None,None) old_hickle_file_root = h5_data.file.create_group('old_root') h5_data.file.flush() base_name,ext = h5_data.file.filename.rsplit('.',1) file_name = "{}_ro.{}".format(base_name,ext) data_file_name = h5_data.file.filename data_name = old_hickle_file_root.name #################### NOTE ############################# # h5_data fixture is invalidated by the following line # as well as all Groups and Datasets read from it # manually reopen it and do not forget to close it at # the end of this test h5_data.file.close() shutil.copy(data_file_name,file_name) read_only_handle = h5py.File(file_name,'r') h5_read_data = read_only_handle[data_name] with lookup.ReferenceManager.create_manager(h5_read_data) as memo: assert isinstance(memo._overlay,weakref.finalize) assert memo._overlay is None read_only_handle.close() def test_ReferenceManager_store_type(h5_data,compression_kwargs): """ test ReferenceManager.store_type method which sets 'type' attribute reference to appropriate py_obj_type entry within 'hickle_types_table' """ h_node = h5_data.create_group('some_list') with lookup.ReferenceManager.create_manager(h5_data) as memo: memo.store_type(h_node,object,None,**compression_kwargs) assert len(memo._py_obj_type_table) == 0 and not memo._py_obj_type_link and not memo._base_type_link with pytest.raises(lookup.LookupError): memo.store_type(h_node,list,None,**compression_kwargs) with pytest.raises(ValueError): memo.store_type(h_node,list,b'',**compression_kwargs) memo.store_type(h_node,list,b'list',**compression_kwargs) assert isinstance(h_node.attrs['type'],h5py.Reference) type_table_entry = h5_data.file[h_node.attrs['type']] assert pickle.loads(type_table_entry[()]) is list assert isinstance(type_table_entry.attrs['base_type'],h5py.Reference) assert h5_data.file[type_table_entry.attrs['base_type']].name.rsplit('/',1)[-1].encode('ascii') == b'list' @pytest.mark.no_compression def test_ReferenceManager_get_manager(h5_data): h_node = h5_data.create_group('some_list') item_data = np.array(memoryview(b'hallo welt lore grueszet dich ipsum aus der lore von ipsum gelort in ipsum'),copy=False) item_data.dtype = 'S1' h_item = h_node.create_dataset('0',data=item_data,shape=(1,item_data.size)) with 
lookup.ReferenceManager.create_manager(h5_data) as memo: memo.store_type(h_node,list,b'list') memo.store_type(h_item,bytes,b'bytes') assert lookup.ReferenceManager.get_manager(h_item) == memo backup_manager = lookup.ReferenceManager.__managers__.pop(h5_data.file.id,None) assert backup_manager is not None with pytest.raises(lookup.ReferenceError): manager = lookup.ReferenceManager.get_manager(h_item) lookup.ReferenceManager.__managers__[h5_data.file.id] = backup_manager with pytest.raises(lookup.ReferenceError): manager = lookup.ReferenceManager.get_manager(h_item) @pytest.mark.no_compression def test_ReferenceManager_resolve_type(h5_data): """ test ReferenceManager.resolve_type method which tries to resolve the content of the 'type' attribute of the passed in node and return the appropriate pair of py_obj_type,base_type and a boolean flag indicating whether the node represents an h5py.Group or h5py.Reference, both of which are to be handled by PyContainer objects. """ invalid_pickle_and_ref = h5_data.create_group('invalid_pickle_and_ref') pickled_data = h5_data.create_dataset('pickled_data',data = bytearray()) shared_ref = h5_data.create_dataset('shared_ref',data = pickled_data.ref,dtype = h5py.ref_dtype) old_style_typed = h5_data.create_dataset('old_style_typed',data = 12) old_style_typed.attrs['type'] = np.array(pickle.dumps(int)) old_style_typed.attrs['base_type'] = b'int' broken_old_style = h5_data.create_dataset('broken_old_style',data = 12) broken_old_style.attrs['type'] = 12 broken_old_style.attrs['base_type'] = b'int' new_style_typed = h5_data.create_dataset('new_style_typed',data = 12) stale_new_style = h5_data.create_dataset('stale_new_style',data = 12) new_style_typed_no_link = h5_data.create_dataset('new_style_typed_no_link',data = 12.5) has_not_recoverable_type = h5_data.create_dataset('no_recoverable_type',data = 42.56) with lookup.ReferenceManager.create_manager(h5_data) as memo: with pytest.raises(lookup.ReferenceError): memo.resolve_type(invalid_pickle_and_ref) assert memo.resolve_type(pickled_data) == (object,b'pickle',False) assert memo.resolve_type(shared_ref) == (lookup.NodeReference,b'!node-reference!',True) assert memo.resolve_type(old_style_typed) in ((int,b'int',False),(int,'int',False)) with pytest.raises(lookup.ReferenceError): info = memo.resolve_type(broken_old_style) memo.store_type(new_style_typed,int,b'int') entry_id = len(memo._py_obj_type_table) memo.store_type(stale_new_style,list,b'list') assert pickle.loads(memo._py_obj_type_table[str(entry_id)][()]) is list stale_list_base = memo._py_obj_type_table['list'].ref # remove py_obj_type entry and base_type entry for the list entry # while h5py 2 raises a ValueError if no active link exists for a # dataset, h5py 3 returns an anonymous group when resolving a stale # reference to it if anybody still holds a strong reference to its # h5py.Dataset or h5py.Group object. Therefore drop all references to # the removed entries to simulate that somebody has removed them from # a hickle file before it was passed to hickle.load for restoring its # content. 
memo._py_obj_type_link.pop(memo._py_obj_type_table[str(entry_id)].id,None) memo._py_obj_type_link.pop(id(list),None) memo._base_type_link.pop(memo._py_obj_type_table['list'].id,None) memo._base_type_link.pop(b'list',None) del memo._py_obj_type_table[str(entry_id)] del memo._py_obj_type_table['list'] memo._py_obj_type_table.file.flush() with pytest.raises(lookup.ReferenceError): memo.resolve_type(stale_new_style) entry_id = len(memo._py_obj_type_table) memo.store_type(new_style_typed_no_link,float,b'float') float_entry = memo._py_obj_type_table[str(entry_id)] assert pickle.loads(float_entry[()]) is float float_base = float_entry.attrs['base_type'] # remove float entry and clear all references to it see above del memo._py_obj_type_link[float_entry.id] del memo._py_obj_type_link[id(float)] del float_entry.attrs['base_type'] memo._py_obj_type_table.file.flush() assert memo.resolve_type(new_style_typed_no_link) in ((float,b'pickle',False),(float,'pickle',False)) del memo._py_obj_type_link[float_entry.id] del memo._py_obj_type_link[id(float)] # create stale reference to not existing base_type entry memo._py_obj_type_table.create_dataset('list',shape=None,dtype='S1') float_entry.attrs['base_type'] = memo._py_obj_type_table['list'].ref memo._py_obj_type_table.pop('list',None) memo._py_obj_type_table.file.flush() with pytest.raises(lookup.ReferenceError): info = memo.resolve_type(new_style_typed_no_link) memo._py_obj_type_link.pop(float_entry.id,None) memo._py_obj_type_link.pop(id(float),None) del memo._base_type_link[memo._py_obj_type_table[float_base].id] del memo._base_type_link[b'float'] float_entry.attrs['base_type'] = float_base memo._py_obj_type_table.file.flush() assert memo.resolve_type(new_style_typed_no_link) in ((float,b'float',False),(float,'float',False)) assert memo.resolve_type(new_style_typed_no_link) memo.store_type(has_not_recoverable_type,not_a_surviver,b'lost') del memo._py_obj_type_link[memo._py_obj_type_link[id(not_a_surviver)].id] del memo._py_obj_type_link[id(not_a_surviver)] hide_not_a_surviver = globals().pop('not_a_surviver',None) assert memo.resolve_type(has_not_recoverable_type) == (lookup.AttemptRecoverCustom,b'!recover!',False) assert memo.resolve_type(has_not_recoverable_type,base_type_type=2) == (lookup.AttemptRecoverCustom,b'lost',False) globals()['not_a_surviver'] = hide_not_a_surviver has_not_recoverable_type.attrs['type'] = np.array(pickle.dumps(not_a_surviver)) has_not_recoverable_type.attrs['base_type'] = b'lost' hide_not_a_surviver = globals().pop('not_a_surviver',None) assert memo.resolve_type(has_not_recoverable_type) == (lookup.AttemptRecoverCustom,b'!recover!',False) assert memo.resolve_type(has_not_recoverable_type,base_type_type=2) in ((lookup.AttemptRecoverCustom,b'lost',False),(lookup.AttemptRecoverCustom,'lost',False)) globals()['not_a_surviver'] = hide_not_a_surviver def test_ExpandReferenceContainer(h5_data): """ test ExpandReferenceContainer which resolves object link stored as h5py.Reference type dataset """ expected_data = np.random.randint(-13,13,12) referred_data = h5_data.create_dataset('referred_data',data = expected_data) referring_node = h5_data.create_dataset('referring_node',data = referred_data.ref,dtype = h5py.ref_dtype) h5_data.file.flush() sub_container = lookup.ExpandReferenceContainer(referring_node.attrs,b'!node-reference!',lookup.NodeReference) content = None for name,subitem in sub_container.filter(referring_node): assert name == 'referred_data' and subitem.id == referred_data.id content = np.array(subitem[()]) 
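# keep a copy of the dereferenced dataset content so that convert() below can
# rebuild the referred-to array from the appended item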
sub_container.append(name,content,subitem.attrs) assert np.all(sub_container.convert()==expected_data) referring_node = h5_data.create_dataset('stale_reference',shape=(),dtype=h5py.ref_dtype) sub_container = lookup.ExpandReferenceContainer(referring_node.attrs,b'!node-reference!',lookup.NodeReference) with pytest.raises(lookup.ReferenceError): for name,subitem in sub_container.filter(referring_node): content = np.array(subitem[()]) sub_container.append(name,content,subitem.attrs) @pytest.mark.no_compression def test_recover_custom_data(h5_data): array_to_recover = np.random.random_sample([4,2]) with lookup.ReferenceManager.create_manager(h5_data) as memo: dataset_to_recover = h5_data.create_dataset('to_recover',data=array_to_recover) dataset_to_recover.attrs['world'] = 2 memo.store_type(dataset_to_recover,ClassToDump,b'myclass') group_to_recover = h5_data.create_group('need_recover') memo.store_type(group_to_recover,ClassToDump,b'myclass') backup_class_to_dump = globals().pop('ClassToDump',None) memo._py_obj_type_link.pop(id('ClassToDump'),None) memo._base_type_link.pop(b'myclass') type_entry = memo._py_obj_type_table[dataset_to_recover.attrs['type']] memo._py_obj_type_link.pop(type_entry.id,None) py_obj_type,base_type,is_group = memo.resolve_type(dataset_to_recover) assert issubclass(py_obj_type,lookup.AttemptRecoverCustom) and base_type == b'!recover!' with pytest.warns(lookup.DataRecoveredWarning): recovered = lookup.recover_custom_dataset(dataset_to_recover,base_type,py_obj_type) assert recovered.dtype == array_to_recover.dtype and np.all(recovered == array_to_recover) assert recovered.attrs == {'base_type':b'myclass','world':2} assert not is_group type_entry = memo._py_obj_type_table[group_to_recover.attrs['type']] memo._py_obj_type_link.pop(type_entry.id,None) some_int=group_to_recover.create_dataset('some_int',data=42) some_float=group_to_recover.create_dataset('some_float',data=42.0) group_to_recover.attrs['so'] = 'long' group_to_recover.attrs['and'] = 'thanks' some_float.attrs['for'] = 'all' some_float.attrs['the'] = 'fish' py_obj_type,base_type,is_group = memo.resolve_type(group_to_recover) assert issubclass(py_obj_type,lookup.AttemptRecoverCustom) and base_type == b'!recover!' 
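# the group whose py_obj_type can no longer be restored is expected to be
# recovered as a plain dict mapping child names to (value, attrs) pairs, see
# RecoverGroupContainer below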
assert is_group recover_container = lookup.RecoverGroupContainer(group_to_recover.attrs,base_type,py_obj_type) with pytest.warns(lookup.DataRecoveredWarning): for name,item in recover_container.filter(group_to_recover): recover_container.append(name,item[()],item.attrs) recover_container.append('some_other',recovered,recovered.attrs) recovered_group = recover_container.convert() assert isinstance(recovered_group,dict) assert some_float[()] == recovered_group['some_float'][0] and some_float.attrs == recovered_group['some_float'][1] assert some_int[()] == recovered_group['some_int'][0] and some_int.attrs == recovered_group['some_int'][1] assert recovered_group['some_other'] is recovered assert recovered_group.attrs['base_type'] == memo.resolve_type(group_to_recover,base_type_type=2)[1] assert len(recovered_group.attrs) == 3 assert recovered_group.attrs['so'] == 'long' and recovered_group.attrs['and'] == 'thanks' globals()['ClassToDump'] = backup_class_to_dump if __name__ == "__main__": from _pytest.monkeypatch import monkeypatch from _pytest.fixtures import FixtureRequest from hickle.tests.conftest import compression_kwargs for h5_root in h5_data(FixtureRequest(test_AttemptRecoverCustom_classes)): test_AttemptRecoverCustom_classes(h5_root) for table in loader_table(): test_LoaderManager_register_class(table) for table in loader_table(): test_LoaderManager_register_class_exclude(table) for table,h5_root in ( (tab,root) for tab in loader_table() for root in h5_data(FixtureRequest(test_LoaderManager)) ): test_LoaderManager(table,h5_root) for h5_root in h5_data(FixtureRequest(test_LoaderManager_drop_manager)): test_LoaderManager_drop_manager(h5_root) for h5_root in h5_data(FixtureRequest(test_LoaderManager_create_manager)): test_LoaderManager_create_manager(h5_root) for h5_root in h5_data(FixtureRequest(test_LoaderManager_context)): test_LoaderManager_context(h5_root) for table,h5_root,monkey in ( (tab,root,mpatch) for tab in loader_table() for root in h5_data(FixtureRequest(test_LoaderManager_load_loader)) for mpatch in monkeypatch() ): test_LoaderManager_load_loader(table,h5_root,monkey) test_type_legacy_mro() for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_create_pickled_dataset),) ): test_create_pickled_dataset(h5_root,keywords) test__DictItemContainer() test__moc_numpy_array_object_lambda() test_fix_lambda_obj_type() for h5_root in h5_data(FixtureRequest(test_ReferenceManager_get_root)): test_ReferenceManager_get_root(h5_root) for h5_root in h5_data(FixtureRequest(test_ReferenceManager)): test_ReferenceManager(h5_root) for h5_root in h5_data(FixtureRequest(test_ReferenceManager_drop_manager)): test_ReferenceManager_drop_manager(h5_root) for h5_root in h5_data(FixtureRequest(test_ReferenceManager_create_manager)): test_ReferenceManager_create_manager(h5_root) for h5_root in h5_data(FixtureRequest(test_ReferenceManager_context)): test_ReferenceManager_context(h5_root) for h5_root in h5_data(FixtureRequest(test_ReferenceManager_get_manager)): test_ReferenceManager_get_manager(h5_root) for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_ReferenceManager_store_type),) ): test_ReferenceManager_store_type(h5_root,keywords) for h5_root in h5_data(FixtureRequest(test_ReferenceManager_resolve_type)): test_ReferenceManager_resolve_type(h5_root) for h5_root in h5_data(FixtureRequest(test_ExpandReferenceContainer)): test_ExpandReferenceContainer(h5_root) for h5_root in 
h5_data(FixtureRequest(test_ExpandReferenceContainer)): test_recover_custom_data(h5_data) hickle-5.0.2/hickle/tests/test_03_load_builtins.py000066400000000000000000000531001430361177200221240ustar00rootroot00000000000000#! /usr/bin/env python # encoding: utf-8 """ # test_load_builtins Unit tests for hickle module -- builtins loader. """ import pytest import collections import itertools # %% IMPORTS # Package imports import h5py as h5 import numpy as np from py.path import local # hickle imports import hickle.loaders.load_builtins as load_builtins import hickle.helpers as helpers # Set current working directory to the temporary directory local.get_temproot().chdir() # %% TEST DATA dummy_data = (1,2,3) # %% FIXTURES @pytest.fixture def h5_data(request): """ create dummy hdf5 test data file providing parent group hosting created datasets and groups. Name of test function is included in filename """ dummy_file = h5.File('load_builtins_{}.hdf5'.format(request.function.__name__),'w') filename = dummy_file.filename test_data = dummy_file.create_group("root_group") yield test_data dummy_file.close() # %% FUNCTION DEFINITIONS def test_scalar_dataset(h5_data,compression_kwargs): """ tests creation and loading of datasets for scalar values """ # check that scalar value is properly handled floatvalue = 5.2 h_dataset,subitems= load_builtins.create_scalar_dataset(floatvalue,h5_data,"floatvalue",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and h_dataset[()] == floatvalue assert not [ item for item in subitems ] assert load_builtins.load_scalar_dataset(h_dataset,b'float',float) == floatvalue # check that integer value less than 64 bit is stored as int intvalue = 11 h_dataset,subitems = load_builtins.create_scalar_dataset(intvalue,h5_data,"intvalue",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and h_dataset[()] == intvalue assert not [ item for item in subitems ] assert load_builtins.load_scalar_dataset(h_dataset,b'int',int) == intvalue # check that integer larger than 64 bit is stored as ASCII byte string non_mappable_int = int(2**65) h_dataset,subitems = load_builtins.create_scalar_dataset(non_mappable_int,h5_data,"non_mappable_int",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) assert bytearray(h_dataset[()]) == str(non_mappable_int).encode('utf8') assert not [ item for item in subitems ] assert load_builtins.load_scalar_dataset(h_dataset,b'int',int) == non_mappable_int # check that integer larger than 64 bit is stored as ASCII byte string non_mappable_neg_int = -int(-2**63-1) h_dataset,subitems = load_builtins.create_scalar_dataset(non_mappable_neg_int,h5_data,"non_mappable_neg_int",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) assert bytearray(h_dataset[()]) == str(non_mappable_neg_int).encode('utf8') assert not [ item for item in subitems ] assert load_builtins.load_scalar_dataset(h_dataset,b'int',int) == non_mappable_neg_int def test_load_hickle_4_0_X_string(h5_data): string_data = "just test me as utf8 string" bytes_data = string_data.encode('utf8') if h5.version.version_tuple[0] >= 3: utf_entry = h5_data.create_dataset('utf_entry',data = string_data)#,dtype = 'U{}'.format(len(string_data))) bytes_entry = h5_data.create_dataset('bytes_entry',data = bytes_data,dtype = 'S{}'.format(len(bytes_data))) else: utf_entry = h5_data.create_dataset('utf_entry',data = string_data) bytes_entry = h5_data.create_dataset('bytes_entry',data = bytes_data) assert load_builtins.load_hickle_4_x_string(utf_entry,b'str',str) == string_data 
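# a bytes dataset that carries the str_type attribute b'str' (as written by
# hickle 4.0.x) is expected to be decoded back to a Python str on load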
bytes_entry.attrs['str_type'] = b'str' assert load_builtins.load_hickle_4_x_string(bytes_entry,b'str',str) == string_data object_entry = h5_data.create_dataset('utf_h5py2_entry',data = string_data,dtype = np.dtype('O',metadata={'vlen':bytes})) assert load_builtins.load_hickle_4_x_string(object_entry,b'str',bytes) == bytes_data def test_non_dataset(h5_data,compression_kwargs): """ that None value is properly stored """ h_dataset,subitems = load_builtins.create_none_dataset(None,h5_data,"None_value",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and h_dataset.shape is None and h_dataset.dtype == 'V1' assert not [ item for item in subitems ] assert load_builtins.load_none_dataset(h_dataset,b'None',None.__class__) is None def test_listlike_dataset(h5_data,compression_kwargs): """ test storing and loading of list like data """ # check that empty tuple is stored properly empty_tuple = () h_dataset,subitems = load_builtins.create_listlike_dataset(empty_tuple, h5_data, "empty_tuple",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and h_dataset.size is None assert not subitems and iter(subitems) assert load_builtins.load_list_dataset(h_dataset,b'tuple',tuple) == empty_tuple # check that string data is stored properly stored as array of bytes # which supports compression stringdata = "string_data" h_dataset,subitems = load_builtins.create_listlike_dataset(stringdata, h5_data, "string_data",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and not [ item for item in subitems ] assert bytearray(h_dataset[()]).decode("utf8") == stringdata assert h_dataset.attrs["str_type"] in ('str',b'str') assert load_builtins.load_list_dataset(h_dataset,b'str',str) == stringdata # check that byte string is properly stored as array of bytes which # supports compression bytesdata = b'bytes_data' h_dataset,subitems = load_builtins.create_listlike_dataset(bytesdata, h5_data, "bytes_data",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and not [ item for item in subitems ] assert bytes(h_dataset[()]) == bytesdata assert h_dataset.attrs["str_type"] in ('bytes',b'bytes') assert load_builtins.load_list_dataset(h_dataset,b'bytes',bytes) == bytesdata # check that string dataset created by hickle 4.0.x is properly loaded # utilizing numpy.array method. 
Mimic dumped data h_dataset = h5_data.create_dataset("legacy_np_array_bytes_data",data=np.array(stringdata.encode('utf8'))) h_dataset.attrs['str_type'] = b'str' assert load_builtins.load_list_dataset(h_dataset,b'str',str) == stringdata # check that list of single type is stored as dataset of same type homogenous_list = [ 1, 2, 3, 4, 5, 6] h_dataset,subitems = load_builtins.create_listlike_dataset(homogenous_list,h5_data,"homogenous_list",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and not [ item for item in subitems ] assert h_dataset[()].tolist() == homogenous_list and h_dataset.dtype == int assert load_builtins.load_list_dataset(h_dataset,b'list',list) == homogenous_list # check that list of different scalar types for which a least common type exists # is stored using a dataset mixed_dtype_list = [ 1, 2.5, 3.8, 4, 5, 6] h_dataset,subitems = load_builtins.create_listlike_dataset(mixed_dtype_list,h5_data,"mixed_dtype_list",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and not [ item for item in subitems ] assert h_dataset[()].tolist() == mixed_dtype_list and h_dataset.dtype == float assert load_builtins.load_list_dataset(h_dataset,b'list',list) == mixed_dtype_list # check that list containing non scalar objects is converted into group # further check that for groups representing list the index of items is either # provided via item_index attribute or can be read from name of item not_so_homogenous_list = [ 1, 2, 3, [4],5 ,6 ] h_dataset,subitems = load_builtins.create_listlike_dataset(not_so_homogenous_list,h5_data,"not_so_homogenous_list",**compression_kwargs) assert isinstance(h_dataset,h5.Group) item_name = "data{:d}" index = -1 loaded_list = load_builtins.ListLikeContainer(h_dataset.attrs,b'list',list) subitems1,subitems2 = itertools.tee(subitems,2) index_from_string = load_builtins.ListLikeContainer(h_dataset.attrs,b'list',list) for index,(name,item,attrs,kwargs) in enumerate(iter(subitems1)): assert item_name.format(index) == name and item == not_so_homogenous_list[index] assert attrs == {"item_index":index} and kwargs == compression_kwargs if isinstance(item,list): item_dataset,_ = load_builtins.create_listlike_dataset(item,h_dataset,name,**compression_kwargs) else: item_dataset = h_dataset.create_dataset(name,data = item) item_dataset.attrs.update(attrs) loaded_list.append(name,item,item_dataset.attrs) index_from_string.append(name,item,{}) assert index + 1 == len(not_so_homogenous_list) assert loaded_list.convert() == not_so_homogenous_list assert index_from_string.convert() == not_so_homogenous_list # check that list groups which do not provide num_items attribute # are automatically expanded to properly cover the highest index encountered # for any of the list items. no_num_items = {key:value for key,value in h_dataset.attrs.items() if key != "num_items"} no_num_items_container = load_builtins.ListLikeContainer(no_num_items,b'list',list) for index,(name,item,attrs,kwargs) in enumerate(iter(subitems2)): assert item_name.format(index) == name and item == not_so_homogenous_list[index] assert attrs == {"item_index":index} and kwargs == compression_kwargs item_dataset = h_dataset.get(name,None) no_num_items_container.append(name,item,{}) assert index + 1 == len(not_so_homogenous_list) assert no_num_items_container.convert() == not_so_homogenous_list # check that list the first of which is not a scalar is properly mapped # to a group. 
Also check that ListLikeContainer.append raises exception # in case neither item_index is provided nor an index value can be parsed # from the tail of its name. Also check that ListLikeContainer.append # raises exception in case value for item_index already has been loaded object_list = [ [4, 5 ] ,6, [ 1, 2, 3 ] ] h_dataset,subitems = load_builtins.create_listlike_dataset(object_list,h5_data,"object_list",**compression_kwargs) assert isinstance(h_dataset,h5.Group) item_name = "data{:d}" wrong_item_name = item_name + "_ni" index = -1 loaded_list = load_builtins.ListLikeContainer(h_dataset.attrs,b'list',list) index_from_string = load_builtins.ListLikeContainer(h_dataset.attrs,b'list',list) for index,(name,item,attrs,kwargs) in enumerate(iter(subitems)): assert item_name.format(index) == name and item == object_list[index] assert attrs == {"item_index":index} and kwargs == compression_kwargs if isinstance(item,list): item_dataset,_ = load_builtins.create_listlike_dataset(item,h_dataset,name,**compression_kwargs) else: item_dataset = h_dataset.create_dataset(name,data = item) item_dataset.attrs.update(attrs) loaded_list.append(name,item,item_dataset.attrs) with pytest.raises(KeyError,match = r"List\s+like\s+item name\s+'\w+'\s+not\s+understood"): index_from_string.append(wrong_item_name.format(index),item,{}) # check that the previous error is not triggered when the # legacy 4.0.x loader injects the special value helpers.nobody_is_my_name which # is generated by the load_nothing function. This is for example used as the load method # for legacy 4.0.x np.masked.array objects where the mask is injected in parallel # in the root group of the corresponding values data set. By silently ignoring # this special value returned by load_nothing it is ensured that, for example, # mask datasets of numpy.masked.array objects do not trip up the loader. 
index_from_string.append(wrong_item_name.format(index),helpers.nobody_is_my_name,{}) if index < 1: continue with pytest.raises(IndexError, match = r"Index\s+\d+\s+already\s+set"): loaded_list.append(name,item,{"item_index":index-1}) assert index + 1 == len(object_list) # assert that list of strings where first string has length 1 is properly mapped # to group string_list = test_set = ['I','confess','appriciate','hickle','times'] h_dataset,subitems = load_builtins.create_listlike_dataset(string_list,h5_data,"string_list",**compression_kwargs) assert isinstance(h_dataset,h5.Group) item_name = "data{:d}" index = -1 loaded_list = load_builtins.ListLikeContainer(h_dataset.attrs,b'list',list) index_from_string = load_builtins.ListLikeContainer(h_dataset.attrs,b'list',list) for index,(name,item,attrs,kwargs) in enumerate(iter(subitems)): assert item_name.format(index) == name and item == string_list[index] assert attrs == {"item_index":index} and kwargs == compression_kwargs item_dataset = h_dataset.create_dataset(name,data = item) item_dataset.attrs.update(attrs) loaded_list.append(name,item,item_dataset.attrs) index_from_string.append(name,item,{}) assert index + 1 == len(string_list) assert loaded_list.convert() == string_list assert index_from_string.convert() == string_list # assert that list which contains numeric values and strings is properly mapped # to group mixed_string_list = test_set = [12,2.8,'I','confess','appriciate','hickle','times'] h_dataset,subitems = load_builtins.create_listlike_dataset(mixed_string_list,h5_data,"mixed_string_list",**compression_kwargs) assert isinstance(h_dataset,h5.Group) item_name = "data{:d}" index = -1 loaded_list = load_builtins.ListLikeContainer(h_dataset.attrs,b'list',list) index_from_string = load_builtins.ListLikeContainer(h_dataset.attrs,b'list',list) for index,(name,item,attrs,kwargs) in enumerate(iter(subitems)): assert item_name.format(index) == name and item == mixed_string_list[index] assert attrs == {"item_index":index} and kwargs == compression_kwargs item_dataset = h_dataset.create_dataset(name,data = item) item_dataset.attrs.update(attrs) loaded_list.append(name,item,item_dataset.attrs) index_from_string.append(name,item,{}) assert index + 1 == len(mixed_string_list) assert loaded_list.convert() == mixed_string_list assert index_from_string.convert() == mixed_string_list def test_set_container(h5_data,compression_kwargs): """ tests storing and loading of set """ # check that set of strings is store as group test_set = {'I','confess','appriciate','hickle','times'} h_setdataset,subitems = load_builtins.create_setlike_dataset(test_set,h5_data,"test_set",**compression_kwargs) set_container = load_builtins.SetLikeContainer(h_setdataset.attrs,b'set',set) for name,item,attrs,kwargs in subitems: set_container.append(name,item,attrs) assert set_container.convert() == test_set # check that set of single bytes is stored as single dataset test_set_2 = set(b"hello world") h_setdataset,subitems = load_builtins.create_setlike_dataset(test_set_2,h5_data,"test_set_2",**compression_kwargs) assert isinstance(h_setdataset,h5.Dataset) and set(h_setdataset[()]) == test_set_2 assert not subitems and iter(subitems) assert load_builtins.load_list_dataset(h_setdataset,b'set',set) == test_set_2 # check that set containing byte strings is stored as group test_set_3 = set((item.encode("utf8") for item in test_set)) h_setdataset,subitems = load_builtins.create_setlike_dataset(test_set_3,h5_data,"test_set_3",**compression_kwargs) set_container = 
load_builtins.SetLikeContainer(h_setdataset.attrs,b'set',set) for name,item,attrs,kwargs in subitems: set_container.append(name,item,attrs) assert set_container.convert() == test_set_3 # check that empty set is represented by empty dataset h_setdataset,subitems = load_builtins.create_setlike_dataset(set(),h5_data,"empty_set",**compression_kwargs) assert isinstance(h_setdataset,h5.Dataset) and h_setdataset.size == 0 assert not subitems and iter(subitems) assert load_builtins.load_list_dataset(h_setdataset,b'set',set) == set() def test_dictlike_dataset(h5_data,compression_kwargs): """ test storing and loading of dict """ class KeyClass(): """class used as dict key""" allkeys_dict = { 'string_key':0, b'bytes_key':1, 12:2, 0.25:3, complex(1,2):4, None:5, (1,2,3):6, KeyClass():7, KeyClass:8 } # check that dict is stored as group # check that string and byte string keys are mapped to dataset or group name # check that scalar dict keys are converted to their string representation # check that for all other keys a key value pair is created h_datagroup,subitems = load_builtins.create_dictlike_dataset(allkeys_dict,h5_data,"allkeys_dict",**compression_kwargs) assert isinstance(h_datagroup,h5.Group) invalid_key = b'' last_entry = -1 load_dict = load_builtins.DictLikeContainer(h_datagroup.attrs,b'dict',dict) ordered_dict = collections.OrderedDict() for name,item,attrs,kwargs in subitems: value = item if attrs["key_base_type"] == b"str": key = name[1:-1] elif attrs["key_base_type"] == b"bytes": key = name[2:-1].encode("utf8") elif attrs["key_base_type"] == b'key_value': key = item[0] value = item[1] else: load_key = load_builtins.dict_key_types_dict.get(attrs["key_base_type"],None) if load_key is None: raise ValueError("key_base_type '{}' invalid".format(attrs["key_base_type"])) key = load_key(name) assert allkeys_dict.get(key,invalid_key) == value load_dict.append(name,item,attrs) last_entry = attrs.get("key_idx",None) ordered_dict[key] = value assert last_entry + 1 == len(allkeys_dict) assert load_dict.convert() == allkeys_dict # verify that DictLikeContainer.append raises error in case invalid key_base_type # is provided with pytest.raises(ValueError, match = r"key\s+type\s+'.+'\s+not\s+understood"): load_dict.append("invalid_key_type",12,{"key_idx":9,"key_base_type":b"invalid_type"}) tuple_key = ('a','b','c') # verify that DictLikeContainer.append raises error in case index of key value pair # within dict is whether provided by key_index attribute nor can be parsed from # name of corresponding dataset or group with pytest.raises(KeyError, match = r"invalid\s+dict\s+item\s+key_index\s+missing"): load_dict.append(str(tuple_key),9,{"item_index":9,"key_base_type":b"tuple"}) # check that helpers.nobody_is_my_name injected for example by load_nothing is silently # ignored in case no key could be retrieved from dataset or sub group load_dict.append( str(tuple_key), helpers.nobody_is_my_name, {"item_index":9,"key_base_type":b"tuple"} ) with pytest.raises(KeyError): assert load_dict.convert()[tuple_key] is None # check that if key_idx attribute is provided key value pair may be added load_dict.append(str(tuple_key),9,{"key_idx":9,"key_base_type":b"tuple"}) assert load_dict.convert()[tuple_key] == 9 # verify that DictLikeContainer.append raises error in case item index already # set with pytest.raises(IndexError,match = r"Key\s+index\s+\d+\s+already\s+set"): load_dict.append(str(tuple_key),9,{"key_idx":9,"key_base_type":b"tuple"}) # check that order of OrderedDict dict keys is not altered on loading data from 
# hickle file h_datagroup,subitems = load_builtins.create_dictlike_dataset(ordered_dict,h5_data,"ordered_dict",**compression_kwargs) assert isinstance(h_datagroup,h5.Group) last_entry = -1 load_ordered_dict = load_builtins.DictLikeContainer(h_datagroup.attrs,b'dict',collections.OrderedDict) for name,item,attrs,kwargs in subitems: value = item if attrs["key_base_type"] == b"str": key = name[1:-1] elif attrs["key_base_type"] == b"bytes": key = name[2:-1].encode("utf8") elif attrs["key_base_type"] == b'key_value': key = item[0] value = item[1] else: load_key = load_builtins.dict_key_types_dict.get(attrs["key_base_type"],None) if load_key is None: raise ValueError("key_base_type '{}' invalid".format(attrs["key_base_type"])) key = load_key(name) assert ordered_dict.get(key,invalid_key) == value load_ordered_dict.append(name,item,attrs) last_entry = attrs.get("key_idx",None) assert last_entry + 1 == len(allkeys_dict) assert load_ordered_dict.convert() == ordered_dict # %% MAIN SCRIPT if __name__ == "__main__": from _pytest.fixtures import FixtureRequest from hickle.tests.conftest import compression_kwargs for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_scalar_dataset),) ): test_scalar_dataset(h5_root,keywords) for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_non_dataset),) ): test_non_dataset(h5_root,keywords) for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_listlike_dataset),) ): test_listlike_dataset(h5_root,keywords) for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_set_container),) ): test_set_container(h5_root,keywords) for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_dictlike_dataset),) ): test_dictlike_dataset(h5_root,keywords) hickle-5.0.2/hickle/tests/test_04_load_numpy.py000066400000000000000000000303121430361177200214440ustar00rootroot00000000000000#! /usr/bin/env python # encoding: utf-8 """ # test_load_numpy Unit tests for hickle module -- numpy loader. 
""" import pytest import sys # %% IMPORTS # Package imports import h5py as h5 import numpy as np import hickle.loaders.load_numpy as load_numpy from py.path import local # Set current working directory to the temporary directory local.get_temproot().chdir() # %% GLOBALS NESTED_DICT = { "level1_1": { "level2_1": [1, 2, 3], "level2_2": [4, 5, 6] }, "level1_2": { "level2_1": [1, 2, 3], "level2_2": [4, 5, 6] }, "level1_3": { "level2_1": { "level3_1": [1, 2, 3], "level3_2": [4, 5, 6] }, "level2_2": [4, 5, 6] } } # %% FIXTURES @pytest.fixture def h5_data(request): """ create dummy hdf5 test data file for testing PyContainer and H5NodeFilterProxy """ dummy_file = h5.File('test_load_builtins.hdf5','w') dummy_file = h5.File('load_numpy_{}.hdf5'.format(request.function.__name__),'w') filename = dummy_file.filename test_data = dummy_file.create_group("root_group") yield test_data dummy_file.close() # %% FUNCTION DEFINITIONS def test_create_np_scalar(h5_data,compression_kwargs): """ tests proper storage and loading of numpy scalars """ # check that scalar dataset is created for numpy scalar scalar_data = np.float64(np.pi) dtype = scalar_data.dtype h_dataset,subitems = load_numpy.create_np_scalar_dataset(scalar_data,h5_data,"scalar_data",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and iter(subitems) and not subitems assert h_dataset.attrs['np_dtype'] in ( dtype.str.encode('ascii'),dtype.str) assert h_dataset[()] == scalar_data assert load_numpy.load_np_scalar_dataset(h_dataset,b'np_scalar',scalar_data.__class__) == scalar_data # check that numpy.bool_ scarlar is properly stored and reloaded scalar_data = np.bool_(True) dtype = scalar_data.dtype h_dataset,subitems = load_numpy.create_np_scalar_dataset(scalar_data,h5_data,"generic_data",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and iter(subitems) and not subitems assert h_dataset.attrs['np_dtype'] in ( dtype.str.encode('ascii'),dtype.str) and h_dataset[()] == scalar_data assert load_numpy.load_np_scalar_dataset(h_dataset,b'np_scalar',scalar_data.__class__) == scalar_data def test_create_np_dtype(h5_data,compression_kwargs): """ test proper creation and loading of dataset representing numpy dtype """ dtype = np.dtype(np.int16) h_dataset,subitems = load_numpy.create_np_dtype(dtype, h5_data,"dtype_string",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and iter(subitems) and not subitems assert bytes(h_dataset[()]).decode('ascii') == dtype.str assert load_numpy.load_np_dtype_dataset(h_dataset,'np_dtype',np.dtype) == dtype def test_create_np_ndarray(h5_data,compression_kwargs): """ test proper creation and loading of numpy ndarray """ # check that numpy array representing python utf8 string is properly # stored as bytearray dataset and reloaded from np_array_data = np.array("im python string") h_dataset,subitems = load_numpy.create_np_array_dataset(np_array_data,h5_data,"numpy_string_array",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and iter(subitems) and not subitems assert bytes(h_dataset[()]) == np_array_data.tolist().encode("utf8") assert h_dataset.attrs["np_dtype"] in ( np_array_data.dtype.str.encode("ascii"),np_array_data.dtype.str) assert load_numpy.load_ndarray_dataset(h_dataset,b'ndarray',np.ndarray) == np_array_data # check that numpy array representing python bytes string is properly # stored as bytearray dataset and reloaded from np_array_data = np.array(b"im python bytes") h_dataset,subitems = 
load_numpy.create_np_array_dataset(np_array_data,h5_data,"numpy_bytes_array",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and iter(subitems) and not subitems assert h_dataset[()] == np_array_data.tolist() assert h_dataset.attrs["np_dtype"] in ( np_array_data.dtype.str.encode("ascii"),np_array_data.dtype.str) assert load_numpy.load_ndarray_dataset(h_dataset,b'ndarray',np.ndarray) == np_array_data # check that numpy array with dtype object representing list of various kinds # of objects is converted to list before storing and reloaded properly from this # list representation # NOTE: simplified as mixing items of varying length receives # VisibleDeprecationWarning from newer numpy versions #np_array_data = np.array([[NESTED_DICT], ('What is this?',), {1, 2, 3, 7, 1}]) np_array_data = np.array([NESTED_DICT])#, ('What is this?',), {1, 2, 3, 7, 1}]) h_dataset,subitems = load_numpy.create_np_array_dataset(np_array_data,h5_data,"numpy_list_object_array",**compression_kwargs) ndarray_container = load_numpy.NDArrayLikeContainer(h_dataset.attrs,b'ndarray',np_array_data.__class__) assert isinstance(h_dataset,h5.Group) and iter(subitems) assert h_dataset.attrs["np_dtype"] in ( np_array_data.dtype.str.encode("ascii"),np_array_data.dtype.str) for index,(name,item,attrs,kwargs) in enumerate(subitems): assert name == "data{:d}".format(index) and attrs.get("item_index",None) == index assert isinstance(kwargs,dict) and np_array_data[index] == item ndarray_container.append(name,item,attrs) assert np.all(ndarray_container.convert() == np_array_data) # check that numpy array containing multiple strings of length > 1 # is properly converted to list of strings and restored from its list # representation np_array_data = np.array(["1313e", "was", "maybe?", "here"]) h_dataset,subitems = load_numpy.create_np_array_dataset(np_array_data,h5_data,"numpy_list_of_strings_array",**compression_kwargs) ndarray_container = load_numpy.NDArrayLikeContainer(h_dataset.attrs,b'ndarray',np_array_data.__class__) assert isinstance(h_dataset,h5.Group) and iter(subitems) assert h_dataset.attrs["np_dtype"] in ( np_array_data.dtype.str.encode("ascii"),np_array_data.dtype.str) for index,(name,item,attrs,kwargs) in enumerate(subitems): assert name == "data{:d}".format(index) and attrs.get("item_index",None) == index assert isinstance(kwargs,dict) and np_array_data[index] == item ndarray_container.append(name,item,attrs) assert np.all(ndarray_container.convert() == np_array_data) # check that numpy array with object dtype which is converted to single object # by ndarray.tolist method is properly stored according to type of object and # restored from this representation accordingly np_array_data = np.array(NESTED_DICT) h_dataset,subitems = load_numpy.create_np_array_dataset(np_array_data,h5_data,"numpy_object_array",**compression_kwargs) ndarray_container = load_numpy.NDArrayLikeContainer(h_dataset.attrs,b'ndarray',np_array_data.__class__) ndarray_pickle_container = load_numpy.NDArrayLikeContainer(h_dataset.attrs,b'ndarray',np_array_data.__class__) assert isinstance(h_dataset,h5.Group) and iter(subitems) assert h_dataset.attrs["np_dtype"] in ( np_array_data.dtype.str.encode("ascii"),np_array_data.dtype.str) data_set = False for name,item,attrs,kwargs in subitems: if name == "data": assert not data_set and not attrs and isinstance(kwargs,dict) assert np_array_data[()] == item data_set = True ndarray_container.append(name,item,attrs) attrs = dict(attrs) attrs["base_type"] = b'pickle' 
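# feed the same item into a second container whose attrs claim base_type
# b'pickle' to check that both restore paths rebuild the original array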
ndarray_pickle_container.append(name,item,attrs) else: raise AssertionError("expected single data object") assert np.all(ndarray_container.convert() == np_array_data) assert np.all(ndarray_pickle_container.convert() == np_array_data) # check that numpy.matrix type object is properly stored and reloaded from # hickle file. # NOTE/TODO: current versions of numpy issue PendingDeprecationWarning when using # numpy.matrix. In order to indicate to pytest that this is known and can safely # be ignored the warning is captured here. Should future numpy versions # convert PendingDeprecationWarning into some kind of exception like TypeError, # AttributeError, RuntimeError or alike, then also capture these exceptions here, not # just PendingDeprecationWarning with pytest.warns(PendingDeprecationWarning): np_array_data = np.matrix([[1, 2], [3, 4]]) h_dataset,subitems = load_numpy.create_np_array_dataset(np_array_data,h5_data,"numpy_matrix",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and iter(subitems) and not subitems assert np.all(h_dataset[()] == np_array_data) assert h_dataset.attrs["np_dtype"] in ( np_array_data.dtype.str.encode("ascii"),np_array_data.dtype.str) np_loaded_array_data = load_numpy.load_ndarray_dataset(h_dataset,b'npmatrix',np.matrix) assert np.all(np_loaded_array_data == np_array_data) assert isinstance(np_loaded_array_data,np.matrix) assert np_loaded_array_data.shape == np_array_data.shape def test_create_np_masked_array(h5_data,compression_kwargs): """ test proper creation and loading of numpy.masked arrays """ # check that simple masked array is properly stored and loaded masked_array = np.ma.array([1, 2, 3, 4], dtype='float32', mask=[0, 1, 0, 0]) h_datagroup,subitems = load_numpy.create_np_masked_array_dataset(masked_array, h5_data, "masked_array",**compression_kwargs) masked_array_container = load_numpy.NDMaskedArrayContainer(h_datagroup.attrs,b'ndarray_masked',np.ma.array) assert isinstance(h_datagroup,h5.Group) and iter(subitems) assert h_datagroup.attrs["np_dtype"] in ( masked_array.dtype.str.encode("ascii"),masked_array.dtype.str) data_set = mask_set = False for name,item,attrs,kwargs in subitems: assert isinstance(attrs,dict) and isinstance(kwargs,dict) if name == "data": assert not data_set and not attrs and np.all(masked_array.data == item) and item is not masked_array masked_array_container.append(name,item,attrs) data_set = True elif name == "mask": assert not mask_set and not attrs and np.all(masked_array.mask == item) and item is not masked_array masked_array_container.append(name,item,attrs) mask_set = True else: raise AssertionError("expected one data and one mask object") assert np.all(masked_array_container.convert() == masked_array) # check that the format used by hickle version 4.0.0 to encode masked arrays is properly recognized # on loading and the masked array is restored accordingly h_dataset = h5_data.create_dataset("masked_array_dataset",data = masked_array.data) h_dataset.attrs["np_dtype"] = masked_array.dtype.str.encode("ascii") with pytest.raises(ValueError,match = r"mask\s+not\s+found"): loaded_masked_array = load_numpy.load_ndarray_masked_dataset(h_dataset,b'masked_array_data',np.ma.array) h_mask_dataset = h5_data.create_dataset("masked_array_dataset_mask",data = masked_array.mask) loaded_masked_array = load_numpy.load_ndarray_masked_dataset(h_dataset,b'masked_array_data',np.ma.array) assert np.all(loaded_masked_array == masked_array ) # %% MAIN SCRIPT if __name__ == "__main__": from _pytest.fixtures import FixtureRequest from hickle.tests.conftest 
import compression_kwargs for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_create_np_scalar),) ): test_create_np_scalar(h5_root,keywords) for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_create_np_dtype),) ): test_create_np_dtype(h5_root,keywords) for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_create_np_ndarray),) ): test_create_np_ndarray(h5_root,keywords) for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_create_np_masked_array),) ): test_create_np_masked_array(h5_root,keywords) hickle-5.0.2/hickle/tests/test_05_load_scipy.py000066400000000000000000000140361430361177200214310ustar00rootroot00000000000000#! /usr/bin/env python # encoding: utf-8 """ # test_load_scipy Unit tests for hickle module -- scipy loader. """ # %% IMPORTS # Package imports import pytest import h5py as h5 import numpy as np import pickle from scipy.sparse import csr_matrix, csc_matrix, bsr_matrix from py.path import local # %% HICKLE imports import hickle.loaders.load_scipy as load_scipy # Set the current working directory to the temporary directory local.get_temproot().chdir() # %% FIXTURES @pytest.fixture def h5_data(request): """ create dummy hdf5 test data file for testing PyContainer and H5NodeFilterProxy """ dummy_file = h5.File('test_load_builtins.hdf5','w') dummy_file = h5.File('load_numpy_{}.hdf5'.format(request.function.__name__),'w') filename = dummy_file.filename test_data = dummy_file.create_group("root_group") yield test_data dummy_file.close() # %% FUNCTION DEFINITIONS def test_return_first_function_type(): with pytest.raises(TypeError): load_scipy.return_first(['anything','some other thins','nothing']) def test_create_sparse_dataset(h5_data,compression_kwargs): """ test creation and loading of sparse matrix """ # create all possible kinds of sparse matrix representations row = np.array([0, 0, 1, 2, 2, 2]) col = np.array([0, 2, 2, 0, 1, 2]) data = np.array([1, 2, 3, 4, 5, 6]) sm1 = csr_matrix((data, (row, col)), shape=(3, 3)) sm2 = csc_matrix((data, (row, col)), shape=(3, 3)) indptr = np.array([0, 2, 3, 6]) indices = np.array([0, 2, 2, 0, 1, 2]) data = np.array([1, 2, 3, 4, 5, 6]).repeat(4).reshape([6, 2, 2]) sm3 = bsr_matrix((data, indices, indptr), shape=(6, 6)) # check that csr type matrix is properly stored and loaded h_datagroup,subitems = load_scipy.create_sparse_dataset(sm1,h5_data,"csr_matrix",**compression_kwargs) assert isinstance(h_datagroup,h5.Group) and iter(subitems) seen_items = dict((key,False) for key in ("data",'indices','indptr','shape')) sparse_container = load_scipy.SparseMatrixContainer(h_datagroup.attrs,b'csr_matrix',csr_matrix) for name,item,attrs,kwargs in subitems: assert not seen_items[name] seen_items[name] = True sparse_container.append(name,item,attrs) reloaded = sparse_container.convert() assert np.all(reloaded.data == sm1.data) and reloaded.dtype == sm1.dtype and reloaded.shape == sm1.shape # check that csc type matrix is properly stored and loaded h_datagroup,subitems = load_scipy.create_sparse_dataset(sm2,h5_data,"csc_matrix",**compression_kwargs) assert isinstance(h_datagroup,h5.Group) and iter(subitems) seen_items = dict((key,False) for key in ("data",'indices','indptr','shape')) sparse_container = load_scipy.SparseMatrixContainer(h_datagroup.attrs,b'csc_matrix',csc_matrix) for name,item,attrs,kwargs in subitems: assert not 
seen_items[name] seen_items[name] = True sparse_container.append(name,item,attrs) reloaded = sparse_container.convert() assert np.all(reloaded.data == sm2.data) and reloaded.dtype == sm2.dtype and reloaded.shape == sm2.shape # check that bsr type matrix is properly stored and loaded h_datagroup,subitems = load_scipy.create_sparse_dataset(sm3,h5_data,"bsr_matrix",**compression_kwargs) assert isinstance(h_datagroup,h5.Group) and iter(subitems) seen_items = dict((key,False) for key in ("data",'indices','indptr','shape')) sparse_container = load_scipy.SparseMatrixContainer(h_datagroup.attrs,b'bsr_matrix',bsr_matrix) for name,item,attrs,kwargs in subitems: assert not seen_items[name] seen_items[name] = True sparse_container.append(name,item,attrs) reloaded = sparse_container.convert() assert np.all(reloaded.data == sm3.data) and reloaded.dtype == sm3.dtype and reloaded.shape == sm3.shape # mimic hickle version 4.0.0 format to represent crs type matrix h_datagroup,subitems = load_scipy.create_sparse_dataset(sm1,h5_data,"csr_matrix_filtered",**compression_kwargs) sparse_container = load_scipy.SparseMatrixContainer(h_datagroup.attrs,b'csr_matrix',load_scipy.return_first) for name,item,attrs,kwargs in subitems: h_dataset = h_datagroup.create_dataset(name,data=item) if name == "data": attrs["type"] = np.array(pickle.dumps(sm1.__class__)) attrs["base_type"] = b'csr_matrix' h_dataset.attrs.update(attrs) # check that dataset representing hickle 4.0.0 representation of sparse matrix # is properly recognized by SparseMatrixContainer.filter method and sub items of # sparse matrix group are properly adjusted to be safely loaded by SparseMatrixContainer for name,h_dataset in sparse_container.filter(h_datagroup): if name == "shape": sparse_container.append(name,tuple(h_dataset[()]),h_dataset.attrs) else: sparse_container.append(name,np.array(h_dataset[()]),h_dataset.attrs) reloaded = sparse_container.convert() assert np.all(reloaded.data == sm1.data) and reloaded.dtype == sm1.dtype and reloaded.shape == sm1.shape # verify that SparseMatrixContainer.filter method ignores any items which # are not recognized by SparseMatrixContainer update or convert method h_datagroup.create_dataset("ignoreme",data=12) for name,h_dataset in sparse_container.filter(h_datagroup): if name == "shape": sparse_container.append(name,tuple(h_dataset[()]),h_dataset.attrs) else: sparse_container.append(name,np.array(h_dataset[()]),h_dataset.attrs) reloaded = sparse_container.convert() assert np.all(reloaded.data == sm1.data) and reloaded.dtype == sm1.dtype and reloaded.shape == sm1.shape # %% MAIN SCRIPT if __name__ == "__main__": from _pytest.fixtures import FixtureRequest from hickle.tests.conftest import compression_kwargs test_return_first_function_type() for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_create_sparse_dataset),) ): test_create_sparse_dataset(h5_root,keywords) hickle-5.0.2/hickle/tests/test_06_load_astropy.py000077500000000000000000000373611430361177200220150ustar00rootroot00000000000000#! /usr/bin/env python # encoding: utf-8 """ # test_load_astropy Unit tests for hickle module -- astropy loader. 
""" # %% IMPORTS # Package imports import h5py as h5 import numpy as np import pytest from astropy.units import Quantity from astropy.time import Time from astropy.coordinates import Angle, SkyCoord import astropy.constants as apc from astropy.table import Table import numpy as np from py.path import local # hickle imports import hickle.loaders.load_astropy as load_astropy # Set the current working directory to the temporary directory local.get_temproot().chdir() # %% FIXTURES @pytest.fixture def h5_data(request): """ create dummy hdf5 test data file for testing PyContainer and H5NodeFilterProxy """ dummy_file = h5.File('test_load_builtins.hdf5','w') dummy_file = h5.File('load_numpy_{}.hdf5'.format(request.function.__name__),'w') filename = dummy_file.filename test_data = dummy_file.create_group("root_group") yield test_data dummy_file.close() # %% FUNCTION DEFINITIONS def test_create_astropy_quantity(h5_data,compression_kwargs): """ test proper storage and loading of astropy quantities """ for index,uu in enumerate(['m^3', 'm^3 / s', 'kg/pc']): a = Quantity(7, unit=uu) h_dataset,subitems = load_astropy.create_astropy_quantity(a,h5_data,"quantity{}".format(index),**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and not subitems and iter(subitems) a_unit_string = a.unit.to_string() assert h_dataset.attrs['unit'] in ( a_unit_string.encode("ascii"),a_unit_string) and h_dataset[()] == a.value reloaded = load_astropy.load_astropy_quantity_dataset(h_dataset,b'astropy_quantity',Quantity) assert reloaded == a and reloaded.unit == a.unit a *= a h_dataset,subitems = load_astropy.create_astropy_quantity(a,h5_data,"quantity_sqr{}".format(index),**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and not subitems and iter(subitems) a_unit_string = a.unit.to_string() assert h_dataset.attrs['unit'] in ( a_unit_string.encode("ascii"),a_unit_string) and h_dataset[()] == a.value reloaded = load_astropy.load_astropy_quantity_dataset(h_dataset,b'astropy_quantity',Quantity) assert reloaded == a and reloaded.unit == a.unit def test_create_astropy_constant(h5_data,compression_kwargs): """ test proper storage and loading of astropy constants """ h_dataset,subitems = load_astropy.create_astropy_constant(apc.G,h5_data,"apc_G",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and not subitems and iter(subitems) apc_G_unit_string = apc.G.unit.to_string() assert h_dataset.attrs["unit"] in (apc_G_unit_string.encode('ascii'),apc_G_unit_string) assert h_dataset.attrs["abbrev"] in (apc.G.abbrev.encode('ascii'),apc.G.abbrev) assert h_dataset.attrs["name"] in ( apc.G.name.encode('ascii'),apc.G.name) assert h_dataset.attrs["reference"] in ( apc.G.reference.encode('ascii'),apc.G.reference) assert h_dataset.attrs["uncertainty"] == apc.G.uncertainty reloaded = load_astropy.load_astropy_constant_dataset(h_dataset,b'astropy_constant',apc.G.__class__) assert reloaded == apc.G and reloaded.dtype == apc.G.dtype h_dataset,subitems = load_astropy.create_astropy_constant(apc.cgs.e,h5_data,"apc_cgs_e",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and not subitems and iter(subitems) assert h_dataset.attrs["unit"] in ( apc.cgs.e.unit.to_string().encode('ascii'),apc.cgs.e.unit) assert h_dataset.attrs["abbrev"] in ( apc.cgs.e.abbrev.encode('ascii'),apc.cgs.e.abbrev) assert h_dataset.attrs["name"] in (apc.cgs.e.name.encode('ascii'),apc.cgs.e.name) assert h_dataset.attrs["reference"] in ( apc.cgs.e.reference.encode('ascii'),apc.cgs.e.reference) assert h_dataset.attrs["uncertainty"] == 
apc.cgs.e.uncertainty assert h_dataset.attrs["system"] in ( apc.cgs.e.system.encode('ascii'),apc.cgs.e.system ) reloaded = load_astropy.load_astropy_constant_dataset(h_dataset,b'astropy_constant',apc.cgs.e.__class__) assert reloaded == apc.cgs.e and reloaded.dtype == apc.cgs.e.dtype def test_astropy_table(h5_data,compression_kwargs): """ test proper storage and loading of astropy table """ t = Table([[1, 2], [3, 4]], names=('a', 'b'), meta={'name': 'test_thing'}) h_dataset,subitems = load_astropy.create_astropy_table(t,h5_data,"astropy_table",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and not subitems and iter(subitems) assert ( np.all(h_dataset.attrs['colnames'] == [ cname.encode('ascii') for cname in t.colnames]) or np.all(h_dataset.attrs['colnames'] == [ cname for cname in t.colnames]) ) for metakey,metavalue in t.meta.items(): assert h_dataset.attrs[metakey] == metavalue assert h_dataset.dtype == t.as_array().dtype reloaded = load_astropy.load_astropy_table(h_dataset,b'astropy_table',t.__class__) assert reloaded.meta == t.meta and reloaded.dtype == t.dtype assert np.allclose(t['a'].astype('float32'),reloaded['a'].astype('float32')) assert np.allclose(t['b'].astype('float32'),reloaded['b'].astype('float32')) def test_astropy_quantity_array(h5_data,compression_kwargs): """ test proper storage and loading of array of astropy quantities """ a = Quantity([1, 2, 3], unit='m') h_dataset,subitems = load_astropy.create_astropy_quantity(a,h5_data,"quantity_array",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and not subitems and iter(subitems) assert h_dataset.attrs['unit'] in (a.unit.to_string().encode("ascii"),a.unit.to_string()) and np.all(h_dataset[()] == a.value) reloaded = load_astropy.load_astropy_quantity_dataset(h_dataset,b'astropy_quantity',Quantity) assert np.all(reloaded == a) and reloaded.unit == a.unit def test_astropy_time_array(h5_data,compression_kwargs): """ test proper storage and loading of astropy time representations """ loop_counter = 0 for times in ([58264, 58265, 58266], [[58264, 58265, 58266], [58264, 58265, 58266]]): t1 = Time(times, format='mjd', scale='utc') h_dataset, subitems = load_astropy.create_astropy_time(t1,h5_data, f'time_{loop_counter}',**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and not subitems and iter(subitems) assert h_dataset.attrs['format'] in( str(t1.format).encode('ascii'),str(t1.format)) assert h_dataset.attrs['scale'] in ( str(t1.scale).encode('ascii'),str(t1.scale)) assert h_dataset.attrs['np_dtype'] in( t1.value.dtype.str.encode('ascii'),t1.value.dtype.str) reloaded = load_astropy.load_astropy_time_dataset(h_dataset,b'astropy_time',t1.__class__) assert reloaded.value.shape == t1.value.shape assert reloaded.format == t1.format assert reloaded.scale == t1.scale for index in range(len(t1)): assert np.allclose(reloaded.value[index], t1.value[index]) loop_counter += 1 t_strings = ['1999-01-01T00:00:00.123456789', '2010-01-01T00:00:00'] # Check that 2D time arrays work as well (github issue #162) for times in (t_strings, [t_strings, t_strings]): t1 = Time(times, format='isot', scale='utc') h_dataset,subitems = load_astropy.create_astropy_time(t1,h5_data,f'time_{loop_counter}',**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and not subitems and iter(subitems) assert h_dataset.attrs['format'] in (str(t1.format).encode('ascii'),str(t1.format)) assert h_dataset.attrs['scale'] in (str(t1.scale).encode('ascii'),str(t1.scale)) assert h_dataset.attrs['np_dtype'] in ( 
t1.value.dtype.str.encode('ascii'),t1.value.dtype.str) reloaded = load_astropy.load_astropy_time_dataset(h_dataset,b'astropy_time',t1.__class__) assert reloaded.value.shape == t1.value.shape assert reloaded.format == t1.format assert reloaded.scale == t1.scale for index in range(len(t1)): assert reloaded.value[index].tostring() == t1.value[index].tostring() del h_dataset.attrs['np_dtype'] reloaded = load_astropy.load_astropy_time_dataset(h_dataset,b'astropy_time',t1.__class__) assert reloaded.value.shape == t1.value.shape assert reloaded.format == t1.format assert reloaded.scale == t1.scale for index in range(len(t1)): assert reloaded.value[index].tostring() == t1.value[index].tostring() loop_counter += 1 def test_astropy_angle(h5_data,compression_kwargs): """ test proper storage of astropy angles """ for index,uu in enumerate(['radian', 'degree']): a = Angle(1.02, unit=uu) h_dataset,subitems = load_astropy.create_astropy_angle(a,h5_data,"angle_{}".format(uu),**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and not subitems and iter(subitems) assert h_dataset.attrs['unit'] in( a.unit.to_string().encode('ascii'),a.unit.to_string()) assert h_dataset[()] == a.value reloaded = load_astropy.load_astropy_angle_dataset(h_dataset,b'astropy_angle',a.__class__) assert reloaded == a and reloaded.unit == a.unit def test_astropy_angle_array(h5_data,compression_kwargs): """ test proper storage and loading of arrays of astropy angles """ a = Angle([1, 2, 3], unit='degree') h_dataset,subitems = load_astropy.create_astropy_angle(a,h5_data,"angle_array",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and not subitems and iter(subitems) assert h_dataset.attrs['unit'] in (a.unit.to_string().encode('ascii'),a.unit.to_string()) assert np.allclose(h_dataset[()] , a.value ) reloaded = load_astropy.load_astropy_angle_dataset(h_dataset,b'astropy_angle',a.__class__) assert np.all(reloaded == a) and reloaded.unit == a.unit def test_astropy_skycoord(h5_data,compression_kwargs): """ test proper storage and loading of astropy sky coordinates """ ra = Angle('1d20m', unit='degree') dec = Angle('33d0m0s', unit='degree') radec = SkyCoord(ra, dec) h_dataset,subitems = load_astropy.create_astropy_skycoord(radec,h5_data,"astropy_skycoord_1",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and not subitems and iter(subitems) assert h_dataset[()][...,0] == radec.data.lon.value assert h_dataset[()][...,1] == radec.data.lat.value assert h_dataset.attrs['lon_unit'] in ( radec.data.lon.unit.to_string().encode('ascii'),radec.data.lon.unit.to_string()) assert h_dataset.attrs['lat_unit'] in ( radec.data.lat.unit.to_string().encode('ascii'),radec.data.lat.unit.to_string()) reloaded = load_astropy.load_astropy_skycoord_dataset(h_dataset,b'astropy_skycoord',radec.__class__) assert np.allclose(reloaded.ra.value,radec.ra.value) assert np.allclose(reloaded.dec.value,radec.dec.value) ra = Angle('1d20m', unit='hourangle') dec = Angle('33d0m0s', unit='degree') radec = SkyCoord(ra, dec) h_dataset,subitems = load_astropy.create_astropy_skycoord(radec,h5_data,"astropy_skycoord_2",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and not subitems and iter(subitems) assert h_dataset[()][...,0] == radec.data.lon.value assert h_dataset[()][...,1] == radec.data.lat.value assert h_dataset.attrs['lon_unit'] in (radec.data.lon.unit.to_string().encode('ascii'),radec.data.lon.unit.to_string()) assert h_dataset.attrs['lat_unit'] in ( 
radec.data.lat.unit.to_string().encode('ascii'),radec.data.lat.unit.to_string()) reloaded = load_astropy.load_astropy_skycoord_dataset(h_dataset,b'astropy_skycoord',radec.__class__) assert reloaded.ra.value == radec.ra.value assert reloaded.dec.value == radec.dec.value def test_astropy_skycoord_array(h5_data,compression_kwargs): """ test proper storage and loading of astropy sky coordinates """ ra = Angle(['1d20m', '0d21m'], unit='degree') dec = Angle(['33d0m0s', '-33d01m'], unit='degree') radec = SkyCoord(ra, dec) h_dataset,subitems = load_astropy.create_astropy_skycoord(radec,h5_data,"astropy_skycoord_1",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and not subitems and iter(subitems) assert np.allclose(h_dataset[()][...,0],radec.data.lon.value) assert np.allclose(h_dataset[()][...,1],radec.data.lat.value) assert h_dataset.attrs['lon_unit'] in ( radec.data.lon.unit.to_string().encode('ascii'),radec.data.lon.unit.to_string()) assert h_dataset.attrs['lat_unit'] in ( radec.data.lat.unit.to_string().encode('ascii'),radec.data.lat.unit.to_string()) reloaded = load_astropy.load_astropy_skycoord_dataset(h_dataset,b'astropy_skycoord',radec.__class__) assert np.allclose(reloaded.ra.value,radec.ra.value) assert np.allclose(reloaded.dec.value,radec.dec.value) ra = Angle([['1d20m', '0d21m'], ['1d20m', '0d21m']], unit='hourangle') dec = Angle([['33d0m0s', '33d01m'], ['33d0m0s', '33d01m']], unit='degree') radec = SkyCoord(ra, dec) h_dataset,subitems = load_astropy.create_astropy_skycoord(radec,h5_data,"astropy_skycoord_2",**compression_kwargs) assert isinstance(h_dataset,h5.Dataset) and not subitems and iter(subitems) assert np.allclose(h_dataset[()][...,0],radec.data.lon.value) assert np.allclose(h_dataset[()][...,1],radec.data.lat.value) assert h_dataset.attrs['lon_unit'] in ( radec.data.lon.unit.to_string().encode('ascii'),radec.data.lon.unit.to_string()) assert h_dataset.attrs['lat_unit'] in ( radec.data.lat.unit.to_string().encode('ascii'),radec.data.lat.unit.to_string()) reloaded = load_astropy.load_astropy_skycoord_dataset(h_dataset,b'astropy_skycoord',radec.__class__) assert np.allclose(reloaded.ra.value,radec.ra.value) assert np.allclose(reloaded.dec.value,radec.dec.value) assert reloaded.ra.shape == radec.ra.shape assert reloaded.dec.shape == radec.dec.shape # %% MAIN SCRIPT if __name__ == "__main__": from _pytest.fixtures import FixtureRequest from conftest import compression_kwargs for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_create_astropy_quantity),) ): test_create_astropy_quantity(h5_root,keywords) for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_create_astropy_constant),) ): test_create_astropy_constant(h5_root,keywords) for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_astropy_table),) ): test_astropy_table(h5_root,keywords) for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_astropy_quantity_array),) ): test_astropy_quantity_array(h5_root,keywords) for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_astropy_time_array),) ): test_astropy_time_array(h5_root,keywords) for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_astropy_angle),) ): test_astropy_angle(h5_root,keywords) for h5_root,keywords in ( ( 
h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_astropy_angle_array),) ): test_astropy_angle_array(h5_root,keywords) for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_astropy_skycoord),) ): test_astropy_skycoord(h5_root,keywords) for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_astropy_skycoord_array),) ): test_astropy_skycoord_array(h5_root,keywords) hickle-5.0.2/hickle/tests/test_07_load_pandas.py000066400000000000000000000031231430361177200215450ustar00rootroot00000000000000#! /usr/bin/env python # encoding: utf-8 """ # test_load_pandas Unit tests for hickle module -- pandas loader. """ # TODO add tests for all loader related dump_fcn, load_fcn functions # and PyContainer classes as soon as there exists any pandas # specific loader # %% IMPORTS # Package imports import h5py as h5 import numpy as np import pytest import pandas as pd from py.path import local # hickle imports import hickle.loaders.load_pandas as load_pandas # Set the current working directory to the temporary directory local.get_temproot().chdir() # %% FIXTURES @pytest.fixture def h5_data(request): """ create dummy hdf5 test data file for testing PyContainer and H5NodeFilterProxy """ dummy_file = h5.File('test_load_builtins.hdf5','w') dummy_file = h5.File('load_numpy_{}.hdf5'.format(request.function.__name__),'w') filename = dummy_file.filename test_data = dummy_file.create_group("root_group") yield test_data dummy_file.close() # %% FUNCTION DEFINITIONS def test_nothing_yet_totest(h5_data,compression_kwargs): """ dummy test function to be removed as soon as load_pandas loader module contains dump_fcn, load_fcn and PyContainer functions and classes for pandas arrays and objects. """ # %% MAIN SCRIPT if __name__ == "__main__": from _pytest.fixtures import FixtureRequest from conftest import compression_kwargs for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_nothing_yet_totest),) ): test_nothing_yet_totest(h5_root,keywords) hickle-5.0.2/hickle/tests/test_99_hickle_core.py000066400000000000000000000343641430361177200215750ustar00rootroot00000000000000#! /usr/bin/env python # encoding: utf-8 """ # test_hickle.py Unit tests for hickle module. 
""" # %% IMPORTS # Built-in imports from collections import OrderedDict as odict import os import re from pprint import pprint # Package imports import pytest import pickle import h5py import numpy as np from py.path import local # hickle imports from hickle import dump, helpers, hickle, load, lookup, fileio # Set current working directory to the temporary directory local.get_temproot().chdir() # %% GLOBALS # %% HELPER DEFINITIONS # %% FIXTURES @pytest.fixture def h5_data(request): """ create dummy hdf5 test data file for testing PyContainer and H5NodeFilterProxy """ import h5py as h5 dummy_file = h5.File('hickle_core_{}.hdf5'.format(request.function.__name__),'w') filename = dummy_file.filename test_data = dummy_file.create_group("root_group") yield test_data dummy_file.close() @pytest.fixture def test_file_name(request): yield "{}.hkl".format(request.function.__name__) # %% FUNCTION DEFINITIONS def test_recursive_dump(h5_data,compression_kwargs): """ test _dump function and that it properly calls itself recursively """ # check that dump function properly creates a list dataset and # sets appropriate values for 'type' and 'base_type' attributes data = simple_list = [1,2,3,4] with lookup.ReferenceManager.create_manager(h5_data) as memo: with lookup.LoaderManager.create_manager(h5_data) as loader: hickle._dump(data, h5_data, "simple_list",memo,loader,**compression_kwargs) dumped_data = h5_data["simple_list"] assert memo.resolve_type(dumped_data) == (data.__class__,b'list',False) assert np.all(dumped_data[()] == simple_list) # check that dump function properly creates a group representing # a dictionary and its keys and values and sets appropriate values # for 'type', 'base_type' and 'key_base_type' attributes data = { '12':12, (1,2,3):'hallo' } hickle._dump(data, h5_data, "some_dict",memo,loader,**compression_kwargs) dumped_data = h5_data["some_dict"] assert memo.resolve_type(dumped_data) == (data.__class__,b'dict',True) # check that the name of the resulting dataset for the first dict item # resembles double quoted string key and 'type', 'base_type 'key_base_type' # attributes the resulting dataset are set accordingly first_item = dumped_data['"12"'] assert first_item[()] == 12 and first_item.attrs['key_base_type'] in (b'str','str') assert memo.resolve_type(first_item) == (data['12'].__class__,b'int',False) #assert first_item.attrs['base_type'] == b'int' #assert first_item.attrs['type'] == pickle.dumps(data['12'].__class__) # check that second item is converted into key value pair group, that # the name of that group reads 'data0' and that 'type', 'base_type' and # 'key_base_type' attributes are set accordingly second_item = dumped_data.get("data0",None) if second_item is None: second_item = dumped_data["data1"] assert second_item.attrs['key_base_type'] in (b'key_value','key_value') assert memo.resolve_type(second_item) == (tuple,b'tuple',True) #assert second_item.attrs['type'] == pickle.dumps(tuple) # check that content of key value pair group resembles key and value of # second dict item key = second_item['data0'] value = second_item['data1'] assert np.all(key[()] == (1,2,3)) # and key.attrs['base_type'] == b'tuple' assert memo.resolve_type(key) == (tuple,b'tuple',False) assert bytes(value[()]) == 'hallo'.encode('utf8') # and value.attrs['base_type'] == b'str' assert memo.resolve_type(value) == (str,b'str',False) # check that objects for which no loader has been registered or for which # available loader raises NotHicklable exception are handled by # create_pickled_dataset function def 
fail_create_dict(py_obj,h_group,name,**kwargs): raise helpers.NotHicklable("test loader shrugg") loader.types_dict.maps.insert(0,{dict:(fail_create_dict,*loader.types_dict[dict][1:])}) memo_backup = memo.pop(id(data),None) with pytest.warns(lookup.SerializedWarning): hickle._dump(data, h5_data, "pickled_dict",memo,loader,**compression_kwargs) dumped_data = h5_data["pickled_dict"] assert bytes(dumped_data[()]) == pickle.dumps(data) loader.types_dict.maps.pop(0) memo[id(data)] = memo_backup def test_recursive_load(h5_data,compression_kwargs): """ test _load function and that it properly calls itself recursively """ # check that simple scalar value is properly restored on load from # corresponding dataset data = 42 data_name = "the_answer" with lookup.ReferenceManager.create_manager(h5_data) as memo: with lookup.LoaderManager.create_manager(h5_data) as loader: hickle._dump(data, h5_data, data_name,memo,loader,**compression_kwargs) py_container = hickle.RootContainer(h5_data.attrs,b'hickle_root',hickle.RootContainer) hickle._load(py_container, data_name, h5_data[data_name],memo,loader) assert py_container.convert() == data # check that dict object is properly restored on load from corresponding group data = {'question':None,'answer':42} data_name = "not_formulated" hickle._dump(data, h5_data, data_name,memo,loader,**compression_kwargs) py_container = hickle.RootContainer(h5_data.attrs,b'hickle_root',hickle.RootContainer) hickle._load(py_container, data_name, h5_data[data_name],memo,loader) assert py_container.convert() == data # check that objects for which no loader has been registered or for which # available loader raises NotHicklable exception are properly restored on load # from corresponding copy protocol group or pickled data string def fail_create_dict(py_obj,h_group,name,**kwargs): raise helpers.NotHicklable("test loader shrugg") loader.types_dict.maps.insert(0,{dict:(fail_create_dict,*loader.types_dict[dict][1:])}) data_name = "pickled_dict" memo_backup = memo.pop(id(data),None) with pytest.warns(lookup.SerializedWarning): hickle._dump(data, h5_data, data_name,memo,loader,**compression_kwargs) hickle._load(py_container, data_name, h5_data[data_name],memo,loader) assert py_container.convert() == data loader.types_dict.maps.pop(0) memo[id(data)] = memo_backup # %% ISSUE RELATED TESTS def test_invalid_file(compression_kwargs): """ Test if trying to use a non-file object fails. 
""" with pytest.raises(hickle.FileError): dump('test', (),**compression_kwargs) def test_binary_file(test_file_name,compression_kwargs): """ Test if using a binary file works https://github.com/telegraphic/hickle/issues/123""" filename = test_file_name.replace(".hkl",".hdf5") with open(filename, "w") as f: with pytest.raises(hickle.FileError): hickle.dump(None, f,**compression_kwargs) with open(filename, "w+") as f: with pytest.raises(hickle.FileError): hickle.dump(None, f,**compression_kwargs) with open(filename, "wb") as f: with pytest.raises(hickle.FileError): hickle.dump(None, f,**compression_kwargs) with open(filename, "w+b") as f: hickle.dump(None, f,**compression_kwargs) def test_file_open_close(test_file_name,h5_data,compression_kwargs): """ https://github.com/telegraphic/hickle/issues/20 """ import h5py f = h5py.File(test_file_name.replace(".hkl",".hdf"), 'w') a = np.arange(5) dump(a, test_file_name,**compression_kwargs) dump(a, test_file_name,**compression_kwargs) dump(a, f, mode='w',**compression_kwargs) f.close() with pytest.raises(hickle.ClosedFileError): dump(a, f, mode='w',**compression_kwargs) h5_data.create_dataset('nothing',data=[]) with pytest.raises(ValueError,match = r"Unable\s+to\s+create\s+group\s+\(name\s+already\s+exists\)"): dump(a,h5_data.file,path="/root_group",**compression_kwargs) def test_hdf5_group(test_file_name,compression_kwargs): import h5py hdf5_filename = test_file_name.replace(".hkl",".hdf5") file = h5py.File(hdf5_filename, 'w') group = file.create_group('test_group') a = np.arange(5) dump(a, group,**compression_kwargs) file.close() a_hkl = load(hdf5_filename, path='/test_group') assert np.allclose(a_hkl, a) file = h5py.File(hdf5_filename, 'r+') group = file.create_group('test_group2') b = np.arange(8) dump(b, group, path='deeper/and_deeper',**compression_kwargs) file.close() with pytest.raises(ValueError): b_hkl = load(hdf5_filename, path='/test_group2/deeper_/and_deeper') b_hkl = load(hdf5_filename, path='/test_group2/deeper/and_deeper') assert np.allclose(b_hkl, b) file = h5py.File(hdf5_filename, 'r') b_hkl2 = load(file['test_group2'], path='deeper/and_deeper') assert np.allclose(b_hkl2, b) file.close() def test_with_open_file(test_file_name,compression_kwargs): """ Testing dumping and loading to an open file https://github.com/telegraphic/hickle/issues/92""" lst = [1] tpl = (1,) dct = {1: 1} arr = np.array([1]) with h5py.File(test_file_name, 'w') as file: dump(lst, file, path='/lst',**compression_kwargs) dump(tpl, file, path='/tpl',**compression_kwargs) dump(dct, file, path='/dct',**compression_kwargs) dump(arr, file, path='/arr',**compression_kwargs) with h5py.File(test_file_name, 'r') as file: assert load(file, '/lst') == lst assert load(file, '/tpl') == tpl assert load(file, '/dct') == dct assert load(file, '/arr') == arr def test_load(test_file_name,compression_kwargs): a = set([1, 2, 3, 4]) b = set([5, 6, 7, 8]) c = set([9, 10, 11, 12]) z = (a, b, c) z = [z, z] z = (z, z, z, z, z) print("Original:") pprint(z) dump(z, test_file_name, mode='w',**compression_kwargs) print("\nReconstructed:") z = load(test_file_name) pprint(z) def test_multi_hickle(test_file_name,compression_kwargs): """ Dumping to and loading from the same file several times https://github.com/telegraphic/hickle/issues/20""" a = {'a': 123, 'b': [1, 2, 4]} if os.path.exists(test_file_name): os.remove(test_file_name) dump(a, test_file_name, path="/test", mode="w",**compression_kwargs) dump(a, test_file_name, path="/test2", mode="r+",**compression_kwargs) dump(a, test_file_name, 
path="/test3", mode="r+",**compression_kwargs) dump(a, test_file_name, path="/test4", mode="r+",**compression_kwargs) load(test_file_name, path="/test") load(test_file_name, path="/test2") load(test_file_name, path="/test3") load(test_file_name, path="/test4") def test_improper_attrs(test_file_name,compression_kwargs): """ test for proper reporting missing mandatory attributes for the various supported file versions """ # check that missing attributes which disallow to identify # hickle version are reported data = "my name? Ha I'm Nobody" dump(data,test_file_name,**compression_kwargs) manipulated = h5py.File(test_file_name,"r+") root_group = manipulated.get('/') root_group.attrs["VERSION"] = root_group.attrs["HICKLE_VERSION"] del root_group.attrs["HICKLE_VERSION"] manipulated.flush() with pytest.raises( ValueError, match= r"Provided\s+argument\s+'file_obj'\s+does\s+not\s+appear" r"\s+to\s+be\s+a\s+valid\s+hickle\s+file!.*" ): load(manipulated) # %% MAIN SCRIPT if __name__ == '__main__': """ Some tests and examples """ from _pytest.fixtures import FixtureRequest from hickle.tests.conftest import compression_kwargs for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_recursive_dump),) ): test_recursive_dump(h5_root,keywords) for h5_root,keywords in ( ( h5_data(request),compression_kwargs(request) ) for request in (FixtureRequest(test_recursive_load),) ): test_recursive_load(h5_root,keywords) for keywords in compression_kwargs(FixtureRequest(test_recursive_dump)): test_invalid_file(keywords) for filename,keywords in ( ( test_file_name(request),compression_kwargs(request) ) for request in (FixtureRequest(test_binary_file),) ): test_binary_file(filename,keywords) for h5_root,filename,keywords in ( ( h5_data(request),test_file_name(request),compression_kwargs(request) ) for request in (FixtureRequest(test_file_open_close),) ): test_file_open_close(h5_root,filename,keywords) for filename,keywords in ( ( test_file_name(request),compression_kwargs(request) ) for request in (FixtureRequest(test_hdf5_group),) ): test_hdf5_group(filename,keywords) for filename,keywords in ( ( test_file_name(request),compression_kwargs(request) ) for request in (FixtureRequest(test_with_open_file),) ): test_with_open_file(filename,keywords) for filename,keywords in ( ( test_file_name(request),compression_kwargs(request) ) for request in (FixtureRequest(test_load),) ): test_load(filename,keywords) for filename,keywords in ( ( test_file_name(request),compression_kwargs(request) ) for request in (FixtureRequest(test_multi_hickle),) ): test_multi_hickle(filename,keywords) for filename,keywords in ( ( test_file_name(request),compression_kwargs(request) ) for request in (FixtureRequest(test_improper_attrs),) ): test_improper_attrs(filename,keywords) hickle-5.0.2/hickle/tests/test_hickle.py000066400000000000000000000640341430361177200202410ustar00rootroot00000000000000#! /usr/bin/env python # encoding: utf-8 """ # test_hickle.py Unit test for hickle package. 
""" # %% IMPORTS # Built-in imports from collections import OrderedDict as odict import os import re from pprint import pprint import pickle # Package imports import numpy as np from py.path import local import pytest # hickle imports from hickle import dump, hickle, load, lookup # Set current working directory to the temporary directory local.get_temproot().chdir() # %% GLOBALS NESTED_DICT = { "level1_1": { "level2_1": [1, 2, 3], "level2_2": [4, 5, 6] }, "level1_2": { "level2_1": [1, 2, 3], "level2_2": [4, 5, 6] }, "level1_3": { "level2_1": { "level3_1": [1, 2, 3], "level3_2": [4, 5, 6] }, "level2_2": [4, 5, 6] } } # %% FIXTURES @pytest.fixture def test_file_name(request): """ create test dependent filename path string """ yield "{}.hkl".format(request.function.__name__) # %% HELPER DEFINITIONS # Define a test function that must be serialized and unpacked again def func(a, b, c=0): """ just something to do """ return(a, b, c) # the following is required as package name of with_state is hickle # and load_loader refuses load any loader module for classes defined inside # hickle package exempt when defined within load_*.py loaders modules. # That has to be done by hickle sub modules directly using register_class function pickle_dumps = pickle.dumps pickle_loads = pickle.loads types_to_hide = set() def make_visible_to_dumps(obj,protocol=None,*,fix_imports=True): """ simulate loader functions defined outside hickle package """ if obj in types_to_hide: obj.__module__ = re.sub(r'^\s*(?!hickle\.)','hickle.',obj.__module__) elif obj.__class__ in types_to_hide: obj.__class__.__module__ = re.sub(r'^\s*(?!hickle\.)','hickle.',obj.__class__.__module__) return pickle_dumps(obj,protocol,fix_imports=fix_imports) def hide_from_hickle(bytes_obj,*,fix_imports=True,encoding="ASCII",errors="strict"): """ simulate loader function defined outside hickle package """ obj = pickle_loads(bytes_obj,fix_imports = fix_imports, encoding = encoding, errors = errors) if obj in types_to_hide: obj.__module__ = re.sub(r'^\s*hickle\.','',obj.__module__) elif obj.__class__ in types_to_hide: obj.__class__.__module__ = re.sub(r'^\s*hickle\.','',obj.__class__.__module__) return obj # Define a class that must always be pickled class with_state(object): """ A class that always must be handled by create_pickled_dataset """ def __init__(self): self.a = 12 self.b = { 'love': np.ones([12, 7]), 'hatred': np.zeros([4, 9])} def __getstate__(self): self.a *= 2 return({ 'a': self.a, 'b': self.b}) def __setstate__(self, state): self.a = state['a'] self.b = state['b'] def __getitem__(self, index): if(index == 0): return(self.a) if(index < 2): return(self.b['hatred']) if(index > 2): raise ValueError("index unknown") return(self.b['love']) types_to_hide.add(with_state) # %% FUNCTION DEFINITIONS def test_invalid_file(): """ Test if trying to use a non-file object fails. 
""" with pytest.raises(hickle.FileError): dump('test', ()) def test_state_obj(monkeypatch,test_file_name,compression_kwargs): """ Dumping and loading a class object with pickle states https://github.com/telegraphic/hickle/issues/125""" with monkeypatch.context() as monkey: monkey.setattr(with_state,'__module__',re.sub(r'^\s*hickle\.','',with_state.__module__)) monkey.setattr(pickle,'dumps',make_visible_to_dumps) mode = 'w' obj = with_state() with pytest.warns(lookup.SerializedWarning): dump(obj, test_file_name, mode,**compression_kwargs) monkey.setattr(pickle,'loads',hide_from_hickle) obj_hkl = load(test_file_name) assert isinstance(obj,obj_hkl.__class__) or isinstance(obj_hkl,obj.__class__) assert np.allclose(obj[1], obj_hkl[1]) def test_local_func(test_file_name,compression_kwargs): """ Dumping and loading a local function https://github.com/telegraphic/hickle/issues/119""" mode = 'w' with pytest.warns(lookup.SerializedWarning): dump(func, test_file_name, mode,**compression_kwargs) func_hkl = load(test_file_name) assert isinstance(func,func_hkl.__class__) or isinstance(func_hkl,func.__class__) assert func(1, 2) == func_hkl(1, 2) def test_non_empty_group(test_file_name,compression_kwargs): """ Test if attempting to dump to a group with data fails """ hickle.dump(None, test_file_name,**compression_kwargs) with pytest.raises(ValueError): dump(None, test_file_name, 'r+',**compression_kwargs) def test_string(test_file_name,compression_kwargs): """ Dumping and loading a string """ mode = 'w' string_obj = "The quick brown fox jumps over the lazy dog" dump(string_obj, test_file_name, mode,**compression_kwargs) string_hkl = load(test_file_name) assert isinstance(string_hkl, str) assert string_obj == string_hkl def test_65bit_int(test_file_name,compression_kwargs): """ Dumping and loading an integer with arbitrary precision https://github.com/telegraphic/hickle/issues/113""" i = 2**65-1 dump(i, test_file_name,**compression_kwargs) i_hkl = load(test_file_name) assert i == i_hkl j = -2**63-1 dump(j, test_file_name,**compression_kwargs) j_hkl = load(test_file_name) assert j == j_hkl def test_list(test_file_name,compression_kwargs): """ Dumping and loading a list """ filename, mode = 'test_list.h5', 'w' list_obj = [1, 2, 3, 4, 5] dump(list_obj, test_file_name, mode=mode,**compression_kwargs) list_hkl = load(test_file_name) try: assert isinstance(list_hkl, list) assert list_obj == list_hkl import h5py a = h5py.File(test_file_name, 'r') a.close() except AssertionError: print("ERR:", list_obj, list_hkl) import h5py raise def test_set(test_file_name,compression_kwargs) : """ Dumping and loading a list """ mode = 'w' list_obj = set([1, 0, 3, 4.5, 11.2]) dump(list_obj, test_file_name, mode,**compression_kwargs) list_hkl = load(test_file_name) try: assert isinstance(list_hkl, set) assert list_obj == list_hkl except AssertionError: print(type(list_obj)) print(type(list_hkl)) raise def test_numpy(test_file_name,compression_kwargs): """ Dumping and loading numpy array """ mode = 'w' dtypes = ['float32', 'float64', 'complex64', 'complex128'] for dt in dtypes: array_obj = np.ones(8, dtype=dt) dump(array_obj, test_file_name, mode,**compression_kwargs) array_hkl = load(test_file_name) try: assert array_hkl.dtype == array_obj.dtype assert np.all((array_hkl, array_obj)) except AssertionError: print(array_hkl) print(array_obj) raise def test_masked(test_file_name,compression_kwargs): """ Test masked numpy array """ mode = 'w' a = np.ma.array([1, 2, 3, 4], dtype='float32', mask=[0, 1, 0, 0]) dump(a, test_file_name, 
mode,**compression_kwargs) a_hkl = load(test_file_name) try: assert a_hkl.dtype == a.dtype assert np.all((a_hkl, a)) except AssertionError: print(a_hkl) print(a) raise def test_object_numpy(test_file_name,compression_kwargs): """ Dumping and loading a NumPy array containing non-NumPy objects. https://github.com/telegraphic/hickle/issues/90""" # VisibleDeprecationWarning from newer numpy versions #np_array_data = np.array([[NESTED_DICT], ('What is this?',), {1, 2, 3, 7, 1}]) arr = np.array([NESTED_DICT])#, ('What is this?',), {1, 2, 3, 7, 1}]) dump(arr, test_file_name,**compression_kwargs) arr_hkl = load(test_file_name) assert np.all(arr == arr_hkl) arr2 = np.array(NESTED_DICT) dump(arr2, test_file_name,**compression_kwargs) arr_hkl2 = load(test_file_name) assert np.all(arr2 == arr_hkl2) def test_string_numpy(test_file_name,compression_kwargs): """ Dumping and loading NumPy arrays containing Python 3 strings. """ arr = np.array(["1313e", "was", "maybe?", "here"]) dump(arr, test_file_name,**compression_kwargs) arr_hkl = load(test_file_name) assert np.all(arr == arr_hkl) def test_list_object_numpy(test_file_name,compression_kwargs): """ Dumping and loading a list of NumPy arrays with objects. https://github.com/telegraphic/hickle/issues/90""" # VisibleDeprecationWarning from newer numpy versions lst = [np.array(NESTED_DICT)]#, np.array([('What is this?',), # {1, 2, 3, 7, 1}])] dump(lst, test_file_name,**compression_kwargs) lst_hkl = load(test_file_name) assert np.all(lst[0] == lst_hkl[0]) #assert np.all(lst[1] == lst_hkl[1]) def test_dict(test_file_name,compression_kwargs): """ Test dictionary dumping and loading """ mode = 'w' dd = { 'name': b'Danny', 'age': 28, 'height': 6.1, 'dork': True, 'nums': [1, 2, 3], 'narr': np.array([1, 2, 3]), } dump(dd, test_file_name, mode,**compression_kwargs) dd_hkl = load(test_file_name) for k in dd.keys(): try: assert k in dd_hkl.keys() if isinstance(dd[k], np.ndarray): assert np.all((dd[k], dd_hkl[k])) else: pass assert isinstance(dd_hkl[k], dd[k].__class__) except AssertionError: print(k) print(dd_hkl[k]) print(dd[k]) print(type(dd_hkl[k]), type(dd[k])) raise def test_odict(test_file_name,compression_kwargs): """ Test ordered dictionary dumping and loading https://github.com/telegraphic/hickle/issues/65""" mode = 'w' od = odict(((3, [3, 0.1]), (7, [5, 0.1]), (5, [3, 0.1]))) dump(od, test_file_name, mode,**compression_kwargs) od_hkl = load(test_file_name) assert od.keys() == od_hkl.keys() for od_item, od_hkl_item in zip(od.items(), od_hkl.items()): assert od_item == od_hkl_item def test_empty_dict(test_file_name,compression_kwargs): """ Test empty dictionary dumping and loading https://github.com/telegraphic/hickle/issues/91""" mode = 'w' dump({}, test_file_name, mode,**compression_kwargs) assert load(test_file_name) == {} # TODO consider converting to parameterized test # or enable implicit parameterizing of all tests # though compression_kwargs fixture providing # various combinations of compression and chunking # related keywords @pytest.mark.no_compression def test_compression(test_file_name): """ Test compression on datasets""" mode = 'w' dtypes = ['int32', 'float32', 'float64', 'complex64', 'complex128'] comps = [None, 'gzip', 'lzf'] for dt in dtypes: for cc in comps: array_obj = np.ones(32768, dtype=dt) dump(array_obj, test_file_name, mode, compression=cc) print(cc, os.path.getsize(test_file_name)) array_hkl = load(test_file_name) try: assert array_hkl.dtype == array_obj.dtype assert np.all((array_hkl, array_obj)) except AssertionError: 
print(array_hkl) print(array_obj) raise def test_dict_int_key(test_file_name,compression_kwargs): """ Test for dictionaries with integer keys """ mode = 'w' dd = { 0: "test", 1: "test2" } dump(dd, test_file_name, mode,**compression_kwargs) load(test_file_name) def test_dict_nested(test_file_name,compression_kwargs): """ Test for dictionaries with integer keys """ mode = 'w' dd = NESTED_DICT dump(dd, test_file_name, mode,**compression_kwargs) dd_hkl = load(test_file_name) ll_hkl = dd_hkl["level1_3"]["level2_1"]["level3_1"] ll = dd["level1_3"]["level2_1"]["level3_1"] assert ll == ll_hkl def test_masked_dict(test_file_name,compression_kwargs): """ Test dictionaries with masked arrays """ filename, mode = 'test.h5', 'w' dd = { "data": np.ma.array([1, 2, 3], mask=[True, False, False]), "data2": np.array([1, 2, 3, 4, 5]) } dump(dd, test_file_name, mode,**compression_kwargs) dd_hkl = load(test_file_name) for k in dd.keys(): try: assert k in dd_hkl.keys() if isinstance(dd[k], np.ndarray): assert np.all((dd[k], dd_hkl[k])) elif isinstance(dd[k], np.ma.MaskedArray): print(dd[k].data) print(dd_hkl[k].data) assert np.allclose(dd[k].data, dd_hkl[k].data) assert np.allclose(dd[k].mask, dd_hkl[k].mask) assert isinstance(dd_hkl[k], dd[k].__class__) except AssertionError: print(k) print(dd_hkl[k]) print(dd[k]) print(type(dd_hkl[k]), type(dd[k])) raise def test_np_float(test_file_name,compression_kwargs): """ Test for singular np dtypes """ mode = 'w' dtype_list = (np.float16, np.float32, np.float64, np.complex64, np.complex128, np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64) for dt in dtype_list: dd = dt(1) dump(dd, test_file_name, mode,**compression_kwargs) dd_hkl = load(test_file_name) assert dd == dd_hkl assert dd.dtype == dd_hkl.dtype dd = {} for dt in dtype_list: dd[str(dt)] = dt(1.0) dump(dd, test_file_name, mode,**compression_kwargs) dd_hkl = load(test_file_name) print(dd) for dt in dtype_list: assert dd[str(dt)] == dd_hkl[str(dt)] # TODO consider converting to parameterized test # or enable implicit parameterizing of all tests # though compression_kwargs fixture providing # various combinations of compression and chunking # related keywords @pytest.mark.no_compression def test_comp_kwargs(test_file_name): """ Test compression with some kwargs for shuffle and chunking """ mode = 'w' dtypes = ['int32', 'float32', 'float64', 'complex64', 'complex128'] comps = [None, 'gzip', 'lzf'] chunks = [(100, 100), (250, 250)] shuffles = [True, False] scaleoffsets = [0, 1, 2] for dt in dtypes: for cc in comps: for ch in chunks: for sh in shuffles: for so in scaleoffsets: kwargs = { 'compression': cc, 'dtype': dt, 'chunks': ch, 'shuffle': sh, 'scaleoffset': so } array_obj = NESTED_DICT dump(array_obj, test_file_name, mode, compression=cc) print(kwargs, os.path.getsize(test_file_name)) load(test_file_name) def test_list_numpy(test_file_name,compression_kwargs): """ Test converting a list of numpy arrays """ mode = 'w' a = np.ones(1024) b = np.zeros(1000) c = [a, b] dump(c, test_file_name, mode,**compression_kwargs) dd_hkl = load(test_file_name) print(dd_hkl) assert isinstance(dd_hkl, list) assert isinstance(dd_hkl[0], np.ndarray) def test_tuple_numpy(test_file_name,compression_kwargs): """ Test converting a list of numpy arrays """ mode = 'w' a = np.ones(1024) b = np.zeros(1000) c = (a, b, a) dump(c, test_file_name, mode,**compression_kwargs) dd_hkl = load(test_file_name) print(dd_hkl) assert isinstance(dd_hkl, tuple) assert isinstance(dd_hkl[0], np.ndarray) def 
test_numpy_dtype(test_file_name,compression_kwargs): """ Dumping and loading a NumPy dtype """ dtype = np.dtype('int64') dump(dtype, test_file_name,**compression_kwargs) dtype_hkl = load(test_file_name) assert dtype == dtype_hkl def test_none(test_file_name,compression_kwargs): """ Test None type hickling """ mode = 'w' a = None dump(a, test_file_name, mode,**compression_kwargs) dd_hkl = load(test_file_name) print(a) print(dd_hkl) assert isinstance(dd_hkl, type(None)) def test_list_order(test_file_name,compression_kwargs): """ https://github.com/telegraphic/hickle/issues/26 """ d = [np.arange(n + 1) for n in range(20)] dump(d, test_file_name,**compression_kwargs) d_hkl = load(test_file_name) try: for ii, xx in enumerate(d): assert d[ii].shape == d_hkl[ii].shape for ii, xx in enumerate(d): assert np.allclose(d[ii], d_hkl[ii]) except AssertionError: print(d[ii], d_hkl[ii]) raise def test_embedded_array(test_file_name,compression_kwargs): """ See https://github.com/telegraphic/hickle/issues/24 """ d_orig = [[np.array([10., 20.]), np.array([10, 20, 30])], [np.array([10, 2]), np.array([1.])]] dump(d_orig, test_file_name,**compression_kwargs) d_hkl = load(test_file_name) for ii, xx in enumerate(d_orig): for jj, yy in enumerate(xx): assert np.allclose(d_orig[ii][jj], d_hkl[ii][jj]) print(d_hkl) print(d_orig) ############## # NEW TESTS # ############### def generate_nested(): a = [1, 2, 3] b = [a, a, a] c = [a, b, 's'] d = [a, b, c, c, a] e = [d, d, d, d, 1] f = {'a': a, 'b': b, 'e': e} g = {'f': f, 'a': e, 'd': d} h = {'h': g, 'g': f} z = [f, a, b, c, d, e, f, g, h, g, h] a = np.array([1, 2, 3, 4]) b = set([1, 2, 3, 4, 5]) c = (1, 2, 3, 4, 5) d = np.ma.array([1, 2, 3, 4, 5, 6, 7, 8]) z = {'a': a, 'b': b, 'c': c, 'd': d, 'z': z} return z def test_dump_nested(test_file_name,compression_kwargs): """ Dump a complicated nested object to HDF5 """ z = generate_nested() dump(z, test_file_name, mode='w',**compression_kwargs) def test_ndarray(test_file_name,compression_kwargs): a = np.array([1, 2, 3]) b = np.array([2, 3, 4]) z = (a, b) print("Original:") pprint(z) dump(z, test_file_name, mode='w',**compression_kwargs) print("\nReconstructed:") z = load(test_file_name) pprint(z) def test_ndarray_masked(test_file_name,compression_kwargs): a = np.ma.array([1, 2, 3]) b = np.ma.array([2, 3, 4], mask=[True, False, True]) z = (a, b) print("Original:") pprint(z) dump(z, test_file_name, mode='w',**compression_kwargs) print("\nReconstructed:") z = load(test_file_name) pprint(z) def test_simple_dict(test_file_name,compression_kwargs): a = {'key1': 1, 'key2': 2} dump(a, test_file_name,**compression_kwargs) z = load(test_file_name) pprint(a) pprint(z) def test_complex_dict(test_file_name,compression_kwargs): a = {'akey': 1, 'akey2': 2} c = {'ckey': "hello", "ckey2": "hi there"} z = {'zkey1': a, 'zkey2': a, 'zkey3': c} print("Original:") pprint(z) dump(z, test_file_name, mode='w',**compression_kwargs) print("\nReconstructed:") z = load(test_file_name) pprint(z) def test_complex(test_file_name,compression_kwargs): """ Test complex value dtype is handled correctly https://github.com/telegraphic/hickle/issues/29 """ data = {"A": 1.5, "B": 1.5 + 1j, "C": np.linspace(0, 1, 4) + 2j} dump(data, test_file_name,**compression_kwargs) data2 = load(test_file_name) for key in data.keys(): assert isinstance(data[key], data2[key].__class__) def test_nonstring_keys(test_file_name,compression_kwargs): """ Test that keys are reconstructed back to their original datatypes https://github.com/telegraphic/hickle/issues/36 """ data = { 
u'test': 123, 'def': [b'test'], 'hik': np.array([1, 2, 3]), 0: 0, True: ['test'], 1.1: 'hey', 1j: 'complex_hashable', (1, 2): 'boo', ('A', 17.4, 42): [1, 7, 'A'], (): '1313e was here', '0': 0, None: None } print(data) dump(data, test_file_name,**compression_kwargs) data2 = load(test_file_name) print(data2) for key in data.keys(): assert key in data2.keys() print(data2) @pytest.mark.no_compression def test_scalar_compression(test_file_name): """ Test bug where compression causes a crash on scalar datasets (Scalars are incompressible!) https://github.com/telegraphic/hickle/issues/37 """ data = {'a': 0, 'b': np.float(2), 'c': True} dump(data, test_file_name, compression='gzip') data2 = load(test_file_name) print(data2) for key in data.keys(): assert isinstance(data[key], data2[key].__class__) def test_bytes(test_file_name,compression_kwargs): """ Dumping and loading a string. PYTHON3 ONLY """ mode = 'w' string_obj = b"The quick brown fox jumps over the lazy dog" dump(string_obj, test_file_name, mode,**compression_kwargs) string_hkl = load(test_file_name) print(type(string_obj)) print(type(string_hkl)) assert isinstance(string_hkl, bytes) assert string_obj == string_hkl def test_np_scalar(test_file_name,compression_kwargs): """ Numpy scalar datatype https://github.com/telegraphic/hickle/issues/50 """ r0 = {'test': np.float64(10.)} dump(r0, test_file_name,**compression_kwargs) r = load(test_file_name) print(r) assert isinstance(r0['test'], r['test'].__class__) def test_slash_dict_keys(test_file_name,compression_kwargs): """ Support for having slashes in dict keys https://github.com/telegraphic/hickle/issues/124""" dct = {'a/b': [1, '2'], 1.4: 3} dump(dct, test_file_name, 'w',**compression_kwargs) dct_hkl = load(test_file_name) assert isinstance(dct_hkl, dict) for key, val in dct_hkl.items(): assert val == dct.get(key) # Check that having backslashes in dict keys will serialize the dict dct2 = {'a\\b': [1, '2'], 1.4: 3} with pytest.warns(None) as not_expected: dump(dct2, test_file_name,**compression_kwargs) assert not not_expected # %% MAIN SCRIPT if __name__ == '__main__': """ Some tests and examples """ from _pytest.fixtures import FixtureRequest for filename in test_file_name(FixtureRequest(test_np_scalar)): test_np_scalar(filename) for filename in test_file_name(FixtureRequest(test_scalar_compression)): test_scalar_compression(filename) for filename in test_file_name(FixtureRequest(test_complex)): test_complex(filename) for filename in test_file_name(FixtureRequest(test_none)): test_none(filename) for filename in test_file_name(FixtureRequest(test_masked_dict)): test_masked_dict(filename) for filename in test_file_name(FixtureRequest(test_list)): test_list(filename) for filename in test_file_name(FixtureRequest(test_set)): test_set(filename) for filename in test_file_name(FixtureRequest(test_numpy)): test_numpy(filename) for filename in test_file_name(FixtureRequest(test_dict)): test_dict(filename) for filename in test_file_name(FixtureRequest(test_odict)): test_odict(filename) for filename in test_file_name(FixtureRequest(test_empty_dict)): test_empty_dict(filename) for filename in test_file_name(FixtureRequest(test_compression)): test_compression(filename) for filename in test_file_name(FixtureRequest(test_masked)): test_masked(filename) for filename in test_file_name(FixtureRequest(test_dict_nested)): test_dict_nested(filename) for filename in test_file_name(FixtureRequest(test_comp_kwargs)): test_comp_kwargs(filename) for filename in test_file_name(FixtureRequest(test_list_numpy)): 
test_list_numpy(filename) for filename in test_file_name(FixtureRequest(test_tuple_numpy)): test_tuple_numpy(filename) for filename in test_file_name(FixtureRequest(test_list_order)): test_list_order(filename) for filename in test_file_name(FixtureRequest(test_embedded_array)): test_embedded_array(filename) for filename in test_file_name(FixtureRequest(test_np_float)): test_np_float(filename) for filename in test_file_name(FixtureRequest(test_string)): test_string(filename) for filename in test_file_name(FixtureRequest(test_nonstring_keys)): test_nonstring_keys(filename) for filename in test_file_name(FixtureRequest(test_bytes)): test_bytes(filename) # NEW TESTS for filename in test_file_name(FixtureRequest(test_dump_nested)): test_dump_nested(filename) for filename in test_file_name(FixtureRequest(test_ndarray)): test_ndarray(filename) for filename in test_file_name(FixtureRequest(test_ndarray_masked)): test_ndarray_masked(filename) for filename in test_file_name(FixtureRequest(test_simple_dict)): test_simple_dict(filename) for filename in test_file_name(FixtureRequest(test_complex_dict)): test_complex_dict(filename) for filename in test_file_name(FixtureRequest(test_dict_int_key)): test_dict_int_key(filename) for filename in test_file_name(FixtureRequest(test_local_func)): test_local_func(filename) for filename in test_file_name(FixtureRequest(test_slash_dict_keys)): test_slash_dict_keys(filename) test_invalid_file() for filename in test_file_name(FixtureRequest(test_non_empty_group)): test_non_empty_group(filename) for filename in test_file_name(FixtureRequest(test_numpy_dtype)): test_numpy_dtype(filename) for filename in test_file_name(FixtureRequest(test_object_numpy)): test_object_numpy(filename) for filename in test_file_name(FixtureRequest(test_string_numpy)): test_string_numpy(filename) for filename in test_file_name(FixtureRequest(test_list_object_numpy)): test_list_object_numpy(filename) # Cleanup for filename in test_file_name(FixtureRequest(print)): print(filename) hickle-5.0.2/hickle/tests/test_legacy_load.py000066400000000000000000000046221430361177200212420ustar00rootroot00000000000000# %% IMPORTS # Built-in imports import glob from os import path import warnings import pytest import scipy.sparse import numpy as np # Package imports import h5py # hickle imports import hickle as hkl # %% FUNCTION DEFINITIONS def test_legacy_load(): dirpath = path.dirname(__file__) filelist = sorted(glob.glob(path.join(dirpath, 'legacy_hkls/*3_[0-9]_[0-9].hkl'))) # Make all warnings show warnings.simplefilter("always") for filename in filelist: with pytest.warns( UserWarning, match = r"Input\s+argument\s+'file_obj'\s+appears\s+to\s+be\s+a\s+file\s+made" r"\s+with\s+hickle\s+v3.\s+Using\s+legacy\s+load..." ): try: print(filename) a = hkl.load(filename,path='test') except Exception: with h5py.File(filename) as a: print(a.attrs.items()) print(a.items()) for key, item in a.items(): print(item.attrs.items()) raise @pytest.mark.no_compression def test_4_0_0_load(): """ test that files created by hickle 4.0.x can be loaded by hickle 4.1.x properly """ dirpath = path.dirname(__file__) filelist = sorted(glob.glob(path.join(dirpath, 'legacy_hkls/*4.[0-9].[0-9].hkl'))) from hickle.tests.generate_legacy_4_0_0 import generate_py_object compare_with,needs_compare = generate_py_object() # strange but without forcing garbage collection here h5py might produce # strange assuming a race related RuntimeError when h5py file is closed by # hickle.load(). 
Unless observed in wildlife this is only triggered by fast successive # calls of h5py methods. import gc gc.collect() for filename in filelist: content = hkl.load(filename) if filename != needs_compare: continue for item_id,content_item,compare_item in ( (i,content[i],compare_with[i]) for i in range(len(compare_with)) ): if scipy.sparse.issparse(content_item): assert np.allclose(content_item.toarray(),compare_item.toarray()) continue try: assert content_item == compare_item except ValueError: assert np.all(content_item == compare_item) # %% MAIN SCRIPT if __name__ == "__main__": test_legacy_load() test_4_0_0_load() hickle-5.0.2/paper.bib000066400000000000000000000056551430361177200145610ustar00rootroot00000000000000@article{astropy:2018, Adsurl = {https://ui.adsabs.harvard.edu/#abs/2018AJ....156..123T}, Author = {{Price-Whelan}, A.~M. and {Sip{'{o}}cz}, B.~M. and {G{"u}nther}, H.~M. and {Lim}, P.~L. and others}, Doi = {10.3847/1538-3881/aabc4f}, Eid = {123}, Journal = {aj}, Pages = {123}, Title = {{The Astropy Project: Building an Open-science Project and Status of the v2.0 Core Package}}, Volume = {156}, Year = 2018} @book{collette:2014, Author = {Andrew Collette}, Keywords = {python, hdf5}, Publisher = {O'Reilly}, Title = {Python and HDF5}, Year = {2013}} @article{Durant:2017, Author = {Durant, Thomas J.S. and Olson, Eben M. and Schulz, Wade L. and Torres, Richard}, Doi = {10.1373/clinchem.2017.276345}, Eprint = {http://clinchem.aaccjnls.org/content/63/12/1847.full.pdf}, Issn = {0009-9147}, Journal = {Clinical Chemistry}, Number = {12}, Pages = {1847--1855}, Publisher = {Clinical Chemistry}, Title = {Very Deep Convolutional Neural Networks for Morphologic Classification of Erythrocytes}, Url = {http://clinchem.aaccjnls.org/content/63/12/1847}, Volume = {63}, Year = {2017}, } @webpage{hdf5, Lastchecked = {November 2018}, Url = {https://support.hdfgroup.org/HDF5/doc/index.html}} @article{numpy, Author = {T. E. Oliphant}, Doi = {10.1109/MCSE.2007.58}, Issn = {1521-9615}, Journal = {Computing in Science Engineering}, Month = {May}, Number = {3}, Pages = {10-20}, Title = {Python for Scientific Computing}, Volume = {9}, Year = {2007}} @article{Price:2018, Adsnote = {Provided by the SAO/NASA Astrophysics Data System}, Adsurl = {https://ui.adsabs.harvard.edu/#abs/2018MNRAS.478.4193P}, Author = {{Price}, D.~C. and {Greenhill}, L.~J. and {Fialkov}, A. and {Bernardi}, G. 
and others},
  Doi = {10.1093/mnras/sty1244},
  Journal = {Monthly Notices of the Royal Astronomical Society},
  Pages = {4193-4213},
  Title = {{Design and characterization of the Large-aperture Experiment to Detect the Dark Age (LEDA) radiometer systems}},
  Volume = {478},
  Year = 2018,
  Bdsk-Url-1 = {https://doi.org/10.1093/mnras/sty1244}}

@phdthesis{Raffel:2016,
  Author = {Colin Raffel},
  School = {Columbia University},
  Title = {Learning-Based Methods for Comparing Sequences, with Applications to Audio-to-MIDI Alignment and Matching},
  Year = {2016},
  Doi = {https://doi.org/10.7916/D8N58MHV}}

@inproceedings{Zhang:2016,
  Acmid = {2934880},
  Address = {New York, NY, USA},
  Author = {Zhang, Hong and Chen, Li and Yi, Bairen and Chen, Kai and Chowdhury, Mosharaf and Geng, Yanhui},
  Booktitle = {Proceedings of the 2016 ACM SIGCOMM Conference},
  Doi = {10.1145/2934872.2934880},
  Isbn = {978-1-4503-4193-6},
  Keywords = {Coflow;, data-intensive applications;, datacenter networks},
  Location = {Florianopolis, Brazil},
  Numpages = {14},
  Pages = {160--173},
  Publisher = {ACM},
  Series = {SIGCOMM '16},
  Title = {CODA: Toward Automatically Identifying and Scheduling Coflows in the Dark},
  Url = {http://doi.acm.org/10.1145/2934872.2934880},
  Year = {2016}}
hickle-5.0.2/paper.md000066400000000000000000000100241430361177200144100ustar00rootroot00000000000000
---
title: 'Hickle: A HDF5-based python pickle replacement'
tags:
  - Python
  - astronomy
authors:
  - name: Danny C. Price
    orcid: 0000-0003-2783-1608
    affiliation: "1, 2" # (Multiple affiliations must be quoted)
  - name: Ellert van der Velden
    orcid: 0000-0002-1559-9832
    affiliation: 2
  - name: Sébastien Celles
    orcid: 0000-0001-9987-4338
    affiliation: 3
  - name: Pieter T. Eendebak
    orcid: 0000-0001-7018-1124
    affiliation: "4, 5"
  - name: Michael M. McKerns
    orcid: 0000-0001-8342-3778
    affiliation: 6
  - name: Eben M. Olson
    affiliation: 7
  - name: Colin Raffel
    affiliation: 8
  - name: Bairen Yi
    affiliation: 9
  - name: Elliott Ash
    affiliation: 10
affiliations:
  - name: Department of Astronomy, University of California Berkeley, Berkeley CA 94720
    index: 1
  - name: Centre for Astrophysics & Supercomputing, Swinburne University of Technology, Hawthorn, VIC 3122, Australia
    index: 2
  - name: Thermal Science and Energy Department, Institut Universitaire de Technologie de Poitiers - Université de Poitiers, France
    index: 3
  - name: QuTech, Delft University of Technology, P.O. Box 5046, 2600 GA Delft, The Netherlands
    index: 4
  - name: Netherlands Organisation for Applied Scientific Research (TNO), P.O. Box 155, 2600 AD Delft, The Netherlands
    index: 5
  - name: Institute for Advanced Computational Science, Stony Brook University, Stony Brook, NY 11794-5250
    index: 6
  - name: Department of Laboratory Medicine, Yale University, New Haven CT 06510 USA
    index: 7
  - name: Google Brain, Mountain View, CA, 94043
    index: 8
  - name: The Hong Kong University of Science and Technology
    index: 9
  - name: ETH Zurich
    index: 10
date: 10 November 2018
bibliography: paper.bib
---

# Summary

``hickle`` is a Python 2/3 package for quickly dumping and loading Python data structures to Hierarchical Data Format 5 (HDF5) files [@hdf5]. When dumping to HDF5, ``hickle`` automatically converts Python data structures (e.g. lists, dictionaries, ``numpy`` arrays [@numpy]) into HDF5 groups and datasets. When loading from file, ``hickle`` automatically converts data back into its original data type. A key motivation for ``hickle`` is to provide high-performance loading and storage of scientific data in the widely-supported HDF5 format.
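
For illustration, a minimal usage sketch is given below (this snippet is an editorial illustration rather than part of the published text; the file name `example.hkl` is arbitrary). The interface mirrors the ``pickle`` API:

```python
import numpy as np
import hickle as hkl

# Any mix of supported containers and numpy arrays can be dumped in one call.
data = {'name': 'test', 'values': np.arange(10), 'flag': True}

# Write the object tree to an HDF5 file; optional h5py keyword arguments
# such as `compression` are forwarded to the underlying datasets.
hkl.dump(data, 'example.hkl', mode='w', compression='gzip')

# Read it back; containers, strings and numpy arrays are restored with
# their original Python types.
reloaded = hkl.load('example.hkl')
assert np.all(reloaded['values'] == data['values'])
```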

``hickle`` is designed as a drop-in replacement for the Python ``pickle`` package, which converts Python object hierarchies to and from Python-specific byte streams (processes known as 'pickling' and 'unpickling' respectively). Several different protocols exist, and files are not designed to be compatible between Python versions, nor interpretable in other languages. In contrast, ``hickle`` stores and loads files from HDF5, for which application programming interfaces (APIs) exist in most major languages, including C, Java, R, and MATLAB.

Python data structures are mapped into the HDF5 abstract data model in a logical fashion, using the ``h5py`` package [@collette:2014]. Metadata required to reconstruct the hierarchy of objects, and to allow conversion into Python objects, is stored in HDF5 attributes. Most commonly used Python iterables (dict, tuple, list, set), and data types (int, float, str) are supported, as are ``numpy`` N-dimensional arrays. Commonly-used ``astropy`` data structures and ``scipy`` sparse matrices are also supported.

``hickle`` has been used in many scientific research projects, including:

* Visualization and machine learning on volumetric fluorescence microscopy datasets from histological tissue imaging [@Durant:2017].
* Caching pre-computed features for MIDI and audio files for downstream machine learning tasks [@Raffel:2016].
* Storage and transmission of high volumes of shot-gun proteomics data, such as mass spectra of proteins and peptide segments [@Zhang:2016].
* Storage of astronomical data and calibration data from radio telescopes [@Price:2018].

``hickle`` is released under the MIT license, and is available from PyPI via ``pip``; source code is available at https://github.com/telegraphic/hickle.

# References
hickle-5.0.2/requirements.txt000066400000000000000000000000401430361177200162410ustar00rootroot00000000000000
h5py>=2.10.0
numpy>=1.8,!=1.20
hickle-5.0.2/requirements32.txt000066400000000000000000000000371430361177200164120ustar00rootroot00000000000000
h5py==2.10.0
numpy>=1.8,!=1.20
hickle-5.0.2/requirements_h5py_3.txt000066400000000000000000000000551430361177200174340ustar00rootroot00000000000000
dill>=0.3.0
h5py>=3.0
numpy>=1.8
six>=1.11.0
hickle-5.0.2/requirements_test.txt000066400000000000000000000001701430361177200173020ustar00rootroot00000000000000
dill>=0.3.0
codecov
pytest>=4.6.0
pytest-cov
astropy>=1.3,<4.0
scipy>=1.0.0
pandas>=0.24.0
check-manifest
twine>=1.13.0
hickle-5.0.2/setup.cfg000066400000000000000000000003651430361177200146060ustar00rootroot00000000000000
[metadata]
description-file=README.md

[aliases]
test=pytest

[tool:pytest]
addopts=--verbose --cov --cov-config=setup.cfg --cov-report=term-missing

[coverage:run]
include=hickle/*
omit=
    hickle/tests/*
    hickle/*/tests/*
    hickle/legacy_v3/*
hickle-5.0.2/setup.py000066400000000000000000000045331430361177200145000ustar00rootroot00000000000000
# To increment version
# Check you have ~/.pypirc filled in
# git tag x.y.z
# git push && git push --tags
# rm -rf dist; python setup.py sdist bdist_wheel
# TEST: twine upload --repository-url https://test.pypi.org/legacy/ dist/*
# twine upload dist/*

from codecs import open
import re
from setuptools import setup, find_packages
import sys

author = "Danny Price, Ellert van der Velden and contributors"

with open("README.md", "r") as fh:
    long_description = fh.read()

with open("requirements.txt", 'r') as fh:
    requirements = fh.read().splitlines()

with open("requirements_test.txt", 'r') as fh:
    test_requirements = fh.read().splitlines()

# Read the __version__.py file
with open('hickle/__version__.py', 'r') as f:
    vf = f.read()

# Obtain version from read-in __version__.py file
version = re.search(r"^_*version_* = ['\"]([^'\"]*)['\"]", vf, re.M).group(1)

setup(name='hickle',
      version=version,
      description='Hickle - an HDF5 based version of pickle',
      long_description=long_description,
      long_description_content_type='text/markdown',
      author=author,
      author_email='dan@thetelegraphic.com',
      url='http://github.com/telegraphic/hickle',
      download_url=('https://github.com/telegraphic/hickle/archive/v%s.zip'
                    % (version)),
      platforms='Cross platform (Linux, Mac OSX, Windows)',
      classifiers=[
          'Development Status :: 5 - Production/Stable',
          'Intended Audience :: Developers',
          'Intended Audience :: Science/Research',
          'License :: OSI Approved',
          'Natural Language :: English',
          'Operating System :: MacOS',
          'Operating System :: Microsoft :: Windows',
          'Operating System :: Unix',
          'Programming Language :: Python',
          'Programming Language :: Python :: 3',
          'Programming Language :: Python :: 3.5',
          'Programming Language :: Python :: 3.6',
          'Programming Language :: Python :: 3.7',
          'Programming Language :: Python :: 3.8',
          'Topic :: Software Development :: Libraries :: Python Modules',
          'Topic :: Utilities',
      ],
      keywords=['pickle', 'hdf5', 'data storage', 'data export'],
      install_requires=requirements,
      tests_require=test_requirements,
      python_requires='>=3.5',
      packages=find_packages(),
      zip_safe=False,
      )
hickle-5.0.2/tox.ini000066400000000000000000000037471430361177200143050ustar00rootroot00000000000000
[tox]
# python 3.9 not added as astropy and numpy pose some problems on 3.9
# need further investigation
envlist = py{35,36,37,38}, py{35,36,37,38}-compress
skip_missing_interpreters=true #do i need change here to trigger wf

[gh-actions]
# needed to match gh-action python version numbers with tox mnemonic
python =
    3.5: py35
    3.6: py36
    3.7: py37
    3.8: py38

[gh-actions:env]
PLATFORM =
    ubuntu-latest: linux
    macos-latest: macos
    windows-latest: windows

[testenv]
passenv = HOME USER
deps =
    !h5py3: -rrequirements{env:TOX_H5PY_REQIREMENTS:}.txt
    h5py3: -rrequirements_h5py_3.txt
    -rrequirements_test.txt
# {posargs} allows to pass any pytest related cli arguments
# to tox after -- argument separator.
commands =
    linux: pip install --upgrade pip virtualenv
    macos: pip install --upgrade pip virtualenv
    windows: python -m pip install --upgrade pip virtualenv
    check-manifest
    py{35,36,37,38}-!compress: pytest -v -v --cov-report=term-missing --cov-report=xml:coverage.xml {posargs}
    compress: pytest --enable-compression -v -v --cov-report=term-missing --cov-report=xml:coverage.xml {posargs}

[testenv:h5py3]
# special environment for testing and debugging h5py >= 3.0 support
# related issues. Manually calls python setup.py develop instead of
# python setup.py install which would also be possible below.
# system commands like mv, ln etc must be explicitly allowed to be
# called from within the virtual environment
skipsdist=true
skip_install=true
allowlist_externals=
    mv
    ln
    cp
    rm
    cat
# change h5py version requirements to >= 3.0
commands_pre=
    mv -f requirements.txt requirements_mv.txt
    ln -s requirements_h5py_3.txt requirements.txt
    cat requirements.txt
commands =
    python setup.py develop
    pytest --cov-report=term-missing {posargs}
# switch back to initial state again
commands_post=
    rm requirements.txt
    cp requirements_mv.txt requirements.txt

[pytest]
# options to be passed to pytest in any cases as well
# as any desired pytest configuration values
addopts = --cov=./hickle
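
For illustration, a minimal sketch of the kind of h5py keyword arguments that the `compress` tox environments above exercise (via `--enable-compression` and the test suite's `compression_kwargs` fixture); the file name `compressed.hkl` is hypothetical and this is not part of the package itself:

```python
# Editorial sketch: hickle.dump forwards compression-related keywords straight
# to h5py when creating the datasets, which is what the "compress" tox
# environments run the whole test suite against.
import numpy as np
import hickle as hkl

arr = np.ones(32768, dtype='float32')

# compression, shuffle and chunks are passed through to h5py's create_dataset.
hkl.dump(arr, 'compressed.hkl', mode='w',
         compression='gzip', shuffle=True, chunks=(4096,))

reloaded = hkl.load('compressed.hkl')
assert reloaded.dtype == arr.dtype and np.allclose(reloaded, arr)
```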