pax_global_header00006660000000000000000000000064134345057370014525gustar00rootroot0000000000000052 comment=b12ee65c624207d3020c283ce561cfcc7ab71731 h5sparse-0.1.0/000077500000000000000000000000001343450573700132555ustar00rootroot00000000000000h5sparse-0.1.0/.editorconfig000066400000000000000000000003551343450573700157350ustar00rootroot00000000000000root = true [*] indent_style = space end_of_line = lf charset = utf-8 trim_trailing_whitespace = true insert_final_newline = true indent_size = 2 [*.py] indent_size = 4 [*.md] trim_trailing_whitespace = false [*.rst] indent_size = 3 h5sparse-0.1.0/.gitignore000066400000000000000000000020701343450573700152440ustar00rootroot00000000000000# Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *,cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # IPython Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # dotenv .env # virtualenv venv/ ENV/ # Spyder project settings .spyderproject # Rope project settings .ropeproject # VSCode project settings .vscode h5sparse-0.1.0/.pylintrc000066400000000000000000000262761343450573700151370ustar00rootroot00000000000000# PyLint config for apitools code. # # NOTES: # # - Rules for test / demo code are generated into 'pylintrc_reduced' # as deltas from this configuration by the 'run_pylint.py' script. # # - 'RATIONALE: API mapping' as a defense for non-default settings is # based on the fact that this library maps APIs which are outside our # control, and adhering to the out-of-the-box defaults would induce # breakage / complexity in those mappings # [MASTER] # Specify a configuration file. # DEFAULT: rcfile= # Python code to execute, usually for sys.path manipulation such as # pygtk.require(). # DEFAULT: init-hook= # Profiled execution. # DEFAULT: profile=no # Add files or directories to the blacklist. They should be base names, not # paths. # DEFAULT: ignore=CVS # NOTE: This path must be relative due to the use of # os.walk in astroid.modutils.get_module_files. # Pickle collected data for later comparisons. # DEFAULT: persistent=yes # List of plugins (as comma separated values of python modules names) to load, # usually to register additional checkers. # DEFAULT: load-plugins= # DEPRECATED # DEFAULT: include-ids=no # DEPRECATED # DEFAULT: symbols=no [MESSAGES CONTROL] disable = fixme, locally-disabled, locally-enabled, no-member, no-name-in-module, no-self-use, super-on-old-class, too-many-arguments, too-many-function-args, missing-docstring, arguments-differ, too-many-ancestors, too-many-boolean-expressions, [REPORTS] # Set the output format. Available formats are text, parseable, colorized, msvs # (visual studio) and html. You can also give a reporter class, eg # mypackage.mymodule.MyReporterClass. # DEFAULT: output-format=text # Put messages in a separate file for each module / package specified on the # command line instead of printing them on stdout. Reports (if any) will be # written in a file name "pylint_global.[txt|html]". # DEFAULT: files-output=no # Tells whether to display a full report or only the messages # DEFAULT: reports=yes # RATIONALE: run from Travis / tox, and don't need / want to parse output. reports=no # Python expression which should return a note less than 10 (10 is the highest # note). You have access to the variables errors warning, statement which # respectively contain the number of errors / warnings messages and the total # number of statements analyzed. This is used by the global evaluation report # (RP0004). # DEFAULT: evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) # Add a comment according to your evaluation note. This is used by the global # evaluation report (RP0004). # DEFAULT: comment=no # Template used to display messages. This is a python new-style format string # used to format the message information. See doc for all details #msg-template= [SIMILARITIES] # Minimum lines number of a similarity. # DEFAULT: min-similarity-lines=4 min-similarity-lines=15 # Ignore comments when computing similarities. # DEFAULT: ignore-comments=yes # Ignore docstrings when computing similarities. # DEFAULT: ignore-docstrings=yes # Ignore imports when computing similarities. # DEFAULT: ignore-imports=no ignore-imports=yes [VARIABLES] # Tells whether we should check for unused import in __init__ files. # DEFAULT: init-import=no # A regular expression matching the name of dummy variables (i.e. expectedly # not used). dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_) # List of additional names supposed to be defined in builtins. Remember that # you should avoid to define new builtins when possible. # DEFAULT: additional-builtins= [LOGGING] # Logging modules to check that the string format arguments are in logging # function parameter format # DEFAULT: logging-modules=logging [FORMAT] # Maximum number of characters on a single line. # DEFAULT: max-line-length=80 max-line-length=80 # Regexp for a line that is allowed to be longer than the limit. # DEFAULT: ignore-long-lines=^\s*(# )??$ # Allow the body of an if to be on the same line as the test if there is no # else. # DEFAULT: single-line-if-stmt=no # List of optional constructs for which whitespace checking is disabled # DEFAULT: no-space-check=trailing-comma,dict-separator # RATIONALE: pylint ignores whitespace checks around the # constructs "dict-separator" (cases like {1:2}) and # "trailing-comma" (cases like {1: 2, }). # By setting "no-space-check" to empty whitespace checks will be # enforced around both constructs. no-space-check = # Maximum number of lines in a module # DEFAULT: max-module-lines=1000 max-module-lines=1500 # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 # tab). # DEFAULT: indent-string=' ' # Number of spaces of indent required inside a hanging or continued line. # DEFAULT: indent-after-paren=4 [MISCELLANEOUS] # List of note tags to take in consideration, separated by a comma. # DEFAULT: notes=FIXME,XXX,TODO [BASIC] # Regular expression which should only match function or class names that do # not require a docstring. # DEFAULT: no-docstring-rgx=__.*__ # no-docstring-rgx=(__.*__|main) # no-docstring-rgx=.* # Minimum line length for functions/classes that require docstrings, shorter # ones are exempt. # DEFAULT: docstring-min-length=-1 docstring-min-length=10 # Regular expression which should only match correct module names. The # leading underscore is sanctioned for private modules by Google's style # guide. module-rgx=^(_?[a-z][a-z0-9_]*)|__init__$ # Regular expression matching correct constant names # DEFAULT: const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ # Regular expression matching correct class attribute names # DEFAULT: class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ # Regular expression matching correct class names # DEFAULT: class-rgx=[A-Z_][a-zA-Z0-9]+$ class-rgx=^_?[A-Z][a-zA-Z0-9]*$ # Regular expression which should only match correct function names. # 'camel_case' and 'snake_case' group names are used for consistency of naming # styles across functions and methods. function-rgx=^(?:(?P_?[A-Z][a-zA-Z0-9]*)|(?P_?[a-z][a-z0-9_]*))$ # Regular expression which should only match correct method names. # 'camel_case' and 'snake_case' group names are used for consistency of naming # styles across functions and methods. 'exempt' indicates a name which is # consistent with all naming styles. method-rgx=^(?:(?P__[a-z0-9_]+__|next)|(?P_{0,2}[A-Z][a-zA-Z0-9]*)|(?P_{0,2}[a-z][a-z0-9_]*))$ # Regular expression matching correct attribute names # DEFAULT: attr-rgx=[a-z_][a-z0-9_]{2,30}$ attr-rgx=^_{0,2}[a-z][a-z0-9_]*$ # Regular expression matching correct argument names # DEFAULT: argument-rgx=[a-z_][a-z0-9_]{2,30}$ argument-rgx=^[a-z][a-z0-9_]*$ # Regular expression matching correct variable names # DEFAULT: variable-rgx=[a-z_][a-z0-9_]{2,30}$ variable-rgx=^[a-z][a-z0-9_]*$ # Regular expression matching correct inline iteration names # DEFAULT: inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ inlinevar-rgx=^[a-z][a-z0-9_]*$ # Good variable names which should always be accepted, separated by a comma # DEFAULT: good-names=i,j,k,ex,Run,_ good-names=main,_ # Bad variable names which should always be refused, separated by a comma # DEFAULT: bad-names=foo,bar,baz,toto,tutu,tata bad-names= # List of builtins function names that should not be used, separated by a comma # bad-functions=input,apply,reduce [TYPECHECK] # Tells whether missing members accessed in mixin class should be ignored. A # mixin class is detected if its name ends with "mixin" (case insensitive). # DEFAULT: ignore-mixin-members=yes # List of module names for which member attributes should not be checked # (useful for modules/projects where namespaces are manipulated during runtime # and thus existing member attributes cannot be deduced by static analysis # DEFAULT: ignored-modules= # List of classes names for which member attributes should not be checked # (useful for classes with attributes dynamically set). # DEFAULT: ignored-classes=SQLObject # When zope mode is activated, add a predefined set of Zope acquired attributes # to generated-members. # DEFAULT: zope=no # List of members which are set dynamically and missed by pylint inference # system, and so shouldn't trigger E0201 when accessed. Python regular # expressions are accepted. # DEFAULT: generated-members=REQUEST,acl_users,aq_parent [IMPORTS] # Deprecated modules which should not be used, separated by a comma # DEFAULT: deprecated-modules=regsub,TERMIOS,Bastion,rexec # Create a graph of every (i.e. internal and external) dependencies in the # given file (report RP0402 must not be disabled) # DEFAULT: import-graph= # Create a graph of external dependencies in the given file (report RP0402 must # not be disabled) # DEFAULT: ext-import-graph= # Create a graph of internal dependencies in the given file (report RP0402 must # not be disabled) # DEFAULT: int-import-graph= [CLASSES] # List of interface methods to ignore, separated by a comma. This is used for # instance to not check methods defines in Zope's Interface base class. # DEFAULT: ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by # List of method names used to declare (i.e. assign) instance attributes. # DEFAULT: defining-attr-methods=__init__,__new__,setUp # List of valid names for the first argument in a class method. # DEFAULT: valid-classmethod-first-arg=cls # List of valid names for the first argument in a metaclass class method. # DEFAULT: valid-metaclass-classmethod-first-arg=mcs [DESIGN] # Maximum number of arguments for function / method # DEFAULT: max-args=5 # RATIONALE: API-mapping max-args = 14 # Argument names that match this expression will be ignored. Default to name # with leading underscore # DEFAULT: ignored-argument-names=_.* # Maximum number of locals for function / method body # DEFAULT: max-locals=15 max-locals=24 # Maximum number of return / yield for function / method body # DEFAULT: max-returns=6 max-returns=9 # Maximum number of branch for function / method body # DEFAULT: max-branches=12 max-branches=21 # Maximum number of statements in function / method body # DEFAULT: max-statements=50 # Maximum number of parents for a class (see R0901). # DEFAULT: max-parents=7 # Maximum number of attributes for a class (see R0902). # DEFAULT: max-attributes=7 # RATIONALE: API mapping max-attributes=19 # Minimum number of public methods for a class (see R0903). # DEFAULT: min-public-methods=2 # RATIONALE: context mgrs may have *no* public methods min-public-methods=0 # Maximum number of public methods for a class (see R0904). # DEFAULT: max-public-methods=20 # RATIONALE: API mapping max-public-methods=40 [ELIF] max-nested-blocks=6 [EXCEPTIONS] # Exceptions that will emit a warning when being caught. Defaults to # "Exception" # DEFAULT: overgeneral-exceptions=Exception h5sparse-0.1.0/.travis.yml000066400000000000000000000006141343450573700153670ustar00rootroot00000000000000notifications: email: on_success: change on_failure: always sudo: false language: python matrix: include: - python: '2.7' - python: '3.5' - python: '3.6' - python: '3.7' dist: xenial sudo: true - python: '3.6' env: TOXENV=flake8 install: - pip install -U pip wheel - pip install -U Cython - pip install -U tox-travis tox script: - tox h5sparse-0.1.0/LICENSE000066400000000000000000000020531343450573700142620ustar00rootroot00000000000000MIT License Copyright (c) 2017 Appier Inc Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. h5sparse-0.1.0/README.rst000066400000000000000000000100771343450573700147510ustar00rootroot00000000000000h5sparse ======== .. image:: https://img.shields.io/travis/appier/h5sparse/master.svg :target: https://travis-ci.org/appier/h5sparse .. image:: https://img.shields.io/pypi/v/h5sparse.svg :target: https://pypi.python.org/pypi/h5sparse .. image:: https://img.shields.io/pypi/l/h5sparse.svg :target: https://pypi.python.org/pypi/h5sparse Scipy sparse matrix in HDF5. Installation ------------ .. code:: bash pip install h5sparse Testing ------- - for single environment: .. code:: bash python setup.py test - for all environments: .. code:: bash tox Examples -------- Create dataset ************** .. code:: python In [1]: import scipy.sparse as ss ...: import h5sparse ...: import numpy as np ...: In [2]: sparse_matrix = ss.csr_matrix([[0, 1, 0], ...: [0, 0, 1], ...: [0, 0, 0], ...: [1, 1, 0]], ...: dtype=np.float64) In [3]: # create dataset from scipy sparse matrix ...: with h5sparse.File("test.h5") as h5f: ...: h5f.create_dataset('sparse/matrix', data=sparse_matrix) In [4]: # you can also create dataset from another dataset ...: with h5sparse.File("test.h5") as h5f: ...: h5f.create_dataset('sparse/matrix2', data=h5f['sparse/matrix']) In [5]: # you can also create dataset using the formats that original h5py accepts ...: with h5sparse.File("test.h5") as h5f: ...: h5f.create_dataset('sparse/matrix3', data=[1,2,3]) Read dataset ************ .. code:: python In [6]: h5f = h5sparse.File("test.h5") In [7]: h5f['sparse/matrix'][1:3] Out[7]: <2x3 sparse matrix of type '' with 1 stored elements in Compressed Sparse Row format> In [8]: h5f['sparse/matrix'][1:3].toarray() Out[8]: array([[ 0., 0., 1.], [ 0., 0., 0.]]) In [9]: h5f['sparse']['matrix'][1:3].toarray() Out[9]: array([[ 0., 0., 1.], [ 0., 0., 0.]]) In [10]: h5f['sparse']['matrix'][2:].toarray() Out[10]: array([[ 0., 0., 0.], [ 1., 1., 0.]]) In [11]: h5f['sparse']['matrix'][:2].toarray() Out[11]: array([[ 0., 1., 0.], [ 0., 0., 1.]]) In [12]: h5f['sparse']['matrix'][-2:].toarray() Out[12]: array([[ 0., 0., 0.], [ 1., 1., 0.]]) In [13]: h5f['sparse']['matrix'][:-2].toarray() Out[13]: array([[ 0., 1., 0.], [ 0., 0., 1.]]) In [14]: h5f['sparse']['matrix'][()].toarray() Out[14]: array([[ 0., 1., 0.], [ 0., 0., 1.], [ 0., 0., 0.], [ 1., 1., 0.]]) In [15]: import h5py In [16]: h5py_h5f = h5py.File("test.h5") In [17]: h5sparse.Group(h5py_h5f.id)['sparse/matrix'][()] Out[17]: <4x3 sparse matrix of type '' with 4 stored elements in Compressed Sparse Row format> In [18]: h5sparse.Group(h5py_h5f['sparse'].id)['matrix'][()] Out[18]: <4x3 sparse matrix of type '' with 4 stored elements in Compressed Sparse Row format> In [19]: h5sparse.Dataset(h5py_h5f['sparse/matrix'])[()] Out[19]: <4x3 sparse matrix of type '' with 4 stored elements in Compressed Sparse Row format> Append dataset ************** .. code:: python In [20]: to_append = ss.csr_matrix([[0, 1, 1], ...: [1, 0, 0]], ...: dtype=np.float64) In [21]: h5f.create_dataset('matrix', data=sparse_matrix, chunks=(100000,), ...: maxshape=(None,)) In [22]: h5f['matrix'].append(to_append) In [23]: h5f['matrix'][()] Out[23]: <6x3 sparse matrix of type '' with 7 stored elements in Compressed Sparse Row format> In [24]: h5f['matrix'][()].toarray() Out[24]: array([[ 0., 1., 0.], [ 0., 0., 1.], [ 0., 0., 0.], [ 1., 1., 0.], [ 0., 1., 1.], [ 1., 0., 0.]]) h5sparse-0.1.0/h5sparse/000077500000000000000000000000001343450573700150075ustar00rootroot00000000000000h5sparse-0.1.0/h5sparse/__init__.py000066400000000000000000000002201343450573700171120ustar00rootroot00000000000000import pkg_resources from .h5sparse import Group, File, Dataset # noqa: F401 __version__ = pkg_resources.get_distribution("h5sparse").version h5sparse-0.1.0/h5sparse/h5sparse.py000066400000000000000000000165671343450573700171320ustar00rootroot00000000000000import six import h5py import numpy as np import scipy.sparse as ss FORMAT_DICT = { 'csr': ss.csr_matrix, 'csc': ss.csc_matrix, } def get_format_str(data): for format_str, format_class in six.viewitems(FORMAT_DICT): if isinstance(data, format_class): return format_str raise ValueError("Data type {} is not supported.".format(type(data))) def get_format_class(format_str): try: format_class = FORMAT_DICT[format_str] except KeyError: raise ValueError("Format {} is not supported." .format(format_str)) return format_class class Group(h5py.Group): """The HDF5 group that can detect and create sparse matrix. """ def __getitem__(self, key): h5py_item = super(Group, self).__getitem__(key) if isinstance(h5py_item, h5py.Group): if 'h5sparse_format' in h5py_item.attrs: # detect the sparse matrix return Dataset(h5py_item) else: return Group(h5py_item.id) elif isinstance(h5py_item, h5py.Dataset): return h5py_item else: raise ValueError("Unexpected item type.") def create_dataset(self, name, shape=None, dtype=None, data=None, sparse_format=None, indptr_dtype=np.int64, indices_dtype=np.int32, **kwargs): """Create 3 datasets in a group to represent the sparse array. Parameters ---------- sparse_format: """ if isinstance(data, Dataset): assert sparse_format is None group = self.create_group(name) group.attrs['h5sparse_format'] = data.attrs['h5sparse_format'] group.attrs['h5sparse_shape'] = data.attrs['h5sparse_shape'] group.create_dataset('data', data=data.h5py_group['data'], dtype=dtype, **kwargs) group.create_dataset('indices', data=data.h5py_group['indices'], dtype=indices_dtype, **kwargs) group.create_dataset('indptr', data=data.h5py_group['indptr'], dtype=indptr_dtype, **kwargs) elif ss.issparse(data): if sparse_format is not None: format_class = get_format_class(sparse_format) data = format_class(data) group = self.create_group(name) group.attrs['h5sparse_format'] = get_format_str(data) group.attrs['h5sparse_shape'] = data.shape group.create_dataset('data', data=data.data, dtype=dtype, **kwargs) group.create_dataset('indices', data=data.indices, dtype=indices_dtype, **kwargs) group.create_dataset('indptr', data=data.indptr, dtype=indptr_dtype, **kwargs) elif data is None and sparse_format is not None: format_class = get_format_class(sparse_format) if dtype is None: dtype = np.float64 if shape is None: shape = (0, 0) data = format_class(shape, dtype=dtype) group = self.create_group(name) group.attrs['h5sparse_format'] = get_format_str(data) group.attrs['h5sparse_shape'] = data.shape group.create_dataset('data', data=data.data, dtype=dtype, **kwargs) group.create_dataset('indices', data=data.indices, dtype=indices_dtype, **kwargs) group.create_dataset('indptr', data=data.indptr, dtype=indptr_dtype, **kwargs) else: # forward the arguments to h5py assert sparse_format is None return super(Group, self).create_dataset( name, data=data, shape=shape, dtype=dtype, **kwargs) return Dataset(group) class File(h5py.File, Group): """The HDF5 file object that can detect and create sparse matrix. """ pass class Dataset(h5py.Group): """The HDF5 sparse matrix dataset. Parameters ---------- h5py_group : h5py.Dataset """ def __init__(self, h5py_group): super(Dataset, self).__init__(h5py_group.id) self.h5py_group = h5py_group self.shape = tuple(self.attrs['h5sparse_shape']) self.format_str = self.attrs['h5sparse_format'] self.dtype = h5py_group['data'].dtype self.indptr_dtype = h5py_group['indptr'].dtype self.indices_dtype = h5py_group['indices'].dtype def __getitem__(self, key): if isinstance(key, slice): if key.step is not None: raise NotImplementedError("Index step is not supported.") start = key.start stop = key.stop if stop is not None and stop > 0: stop += 1 if start is not None and start < 0: start -= 1 indptr_slice = slice(start, stop) indptr = self.h5py_group['indptr'][indptr_slice] data = self.h5py_group['data'][indptr[0]:indptr[-1]] indices = self.h5py_group['indices'][indptr[0]:indptr[-1]] indptr -= indptr[0] if self.format_str == 'csr': shape = (indptr.size - 1, self.shape[1]) elif self.format_str == 'csc': shape = (self.shape[0], indptr.size - 1) else: raise NotImplementedError("Slicing for format {} is not implemented." .format(self.format_str)) elif isinstance(key, tuple) and key == (): data = self.h5py_group['data'][()] indices = self.h5py_group['indices'][()] indptr = self.h5py_group['indptr'][()] shape = self.shape else: raise NotImplementedError("Only support one slice as index.") format_class = get_format_class(self.attrs['h5sparse_format']) return format_class((data, indices, indptr), shape=shape) @property def value(self): return self[()] def append(self, sparse_matrix): if self.format_str != get_format_str(sparse_matrix): raise ValueError("Format not the same.") if self.format_str == 'csr': # data data = self.h5py_group['data'] orig_data_size = data.shape[0] new_shape = (orig_data_size + sparse_matrix.data.shape[0],) data.resize(new_shape) data[orig_data_size:] = sparse_matrix.data # indptr indptr = self.h5py_group['indptr'] orig_data_size = indptr.shape[0] append_offset = indptr[-1] new_shape = (orig_data_size + sparse_matrix.indptr.shape[0] - 1,) indptr.resize(new_shape) indptr[orig_data_size:] = (sparse_matrix.indptr[1:].astype(np.int64) + append_offset) # indices indices = self.h5py_group['indices'] orig_data_size = indices.shape[0] new_shape = (orig_data_size + sparse_matrix.indices.shape[0],) indices.resize(new_shape) indices[orig_data_size:] = sparse_matrix.indices # shape self.shape = ( self.shape[0] + sparse_matrix.shape[0], max(self.shape[1], sparse_matrix.shape[1])) self.attrs['h5sparse_shape'] = self.shape else: raise NotImplementedError("The append method for format {} is not implemented." .format(self.format_str)) h5sparse-0.1.0/h5sparse/tests.py000066400000000000000000000127701343450573700165320ustar00rootroot00000000000000import os import json from tempfile import mkstemp import numpy as np import scipy.sparse as ss import h5sparse def test_create_and_read_dataset(): h5_path = mkstemp(suffix=".h5")[1] sparse_matrix = ss.csr_matrix([[0, 1, 0], [0, 0, 1], [0, 0, 0], [1, 1, 0]], dtype=np.float64) with h5sparse.File(h5_path) as h5f: h5f.create_dataset('sparse/matrix', data=sparse_matrix) with h5sparse.File(h5_path) as h5f: assert 'sparse' in h5f assert 'matrix' in h5f['sparse'] assert (h5f['sparse']['matrix'][1:3] != sparse_matrix[1:3]).size == 0 assert (h5f['sparse']['matrix'][2:] != sparse_matrix[2:]).size == 0 assert (h5f['sparse']['matrix'][:2] != sparse_matrix[:2]).size == 0 assert (h5f['sparse']['matrix'][-2:] != sparse_matrix[-2:]).size == 0 assert (h5f['sparse']['matrix'][:-2] != sparse_matrix[:-2]).size == 0 assert (h5f['sparse']['matrix'][()] != sparse_matrix).size == 0 os.remove(h5_path) def test_create_dataset_with_format_change(): h5_path = mkstemp(suffix=".h5")[1] sparse_matrix = ss.csr_matrix([[0, 1, 0, 1], [0, 0, 1, 0], [0, 0, 0, 1], [1, 1, 0, 1]], dtype=np.float64) with h5sparse.File(h5_path) as h5f: h5f.create_dataset('sparse/matrix', data=sparse_matrix, sparse_format='csc') with h5sparse.File(h5_path) as h5f: assert 'sparse' in h5f assert 'matrix' in h5f['sparse'] assert h5f['sparse']['matrix'].format_str == 'csc' result_matrix = h5f['sparse']['matrix'][()] assert isinstance(result_matrix, ss.csc_matrix) assert (result_matrix != sparse_matrix).size == 0 assert (h5f['sparse']['matrix'][1:3] != sparse_matrix[:, 1:3]).size == 0 assert (h5f['sparse']['matrix'][2:] != sparse_matrix[:, 2:]).size == 0 assert (h5f['sparse']['matrix'][:2] != sparse_matrix[:, :2]).size == 0 assert (h5f['sparse']['matrix'][-2:] != sparse_matrix[:, -2:]).size == 0 assert (h5f['sparse']['matrix'][:-2] != sparse_matrix[:, :-2]).size == 0 os.remove(h5_path) def test_create_empty_sparse_dataset(): h5_path = mkstemp(suffix=".h5")[1] with h5sparse.File(h5_path) as h5f: h5f.create_dataset('sparse/matrix', sparse_format='csr') with h5sparse.File(h5_path) as h5f: assert 'sparse' in h5f assert 'matrix' in h5f['sparse'] assert h5f['sparse']['matrix'].format_str == 'csr' result_matrix = h5f['sparse']['matrix'][()] assert isinstance(result_matrix, ss.csr_matrix) assert result_matrix.shape == (0, 0) assert result_matrix.dtype == np.float64 assert h5f['sparse']['matrix'].shape == (0, 0) assert h5f['sparse']['matrix'].dtype == np.float64 os.remove(h5_path) def test_create_dataset_from_dataset(): from_h5_path = mkstemp(suffix=".h5")[1] to_h5_path = mkstemp(suffix=".h5")[1] sparse_matrix = ss.csr_matrix([[0, 1, 0], [0, 0, 1], [0, 0, 0], [1, 1, 0]], dtype=np.float64) with h5sparse.File(from_h5_path) as from_h5f: from_dset = from_h5f.create_dataset('sparse/matrix', data=sparse_matrix) with h5sparse.File(to_h5_path) as to_h5f: to_h5f.create_dataset('sparse/matrix', data=from_dset) assert 'sparse' in to_h5f assert 'matrix' in to_h5f['sparse'] assert (to_h5f['sparse/matrix'][()] != sparse_matrix).size == 0 os.remove(from_h5_path) os.remove(to_h5_path) def test_dataset_append(): h5_path = mkstemp(suffix=".h5")[1] sparse_matrix = ss.csr_matrix([[0, 1, 0], [0, 0, 1], [0, 0, 0], [1, 1, 0]], dtype=np.float64) to_append = ss.csr_matrix([[0, 1, 1], [1, 0, 0]], dtype=np.float64) appended_matrix = ss.vstack((sparse_matrix, to_append)) with h5sparse.File(h5_path) as h5f: h5f.create_dataset('matrix', data=sparse_matrix, chunks=(100000,), maxshape=(None,)) h5f['matrix'].append(to_append) assert (h5f['matrix'][()] != appended_matrix).size == 0 os.remove(h5_path) def test_numpy_array(): h5_path = mkstemp(suffix=".h5")[1] matrix = np.random.rand(3, 5) with h5sparse.File(h5_path) as h5f: h5f.create_dataset('matrix', data=matrix) assert 'matrix' in h5f np.testing.assert_equal(h5f['matrix'][()], matrix) os.remove(h5_path) def test_bytestring(): h5_path = mkstemp(suffix=".h5")[1] strings = [str(i) for i in range(100)] data = json.dumps(strings).encode('utf8') with h5sparse.File(h5_path) as h5f: h5f.create_dataset('strings', data=data) assert 'strings' in h5f assert strings == json.loads(h5f['strings'][()].decode('utf8')) os.remove(h5_path) def test_create_empty_dataset(): h5_path = mkstemp(suffix=".h5")[1] with h5sparse.File(h5_path) as h5f: h5f.create_dataset('empty_data', shape=(100, 200)) with h5sparse.File(h5_path) as h5f: assert h5f['empty_data'].shape == (100, 200) os.remove(h5_path) h5sparse-0.1.0/requirements.txt000066400000000000000000000000661343450573700165430ustar00rootroot00000000000000six h5py numpy scipy pylint tox flake8 nose coverage h5sparse-0.1.0/setup.cfg000066400000000000000000000020651343450573700151010ustar00rootroot00000000000000[aliases] test=nosetests [nosetests] exe=1 verbosity=1 detailed-errors=1 with-coverage=1 cover-package=h5sparse cover-branches=1 debug=nose.loader cover-html=1 cover-html-dir=htmlcov nocapture=1 [bdist_wheel] universal=1 [flake8] ignore = # D100: missing docstring in public module D100, # D101: missing docstring in public class D101, # D102: missing docstring in public method D102, # D103: missing docstring in public function D103, # D104 missing docstring in public package D104, # D105 missing docstring in magic method D105, # W503: line break occurred before a binary operator W503, # I201: missing newline between sections or imports I201, # N802: function name should be lowercase N802, # N803: argument name should be lowercase N803, # N806: variable in function should be lowercase N806, exclude = .tox, .git, __pycache__, docs/source/conf.py, build, dist, *.pyc, *.egg-info, .cache, .eggs, max-complexity = 10 max-line-length = 99 h5sparse-0.1.0/setup.py000066400000000000000000000026301343450573700147700ustar00rootroot00000000000000#!/usr/bin/env python from setuptools import setup setup_requires = [ 'nose', 'coverage', ] install_requires = [ 'h5py', 'numpy', 'scipy', 'six', ] tests_require = [] description = "Scipy sparse matrix in HDF5." long_description = """\ Please visit the `Github repository `_ for more information.\n """ with open('README.rst') as fp: long_description += fp.read() setup( name='h5sparse', version="0.1.0", description=description, long_description=long_description, author='Appier Inc.', url='https://github.com/appier/h5sparse', setup_requires=setup_requires, install_requires=install_requires, tests_require=tests_require, license="MIT", classifiers=[ 'Development Status :: 3 - Alpha', 'Intended Audience :: Developers', 'Intended Audience :: Information Technology', 'Intended Audience :: Science/Research', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Topic :: Scientific/Engineering', 'Topic :: Database', 'Topic :: Software Development :: Libraries :: Python Modules', 'License :: OSI Approved :: MIT License', ], test_suite='nose.collector', packages=[ 'h5sparse', ], ) h5sparse-0.1.0/tox.ini000066400000000000000000000002621343450573700145700ustar00rootroot00000000000000[tox] envlist = py{27,35,36,37},flake8 [testenv] deps = nose coverage commands = nosetests [testenv:flake8] skip_install = true deps = flake8 commands = flake8