pax_global_header 0000666 0000000 0000000 00000000064 13050634271 0014513 g ustar 00root root 0000000 0000000 52 comment=1712813e8f79c5b1ac60813ef543fa558e8a7abc
fastchunking-0.0.3/ 0000775 0000000 0000000 00000000000 13050634271 0014177 5 ustar 00root root 0000000 0000000 fastchunking-0.0.3/.gitignore 0000664 0000000 0000000 00000001367 13050634271 0016176 0 ustar 00root root 0000000 0000000 # Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/
#Ipython Notebook
.ipynb_checkpoints
fastchunking-0.0.3/.project 0000664 0000000 0000000 00000000577 13050634271 0015657 0 ustar 00root root 0000000 0000000
fastchunkingorg.python.pydev.PyDevBuilderorg.python.pydev.pythonNature
fastchunking-0.0.3/.pydevproject 0000664 0000000 0000000 00000000647 13050634271 0016725 0 ustar 00root root 0000000 0000000
Defaultpython 3.0/${PROJECT_DIR_NAME}
fastchunking-0.0.3/.pylintrc 0000664 0000000 0000000 00000031146 13050634271 0016051 0 ustar 00root root 0000000 0000000 [MASTER]
# Specify a configuration file.
#rcfile=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=CVS
# Pickle collected data for later comparisons.
persistent=yes
# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=
# Use multiple processes to speed up Pylint.
jobs=1
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loading into the active Python interpreter and may
# run arbitrary code
extension-pkg-whitelist=
# Allow optimization of some AST trees. This will activate a peephole AST
# optimizer, which will apply various small optimizations. For instance, it can
# be used to obtain the result of joining multiple strings with the addition
# operator. Joining a lot of strings can lead to a maximum recursion error in
# Pylint and this flag can prevent that. It has one side effect, the resulting
# AST will be different than the one from reality.
optimize-ast=no
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
confidence=
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
# multiple time (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
#enable=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once).You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use"--disable=all --enable=classes
# --disable=W"
disable=import-star-module-level,old-octal-literal,oct-method,print-statement,unpacking-in-except,parameter-unpacking,backtick,old-raise-syntax,old-ne-operator,long-suffix,dict-view-method,dict-iter-method,metaclass-assignment,next-method-called,raising-string,indexing-exception,raw_input-builtin,long-builtin,file-builtin,execfile-builtin,coerce-builtin,cmp-builtin,buffer-builtin,basestring-builtin,apply-builtin,filter-builtin-not-iterating,using-cmp-argument,useless-suppression,pointless-string-statement,range-builtin-not-iterating,suppressed-message,no-absolute-import,old-division,cmp-method,reload-builtin,zip-builtin-not-iterating,intern-builtin,unichr-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,input-builtin,round-builtin,hex-method,nonzero-method,map-builtin-not-iterating
[REPORTS]
# Set the output format. Available formats are text, parseable, colorized, msvs
# (visual studio) and html. You can also give a reporter class, eg
# mypackage.mymodule.MyReporterClass.
output-format=text
# Put messages in a separate file for each module / package specified on the
# command line instead of printing them on stdout. Reports (if any) will be
# written in a file name "pylint_global.[txt|html]".
files-output=no
# Tells whether to display a full report or only the messages
reports=yes
# Python expression which should return a note less than 10 (10 is the highest
# note). You have access to the variables errors warning, statement which
# respectively contain the number of errors / warnings messages and the total
# number of statements analyzed. This is used by the global evaluation report
# (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details
#msg-template=
[BASIC]
# List of builtins function names that should not be used, separated by a comma
bad-functions=map,filter,input
# Good variable names which should always be accepted, separated by a comma
good-names=i,j,k,ex,Run,_
# Bad variable names which should always be refused, separated by a comma
bad-names=foo,bar,baz,toto,tutu,tata
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Include a hint for the correct naming format with invalid-name
include-naming-hint=no
# Regular expression matching correct function names
function-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for function names
function-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct variable names
variable-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for variable names
variable-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct constant names
const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
# Naming hint for constant names
const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
# Regular expression matching correct attribute names
attr-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for attribute names
attr-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct argument names
argument-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for argument names
argument-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct class attribute names
class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
# Naming hint for class attribute names
class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
# Regular expression matching correct inline iteration names
inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
# Naming hint for inline iteration names
inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
# Regular expression matching correct class names
class-rgx=[A-Z_][a-zA-Z0-9]+$
# Naming hint for class names
class-name-hint=[A-Z_][a-zA-Z0-9]+$
# Regular expression matching correct module names
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Naming hint for module names
module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Regular expression matching correct method names
method-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for method names
method-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1
[ELIF]
# Maximum number of nested blocks for function / method body
max-nested-blocks=5
[FORMAT]
# Maximum number of characters on a single line.
max-line-length=100
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )??$
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no
# List of optional constructs for which whitespace checking is disabled. `dict-
# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
# `trailing-comma` allows a space between comma and closing bracket: (a, ).
# `empty-line` allows space-only lines.
no-space-check=trailing-comma,dict-separator
# Maximum number of lines in a module
max-module-lines=1000
# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
# tab).
indent-string=' '
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
[LOGGING]
# Logging modules to check that the string format arguments are in logging
# function parameter format
logging-modules=logging
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,XXX,TODO
[SIMILARITIES]
# Minimum lines number of a similarity.
min-similarity-lines=4
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
# Ignore imports when computing similarities.
ignore-imports=no
[SPELLING]
# Spelling dictionary name. Available dictionaries: none. To make it working
# install python-enchant package.
spelling-dict=
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to indicated private dictionary in
# --spelling-private-dict-file option instead of raising a message.
spelling-store-unknown-words=no
[TYPECHECK]
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis. It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# List of classes names for which member attributes should not be checked
# (useful for classes with attributes dynamically set). This supports can work
# with qualified names.
ignored-classes=
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=
[VARIABLES]
# Tells whether we should check for unused import in __init__ files.
init-import=no
# A regular expression matching the name of dummy variables (i.e. expectedly
# not used).
dummy-variables-rgx=_$|dummy
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid to define new builtins when possible.
additional-builtins=
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,_cb
[CLASSES]
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,__new__,setUp
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=mcs
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,_fields,_replace,_source,_make
[DESIGN]
# Maximum number of arguments for function / method
max-args=5
# Argument names that match this expression will be ignored. Default to name
# with leading underscore
ignored-argument-names=_.*
# Maximum number of locals for function / method body
max-locals=15
# Maximum number of return / yield for function / method body
max-returns=6
# Maximum number of branch for function / method body
max-branches=12
# Maximum number of statements in function / method body
max-statements=50
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Minimum number of public methods for a class (see R0903).
min-public-methods=2
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of boolean expressions in a if statement
max-bool-expr=5
[IMPORTS]
# Deprecated modules which should not be used, separated by a comma
deprecated-modules=regsub,TERMIOS,Bastion,rexec
# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=Exception
fastchunking-0.0.3/.settings/ 0000775 0000000 0000000 00000000000 13050634271 0016115 5 ustar 00root root 0000000 0000000 fastchunking-0.0.3/.settings/org.eclipse.core.resources.prefs 0000664 0000000 0000000 00000000136 13050634271 0024330 0 ustar 00root root 0000000 0000000 eclipse.preferences.version=1
encoding//doc/source/conf.py=utf-8
encoding//docs/conf.py=utf-8
fastchunking-0.0.3/.travis.yml 0000664 0000000 0000000 00000000116 13050634271 0016306 0 ustar 00root root 0000000 0000000 language: python
python:
- "3.5"
install: pip install tox-travis
script: tox fastchunking-0.0.3/LICENSE 0000664 0000000 0000000 00000026135 13050634271 0015213 0 ustar 00root root 0000000 0000000 Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright {yyyy} {name of copyright owner}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
fastchunking-0.0.3/MANIFEST.in 0000664 0000000 0000000 00000000065 13050634271 0015736 0 ustar 00root root 0000000 0000000 include README.rst LICENSE
include lib/*.cpp lib/*.h fastchunking-0.0.3/README.rst 0000664 0000000 0000000 00000014265 13050634271 0015676 0 ustar 00root root 0000000 0000000 ===========================
fastchunking Python library
===========================
.. image:: https://travis-ci.org/netleibi/fastchunking.svg?branch=master
:target: https://travis-ci.org/netleibi/fastchunking
.. image:: https://badge.fury.io/py/fastchunking.svg
:target: https://badge.fury.io/py/fastchunking
.. image:: https://readthedocs.org/projects/fastchunking/badge/?version=latest
:target: http://fastchunking.readthedocs.io/en/latest/?badge=latest
:alt: Documentation Status
What it is
----------
`fastchunking` is a Python library that contains efficient and easy-to-use
implementations of string chunking algorithms.
It has been developed as part of the work [LS16]_ at CISPA, Saarland University.
Installation
------------
::
$ pip install fastchunking
If you are using Python >=3.5, you have to install pybindgen which is required
by fastchunking beforehand as the latest version currently available via pip is
outdated:
::
$ pip uninstall pybindgen
$ pip install git+https://github.com/gjcarneiro/pybindgen.git
.. note:: For performance reasons, parts of this library are implemented in C++.
Installation from a source distribution, thus, requires availability of a
correctly configured C++ compiler.
Usage and Overview
------------------
`fastchunking` provides efficient implementations for different string chunking
algorithms, e.g., static chunking (SC) and content-defined chunking (CDC).
Static Chunking (SC)
^^^^^^^^^^^^^^^^^^^^
Static chunking splits a message into fixed-size chunks.
Let us consider a random example message that shall be chunked:
>>> import os
>>> message = os.urandom(1024*1024)
Static chunking is trivial when chunking a single message:
>>> import fastchunking
>>> sc = fastchunking.SC()
>>> chunker = sc.create_chunker(chunk_size=4096)
>>> chunker.next_chunk_boundaries(message)
[4096, 8192, 12288, ...]
A large message can also be chunked in fragments, though:
>>> chunker = sc.create_chunker(chunk_size=4096)
>>> chunker.next_chunk_boundaries(message[:10240])
[4096, 8192]
>>> chunker.next_chunk_boundaries(message[10240:])
[2048, 6144, 10240, ...]
Content-Defined Chunking (CDC)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
`fastchunking` supports content-defined chunking, i.e., chunking of messages
into fragments of variable lengths.
Currently, a chunking strategy based on Rabin-Karp rolling hashes is supported.
As a rolling hash computation on plain-Python strings is incredibly slow with
any interpreter, most of the computation is performed by a C++ extension which
is based on the `ngramhashing` library by Daniel Lemire, see:
https://github.com/lemire/rollinghashcpp
Let us consider a random message that should be chunked:
>>> import os
>>> message = os.urandom(1024*1024)
When using static chunking, we have to specify a rolling hash window size (here:
48 bytes) and an optional seed value that affects the pseudo-random distribution
of the generated chunk boundaries.
Despite that, usage is similar to static chunking:
>>> import fastchunking
>>> cdc = fastchunking.RabinKarpCDC(window_size=48, seed=0)
>>> chunker = cdc.create_chunker(chunk_size=4096)
>>> chunker.next_chunk_boundaries(message)
[7475, 10451, 12253, 13880, 15329, 19808, ...]
Chunking in fragments is straightforward:
>>> chunker = cdc.create_chunker(chunk_size=4096)
>>> chunker.next_chunk_boundaries(message[:10240])
[7475]
>>> chunker.next_chunk_boundaries(message[10240:])
[211, 2013, 3640, 5089, 9568, ...]
Multi-Level Chunking (ML-\*)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Multiple chunkers of the same type (but with different chunk sizes) can be
efficiently used in parallel, e.g., to perform multi-level chunking [LS16]_.
Again, let us consider a random message that should be chunked:
>>> import os
>>> message = os.urandom(1024*1024)
Usage of multi-level-chunking, e.g., ML-CDC, is easy:
>>> import fastchunking
>>> cdc = fastchunking.RabinKarpCDC(window_size=48, seed=0)
>>> chunk_sizes = [1024, 2048, 4096]
>>> chunker = cdc.create_multilevel_chunker(chunk_sizes)
>>> chunker.next_chunk_boundaries_with_levels(message)
[(1049, 2), (1511, 1), (1893, 2), (2880, 1), (2886, 0),
(3701, 0), (4617, 0), (5809, 2), (5843, 0), ...]
The second value in each tuple indicates the highest chunk size that leads to
a boundary. Here, the first boundary is a boundary created by the chunker with
index 2, i.e., the chunker with 4096 bytes target chunk size.
.. note::
Only the highest index is output if multiple chunkers yield the same
boundary.
.. warning::
Chunk sizes have to be passed in correct order, i.e., from lowest to highest
value.
Performance
-----------
Computation costs for `static chunking` are barely measurable: As chunking does
not depend on the actual message but only its length, computation costs are
essentially limited to a single :code:`xrange` call.
`Content-defined chunking`, however, is expensive: The algorithm has to compute
hash values for rolling hash window contents at `every` byte position of the
message that is to be chunked. To minimize costs, fastchunking works as follows:
1. The message (fragment) is passed in its entirety to the C++ extension.
2. Chunking is performed within the C++ extension.
3. The resulting list of chunk boundaries is communicated back to Python and
converted into a Python list.
Based on a 100 MiB random content, the author measured the following throughput
on an Intel Core i7-4770K in a single, non-representative test run using
Python 3.5 (Windows x86-64):
=========== ==========
chunk size throughput
=========== ==========
64 bytes 118 MiB/s
128 bytes 153 MiB/s
256 bytes 187 MiB/s
512 bytes 206 MiB/s
1024 bytes 221 MiB/s
2048 bytes 226 MiB/s
4096 bytes 231 MiB/s
8192 bytes 234 MiB/s
16384 bytes 233 MiB/s
32768 bytes 234 MiB/s
=========== ==========
Testing
-------
`fastchunking` uses tox for testing, so simply run:
::
$ tox
References:
.. [LS16] Dominik Leibenger and Christoph Sorge (2016). sec-cs: Getting the
Most out of Untrusted Cloud Storage.
`arXiv:1606.03368 `_
fastchunking-0.0.3/docs/ 0000775 0000000 0000000 00000000000 13050634271 0015127 5 ustar 00root root 0000000 0000000 fastchunking-0.0.3/docs/Makefile 0000664 0000000 0000000 00000016716 13050634271 0016602 0 ustar 00root root 0000000 0000000 # Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = _build
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
.PHONY: help
help:
@echo "Please use \`make ' where is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " applehelp to make an Apple Help Book"
@echo " devhelp to make HTML files and a Devhelp project"
@echo " epub to make an epub"
@echo " epub3 to make an epub3"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
@echo " text to make text files"
@echo " man to make manual pages"
@echo " texinfo to make Texinfo files"
@echo " info to make Texinfo files and run them through makeinfo"
@echo " gettext to make PO message catalogs"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " xml to make Docutils-native XML files"
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
@echo " coverage to run coverage check of the documentation (if enabled)"
@echo " dummy to check syntax errors of document sources"
.PHONY: clean
clean:
rm -rf $(BUILDDIR)/*
.PHONY: html
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
.PHONY: dirhtml
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
.PHONY: singlehtml
singlehtml:
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
@echo
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
.PHONY: pickle
pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
@echo
@echo "Build finished; now you can process the pickle files."
.PHONY: json
json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
@echo
@echo "Build finished; now you can process the JSON files."
.PHONY: htmlhelp
htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."
.PHONY: qthelp
qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/fastchunking.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/fastchunking.qhc"
.PHONY: applehelp
applehelp:
$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
@echo
@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
@echo "N.B. You won't be able to view it unless you put it in" \
"~/Library/Documentation/Help or install it in your application" \
"bundle."
.PHONY: devhelp
devhelp:
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
@echo
@echo "Build finished."
@echo "To view the help file:"
@echo "# mkdir -p $$HOME/.local/share/devhelp/fastchunking"
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/fastchunking"
@echo "# devhelp"
.PHONY: epub
epub:
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
@echo
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
.PHONY: epub3
epub3:
$(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3
@echo
@echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3."
.PHONY: latex
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."
.PHONY: latexpdf
latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
.PHONY: latexpdfja
latexpdfja:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through platex and dvipdfmx..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
.PHONY: text
text:
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
@echo
@echo "Build finished. The text files are in $(BUILDDIR)/text."
.PHONY: man
man:
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
@echo
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
.PHONY: texinfo
texinfo:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
@echo "Run \`make' in that directory to run these through makeinfo" \
"(use \`make info' here to do that automatically)."
.PHONY: info
info:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo "Running Texinfo files through makeinfo..."
make -C $(BUILDDIR)/texinfo info
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
.PHONY: gettext
gettext:
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
@echo
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
.PHONY: changes
changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
@echo
@echo "The overview file is in $(BUILDDIR)/changes."
.PHONY: linkcheck
linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in $(BUILDDIR)/linkcheck/output.txt."
.PHONY: doctest
doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in $(BUILDDIR)/doctest/output.txt."
.PHONY: coverage
coverage:
$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
@echo "Testing of coverage in the sources finished, look at the " \
"results in $(BUILDDIR)/coverage/python.txt."
.PHONY: xml
xml:
$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
@echo
@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
.PHONY: pseudoxml
pseudoxml:
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
@echo
@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
.PHONY: dummy
dummy:
$(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy
@echo
@echo "Build finished. Dummy builder generates no files."
fastchunking-0.0.3/docs/conf.py 0000664 0000000 0000000 00000030553 13050634271 0016434 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
#
# fastchunking documentation build configuration file, created by
# sphinx-quickstart on Fri Jun 03 14:46:13 2016.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('..'))
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.napoleon',
'sphinx.ext.todo',
'sphinx.ext.viewcode',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = []
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'
# The encoding of source files.
#
# source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'fastchunking'
copyright = u'2016, Dominik Leibenger'
author = u'Dominik Leibenger'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
# "Borrowed" from Flask conf.py file
import pkg_resources
try:
    # Derive the release string from the installed distribution metadata so the
    # documented version always matches what is actually installed.
    release = pkg_resources.get_distribution('fastchunking').version
except pkg_resources.DistributionNotFound:
    # Use print() calls (not Python-2 print statements) so this file is valid
    # under both Python 2 and Python 3; the original statements are a
    # SyntaxError on Python 3.
    print('To build the documentation, The distribution information of')
    print('fastchunking has to be available. Either install the package into')
    print('your development environment or run "setup.py develop" to setup the')
    print('metadata. A virtualenv is recommended!')
    sys.exit(1)
del pkg_resources
version = '.'.join(release.split('.')[:2])
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = 'en'
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#
# today = ''
#
# Else, today_fmt is used as the format for a strftime call.
#
# today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# The reST default role (used for this markup: `text`) to use for all
# documents.
#
# default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#
# add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#
# add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#
# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
# modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
# keep_warnings = False
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = True
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
# html_theme_path = []
# The name for this set of Sphinx documents.
# " v documentation" by default.
#
# html_title = u'fastchunking v'
# A shorter title for the navigation bar. Default is the same as html_title.
#
# html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#
# html_logo = None
# The name of an image file (relative to this directory) to use as a favicon of
# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#
# html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = []
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
#
# html_extra_path = []
# If not None, a 'Last updated on:' timestamp is inserted at every page
# bottom, using the given strftime format.
# The empty string is equivalent to '%b %d, %Y'.
#
# html_last_updated_fmt = None
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#
# html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#
# html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#
# html_additional_pages = {}
# If false, no module index is generated.
#
# html_domain_indices = True
# If false, no index is generated.
#
# html_use_index = True
# If true, the index is split into individual pages for each letter.
#
# html_split_index = False
# If true, links to the reST sources are added to the pages.
#
# html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#
# html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#
# html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#
# html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
# html_file_suffix = None
# Language to be used for generating the HTML full-text search index.
# Sphinx supports the following languages:
# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh'
#
# html_search_language = 'en'
# A dictionary with options for the search language support, empty by default.
# 'ja' uses this config value.
# 'zh' user can custom change `jieba` dictionary path.
#
# html_search_options = {'type': 'default'}
# The name of a javascript file (relative to the configuration directory) that
# implements a search results scorer. If empty, the default will be used.
#
# html_search_scorer = 'scorer.js'
# Output file base name for HTML help builder.
htmlhelp_basename = 'fastchunkingdoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'fastchunking.tex', u'fastchunking Documentation',
u'Author', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#
# latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#
# latex_use_parts = False
# If true, show page references after internal links.
#
# latex_show_pagerefs = False
# If true, show URL addresses after external links.
#
# latex_show_urls = False
# Documents to append as an appendix to all manuals.
#
# latex_appendices = []
# If false, no module index is generated.
#
# latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'fastchunking', u'fastchunking Documentation',
[author], 1)
]
# If true, show URL addresses after external links.
#
# man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'fastchunking', u'fastchunking Documentation',
author, 'fastchunking', 'One line description of project.',
'Miscellaneous'),
]
# Documents to append as an appendix to all manuals.
#
# texinfo_appendices = []
# If false, no module index is generated.
#
# texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#
# texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
#
# texinfo_no_detailmenu = False
# -- Options for Epub output ----------------------------------------------
# Bibliographic Dublin Core info.
epub_title = project
epub_author = author
epub_publisher = author
epub_copyright = copyright
# The basename for the epub file. It defaults to the project name.
# epub_basename = project
# The HTML theme for the epub output. Since the default themes are not
# optimized for small screen space, using the same theme for HTML and epub
# output is usually not wise. This defaults to 'epub', a theme designed to save
# visual space.
#
# epub_theme = 'epub'
# The language of the text. It defaults to the language option
# or 'en' if the language is not set.
#
# epub_language = ''
# The scheme of the identifier. Typical schemes are ISBN or URL.
# epub_scheme = ''
# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#
# epub_identifier = ''
# A unique identification for the text.
#
# epub_uid = ''
# A tuple containing the cover image and cover page html template filenames.
#
# epub_cover = ()
# A sequence of (type, uri, title) tuples for the guide element of content.opf.
#
# epub_guide = ()
# HTML files that should be inserted before the pages created by sphinx.
# The format is a list of tuples containing the path and title.
#
# epub_pre_files = []
# HTML files that should be inserted after the pages created by sphinx.
# The format is a list of tuples containing the path and title.
#
# epub_post_files = []
# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']
# The depth of the table of contents in toc.ncx.
#
# epub_tocdepth = 3
# Allow duplicate toc entries.
#
# epub_tocdup = True
# Choose between 'default' and 'includehidden'.
#
# epub_tocscope = 'default'
# Fix unsupported image types using the Pillow.
#
# epub_fix_images = False
# Scale large images.
#
# epub_max_image_width = 0
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#
# epub_show_urls = 'inline'
# If false, no index is generated.
#
# epub_use_index = True
fastchunking-0.0.3/docs/fastchunking.rst 0000664 0000000 0000000 00000000242 13050634271 0020343 0 ustar 00root root 0000000 0000000 ====================
fastchunking package
====================
.. automodule:: fastchunking
:members:
:inherited-members:
:show-inheritance:
fastchunking-0.0.3/docs/index.rst 0000664 0000000 0000000 00000001022 13050634271 0016763 0 ustar 00root root 0000000 0000000 ===========================
fastchunking Python library
===========================
`fastchunking` is a Python library that contains efficient and easy-to-use
implementations of string chunking algorithms.
It has been developed as part of the work [LS16]_ at CISPA, Saarland University.
Contents
========
.. toctree::
:maxdepth: 1
installation
overview
fastchunking
performance
testing
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
fastchunking-0.0.3/docs/installation.rst 0000664 0000000 0000000 00000000437 13050634271 0020366 0 ustar 00root root 0000000 0000000 ============
Installation
============
Run::
$ pip install fastchunking
.. note::
For performance reasons, parts of this library are implemented in C++.
Installation from a source distribution, thus, requires availability of a
correctly configured C++ compiler. fastchunking-0.0.3/docs/make.bat 0000664 0000000 0000000 00000017100 13050634271 0016533 0 ustar 00root root 0000000 0000000 @ECHO OFF
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set BUILDDIR=_build
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
set I18NSPHINXOPTS=%SPHINXOPTS% .
if NOT "%PAPER%" == "" (
set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
)
if "%1" == "" goto help
if "%1" == "help" (
:help
echo.Please use `make ^` where ^ is one of
echo. html to make standalone HTML files
echo. dirhtml to make HTML files named index.html in directories
echo. singlehtml to make a single large HTML file
echo. pickle to make pickle files
echo. json to make JSON files
echo. htmlhelp to make HTML files and a HTML help project
echo. qthelp to make HTML files and a qthelp project
echo. devhelp to make HTML files and a Devhelp project
echo. epub to make an epub
echo. epub3 to make an epub3
echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
echo. text to make text files
echo. man to make manual pages
echo. texinfo to make Texinfo files
echo. gettext to make PO message catalogs
echo. changes to make an overview over all changed/added/deprecated items
echo. xml to make Docutils-native XML files
echo. pseudoxml to make pseudoxml-XML files for display purposes
echo. linkcheck to check all external links for integrity
echo. doctest to run all doctests embedded in the documentation if enabled
echo. coverage to run coverage check of the documentation if enabled
echo. dummy to check syntax errors of document sources
goto end
)
if "%1" == "clean" (
for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
del /q /s %BUILDDIR%\*
goto end
)
REM Check if sphinx-build is available and fallback to Python version if any
%SPHINXBUILD% 1>NUL 2>NUL
if errorlevel 9009 goto sphinx_python
goto sphinx_ok
:sphinx_python
set SPHINXBUILD=python -m sphinx.__init__
%SPHINXBUILD% 2> nul
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
:sphinx_ok
if "%1" == "html" (
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/html.
goto end
)
if "%1" == "dirhtml" (
%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
goto end
)
if "%1" == "singlehtml" (
%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
goto end
)
if "%1" == "pickle" (
%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the pickle files.
goto end
)
if "%1" == "json" (
%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the JSON files.
goto end
)
if "%1" == "htmlhelp" (
%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can run HTML Help Workshop with the ^
.hhp project file in %BUILDDIR%/htmlhelp.
goto end
)
if "%1" == "qthelp" (
%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
echo.^> qcollectiongenerator %BUILDDIR%\qthelp\fastchunking.qhcp
echo.To view the help file:
echo.^> assistant -collectionFile %BUILDDIR%\qthelp\fastchunking.ghc
goto end
)
if "%1" == "devhelp" (
%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished.
goto end
)
if "%1" == "epub" (
%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The epub file is in %BUILDDIR%/epub.
goto end
)
if "%1" == "epub3" (
%SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The epub3 file is in %BUILDDIR%/epub3.
goto end
)
if "%1" == "latex" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
if errorlevel 1 exit /b 1
echo.
echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "latexpdf" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
cd %BUILDDIR%/latex
make all-pdf
cd %~dp0
echo.
echo.Build finished; the PDF files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "latexpdfja" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
cd %BUILDDIR%/latex
make all-pdf-ja
cd %~dp0
echo.
echo.Build finished; the PDF files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "text" (
%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The text files are in %BUILDDIR%/text.
goto end
)
if "%1" == "man" (
%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The manual pages are in %BUILDDIR%/man.
goto end
)
if "%1" == "texinfo" (
%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
goto end
)
if "%1" == "gettext" (
%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
goto end
)
if "%1" == "changes" (
%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
if errorlevel 1 exit /b 1
echo.
echo.The overview file is in %BUILDDIR%/changes.
goto end
)
if "%1" == "linkcheck" (
%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
if errorlevel 1 exit /b 1
echo.
echo.Link check complete; look for any errors in the above output ^
or in %BUILDDIR%/linkcheck/output.txt.
goto end
)
if "%1" == "doctest" (
%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
if errorlevel 1 exit /b 1
echo.
echo.Testing of doctests in the sources finished, look at the ^
results in %BUILDDIR%/doctest/output.txt.
goto end
)
if "%1" == "coverage" (
%SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
if errorlevel 1 exit /b 1
echo.
echo.Testing of coverage in the sources finished, look at the ^
results in %BUILDDIR%/coverage/python.txt.
goto end
)
if "%1" == "xml" (
%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The XML files are in %BUILDDIR%/xml.
goto end
)
if "%1" == "pseudoxml" (
%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
goto end
)
if "%1" == "dummy" (
%SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy
if errorlevel 1 exit /b 1
echo.
echo.Build finished. Dummy builder generates no files.
goto end
)
:end
fastchunking-0.0.3/docs/overview.rst 0000664 0000000 0000000 00000007413 13050634271 0017534 0 ustar 00root root 0000000 0000000 ==================
Usage and Overview
==================
`fastchunking` provides efficient implementations for different string chunking
algorithms, e.g., static chunking (SC) and content-defined chunking (CDC).
Static Chunking (SC)
--------------------
Static chunking splits a message into fixed-size chunks.
Let us consider a random example message that shall be chunked:
>>> import os
>>> message = os.urandom(1024*1024)
Static chunking is trivial when chunking a single message:
>>> import fastchunking
>>> sc = fastchunking.SC()
>>> chunker = sc.create_chunker(chunk_size=4096)
>>> chunker.next_chunk_boundaries(message)
[4096, 8192, 12288, ...]
A large message can also be chunked in fragments, though:
>>> chunker = sc.create_chunker(chunk_size=4096)
>>> chunker.next_chunk_boundaries(message[:10240])
[4096, 8192]
>>> chunker.next_chunk_boundaries(message[10240:])
[2048, 6144, 10240, ...]
Content-Defined Chunking (CDC)
------------------------------
`fastchunking` supports content-defined chunking, i.e., chunking of messages
into fragments of variable lengths.
Currently, a chunking strategy based on Rabin-Karp rolling hashes is supported.
As a rolling hash computation on plain-Python strings is incredibly slow with
any interpreter, most of the computation is performed by a C++ extension which
is based on the `ngramhashing` library by Daniel Lemire, see:
https://github.com/lemire/rollinghashcpp
Let us consider a random message that should be chunked:
>>> import os
>>> message = os.urandom(1024*1024)
When using content-defined chunking, we have to specify a rolling hash window
size (here: 48 bytes) and an optional seed value that affects the pseudo-random
distribution of the generated chunk boundaries.
Apart from that, usage is similar to static chunking:
>>> import fastchunking
>>> cdc = fastchunking.RabinKarpCDC(window_size=48, seed=0)
>>> chunker = cdc.create_chunker(chunk_size=4096)
>>> chunker.next_chunk_boundaries(message)
[7475L, 10451L, 12253L, 13880L, 15329L, 19808L, ...]
Chunking in fragments is straightforward:
>>> chunker = cdc.create_chunker(chunk_size=4096)
>>> chunker.next_chunk_boundaries(message[:10240])
[7475L]
>>> chunker.next_chunk_boundaries(message[10240:])
[211L, 2013L, 3640L, 5089L, 9568L, ...]
Multi-Level Chunking (ML-\*)
----------------------------
Multiple chunkers of the same type (but with different chunk sizes) can be
efficiently used in parallel, e.g., to perform multi-level chunking [LS16]_.
Again, let us consider a random message that should be chunked:
>>> import os
>>> message = os.urandom(1024*1024)
Usage of multi-level-chunking, e.g., ML-CDC, is easy:
>>> import fastchunking
>>> cdc = fastchunking.RabinKarpCDC(window_size=48, seed=0)
>>> chunk_sizes = [1024, 2048, 4096]
>>> chunker = cdc.create_multilevel_chunker(chunk_sizes)
>>> chunker.next_chunk_boundaries_with_levels(message)
[(1049L, 2L), (1511L, 1L), (1893L, 2L), (2880L, 1L), (2886L, 0L),
(3701L, 0L), (4617L, 0L), (5809L, 2L), (5843L, 0L), ...]
The second value in each tuple indicates the highest chunk size that leads to
a boundary. Here, the first boundary is a boundary created by the chunker with
index 2, i.e., the chunker with 4096 bytes target chunk size.
.. note::
Only the highest index is output if multiple chunkers yield the same
boundary.
.. warning::
Chunk sizes have to be passed in correct order, i.e., from lowest to highest
value.
References:
.. [LS16] Dominik Leibenger and Christoph Sorge (2016). sec-cs: Getting the
Most out of Untrusted Cloud Storage.
   `arXiv:1606.03368 <https://arxiv.org/abs/1606.03368>`_
fastchunking-0.0.3/docs/performance.rst 0000664 0000000 0000000 00000002403 13050634271 0020161 0 ustar 00root root 0000000 0000000 ===========
Performance
===========
Computation costs for `static chunking` are barely measurable: As chunking does
not depend on the actual message but only its length, computation costs are
essentially limited to a single :code:`xrange` call.
`Content-defined chunking`, however, is expensive: The algorithm has to compute
hash values for rolling hash window contents at `every` byte position of the
message that is to be chunked. To minimize costs, fastchunking works as follows:
1. The message (fragment) is passed in its entirety to the C++ extension.
2. Chunking is performed within the C++ extension.
3. The resulting list of chunk boundaries is communicated back to Python and
converted into a Python list.
Based on a 100 MiB random content, the author measured the following throughput
on an Intel Core i7-4600U in a single, non-representative test run:
=========== ==========
chunk size throughput
=========== ==========
64 bytes 49 MiB/s
128 bytes 57 MiB/s
256 bytes 62 MiB/s
512 bytes 63 MiB/s
1024 bytes 67 MiB/s
2048 bytes 68 MiB/s
4096 bytes 70 MiB/s
8192 bytes 71 MiB/s
16384 bytes 71 MiB/s
32768 bytes 71 MiB/s
=========== ==========
fastchunking-0.0.3/docs/testing.rst 0000664 0000000 0000000 00000000144 13050634271 0017335 0 ustar 00root root 0000000 0000000 =======
Testing
=======
`fastchunking` uses tox for testing, so simply run:
::
$ tox
fastchunking-0.0.3/fastchunking/ 0000775 0000000 0000000 00000000000 13050634271 0016663 5 ustar 00root root 0000000 0000000 fastchunking-0.0.3/fastchunking/__init__.py 0000664 0000000 0000000 00000021621 13050634271 0020776 0 ustar 00root root 0000000 0000000 """Fast and easy-to-use string chunking algorithms.
`fastchunking` provides two public classes meant to be used by end users.
* :class:`.SC`: Static chunking strategy.
* :class:`.RabinKarpCDC`: Rabin-Karp-based content-defined chunking strategy.
See below for details.
"""
import abc
import fastchunking._rabinkarprh as _rabinkarprh
__version__ = '0.0.3'
class BaseChunkingStrategy(object):
    """Abstract base class shared by all chunking strategies.

    Subclasses implement :meth:`create_chunker`; a generic multi-level
    chunker built on top of that factory is provided by
    :meth:`create_multilevel_chunker`.
    """

    # NOTE(review): Python-2-style metaclass declaration; under Python 3 this
    # attribute has no effect, so abstractness is enforced only by the
    # explicit NotImplementedError below.
    __metaclass__ = abc.ABCMeta

    def __init__(self):
        # Default rolling-hash window size; strategies that actually use a
        # window (e.g. RabinKarpCDC) overwrite this value.
        self.window_size = 1

    @abc.abstractmethod
    def create_chunker(self, chunk_size):
        """Create a chunker for a single target chunk size.

        Args:
            chunk_size (int): Target chunk size.

        Returns:
            BaseChunker: A chunker object.
        """
        raise NotImplementedError

    def create_multilevel_chunker(self, chunk_sizes):
        """Create a multi-level chunker performing chunking with different
        chunk sizes.

        Args:
            chunk_sizes (list): List of target chunk sizes.

        Warning:
            For performance reasons, behavior is only defined if chunk sizes
            are passed in order, i.e., from lowest to highest value.

        Returns:
            BaseMultiLevelChunker: A multi-level chunker object.
        """
        return DefaultMultiLevelChunker(chunk_sizes, self.create_chunker)
class BaseChunker(object):
    """Abstract class specifying the interface of chunkers."""

    # NOTE(review): Python-2-style metaclass declaration; ineffective under
    # Python 3.
    __metaclass__ = abc.ABCMeta

    def next_chunk_boundaries(self, buf, prepend_bytes=0):
        """Compute the next chunk boundaries within `buf`.

        Note:
            If called more than once, output depends on `all` previous calls
            of this function: The chunking algorithm is applied to the
            concatenation of all `buf` values.

        Args:
            buf (string): The message that is to be chunked.
            prepend_bytes (Optional[int]): Optional number of zero bytes that
                should be input to the chunking algorithm before `buf`.

        Returns:
            list: List of chunk boundary positions relative to `buf`.
        """
        raise NotImplementedError
class BaseMultiLevelChunker(BaseChunker):
    """Abstract class specifying the interface of multi-level chunkers.

    A multi-level chunker runs several chunk sizes over the same stream and
    reports, for each boundary, the index ("level") of the chunker that
    produced it.
    """

    # NOTE(review): Python-2-style metaclass declaration; ineffective under
    # Python 3.
    __metaclass__ = abc.ABCMeta

    def next_chunk_boundaries(self, buf, prepend_bytes=0):
        """Compute the next chunk boundaries within `buf`, discarding the
        level information.

        See :meth:`.BaseChunker.next_chunk_boundaries`.
        """
        boundaries_with_levels = self.next_chunk_boundaries_levels(
            buf, prepend_bytes)
        return [boundary for boundary, _level in boundaries_with_levels]

    def next_chunk_boundaries_levels(self, buf, prepend_bytes=0):
        """Compute the next chunk boundaries within `buf`, including levels.

        Similar to :meth:`.next_chunk_boundaries`, but information about which
        chunker led to a respective boundary is included in the returned
        value.

        Args:
            buf (string): The message that is to be chunked.
            prepend_bytes (Optional[int]): Optional number of zero bytes that
                should be input to the chunking algorithm before `buf`.

        Returns:
            list: List of tuples (boundary, level), where boundary is a
                boundary position relative to `buf` and level is the index of
                the chunker (i.e., the index of its chunk size specified
                during instantiation) that yielded the boundary.
                If multiple chunkers yield the same boundary, it is returned
                only once, along with the highest matching chunker index.
        """
        raise NotImplementedError
class DefaultMultiLevelChunker(BaseMultiLevelChunker):
    """Generic multi-level chunker built from independent single-level
    chunkers.

    Runs one :class:`.BaseChunker` per configured chunk size over the same
    input and merges their boundary lists.
    """

    def __init__(self, chunk_sizes, chunker_create_fn):
        # one independent chunker per level, in the order given
        self._chunkers = [
            chunker_create_fn(size) for size in chunk_sizes]

    def next_chunk_boundaries_levels(self, buf, prepend_bytes=0):
        """Compute the next chunk boundaries within `buf`.

        Similar to :meth:`.next_chunk_boundaries`, but information about which
        chunker led to a respective boundary is included in the returned
        value.

        Args:
            buf (string): The message that is to be chunked.
            prepend_bytes (Optional[int]): Optional number of zero bytes that
                should be input to the chunking algorithm before `buf`.

        Returns:
            list: List of tuples (boundary, level), where boundary is a
                boundary position relative to `buf` and level is the index of
                the chunker (i.e., the index of its chunk size specified
                during instantiation) that yielded the boundary.
                If multiple chunkers yield the same boundary, it is returned
                only once, along with the highest matching chunker index.
        """
        highest_level_at = {}
        for level, chunker in enumerate(self._chunkers):
            for boundary in chunker.next_chunk_boundaries(buf, prepend_bytes):
                # later (higher) levels overwrite earlier entries, so every
                # boundary ends up tagged with its highest matching level
                highest_level_at[boundary] = level
        return sorted(highest_level_at.items())
class SC(BaseChunkingStrategy):
    """Static chunking strategy.

    Generates fixed-size chunks.
    """

    def create_chunker(self, chunk_size):
        """Create a chunker performing static chunking (SC) with a specific
        chunk size.

        Args:
            chunk_size (int): Target chunk size.

        Returns:
            BaseChunker: A chunker object.
        """
        return SC._Chunker(chunk_size)

    class _Chunker(BaseChunker):
        """Stateful static chunker instance.

        Tracks the position of the next boundary across calls so that a
        stream can be chunked incrementally.
        """

        def __init__(self, chunk_size):
            self._chunk_size = chunk_size
            # position of the next boundary relative to the next buffer;
            # kept in the range [1, chunk_size]
            self._next_chunk_boundary = chunk_size

        def _normalize(self, position):
            # Map an arbitrary (possibly negative) offset into the
            # range [1, chunk_size].
            remainder = position % self._chunk_size
            return remainder if remainder else self._chunk_size

        def next_chunk_boundaries(self, buf, prepend_bytes=0):
            # shift the pending boundary left by the number of virtually
            # prepended zero bytes
            self._next_chunk_boundary = self._normalize(
                self._next_chunk_boundary - prepend_bytes)
            first = self._next_chunk_boundary
            # collect all boundaries that fall inside buf
            boundaries = list(range(first, len(buf) + 1, self._chunk_size))
            # carry the remaining distance over to the next call
            self._next_chunk_boundary = self._normalize(first - len(buf))
            return boundaries
class RabinKarpCDC(BaseChunkingStrategy):
    """Content-defined chunking strategy based on Rabin Karp.

    Generates variable-size chunks.
    """

    def __init__(self, window_size, seed):
        super(RabinKarpCDC, self).__init__()
        # width of the rolling-hash window, in bytes
        self.window_size = window_size
        # seed handed to the C++ extension's hash initialization
        self._seed = seed

    def create_chunker(self, chunk_size):
        """Create a chunker performing content-defined chunking (CDC) using
        Rabin Karp's rolling hash scheme with a specific, expected chunk size.

        Args:
            chunk_size (int): (Expected) target chunk size.

        Returns:
            BaseChunker: A chunker object.
        """
        hasher = _rabinkarprh.RabinKarpHash(self.window_size, self._seed)
        # threshold 1/chunk_size yields `chunk_size` as the expected size
        hasher.set_threshold(1.0 / chunk_size)
        return RabinKarpCDC._Chunker(hasher)

    def create_multilevel_chunker(self, chunk_sizes):
        """Create a multi-level chunker performing content-defined chunking
        (CDC) using Rabin Karp's rolling hash scheme with different specific,
        expected chunk sizes.

        Args:
            chunk_sizes (list): List of (expected) target chunk sizes.

        Warning:
            For performance reasons, behavior is only defined if chunk sizes
            are passed in order, i.e., from lowest to highest value.

        Returns:
            BaseMultiLevelChunker: A multi-level chunker object.
        """
        thresholds = [1.0 / chunk_size for chunk_size in chunk_sizes]
        hasher = _rabinkarprh.RabinKarpMultiThresholdHash(
            self.window_size, self._seed, thresholds)
        return RabinKarpCDC._MultiLevelChunker(hasher)

    class _Chunker(BaseChunker):
        """Thin wrapper delegating chunking to the C++ rolling hash."""

        def __init__(self, rolling_hash):
            self._rolling_hash = rolling_hash

        def next_chunk_boundaries(self, buf, prepend_bytes=0):
            return list(
                self._rolling_hash.next_chunk_boundaries(buf, prepend_bytes))

    class _MultiLevelChunker(BaseMultiLevelChunker):
        """Thin wrapper delegating multi-threshold chunking to C++."""

        def __init__(self, rolling_hash):
            self._rolling_hash = rolling_hash

        def next_chunk_boundaries_levels(self, buf, prepend_bytes=0):
            # The extension returns a flat sequence [b0, t0, b1, t1, ...];
            # zipping an iterator with itself pairs consecutive items up.
            flat = iter(list(
                self._rolling_hash.next_chunk_boundaries_with_thresholds(
                    buf, prepend_bytes)))
            return zip(flat, flat)
fastchunking-0.0.3/fastchunking/benchmark.py 0000664 0000000 0000000 00000001631 13050634271 0021170 0 ustar 00root root 0000000 0000000 import timeit
import os
import time
import fastchunking
if __name__ == '__main__':
    # Micro-benchmark 1: cost of constructing a Rabin-Karp chunker
    # (window size 48, seed 0, expected chunk size 128 bytes).
    print("Benchmarking RabinKarpChunking creation time...")
    NUMBER = 10000
    total_time = timeit.timeit(
        "fastchunking.RabinKarpCDC(48, 0).create_chunker(128)",
        setup="import fastchunking",
        number=NUMBER)
    print("average creation time: {:f}s\n".format(total_time / NUMBER))
    # Micro-benchmark 2: single-pass chunking throughput over 100 MiB of
    # random data for a range of expected chunk sizes.
    print("Benchmarking RabinKarpChunking chunking throughput...")
    SIZE = 100 * 1024 * 1024  # 100 MiB
    for chunksize in [64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768]:
        chunker = fastchunking.RabinKarpCDC(
            48, 0).create_chunker(chunksize)
        # fresh random content for every chunk size
        content = os.urandom(SIZE)
        # wall-clock timing of a single, non-representative run
        t = time.time()
        chunker.next_chunk_boundaries(content, 0)
        print("chunking throughput (chunksize = {} bytes): {} MiB/s".format(
            chunksize, SIZE / 1024 / 1024 / (time.time() - t)))
fastchunking-0.0.3/fastchunking/test.py 0000664 0000000 0000000 00000023461 13050634271 0020222 0 ustar 00root root 0000000 0000000 import os
import sys
import unittest
sys.path.insert(0, os.path.abspath('..'))
import fastchunking
class StaticChunkingTests(unittest.TestCase):
    """Unit tests for the static chunking strategy (fastchunking.SC)."""

    def __init__(self, *args, **kwargs):
        super(StaticChunkingTests, self).__init__(*args, **kwargs)
        # a single strategy instance is shared; chunkers are created
        # per-test, so no chunking state leaks between tests
        self.chunking_strategy = fastchunking.SC()

    def test_chunk_size_1(self):
        # every byte position is a boundary when chunk_size == 1
        chunker = self.chunking_strategy.create_chunker(chunk_size=1)
        self.assertEqual(
            list(chunker.next_chunk_boundaries('0' * 3)), [1, 2, 3])
        self.assertEqual(list(chunker.next_chunk_boundaries('0' * 1)), [1])

    def test_chunk_size_2(self):
        # boundaries continue across successive calls: the chunker tracks
        # how many bytes of the stream it has already consumed
        chunker = self.chunking_strategy.create_chunker(chunk_size=2)
        self.assertEqual(list(chunker.next_chunk_boundaries('0' * 4)), [2, 4])
        self.assertEqual(list(chunker.next_chunk_boundaries('0' * 1)), [])
        self.assertEqual(list(chunker.next_chunk_boundaries('0' * 1)), [1])
        self.assertEqual(list(chunker.next_chunk_boundaries('0' * 3)), [2])
        self.assertEqual(list(chunker.next_chunk_boundaries('0' * 3)), [1, 3])

    def test_chunk_size_3(self):
        # also covers empty buffers, which must not advance the stream
        chunker = self.chunking_strategy.create_chunker(chunk_size=3)
        self.assertEqual(
            list(chunker.next_chunk_boundaries('0' * 9)), [3, 6, 9])
        self.assertEqual(
            list(chunker.next_chunk_boundaries('0' * 10)), [3, 6, 9])
        self.assertEqual(list(chunker.next_chunk_boundaries('0' * 2)), [2])
        self.assertEqual(
            list(chunker.next_chunk_boundaries('0' * 11)), [3, 6, 9])
        self.assertEqual(list(chunker.next_chunk_boundaries('0' * 1)), [1])
        self.assertEqual(list(chunker.next_chunk_boundaries('0' * 0)), [])
        self.assertEqual(list(chunker.next_chunk_boundaries('0' * 0)), [])
        self.assertEqual(list(chunker.next_chunk_boundaries('0' * 0)), [])
        self.assertEqual(list(chunker.next_chunk_boundaries('0' * 3)), [3])

    def test_chunk_size_4(self):
        chunker = self.chunking_strategy.create_chunker(chunk_size=4)
        self.assertEqual(
            list(chunker.next_chunk_boundaries('0' * 12)), [4, 8, 12])
        self.assertEqual(list(chunker.next_chunk_boundaries('0' * 2)), [])
        self.assertEqual(
            list(chunker.next_chunk_boundaries('0' * 12)), [2, 6, 10])
        self.assertEqual(list(chunker.next_chunk_boundaries('0' * 1)), [])
        self.assertEqual(
            list(chunker.next_chunk_boundaries('0' * 12)), [1, 5, 9])
        self.assertEqual(list(chunker.next_chunk_boundaries('0' * 1)), [1])

    def test_multilevel(self):
        # a boundary matched by several levels is reported once, tagged with
        # the highest matching level index (here level 1, chunk size 10)
        chunker = self.chunking_strategy.create_multilevel_chunker(
            [5, 10])
        self.assertEqual(list(chunker.next_chunk_boundaries_levels(
            '0' * 20)), [(5, 0), (10, 1), (15, 0), (20, 1)])
        self.assertEqual(list(chunker.next_chunk_boundaries_levels(
            '0' * 21)), [(5, 0), (10, 1), (15, 0), (20, 1)])
        self.assertEqual(list(chunker.next_chunk_boundaries_levels(
            '0' * 22)), [(4, 0), (9, 1), (14, 0), (19, 1)])
        self.assertEqual(list(chunker.next_chunk_boundaries_levels(
            '0' * 22)), [(2, 0), (7, 1), (12, 0), (17, 1), (22, 0)])

    def test_multilevel_without_levels(self):
        # same stream as test_multilevel, but via the level-less interface
        chunker = self.chunking_strategy.create_multilevel_chunker(
            [5, 10])
        self.assertEqual(list(chunker.next_chunk_boundaries(
            '0' * 20)), [5, 10, 15, 20])
        self.assertEqual(list(chunker.next_chunk_boundaries(
            '0' * 21)), [5, 10, 15, 20])
        self.assertEqual(list(chunker.next_chunk_boundaries(
            '0' * 22)), [4, 9, 14, 19])
        self.assertEqual(list(chunker.next_chunk_boundaries(
            '0' * 22)), [2, 7, 12, 17, 22])

    def test_prepending(self):
        # chunking `content` with prepend_bytes=1 must equal chunking
        # b'\0' + content, with every boundary shifted by one
        for _ in range(1024):
            content = os.urandom(1024)
            chunker = self.chunking_strategy.create_chunker(
                chunk_size=64)
            boundaries = chunker.next_chunk_boundaries(b'\0' + content)
            prepend_chunker = self.chunking_strategy.create_chunker(
                chunk_size=64)
            prepend_boundaries = prepend_chunker.next_chunk_boundaries(
                content, 1)
            self.assertEqual(
                boundaries, list(map(lambda x: x + 1, prepend_boundaries)))
class RabinKarpTests(unittest.TestCase):
def __init__(self, *args, **kwargs):
super(RabinKarpTests, self).__init__(*args, **kwargs)
self.chunking_strategy = fastchunking.RabinKarpCDC(48, 0)
def test_deterministic_chunking(self):
content = os.urandom(1024 * 1024)
chunker = self.chunking_strategy.create_chunker(chunk_size=128)
boundaries = chunker.next_chunk_boundaries(content)
chunker2 = self.chunking_strategy.create_chunker(chunk_size=128)
boundaries2 = chunker2.next_chunk_boundaries(content)
self.assertEqual(boundaries, boundaries2)
def test_consistent_chunking(self):
chunker = self.chunking_strategy.create_chunker(chunk_size=128)
part_len = 10 * 1024
content = os.urandom(part_len)
boundaries = chunker.next_chunk_boundaries(content + content)
for boundary in boundaries:
if boundary < part_len:
self.assertIn(boundary + part_len, boundaries)
def test_prepending(self):
for _ in range(1024):
content = os.urandom(1024)
chunker = self.chunking_strategy.create_chunker(
chunk_size=64)
boundaries = chunker.next_chunk_boundaries(b'\0' + content)
prepend_chunker = self.chunking_strategy.create_chunker(
chunk_size=64)
prepend_boundaries = prepend_chunker.next_chunk_boundaries(
content, 1)
self.assertEqual(
boundaries, list(map(lambda x: x + 1, prepend_boundaries)))
def test_sample_data_1(self):
content = 'Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.'
chunker = self.chunking_strategy.create_chunker(chunk_size=128)
boundaries = chunker.next_chunk_boundaries(content)
self.assertEqual(boundaries, [91, 121, 387, 417])
def test_sample_data_2(self):
content = 'Lorem ipsum dolor sit amet, dictas definitiones ea nam, per at fugit voluptaria. Brute luptatum recusabo ne per, mei modo consul indoctum ex. Quem accusamus an sea. Graece oportere dignissim eos et, an diam voluptatibus est. Dictas aperiam est at, nibh tritani has ex, his decore aliquid ut. Ius ei iusto ludus clita, ea per inermis probatus forensibus. Eum ex ludus nullam persequeris, mel gubergren reprehendunt ad, ne mel regione disputationi. Aliquando forensibus sit ne, sea et graece causae fabulas. Vel eu pericula intellegat rationibus, in qui exerci adversarium. Ea nec feugiat placerat, eos dicat invidunt maluisset ea. Graece convenire.'
chunker = self.chunking_strategy.create_chunker(chunk_size=16)
boundaries = chunker.next_chunk_boundaries(content)
self.assertEqual(boundaries, [56, 98, 119, 182, 198, 204, 214, 245, 270, 282, 287, 312, 313, 315, 317, 328, 331,
345, 367, 377, 397, 410, 417, 418, 437, 443, 459, 466, 474, 475, 492, 497, 501, 522, 532, 545, 577, 597, 598, 606])
def test_multilevel(self):
content = 'Lorem ipsum dolor sit amet, dictas definitiones ea nam, per at fugit voluptaria. Brute luptatum recusabo ne per, mei modo consul indoctum ex. Quem accusamus an sea. Graece oportere dignissim eos et, an diam voluptatibus est. Dictas aperiam est at, nibh tritani has ex, his decore aliquid ut. Ius ei iusto ludus clita, ea per inermis probatus forensibus. Eum ex ludus nullam persequeris, mel gubergren reprehendunt ad, ne mel regione disputationi. Aliquando forensibus sit ne, sea et graece causae fabulas. Vel eu pericula intellegat rationibus, in qui exerci adversarium. Ea nec feugiat placerat, eos dicat invidunt maluisset ea. Graece convenire.'
chunker = self.chunking_strategy.create_multilevel_chunker(
[16, 32, 64])
boundaries_with_levels = chunker.next_chunk_boundaries_levels(
content)
self.assertEqual(list(boundaries_with_levels), [(56, 0), (98, 1), (106, 0), (136, 0), (182, 1), (196, 0), (198, 2), (204, 1), (206, 0), (213, 0), (227, 0), (245, 2), (270, 1), (282, 0), (287, 0), (312, 1), (313, 1), (315, 1), (317, 0), (328, 0), (
331, 0), (345, 0), (367, 0), (377, 2), (383, 1), (391, 0), (408, 0), (410, 2), (437, 0), (443, 0), (459, 1), (463, 0), (466, 2), (474, 1), (492, 2), (497, 2), (501, 1), (522, 2), (532, 1), (545, 1), (577, 0), (597, 0), (598, 2), (606, 0)])
class AbstractTests(unittest.TestCase):
    """Tests that the abstract base classes reject unimplemented calls."""

    def test_chunking_strategy(self):
        # a subclass that defers to the abstract implementation must raise
        class Test(fastchunking.BaseChunkingStrategy):
            def create_chunker(self, *args, **kwargs):
                return super(Test, self).create_chunker(*args, **kwargs)
        self.assertRaises(NotImplementedError, Test().create_chunker, 4096)
        # the default multi-level chunker eagerly creates one chunker per
        # chunk size, so construction raises through create_chunker as well
        self.assertRaises(
            NotImplementedError, Test().create_multilevel_chunker, [4096])

    def test_chunker(self):
        # the base chunker's next_chunk_boundaries is abstract and raises
        class Test(fastchunking.BaseChunker):
            def next_chunk_boundaries(self, *args, **kwargs):
                return super(Test, self).next_chunk_boundaries(*args, **kwargs)
        self.assertRaises(
            NotImplementedError, Test().next_chunk_boundaries, '0')
if __name__ == "__main__":
unittest.main()
fastchunking-0.0.3/lib/ 0000775 0000000 0000000 00000000000 13050634271 0014745 5 ustar 00root root 0000000 0000000 fastchunking-0.0.3/lib/__init__.py 0000664 0000000 0000000 00000000000 13050634271 0017044 0 ustar 00root root 0000000 0000000 fastchunking-0.0.3/lib/characterhash.h 0000664 0000000 0000000 00000004177 13050634271 0017727 0 ustar 00root root 0000000 0000000 /*
* This is a modified version of the file characterhash.h within the rollinghashcpp package of Daniel Lemire.
*
* License: Apache 2.0
*
* The base version is available under
* https://github.com/lemire/rollinghashcpp/blob/07c597c17df7e0feb877cf5a7f556af9d6d17a83/characterhash.h
*
* Modifications:
* - Allow to specify the seed during initialization of CharacterHash.
*
* Author of modifications: Dominik Leibenger
*
*/
#ifndef CHARACTERHASH
#define CHARACTERHASH
typedef unsigned long long uint64;
typedef unsigned int uint32;
typedef unsigned int uint;
#include
#include
#include
#include "mersennetwister.h"
using namespace std;
// Thin adapter exposing MTRand (mersennetwister.h) as a functor that
// yields uniformly distributed integers in [0, maxval].
class mersenneRNG {
public:
    mersenneRNG(uint32 maxval) : mtr(),n(maxval) {};
    // draw the next random integer in [0, n]
    uint32 operator()() { return mtr.randInt(n);}
    // deterministic re-seed (used by CharacterHash for reproducible tables)
    void seed(uint32 seedval) { mtr.seed(seedval);}
    // non-deterministic re-seed (delegates to MTRand's /dev/urandom path)
    void seed() { mtr.seed();}
    uint32 rand_max() { return n;}
private:
    MTRand mtr;
    // NOTE(review): maxval is narrowed from uint32 to int here — values
    // above INT_MAX would overflow; confirm callers keep maxval small.
    int n;
};
// Returns a mask with the lowest `bits` bits set (i.e. 2^bits - 1),
// computed via (1 << (bits-1)) ^ ((1 << (bits-1)) - 1) so that
// bits == wordsize does not overflow the shift.
// NOTE(review): the template parameter list below was lost during text
// extraction (angle-bracketed tokens stripped); upstream rollinghashcpp
// declares template<typename hashvaluetype> and static_cast<hashvaluetype>(1)
// — confirm against the referenced repository before compiling.
template
hashvaluetype maskfnc(int bits) {
assert(bits>0);
assert(bits<=sizeof(hashvaluetype)*8);
hashvaluetype x = static_cast(1) << (bits - 1);
return x ^ (x - 1);
}
// Table of per-character random hash values used by the Rabin-Karp rolling
// hash. The table is filled deterministically from `seed`, masked to
// `maxval`; 64-bit hash values are assembled from two 32-bit draws.
// NOTE(review): angle-bracketed template syntax on several lines below was
// stripped during text extraction (e.g. the template parameter list, loop
// bounds of the fill loops, and static_cast target types); restore from the
// upstream rollinghashcpp characterhash.h before compiling.
template
class CharacterHash {
public:
CharacterHash(hashvaluetype maxval, uint32 seed) {
if(sizeof(hashvaluetype) <=4) {
// 32-bit (or smaller) hash values: one draw per table entry
mersenneRNG randomgenerator(maxval);
randomgenerator.seed(seed);
for(size_t k =0; k(randomgenerator());
} else if (sizeof(hashvaluetype) == 8) {
// 64-bit hash values: combine a low and a high 32-bit draw
mersenneRNG randomgenerator(maxval>>32);
randomgenerator.seed(seed);
mersenneRNG randomgeneratorbase((maxval>>32) ==0 ? maxval : 0xFFFFFFFFU);
for(size_t k =0; k(randomgeneratorbase())
| (static_cast(randomgenerator()) << 32);
} else throw runtime_error("unsupported hash value type");
}
// number of distinct characters, i.e. 2^(8*sizeof(chartype))
enum{nbrofchars = 1 << ( sizeof(chartype)*8 )};
// one random hash value per possible character
hashvaluetype hashvalues[1 << ( sizeof(chartype)*8 )];
};
#endif
fastchunking-0.0.3/lib/mersennetwister.h 0000664 0000000 0000000 00000033554 13050634271 0020366 0 ustar 00root root 0000000 0000000
/**
* High performance random generator.
* Mersenne Twister
@article{matsumoto1998mtd,
title={{Mersenne Twister: A 623-Dimensionally Equidistributed Uniform Pseudo-Random Number Generator}},
author={MATSUMOTO, M. and NISHIMURA, T.},
journal={ACM Transactions on Modeling and Computer Simulation},
volume={8},
number={1},
pages={3-30},
year={1998}
}
*/
// MersenneTwister.h
// Mersenne Twister random number generator -- a C++ class MTRand
// Based on code by Makoto Matsumoto, Takuji Nishimura, and Shawn Cokus
// Richard J. Wagner v1.0 15 May 2003 rjwagner@writeme.com
// The Mersenne Twister is an algorithm for generating random numbers. It
// was designed with consideration of the flaws in various other generators.
// The period, 2^19937-1, and the order of equidistribution, 623 dimensions,
// are far greater. The generator is also fast; it avoids multiplication and
// division, and it benefits from caches and pipelines. For more information
// see the inventors' web page at http://www.math.keio.ac.jp/~matumoto/emt.html
// Reference
// M. Matsumoto and T. Nishimura, "Mersenne Twister: A 623-Dimensionally
// Equidistributed Uniform Pseudo-Random Number Generator", ACM Transactions on
// Modeling and Computer Simulation, Vol. 8, No. 1, January 1998, pp 3-30.
// Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
// Copyright (C) 2000 - 2003, Richard J. Wagner
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. The names of its contributors may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// The original code included the following notice:
//
// When you use this, send an email to: matumoto@math.keio.ac.jp
// with an appropriate reference to your work.
//
// It would be nice to CC: rjwagner@writeme.com and Cokus@math.washington.edu
// when you write.
#ifndef MERSENNETWISTER_H
#define MERSENNETWISTER_H
// Not thread safe (unless auto-initialization is avoided and each thread has
// its own MTRand object)
#include
#include
#include
#include
#include
class MTRand {
// Data
public:
typedef unsigned long uint32; // unsigned integer type, at least 32 bits
enum { N = 624 }; // length of state vector
enum { SAVE = N + 1 }; // length of array for save()
protected:
enum { M = 397 }; // period parameter
uint32 state[N]; // internal state
uint32 *pNext; // next value to get from state
int left; // number of values left before reload needed
//Methods
public:
MTRand( const uint32& oneSeed ); // initialize with a simple uint32
MTRand( uint32 *const bigSeed, uint32 const seedLength = N ); // or an array
MTRand(); // auto-initialize with /dev/urandom or time() and clock()
// Do NOT use for CRYPTOGRAPHY without securely hashing several returned
// values together, otherwise the generator state can be learned after
// reading 624 consecutive values.
// Access to 32-bit random numbers
double rand(); // real number in [0,1]
double rand( const double& n ); // real number in [0,n]
double randExc(); // real number in [0,1)
double randExc( const double& n ); // real number in [0,n)
double randDblExc(); // real number in (0,1)
double randDblExc( const double& n ); // real number in (0,n)
uint32 randInt(); // integer in [0,2^32-1]
uint32 randInt( const uint32& n ); // integer in [0,n] for n < 2^32
double operator()() { return rand(); } // same as rand()
// Access to 53-bit random numbers (capacity of IEEE double precision)
double rand53(); // real number in [0,1)
// Access to nonuniform random number distributions
double randNorm( const double& mean = 0.0, const double& variance = 0.0 );
// Re-seeding functions with same behavior as initializers
void seed( const uint32 oneSeed );
void seed( uint32 *const bigSeed, const uint32 seedLength = N );
void seed();
// Saving and loading generator state
void save( uint32* saveArray ) const; // to array of size SAVE
void load( uint32 *const loadArray ); // from such array
friend std::ostream& operator<<( std::ostream& os, const MTRand& mtrand );
friend std::istream& operator>>( std::istream& is, MTRand& mtrand );
protected:
void initialize( const uint32 oneSeed );
void reload();
uint32 hiBit( const uint32& u ) const { return u & 0x80000000UL; }
uint32 loBit( const uint32& u ) const { return u & 0x00000001UL; }
uint32 loBits( const uint32& u ) const { return u & 0x7fffffffUL; }
uint32 mixBits( const uint32& u, const uint32& v ) const
{ return hiBit(u) | loBits(v); }
uint32 twist( const uint32& m, const uint32& s0, const uint32& s1 ) const
{ return m ^ (mixBits(s0,s1)>>1) ^ (-static_cast(loBit(s1)) & 0x9908b0dfUL); }
static uint32 hash( time_t t, clock_t c );
};
MTRand::MTRand( const uint32& oneSeed )
{ seed(oneSeed); }
MTRand::MTRand( uint32 *const bigSeed, const uint32 seedLength )
{ seed(bigSeed,seedLength); }
MTRand::MTRand()
{ seed(); }
double MTRand::rand()
{ return double(randInt()) * (1.0/4294967295.0); }
double MTRand::rand( const double& n )
{ return rand() * n; }
double MTRand::randExc()
{ return double(randInt()) * (1.0/4294967296.0); }
double MTRand::randExc( const double& n )
{ return randExc() * n; }
double MTRand::randDblExc()
{ return ( double(randInt()) + 0.5 ) * (1.0/4294967296.0); }
double MTRand::randDblExc( const double& n )
{ return randDblExc() * n; }
double MTRand::rand53()
{
uint32 a = randInt() >> 5, b = randInt() >> 6;
return ( a * 67108864.0 + b ) * (1.0/9007199254740992.0); // by Isaku Wada
}
double MTRand::randNorm( const double& mean, const double& variance )
{
// Return a real number from a normal (Gaussian) distribution with given
// mean and variance by Box-Muller method
double r = sqrt( -2.0 * log( 1.0-randDblExc()) ) * variance;
double phi = 2.0 * 3.14159265358979323846264338328 * randExc();
return mean + r * cos(phi);
}
MTRand::uint32 MTRand::randInt()
{
// Pull a 32-bit integer from the generator state
// Every other access function simply transforms the numbers extracted here
if( left == 0 ) reload();
--left;
uint32 s1;
s1 = *pNext++;
s1 ^= (s1 >> 11);
s1 ^= (s1 << 7) & 0x9d2c5680UL;
s1 ^= (s1 << 15) & 0xefc60000UL;
return ( s1 ^ (s1 >> 18) );
}
MTRand::uint32 MTRand::randInt( const uint32& n )
{
// Find which bits are used in n
// Optimized by Magnus Jonsson (magnus@smartelectronix.com)
uint32 used = n;
used |= used >> 1;
used |= used >> 2;
used |= used >> 4;
used |= used >> 8;
used |= used >> 16;
// Draw numbers until one is found in [0,n]
uint32 i;
do
i = randInt() & used; // toss unused bits to shorten search
while( i > n );
return i;
}
void MTRand::seed( const uint32 oneSeed )
{
// Seed the generator with a simple uint32
initialize(oneSeed);
reload();
}
void MTRand::seed( uint32 *const bigSeed, const uint32 seedLength )
{
// Seed the generator with an array of uint32's
// There are 2^19937-1 possible initial states. This function allows
// all of those to be accessed by providing at least 19937 bits (with a
// default seed length of N = 624 uint32's). Any bits above the lower 32
// in each element are discarded.
// Just call seed() if you want to get array from /dev/urandom
initialize(19650218UL);
int i = 1;
uint32 j = 0;
int k = ( N > seedLength ? N : seedLength );
for( ; k; --k )
{
state[i] =
state[i] ^ ( (state[i-1] ^ (state[i-1] >> 30)) * 1664525UL );
state[i] += ( bigSeed[j] & 0xffffffffUL ) + j;
state[i] &= 0xffffffffUL;
++i; ++j;
if( i >= N ) { state[0] = state[N-1]; i = 1; }
if( j >= seedLength ) j = 0;
}
for( k = N - 1; k; --k )
{
state[i] =
state[i] ^ ( (state[i-1] ^ (state[i-1] >> 30)) * 1566083941UL );
state[i] -= i;
state[i] &= 0xffffffffUL;
++i;
if( i >= N ) { state[0] = state[N-1]; i = 1; }
}
state[0] = 0x80000000UL; // MSB is 1, assuring non-zero initial array
reload();
}
void MTRand::seed()
{
// Seed the generator with an array from /dev/urandom if available
// Otherwise use a hash of time() and clock() values
// First try getting an array from /dev/urandom
FILE* urandom = fopen( "/dev/urandom", "rb" );
if( urandom )
{
uint32 bigSeed[N];
uint32 *s = bigSeed;
int i = N;
bool success = true;
while( success && i-- )
success = fread( s++, sizeof(uint32), 1, urandom );
fclose(urandom);
if( success ) { seed( bigSeed, N ); return; }
}
// Was not successful, so use time() and clock() instead
seed( hash( time(NULL), clock() ) );
}
void MTRand::initialize( const uint32 seed )
{
// Initialize generator state with seed
// See Knuth TAOCP Vol 2, 3rd Ed, p.106 for multiplier.
// In previous versions, most significant bits (MSBs) of the seed affect
// only MSBs of the state array. Modified 9 Jan 2002 by Makoto Matsumoto.
uint32 *s = state;
uint32 *r = state;
int i = 1;
*s++ = seed & 0xffffffffUL;
for( ; i < N; ++i )
{
*s++ = ( 1812433253UL * ( *r ^ (*r >> 30) ) + i ) & 0xffffffffUL;
r++;
}
}
void MTRand::reload()
{
// Generate N new values in state
// Made clearer and faster by Matthew Bellew (matthew.bellew@home.com)
uint32 *p = state;
int i;
for( i = N - M; i--; ++p )
*p = twist( p[M], p[0], p[1] );
for( i = M; --i; ++p )
*p = twist( p[M-N], p[0], p[1] );
*p = twist( p[M-N], p[0], state[0] );
left = N, pNext = state;
}
MTRand::uint32 MTRand::hash( time_t t, clock_t c )
{
// Get a uint32 from t and c
// Better than uint32(x) in case x is floating point in [0,1]
// Based on code by Lawrence Kirby (fred@genesis.demon.co.uk)
static uint32 differ = 0; // guarantee time-based seeds will change
uint32 h1 = 0;
unsigned char *p = reinterpret_cast( &t );
for( size_t i = 0; i < sizeof(t); ++i )
{
h1 *= UCHAR_MAX + 2U;
h1 += p[i];
}
uint32 h2 = 0;
p = reinterpret_cast( &c );
for( size_t j = 0; j < sizeof(c); ++j )
{
h2 *= UCHAR_MAX + 2U;
h2 += p[j];
}
return ( h1 + differ++ ) ^ h2;
}
void MTRand::save( uint32* saveArray ) const
{
uint32 *sa = saveArray;
const uint32 *s = state;
int i = N;
for( ; i--; *sa++ = *s++ ) {}
*sa = left;
}
void MTRand::load( uint32 *const loadArray )
{
uint32 *s = state;
uint32 *la = loadArray;
int i = N;
for( ; i--; *s++ = *la++ ) {}
left = *la;
pNext = &state[N-left];
}
std::ostream& operator<<( std::ostream& os, const MTRand& mtrand )
{
const MTRand::uint32 *s = mtrand.state;
int i = mtrand.N;
for( ; i--; os << *s++ << "\t" ) {}
return os << mtrand.left;
}
std::istream& operator>>( std::istream& is, MTRand& mtrand )
{
MTRand::uint32 *s = mtrand.state;
int i = mtrand.N;
for( ; i--; is >> *s++ ) {}
is >> mtrand.left;
mtrand.pNext = &mtrand.state[mtrand.N-mtrand.left];
return is;
}
#endif // MERSENNETWISTER_H
// Change log:
//
// v0.1 - First release on 15 May 2000
// - Based on code by Makoto Matsumoto, Takuji Nishimura, and Shawn Cokus
// - Translated from C to C++
// - Made completely ANSI compliant
// - Designed convenient interface for initialization, seeding, and
// obtaining numbers in default or user-defined ranges
// - Added automatic seeding from /dev/urandom or time() and clock()
// - Provided functions for saving and loading generator state
//
// v0.2 - Fixed bug which reloaded generator one step too late
//
// v0.3 - Switched to clearer, faster reload() code from Matthew Bellew
//
// v0.4 - Removed trailing newline in saved generator format to be consistent
// with output format of built-in types
//
// v0.5 - Improved portability by replacing static const int's with enum's and
// clarifying return values in seed(); suggested by Eric Heimburg
// - Removed MAXINT constant; use 0xffffffffUL instead
//
// v0.6 - Eliminated seed overflow when uint32 is larger than 32 bits
// - Changed integer [0,n] generator to give better uniformity
//
// v0.7 - Fixed operator precedence ambiguity in reload()
// - Added access for real numbers in (0,1) and (0,n)
//
// v0.8 - Included time.h header to properly support time_t and clock_t
//
// v1.0 - Revised seeding to match 26 Jan 2002 update of Nishimura and Matsumoto
// - Allowed for seeding with arrays of any length
// - Added access for real numbers in [0,1) with 53-bit resolution
// - Added access for real numbers from normal (Gaussian) distributions
// - Increased overall speed by optimizing twist()
// - Doubled speed of integer [0,n] generation
// - Fixed out-of-range number generation on 64-bit machines
// - Improved portability by substituting literal constants for long enum's
// - Changed license from GNU LGPL to BSD
fastchunking-0.0.3/lib/rabinkarp.cpp 0000664 0000000 0000000 00000000056 13050634271 0017423 0 ustar 00root root 0000000 0000000 #ifndef RABINKARP
#define RABINKARP
#endif
fastchunking-0.0.3/lib/rabinkarp.h 0000664 0000000 0000000 00000026753 13050634271 0017104 0 ustar 00root root 0000000 0000000 /*
* An efficient RabinKarp rolling hash implementation.
*
* This library is based on and thus includes code fragments of the file
* rabinkarphash.h of the rollinghashcpp package by Daniel Lemire.
*
* License: Apache 2.0
*
* The base version is available under
* https://github.com/lemire/rollinghashcpp/blob/07c597c17df7e0feb877cf5a7f556af9d6d17a83/rabinkarphash.h
*
* Author: Dominik Leibenger
*
*/
#ifndef RABINKARP_H
#define RABINKARP_H
#include
#include "characterhash.h"
#include
#include
#include
class RabinKarp {
/* Implementation of the Rabin-Karp hash function.
*
* This code is based on
* https://github.com/lemire/rollinghashcpp/blob/07c597c17df7e0feb877cf5a7f556af9d6d17a83/rabinkarphash.h
* and therefore uses some variable names from the source.
*/
// NOTE(review): several template argument lists in this file appear to have
// been stripped during extraction (e.g. `maskfnc(...)`, `static_cast(...)`,
// `CharacterHash hasher`); presumably these were `maskfnc<uint32>(...)`,
// `static_cast<uint32>(...)` etc. in the original — verify against upstream.
public:
// Construct a rolling hasher over a window of `my_window_size` bytes,
// seeding the per-byte hash table with `seed`.
RabinKarp(int my_window_size, int seed) :
hasher(maskfnc(WORDSIZE), seed),
HASHMASK(maskfnc(WORDSIZE)),
BtoN(1),
window_size(my_window_size) {
// Precompute BtoN = B^window_size (mod HASHMASK+1), used to remove the
// outgoing byte's contribution when the window rolls.
for (int i = 0; i < window_size; ++i) {
BtoN *= B;
BtoN &= HASHMASK;
}
}
protected:
// Consume byte `b`, updating `hashvalue` in place.  `window` is the caller's
// ring buffer of the last window_size bytes; `window_head` is the next write
// position and `window_level` the current fill level (< window_size until
// the window is full).
void _update(unsigned char b, uint32 &hashvalue, unsigned char* window,
int &window_head, int &window_level) {
/* Consume a byte and update the hash value accordingly.
*
* The last window_size consumed bytes are always stored to ease rolling
* hash computation.
*/
if (window_level != window_size)
// corresponds to eat() in the original implementation
hashvalue = (B * hashvalue + hasher.hashvalues[b]) & HASHMASK;
else
// corresponds to update() in the original implementation
hashvalue = (B * hashvalue + hasher.hashvalues[b]
- BtoN * hasher.hashvalues[window[window_head]]) & HASHMASK;
// store consumed byte in rolling hash window
window[window_head] = b;
if (window_head == window_size - 1)
window_head = 0;
else
window_head += 1;
if (window_level != window_size)
window_level += 1;
}
uint32 _compute_threshold(double my_threshold) {
/* resolves a relative threshold (e.g., 0.01 for 1% matching hash
* values) to an absolute threshold in the range of actual hash values. */
// NOTE(review): the static_cast target type (likely uint32) was stripped
// during extraction.
return static_cast(my_threshold * (HASHMASK + 1));
}
private:
// NOTE(review): `n` is never used within this class — candidate for removal.
int n;
// Per-byte random hash values (template arguments stripped; see NOTE above).
CharacterHash hasher;
// Bit mask limiting hash values to WORDSIZE bits.
const uint32 HASHMASK;
// B^window_size mod (HASHMASK+1); see constructor.
uint32 BtoN;
static const uint32 B = 37;
static const uint32 WORDSIZE = 29; // compute 29-bit integer hashes
protected:
// Rolling-hash window size in bytes, shared with subclasses.
int window_size;
};
class RabinKarpHash: RabinKarp {
/* High-level interface that performs chunking based on the Rabin-Karp
* rolling hash scheme.
*
* This is the interface used by the Python library. */
// NOTE(review): inheritance is private (class default) — presumably
// intentional, since only the chunking API is exposed.
// NOTE(review): no copy constructor / assignment operator despite owning
// `window` (rule of three) — copying an instance would double-free.
public:
// NOTE(review): initializer order differs from member declaration order
// (members are initialized in declaration order regardless), and the base
// class appears last in the list although it is constructed first.
RabinKarpHash(int my_window_size, int seed) :
hashvalue(0), window_level(0), window_head(0), RabinKarp(
my_window_size, seed) {
// NOTE(review): malloc result is not checked for NULL.
window = (unsigned char*) malloc(window_size * sizeof(unsigned char));
}
~RabinKarpHash() {
free(window);
}
// Set the chunking threshold as a fraction (e.g., 0.01 => ~1% of positions
// become boundaries on average).
void set_threshold(double my_threshold) {
threshold = _compute_threshold(my_threshold);
}
// NOTE(review): return/element types below lost their template arguments
// during extraction — likely std::list<unsigned int>.
std::list next_chunk_boundaries(std::string *str,
unsigned int prepend_bytes) {
/* On input a Python string, this function computes a Python list object
* containing chunk boundary positions. */
const char* cstr = str->c_str();
unsigned int len = str->length();
// Prepend zero bytes so the first real chunk may be shorter than the
// window size.
for (unsigned int i = 0; i < prepend_bytes; ++i)
update(0);
std::list results;
for (unsigned int i = 0; i < len; ++i) {
update(cstr[i]);
// A boundary is declared after position i when the (full) window's
// hash falls below the threshold.
if (window_level == window_size && hashvalue < threshold)
results.push_back(i + 1);
}
return (results);
}
private:
// Feed one byte into the inherited rolling-hash state.
void update(unsigned char b) {
_update(b, hashvalue, window, window_head, window_level);
}
// Fill level of the rolling window (< window_size until warmed up).
int window_level;
// Next write index into the ring buffer `window`.
int window_head;
// Ring buffer holding the last window_size consumed bytes.
unsigned char* window;
// Absolute hash threshold derived via _compute_threshold().
uint32 threshold;
// Current rolling hash value.
uint32 hashvalue;
};
class RabinKarpMultiThresholdHash: RabinKarp {
/*
* Performs multi-level chunking of a given content, based on the thresholds
* specified during initialization.
*
* Chunking is performed as follows:
* - To compute chunk boundaries of the first level (i.e., the nodes
* directly under the root node), the content is prepended by
* prepend_bytes bytes (as to allow that the first chunk is smaller than
* the specified window size) and then chunked using Rabin Karp, i.e., a
* chunk boundary is created whenever the current hashvalue is below the
* first given threshold.
* - Subsequent levels are computed similarly, but each higher-level chunk
* is considered in isolation, i.e., computed chunk boundaries of a
* level-(i+1) chunk must not depend on content outside of the scope of
* the corresponding level-i chunk. For this reason, a single chunking
* instance is not enough. Instead, we use one chunking instance for each
* individual threshold, filling lower-level windows with zeros whenever a
* chunk boundary at a higher level has been found.
*/
// NOTE(review): template argument lists (e.g. std::list<double> for
// my_thresholds, std::list<unsigned int> for the result) appear stripped
// by extraction — verify against upstream before compiling.
// NOTE(review): owns several heap arrays but declares no copy
// constructor/assignment (rule of three) — copying would double-free.
public:
// NOTE(review): `thresholds` is malloc'd while the per-threshold arrays
// below use new[]; allocation/deallocation pairs are consistent, but the
// mixed styles are worth unifying.  Also, the base class appears last in
// the initializer list although it is constructed first.
RabinKarpMultiThresholdHash(int my_window_size,
int seed,
std::list my_thresholds) :
thresholds_count(my_thresholds.size()),
thresholds((uint32*) malloc(thresholds_count * sizeof(uint32))),
least_restrictive_required_chunker_index(0), // initialize optimization code
RabinKarp(my_window_size, seed) {
// initialize list of thresholds
// NOTE(review): this outer `iter` is immediately shadowed by the loop
// variable below and is otherwise unused — candidate for removal.
std::list::iterator iter = my_thresholds.begin();
int i = 0;
for (std::list::iterator iter = my_thresholds.begin();
iter != my_thresholds.end(); ++iter) {
thresholds[i] = _compute_threshold(*iter);
++i;
}
// initialize a chunker for each threshold
threshold_window_levels = new int[thresholds_count];
threshold_window_heads = new int[thresholds_count];
threshold_content_lengths = new int[thresholds_count];
threshold_hashvalues = new uint32[thresholds_count];
threshold_windows = new unsigned char*[thresholds_count];
for (int threshold_index = 0; threshold_index < thresholds_count;
threshold_index++) {
threshold_window_levels[threshold_index] = 0;
threshold_window_heads[threshold_index] = 0;
threshold_content_lengths[threshold_index] = 0;
threshold_hashvalues[threshold_index] = 0;
threshold_windows[threshold_index] = new unsigned char[window_size];
}
}
~RabinKarpMultiThresholdHash() {
// clean up threshold-specific chunkers
delete[] threshold_window_levels;
delete[] threshold_window_heads;
delete[] threshold_content_lengths;
delete[] threshold_hashvalues;
for (int i = 0; i < thresholds_count; i++)
delete[] threshold_windows[i];
delete[] threshold_windows;
// clean up thresholds
free(thresholds);
}
// Computes multi-level chunk boundaries for `content`.  The returned list
// holds pairs of values pushed back in sequence: boundary position
// (i + 1), then the index of the most restrictive matching threshold.
std::list next_chunk_boundaries_with_thresholds(
std::string *content, unsigned int prepend_bytes) {
const char* content_str = content->c_str();
unsigned int len = content->length();
// prepend bytes as specified
for (int threshold_index = 0; threshold_index < thresholds_count;
++threshold_index)
for (unsigned int i = 0; i < prepend_bytes; ++i)
_update(0, threshold_hashvalues[threshold_index],
threshold_windows[threshold_index],
threshold_window_heads[threshold_index],
threshold_window_levels[threshold_index]);
// process content byte by byte
std::list boundaries;
for (unsigned int i = 0; i < len; ++i) {
// let current byte be processed by each required chunker
int new_least_restrictive_required_chunker_index = thresholds_count
- 1;
for (int threshold_index = thresholds_count - 1;
threshold_index >= least_restrictive_required_chunker_index;
--threshold_index) {
_update(content_str[i], threshold_hashvalues[threshold_index],
threshold_windows[threshold_index],
threshold_window_heads[threshold_index],
threshold_window_levels[threshold_index]);
threshold_content_lengths[threshold_index]++;
if (threshold_content_lengths[threshold_index] < window_size)
new_least_restrictive_required_chunker_index =
threshold_index;
}
least_restrictive_required_chunker_index =
new_least_restrictive_required_chunker_index;
/* assuming that thresholds are ordered from least restrictive to
* most restrictive, determine the most restrictive threshold that
* matches (if any) */
int matching_threshold_index = -1;
for (int threshold_index = 0; threshold_index < thresholds_count;
++threshold_index) {
int used_chunker_index = std::max(threshold_index,
least_restrictive_required_chunker_index);
/* thresholds are processed in this order since the majority of
* all positions will not match any threshold, allowing for an
* early break which is only possible when starting with the
* least restrictive threshold */
if (threshold_window_levels[used_chunker_index] == window_size
&& threshold_hashvalues[used_chunker_index]
< thresholds[threshold_index]) {
/* set matching threshold index, which will probably be
* overwritten by a higher (i.e., more restrictive threshold
* index in a subsequent iteration) */
matching_threshold_index = threshold_index;
} else {
/* if this threshold did not match and if it does not depend
* on any prepended zeros, none of the more restrictive
* thresholds will match */
if (threshold_content_lengths[used_chunker_index]
>= window_size)
break;
}
}
if (matching_threshold_index != -1) {
// add found boundary to list of boundaries
boundaries.push_back(i + 1);
boundaries.push_back(matching_threshold_index);
/* reset chunkers for lower-level nodes (i.e., chunkers with
* less restrictive thresholds) */
for (int j = 0; j < matching_threshold_index; ++j) {
for (unsigned int k = 0; k < prepend_bytes; ++k)
_update(0, threshold_hashvalues[j],
threshold_windows[j], threshold_window_heads[j],
threshold_window_levels[j]);
threshold_content_lengths[j] = 0;
}
/* some chunkers for nodes lower (i.e., chunkers with less
* restrictive thresholds) than
* least_restrictive_required_chunker_index are now used again
* due to the above-described reset, so we update their states
* accordingly */
for (int j = matching_threshold_index;
j < least_restrictive_required_chunker_index; ++j) {
threshold_hashvalues[j] =
threshold_hashvalues[least_restrictive_required_chunker_index];
std::memcpy(threshold_windows[j],
threshold_windows[least_restrictive_required_chunker_index],
window_size);
threshold_window_heads[j] =
threshold_window_heads[least_restrictive_required_chunker_index];
threshold_window_levels[j] =
threshold_window_levels[least_restrictive_required_chunker_index];
}
least_restrictive_required_chunker_index = 0;
}
}
// return boundaries list
return (boundaries);
}
private:
// Number of thresholds (= number of per-threshold chunker instances).
int thresholds_count;
// Absolute thresholds, ordered least- to most-restrictive (malloc'd).
uint32* thresholds;
// Per-threshold chunker state, each array indexed by threshold index:
int* threshold_window_levels;
int* threshold_window_heads;
int* threshold_content_lengths;
uint32* threshold_hashvalues;
unsigned char** threshold_windows;
/* OPTIMIZATION: If a chunker has processed at least window_size bytes of
* the content, all subsequent (i.e., more restrictive threshold) chunkers
* would have the same state. Thus, we save redundant executions by
* determining the least-restrictive chunker that is still required. */
int least_restrictive_required_chunker_index;
};
#endif
fastchunking-0.0.3/lib/rabinkarp_gen.py 0000664 0000000 0000000 00000002456 13050634271 0020130 0 ustar 00root root 0000000 0000000 import pybindgen
def generate(file_):
    """Write pybindgen-generated C++ binding code for ``_rabinkarprh`` to
    ``file_``.

    Exposes ``RabinKarpHash`` and ``RabinKarpMultiThresholdHash`` (declared
    in ``rabinkarp.h``) to Python.

    :param file_: writable file object receiving the generated C++ source.
    """
    mod = pybindgen.Module('_rabinkarprh')
    mod.add_include('"rabinkarp.h"')
    # Container typemaps for boundary-result lists and threshold lists.
    # NOTE(review): the template arguments below were stripped from the
    # extracted source (bare 'std::list'); restored here — a bare
    # 'std::list' is not a valid C++ type for pybindgen to marshal.
    mod.add_container('std::list<unsigned int>', 'unsigned int', 'list')
    mod.add_container('std::list<double>', 'double', 'list')

    # Single-threshold Rabin-Karp chunker.
    cls = mod.add_class('RabinKarpHash')
    cls.add_constructor([pybindgen.param('int', 'my_window_size'),
                         pybindgen.param('int', 'seed')])
    cls.add_method('set_threshold',
                   None,
                   [pybindgen.param('double', 'my_threshold')])
    cls.add_method('next_chunk_boundaries',
                   pybindgen.retval('std::list<unsigned int>'),
                   [pybindgen.param('std::string*', 'str'),
                    pybindgen.param('unsigned int', 'prepend_bytes')])

    # Multi-threshold (multi-level) chunker.
    cls = mod.add_class('RabinKarpMultiThresholdHash')
    cls.add_constructor([pybindgen.param('int', 'my_window_size'),
                         pybindgen.param('int', 'seed'),
                         pybindgen.param('std::list<double>', 'my_thresholds')])
    cls.add_method('next_chunk_boundaries_with_thresholds',
                   pybindgen.retval('std::list<unsigned int>'),
                   [pybindgen.param('std::string*', 'str'),
                    pybindgen.param('unsigned int', 'prepend_bytes')])
    mod.generate(file_)
fastchunking-0.0.3/requirements.txt 0000664 0000000 0000000 00000000104 13050634271 0017456 0 ustar 00root root 0000000 0000000 pytest>=2.9.2
pybindgen>=0.17.0
coverage>=4.1
pytest-cov>=2.2.1
fastchunking-0.0.3/setup.py 0000664 0000000 0000000 00000006167 13050634271 0015723 0 ustar 00root root 0000000 0000000 import codecs
import os
import re
import sys
from setuptools import setup, Extension
from setuptools.command.test import test as TestCommand
# Some general-purpose code stolen from
# https://github.com/jeffknupp/sandman/blob/5c4b7074e8ba5a60b00659760e222c57ad24ef91/setup.py
here = os.path.abspath(os.path.dirname(__file__))
class Tox(TestCommand):
    """setuptools ``test`` command that delegates the test run to tox."""

    def finalize_options(self):
        TestCommand.finalize_options(self)
        self.test_args = []
        self.test_suite = True

    def run_tests(self):
        # Imported lazily: tox lives inside the test environment's eggs and
        # is not importable at setup.py parse time.
        import tox
        sys.exit(tox.cmdline(self.test_args))
def read(*parts):
    """Return the text contents of the file at ``here``/``parts``.

    Intentionally does *not* pass an encoding to ``codecs.open`` so that
    the platform default is used (historical behaviour of this script).
    """
    # Use a context manager so the handle is closed deterministically
    # instead of leaking until garbage collection (original never closed it).
    with codecs.open(os.path.join(here, *parts), 'r') as fh:
        return fh.read()
def find_version(*file_paths):
    """Extract the ``__version__`` string from the given source file.

    :raises RuntimeError: if no ``__version__`` assignment is found.
    """
    contents = read(*file_paths)
    match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
                      contents, re.M)
    if not match:
        raise RuntimeError("Unable to find version string.")
    return match.group(1)
# Make sure build path exists.
build_path = os.path.join(here, 'build')
if not os.path.exists(build_path):
    os.mkdir(build_path)
# Generate Python bindings for the bundled C++ library.  pybindgen is only
# required at build time; metadata-only commands (e.g. egg_info) may run
# without it, hence the soft failure below.
module_fname = os.path.join(build_path, "rabinkarprh.cpp")
try:
    import pybindgen  # @UnusedImport
except ImportError:
    # BUGFIX: the original adjacent string literals lacked separating
    # spaces, producing "egg_info,this ... fail.You ..." in the output.
    print("WARNING: Failed to import pybindgen. If you called setup.py egg_info, "
          "this is probably acceptable; otherwise, build will fail. "
          "You can resolve this problem by installing pybindgen beforehand.")
else:
    with open(module_fname, "wt") as file_:
        print("Generating file {}".format(module_fname))
        from lib.rabinkarp_gen import generate
        generate(file_)
# Package metadata and build configuration.  The C++ extension is compiled
# from the pybindgen-generated bindings (module_fname, created above) plus
# the bundled rabinkarp sources.
setup(
name='fastchunking',
version=find_version('fastchunking', '__init__.py'),
description='Fast chunking library.',
long_description=read('README.rst'),
url='https://github.com/netleibi/fastchunking',
author='Dominik Leibenger',
author_email='python-fastchunking@mails.dominik-leibenger.de',
license='Apache Software License',
classifiers=[
'Development Status :: 2 - Pre-Alpha',
'Intended Audience :: Developers',
'Topic :: Software Development :: Libraries :: Python Modules',
'License :: OSI Approved :: Apache Software License',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.5'
],
keywords=['text chunking', 'SC', 'static chunking', 'CDC', 'content-defined chunking', 'ML-*', 'multi-level chunking', 'ML-SC', 'ML-CDC', 'Rabin Karp', 'rolling hash'],
# 'lib' is shipped too so rabinkarp_gen.py is importable at build time.
packages=['fastchunking', 'lib'],
setup_requires=['pybindgen'],
install_requires=['pybindgen'],
# Native extension: generated bindings + bundled C++ implementation.
ext_modules=[
Extension('fastchunking._rabinkarprh',
sources=[module_fname, 'lib/rabinkarp.cpp'],
include_dirs=['lib']
)
],
test_suite='fastchunking.test',
tests_require=['tox'],
# `python setup.py test` runs the suite through tox (see Tox class above).
cmdclass={'test': Tox}
)
fastchunking-0.0.3/tox.ini 0000664 0000000 0000000 00000000703 13050634271 0015512 0 ustar 00root root 0000000 0000000 [tox]
envlist = py35
[pytest]
python_files=fastchunking/test.py
testpaths=fastchunking
python_functions=test_
[testenv]
deps=-rrequirements.txt
setenv=
PYTHONWARNINGS=all
commands=
{envbindir}/python setup.py develop
py.test --cov=fastchunking
[testenv:py35]
# the latest pybindgen release on pip (0.17) is not yet compatible with Python 3.5
deps=git+https://github.com/gjcarneiro/pybindgen.git
pytest>=2.9.2
coverage>=4.1
pytest-cov>=2.2.1