pax_global_header00006660000000000000000000000064130557726560014532gustar00rootroot0000000000000052 comment=04ceb168ede7ec55daf48c81977a5dc4b369d958 joblib-0.11/000077500000000000000000000000001305577265600127145ustar00rootroot00000000000000joblib-0.11/.coveragerc000066400000000000000000000001001305577265600150240ustar00rootroot00000000000000[run] omit = joblib/test/data/* [report] show_missing = True joblib-0.11/.gitignore000066400000000000000000000007401305577265600147050ustar00rootroot00000000000000*.py[oc] *.so # setup.py working directory build # setup.py dist directory dist # Editor temporary/working/backup files *$ .*.sw[nop] .sw[nop] *~ [#]*# .#* *.bak *.tmp *.tgz *.rej *.org .project *.diff .settings/ *.svn/ # Egg metadata *.egg-info # The shelf plugin uses this dir ./.shelf # Some IDEs add this directory .idea # Mac droppings .DS_Store doc/documentation.zip doc/generated doc/CHANGES.rst doc/README.rst # Coverage report .coverage # pytest cache on failure .cache joblib-0.11/.mailmap000066400000000000000000000007231305577265600143370ustar00rootroot00000000000000Gael Varoquaux Gael Varoquaux Gael Varoquaux Gael varoquaux Gael Varoquaux GaelVaroquaux Gael Varoquaux Gael VAROQUAUX Gael Varoquaux gvaroquaux joblib-0.11/.readthedocs-requirements.txt000066400000000000000000000001371305577265600205420ustar00rootroot00000000000000# rtd comes with sphinx 1.3.5 and we need sphinx>=1.4 for sphinx.ext.imgmath sphinx>=1.4 numpy joblib-0.11/.travis.yml000066400000000000000000000026551305577265600150350ustar00rootroot00000000000000# make it explicit that we favor the new container-based travis workers sudo: false language: python env: matrix: - PYTHON_VERSION="2.7" NUMPY_VERSION="1.6" - PYTHON_VERSION="2.7" NUMPY_VERSION="1.7" COVERAGE="true" - PYTHON_VERSION="3.3" NUMPY_VERSION="1.8" - PYTHON_VERSION="3.4" NUMPY_VERSION="1.9" # NUMPY_VERSION not set means numpy is not installed - PYTHON_VERSION="3.4" COVERAGE="true" - PYTHON_VERSION="3.5" NUMPY_VERSION="1.10" - PYTHON_VERSION="3.6" NUMPY_VERSION="1.11" COVERAGE="true" BUILD_DOC="true" # multiprocesssing disabled via the JOBLIB_MULTIPROCESSING environment variable - PYTHON_VERSION="3.6" NUMPY_VERSION="1.11" JOBLIB_MULTIPROCESSING=0 COVERAGE="true" # flake8 linting on diff wrt common ancestor with upstream/master # flake8 is only available with python 3.5 at the moment. # flake8 version is temporarily set to 2.5.1 because the next version # available on conda (3.3.0) has a bug that checks non python file - SKIP_TESTS="true" FLAKE8_VERSION="2.5.1" PYTHON_VERSION="3.5" install: - source continuous_integration/travis/install.sh script: - source continuous_integration/travis/test_script.sh after_success: # Ignore codecov failures because we don't want travis to report a # failure in the github UI just because the coverage report failed # to be published. - if [[ "$COVERAGE" == "true" ]]; then codecov || echo "failed"; fi joblib-0.11/CHANGES.rst000066400000000000000000000430221305577265600145170ustar00rootroot00000000000000Latest changes =============== Release 0.11 ------------ Alexandre Abadie Remove support for python 2.6 Alexandre Abadie Remove deprecated `format_signature`, `format_call` and `load_output` functions from Memory API. Loïc Estève Add initial implementation of LRU cache cleaning. You can specify the size limit of a ``Memory`` object via the ``bytes_limit`` parameter and then need to clean explicitly the cache via the ``Memory.reduce_size`` method. Olivier Grisel Make the multiprocessing backend work even when the name of the main thread is not the Python default. Thanks to Roman Yurchak for the suggestion. Karan Desai pytest is used to run the tests instead of nosetests. ``python setup.py test`` or ``python setup.py nosetests`` do not work anymore, run ``pytest joblib`` instead. Loïc Estève An instance of ``joblib.ParallelBackendBase`` can be passed into the ``parallel`` argument in ``joblib.Parallel``. Loïc Estève Fix handling of memmap objects with offsets greater than mmap.ALLOCATIONGRANULARITY in ``joblib.Parrallel``. See https://github.com/joblib/joblib/issues/451 for more details. Loïc Estève Fix performance regression in ``joblib.Parallel`` with n_jobs=1. See https://github.com/joblib/joblib/issues/483 for more details. Loïc Estève Fix race condition when a function cached with ``joblib.Memory.cache`` was used inside a ``joblib.Parallel``. See https://github.com/joblib/joblib/issues/490 for more details. Release 0.10.3 -------------- Loïc Estève Fix tests when multiprocessing is disabled via the JOBLIB_MULTIPROCESSING environment variable. harishmk Remove warnings in nested Parallel objects when the inner Parallel has n_jobs=1. See https://github.com/joblib/joblib/pull/406 for more details. Release 0.10.2 -------------- Loïc Estève FIX a bug in stack formatting when the error happens in a compiled extension. See https://github.com/joblib/joblib/pull/382 for more details. Vincent Latrouite FIX a bug in the constructor of BinaryZlibFile that would throw an exception when passing unicode filename (Python 2 only). See https://github.com/joblib/joblib/pull/384 for more details. Olivier Grisel Expose :class:`joblib.parallel.ParallelBackendBase` and :class:`joblib.parallel.AutoBatchingMixin` in the public API to make them officially re-usable by backend implementers. Release 0.10.0 -------------- Alexandre Abadie ENH: joblib.dump/load now accept file-like objects besides filenames. https://github.com/joblib/joblib/pull/351 for more details. Niels Zeilemaker and Olivier Grisel Refactored joblib.Parallel to enable the registration of custom computational backends. https://github.com/joblib/joblib/pull/306 Note the API to register custom backends is considered experimental and subject to change without deprecation. Alexandre Abadie Joblib pickle format change: joblib.dump always create a single pickle file and joblib.dump/joblib.save never do any memory copy when writing/reading pickle files. Reading pickle files generated with joblib versions prior to 0.10 will be supported for a limited amount of time, we advise to regenerate them from scratch when convenient. joblib.dump and joblib.load also support pickle files compressed using various strategies: zlib, gzip, bz2, lzma and xz. Note that lzma and xz are only available with python >= 3.3. https://github.com/joblib/joblib/pull/260 for more details. Antony Lee ENH: joblib.dump/load now accept pathlib.Path objects as filenames. https://github.com/joblib/joblib/pull/316 for more details. Olivier Grisel Workaround for "WindowsError: [Error 5] Access is denied" when trying to terminate a multiprocessing pool under Windows: https://github.com/joblib/joblib/issues/354 Release 0.9.4 ------------- Olivier Grisel FIX a race condition that could cause a joblib.Parallel to hang when collecting the result of a job that triggers an exception. https://github.com/joblib/joblib/pull/296 Olivier Grisel FIX a bug that caused joblib.Parallel to wrongly reuse previously memmapped arrays instead of creating new temporary files. https://github.com/joblib/joblib/pull/294 for more details. Loïc Estève FIX for raising non inheritable exceptions in a Parallel call. See https://github.com/joblib/joblib/issues/269 for more details. Alexandre Abadie FIX joblib.hash error with mixed types sets and dicts containing mixed types keys when using Python 3. see https://github.com/joblib/joblib/issues/254 Loïc Estève FIX joblib.dump/load for big numpy arrays with dtype=object. See https://github.com/joblib/joblib/issues/220 for more details. Loïc Estève FIX joblib.Parallel hanging when used with an exhausted iterator. See https://github.com/joblib/joblib/issues/292 for more details. Release 0.9.3 ------------- Olivier Grisel Revert back to the ``fork`` start method (instead of ``forkserver``) as the latter was found to cause crashes in interactive Python sessions. Release 0.9.2 ------------- Loïc Estève Joblib hashing now uses the default pickle protocol (2 for Python 2 and 3 for Python 3). This makes it very unlikely to get the same hash for a given object under Python 2 and Python 3. In particular, for Python 3 users, this means that the output of joblib.hash changes when switching from joblib 0.8.4 to 0.9.2 . We strive to ensure that the output of joblib.hash does not change needlessly in future versions of joblib but this is not officially guaranteed. Loïc Estève Joblib pickles generated with Python 2 can not be loaded with Python 3 and the same applies for joblib pickles generated with Python 3 and loaded with Python 2. During the beta period 0.9.0b2 to 0.9.0b4, we experimented with a joblib serialization that aimed to make pickles serialized with Python 3 loadable under Python 2. Unfortunately this serialization strategy proved to be too fragile as far as the long-term maintenance was concerned (For example see https://github.com/joblib/joblib/pull/243). That means that joblib pickles generated with joblib 0.9.0bN can not be loaded under joblib 0.9.2. Joblib beta testers, who are the only ones likely to be affected by this, are advised to delete their joblib cache when they upgrade from 0.9.0bN to 0.9.2. Arthur Mensch Fixed a bug with ``joblib.hash`` that used to return unstable values for strings and numpy.dtype instances depending on interning states. Olivier Grisel Make joblib use the 'forkserver' start method by default under Python 3.4+ to avoid causing crash with 3rd party libraries (such as Apple vecLib / Accelerate or the GCC OpenMP runtime) that use an internal thread pool that is not not reinitialized when a ``fork`` system call happens. Olivier Grisel New context manager based API (``with`` block) to re-use the same pool of workers across consecutive parallel calls. Vlad Niculae and Olivier Grisel Automated batching of fast tasks into longer running jobs to hide multiprocessing dispatching overhead when possible. Olivier Grisel FIX make it possible to call ``joblib.load(filename, mmap_mode='r')`` on pickled objects that include a mix of arrays of both memory memmapable dtypes and object dtype. Release 0.8.4 ------------- 2014-11-20 Olivier Grisel OPTIM use the C-optimized pickler under Python 3 This makes it possible to efficiently process parallel jobs that deal with numerous Python objects such as large dictionaries. Release 0.8.3 ------------- 2014-08-19 Olivier Grisel FIX disable memmapping for object arrays 2014-08-07 Lars Buitinck MAINT NumPy 1.10-safe version comparisons 2014-07-11 Olivier Grisel FIX #146: Heisen test failure caused by thread-unsafe Python lists This fix uses a queue.Queue datastructure in the failing test. This datastructure is thread-safe thanks to an internal Lock. This Lock instance not picklable hence cause the picklability check of delayed to check fail. When using the threading backend, picklability is no longer required, hence this PRs give the user the ability to disable it on a case by case basis. Release 0.8.2 ------------- 2014-06-30 Olivier Grisel BUG: use mmap_mode='r' by default in Parallel and MemmapingPool The former default of mmap_mode='c' (copy-on-write) caused problematic use of the paging file under Windows. 2014-06-27 Olivier Grisel BUG: fix usage of the /dev/shm folder under Linux Release 0.8.1 ------------- 2014-05-29 Gael Varoquaux BUG: fix crash with high verbosity Release 0.8.0 ------------- 2014-05-14 Olivier Grisel Fix a bug in exception reporting under Python 3 2014-05-10 Olivier Grisel Fixed a potential segfault when passing non-contiguous memmap instances. 2014-04-22 Gael Varoquaux ENH: Make memory robust to modification of source files while the interpreter is running. Should lead to less spurious cache flushes and recomputations. 2014-02-24 Philippe Gervais New ``Memory.call_and_shelve`` API to handle memoized results by reference instead of by value. Release 0.8.0a3 --------------- 2014-01-10 Olivier Grisel & Gael Varoquaux FIX #105: Race condition in task iterable consumption when pre_dispatch != 'all' that could cause crash with error messages "Pools seems closed" and "ValueError: generator already executing". 2014-01-12 Olivier Grisel FIX #72: joblib cannot persist "output_dir" keyword argument. Release 0.8.0a2 --------------- 2013-12-23 Olivier Grisel ENH: set default value of Parallel's max_nbytes to 100MB Motivation: avoid introducing disk latency on medium sized parallel workload where memory usage is not an issue. FIX: properly handle the JOBLIB_MULTIPROCESSING env variable FIX: timeout test failures under windows Release 0.8.0a -------------- 2013-12-19 Olivier Grisel FIX: support the new Python 3.4 multiprocessing API 2013-12-05 Olivier Grisel ENH: make Memory respect mmap_mode at first call too ENH: add a threading based backend to Parallel This is low overhead alternative backend to the default multiprocessing backend that is suitable when calling compiled extensions that release the GIL. Author: Dan Stahlke Date: 2013-11-08 FIX: use safe_repr to print arg vals in trace This fixes a problem in which extremely long (and slow) stack traces would be produced when function parameters are large numpy arrays. 2013-09-10 Olivier Grisel ENH: limit memory copy with Parallel by leveraging numpy.memmap when possible Release 0.7.1 --------------- 2013-07-25 Gael Varoquaux MISC: capture meaningless argument (n_jobs=0) in Parallel 2013-07-09 Lars Buitinck ENH Handles tuples, sets and Python 3's dict_keys type the same as lists. in pre_dispatch 2013-05-23 Martin Luessi ENH: fix function caching for IPython Release 0.7.0 --------------- **This release drops support for Python 2.5 in favor of support for Python 3.0** 2013-02-13 Gael Varoquaux BUG: fix nasty hash collisions 2012-11-19 Gael Varoquaux ENH: Parallel: Turn of pre-dispatch for already expanded lists Gael Varoquaux 2012-11-19 ENH: detect recursive sub-process spawning, as when people do not protect the __main__ in scripts under Windows, and raise a useful error. Gael Varoquaux 2012-11-16 ENH: Full python 3 support Release 0.6.5 --------------- 2012-09-15 Yannick Schwartz BUG: make sure that sets and dictionnaries give reproducible hashes 2012-07-18 Marek Rudnicki BUG: make sure that object-dtype numpy array hash correctly 2012-07-12 GaelVaroquaux BUG: Bad default n_jobs for Parallel Release 0.6.4 --------------- 2012-05-07 Vlad Niculae ENH: controlled randomness in tests and doctest fix 2012-02-21 GaelVaroquaux ENH: add verbosity in memory 2012-02-21 GaelVaroquaux BUG: non-reproducible hashing: order of kwargs The ordering of a dictionnary is random. As a result the function hashing was not reproducible. Pretty hard to test Release 0.6.3 --------------- 2012-02-14 GaelVaroquaux BUG: fix joblib Memory pickling 2012-02-11 GaelVaroquaux BUG: fix hasher with Python 3 2012-02-09 GaelVaroquaux API: filter_args: `*args, **kwargs -> args, kwargs` Release 0.6.2 --------------- 2012-02-06 Gael Varoquaux BUG: make sure Memory pickles even if cachedir=None Release 0.6.1 --------------- Bugfix release because of a merge error in release 0.6.0 Release 0.6.0 --------------- **Beta 3** 2012-01-11 Gael Varoquaux BUG: ensure compatibility with old numpy DOC: update installation instructions BUG: file semantic to work under Windows 2012-01-10 Yaroslav Halchenko BUG: a fix toward 2.5 compatibility **Beta 2** 2012-01-07 Gael Varoquaux ENH: hash: bugware to be able to hash objects defined interactively in IPython 2012-01-07 Gael Varoquaux ENH: Parallel: warn and not fail for nested loops ENH: Parallel: n_jobs=-2 now uses all CPUs but one 2012-01-01 Juan Manuel Caicedo Carvajal and Gael Varoquaux ENH: add verbosity levels in Parallel Release 0.5.7 --------------- 2011-12-28 Gael varoquaux API: zipped -> compress 2011-12-26 Gael varoquaux ENH: Add a zipped option to Memory API: Memory no longer accepts save_npy 2011-12-22 Kenneth C. Arnold and Gael varoquaux BUG: fix numpy_pickle for array subclasses 2011-12-21 Gael varoquaux ENH: add zip-based pickling 2011-12-19 Fabian Pedregosa Py3k: compatibility fixes. This makes run fine the tests test_disk and test_parallel Release 0.5.6 --------------- 2011-12-11 Lars Buitinck ENH: Replace os.path.exists before makedirs with exception check New disk.mkdirp will fail with other errnos than EEXIST. 2011-12-10 Bala Subrahmanyam Varanasi MISC: pep8 compliant Release 0.5.5 --------------- 2011-19-10 Fabian Pedregosa ENH: Make joblib installable under Python 3.X Release 0.5.4 --------------- 2011-09-29 Jon Olav Vik BUG: Make mangling path to filename work on Windows 2011-09-25 Olivier Grisel FIX: doctest heisenfailure on execution time 2011-08-24 Ralf Gommers STY: PEP8 cleanup. Release 0.5.3 --------------- 2011-06-25 Gael varoquaux API: All the usefull symbols in the __init__ Release 0.5.2 --------------- 2011-06-25 Gael varoquaux ENH: Add cpu_count 2011-06-06 Gael varoquaux ENH: Make sure memory hash in a reproducible way Release 0.5.1 --------------- 2011-04-12 Gael varoquaux TEST: Better testing of parallel and pre_dispatch Yaroslav Halchenko 2011-04-12 DOC: quick pass over docs -- trailing spaces/spelling Yaroslav Halchenko 2011-04-11 ENH: JOBLIB_MULTIPROCESSING env var to disable multiprocessing from the environment Alexandre Gramfort 2011-04-08 ENH : adding log message to know how long it takes to load from disk the cache Release 0.5.0 --------------- 2011-04-01 Gael varoquaux BUG: pickling MemoizeFunc does not store timestamp 2011-03-31 Nicolas Pinto TEST: expose hashing bug with cached method 2011-03-26...2011-03-27 Pietro Berkes BUG: fix error management in rm_subdirs BUG: fix for race condition during tests in mem.clear() Gael varoquaux 2011-03-22...2011-03-26 TEST: Improve test coverage and robustness Gael varoquaux 2011-03-19 BUG: hashing functions with only \*var \**kwargs Gael varoquaux 2011-02-01... 2011-03-22 BUG: Many fixes to capture interprocess race condition when mem.cache is used by several processes on the same cache. Fabian Pedregosa 2011-02-28 First work on Py3K compatibility Gael varoquaux 2011-02-27 ENH: pre_dispatch in parallel: lazy generation of jobs in parallel for to avoid drowning memory. GaelVaroquaux 2011-02-24 ENH: Add the option of overloading the arguments of the mother 'Memory' object in the cache method that is doing the decoration. Gael varoquaux 2010-11-21 ENH: Add a verbosity level for more verbosity Release 0.4.6 ---------------- Gael varoquaux 2010-11-15 ENH: Deal with interruption in parallel Gael varoquaux 2010-11-13 BUG: Exceptions raised by Parallel when n_job=1 are no longer captured. Gael varoquaux 2010-11-13 BUG: Capture wrong arguments properly (better error message) Release 0.4.5 ---------------- Pietro Berkes 2010-09-04 BUG: Fix Windows peculiarities with path separators and file names BUG: Fix more windows locking bugs Gael varoquaux 2010-09-03 ENH: Make sure that exceptions raised in Parallel also inherit from the original exception class ENH: Add a shadow set of exceptions Fabian Pedregosa 2010-09-01 ENH: Clean up the code for parallel. Thanks to Fabian Pedregosa for the patch. Release 0.4.4 ---------------- Gael varoquaux 2010-08-23 BUG: Fix Parallel on computers with only one CPU, for n_jobs=-1. Gael varoquaux 2010-08-02 BUG: Fix setup.py for extra setuptools args. Gael varoquaux 2010-07-29 MISC: Silence tests (and hopefuly Yaroslav :P) Release 0.4.3 ---------------- Gael Varoquaux 2010-07-22 BUG: Fix hashing for function with a side effect modifying their input argument. Thanks to Pietro Berkes for reporting the bug and proving the patch. Release 0.4.2 ---------------- Gael Varoquaux 2010-07-16 BUG: Make sure that joblib still works with Python2.5. => release 0.4.2 Release 0.4.1 ---------------- joblib-0.11/LICENSE.txt000066400000000000000000000027671305577265600145530ustar00rootroot00000000000000BSD 3-Clause License Copyright (c) 2008-2016, The joblib developers. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. joblib-0.11/MANIFEST.in000066400000000000000000000002001305577265600144420ustar00rootroot00000000000000include *.txt *.py recursive-include joblib *.rst *.py graft doc graft doc/_static graft doc/_templates global-exclude *~ *.swp joblib-0.11/Makefile000066400000000000000000000001561305577265600143560ustar00rootroot00000000000000 all: test test: pytest joblib test-no-multiprocessing: export JOBLIB_MULTIPROCESSING=0 && pytest joblib joblib-0.11/README.rst000066400000000000000000000117471305577265600144150ustar00rootroot00000000000000|PyPi| |Travis| |AppVeyor| |Codecov| .. |Travis| image:: https://travis-ci.org/joblib/joblib.svg?branch=master :target: https://travis-ci.org/joblib/joblib :alt: Travis build status .. |AppVeyor| image:: https://ci.appveyor.com/api/projects/status/github/joblib/joblib?branch=master&svg=true :target: https://ci.appveyor.com/project/joblib-ci/joblib/history :alt: AppVeyor build status .. |Codecov| image:: https://codecov.io/gh/joblib/joblib/branch/master/graph/badge.svg :target: https://codecov.io/gh/joblib/joblib :alt: Codecov coverage .. |PyPi| image:: https://badge.fury.io/py/joblib.svg :target: https://badge.fury.io/py/joblib :alt: Joblib version The homepage of joblib with user documentation is located on: https://pythonhosted.org/joblib/ Getting the latest code ========================= To get the latest code using git, simply type:: git clone git://github.com/joblib/joblib.git If you don't have git installed, you can download a zip or tarball of the latest code: http://github.com/joblib/joblib/archives/master Installing ========================= You can use `pip` to install joblib:: pip install joblib from any directory or python setup.py install from the source directory. Joblib has no other mandatory dependency than Python (supported versions are 2.7+ and 3.3+). Numpy (at least version 1.6.1) is an optional dependency for array manipulation. Workflow to contribute ========================= To contribute to joblib, first create an account on `github `_. Once this is done, fork the `joblib repository `_ to have you own repository, clone it using 'git clone' on the computers where you want to work. Make your changes in your clone, push them to your github account, test them on several computers, and when you are happy with them, send a pull request to the main repository. Running the test suite ========================= To run the test suite, you need the pytest (version >= 3) and coverage modules. Run the test suite using:: pytest joblib from the root of the project. Building the docs ========================= To build the docs you need to have setuptools and sphinx (>=0.5) installed. Run the command:: python setup.py build_sphinx The docs are built in the build/sphinx/html directory. Making a source tarball ========================= To create a source tarball, eg for packaging or distributing, run the following command:: python setup.py sdist The tarball will be created in the `dist` directory. This command will compile the docs, and the resulting tarball can be installed with no extra dependencies than the Python standard library. You will need setuptool and sphinx. Making a release and uploading it to PyPI ================================================== This command is only run by project manager, to make a release, and upload in to PyPI:: python setup.py sdist bdist_wheel upload_docs --upload-dir build/sphinx/html twine upload dist/* Updating the changelog ======================== Changes are listed in the CHANGES.rst file. They must be manually updated but, the following git command may be used to generate the lines:: git log --abbrev-commit --date=short --no-merges --sparse Licensing ---------- joblib is **BSD-licenced** (3 clause): This software is OSI Certified Open Source Software. OSI Certified is a certification mark of the Open Source Initiative. Copyright (c) 2009-2011, joblib developpers All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Gael Varoquaux. nor the names of other joblib contributors may be used to endorse or promote products derived from this software without specific prior written permission. **This software is provided by the copyright holders and contributors "as is" and any express or implied warranties, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose are disclaimed. In no event shall the copyright owner or contributors be liable for any direct, indirect, incidental, special, exemplary, or consequential damages (including, but not limited to, procurement of substitute goods or services; loss of use, data, or profits; or business interruption) however caused and on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise) arising in any way out of the use of this software, even if advised of the possibility of such damage.** joblib-0.11/TODO.rst000066400000000000000000000032601305577265600142140ustar00rootroot00000000000000Tasks at hand on joblib, in increasing order of difficulty. * Add a changelog! * In parallel: need to deal with return arguments that don't pickle. * Improve test coverage and documentation * Store a repr of the arguments for each call in the corresponding cachedir * Try to use Mike McKerns's Dill pickling module in Parallel: Implementation idea: * Create a new function that is wrapped and takes Dillo pickles as inputs as output, feed this one to multiprocessing * pickle everything using Dill in the Parallel object. http://dev.danse.us/trac/pathos/browser/dill * Make a sensible error message when wrong keyword arguments are given, currently we have:: from joblib import Memory mem = Memory(cachedir='cache') def f(a=0, b=2): return a, b g = mem.cache(f) g(c=2) /home/varoquau/dev/joblib/joblib/func_inspect.pyc in filter_args(func, ignore_lst, *args, **kwargs), line 168 TypeError: Ignore list for diffusion_reorder() contains and unexpected keyword argument 'cachedir' * add a 'depends' keyword argument to memory.cache, to be able to specify that a function depends on other functions, and thus that the cache should be cleared. * add a 'argument_hash' keyword argument to Memory.cache, to be able to replace the hashing logic of memory for the input arguments. It should accept as an input the dictionnary of arguments, as returned in func_inspect, and return a string. * add a sqlite db for provenance tracking. Store computation time and usage timestamps, to be able to do 'garbage-collection-like' cleaning of unused results, based on a cost function balancing computation cost and frequency of use. joblib-0.11/appveyor.yml000066400000000000000000000024271305577265600153110ustar00rootroot00000000000000environment: # There is no need to run the build for all the Python version / # architectures combo as the generated joblib wheel is the same on all # platforms (universal wheel). # We run the tests on 2 different target platforms for testing purpose only. matrix: - PYTHON: "C:\\Python27" PYTHON_VERSION: "2.7.x" PYTHON_ARCH: "32" - PYTHON: "C:\\Python35-x64" PYTHON_VERSION: "3.5.x" PYTHON_ARCH: "64" install: # Install Python (from the official .msi of http://python.org) and pip when # not already installed. - powershell ./continuous_integration/appveyor/install.ps1 - SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH% # Install the build and runtime dependencies of the project. - pip install -r continuous_integration/appveyor/requirements.txt - python setup.py bdist_wheel - ps: ls dist # Install the generated wheel package to test it - pip install --pre --no-index --find-links dist/ joblib # Not a .NET project, we build in the install step instead build: false test_script: - powershell ./continuous_integration/appveyor/test.ps1 artifacts: # Archive the generated wheel package in the ci.appveyor.com build report. - path: dist\* on_success: # - TODO: upload the content of dist/*.whl to a public wheelhouse - codecov joblib-0.11/benchmarks/000077500000000000000000000000001305577265600150315ustar00rootroot00000000000000joblib-0.11/benchmarks/bench_auto_batching.py000066400000000000000000000103531305577265600213530ustar00rootroot00000000000000"""Benchmark batching="auto" on high number of fast tasks The goal of this script is to study the behavior of the batch_size='auto' and in particular the impact of the default value of the joblib.parallel.MIN_IDEAL_BATCH_DURATION constant. """ # Author: Olivier Grisel # License: BSD 3 clause import numpy as np import time import tempfile from pprint import pprint from joblib import Parallel, delayed from joblib._parallel_backends import AutoBatchingMixin def sleep_noop(duration, input_data, output_data_size): """Noop function to emulate real computation. Simulate CPU time with by sleeping duration. Induce overhead by accepting (and ignoring) any amount of data as input and allocating a requested amount of data. """ time.sleep(duration) if output_data_size: return np.ones(output_data_size, dtype=np.byte) def bench_short_tasks(task_times, n_jobs=2, batch_size="auto", pre_dispatch="2*n_jobs", verbose=True, input_data_size=0, output_data_size=0, backend=None, memmap_input=False): with tempfile.NamedTemporaryFile() as temp_file: if input_data_size: # Generate some input data with the required size if memmap_input: temp_file.close() input_data = np.memmap(temp_file.name, shape=input_data_size, dtype=np.byte, mode='w+') input_data[:] = 1 else: input_data = np.ones(input_data_size, dtype=np.byte) else: input_data = None t0 = time.time() p = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch, batch_size=batch_size, backend=backend) p(delayed(sleep_noop)(max(t, 0), input_data, output_data_size) for t in task_times) duration = time.time() - t0 effective_batch_size = getattr(p._backend, '_effective_batch_size', p.batch_size) print('Completed {} tasks in {:3f}s, final batch_size={}\n'.format( len(task_times), duration, effective_batch_size)) return duration, effective_batch_size if __name__ == "__main__": bench_parameters = dict( # batch_size=200, # batch_size='auto' by default # memmap_input=True, # if True manually memmap input out of timing # backend='threading', # backend='multiprocessing' by default # pre_dispatch='n_jobs', # pre_dispatch="2*n_jobs" by default input_data_size=int(2e7), # input data size in bytes output_data_size=int(1e5), # output data size in bytes n_jobs=2, verbose=10, ) print("Common benchmark parameters:") pprint(bench_parameters) AutoBatchingMixin.MIN_IDEAL_BATCH_DURATION = 0.2 AutoBatchingMixin.MAX_IDEAL_BATCH_DURATION = 2 # First pair of benchmarks to check that the auto-batching strategy is # stable (do not change the batch size too often) in the presence of large # variance while still be comparable to the equivalent load without # variance print('# high variance, no trend') # censored gaussian distribution high_variance = np.random.normal(loc=0.000001, scale=0.001, size=5000) high_variance[high_variance < 0] = 0 bench_short_tasks(high_variance, **bench_parameters) print('# low variance, no trend') low_variance = np.empty_like(high_variance) low_variance[:] = np.mean(high_variance) bench_short_tasks(low_variance, **bench_parameters) # Second pair of benchmarks: one has a cycling task duration pattern that # the auto batching feature should be able to roughly track. We use an even # power of cos to get only positive task durations with a majority close to # zero (only data transfer overhead). The shuffle variant should not # oscillate too much and still approximately have the same total run time. print('# cyclic trend') slow_time = 0.1 positive_wave = np.cos(np.linspace(1, 4 * np.pi, 300)) ** 8 cyclic = positive_wave * slow_time bench_short_tasks(cyclic, **bench_parameters) print("shuffling of the previous benchmark: same mean and variance") np.random.shuffle(cyclic) bench_short_tasks(cyclic, **bench_parameters) joblib-0.11/benchmarks/bench_compression.py000066400000000000000000000205621305577265600211100ustar00rootroot00000000000000"""Script comparing different pickling strategies.""" from joblib.numpy_pickle import NumpyPickler, NumpyUnpickler from joblib.numpy_pickle_utils import BinaryZlibFile, BinaryGzipFile from pickle import _Pickler, _Unpickler, Pickler, Unpickler import numpy as np import bz2 import lzma import time import io import sys import os from collections import OrderedDict def fileobj(obj, fname, mode, kwargs): """Create a file object.""" return obj(fname, mode, **kwargs) def bufferize(f, buf): """Bufferize a fileobject using buf.""" if buf is None: return f else: if (buf.__name__ == io.BufferedWriter.__name__ or buf.__name__ == io.BufferedReader.__name__): return buf(f, buffer_size=10 * 1024 ** 2) return buf(f) def _load(unpickler, fname, f): if unpickler.__name__ == NumpyUnpickler.__name__: p = unpickler(fname, f) else: p = unpickler(f) return p.load() def print_line(obj, strategy, buffer, pickler, dump, load, disk_used): """Nice printing function.""" print('% 20s | %6s | % 14s | % 7s | % 5.1f | % 5.1f | % 5s' % ( obj, strategy, buffer, pickler, dump, load, disk_used)) class PickleBufferedWriter(): """Protect the underlying fileobj against numerous calls to write This is achieved by internally keeping a list of small chunks and only flushing to the backing fileobj if passed a large chunk or after a threshold on the number of small chunks. """ def __init__(self, fileobj, max_buffer_size=10 * 1024 ** 2): self._fileobj = fileobj self._chunks = chunks = [] # As the `write` method is called many times by the pickler, # attribute look ups on the self's __dict__ are too expensive # hence we define a closure here with all the regularly # accessed parameters def _write(data): chunks.append(data) if len(chunks) > max_buffer_size: self.flush() self.write = _write def flush(self): self._fileobj.write(b''.join(self._chunks[:])) del self._chunks[:] def close(self): self.flush() self._fileobj.close() def __enter__(self): return self def __exit__(self, *exc): self.close() return False class PickleBufferedReader(): """Protect the underlying fileobj against numerous calls to write This is achieved by internally keeping a list of small chunks and only flushing to the backing fileobj if passed a large chunk or after a threshold on the number of small chunks. """ def __init__(self, fileobj, max_buffer_size=10 * 1024 ** 2): self._fileobj = fileobj self._buffer = bytearray(max_buffer_size) self.max_buffer_size = max_buffer_size self._position = 0 def read(self, n=None): data = b'' if n is None: data = self._fileobj.read() else: while len(data) < n: if self._position == 0: self._buffer = self._fileobj.read(self.max_buffer_size) elif self._position == self.max_buffer_size: self._position = 0 continue next_position = min(self.max_buffer_size, self._position + n - len(data)) data += self._buffer[self._position:next_position] self._position = next_position return data def readline(self): line = [] while True: c = self.read(1) line.append(c) if c == b'\n': break return b''.join(line) def close(self): self._fileobj.close() def __enter__(self): return self def __exit__(self, *exc): self.close() return False def run_bench(): print('% 20s | %10s | % 12s | % 8s | % 9s | % 9s | % 5s' % ( 'Object', 'Compression', 'Buffer', 'Pickler/Unpickler', 'dump time (s)', 'load time (s)', 'Disk used (MB)')) print("--- | --- | --- | --- | --- | --- | ---") for oname, obj in objects.items(): # Looping over the objects (array, dict, etc) if isinstance(obj, np.ndarray): osize = obj.nbytes / 1e6 else: osize = sys.getsizeof(obj) / 1e6 for cname, f in compressors.items(): fobj = f[0] fname = f[1] fmode = f[2] fopts = f[3] # Looping other defined compressors for bname, buf in bufs.items(): writebuf = buf[0] readbuf = buf[1] # Looping other picklers for pname, p in picklers.items(): pickler = p[0] unpickler = p[1] t0 = time.time() # Now pickling the object in the file if (writebuf is not None and writebuf.__name__ == io.BytesIO.__name__): b = writebuf() p = pickler(b) p.dump(obj) with fileobj(fobj, fname, fmode, fopts) as f: f.write(b.getvalue()) else: with bufferize(fileobj(fobj, fname, fmode, fopts), writebuf) as f: p = pickler(f) p.dump(obj) dtime = time.time() - t0 t0 = time.time() # Now loading the object from the file obj_r = None if (readbuf is not None and readbuf.__name__ == io.BytesIO.__name__): b = readbuf() with fileobj(fobj, fname, 'rb', {}) as f: b.write(f.read()) b.seek(0) obj_r = _load(unpickler, fname, b) else: with bufferize(fileobj(fobj, fname, 'rb', {}), readbuf) as f: obj_r = _load(unpickler, fname, f) ltime = time.time() - t0 if isinstance(obj, np.ndarray): assert((obj == obj_r).all()) else: assert(obj == obj_r) print_line("{} ({:.1f}MB)".format(oname, osize), cname, bname, pname, dtime, ltime, "{:.2f}".format(os.path.getsize(fname) / 1e6)) # Defining objects used in this bench DICT_SIZE = int(1e6) ARRAY_SIZE = int(1e7) arr = np.random.normal(size=(ARRAY_SIZE)) arr[::2] = 1 # Objects used for testing objects = OrderedDict([ ("dict", dict((i, str(i)) for i in range(DICT_SIZE))), ("list", [i for i in range(DICT_SIZE)]), ("array semi-random", arr), ("array random", np.random.normal(size=(ARRAY_SIZE))), ("array ones", np.ones((ARRAY_SIZE))), ]) #  We test 3 different picklers picklers = OrderedDict([ # Python implementation of Pickler/Unpickler ("Pickle", (_Pickler, _Unpickler)), # C implementation of Pickler/Unpickler ("cPickle", (Pickler, Unpickler)), # Joblib Pickler/Unpickler designed for numpy arrays. ("Joblib", (NumpyPickler, NumpyUnpickler)), ]) # The list of supported compressors used for testing compressors = OrderedDict([ ("No", (open, '/tmp/test_raw', 'wb', {})), ("Zlib", (BinaryZlibFile, '/tmp/test_zlib', 'wb', {'compresslevel': 3})), ("Gzip", (BinaryGzipFile, '/tmp/test_gzip', 'wb', {'compresslevel': 3})), ("Bz2", (bz2.BZ2File, '/tmp/test_bz2', 'wb', {'compresslevel': 3})), ("Xz", (lzma.LZMAFile, '/tmp/test_xz', 'wb', {'preset': 3, 'check': lzma.CHECK_NONE})), ("Lzma", (lzma.LZMAFile, '/tmp/test_lzma', 'wb', {'preset': 3, 'format': lzma.FORMAT_ALONE})), ]) # Test 3 buffering strategies bufs = OrderedDict([ ("None", (None, None)), ("io.BytesIO", (io.BytesIO, io.BytesIO)), ("io.Buffered", (io.BufferedWriter, io.BufferedReader)), ("PickleBuffered", (PickleBufferedWriter, PickleBufferedReader)), ]) if __name__ == "__main__": run_bench() joblib-0.11/benchmarks/bench_pickle.py000077500000000000000000000415241305577265600200220ustar00rootroot00000000000000""" Benching joblib pickle I/O. Warning: this is slow, and the benchs are easily offset by other disk activity. """ import os import time import shutil import numpy as np import joblib import gc from joblib.disk import disk_used try: from memory_profiler import memory_usage except ImportError: memory_usage = None def clear_out(): """Clear output directory.""" if os.path.exists('out'): shutil.rmtree('out') os.mkdir('out') def kill_disk_cache(): """Clear disk cache to avoid side effects.""" if os.name == 'posix' and os.uname()[0] == 'Linux': try: os.system('sudo sh -c "sync; echo 3 > /proc/sys/vm/drop_caches"') except IOError as e: if e.errno == 13: print('Please run me as root') else: raise else: # Write ~100M to the disk open('tmp', 'wb').write(np.random.random(2e7)) def delete_obj(obj): """Force destruction of an object.""" if obj is not None: del obj gc.collect() def memory_used(func, *args, **kwargs): """Compute memory usage of func.""" if memory_usage is None: return np.NaN gc.collect() mem_use = memory_usage((func, args, kwargs), interval=.001) return max(mem_use) - min(mem_use) def timeit(func, *args, **kwargs): """Compute the mean execution time of func based on 7 measures.""" times = [] tries = kwargs['tries'] kwargs.pop('tries') if tries > 1: tries += 2 for _ in range(tries): kill_disk_cache() t0 = time.time() out = func(*args, **kwargs) if 1: # Just time the function t1 = time.time() times.append(t1 - t0) else: # Compute a hash of the output, to estimate the time # necessary to access the elements: this is a better # estimate of the time to load with me mmapping. joblib.hash(out) t1 = time.time() joblib.hash(out) t2 = time.time() times.append(t2 - t0 - 2 * (t2 - t1)) times.sort() return np.mean(times[1:-1]) if tries > 1 else t1 - t0, out def generate_rand_dict(size, with_arrays=False, with_string=False, array_shape=(10, 10)): """Generate dictionary with random values from list of keys.""" ret = {} rnd = np.random.RandomState(0) randoms = rnd.random_sample((size)) for key, random in zip(range(size), randoms): if with_arrays: ret[str(key)] = rnd.random_sample(array_shape) elif with_string: ret[str(key)] = str(random) else: ret[str(key)] = int(random) return ret def generate_rand_list(size, with_arrays=False, with_string=False, array_shape=(10, 10)): """Generate list with random values from list of keys.""" ret = [] rnd = np.random.RandomState(0) for random in rnd.random_sample((size)): if with_arrays: ret.append(rnd.random_sample(array_shape)) elif with_string: ret.append(str(random)) else: ret.append(int(random)) return ret def print_line(dataset, strategy, write_time, read_time, mem_write, mem_read, disk_used): """Nice printing function.""" print('% 15s, %12s, % 6.3f, % 7.4f, % 9.1f, % 9.1f, % 5.1f' % ( dataset, strategy, write_time, read_time, mem_write, mem_read, disk_used)) def print_bench_summary(args): """Nice bench summary function.""" summary = """Benchmark summary: - Global values: . Joblib version: {} . Number of tries to compute mean execution time: {} . Compression levels : {} . Compression algorithm: {} . Memory map mode : {} . Bench nifti data : {} . Bench big array : {} . Bench 2 big arrays : {} . Bench big dictionary: {} . Bench array+dict : {} """.format(joblib.__version__, args.tries, ", ".join(map(str, args.compress)), "None" if not args.compress else args.compressor, args.mmap, args.nifti, args.array, args.arrays, args.dict, args.combo) if args.array: shape = tuple(args.shape) size = round(np.multiply.reduce(shape) * 8 / 1024 ** 2, 1) summary += """ - Big array: . shape: {} . size in memory: {} MB """.format(str(shape), size) if args.dict: summary += """ - Big dictionary: . number of keys: {} . value type: {} """.format(args.size, 'np.ndarray' if args.valuearray else 'str' if args.valuestring else 'int') if args.valuearray: summary += """ . arrays shape: {} """.format(str(tuple(args.valuearrayshape))) if args.list: summary += """ - Big list: . number of elements: {} . value type: {} """.format(args.size, 'np.ndarray' if args.valuearray else 'str' if args.valuestring else 'int') if args.valuearray: summary += """ . arrays shape: {} """.format(str(tuple(args.valuearrayshape))) print(summary) def bench_compress(dataset, name='', compress=('zlib', 0), cache_size=0, tries=5): """Bench joblib dump and load functions, compress modes.""" # generate output compression strategy string before joblib compatibility # check as it may override the compress variable with a non tuple type. compress_str = "Raw" if compress[1] == 0 else "{} {}".format(*compress) # joblib versions prior to 0.10 doesn't support tuple in compress argument # so only the second element of the tuple is used for those versions # and the compression strategy is ignored. if (isinstance(compress, tuple) and tuple(map(int, joblib.__version__.split('.')[:2])) < (0, 10)): compress = compress[1] time_write = time_read = du = mem_read = mem_write = [] clear_out() time_write, obj = timeit(joblib.dump, dataset, 'out/test.pkl', tries=tries, compress=compress, cache_size=cache_size) del obj gc.collect() mem_write = memory_used(joblib.dump, dataset, 'out/test.pkl', compress=compress, cache_size=cache_size) delete_obj(dataset) du = disk_used('out') / 1024. time_read, obj = timeit(joblib.load, 'out/test.pkl', tries=tries) delete_obj(obj) mem_read = memory_used(joblib.load, 'out/test.pkl') print_line(name, compress_str, time_write, time_read, mem_write, mem_read, du) def bench_mmap(dataset, name='', cache_size=0, mmap_mode='r', tries=5): """Bench joblib dump and load functions, memmap modes.""" time_write = time_read = du = [] clear_out() time_write, _ = timeit(joblib.dump, dataset, 'out/test.pkl', tries=tries, cache_size=cache_size) mem_write = memory_used(joblib.dump, dataset, 'out/test.pkl', cache_size=cache_size) delete_obj(dataset) time_read, obj = timeit(joblib.load, 'out/test.pkl', tries=tries, mmap_mode=mmap_mode) delete_obj(obj) mem_read = memory_used(joblib.load, 'out/test.pkl', mmap_mode=mmap_mode) du = disk_used('out') / 1024. print_line(name, 'mmap %s' % mmap_mode, time_write, time_read, mem_write, mem_read, du) def run_bench(func, obj, name, **kwargs): """Run the benchmark function.""" func(obj, name, **kwargs) def run(args): """Run the full bench suite.""" if args.summary: print_bench_summary(args) if (not args.nifti and not args.array and not args.arrays and not args.dict and not args.list and not args.combo): print("Nothing to bench. Exiting") return compress_levels = args.compress compress_method = args.compressor mmap_mode = args.mmap container_size = args.size a1_shape = tuple(args.shape) a2_shape = (10000000, ) print('% 15s, %12s, % 6s, % 7s, % 9s, % 9s, % 5s' % ( 'Dataset', 'strategy', 'write', 'read', 'mem_write', 'mem_read', 'disk')) if args.nifti: # Nifti images try: import nibabel except ImportError: print("nibabel is not installed skipping nifti file benchmark.") else: def load_nii(filename): img = nibabel.load(filename) return img.get_data(), img.get_affine() for name, nifti_file in ( ('MNI', '/usr/share/fsl/data/atlases' '/MNI/MNI-prob-1mm.nii.gz'), ('Juelich', '/usr/share/fsl/data/atlases' '/Juelich/Juelich-prob-2mm.nii.gz'), ): for c_order in (True, False): name_d = '% 5s(%s)' % (name, 'C' if c_order else 'F') for compress_level in compress_levels: d = load_nii(nifti_file) if c_order: d = (np.ascontiguousarray(d[0]), d[1]) run_bench(bench_compress, d, name_d, compress=(compress_method, compress_level), tries=args.tries) del d if not args.nommap: d = load_nii(nifti_file) if c_order: d = (np.ascontiguousarray(d[0]), d[1]) run_bench(bench_mmap, d, name_d, mmap_mode=mmap_mode, tries=args.tries) del d # Generate random seed rnd = np.random.RandomState(0) if args.array: # numpy array name = '% 5s' % 'Big array' for compress_level in compress_levels: a1 = rnd.random_sample(a1_shape) run_bench(bench_compress, a1, name, compress=(compress_method, compress_level), tries=args.tries) del a1 if not args.nommap: a1 = rnd.random_sample(a1_shape) run_bench(bench_mmap, a1, name, mmap_mode=mmap_mode, tries=args.tries) del a1 if args.arrays: # Complex object with 2 big arrays name = '% 5s' % '2 big arrays' for compress_level in compress_levels: obj = [rnd.random_sample(a1_shape), rnd.random_sample(a2_shape)] run_bench(bench_compress, obj, name, compress=(compress_method, compress_level), tries=args.tries) del obj if not args.nommap: obj = [rnd.random_sample(a1_shape), rnd.random_sample(a2_shape)] run_bench(bench_mmap, obj, name, mmap_mode=mmap_mode, tries=args.tries) del obj if args.dict: # Big dictionary name = '% 5s' % 'Big dict' array_shape = tuple(args.valuearrayshape) for compress_level in compress_levels: big_dict = generate_rand_dict(container_size, with_arrays=args.valuearray, with_string=args.valuestring, array_shape=array_shape) run_bench(bench_compress, big_dict, name, compress=(compress_method, compress_level), tries=args.tries) del big_dict if not args.nommap: big_dict = generate_rand_dict(container_size, with_arrays=args.valuearray, with_string=args.valuestring, array_shape=array_shape) run_bench(bench_mmap, big_dict, name, mmap_mode=mmap_mode, tries=args.tries) del big_dict if args.list: # Big dictionary name = '% 5s' % 'Big list' array_shape = tuple(args.valuearrayshape) for compress_level in compress_levels: big_list = generate_rand_list(container_size, with_arrays=args.valuearray, with_string=args.valuestring, array_shape=array_shape) run_bench(bench_compress, big_list, name, compress=(compress_method, compress_level), tries=args.tries) del big_list if not args.nommap: big_list = generate_rand_list(container_size, with_arrays=args.valuearray, with_string=args.valuestring, array_shape=array_shape) run_bench(bench_mmap, big_list, name, mmap_mode=mmap_mode, tries=args.tries) del big_list if args.combo: # 2 big arrays with one big dict name = '% 5s' % 'Dict/arrays' array_shape = tuple(args.valuearrayshape) for compress in compress_levels: obj = [rnd.random_sample(a1_shape), generate_rand_dict(container_size, with_arrays=args.valuearray, with_string=args.valuestring, array_shape=array_shape), rnd.random_sample(a2_shape)] run_bench(bench_compress, obj, name, compress=(compress_method, compress_level), tries=args.tries) del obj if not args.nommap: obj = [rnd.random_sample(a1_shape), generate_rand_dict(container_size, with_arrays=args.valuearray, with_string=args.valuestring, array_shape=array_shape), rnd.random_sample(a2_shape)] run_bench(bench_mmap, obj, name, mmap_mode=mmap_mode, tries=args.tries) del obj if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Joblib benchmark script") parser.add_argument('--compress', nargs='+', type=int, default=(0, 3), help="List of compress levels.") parser.add_argument('--compressor', type=str, default='zlib', choices=['zlib', 'gzip', 'bz2', 'xz', 'lzma'], help="Compression algorithm.") parser.add_argument('--mmap', type=str, default='r', choices=['r', 'r+', 'w+'], help="Memory map mode.") parser.add_argument('--tries', type=int, default=5, help="Number of tries to compute execution time" "mean on.") parser.add_argument('--shape', nargs='+', type=int, default=(10000, 10000), help="Big array shape.") parser.add_argument("-m", "--nommap", action="store_true", help="Don't bench memmap") parser.add_argument('--size', type=int, default=10000, help="Big dictionary size.") parser.add_argument('--valuearray', action="store_true", help="Use numpy arrays type in containers " "(list, dict)") parser.add_argument('--valuearrayshape', nargs='+', type=int, default=(10, 10), help="Shape of arrays in big containers.") parser.add_argument('--valuestring', action="store_true", help="Use string type in containers (list, dict).") parser.add_argument("-n", "--nifti", action="store_true", help="Benchmark Nifti data") parser.add_argument("-a", "--array", action="store_true", help="Benchmark single big numpy array") parser.add_argument("-A", "--arrays", action="store_true", help="Benchmark list of big numpy arrays") parser.add_argument("-d", "--dict", action="store_true", help="Benchmark big dictionary.") parser.add_argument("-l", "--list", action="store_true", help="Benchmark big list.") parser.add_argument("-c", "--combo", action="store_true", help="Benchmark big dictionary + list of " "big numpy arrays.") parser.add_argument("-s", "--summary", action="store_true", help="Show bench summary.") run(parser.parse_args()) joblib-0.11/continuous_integration/000077500000000000000000000000001305577265600175255ustar00rootroot00000000000000joblib-0.11/continuous_integration/appveyor/000077500000000000000000000000001305577265600213725ustar00rootroot00000000000000joblib-0.11/continuous_integration/appveyor/install.ps1000066400000000000000000000135431305577265600234730ustar00rootroot00000000000000# Sample script to install Python and pip under Windows # Authors: Olivier Grisel, Jonathan Helmus and Kyle Kastner # License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ $MINICONDA_URL = "http://repo.continuum.io/miniconda/" $BASE_URL = "https://www.python.org/ftp/python/" $GET_PIP_URL = "https://bootstrap.pypa.io/get-pip.py" $GET_PIP_PATH = "C:\get-pip.py" function DownloadPython ($python_version, $platform_suffix) { $webclient = New-Object System.Net.WebClient $filename = "python-" + $python_version + $platform_suffix + ".msi" $url = $BASE_URL + $python_version + "/" + $filename $basedir = $pwd.Path + "\" $filepath = $basedir + $filename if (Test-Path $filename) { Write-Host "Reusing" $filepath return $filepath } # Download and retry up to 3 times in case of network transient errors. Write-Host "Downloading" $filename "from" $url $retry_attempts = 2 for($i=0; $i -lt $retry_attempts; $i++){ try { $webclient.DownloadFile($url, $filepath) break } Catch [Exception]{ Start-Sleep 1 } } if (Test-Path $filepath) { Write-Host "File saved at" $filepath } else { # Retry once to get the error message if any at the last try $webclient.DownloadFile($url, $filepath) } return $filepath } function InstallPython ($python_version, $architecture, $python_home) { Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home if (Test-Path $python_home) { Write-Host $python_home "already exists, skipping." return $false } if ($architecture -eq "32") { $platform_suffix = "" } else { $platform_suffix = ".amd64" } $msipath = DownloadPython $python_version $platform_suffix Write-Host "Installing" $msipath "to" $python_home $install_log = $python_home + ".log" $install_args = "/qn /log $install_log /i $msipath TARGETDIR=$python_home" $uninstall_args = "/qn /x $msipath" RunCommand "msiexec.exe" $install_args if (-not(Test-Path $python_home)) { Write-Host "Python seems to be installed else-where, reinstalling." RunCommand "msiexec.exe" $uninstall_args RunCommand "msiexec.exe" $install_args } if (Test-Path $python_home) { Write-Host "Python $python_version ($architecture) installation complete" } else { Write-Host "Failed to install Python in $python_home" Get-Content -Path $install_log Exit 1 } } function RunCommand ($command, $command_args) { Write-Host $command $command_args Start-Process -FilePath $command -ArgumentList $command_args -Wait -Passthru } function InstallPip ($python_home) { $pip_path = $python_home + "\Scripts\pip.exe" $python_path = $python_home + "\python.exe" if (-not(Test-Path $pip_path)) { Write-Host "Installing pip..." $webclient = New-Object System.Net.WebClient $webclient.DownloadFile($GET_PIP_URL, $GET_PIP_PATH) Write-Host "Executing:" $python_path $GET_PIP_PATH Start-Process -FilePath "$python_path" -ArgumentList "$GET_PIP_PATH" -Wait -Passthru } else { Write-Host "pip already installed." } } function DownloadMiniconda ($python_version, $platform_suffix) { $webclient = New-Object System.Net.WebClient if ($python_version -eq "3.5") { $filename = "Miniconda3-3.5.5-Windows-" + $platform_suffix + ".exe" } else { $filename = "Miniconda-3.5.5-Windows-" + $platform_suffix + ".exe" } $url = $MINICONDA_URL + $filename $basedir = $pwd.Path + "\" $filepath = $basedir + $filename if (Test-Path $filename) { Write-Host "Reusing" $filepath return $filepath } # Download and retry up to 3 times in case of network transient errors. Write-Host "Downloading" $filename "from" $url $retry_attempts = 2 for($i=0; $i -lt $retry_attempts; $i++){ try { $webclient.DownloadFile($url, $filepath) break } Catch [Exception]{ Start-Sleep 1 } } if (Test-Path $filepath) { Write-Host "File saved at" $filepath } else { # Retry once to get the error message if any at the last try $webclient.DownloadFile($url, $filepath) } return $filepath } function InstallMiniconda ($python_version, $architecture, $python_home) { Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home if (Test-Path $python_home) { Write-Host $python_home "already exists, skipping." return $false } if ($architecture -eq "32") { $platform_suffix = "x86" } else { $platform_suffix = "x86_64" } $filepath = DownloadMiniconda $python_version $platform_suffix Write-Host "Installing" $filepath "to" $python_home $install_log = $python_home + ".log" $args = "/S /D=$python_home" Write-Host $filepath $args Start-Process -FilePath $filepath -ArgumentList $args -Wait -Passthru if (Test-Path $python_home) { Write-Host "Python $python_version ($architecture) installation complete" } else { Write-Host "Failed to install Python in $python_home" Get-Content -Path $install_log Exit 1 } } function InstallMinicondaPip ($python_home) { $pip_path = $python_home + "\Scripts\pip.exe" $conda_path = $python_home + "\Scripts\conda.exe" if (-not(Test-Path $pip_path)) { Write-Host "Installing pip..." $args = "install --yes pip" Write-Host $conda_path $args Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru } else { Write-Host "pip already installed." } } function main () { InstallPython $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON InstallPip $env:PYTHON } main joblib-0.11/continuous_integration/appveyor/requirements.txt000066400000000000000000000000461305577265600246560ustar00rootroot00000000000000numpy wheel pytest pytest-cov codecov joblib-0.11/continuous_integration/appveyor/test.ps1000066400000000000000000000005611305577265600230000ustar00rootroot00000000000000$installed_joblib_folder = $(python -c "import os; os.chdir('c:/'); import joblib;\ print(os.path.dirname(joblib.__file__))") echo "joblib found in: $installed_joblib_folder" # --pyargs argument is used to make sure we run the tests on the # installed package rather than on the local folder pytest --pyargs joblib --cov $installed_joblib_folder exit $LastExitCode joblib-0.11/continuous_integration/travis/000077500000000000000000000000001305577265600210355ustar00rootroot00000000000000joblib-0.11/continuous_integration/travis/flake8_diff.sh000077500000000000000000000116231305577265600235410ustar00rootroot00000000000000#!/bin/bash # This script is used in Travis to check that PRs do not add obvious # flake8 violations. It relies on two things: # - computing a similar diff to what github is showing in a PR. The # diff is done between: # 1. the common ancestor of the local branch and the # joblib/joblib remote # 2. the local branch # - run flake8 --diff on the computed diff # # Additional features: # - the line numbers in Travis match the local branch on the PR # author machine. # - bash continuous_integration/travis/flake8_diff.sh can be run # locally for quick turn-around set -e # pipefail is necessary to propagate exit codes set -o pipefail PROJECT=joblib/joblib PROJECT_URL=https://github.com/$PROJECT.git # Find the remote with the project name (upstream in most cases) REMOTE=$(git remote -v | grep $PROJECT | cut -f1 | head -1 || echo '') # Add a temporary remote if needed. For example this is necessary when # Travis is configured to run in a fork. In this case 'origin' is the # fork and not the reference repo we want to diff against. if [[ -z "$REMOTE" ]]; then TMP_REMOTE=tmp_reference_upstream REMOTE=$TMP_REMOTE git remote add $REMOTE $PROJECT_URL fi echo "Remotes:" echo '--------------------------------------------------------------------------------' git remote --verbose # Travis does the git clone with a limited depth (50 at the time of # writing). This may not be enough to find the common ancestor with # $REMOTE/master so we unshallow the git checkout if [[ -a .git/shallow ]]; then echo -e '\nTrying to unshallow the repo:' echo '--------------------------------------------------------------------------------' git fetch --unshallow fi if [[ "$TRAVIS" == "true" ]]; then if [[ "$TRAVIS_PULL_REQUEST" == "false" ]] then # In main repo, using TRAVIS_COMMIT_RANGE to test the commits # that were pushed into a branch if [[ "$PROJECT" == "$TRAVIS_REPO_SLUG" ]]; then if [[ -z "$TRAVIS_COMMIT_RANGE" ]]; then echo "New branch, no commit range from Travis so passing this test by convention" exit 0 fi COMMIT_RANGE=$TRAVIS_COMMIT_RANGE fi else # We want to fetch the code as it is in the PR branch and not # the result of the merge into master. This way line numbers # reported by Travis will match with the local code. LOCAL_BRANCH_REF=travis_pr_$TRAVIS_PULL_REQUEST # In Travis the PR target is always origin git fetch origin pull/$TRAVIS_PULL_REQUEST/head:refs/$LOCAL_BRANCH_REF fi fi # If not using the commit range from Travis we need to find the common # ancestor between $LOCAL_BRANCH_REF and $REMOTE/master if [[ -z "$COMMIT_RANGE" ]]; then if [[ -z "$LOCAL_BRANCH_REF" ]]; then LOCAL_BRANCH_REF=$(git rev-parse --abbrev-ref HEAD) fi echo -e "\nLast 2 commits in $LOCAL_BRANCH_REF:" echo '--------------------------------------------------------------------------------' git log -2 $LOCAL_BRANCH_REF REMOTE_MASTER_REF="$REMOTE/master" # Make sure that $REMOTE_MASTER_REF is a valid reference echo -e "\nFetching $REMOTE_MASTER_REF" echo '--------------------------------------------------------------------------------' git fetch $REMOTE master:refs/remotes/$REMOTE_MASTER_REF LOCAL_BRANCH_SHORT_HASH=$(git rev-parse --short $LOCAL_BRANCH_REF) REMOTE_MASTER_SHORT_HASH=$(git rev-parse --short $REMOTE_MASTER_REF) # Very confusing: need to use '..' i.e. two dots for 'git # rev-list' but '...' i.e. three dots for 'git diff' DIFF_RANGE="$REMOTE_MASTER_SHORT_HASH...$LOCAL_BRANCH_SHORT_HASH" REV_RANGE="$REMOTE_MASTER_SHORT_HASH..$LOCAL_BRANCH_SHORT_HASH" echo -e '\nRunning flake8 on the diff in the range'\ "$DIFF_RANGE ($(git rev-list $REV_RANGE | wc -l) commit(s)):" echo '--------------------------------------------------------------------------------' if [[ -n "$TMP_REMOTE" ]]; then git remote remove $TMP_REMOTE fi else echo "Got the commit range from Travis: $COMMIT_RANGE" fi # We ignore files from doc/sphintext. Unfortunately there is no # way to do it with flake8 directly (the --exclude does not seem to # work with --diff). We could use the exclude magic in the git pathspec # ':!doc/sphintext' but it is only available on git 1.9 and Travis # uses git 1.8. # We need the following command to exit with 0 hence the echo in case # there is no match MODIFIED_FILES=$(git diff --name-only $DIFF_RANGE | \ grep -v 'doc/sphinxext' || echo "no_match") if [[ "$MODIFIED_FILES" == "no_match" ]]; then echo "No file outside doc/sphinxext has been modified" else # Conservative approach: diff without context so that code that # was not changed does not create failures git diff --unified=0 $DIFF_RANGE -- $MODIFIED_FILES | flake8 --diff --show-source fi echo -e "No problem detected by flake8\n" joblib-0.11/continuous_integration/travis/install.sh000077500000000000000000000062351305577265600230500ustar00rootroot00000000000000#!/bin/bash # This script is meant to be called by the "install" step defined in # .travis.yml. See http://docs.travis-ci.com/ for more details. # The behavior of the script is controlled by environment variabled defined # in the .travis.yml in the top level folder of the project. # # This script is adapted from a similar script from the scikit-learn repository. # # License: 3-clause BSD set -e print_conda_requirements() { # Echo a conda requirement string for example # "pip python=2.7.3 scikit-learn=*". It has a hardcoded # list of possible packages to install and looks at _VERSION # environment variables to know whether to install a given package and # if yes which version to install. For example: # - for numpy, NUMPY_VERSION is used # - for scikit-learn, SCIKIT_LEARN_VERSION is used TO_INSTALL_ALWAYS="pip" REQUIREMENTS="$TO_INSTALL_ALWAYS" TO_INSTALL_MAYBE="python numpy flake8" for PACKAGE in $TO_INSTALL_MAYBE; do # Capitalize package name and add _VERSION PACKAGE_VERSION_VARNAME="${PACKAGE^^}_VERSION" # replace - by _, needed for scikit-learn for example PACKAGE_VERSION_VARNAME="${PACKAGE_VERSION_VARNAME//-/_}" # dereference $PACKAGE_VERSION_VARNAME to figure out the # version to install PACKAGE_VERSION="${!PACKAGE_VERSION_VARNAME}" if [ -n "$PACKAGE_VERSION" ]; then REQUIREMENTS="$REQUIREMENTS $PACKAGE=$PACKAGE_VERSION" fi done echo $REQUIREMENTS } create_new_conda_env() { # Deactivate the travis-provided virtual environment and setup a # conda-based environment instead deactivate # Use the miniconda installer for faster download / install of conda # itself wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh \ -O miniconda.sh chmod +x miniconda.sh && ./miniconda.sh -b export PATH=/home/travis/miniconda2/bin:$PATH conda update --yes conda # Configure the conda environment and put it in the path using the # provided versions REQUIREMENTS=$(print_conda_requirements) echo "conda requirements string: $REQUIREMENTS" conda create -n testenv --yes $REQUIREMENTS source activate testenv if [[ "$INSTALL_MKL" == "true" ]]; then # Make sure that MKL is used conda install --yes mkl else # Make sure that MKL is not used conda remove --yes --features mkl || echo "MKL not installed" fi # Install pytest with pip to make sure we have pytest >= 3 because # conda has some outdated dependencies for Python 3.3. This can be # removed and pytest can be install through conda when we drop # support for Python 3.3 pip install pytest } create_new_conda_env if [ -n "$NUMPY_VERSION" ]; then # We want to ensure no memory copies are performed only when numpy is # installed. This also ensures that we don't keep a strong dependency on # memory_profiler. pip install memory_profiler fi if [[ "$COVERAGE" == "true" ]]; then pip install pytest-cov codecov fi if [[ "$BUILD_DOC" == "true" ]]; then conda install sphinx --yes python setup.py build_sphinx fi python setup.py install joblib-0.11/continuous_integration/travis/test_script.sh000077500000000000000000000012611305577265600237370ustar00rootroot00000000000000#!/bin/sh set -e if [[ -n "$FLAKE8_VERSION" ]]; then source continuous_integration/travis/flake8_diff.sh fi if [[ "$SKIP_TESTS" != "true" ]]; then if [ -z "$NUMPY_VERSION" ]; then # We want to disable doctests because they need numpy to # run. I could not find a way to override the # --doctest-modules in setup.cfg so we remove the # doctest-related lines in setup.cfg instead sed -i '/--doctest/d' setup.cfg fi if [ "$COVERAGE" == "true" ]; then # Add coverage option to setup.cfg file if current test run # has to generate report for codecov ... export PYTEST_ADDOPTS="--cov=joblib" fi make fi joblib-0.11/doc/000077500000000000000000000000001305577265600134615ustar00rootroot00000000000000joblib-0.11/doc/__init__.py000066400000000000000000000001371305577265600155730ustar00rootroot00000000000000""" This is a phony __init__.py file, so that pytest finds the doctests in this directory. """ joblib-0.11/doc/_templates/000077500000000000000000000000001305577265600156165ustar00rootroot00000000000000joblib-0.11/doc/_templates/layout.html000066400000000000000000000012501305577265600200170ustar00rootroot00000000000000{% extends '!layout.html' %} {%- if pagename == 'index' %} {% set title = 'Joblib: running Python functions as pipeline jobs' %} {%- endif %} {%- block sidebarsourcelink %} {% endblock %} {%- block sidebarsearch %}
{{ super() }}

Mailing list

joblib@librelist.com

Send an email to subscribe

{%- if show_source and has_source and sourcename %}
{{ _('Show this page source') }} {%- endif %} {% endblock %} joblib-0.11/doc/conf.py000066400000000000000000000164261305577265600147710ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # joblib documentation build configuration file, created by # sphinx-quickstart on Thu Oct 23 16:36:51 2008. # # This file is execfile()d with the current directory set to its # containing dir. # # The contents of this file are pickled, so don't put values in the # namespace that aren't pickleable (module imports are okay, # they're removed automatically). # # All configuration values have a default; values that are commented out # serve to show the default. import sys import os import joblib # If your extensions are in another directory, add it here. If the directory # is relative to the documentation root, use os.path.abspath to make it # absolute, like shown here. #sys.path.append(os.path.abspath('.')) sys.path.append(os.path.abspath('./sphinxext')) # General configuration # --------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = ['sphinx.ext.autodoc', 'sphinx.ext.imgmath', 'numpydoc', 'sphinx.ext.autosummary', 'sphinx.ext.coverage'] autosummary_generate = True # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix of source filenames. source_suffix = '.rst' # The encoding of source files. #source_encoding = 'utf-8' # The master toctree document. master_doc = 'index' # General information about the project. project = 'joblib' copyright = '2008-2009, Gael Varoquaux' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. version = joblib.__version__ # The full version, including alpha/beta/rc tags. release = version # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. #language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: #today = '' # Else, today_fmt is used as the format for a strftime call. #today_fmt = '%B %d, %Y' # List of documents that shouldn't be included in the build. #unused_docs = [] # List of directories, relative to source directory, that shouldn't be searched # for source files. exclude_trees = [] # The reST default role (used for this markup: `text`) to use for all # documents. #default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. #add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). #add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. #show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # Avoid '+DOCTEST...' comments in the docs trim_doctest_flags = True # Options for HTML output # ----------------------- # The style sheet to use for HTML and HTML Help pages. A file of that name # must exist either in Sphinx' static/ path, or in one of the custom paths # given in html_static_path. #html_style = 'default.css' # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". #html_title = None # A shorter title for the navigation bar. Default is the same as html_title. #html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. #html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. #html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". # html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. #html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. #html_use_smartypants = True # Custom sidebar templates, maps document names to template names. #html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. #html_additional_pages = {} # If false, no module index is generated. #html_use_modindex = True # If false, no index is generated. #html_use_index = True # If true, the index is split into individual pages for each letter. #html_split_index = False # If true, the reST sources are included in the HTML build as _sources/. #html_copy_source = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. #html_use_opensearch = '' # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). #html_file_suffix = '' # Output file base name for HTML help builder. htmlhelp_basename = 'joblibdoc' # Options for LaTeX output # ------------------------ # The paper size ('letter' or 'a4'). #latex_paper_size = 'letter' # The font size ('10pt', '11pt' or '12pt'). #latex_font_size = '10pt' # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, # document class [howto/manual]). latex_documents = [ ('index', 'joblib.tex', 'joblib Documentation', 'Gael Varoquaux', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of # the title page. #latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. #latex_use_parts = False # Additional stuff for the LaTeX preamble. #latex_preamble = '' # Documents to append as an appendix to all manuals. #latex_appendices = [] # If false, no module index is generated. #latex_use_modindex = True # default is used to be compatible with both sphinx 1.2.3 and sphinx # 1.3.1. If we want to support only 1.3.1 'classic' can be used # instead html_theme = 'default' html_theme_options = { # "bgcolor": "#fff", # "footertextcolor": "#666", "relbarbgcolor": "#333", # "relbarlinkcolor": "#445481", # "relbartextcolor": "#445481", "sidebarlinkcolor": "#e15617", "sidebarbgcolor": "#000", # "sidebartextcolor": "#333", "footerbgcolor": "#111", "linkcolor": "#aa560c", # "bodyfont": '"Lucida Grande",Verdana,Lucida,Helvetica,Arial,sans-serif', # "headfont": "georgia, 'bitstream vera sans serif', 'lucida grande', # helvetica, verdana, sans-serif", # "headbgcolor": "#F5F5F5", "headtextcolor": "#643200", "codebgcolor": "#f5efe7", } ############################################################################## # Hack to copy the CHANGES.rst file import shutil try: shutil.copyfile('../CHANGES.rst', 'CHANGES.rst') shutil.copyfile('../README.rst', 'README.rst') except IOError: pass # This fails during the tesing, as the code is ran in a different # directory numpydoc_show_class_members = False suppress_warnings = ['image.nonlocal_uri'] joblib-0.11/doc/conftest.py000066400000000000000000000011211305577265600156530ustar00rootroot00000000000000from joblib.parallel import mp from joblib.test.common import np, setup_autokill, teardown_autokill from joblib.testing import skipif, fixture @fixture(scope='module') @skipif(np is None or mp is None, 'Numpy or Multiprocessing not available') def parallel_numpy_fixture(request): """Fixture to skip memmaping test if numpy is not installed""" def setup(module): setup_autokill(module.__name__, timeout=300) def teardown(): teardown_autokill(module.__name__) request.addfinalizer(teardown) return parallel_numpy_fixture return setup joblib-0.11/doc/developing.rst000066400000000000000000000001411305577265600163430ustar00rootroot00000000000000 =============== Development =============== .. include:: README.rst .. include:: CHANGES.rst joblib-0.11/doc/index.rst000066400000000000000000000016111305577265600153210ustar00rootroot00000000000000.. raw:: html .. raw:: html

Joblib: running Python functions as pipeline jobs

Introduction ------------ .. automodule:: joblib User manual -------------- .. toctree:: :maxdepth: 2 why.rst installing.rst memory.rst parallel.rst persistence.rst developing.rst Module reference ----------------- .. currentmodule:: joblib .. autosummary:: :toctree: generated Memory Parallel dump load hash joblib-0.11/doc/installing.rst000066400000000000000000000034621305577265600163640ustar00rootroot00000000000000Installing joblib =================== Using `pip` ------------ You can use `pip` to install joblib:: * For installing for all users, you need to run:: pip install joblib You may need to run the above command as administrator On a unix environment, it is better to install outside of the hierarchy managed by the system:: pip install --prefix /usr/local joblib * Installing only for a specific user is easy if you use Python 2.7 or above:: pip install --user joblib Using distributions -------------------- Joblib is packaged for several linux distribution: archlinux, debian, ubuntu, altlinux, and fedora. For minimum administration overhead, using the package manager is the recommended installation strategy on these systems. The manual way --------------- To install joblib first download the latest tarball (follow the link on the bottom of http://pypi.python.org/pypi/joblib) and expand it. Installing in a local environment .................................. If you don't need to install for all users, we strongly suggest that you create a local environment and install `joblib` in it. One of the pros of this method is that you never have to become administrator, and thus all the changes are local to your account and easy to clean up. Simply move to the directory created by expanding the `joblib` tarball and run the following command:: python setup.py install --user Installing for all users ........................ If you have administrator rights and want to install for all users, all you need to do is to go in directory created by expanding the `joblib` tarball and run the following line:: python setup.py install If you are under Unix, we suggest that you install in '/usr/local' in order not to interfere with your system:: python setup.py install --prefix /usr/local joblib-0.11/doc/memory.rst000066400000000000000000000324441305577265600155320ustar00rootroot00000000000000.. For doctests: >>> from joblib.testing import warnings_to_stdout >>> warnings_to_stdout() .. _memory: =========================================== On demand recomputing: the `Memory` class =========================================== .. currentmodule:: joblib.memory Usecase -------- The `Memory` class defines a context for lazy evaluation of function, by storing the results to the disk, and not rerunning the function twice for the same arguments. .. Commented out in favor of briefness You can use it as a context, with its `eval` method: .. automethod:: Memory.eval or decorate functions with the `cache` method: .. automethod:: Memory.cache It works by explicitly saving the output to a file and it is designed to work with non-hashable and potentially large input and output data types such as numpy arrays. A simple example: ~~~~~~~~~~~~~~~~~ First we create a temporary directory, for the cache:: >>> from tempfile import mkdtemp >>> cachedir = mkdtemp() We can instantiate a memory context, using this cache directory:: >>> from joblib import Memory >>> memory = Memory(cachedir=cachedir, verbose=0) Then we can decorate a function to be cached in this context:: >>> @memory.cache ... def f(x): ... print('Running f(%s)' % x) ... return x When we call this function twice with the same argument, it does not get executed the second time, and the output gets loaded from the pickle file:: >>> print(f(1)) Running f(1) 1 >>> print(f(1)) 1 However, when we call it a third time, with a different argument, the output gets recomputed:: >>> print(f(2)) Running f(2) 2 Comparison with `memoize` ~~~~~~~~~~~~~~~~~~~~~~~~~ The `memoize` decorator (http://code.activestate.com/recipes/52201/) caches in memory all the inputs and outputs of a function call. It can thus avoid running twice the same function, with a very small overhead. However, it compares input objects with those in cache on each call. As a result, for big objects there is a huge overhead. Moreover this approach does not work with numpy arrays, or other objects subject to non-significant fluctuations. Finally, using `memoize` with large objects will consume all the memory, where with `Memory`, objects are persisted to disk, using a persister optimized for speed and memory usage (:func:`joblib.dump`). In short, `memoize` is best suited for functions with "small" input and output objects, whereas `Memory` is best suited for functions with complex input and output objects, and aggressive persistence to disk. Using with `numpy` ------------------- The original motivation behind the `Memory` context was to be able to a memoize-like pattern on numpy arrays. `Memory` uses fast cryptographic hashing of the input arguments to check if they have been computed; An example ~~~~~~~~~~~ We define two functions, the first with a number as an argument, outputting an array, used by the second one. We decorate both functions with `Memory.cache`:: >>> import numpy as np >>> @memory.cache ... def g(x): ... print('A long-running calculation, with parameter %s' % x) ... return np.hamming(x) >>> @memory.cache ... def h(x): ... print('A second long-running calculation, using g(x)') ... return np.vander(x) If we call the function h with the array created by the same call to g, h is not re-run:: >>> a = g(3) A long-running calculation, with parameter 3 >>> a array([ 0.08, 1. , 0.08]) >>> g(3) array([ 0.08, 1. , 0.08]) >>> b = h(a) A second long-running calculation, using g(x) >>> b2 = h(a) >>> b2 array([[ 0.0064, 0.08 , 1. ], [ 1. , 1. , 1. ], [ 0.0064, 0.08 , 1. ]]) >>> np.allclose(b, b2) True Using memmapping ~~~~~~~~~~~~~~~~ To speed up cache looking of large numpy arrays, you can load them using memmapping (memory mapping):: >>> cachedir2 = mkdtemp() >>> memory2 = Memory(cachedir=cachedir2, mmap_mode='r') >>> square = memory2.cache(np.square) >>> a = np.vander(np.arange(3)).astype(np.float) >>> square(a) ________________________________________________________________________________ [Memory] Calling square... square(array([[ 0., 0., 1.], [ 1., 1., 1.], [ 4., 2., 1.]])) ___________________________________________________________square - 0.0s, 0.0min memmap([[ 0., 0., 1.], [ 1., 1., 1.], [ 16., 4., 1.]]) .. note:: Notice the debug mode used in the above example. It is useful for tracing of what is being reexecuted, and where the time is spent. If the `square` function is called with the same input argument, its return value is loaded from the disk using memmapping:: >>> res = square(a) >>> print(repr(res)) memmap([[ 0., 0., 1.], [ 1., 1., 1.], [ 16., 4., 1.]]) .. We need to close the memmap file to avoid file locking on Windows; closing numpy.memmap objects is done with del, which flushes changes to the disk >>> del res .. note:: If the memory mapping mode used was 'r', as in the above example, the array will be read only, and will be impossible to modified in place. On the other hand, using 'r+' or 'w+' will enable modification of the array, but will propagate these modification to the disk, which will corrupt the cache. If you want modification of the array in memory, we suggest you use the 'c' mode: copy on write. Shelving: using references to cached values ------------------------------------------- In some cases, it can be useful to get a reference to the cached result, instead of having the result itself. A typical example of this is when a lot of large numpy arrays must be dispatched accross several workers: instead of sending the data themselves over the network, send a reference to the joblib cache, and let the workers read the data from a network filesystem, potentially taking advantage of some system-level caching too. Getting a reference to the cache can be done using the `call_and_shelve` method on the wrapped function:: >>> result = g.call_and_shelve(4) A long-running calculation, with parameter 4 >>> result #doctest: +ELLIPSIS MemorizedResult(cachedir="...", func="g...", argument_hash="...") Once computed, the output of `g` is stored on disk, and deleted from memory. Reading the associated value can then be performed with the `get` method:: >>> result.get() array([ 0.08, 0.77, 0.77, 0.08]) The cache for this particular value can be cleared using the `clear` method. Its invocation causes the stored value to be erased from disk. Any subsequent call to `get` will cause a `KeyError` exception to be raised:: >>> result.clear() >>> result.get() #doctest: +ELLIPSIS Traceback (most recent call last): ... KeyError: 'Non-existing cache value (may have been cleared).\nFile ... does not exist' A `MemorizedResult` instance contains all that is necessary to read the cached value. It can be pickled for transmission or storage, and the printed representation can even be copy-pasted to a different python interpreter. .. topic:: Shelving when cache is disabled In the case where caching is disabled (e.g. `Memory(cachedir=None)`), the `call_and_shelve` method returns a `NotMemorizedResult` instance, that stores the full function output, instead of just a reference (since there is nothing to point to). All the above remains valid though, except for the copy-pasting feature. Gotchas -------- * **Across sessions, function cache is identified by the function's name**. Thus if you assign the same name to different functions, their cache will override each-others (you have 'name collisions'), and you will get unwanted re-run:: >>> @memory.cache ... def func(x): ... print('Running func(%s)' % x) >>> func2 = func >>> @memory.cache ... def func(x): ... print('Running a different func(%s)' % x) As long as you stay in the same session, there are no collisions (in joblib 0.8 and above), altough joblib does warn you that you are doing something dangerous:: >>> func(1) Running a different func(1) >>> func2(1) #doctest: +ELLIPSIS memory.rst:0: JobLibCollisionWarning: Possible name collisions between functions 'func' (:...) and 'func' (:...) Running func(1) >>> func(1) # No recomputation so far >>> func2(1) # No recomputation so far .. Empty the in-memory cache to simulate exiting and reloading the interpreter >>> import joblib.memory >>> joblib.memory._FUNCTION_HASHES.clear() But suppose you exit the interpreter and restart it, the cache will not be identified properly, and the functions will be rerun:: >>> func(1) #doctest: +ELLIPSIS memory.rst:0: JobLibCollisionWarning: Possible name collisions between functions 'func' (:...) and 'func' (:...) Running a different func(1) >>> func2(1) #doctest: +ELLIPSIS Running func(1) As long as you stay in the same session, you are not getting needless recomputation:: >>> func(1) # No recomputation now >>> func2(1) # No recomputation now * **lambda functions** Beware that with Python 2.7 lambda functions cannot be separated out:: >>> def my_print(x): ... print(x) >>> f = memory.cache(lambda : my_print(1)) >>> g = memory.cache(lambda : my_print(2)) >>> f() 1 >>> f() >>> g() # doctest: +SKIP memory.rst:0: JobLibCollisionWarning: Cannot detect name collisions for function '' 2 >>> g() # doctest: +SKIP >>> f() # doctest: +SKIP 1 * **memory cannot be used on some complex objects**, e.g. a callable object with a `__call__` method. However, it works on numpy ufuncs:: >>> sin = memory.cache(np.sin) >>> print(sin(0)) 0.0 * **caching methods: memory is designed for pure functions and it is not recommended to use it for methods**. If you want to use cache inside a class the recommended pattern is to cache a pure function and use the cached function inside your class, i.e. something like this:: @mem.cache def compute_func(arg1, arg2, arg3): # long computation return result class Foo(object): def __init__(self, args): self.data = None def compute(self): self.data = compute_func(self.arg1, self.arg2, 40) Using ``Memory`` for methods is not recommended and has some caveats that make it very fragile from a maintenance point of view because it is very easy to forget about these caveats when your software evolves. If you still want to do it (we would be interested about your use case by the way), here are a few known caveats: 1. you cannot decorate a method at class definition, because when the class is instantiated, the first argument (self) is *bound*, and no longer accessible to the `Memory` object. The following code won't work:: class Foo(object): @mem.cache # WRONG def method(self, args): pass The right way to do this is to decorate at instantiation time:: class Foo(object): def __init__(self, args): self.method = mem.cache(self.method) def method(self, ...): pass 2. The cached method will have ``self`` as one of its arguments. That means that the result will be recomputed if anything with ``self`` changes. For example if ``self.attr`` has changed calling ``self.method`` will recompute the result even if ``self.method`` does not use ``self.attr`` in its body. Another example is changing ``self`` inside the body of ``self.method``. The consequence is that ``self.method`` will create cache that will not be reused in subsequent calls. To alleviate these problems and if you *know* that the result of ``self.method`` does not depend on ``self`` you can use ``self.method = mem.cache(self.method, ignore=['self'])``. Ignoring some arguments ------------------------ It may be useful not to recalculate a function when certain arguments change, for instance a debug flag. `Memory` provides the `ignore` list:: >>> @memory.cache(ignore=['debug']) ... def my_func(x, debug=True): ... print('Called with x = %s' % x) >>> my_func(0) Called with x = 0 >>> my_func(0, debug=False) >>> my_func(0, debug=True) >>> # my_func was not reevaluated .. _memory_reference: Reference documentation of the `Memory` class ---------------------------------------------- .. autoclass:: Memory :members: __init__, cache, eval, clear Useful methods of decorated functions -------------------------------------- Function decorated by :meth:`Memory.cache` are :class:`MemorizedFunc` objects that, in addition of behaving like normal functions, expose methods useful for cache exploration and management. .. autoclass:: MemorizedFunc :members: __init__, call, clear, get_output_dir .. Let us not forget to clean our cache dir once we are finished:: >>> import shutil >>> try: ... shutil.rmtree(cachedir) ... shutil.rmtree(cachedir2) ... except OSError: ... pass # this can sometimes fail under Windows joblib-0.11/doc/parallel.rst000066400000000000000000000157621305577265600160220ustar00rootroot00000000000000 ================================= Embarrassingly parallel for loops ================================= Common usage ============ Joblib provides a simple helper class to write parallel for loops using multiprocessing. The core idea is to write the code to be executed as a generator expression, and convert it to parallel computing:: >>> from math import sqrt >>> [sqrt(i ** 2) for i in range(10)] [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0] can be spread over 2 CPUs using the following:: >>> from math import sqrt >>> from joblib import Parallel, delayed >>> Parallel(n_jobs=2)(delayed(sqrt)(i ** 2) for i in range(10)) [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0] Under the hood, the :class:`Parallel` object create a multiprocessing `pool` that forks the Python interpreter in multiple processes to execute each of the items of the list. The `delayed` function is a simple trick to be able to create a tuple `(function, args, kwargs)` with a function-call syntax. .. warning:: Under Windows, it is important to protect the main loop of code to avoid recursive spawning of subprocesses when using joblib.Parallel. In other words, you should be writing code like this: .. code-block:: python import .... def function1(...): ... def function2(...): ... ... if __name__ == '__main__': # do stuff with imports and functions defined about ... **No** code should *run* outside of the "if __name__ == '__main__'" blocks, only imports and definitions. Using the threading backend =========================== By default :class:`Parallel` uses the Python ``multiprocessing`` module to fork separate Python worker processes to execute tasks concurrently on separate CPUs. This is a reasonable default for generic Python programs but it induces some overhead as the input and output data need to be serialized in a queue for communication with the worker processes. If you know that the function you are calling is based on a compiled extension that releases the Python Global Interpreter Lock (GIL) during most of its computation then it might be more efficient to use threads instead of Python processes as concurrent workers. For instance this is the case if you write the CPU intensive part of your code inside a `with nogil`_ block of a Cython function. .. _`with nogil`: http://docs.cython.org/src/userguide/external_C_code.html#acquiring-and-releasing-the-gil To use the threads, just pass ``"threading"`` as the value of the ``backend`` parameter of the :class:`Parallel` constructor: >>> Parallel(n_jobs=2, backend="threading")( ... delayed(sqrt)(i ** 2) for i in range(10)) [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0] Reusing a pool of workers ========================= Some algorithms require to make several consecutive calls to a parallel function interleaved with processing of the intermediate results. Calling ``Parallel`` several times in a loop is sub-optimal because it will create and destroy a pool of workers (threads or processes) several times which can cause a significant overhead. For this case it is more efficient to use the context manager API of the ``Parallel`` class to re-use the same pool of workers for several calls to the ``Parallel`` object:: >>> with Parallel(n_jobs=2) as parallel: ... accumulator = 0. ... n_iter = 0 ... while accumulator < 1000: ... results = parallel(delayed(sqrt)(accumulator + i ** 2) ... for i in range(5)) ... accumulator += sum(results) # synchronization barrier ... n_iter += 1 ... >>> (accumulator, n_iter) # doctest: +ELLIPSIS (1136.596..., 14) .. include:: parallel_numpy.rst Bad interaction of multiprocessing and third-party libraries ============================================================ Prior to Python 3.4 the ``'multiprocessing'`` backend of joblib can only use the ``fork`` strategy to create worker processes under non-Windows systems. This can cause some third-party libraries to crash or freeze. Such libraries include as Apple vecLib / Accelerate (used by NumPy under OSX), some old version of OpenBLAS (prior to 0.2.10) or the OpenMP runtime implementation from GCC. To avoid this problem ``joblib.Parallel`` can be configured to use the ``'forkserver'`` start method on Python 3.4 and later. The start method has to be configured by setting the ``JOBLIB_START_METHOD`` environment variable to ``'forkserver'`` instead of the default ``'fork'`` start method. However the user should be aware that using the ``'forkserver'`` method prevents ``joblib.Parallel`` to call function interactively defined in a shell session. You can read more on this topic in the `multiprocessing documentation `_. Under Windows the ``fork`` system call does not exist at all so this problem does not exist (but multiprocessing has more overhead). Custom backend API (experimental) ================================= .. versionadded:: 0.10 .. warning:: The custom backend API is experimental and subject to change without going through a deprecation cycle. User can provide their own implementation of a parallel processing backend in addition to the ``'multiprocessing'`` and ``'threading'`` backends provided by default. A backend is registered with the :func:`joblib.register_parallel_backend` function by passing a name and a backend factory. The backend factory can be any callable that returns an instance of ``ParallelBackendBase``. Please refer to the `default backends source code`_ as a reference if you want to implement your own custom backend. .. _`default backends source code`: https://github.com/joblib/joblib/blob/master/joblib/_parallel_backends.py Note that it is possible to register a backend class that has some mandatory constructor parameters such as the network address and connection credentials for a remote cluster computing service:: class MyCustomBackend(ParallelBackendBase): def __init__(self, endpoint, api_key): self.endpoint = endpoint self.api_key = api_key ... # Do something with self.endpoint and self.api_key somewhere in # one of the method of the class register_parallel_backend('custom', MyCustomBackend) The connection parameters can then be passed to the :func:`joblib.parallel_backend` context manager:: with parallel_backend('custom', endpoint='http://compute', api_key='42'): Parallel()(delayed(some_function)(i) for i in range(10)) Using the context manager can be helpful when using a third-party library that uses :class:`joblib.Parallel` internally while not exposing the ``backend`` argument in its own API. `Parallel` reference documentation ================================== .. autoclass:: joblib.Parallel :noindex: .. autofunction:: joblib.delayed .. autofunction:: joblib.register_parallel_backend .. autofunction:: joblib.parallel_backend joblib-0.11/doc/parallel_numpy.rst000066400000000000000000000134671305577265600172520ustar00rootroot00000000000000.. For doctests: >>> import sys >>> setup = getfixture('parallel_numpy_fixture') >>> fixture = setup(sys.modules[__name__]) Working with numerical data in shared memory (memmaping) ======================================================== By default the workers of the pool are real Python processes forked using the ``multiprocessing`` module of the Python standard library when ``n_jobs != 1``. The arguments passed as input to the ``Parallel`` call are serialized and reallocated in the memory of each worker process. This can be problematic for large arguments as they will be reallocated ``n_jobs`` times by the workers. As this problem can often occur in scientific computing with ``numpy`` based datastructures, :class:`joblib.Parallel` provides a special handling for large arrays to automatically dump them on the filesystem and pass a reference to the worker to open them as memory map on that file using the ``numpy.memmap`` subclass of ``numpy.ndarray``. This makes it possible to share a segment of data between all the worker processes. .. note:: The following only applies with the default ``"multiprocessing"`` backend. If your code can release the GIL, then using ``backend="threading"`` is even more efficient. Automated array to memmap conversion ------------------------------------ The automated array to memmap conversion is triggered by a configurable threshold on the size of the array:: >>> import numpy as np >>> from joblib import Parallel, delayed >>> from joblib.pool import has_shareable_memory >>> Parallel(n_jobs=2, max_nbytes=1e6)( ... delayed(has_shareable_memory)(np.ones(int(i))) ... for i in [1e2, 1e4, 1e6]) [False, False, True] By default the data is dumped to the ``/dev/shm`` shared-memory partition if it exists and writeable (typically the case under Linux). Otherwise the operating system's temporary folder is used. The location of the temporary data files can be customized by passing a ``temp_folder`` argument to the ``Parallel`` constructor. Passing ``max_nbytes=None`` makes it possible to disable the automated array to memmap conversion. Manual management of memmaped input data ---------------------------------------- For even finer tuning of the memory usage it is also possible to dump the array as an memmap directly from the parent process to free the memory before forking the worker processes. For instance let's allocate a large array in the memory of the parent process:: >>> large_array = np.ones(int(1e6)) Dump it to a local file for memmaping:: >>> import tempfile >>> import os >>> from joblib import load, dump >>> temp_folder = tempfile.mkdtemp() >>> filename = os.path.join(temp_folder, 'joblib_test.mmap') >>> if os.path.exists(filename): os.unlink(filename) >>> _ = dump(large_array, filename) >>> large_memmap = load(filename, mmap_mode='r+') The ``large_memmap`` variable is pointing to a ``numpy.memmap`` instance:: >>> large_memmap.__class__.__name__, large_array.nbytes, large_array.shape ('memmap', 8000000, (1000000,)) >>> np.allclose(large_array, large_memmap) True We can free the original array from the main process memory:: >>> del large_array >>> import gc >>> _ = gc.collect() It is possible to slice ``large_memmap`` into a smaller memmap:: >>> small_memmap = large_memmap[2:5] >>> small_memmap.__class__.__name__, small_memmap.nbytes, small_memmap.shape ('memmap', 24, (3,)) Finally we can also take a ``np.ndarray`` view backed on that same memory mapped file:: >>> small_array = np.asarray(small_memmap) >>> small_array.__class__.__name__, small_array.nbytes, small_array.shape ('ndarray', 24, (3,)) All those three datastructures point to the same memory buffer and this same buffer will also be reused directly by the worker processes of a ``Parallel`` call:: >>> Parallel(n_jobs=2, max_nbytes=None)( ... delayed(has_shareable_memory)(a) ... for a in [large_memmap, small_memmap, small_array]) [True, True, True] Note that here we used ``max_nbytes=None`` to disable the auto-dumping feature of ``Parallel``. ``small_array`` is still in shared memory in the worker processes because it was already backed by shared memory in the parent process. The pickling machinery of ``Parallel`` multiprocessing queues are able to detect this situation and optimize it on the fly to limit the number of memory copies. Writing parallel computation results in shared memory ----------------------------------------------------- If you open your data using the ``w+`` or ``r+`` mode in the main program, the worker will get ``r+`` mode access. Thus the worker will be able to write its results directly to the original data, alleviating the need of the serialization to send back the results to the parent process. Here is an example script on parallel processing with preallocated ``numpy.memmap`` datastructures: .. literalinclude:: ../examples/parallel_memmap.py :language: python :linenos: .. warning:: Having concurrent workers write on overlapping shared memory data segments, for instance by using inplace operators and assignments on a `numpy.memmap` instance, can lead to data corruption as numpy does not offer atomic operations. The previous example does not risk that issue as each task is updating an exclusive segment of the shared result array. Some C/C++ compilers offer lock-free atomic primitives such as add-and-fetch or compare-and-swap that could be exposed to Python via CFFI_ for instance. However providing numpy-aware atomic constructs is outside of the scope of the joblib project. .. _CFFI: https://cffi.readthedocs.org A final note: don't forget to clean up any temporary folder when you are done with the computation:: >>> import shutil >>> try: ... shutil.rmtree(temp_folder) ... except OSError: ... pass # this can sometimes fail under Windows joblib-0.11/doc/persistence.rst000066400000000000000000000126521305577265600165450ustar00rootroot00000000000000.. For doctests: >>> from joblib.testing import warnings_to_stdout >>> warnings_to_stdout() >>> fixture = getfixture('persistence_fixture') .. _persistence: =========== Persistence =========== .. currentmodule:: joblib.numpy_pickle Usecase ======= :func:`joblib.dump` and :func:`joblib.load` provide a replacement for pickle to work efficiently on Python objects containing large data, in particular large numpy arrays. A simple example ================ First we create a temporary directory:: >>> from tempfile import mkdtemp >>> savedir = mkdtemp() >>> import os >>> filename = os.path.join(savedir, 'test.pkl') Then we create an object to be persisted:: >>> import numpy as np >>> to_persist = [('a', [1, 2, 3]), ('b', np.arange(10))] which we save into `savedir`:: >>> import joblib >>> joblib.dump(to_persist, filename) # doctest: +ELLIPSIS ['...test.pkl'] We can then load the object from the file:: >>> joblib.load(filename) [('a', [1, 2, 3]), ('b', array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))] Persistence in file objects =========================== Instead of filenames, `dump` and `load` functions also accept file objects: >>> with open(filename, 'wb') as fo: # doctest: +ELLIPSIS ... joblib.dump(to_persist, fo) >>> with open(filename, 'rb') as fo: # doctest: +ELLIPSIS ... joblib.load(fo) [('a', [1, 2, 3]), ('b', array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))] Compressed joblib pickles ========================= Setting the `compress` argument to `True` in :func:`joblib.dump` will allow to save space on disk: >>> joblib.dump(to_persist, filename + '.compressed', compress=True) # doctest: +ELLIPSIS ['...test.pkl.compressed'] If the filename extension corresponds to one of the supported compression methods, the compressor will be used automatically: >>> joblib.dump(to_persist, filename + '.z') # doctest: +ELLIPSIS ['...test.pkl.z'] By default, `joblib.dump` uses the zlib compression method as it gives the best tradeoff between speed and disk space. The other supported compression methods are 'gzip', 'bz2', 'lzma' and 'xz': >>> # Dumping in a gzip compressed file using a compress level of 3. >>> joblib.dump(to_persist, filename + '.gz', compress=('gzip', 3)) # doctest: +ELLIPSIS ['...test.pkl.gz'] >>> joblib.load(filename + '.gz') [('a', [1, 2, 3]), ('b', array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))] >>> joblib.dump(to_persist, filename + '.bz2', compress=('bz2', 3)) # doctest: +ELLIPSIS ['...test.pkl.bz2'] >>> joblib.load(filename + '.bz2') [('a', [1, 2, 3]), ('b', array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))] .. note:: Lzma and Xz compression methods are only available for python versions >= 3.3. Compressor files provided by the python standard library can also be used to compress pickle, e.g ``gzip.GzipFile``, ``bz2.BZ2File``, ``lzma.LZMAFile``: >>> # Dumping in a gzip.GzipFile object using a compression level of 3. >>> import gzip >>> with gzip.GzipFile(filename + '.gz', 'wb', compresslevel=3) as fo: # doctest: +ELLIPSIS ... joblib.dump(to_persist, fo) >>> with gzip.GzipFile(filename + '.gz', 'rb') as fo: # doctest: +ELLIPSIS ... joblib.load(fo) [('a', [1, 2, 3]), ('b', array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))] More details can be found in the :func:`joblib.dump` and :func:`joblib.load` documentation. Compatibility across python versions ------------------------------------ Compatibility of joblib pickles across python versions is not fully supported. Note that, for a very restricted set of objects, this may appear to work when saving a pickle with python 2 and loading it with python 3 but relying on it is strongly discouraged. If you are switching between python versions, you will need to save a different joblib pickle for each python version. Here are a few examples or exceptions: - Saving joblib pickle with python 2, trying to load it with python 3:: Traceback (most recent call last): File "/home/lesteve/dev/joblib/joblib/numpy_pickle.py", line 453, in load obj = unpickler.load() File "/home/lesteve/miniconda3/lib/python3.4/pickle.py", line 1038, in load dispatch[key[0]](self) File "/home/lesteve/miniconda3/lib/python3.4/pickle.py", line 1176, in load_binstring self.append(self._decode_string(data)) File "/home/lesteve/miniconda3/lib/python3.4/pickle.py", line 1158, in _decode_string return value.decode(self.encoding, self.errors) UnicodeDecodeError: 'ascii' codec can't decode byte 0x80 in position 1024: ordinal not in range(128) Traceback (most recent call last): File "", line 1, in File "/home/lesteve/dev/joblib/joblib/numpy_pickle.py", line 462, in load raise new_exc ValueError: You may be trying to read with python 3 a joblib pickle generated with python 2. This is not feature supported by joblib. - Saving joblib pickle with python 3, trying to load it with python 2:: Traceback (most recent call last): File "", line 1, in File "joblib/numpy_pickle.py", line 453, in load obj = unpickler.load() File "/home/lesteve/miniconda3/envs/py27/lib/python2.7/pickle.py", line 858, in load dispatch[key](self) File "/home/lesteve/miniconda3/envs/py27/lib/python2.7/pickle.py", line 886, in load_proto raise ValueError, "unsupported pickle protocol: %d" % proto ValueError: unsupported pickle protocol: 3 joblib-0.11/doc/sphinxext/000077500000000000000000000000001305577265600155135ustar00rootroot00000000000000joblib-0.11/doc/sphinxext/LICENSE.txt000066400000000000000000000135071305577265600173440ustar00rootroot00000000000000------------------------------------------------------------------------------- The files - numpydoc.py - docscrape.py - docscrape_sphinx.py - phantom_import.py have the following license: Copyright (C) 2008 Stefan van der Walt , Pauli Virtanen Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------- The files - compiler_unparse.py - comment_eater.py - traitsdoc.py have the following license: This software is OSI Certified Open Source Software. OSI Certified is a certification mark of the Open Source Initiative. Copyright (c) 2006, Enthought, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Enthought, Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------- The file - plot_directive.py originates from Matplotlib (http://matplotlib.sf.net/) which has the following license: Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved. 1. This LICENSE AGREEMENT is between John D. Hunter (“JDH”), and the Individual or Organization (“Licensee”) accessing and otherwise using matplotlib software in source or binary form and its associated documentation. 2. Subject to the terms and conditions of this License Agreement, JDH hereby grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use matplotlib 0.98.3 alone or in any derivative version, provided, however, that JDH’s License Agreement and JDH’s notice of copyright, i.e., “Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved” are retained in matplotlib 0.98.3 alone or in any derivative version prepared by Licensee. 3. In the event Licensee prepares a derivative work that is based on or incorporates matplotlib 0.98.3 or any part thereof, and wants to make the derivative work available to others as provided herein, then Licensee hereby agrees to include in any such work a brief summary of the changes made to matplotlib 0.98.3. 4. JDH is making matplotlib 0.98.3 available to Licensee on an “AS IS” basis. JDH MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, JDH MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB 0.98.3 WILL NOT INFRINGE ANY THIRD PARTY RIGHTS. 5. JDH SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB 0.98.3 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING MATPLOTLIB 0.98.3, OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. 6. This License Agreement will automatically terminate upon a material breach of its terms and conditions. 7. Nothing in this License Agreement shall be deemed to create any relationship of agency, partnership, or joint venture between JDH and Licensee. This License Agreement does not grant permission to use JDH trademarks or trade name in a trademark sense to endorse or promote products or services of Licensee, or any third party. 8. By copying, installing or otherwise using matplotlib 0.98.3, Licensee agrees to be bound by the terms and conditions of this License Agreement. joblib-0.11/doc/sphinxext/__init__.py000066400000000000000000000000001305577265600176120ustar00rootroot00000000000000joblib-0.11/doc/sphinxext/numpydoc/000077500000000000000000000000001305577265600173515ustar00rootroot00000000000000joblib-0.11/doc/sphinxext/numpydoc/__init__.py000066400000000000000000000001361305577265600214620ustar00rootroot00000000000000from __future__ import division, absolute_import, print_function from .numpydoc import setup joblib-0.11/doc/sphinxext/numpydoc/docscrape.py000066400000000000000000000432321305577265600216720ustar00rootroot00000000000000"""Extract reference documentation from the NumPy source tree. """ from __future__ import division, absolute_import, print_function import inspect import textwrap import re import pydoc from warnings import warn import collections import sys class Reader(object): """A line-based string reader. """ def __init__(self, data): """ Parameters ---------- data : str String with lines separated by '\n'. """ if isinstance(data, list): self._str = data else: self._str = data.split('\n') # store string as list of lines self.reset() def __getitem__(self, n): return self._str[n] def reset(self): self._l = 0 # current line nr def read(self): if not self.eof(): out = self[self._l] self._l += 1 return out else: return '' def seek_next_non_empty_line(self): for l in self[self._l:]: if l.strip(): break else: self._l += 1 def eof(self): return self._l >= len(self._str) def read_to_condition(self, condition_func): start = self._l for line in self[start:]: if condition_func(line): return self[start:self._l] self._l += 1 if self.eof(): return self[start:self._l+1] return [] def read_to_next_empty_line(self): self.seek_next_non_empty_line() def is_empty(line): return not line.strip() return self.read_to_condition(is_empty) def read_to_next_unindented_line(self): def is_unindented(line): return (line.strip() and (len(line.lstrip()) == len(line))) return self.read_to_condition(is_unindented) def peek(self, n=0): if self._l + n < len(self._str): return self[self._l + n] else: return '' def is_empty(self): return not ''.join(self._str).strip() class ParseError(Exception): def __str__(self): message = self.message if hasattr(self, 'docstring'): message = "%s in %r" % (message, self.docstring) return message class NumpyDocString(collections.Mapping): def __init__(self, docstring, config={}): orig_docstring = docstring docstring = textwrap.dedent(docstring).split('\n') self._doc = Reader(docstring) self._parsed_data = { 'Signature': '', 'Summary': [''], 'Extended Summary': [], 'Parameters': [], 'Returns': [], 'Yields': [], 'Raises': [], 'Warns': [], 'Other Parameters': [], 'Attributes': [], 'Methods': [], 'See Also': [], 'Notes': [], 'Warnings': [], 'References': '', 'Examples': '', 'index': {} } try: self._parse() except ParseError as e: e.docstring = orig_docstring raise def __getitem__(self, key): return self._parsed_data[key] def __setitem__(self, key, val): if key not in self._parsed_data: warn("Unknown section %s" % key) else: self._parsed_data[key] = val def __iter__(self): return iter(self._parsed_data) def __len__(self): return len(self._parsed_data) def _is_at_section(self): self._doc.seek_next_non_empty_line() if self._doc.eof(): return False l1 = self._doc.peek().strip() # e.g. Parameters if l1.startswith('.. index::'): return True l2 = self._doc.peek(1).strip() # ---------- or ========== return l2.startswith('-'*len(l1)) or l2.startswith('='*len(l1)) def _strip(self, doc): i = 0 j = 0 for i, line in enumerate(doc): if line.strip(): break for j, line in enumerate(doc[::-1]): if line.strip(): break return doc[i:len(doc)-j] def _read_to_next_section(self): section = self._doc.read_to_next_empty_line() while not self._is_at_section() and not self._doc.eof(): if not self._doc.peek(-1).strip(): # previous line was empty section += [''] section += self._doc.read_to_next_empty_line() return section def _read_sections(self): while not self._doc.eof(): data = self._read_to_next_section() name = data[0].strip() if name.startswith('..'): # index section yield name, data[1:] elif len(data) < 2: yield StopIteration else: yield name, self._strip(data[2:]) def _parse_param_list(self, content): r = Reader(content) params = [] while not r.eof(): header = r.read().strip() if ' : ' in header: arg_name, arg_type = header.split(' : ')[:2] else: arg_name, arg_type = header, '' desc = r.read_to_next_unindented_line() desc = dedent_lines(desc) params.append((arg_name, arg_type, desc)) return params _name_rgx = re.compile(r"^\s*(:(?P\w+):`(?P[a-zA-Z0-9_.-]+)`|" r" (?P[a-zA-Z0-9_.-]+))\s*", re.X) def _parse_see_also(self, content): """ func_name : Descriptive text continued text another_func_name : Descriptive text func_name1, func_name2, :meth:`func_name`, func_name3 """ items = [] def parse_item_name(text): """Match ':role:`name`' or 'name'""" m = self._name_rgx.match(text) if m: g = m.groups() if g[1] is None: return g[3], None else: return g[2], g[1] raise ParseError("%s is not a item name" % text) def push_item(name, rest): if not name: return name, role = parse_item_name(name) items.append((name, list(rest), role)) del rest[:] current_func = None rest = [] for line in content: if not line.strip(): continue m = self._name_rgx.match(line) if m and line[m.end():].strip().startswith(':'): push_item(current_func, rest) current_func, line = line[:m.end()], line[m.end():] rest = [line.split(':', 1)[1].strip()] if not rest[0]: rest = [] elif not line.startswith(' '): push_item(current_func, rest) current_func = None if ',' in line: for func in line.split(','): if func.strip(): push_item(func, []) elif line.strip(): current_func = line elif current_func is not None: rest.append(line.strip()) push_item(current_func, rest) return items def _parse_index(self, section, content): """ .. index: default :refguide: something, else, and more """ def strip_each_in(lst): return [s.strip() for s in lst] out = {} section = section.split('::') if len(section) > 1: out['default'] = strip_each_in(section[1].split(','))[0] for line in content: line = line.split(':') if len(line) > 2: out[line[1]] = strip_each_in(line[2].split(',')) return out def _parse_summary(self): """Grab signature (if given) and summary""" if self._is_at_section(): return # If several signatures present, take the last one while True: summary = self._doc.read_to_next_empty_line() summary_str = " ".join([s.strip() for s in summary]).strip() if re.compile('^([\w., ]+=)?\s*[\w\.]+\(.*\)$').match(summary_str): self['Signature'] = summary_str if not self._is_at_section(): continue break if summary is not None: self['Summary'] = summary if not self._is_at_section(): self['Extended Summary'] = self._read_to_next_section() def _parse(self): self._doc.reset() self._parse_summary() sections = list(self._read_sections()) section_names = set([section for section, content in sections]) has_returns = 'Returns' in section_names has_yields = 'Yields' in section_names # We could do more tests, but we are not. Arbitrarily. if has_returns and has_yields: msg = 'Docstring contains both a Returns and Yields section.' raise ValueError(msg) for (section, content) in sections: if not section.startswith('..'): section = (s.capitalize() for s in section.split(' ')) section = ' '.join(section) if section in ('Parameters', 'Returns', 'Yields', 'Raises', 'Warns', 'Other Parameters', 'Attributes', 'Methods'): self[section] = self._parse_param_list(content) elif section.startswith('.. index::'): self['index'] = self._parse_index(section, content) elif section == 'See Also': self['See Also'] = self._parse_see_also(content) else: self[section] = content # string conversion routines def _str_header(self, name, symbol='-'): return [name, len(name)*symbol] def _str_indent(self, doc, indent=4): out = [] for line in doc: out += [' '*indent + line] return out def _str_signature(self): if self['Signature']: return [self['Signature'].replace('*', '\*')] + [''] else: return [''] def _str_summary(self): if self['Summary']: return self['Summary'] + [''] else: return [] def _str_extended_summary(self): if self['Extended Summary']: return self['Extended Summary'] + [''] else: return [] def _str_param_list(self, name): out = [] if self[name]: out += self._str_header(name) for param, param_type, desc in self[name]: if param_type: out += ['%s : %s' % (param, param_type)] else: out += [param] out += self._str_indent(desc) out += [''] return out def _str_section(self, name): out = [] if self[name]: out += self._str_header(name) out += self[name] out += [''] return out def _str_see_also(self, func_role): if not self['See Also']: return [] out = [] out += self._str_header("See Also") last_had_desc = True for func, desc, role in self['See Also']: if role: link = ':%s:`%s`' % (role, func) elif func_role: link = ':%s:`%s`' % (func_role, func) else: link = "`%s`_" % func if desc or last_had_desc: out += [''] out += [link] else: out[-1] += ", %s" % link if desc: out += self._str_indent([' '.join(desc)]) last_had_desc = True else: last_had_desc = False out += [''] return out def _str_index(self): idx = self['index'] out = [] out += ['.. index:: %s' % idx.get('default', '')] for section, references in idx.items(): if section == 'default': continue out += [' :%s: %s' % (section, ', '.join(references))] return out def __str__(self, func_role=''): out = [] out += self._str_signature() out += self._str_summary() out += self._str_extended_summary() for param_list in ('Parameters', 'Returns', 'Yields', 'Other Parameters', 'Raises', 'Warns'): out += self._str_param_list(param_list) out += self._str_section('Warnings') out += self._str_see_also(func_role) for s in ('Notes', 'References', 'Examples'): out += self._str_section(s) for param_list in ('Attributes', 'Methods'): out += self._str_param_list(param_list) out += self._str_index() return '\n'.join(out) def indent(str, indent=4): indent_str = ' '*indent if str is None: return indent_str lines = str.split('\n') return '\n'.join(indent_str + l for l in lines) def dedent_lines(lines): """Deindent a list of lines maximally""" return textwrap.dedent("\n".join(lines)).split("\n") def header(text, style='-'): return text + '\n' + style*len(text) + '\n' class FunctionDoc(NumpyDocString): def __init__(self, func, role='func', doc=None, config={}): self._f = func self._role = role # e.g. "func" or "meth" if doc is None: if func is None: raise ValueError("No function or docstring given") doc = inspect.getdoc(func) or '' NumpyDocString.__init__(self, doc) if not self['Signature'] and func is not None: func, func_name = self.get_func() try: try: signature = str(inspect.signature(func)) except (AttributeError, ValueError): # try to read signature, backward compat for older Python if sys.version_info[0] >= 3: argspec = inspect.getfullargspec(func) else: argspec = inspect.getargspec(func) signature = inspect.formatargspec(*argspec) signature = '%s%s' % (func_name, signature.replace('*', '\*')) except TypeError: signature = '%s()' % func_name self['Signature'] = signature def get_func(self): func_name = getattr(self._f, '__name__', self.__class__.__name__) if inspect.isclass(self._f): func = getattr(self._f, '__call__', self._f.__init__) else: func = self._f return func, func_name def __str__(self): out = '' func, func_name = self.get_func() signature = self['Signature'].replace('*', '\*') roles = {'func': 'function', 'meth': 'method'} if self._role: if self._role not in roles: print("Warning: invalid role %s" % self._role) out += '.. %s:: %s\n \n\n' % (roles.get(self._role, ''), func_name) out += super(FunctionDoc, self).__str__(func_role=self._role) return out class ClassDoc(NumpyDocString): extra_public_methods = ['__call__'] def __init__(self, cls, doc=None, modulename='', func_doc=FunctionDoc, config={}): if not inspect.isclass(cls) and cls is not None: raise ValueError("Expected a class or None, but got %r" % cls) self._cls = cls self.show_inherited_members = config.get( 'show_inherited_class_members', True) if modulename and not modulename.endswith('.'): modulename += '.' self._mod = modulename if doc is None: if cls is None: raise ValueError("No class or documentation string given") doc = pydoc.getdoc(cls) NumpyDocString.__init__(self, doc) if config.get('show_class_members', True): def splitlines_x(s): if not s: return [] else: return s.splitlines() for field, items in [('Methods', self.methods), ('Attributes', self.properties)]: if not self[field]: doc_list = [] for name in sorted(items): try: doc_item = pydoc.getdoc(getattr(self._cls, name)) doc_list.append((name, '', splitlines_x(doc_item))) except AttributeError: pass # method doesn't exist self[field] = doc_list @property def methods(self): if self._cls is None: return [] return [name for name, func in inspect.getmembers(self._cls) if ((not name.startswith('_') or name in self.extra_public_methods) and isinstance(func, collections.Callable) and self._is_show_member(name))] @property def properties(self): if self._cls is None: return [] return [name for name, func in inspect.getmembers(self._cls) if (not name.startswith('_') and (func is None or isinstance(func, property) or inspect.isgetsetdescriptor(func)) and self._is_show_member(name))] def _is_show_member(self, name): if self.show_inherited_members: return True # show all class members if name not in self._cls.__dict__: return False # class member is inherited, we do not show it return True joblib-0.11/doc/sphinxext/numpydoc/docscrape_sphinx.py000066400000000000000000000224671305577265600232720ustar00rootroot00000000000000from __future__ import division, absolute_import, print_function import sys import re import inspect import textwrap import pydoc import sphinx import collections from .docscrape import NumpyDocString, FunctionDoc, ClassDoc if sys.version_info[0] >= 3: sixu = lambda s: s else: sixu = lambda s: unicode(s, 'unicode_escape') class SphinxDocString(NumpyDocString): def __init__(self, docstring, config={}): NumpyDocString.__init__(self, docstring, config=config) self.load_config(config) def load_config(self, config): self.use_plots = config.get('use_plots', False) self.class_members_toctree = config.get('class_members_toctree', True) # string conversion routines def _str_header(self, name, symbol='`'): return ['.. rubric:: ' + name, ''] def _str_field_list(self, name): return [':' + name + ':'] def _str_indent(self, doc, indent=4): out = [] for line in doc: out += [' '*indent + line] return out def _str_signature(self): return [''] if self['Signature']: return ['``%s``' % self['Signature']] + [''] else: return [''] def _str_summary(self): return self['Summary'] + [''] def _str_extended_summary(self): return self['Extended Summary'] + [''] def _str_returns(self, name='Returns'): out = [] if self[name]: out += self._str_field_list(name) out += [''] for param, param_type, desc in self[name]: if param_type: out += self._str_indent(['**%s** : %s' % (param.strip(), param_type)]) else: out += self._str_indent([param.strip()]) if desc: out += [''] out += self._str_indent(desc, 8) out += [''] return out def _str_param_list(self, name): out = [] if self[name]: out += self._str_field_list(name) out += [''] for param, param_type, desc in self[name]: if param_type: out += self._str_indent(['**%s** : %s' % (param.strip(), param_type)]) else: out += self._str_indent(['**%s**' % param.strip()]) if desc: out += [''] out += self._str_indent(desc, 8) out += [''] return out @property def _obj(self): if hasattr(self, '_cls'): return self._cls elif hasattr(self, '_f'): return self._f return None def _str_member_list(self, name): """ Generate a member listing, autosummary:: table where possible, and a table where not. """ out = [] if self[name]: out += ['.. rubric:: %s' % name, ''] prefix = getattr(self, '_name', '') if prefix: prefix = '~%s.' % prefix autosum = [] others = [] for param, param_type, desc in self[name]: param = param.strip() # Check if the referenced member can have a docstring or not param_obj = getattr(self._obj, param, None) if not (callable(param_obj) or isinstance(param_obj, property) or inspect.isgetsetdescriptor(param_obj)): param_obj = None if param_obj and (pydoc.getdoc(param_obj) or not desc): # Referenced object has a docstring autosum += [" %s%s" % (prefix, param)] else: others.append((param, param_type, desc)) if autosum: out += ['.. autosummary::'] if self.class_members_toctree: out += [' :toctree:'] out += [''] + autosum if others: maxlen_0 = max(3, max([len(x[0]) for x in others])) hdr = sixu("=")*maxlen_0 + sixu(" ") + sixu("=")*10 fmt = sixu('%%%ds %%s ') % (maxlen_0,) out += ['', '', hdr] for param, param_type, desc in others: desc = sixu(" ").join(x.strip() for x in desc).strip() if param_type: desc = "(%s) %s" % (param_type, desc) out += [fmt % (param.strip(), desc)] out += [hdr] out += [''] return out def _str_section(self, name): out = [] if self[name]: out += self._str_header(name) out += [''] content = textwrap.dedent("\n".join(self[name])).split("\n") out += content out += [''] return out def _str_see_also(self, func_role): out = [] if self['See Also']: see_also = super(SphinxDocString, self)._str_see_also(func_role) out = ['.. seealso::', ''] out += self._str_indent(see_also[2:]) return out def _str_warnings(self): out = [] if self['Warnings']: out = ['.. warning::', ''] out += self._str_indent(self['Warnings']) return out def _str_index(self): idx = self['index'] out = [] if len(idx) == 0: return out out += ['.. index:: %s' % idx.get('default', '')] for section, references in idx.items(): if section == 'default': continue elif section == 'refguide': out += [' single: %s' % (', '.join(references))] else: out += [' %s: %s' % (section, ','.join(references))] return out def _str_references(self): out = [] if self['References']: out += self._str_header('References') if isinstance(self['References'], str): self['References'] = [self['References']] out.extend(self['References']) out += [''] # Latex collects all references to a separate bibliography, # so we need to insert links to it if sphinx.__version__ >= "0.6": out += ['.. only:: latex', ''] else: out += ['.. latexonly::', ''] items = [] for line in self['References']: m = re.match(r'.. \[([a-z0-9._-]+)\]', line, re.I) if m: items.append(m.group(1)) out += [' ' + ", ".join(["[%s]_" % item for item in items]), ''] return out def _str_examples(self): examples_str = "\n".join(self['Examples']) if (self.use_plots and 'import matplotlib' in examples_str and 'plot::' not in examples_str): out = [] out += self._str_header('Examples') out += ['.. plot::', ''] out += self._str_indent(self['Examples']) out += [''] return out else: return self._str_section('Examples') def __str__(self, indent=0, func_role="obj"): out = [] out += self._str_signature() out += self._str_index() + [''] out += self._str_summary() out += self._str_extended_summary() out += self._str_param_list('Parameters') out += self._str_returns('Returns') out += self._str_returns('Yields') for param_list in ('Other Parameters', 'Raises', 'Warns'): out += self._str_param_list(param_list) out += self._str_warnings() out += self._str_see_also(func_role) out += self._str_section('Notes') out += self._str_references() out += self._str_examples() for param_list in ('Attributes', 'Methods'): out += self._str_member_list(param_list) out = self._str_indent(out, indent) return '\n'.join(out) class SphinxFunctionDoc(SphinxDocString, FunctionDoc): def __init__(self, obj, doc=None, config={}): self.load_config(config) FunctionDoc.__init__(self, obj, doc=doc, config=config) class SphinxClassDoc(SphinxDocString, ClassDoc): def __init__(self, obj, doc=None, func_doc=None, config={}): self.load_config(config) ClassDoc.__init__(self, obj, doc=doc, func_doc=None, config=config) class SphinxObjDoc(SphinxDocString): def __init__(self, obj, doc=None, config={}): self._f = obj self.load_config(config) SphinxDocString.__init__(self, doc, config=config) def get_doc_object(obj, what=None, doc=None, config={}): if what is None: if inspect.isclass(obj): what = 'class' elif inspect.ismodule(obj): what = 'module' elif isinstance(obj, collections.Callable): what = 'function' else: what = 'object' if what == 'class': return SphinxClassDoc(obj, func_doc=SphinxFunctionDoc, doc=doc, config=config) elif what in ('function', 'method'): return SphinxFunctionDoc(obj, doc=doc, config=config) else: if doc is None: doc = pydoc.getdoc(obj) return SphinxObjDoc(obj, doc, config=config) joblib-0.11/doc/sphinxext/numpydoc/linkcode.py000066400000000000000000000047131305577265600215200ustar00rootroot00000000000000# -*- coding: utf-8 -*- """ linkcode ~~~~~~~~ Add external links to module code in Python object descriptions. :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. :license: BSD, see LICENSE for details. """ from __future__ import division, absolute_import, print_function import warnings import collections warnings.warn("This extension has been accepted to Sphinx upstream. " "Use the version from there (Sphinx >= 1.2) " "https://bitbucket.org/birkenfeld/sphinx/pull-request/47/sphinxextlinkcode", FutureWarning, stacklevel=1) from docutils import nodes from sphinx import addnodes from sphinx.locale import _ from sphinx.errors import SphinxError class LinkcodeError(SphinxError): category = "linkcode error" def doctree_read(app, doctree): env = app.builder.env resolve_target = getattr(env.config, 'linkcode_resolve', None) if not isinstance(env.config.linkcode_resolve, collections.Callable): raise LinkcodeError( "Function `linkcode_resolve` is not given in conf.py") domain_keys = dict( py=['module', 'fullname'], c=['names'], cpp=['names'], js=['object', 'fullname'], ) for objnode in doctree.traverse(addnodes.desc): domain = objnode.get('domain') uris = set() for signode in objnode: if not isinstance(signode, addnodes.desc_signature): continue # Convert signode to a specified format info = {} for key in domain_keys.get(domain, []): value = signode.get(key) if not value: value = '' info[key] = value if not info: continue # Call user code to resolve the link uri = resolve_target(domain, info) if not uri: # no source continue if uri in uris or not uri: # only one link per name, please continue uris.add(uri) onlynode = addnodes.only(expr='html') onlynode += nodes.reference('', '', internal=False, refuri=uri) onlynode[0] += nodes.inline('', _('[source]'), classes=['viewcode-link']) signode += onlynode def setup(app): app.connect('doctree-read', doctree_read) app.add_config_value('linkcode_resolve', None, '') joblib-0.11/doc/sphinxext/numpydoc/numpydoc.py000066400000000000000000000150451305577265600215660ustar00rootroot00000000000000""" ======== numpydoc ======== Sphinx extension that handles docstrings in the Numpy standard format. [1] It will: - Convert Parameters etc. sections to field lists. - Convert See Also section to a See also entry. - Renumber references. - Extract the signature from the docstring, if it can't be determined otherwise. .. [1] https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt """ from __future__ import division, absolute_import, print_function import sys import re import pydoc import sphinx import inspect import collections if sphinx.__version__ < '1.0.1': raise RuntimeError("Sphinx 1.0.1 or newer is required") from .docscrape_sphinx import get_doc_object, SphinxDocString from sphinx.util.compat import Directive if sys.version_info[0] >= 3: sixu = lambda s: s else: sixu = lambda s: unicode(s, 'unicode_escape') def mangle_docstrings(app, what, name, obj, options, lines, reference_offset=[0]): cfg = {'use_plots': app.config.numpydoc_use_plots, 'show_class_members': app.config.numpydoc_show_class_members, 'show_inherited_class_members': app.config.numpydoc_show_inherited_class_members, 'class_members_toctree': app.config.numpydoc_class_members_toctree} u_NL = sixu('\n') if what == 'module': # Strip top title pattern = '^\\s*[#*=]{4,}\\n[a-z0-9 -]+\\n[#*=]{4,}\\s*' title_re = re.compile(sixu(pattern), re.I | re.S) lines[:] = title_re.sub(sixu(''), u_NL.join(lines)).split(u_NL) else: doc = get_doc_object(obj, what, u_NL.join(lines), config=cfg) if sys.version_info[0] >= 3: doc = str(doc) else: doc = unicode(doc) lines[:] = doc.split(u_NL) if (app.config.numpydoc_edit_link and hasattr(obj, '__name__') and obj.__name__): if hasattr(obj, '__module__'): v = dict(full_name=sixu("%s.%s") % (obj.__module__, obj.__name__)) else: v = dict(full_name=obj.__name__) lines += [sixu(''), sixu('.. htmlonly::'), sixu('')] lines += [sixu(' %s') % x for x in (app.config.numpydoc_edit_link % v).split("\n")] # replace reference numbers so that there are no duplicates references = [] for line in lines: line = line.strip() m = re.match(sixu('^.. \\[([a-z0-9_.-])\\]'), line, re.I) if m: references.append(m.group(1)) # start renaming from the longest string, to avoid overwriting parts references.sort(key=lambda x: -len(x)) if references: for i, line in enumerate(lines): for r in references: if re.match(sixu('^\\d+$'), r): new_r = sixu("R%d") % (reference_offset[0] + int(r)) else: new_r = sixu("%s%d") % (r, reference_offset[0]) lines[i] = lines[i].replace(sixu('[%s]_') % r, sixu('[%s]_') % new_r) lines[i] = lines[i].replace(sixu('.. [%s]') % r, sixu('.. [%s]') % new_r) reference_offset[0] += len(references) def mangle_signature(app, what, name, obj, options, sig, retann): # Do not try to inspect classes that don't define `__init__` if (inspect.isclass(obj) and (not hasattr(obj, '__init__') or 'initializes x; see ' in pydoc.getdoc(obj.__init__))): return '', '' if not (isinstance(obj, collections.Callable) or hasattr(obj, '__argspec_is_invalid_')): return if not hasattr(obj, '__doc__'): return doc = SphinxDocString(pydoc.getdoc(obj)) if doc['Signature']: sig = re.sub(sixu("^[^(]*"), sixu(""), doc['Signature']) return sig, sixu('') def setup(app, get_doc_object_=get_doc_object): if not hasattr(app, 'add_config_value'): return # probably called by nose, better bail out global get_doc_object get_doc_object = get_doc_object_ app.connect('autodoc-process-docstring', mangle_docstrings) app.connect('autodoc-process-signature', mangle_signature) app.add_config_value('numpydoc_edit_link', None, False) app.add_config_value('numpydoc_use_plots', None, False) app.add_config_value('numpydoc_show_class_members', True, True) app.add_config_value('numpydoc_show_inherited_class_members', True, True) app.add_config_value('numpydoc_class_members_toctree', True, True) # Extra mangling domains app.add_domain(NumpyPythonDomain) app.add_domain(NumpyCDomain) metadata = {'parallel_read_safe': True} return metadata # ------------------------------------------------------------------------------ # Docstring-mangling domains # ------------------------------------------------------------------------------ from docutils.statemachine import ViewList from sphinx.domains.c import CDomain from sphinx.domains.python import PythonDomain class ManglingDomainBase(object): directive_mangling_map = {} def __init__(self, *a, **kw): super(ManglingDomainBase, self).__init__(*a, **kw) self.wrap_mangling_directives() def wrap_mangling_directives(self): for name, objtype in list(self.directive_mangling_map.items()): self.directives[name] = wrap_mangling_directive( self.directives[name], objtype) class NumpyPythonDomain(ManglingDomainBase, PythonDomain): name = 'np' directive_mangling_map = { 'function': 'function', 'class': 'class', 'exception': 'class', 'method': 'function', 'classmethod': 'function', 'staticmethod': 'function', 'attribute': 'attribute', } indices = [] class NumpyCDomain(ManglingDomainBase, CDomain): name = 'np-c' directive_mangling_map = { 'function': 'function', 'member': 'attribute', 'macro': 'function', 'type': 'class', 'var': 'object', } def wrap_mangling_directive(base_directive, objtype): class directive(base_directive): def run(self): env = self.state.document.settings.env name = None if self.arguments: m = re.match(r'^(.*\s+)?(.*?)(\(.*)?', self.arguments[0]) name = m.group(2).strip() if not name: name = self.arguments[0] lines = list(self.content) mangle_docstrings(env.app, objtype, name, None, None, lines) self.content = ViewList(lines, self.content.parent) return base_directive.run(self) return directive joblib-0.11/doc/why.rst000066400000000000000000000032671305577265600150320ustar00rootroot00000000000000 Why joblib: project goals =========================== What pipelines bring us -------------------------- Pipeline processing systems can provide a set of useful features: Data-flow programming for performance ...................................... * **On-demand computing:** in pipeline systems such as labView, or VTK calculations are performed as needed by the outputs and only when inputs change. * **Transparent parallelization:** a pipeline topology can be inspected to deduce which operations can be run in parallel (it is equivalent to purely functional programming). Provenance tracking for understanding the code ............................................... * **Tracking of data and computations:** to be able to fully reproduce a computational experiment: requires tracking of the data and operation implemented. * **Inspecting data flow:** Inspecting intermediate results helps debugging and understanding. .. topic:: But pipeline frameworks can get in the way :class: warning We want our code to look like the underlying algorithm, not like a software framework. Joblib's approach -------------------- Functions are the simplest abstraction used by everyone. Our pipeline jobs (or tasks) are made of decorated functions. Tracking of parameters in a meaningful way requires specification of data model. We give up on that and use hashing for performance and robustness. Design choices --------------- * No dependencies other than Python * Robust, well-tested code, at the cost of functionality * Fast and suitable for scientific computing on big dataset without changing the original code * Only local imports: **embed joblib in your code by copying it** joblib-0.11/examples/000077500000000000000000000000001305577265600145325ustar00rootroot00000000000000joblib-0.11/examples/parallel_memmap.py000066400000000000000000000060631305577265600202410ustar00rootroot00000000000000"""Demonstrate the usage of numpy.memmap with joblib.Parallel This example shows how to preallocate data in memmap arrays both for input and output of the parallel worker processes. Sample output for this program:: [Worker 93486] Sum for row 0 is -1599.756454 [Worker 93487] Sum for row 1 is -243.253165 [Worker 93488] Sum for row 3 is 610.201883 [Worker 93489] Sum for row 2 is 187.982005 [Worker 93489] Sum for row 7 is 326.381617 [Worker 93486] Sum for row 4 is 137.324438 [Worker 93489] Sum for row 8 is -198.225809 [Worker 93487] Sum for row 5 is -1062.852066 [Worker 93488] Sum for row 6 is 1666.334107 [Worker 93486] Sum for row 9 is -463.711714 Expected sums computed in the parent process: [-1599.75645426 -243.25316471 187.98200458 610.20188337 137.32443803 -1062.85206633 1666.33410715 326.38161713 -198.22580876 -463.71171369] Actual sums computed by the worker processes: [-1599.75645426 -243.25316471 187.98200458 610.20188337 137.32443803 -1062.85206633 1666.33410715 326.38161713 -198.22580876 -463.71171369] """ import tempfile import shutil import os import numpy as np from joblib import Parallel, delayed from joblib import load, dump def sum_row(input, output, i): """Compute the sum of a row in input and store it in output""" sum_ = input[i, :].sum() print("[Worker %d] Sum for row %d is %f" % (os.getpid(), i, sum_)) output[i] = sum_ if __name__ == "__main__": rng = np.random.RandomState(42) folder = tempfile.mkdtemp() samples_name = os.path.join(folder, 'samples') sums_name = os.path.join(folder, 'sums') try: # Generate some data and an allocate an output buffer samples = rng.normal(size=(10, int(1e6))) # Pre-allocate a writeable shared memory map as a container for the # results of the parallel computation sums = np.memmap(sums_name, dtype=samples.dtype, shape=samples.shape[0], mode='w+') # Dump the input data to disk to free the memory dump(samples, samples_name) # Release the reference on the original in memory array and replace it # by a reference to the memmap array so that the garbage collector can # release the memory before forking. gc.collect() is internally called # in Parallel just before forking. samples = load(samples_name, mmap_mode='r') # Fork the worker processes to perform computation concurrently Parallel(n_jobs=4)(delayed(sum_row)(samples, sums, i) for i in range(samples.shape[0])) # Compare the results from the output buffer with the ground truth print("Expected sums computed in the parent process:") expected_result = samples.sum(axis=1) print(expected_result) print("Actual sums computed by the worker processes:") print(sums) assert np.allclose(expected_result, sums) finally: try: shutil.rmtree(folder) except: print("Failed to delete: " + folder) joblib-0.11/joblib/000077500000000000000000000000001305577265600141555ustar00rootroot00000000000000joblib-0.11/joblib/__init__.py000066400000000000000000000116731305577265600162760ustar00rootroot00000000000000"""Joblib is a set of tools to provide **lightweight pipelining in Python**. In particular, joblib offers: 1. transparent disk-caching of the output values and lazy re-evaluation (memoize pattern) 2. easy simple parallel computing 3. logging and tracing of the execution Joblib is optimized to be **fast** and **robust** in particular on large data and has specific optimizations for `numpy` arrays. It is **BSD-licensed**. ========================= ================================================ **User documentation:** http://pythonhosted.org/joblib **Download packages:** http://pypi.python.org/pypi/joblib#downloads **Source code:** http://github.com/joblib/joblib **Report issues:** http://github.com/joblib/joblib/issues ========================= ================================================ Vision -------- The vision is to provide tools to easily achieve better performance and reproducibility when working with long running jobs. * **Avoid computing twice the same thing**: code is rerun over an over, for instance when prototyping computational-heavy jobs (as in scientific development), but hand-crafted solution to alleviate this issue is error-prone and often leads to unreproducible results * **Persist to disk transparently**: persisting in an efficient way arbitrary objects containing large data is hard. Using joblib's caching mechanism avoids hand-written persistence and implicitly links the file on disk to the execution context of the original Python object. As a result, joblib's persistence is good for resuming an application status or computational job, eg after a crash. Joblib strives to address these problems while **leaving your code and your flow control as unmodified as possible** (no framework, no new paradigms). Main features ------------------ 1) **Transparent and fast disk-caching of output value:** a memoize or make-like functionality for Python functions that works well for arbitrary Python objects, including very large numpy arrays. Separate persistence and flow-execution logic from domain logic or algorithmic code by writing the operations as a set of steps with well-defined inputs and outputs: Python functions. Joblib can save their computation to disk and rerun it only if necessary:: >>> from joblib import Memory >>> mem = Memory(cachedir='/tmp/joblib') >>> import numpy as np >>> a = np.vander(np.arange(3)).astype(np.float) >>> square = mem.cache(np.square) >>> b = square(a) # doctest: +ELLIPSIS ________________________________________________________________________________ [Memory] Calling square... square(array([[ 0., 0., 1.], [ 1., 1., 1.], [ 4., 2., 1.]])) ___________________________________________________________square - 0...s, 0.0min >>> c = square(a) >>> # The above call did not trigger an evaluation 2) **Embarrassingly parallel helper:** to make it easy to write readable parallel code and debug it quickly:: >>> from joblib import Parallel, delayed >>> from math import sqrt >>> Parallel(n_jobs=1)(delayed(sqrt)(i**2) for i in range(10)) [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0] 3) **Logging/tracing:** The different functionalities will progressively acquire better logging mechanism to help track what has been ran, and capture I/O easily. In addition, Joblib will provide a few I/O primitives, to easily define logging and display streams, and provide a way of compiling a report. We want to be able to quickly inspect what has been run. 4) **Fast compressed Persistence**: a replacement for pickle to work efficiently on Python objects containing large data ( *joblib.dump* & *joblib.load* ). .. >>> import shutil ; shutil.rmtree('/tmp/joblib/') """ # PEP0440 compatible formatted version, see: # https://www.python.org/dev/peps/pep-0440/ # # Generic release markers: # X.Y # X.Y.Z # For bugfix releases # # Admissible pre-release markers: # X.YaN # Alpha release # X.YbN # Beta release # X.YrcN # Release Candidate # X.Y # Final release # # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # __version__ = '0.11' from .memory import Memory, MemorizedResult from .logger import PrintTime from .logger import Logger from .hashing import hash from .numpy_pickle import dump from .numpy_pickle import load from .parallel import Parallel from .parallel import delayed from .parallel import cpu_count from .parallel import register_parallel_backend from .parallel import parallel_backend from .parallel import effective_n_jobs __all__ = ['Memory', 'MemorizedResult', 'PrintTime', 'Logger', 'hash', 'dump', 'load', 'Parallel', 'delayed', 'cpu_count', 'effective_n_jobs', 'register_parallel_backend', 'parallel_backend'] joblib-0.11/joblib/_compat.py000066400000000000000000000006551305577265600161570ustar00rootroot00000000000000""" Compatibility layer for Python 3/Python 2 single codebase """ import sys PY3_OR_LATER = sys.version_info[0] >= 3 PY27 = sys.version_info[:2] == (2, 7) try: _basestring = basestring _bytes_or_unicode = (str, unicode) except NameError: _basestring = str _bytes_or_unicode = (bytes, str) def with_metaclass(meta, *bases): """Create a base class with a metaclass.""" return meta("NewBase", bases, {}) joblib-0.11/joblib/_memory_helpers.py000066400000000000000000000070261305577265600177250ustar00rootroot00000000000000try: # Available in Python 3 from tokenize import open as open_py_source except ImportError: # Copied from python3 tokenize from codecs import lookup, BOM_UTF8 import re from io import TextIOWrapper, open cookie_re = re.compile("coding[:=]\s*([-\w.]+)") def _get_normal_name(orig_enc): """Imitates get_normal_name in tokenizer.c.""" # Only care about the first 12 characters. enc = orig_enc[:12].lower().replace("_", "-") if enc == "utf-8" or enc.startswith("utf-8-"): return "utf-8" if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \ enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")): return "iso-8859-1" return orig_enc def _detect_encoding(readline): """ The detect_encoding() function is used to detect the encoding that should be used to decode a Python source file. It requires one argment, readline, in the same way as the tokenize() generator. It will call readline a maximum of twice, and return the encoding used (as a string) and a list of any lines (left as bytes) it has read in. It detects the encoding from the presence of a utf-8 bom or an encoding cookie as specified in pep-0263. If both a bom and a cookie are present, but disagree, a SyntaxError will be raised. If the encoding cookie is an invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found, 'utf-8-sig' is returned. If no encoding is specified, then the default of 'utf-8' will be returned. """ bom_found = False encoding = None default = 'utf-8' def read_or_stop(): try: return readline() except StopIteration: return b'' def find_cookie(line): try: line_string = line.decode('ascii') except UnicodeDecodeError: return None matches = cookie_re.findall(line_string) if not matches: return None encoding = _get_normal_name(matches[0]) try: codec = lookup(encoding) except LookupError: # This behaviour mimics the Python interpreter raise SyntaxError("unknown encoding: " + encoding) if bom_found: if codec.name != 'utf-8': # This behaviour mimics the Python interpreter raise SyntaxError('encoding problem: utf-8') encoding += '-sig' return encoding first = read_or_stop() if first.startswith(BOM_UTF8): bom_found = True first = first[3:] default = 'utf-8-sig' if not first: return default, [] encoding = find_cookie(first) if encoding: return encoding, [first] second = read_or_stop() if not second: return default, [first] encoding = find_cookie(second) if encoding: return encoding, [first, second] return default, [first, second] def open_py_source(filename): """Open a file in read only mode using the encoding detected by detect_encoding(). """ buffer = open(filename, 'rb') encoding, lines = _detect_encoding(buffer.readline) buffer.seek(0) text = TextIOWrapper(buffer, encoding, line_buffering=True) text.mode = 'r' return text joblib-0.11/joblib/_multiprocessing_helpers.py000066400000000000000000000022331305577265600216370ustar00rootroot00000000000000"""Helper module to factorize the conditional multiprocessing import logic We use a distinct module to simplify import statements and avoid introducing circular dependencies (for instance for the assert_spawning name). """ import os import warnings # Obtain possible configuration from the environment, assuming 1 (on) # by default, upon 0 set to None. Should instructively fail if some non # 0/1 value is set. mp = int(os.environ.get('JOBLIB_MULTIPROCESSING', 1)) or None if mp: try: import multiprocessing as mp except ImportError: mp = None # 2nd stage: validate that locking is available on the system and # issue a warning if not if mp is not None: try: _sem = mp.Semaphore() del _sem # cleanup except (ImportError, OSError) as e: mp = None warnings.warn('%s. joblib will operate in serial mode' % (e,)) # 3rd stage: backward compat for the assert_spawning helper if mp is not None: try: # Python 3.4+ from multiprocessing.context import assert_spawning except ImportError: from multiprocessing.forking import assert_spawning else: assert_spawning = None joblib-0.11/joblib/_parallel_backends.py000066400000000000000000000341151305577265600203200ustar00rootroot00000000000000""" Backends for embarrassingly parallel code. """ import gc import os import sys import warnings import threading from abc import ABCMeta, abstractmethod from .format_stack import format_exc from .my_exceptions import WorkerInterrupt, TransportableException from ._multiprocessing_helpers import mp from ._compat import with_metaclass if mp is not None: from .pool import MemmapingPool from multiprocessing.pool import ThreadPool class ParallelBackendBase(with_metaclass(ABCMeta)): """Helper abc which defines all methods a ParallelBackend must implement""" supports_timeout = False @abstractmethod def effective_n_jobs(self, n_jobs): """Determine the number of jobs that can actually run in parallel n_jobs is the number of workers requested by the callers. Passing n_jobs=-1 means requesting all available workers for instance matching the number of CPU cores on the worker host(s). This method should return a guesstimate of the number of workers that can actually perform work concurrently. The primary use case is to make it possible for the caller to know in how many chunks to slice the work. In general working on larger data chunks is more efficient (less scheduling overhead and better use of CPU cache prefetching heuristics) as long as all the workers have enough work to do. """ @abstractmethod def apply_async(self, func, callback=None): """Schedule a func to be run""" def configure(self, n_jobs=1, parallel=None, **backend_args): """Reconfigure the backend and return the number of workers. This makes it possible to reuse an existing backend instance for successive independent calls to Parallel with different parameters. """ self.parallel = parallel return self.effective_n_jobs(n_jobs) def terminate(self): """Shutdown the process or thread pool""" def compute_batch_size(self): """Determine the optimal batch size""" return 1 def batch_completed(self, batch_size, duration): """Callback indicate how long it took to run a batch""" def get_exceptions(self): """List of exception types to be captured.""" return [] def abort_everything(self, ensure_ready=True): """Abort any running tasks This is called when an exception has been raised when executing a tasks and all the remaining tasks will be ignored and can therefore be aborted to spare computation resources. If ensure_ready is True, the backend should be left in an operating state as future tasks might be re-submitted via that same backend instance. If ensure_ready is False, the implementer of this method can decide to leave the backend in a closed / terminated state as no new task are expected to be submitted to this backend. Setting ensure_ready to False is an optimization that can be leveraged when aborting tasks via killing processes from a local process pool managed by the backend it-self: if we expect no new tasks, there is no point in re-creating a new working pool. """ # Does nothing by default: to be overriden in subclasses when canceling # tasks is possible. pass class SequentialBackend(ParallelBackendBase): """A ParallelBackend which will execute all batches sequentially. Does not use/create any threading objects, and hence has minimal overhead. Used when n_jobs == 1. """ def effective_n_jobs(self, n_jobs): """Determine the number of jobs which are going to run in parallel""" if n_jobs == 0: raise ValueError('n_jobs == 0 in Parallel has no meaning') return 1 def apply_async(self, func, callback=None): """Schedule a func to be run""" result = ImmediateResult(func) if callback: callback(result) return result class PoolManagerMixin(object): """A helper class for managing pool of workers.""" def effective_n_jobs(self, n_jobs): """Determine the number of jobs which are going to run in parallel""" if n_jobs == 0: raise ValueError('n_jobs == 0 in Parallel has no meaning') elif mp is None or n_jobs is None: # multiprocessing is not available or disabled, fallback # to sequential mode return 1 elif n_jobs < 0: n_jobs = max(mp.cpu_count() + 1 + n_jobs, 1) return n_jobs def terminate(self): """Shutdown the process or thread pool""" if self._pool is not None: self._pool.close() self._pool.terminate() # terminate does a join() self._pool = None def apply_async(self, func, callback=None): """Schedule a func to be run""" return self._pool.apply_async(SafeFunction(func), callback=callback) def abort_everything(self, ensure_ready=True): """Shutdown the pool and restart a new one with the same parameters""" self.terminate() if ensure_ready: self.configure(n_jobs=self.parallel.n_jobs, parallel=self.parallel, **self.parallel._backend_args) class AutoBatchingMixin(object): """A helper class for automagically batching jobs.""" # In seconds, should be big enough to hide multiprocessing dispatching # overhead. # This settings was found by running benchmarks/bench_auto_batching.py # with various parameters on various platforms. MIN_IDEAL_BATCH_DURATION = .2 # Should not be too high to avoid stragglers: long jobs running alone # on a single worker while other workers have no work to process any more. MAX_IDEAL_BATCH_DURATION = 2 # Batching counters _effective_batch_size = 1 _smoothed_batch_duration = 0.0 def compute_batch_size(self): """Determine the optimal batch size""" old_batch_size = self._effective_batch_size batch_duration = self._smoothed_batch_duration if (batch_duration > 0 and batch_duration < self.MIN_IDEAL_BATCH_DURATION): # The current batch size is too small: the duration of the # processing of a batch of task is not large enough to hide # the scheduling overhead. ideal_batch_size = int(old_batch_size * self.MIN_IDEAL_BATCH_DURATION / batch_duration) # Multiply by two to limit oscilations between min and max. batch_size = max(2 * ideal_batch_size, 1) self._effective_batch_size = batch_size if self.parallel.verbose >= 10: self.parallel._print( "Batch computation too fast (%.4fs.) " "Setting batch_size=%d.", (batch_duration, batch_size)) elif (batch_duration > self.MAX_IDEAL_BATCH_DURATION and old_batch_size >= 2): # The current batch size is too big. If we schedule overly long # running batches some CPUs might wait with nothing left to do # while a couple of CPUs a left processing a few long running # batches. Better reduce the batch size a bit to limit the # likelihood of scheduling such stragglers. batch_size = old_batch_size // 2 self._effective_batch_size = batch_size if self.parallel.verbose >= 10: self.parallel._print( "Batch computation too slow (%.4fs.) " "Setting batch_size=%d.", (batch_duration, batch_size)) else: # No batch size adjustment batch_size = old_batch_size if batch_size != old_batch_size: # Reset estimation of the smoothed mean batch duration: this # estimate is updated in the multiprocessing apply_async # CallBack as long as the batch_size is constant. Therefore # we need to reset the estimate whenever we re-tune the batch # size. self._smoothed_batch_duration = 0 return batch_size def batch_completed(self, batch_size, duration): """Callback indicate how long it took to run a batch""" if batch_size == self._effective_batch_size: # Update the smoothed streaming estimate of the duration of a batch # from dispatch to completion old_duration = self._smoothed_batch_duration if old_duration == 0: # First record of duration for this batch size after the last # reset. new_duration = duration else: # Update the exponentially weighted average of the duration of # batch for the current effective size. new_duration = 0.8 * old_duration + 0.2 * duration self._smoothed_batch_duration = new_duration class ThreadingBackend(PoolManagerMixin, ParallelBackendBase): """A ParallelBackend which will use a thread pool to execute batches in. This is a low-overhead backend but it suffers from the Python Global Interpreter Lock if the called function relies a lot on Python objects. Mostly useful when the execution bottleneck is a compiled extension that explicitly releases the GIL (for instance a Cython loop wrapped in a "with nogil" block or an expensive call to a library such as NumPy). """ supports_timeout = True def configure(self, n_jobs=1, parallel=None, **backend_args): """Build a process or thread pool and return the number of workers""" n_jobs = self.effective_n_jobs(n_jobs) if n_jobs == 1: # Avoid unnecessary overhead and use sequential backend instead. raise FallbackToBackend(SequentialBackend()) self.parallel = parallel self._pool = ThreadPool(n_jobs) return n_jobs class MultiprocessingBackend(PoolManagerMixin, AutoBatchingMixin, ParallelBackendBase): """A ParallelBackend which will use a multiprocessing.Pool. Will introduce some communication and memory overhead when exchanging input and output data with the with the worker Python processes. However, does not suffer from the Python Global Interpreter Lock. """ # Environment variables to protect against bad situations when nesting JOBLIB_SPAWNED_PROCESS = "__JOBLIB_SPAWNED_PARALLEL__" supports_timeout = True def effective_n_jobs(self, n_jobs): """Determine the number of jobs which are going to run in parallel. This also checks if we are attempting to create a nested parallel loop. """ if mp is None: return 1 if mp.current_process().daemon: # Daemonic processes cannot have children if n_jobs != 1: warnings.warn( 'Multiprocessing-backed parallel loops cannot be nested,' ' setting n_jobs=1', stacklevel=3) return 1 if not isinstance(threading.current_thread(), threading._MainThread): # Prevent posix fork inside in non-main posix threads warnings.warn( 'Multiprocessing-backed parallel loops cannot be nested' ' below threads, setting n_jobs=1', stacklevel=3) return 1 return super(MultiprocessingBackend, self).effective_n_jobs(n_jobs) def configure(self, n_jobs=1, parallel=None, **backend_args): """Build a process or thread pool and return the number of workers""" n_jobs = self.effective_n_jobs(n_jobs) if n_jobs == 1: raise FallbackToBackend(SequentialBackend()) already_forked = int(os.environ.get(self.JOBLIB_SPAWNED_PROCESS, 0)) if already_forked: raise ImportError( '[joblib] Attempting to do parallel computing ' 'without protecting your import on a system that does ' 'not support forking. To use parallel-computing in a ' 'script, you must protect your main loop using "if ' "__name__ == '__main__'" '". Please see the joblib documentation on Parallel ' 'for more information') # Set an environment variable to avoid infinite loops os.environ[self.JOBLIB_SPAWNED_PROCESS] = '1' # Make sure to free as much memory as possible before forking gc.collect() self._pool = MemmapingPool(n_jobs, **backend_args) self.parallel = parallel return n_jobs def terminate(self): """Shutdown the process or thread pool""" super(MultiprocessingBackend, self).terminate() if self.JOBLIB_SPAWNED_PROCESS in os.environ: del os.environ[self.JOBLIB_SPAWNED_PROCESS] class ImmediateResult(object): def __init__(self, batch): # Don't delay the application, to avoid keeping the input # arguments in memory self.results = batch() def get(self): return self.results class SafeFunction(object): """Wrapper that handles the serialization of exception tracebacks. If an exception is triggered when calling the inner function, a copy of the full traceback is captured to make it possible to serialize it so that it can be rendered in a different Python process. """ def __init__(self, func): self.func = func def __call__(self, *args, **kwargs): try: return self.func(*args, **kwargs) except KeyboardInterrupt: # We capture the KeyboardInterrupt and reraise it as # something different, as multiprocessing does not # interrupt processing for a KeyboardInterrupt raise WorkerInterrupt() except: e_type, e_value, e_tb = sys.exc_info() text = format_exc(e_type, e_value, e_tb, context=10, tb_offset=1) raise TransportableException(text, e_type) class FallbackToBackend(Exception): """Raised when configuration should fallback to another backend""" def __init__(self, backend): self.backend = backend joblib-0.11/joblib/backports.py000066400000000000000000000050551305577265600165240ustar00rootroot00000000000000""" Backports of fixes for joblib dependencies """ import os import time import ctypes import sys from distutils.version import LooseVersion try: import numpy as np def make_memmap(filename, dtype='uint8', mode='r+', offset=0, shape=None, order='C'): """Backport of numpy memmap offset fix. See https://github.com/numpy/numpy/pull/8443 for more details. The numpy fix will be available in numpy 1.13. """ mm = np.memmap(filename, dtype=dtype, mode=mode, offset=offset, shape=shape, order=order) if LooseVersion(np.__version__) < '1.13': mm.offset = offset return mm except ImportError: def make_memmap(filename, dtype='uint8', mode='r+', offset=0, shape=None, order='C'): raise NotImplementedError( "'joblib.backports.make_memmap' should not be used " 'if numpy is not installed.') if os.name == 'nt': error_access_denied = 5 try: from os import replace except ImportError: # Python 2.7 def replace(src, dst): if not isinstance(src, unicode): # noqa src = unicode(src, sys.getfilesystemencoding()) # noqa if not isinstance(dst, unicode): # noqa dst = unicode(dst, sys.getfilesystemencoding()) # noqa movefile_replace_existing = 0x1 return_value = ctypes.windll.kernel32.MoveFileExW( src, dst, movefile_replace_existing) if return_value == 0: raise ctypes.WinError() def concurrency_safe_rename(src, dst): """Renames ``src`` into ``dst`` overwriting ``dst`` if it exists. On Windows os.replace (or for Python 2.7 its implementation through MoveFileExW) can yield permission errors if executed by two different processes. """ max_sleep_time = 1 total_sleep_time = 0 sleep_time = 0.001 while total_sleep_time < max_sleep_time: try: replace(src, dst) break except Exception as exc: if getattr(exc, 'winerror', None) == error_access_denied: time.sleep(sleep_time) total_sleep_time += sleep_time sleep_time *= 2 else: raise else: raise else: try: from os import replace as concurrency_safe_rename except ImportError: from os import rename as concurrency_safe_rename # noqa joblib-0.11/joblib/disk.py000066400000000000000000000062431305577265600154660ustar00rootroot00000000000000""" Disk management utilities. """ # Authors: Gael Varoquaux # Lars Buitinck # Copyright (c) 2010 Gael Varoquaux # License: BSD Style, 3 clauses. import errno import os import shutil import sys import time def disk_used(path): """ Return the disk usage in a directory.""" size = 0 for file in os.listdir(path) + ['.']: stat = os.stat(os.path.join(path, file)) if hasattr(stat, 'st_blocks'): size += stat.st_blocks * 512 else: # on some platform st_blocks is not available (e.g., Windows) # approximate by rounding to next multiple of 512 size += (stat.st_size // 512 + 1) * 512 # We need to convert to int to avoid having longs on some systems (we # don't want longs to avoid problems we SQLite) return int(size / 1024.) def memstr_to_bytes(text): """ Convert a memory text to its value in bytes. """ kilo = 1024 units = dict(K=kilo, M=kilo ** 2, G=kilo ** 3) try: size = int(units[text[-1]] * float(text[:-1])) except (KeyError, ValueError): raise ValueError( "Invalid literal for size give: %s (type %s) should be " "alike '10G', '500M', '50K'." % (text, type(text))) return size def mkdirp(d): """Ensure directory d exists (like mkdir -p on Unix) No guarantee that the directory is writable. """ try: os.makedirs(d) except OSError as e: if e.errno != errno.EEXIST: raise # if a rmtree operation fails in rm_subdirs, wait for this much time (in secs), # then retry once. if it still fails, raise the exception RM_SUBDIRS_RETRY_TIME = 0.1 def rm_subdirs(path, onerror=None): """Remove all subdirectories in this path. The directory indicated by `path` is left in place, and its subdirectories are erased. If onerror is set, it is called to handle the error with arguments (func, path, exc_info) where func is os.listdir, os.remove, or os.rmdir; path is the argument to that function that caused it to fail; and exc_info is a tuple returned by sys.exc_info(). If onerror is None, an exception is raised. """ # NOTE this code is adapted from the one in shutil.rmtree, and is # just as fast names = [] try: names = os.listdir(path) except os.error as err: if onerror is not None: onerror(os.listdir, path, sys.exc_info()) else: raise for name in names: fullname = os.path.join(path, name) if os.path.isdir(fullname): if onerror is not None: shutil.rmtree(fullname, False, onerror) else: # allow the rmtree to fail once, wait and re-try. # if the error is raised again, fail err_count = 0 while True: try: shutil.rmtree(fullname, False, None) break except os.error: if err_count > 0: raise err_count += 1 time.sleep(RM_SUBDIRS_RETRY_TIME) joblib-0.11/joblib/format_stack.py000066400000000000000000000344571305577265600172210ustar00rootroot00000000000000""" Represent an exception with a lot of information. Provides 2 useful functions: format_exc: format an exception into a complete traceback, with full debugging instruction. format_outer_frames: format the current position in the stack call. Adapted from IPython's VerboseTB. """ # Authors: Gael Varoquaux < gael dot varoquaux at normalesup dot org > # Nathaniel Gray # Fernando Perez # Copyright: 2010, Gael Varoquaux # 2001-2004, Fernando Perez # 2001 Nathaniel Gray # License: BSD 3 clause import inspect import keyword import linecache import os import pydoc import sys import time import tokenize import traceback try: # Python 2 generate_tokens = tokenize.generate_tokens except AttributeError: # Python 3 generate_tokens = tokenize.tokenize INDENT = ' ' * 8 ############################################################################### # some internal-use functions def safe_repr(value): """Hopefully pretty robust repr equivalent.""" # this is pretty horrible but should always return *something* try: return pydoc.text.repr(value) except KeyboardInterrupt: raise except: try: return repr(value) except KeyboardInterrupt: raise except: try: # all still in an except block so we catch # getattr raising name = getattr(value, '__name__', None) if name: # ick, recursion return safe_repr(name) klass = getattr(value, '__class__', None) if klass: return '%s instance' % safe_repr(klass) except KeyboardInterrupt: raise except: return 'UNRECOVERABLE REPR FAILURE' def eq_repr(value, repr=safe_repr): return '=%s' % repr(value) ############################################################################### def uniq_stable(elems): """uniq_stable(elems) -> list Return from an iterable, a list of all the unique elements in the input, but maintaining the order in which they first appear. A naive solution to this problem which just makes a dictionary with the elements as keys fails to respect the stability condition, since dictionaries are unsorted by nature. Note: All elements in the input must be hashable. """ unique = [] unique_set = set() for nn in elems: if nn not in unique_set: unique.append(nn) unique_set.add(nn) return unique ############################################################################### def fix_frame_records_filenames(records): """Try to fix the filenames in each record from inspect.getinnerframes(). Particularly, modules loaded from within zip files have useless filenames attached to their code object, and inspect.getinnerframes() just uses it. """ fixed_records = [] for frame, filename, line_no, func_name, lines, index in records: # Look inside the frame's globals dictionary for __file__, which should # be better. better_fn = frame.f_globals.get('__file__', None) if isinstance(better_fn, str): # Check the type just in case someone did something weird with # __file__. It might also be None if the error occurred during # import. filename = better_fn fixed_records.append((frame, filename, line_no, func_name, lines, index)) return fixed_records def _fixed_getframes(etb, context=1, tb_offset=0): LNUM_POS, LINES_POS, INDEX_POS = 2, 4, 5 records = fix_frame_records_filenames(inspect.getinnerframes(etb, context)) # If the error is at the console, don't build any context, since it would # otherwise produce 5 blank lines printed out (there is no file at the # console) rec_check = records[tb_offset:] try: rname = rec_check[0][1] if rname == '' or rname.endswith(''): return rec_check except IndexError: pass aux = traceback.extract_tb(etb) assert len(records) == len(aux) for i, (file, lnum, _, _) in enumerate(aux): maybe_start = lnum - 1 - context // 2 start = max(maybe_start, 0) end = start + context lines = linecache.getlines(file)[start:end] buf = list(records[i]) buf[LNUM_POS] = lnum buf[INDEX_POS] = lnum - 1 - start buf[LINES_POS] = lines records[i] = tuple(buf) return records[tb_offset:] def _format_traceback_lines(lnum, index, lines, lvals=None): numbers_width = 7 res = [] i = lnum - index for line in lines: if i == lnum: # This is the line with the error pad = numbers_width - len(str(i)) if pad >= 3: marker = '-' * (pad - 3) + '-> ' elif pad == 2: marker = '> ' elif pad == 1: marker = '>' else: marker = '' num = marker + str(i) else: num = '%*s' % (numbers_width, i) line = '%s %s' % (num, line) res.append(line) if lvals and i == lnum: res.append(lvals + '\n') i = i + 1 return res def format_records(records): # , print_globals=False): # Loop over all records printing context and info frames = [] abspath = os.path.abspath for frame, file, lnum, func, lines, index in records: try: file = file and abspath(file) or '?' except OSError: # if file is '' or something not in the filesystem, # the abspath call will throw an OSError. Just ignore it and # keep the original file string. pass if file.endswith('.pyc'): file = file[:-4] + '.py' link = file args, varargs, varkw, locals = inspect.getargvalues(frame) if func == '?': call = '' else: # Decide whether to include variable details or not try: call = 'in %s%s' % (func, inspect.formatargvalues(args, varargs, varkw, locals, formatvalue=eq_repr)) except KeyError: # Very odd crash from inspect.formatargvalues(). The # scenario under which it appeared was a call to # view(array,scale) in NumTut.view.view(), where scale had # been defined as a scalar (it should be a tuple). Somehow # inspect messes up resolving the argument list of view() # and barfs out. At some point I should dig into this one # and file a bug report about it. print("\nJoblib's exception reporting continues...\n") call = 'in %s(***failed resolving arguments***)' % func # Initialize a list of names on the current line, which the # tokenizer below will populate. names = [] def tokeneater(token_type, token, start, end, line): """Stateful tokeneater which builds dotted names. The list of names it appends to (from the enclosing scope) can contain repeated composite names. This is unavoidable, since there is no way to disambiguate partial dotted structures until the full list is known. The caller is responsible for pruning the final list of duplicates before using it.""" # build composite names if token == '.': try: names[-1] += '.' # store state so the next token is added for x.y.z names tokeneater.name_cont = True return except IndexError: pass if token_type == tokenize.NAME and token not in keyword.kwlist: if tokeneater.name_cont: # Dotted names names[-1] += token tokeneater.name_cont = False else: # Regular new names. We append everything, the caller # will be responsible for pruning the list later. It's # very tricky to try to prune as we go, b/c composite # names can fool us. The pruning at the end is easy # to do (or the caller can print a list with repeated # names if so desired. names.append(token) elif token_type == tokenize.NEWLINE: raise IndexError # we need to store a bit of state in the tokenizer to build # dotted names tokeneater.name_cont = False def linereader(file=file, lnum=[lnum], getline=linecache.getline): line = getline(file, lnum[0]) lnum[0] += 1 return line # Build the list of names on this line of code where the exception # occurred. try: # This builds the names list in-place by capturing it from the # enclosing scope. for token in generate_tokens(linereader): tokeneater(*token) except (IndexError, UnicodeDecodeError, SyntaxError): # signals exit of tokenizer # SyntaxError can happen when trying to tokenize # a compiled (e.g. .so or .pyd) extension pass except tokenize.TokenError as msg: _m = ("An unexpected error occurred while tokenizing input file %s\n" "The following traceback may be corrupted or invalid\n" "The error message is: %s\n" % (file, msg)) print(_m) # prune names list of duplicates, but keep the right order unique_names = uniq_stable(names) # Start loop over vars lvals = [] for name_full in unique_names: name_base = name_full.split('.', 1)[0] if name_base in frame.f_code.co_varnames: if name_base in locals.keys(): try: value = safe_repr(eval(name_full, locals)) except: value = "undefined" else: value = "undefined" name = name_full lvals.append('%s = %s' % (name, value)) #elif print_globals: # if frame.f_globals.has_key(name_base): # try: # value = safe_repr(eval(name_full,frame.f_globals)) # except: # value = "undefined" # else: # value = "undefined" # name = 'global %s' % name_full # lvals.append('%s = %s' % (name,value)) if lvals: lvals = '%s%s' % (INDENT, ('\n%s' % INDENT).join(lvals)) else: lvals = '' level = '%s\n%s %s\n' % (75 * '.', link, call) if index is None: frames.append(level) else: frames.append('%s%s' % (level, ''.join( _format_traceback_lines(lnum, index, lines, lvals)))) return frames ############################################################################### def format_exc(etype, evalue, etb, context=5, tb_offset=0): """ Return a nice text document describing the traceback. Parameters ----------- etype, evalue, etb: as returned by sys.exc_info context: number of lines of the source file to plot tb_offset: the number of stack frame not to use (0 = use all) """ # some locals try: etype = etype.__name__ except AttributeError: pass # Header with the exception type, python version, and date pyver = 'Python ' + sys.version.split()[0] + ': ' + sys.executable date = time.ctime(time.time()) pid = 'PID: %i' % os.getpid() head = '%s%s%s\n%s%s%s' % ( etype, ' ' * (75 - len(str(etype)) - len(date)), date, pid, ' ' * (75 - len(str(pid)) - len(pyver)), pyver) # Drop topmost frames if requested records = _fixed_getframes(etb, context, tb_offset) # Get (safely) a string form of the exception info try: etype_str, evalue_str = map(str, (etype, evalue)) except: # User exception is improperly defined. etype, evalue = str, sys.exc_info()[:2] etype_str, evalue_str = map(str, (etype, evalue)) # ... and format it exception = ['%s: %s' % (etype_str, evalue_str)] frames = format_records(records) return '%s\n%s\n%s' % (head, '\n'.join(frames), ''.join(exception[0])) ############################################################################### def format_outer_frames(context=5, stack_start=None, stack_end=None, ignore_ipython=True): LNUM_POS, LINES_POS, INDEX_POS = 2, 4, 5 records = inspect.getouterframes(inspect.currentframe()) output = list() for i, (frame, filename, line_no, func_name, lines, index) \ in enumerate(records): # Look inside the frame's globals dictionary for __file__, which should # be better. better_fn = frame.f_globals.get('__file__', None) if isinstance(better_fn, str): # Check the type just in case someone did something weird with # __file__. It might also be None if the error occurred during # import. filename = better_fn if filename.endswith('.pyc'): filename = filename[:-4] + '.py' if ignore_ipython: # Hack to avoid printing the internals of IPython if (os.path.basename(filename) in ('iplib.py', 'py3compat.py') and func_name in ('execfile', 'safe_execfile', 'runcode')): break maybe_start = line_no - 1 - context // 2 start = max(maybe_start, 0) end = start + context lines = linecache.getlines(filename)[start:end] buf = list(records[i]) buf[LNUM_POS] = line_no buf[INDEX_POS] = line_no - 1 - start buf[LINES_POS] = lines output.append(tuple(buf)) return '\n'.join(format_records(output[stack_end:stack_start:-1])) joblib-0.11/joblib/func_inspect.py000066400000000000000000000317061305577265600172160ustar00rootroot00000000000000""" My own variation on function-specific inspect-like features. """ # Author: Gael Varoquaux # Copyright (c) 2009 Gael Varoquaux # License: BSD Style, 3 clauses. from itertools import islice import inspect import warnings import re import os from ._compat import _basestring from .logger import pformat from ._memory_helpers import open_py_source from ._compat import PY3_OR_LATER def get_func_code(func): """ Attempts to retrieve a reliable function code hash. The reason we don't use inspect.getsource is that it caches the source, whereas we want this to be modified on the fly when the function is modified. Returns ------- func_code: string The function code source_file: string The path to the file in which the function is defined. first_line: int The first line of the code in the source file. Notes ------ This function does a bit more magic than inspect, and is thus more robust. """ source_file = None try: code = func.__code__ source_file = code.co_filename if not os.path.exists(source_file): # Use inspect for lambda functions and functions defined in an # interactive shell, or in doctests source_code = ''.join(inspect.getsourcelines(func)[0]) line_no = 1 if source_file.startswith('', source_file).groups() line_no = int(line_no) source_file = '' % source_file return source_code, source_file, line_no # Try to retrieve the source code. with open_py_source(source_file) as source_file_obj: first_line = code.co_firstlineno # All the lines after the function definition: source_lines = list(islice(source_file_obj, first_line - 1, None)) return ''.join(inspect.getblock(source_lines)), source_file, first_line except: # If the source code fails, we use the hash. This is fragile and # might change from one session to another. if hasattr(func, '__code__'): # Python 3.X return str(func.__code__.__hash__()), source_file, -1 else: # Weird objects like numpy ufunc don't have __code__ # This is fragile, as quite often the id of the object is # in the repr, so it might not persist across sessions, # however it will work for ufuncs. return repr(func), source_file, -1 def _clean_win_chars(string): """Windows cannot encode some characters in filename.""" import urllib if hasattr(urllib, 'quote'): quote = urllib.quote else: # In Python 3, quote is elsewhere import urllib.parse quote = urllib.parse.quote for char in ('<', '>', '!', ':', '\\'): string = string.replace(char, quote(char)) return string def get_func_name(func, resolv_alias=True, win_characters=True): """ Return the function import path (as a list of module names), and a name for the function. Parameters ---------- func: callable The func to inspect resolv_alias: boolean, optional If true, possible local aliases are indicated. win_characters: boolean, optional If true, substitute special characters using urllib.quote This is useful in Windows, as it cannot encode some filenames """ if hasattr(func, '__module__'): module = func.__module__ else: try: module = inspect.getmodule(func) except TypeError: if hasattr(func, '__class__'): module = func.__class__.__module__ else: module = 'unknown' if module is None: # Happens in doctests, eg module = '' if module == '__main__': try: filename = os.path.abspath(inspect.getsourcefile(func)) except: filename = None if filename is not None: # mangling of full path to filename parts = filename.split(os.sep) if parts[-1].startswith(' 1500: formatted_arg = '%s...' % formatted_arg[:700] return formatted_arg def format_signature(func, *args, **kwargs): # XXX: Should this use inspect.formatargvalues/formatargspec? module, name = get_func_name(func) module = [m for m in module if m] if module: module.append(name) module_path = '.'.join(module) else: module_path = name arg_str = list() previous_length = 0 for arg in args: formatted_arg = _format_arg(arg) if previous_length > 80: formatted_arg = '\n%s' % formatted_arg previous_length = len(formatted_arg) arg_str.append(formatted_arg) arg_str.extend(['%s=%s' % (v, _format_arg(i)) for v, i in kwargs.items()]) arg_str = ', '.join(arg_str) signature = '%s(%s)' % (name, arg_str) return module_path, signature def format_call(func, args, kwargs, object_name="Memory"): """ Returns a nicely formatted statement displaying the function call with the given arguments. """ path, signature = format_signature(func, *args, **kwargs) msg = '%s\n[%s] Calling %s...\n%s' % (80 * '_', object_name, path, signature) return msg # XXX: Not using logging framework # self.debug(msg) joblib-0.11/joblib/hashing.py000066400000000000000000000236631305577265600161620ustar00rootroot00000000000000""" Fast cryptographic hash of Python objects, with a special case for fast hashing of numpy arrays. """ # Author: Gael Varoquaux # Copyright (c) 2009 Gael Varoquaux # License: BSD Style, 3 clauses. import pickle import hashlib import sys import types import struct import io import decimal from ._compat import _bytes_or_unicode, PY3_OR_LATER if PY3_OR_LATER: Pickler = pickle._Pickler else: Pickler = pickle.Pickler class _ConsistentSet(object): """ Class used to ensure the hash of Sets is preserved whatever the order of its items. """ def __init__(self, set_sequence): # Forces order of elements in set to ensure consistent hash. try: # Trying first to order the set assuming the type of elements is # consistent and orderable. # This fails on python 3 when elements are unorderable # but we keep it in a try as it's faster. self._sequence = sorted(set_sequence) except (TypeError, decimal.InvalidOperation): # If elements are unorderable, sorting them using their hash. # This is slower but works in any case. self._sequence = sorted((hash(e) for e in set_sequence)) class _MyHash(object): """ Class used to hash objects that won't normally pickle """ def __init__(self, *args): self.args = args class Hasher(Pickler): """ A subclass of pickler, to do cryptographic hashing, rather than pickling. """ def __init__(self, hash_name='md5'): self.stream = io.BytesIO() # By default we want a pickle protocol that only changes with # the major python version and not the minor one protocol = (pickle.DEFAULT_PROTOCOL if PY3_OR_LATER else pickle.HIGHEST_PROTOCOL) Pickler.__init__(self, self.stream, protocol=protocol) # Initialise the hash obj self._hash = hashlib.new(hash_name) def hash(self, obj, return_digest=True): try: self.dump(obj) except pickle.PicklingError as e: e.args += ('PicklingError while hashing %r: %r' % (obj, e),) raise dumps = self.stream.getvalue() self._hash.update(dumps) if return_digest: return self._hash.hexdigest() def save(self, obj): if isinstance(obj, (types.MethodType, type({}.pop))): # the Pickler cannot pickle instance methods; here we decompose # them into components that make them uniquely identifiable if hasattr(obj, '__func__'): func_name = obj.__func__.__name__ else: func_name = obj.__name__ inst = obj.__self__ if type(inst) == type(pickle): obj = _MyHash(func_name, inst.__name__) elif inst is None: # type(None) or type(module) do not pickle obj = _MyHash(func_name, inst) else: cls = obj.__self__.__class__ obj = _MyHash(func_name, inst, cls) Pickler.save(self, obj) def memoize(self, obj): # We want hashing to be sensitive to value instead of reference. # For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]] # to hash to the same value and that's why we disable memoization # for strings if isinstance(obj, _bytes_or_unicode): return Pickler.memoize(self, obj) # The dispatch table of the pickler is not accessible in Python # 3, as these lines are only bugware for IPython, we skip them. def save_global(self, obj, name=None, pack=struct.pack): # We have to override this method in order to deal with objects # defined interactively in IPython that are not injected in # __main__ kwargs = dict(name=name, pack=pack) if sys.version_info >= (3, 4): del kwargs['pack'] try: Pickler.save_global(self, obj, **kwargs) except pickle.PicklingError: Pickler.save_global(self, obj, **kwargs) module = getattr(obj, "__module__", None) if module == '__main__': my_name = name if my_name is None: my_name = obj.__name__ mod = sys.modules[module] if not hasattr(mod, my_name): # IPython doesn't inject the variables define # interactively in __main__ setattr(mod, my_name, obj) dispatch = Pickler.dispatch.copy() # builtin dispatch[type(len)] = save_global # type dispatch[type(object)] = save_global # classobj dispatch[type(Pickler)] = save_global # function dispatch[type(pickle.dump)] = save_global def _batch_setitems(self, items): # forces order of keys in dict to ensure consistent hash. try: # Trying first to compare dict assuming the type of keys is # consistent and orderable. # This fails on python 3 when keys are unorderable # but we keep it in a try as it's faster. Pickler._batch_setitems(self, iter(sorted(items))) except TypeError: # If keys are unorderable, sorting them using their hash. This is # slower but works in any case. Pickler._batch_setitems(self, iter(sorted((hash(k), v) for k, v in items))) def save_set(self, set_items): # forces order of items in Set to ensure consistent hash Pickler.save(self, _ConsistentSet(set_items)) dispatch[type(set())] = save_set class NumpyHasher(Hasher): """ Special case the hasher for when numpy is loaded. """ def __init__(self, hash_name='md5', coerce_mmap=False): """ Parameters ---------- hash_name: string The hash algorithm to be used coerce_mmap: boolean Make no difference between np.memmap and np.ndarray objects. """ self.coerce_mmap = coerce_mmap Hasher.__init__(self, hash_name=hash_name) # delayed import of numpy, to avoid tight coupling import numpy as np self.np = np if hasattr(np, 'getbuffer'): self._getbuffer = np.getbuffer else: self._getbuffer = memoryview def save(self, obj): """ Subclass the save method, to hash ndarray subclass, rather than pickling them. Off course, this is a total abuse of the Pickler class. """ if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject: # Compute a hash of the object # The update function of the hash requires a c_contiguous buffer. if obj.shape == (): # 0d arrays need to be flattened because viewing them as bytes # raises a ValueError exception. obj_c_contiguous = obj.flatten() elif obj.flags.c_contiguous: obj_c_contiguous = obj elif obj.flags.f_contiguous: obj_c_contiguous = obj.T else: # Cater for non-single-segment arrays: this creates a # copy, and thus aleviates this issue. # XXX: There might be a more efficient way of doing this obj_c_contiguous = obj.flatten() # memoryview is not supported for some dtypes, e.g. datetime64, see # https://github.com/numpy/numpy/issues/4983. The # workaround is to view the array as bytes before # taking the memoryview. self._hash.update( self._getbuffer(obj_c_contiguous.view(self.np.uint8))) # We store the class, to be able to distinguish between # Objects with the same binary content, but different # classes. if self.coerce_mmap and isinstance(obj, self.np.memmap): # We don't make the difference between memmap and # normal ndarrays, to be able to reload previously # computed results with memmap. klass = self.np.ndarray else: klass = obj.__class__ # We also return the dtype and the shape, to distinguish # different views on the same data with different dtypes. # The object will be pickled by the pickler hashed at the end. obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides)) elif isinstance(obj, self.np.dtype): # Atomic dtype objects are interned by their default constructor: # np.dtype('f8') is np.dtype('f8') # This interning is not maintained by a # pickle.loads + pickle.dumps cycle, because __reduce__ # uses copy=True in the dtype constructor. This # non-deterministic behavior causes the internal memoizer # of the hasher to generate different hash values # depending on the history of the dtype object. # To prevent the hash from being sensitive to this, we use # .descr which is a full (and never interned) description of # the array dtype according to the numpy doc. klass = obj.__class__ obj = (klass, ('HASHED', obj.descr)) Hasher.save(self, obj) def hash(obj, hash_name='md5', coerce_mmap=False): """ Quick calculation of a hash to identify uniquely Python objects containing numpy arrays. Parameters ----------- hash_name: 'md5' or 'sha1' Hashing algorithm used. sha1 is supposedly safer, but md5 is faster. coerce_mmap: boolean Make no difference between np.memmap and np.ndarray """ if 'numpy' in sys.modules: hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap) else: hasher = Hasher(hash_name=hash_name) return hasher.hash(obj) joblib-0.11/joblib/logger.py000066400000000000000000000120221305577265600160030ustar00rootroot00000000000000""" Helpers for logging. This module needs much love to become useful. """ # Author: Gael Varoquaux # Copyright (c) 2008 Gael Varoquaux # License: BSD Style, 3 clauses. from __future__ import print_function import time import sys import os import shutil import logging import pprint from .disk import mkdirp def _squeeze_time(t): """Remove .1s to the time under Windows: this is the time it take to stat files. This is needed to make results similar to timings under Unix, for tests """ if sys.platform.startswith('win'): return max(0, t - .1) else: return t def format_time(t): t = _squeeze_time(t) return "%.1fs, %.1fmin" % (t, t / 60.) def short_format_time(t): t = _squeeze_time(t) if t > 60: return "%4.1fmin" % (t / 60.) else: return " %5.1fs" % (t) def pformat(obj, indent=0, depth=3): if 'numpy' in sys.modules: import numpy as np print_options = np.get_printoptions() np.set_printoptions(precision=6, threshold=64, edgeitems=1) else: print_options = None out = pprint.pformat(obj, depth=depth, indent=indent) if print_options: np.set_printoptions(**print_options) return out ############################################################################### # class `Logger` ############################################################################### class Logger(object): """ Base class for logging messages. """ def __init__(self, depth=3): """ Parameters ---------- depth: int, optional The depth of objects printed. """ self.depth = depth def warn(self, msg): logging.warning("[%s]: %s" % (self, msg)) def debug(self, msg): # XXX: This conflicts with the debug flag used in children class logging.debug("[%s]: %s" % (self, msg)) def format(self, obj, indent=0): """ Return the formated representation of the object. """ return pformat(obj, indent=indent, depth=self.depth) ############################################################################### # class `PrintTime` ############################################################################### class PrintTime(object): """ Print and log messages while keeping track of time. """ def __init__(self, logfile=None, logdir=None): if logfile is not None and logdir is not None: raise ValueError('Cannot specify both logfile and logdir') # XXX: Need argument docstring self.last_time = time.time() self.start_time = self.last_time if logdir is not None: logfile = os.path.join(logdir, 'joblib.log') self.logfile = logfile if logfile is not None: mkdirp(os.path.dirname(logfile)) if os.path.exists(logfile): # Rotate the logs for i in range(1, 9): try: shutil.move(logfile + '.%i' % i, logfile + '.%i' % (i + 1)) except: "No reason failing here" # Use a copy rather than a move, so that a process # monitoring this file does not get lost. try: shutil.copy(logfile, logfile + '.1') except: "No reason failing here" try: with open(logfile, 'w') as logfile: logfile.write('\nLogging joblib python script\n') logfile.write('\n---%s---\n' % time.ctime(self.last_time)) except: """ Multiprocessing writing to files can create race conditions. Rather fail silently than crash the computation. """ # XXX: We actually need a debug flag to disable this # silent failure. def __call__(self, msg='', total=False): """ Print the time elapsed between the last call and the current call, with an optional message. """ if not total: time_lapse = time.time() - self.last_time full_msg = "%s: %s" % (msg, format_time(time_lapse)) else: # FIXME: Too much logic duplicated time_lapse = time.time() - self.start_time full_msg = "%s: %.2fs, %.1f min" % (msg, time_lapse, time_lapse / 60) print(full_msg, file=sys.stderr) if self.logfile is not None: try: with open(self.logfile, 'a') as f: print(full_msg, file=f) except: """ Multiprocessing writing to files can create race conditions. Rather fail silently than crash the calculation. """ # XXX: We actually need a debug flag to disable this # silent failure. self.last_time = time.time() joblib-0.11/joblib/memory.py000066400000000000000000001146151305577265600160470ustar00rootroot00000000000000""" A context object for caching a function's return value each time it is called with the same input arguments. """ # Author: Gael Varoquaux # Copyright (c) 2009 Gael Varoquaux # License: BSD Style, 3 clauses. from __future__ import with_statement import os import shutil import time import pydoc import re import functools import traceback import warnings import inspect import json import weakref import io import operator import collections import datetime import threading # Local imports from . import hashing from .func_inspect import get_func_code, get_func_name, filter_args from .func_inspect import format_call from .func_inspect import format_signature from ._memory_helpers import open_py_source from .logger import Logger, format_time, pformat from . import numpy_pickle from .disk import mkdirp, rm_subdirs, memstr_to_bytes from ._compat import _basestring, PY3_OR_LATER from .backports import concurrency_safe_rename FIRST_LINE_TEXT = "# first line:" CacheItemInfo = collections.namedtuple('CacheItemInfo', 'path size last_access') # TODO: The following object should have a data store object as a sub # object, and the interface to persist and query should be separated in # the data store. # # This would enable creating 'Memory' objects with a different logic for # pickling that would simply span a MemorizedFunc with the same # store (or do we want to copy it to avoid cross-talks?), for instance to # implement HDF5 pickling. # TODO: Same remark for the logger, and probably use the Python logging # mechanism. def extract_first_line(func_code): """ Extract the first line information from the function code text if available. """ if func_code.startswith(FIRST_LINE_TEXT): func_code = func_code.split('\n') first_line = int(func_code[0][len(FIRST_LINE_TEXT):]) func_code = '\n'.join(func_code[1:]) else: first_line = -1 return func_code, first_line class JobLibCollisionWarning(UserWarning): """ Warn that there might be a collision between names of functions. """ def _get_func_fullname(func): """Compute the part of part associated with a function. See code of_cache_key_to_dir() for details """ modules, funcname = get_func_name(func) modules.append(funcname) return os.path.join(*modules) def _cache_key_to_dir(cachedir, func, argument_hash): """Compute directory associated with a given cache key. func can be a function or a string as returned by _get_func_fullname(). """ parts = [cachedir] if isinstance(func, _basestring): parts.append(func) else: parts.append(_get_func_fullname(func)) if argument_hash is not None: parts.append(argument_hash) return os.path.join(*parts) def _load_output(output_dir, func_name, timestamp=None, metadata=None, mmap_mode=None, verbose=0): """Load output of a computation.""" if verbose > 1: signature = "" try: if metadata is not None: args = ", ".join(['%s=%s' % (name, value) for name, value in metadata['input_args'].items()]) signature = "%s(%s)" % (os.path.basename(func_name), args) else: signature = os.path.basename(func_name) except KeyError: pass if timestamp is not None: t = "% 16s" % format_time(time.time() - timestamp) else: t = "" if verbose < 10: print('[Memory]%s: Loading %s...' % (t, str(signature))) else: print('[Memory]%s: Loading %s from %s' % ( t, str(signature), output_dir)) filename = os.path.join(output_dir, 'output.pkl') if not os.path.isfile(filename): raise KeyError( "Non-existing cache value (may have been cleared).\n" "File %s does not exist" % filename) result = numpy_pickle.load(filename, mmap_mode=mmap_mode) return result def _get_cache_items(root_path): """Get cache information for reducing the size of the cache.""" cache_items = [] for dirpath, dirnames, filenames in os.walk(root_path): is_cache_hash_dir = re.match('[a-f0-9]{32}', os.path.basename(dirpath)) if is_cache_hash_dir: output_filename = os.path.join(dirpath, 'output.pkl') try: last_access = os.path.getatime(output_filename) except OSError: try: last_access = os.path.getatime(dirpath) except OSError: # The directory has already been deleted continue last_access = datetime.datetime.fromtimestamp(last_access) try: full_filenames = [os.path.join(dirpath, fn) for fn in filenames] dirsize = sum(os.path.getsize(fn) for fn in full_filenames) except OSError: # Either output_filename or one of the files in # dirpath does not exist any more. We assume this # directory is being cleaned by another process already continue cache_items.append(CacheItemInfo(dirpath, dirsize, last_access)) return cache_items def _get_cache_items_to_delete(root_path, bytes_limit): """Get cache items to delete to keep the cache under a size limit.""" if isinstance(bytes_limit, _basestring): bytes_limit = memstr_to_bytes(bytes_limit) cache_items = _get_cache_items(root_path) cache_size = sum(item.size for item in cache_items) to_delete_size = cache_size - bytes_limit if to_delete_size < 0: return [] # We want to delete first the cache items that were accessed a # long time ago cache_items.sort(key=operator.attrgetter('last_access')) cache_items_to_delete = [] size_so_far = 0 for item in cache_items: if size_so_far > to_delete_size: break cache_items_to_delete.append(item) size_so_far += item.size return cache_items_to_delete def concurrency_safe_write(to_write, filename, write_func): """Writes an object into a file in a concurrency-safe way.""" thread_id = id(threading.current_thread()) temporary_filename = '{}.thread-{}-pid-{}'.format( filename, thread_id, os.getpid()) write_func(to_write, temporary_filename) concurrency_safe_rename(temporary_filename, filename) # An in-memory store to avoid looking at the disk-based function # source code to check if a function definition has changed _FUNCTION_HASHES = weakref.WeakKeyDictionary() ############################################################################### # class `MemorizedResult` ############################################################################### class MemorizedResult(Logger): """Object representing a cached value. Attributes ---------- cachedir: string path to root of joblib cache func: function or string function whose output is cached. The string case is intended only for instanciation based on the output of repr() on another instance. (namely eval(repr(memorized_instance)) works). argument_hash: string hash of the function arguments mmap_mode: {None, 'r+', 'r', 'w+', 'c'} The memmapping mode used when loading from cache numpy arrays. See numpy.load for the meaning of the different values. verbose: int verbosity level (0 means no message) timestamp, metadata: string for internal use only """ def __init__(self, cachedir, func, argument_hash, mmap_mode=None, verbose=0, timestamp=None, metadata=None): Logger.__init__(self) if isinstance(func, _basestring): self.func = func else: self.func = _get_func_fullname(func) self.argument_hash = argument_hash self.cachedir = cachedir self.mmap_mode = mmap_mode self._output_dir = _cache_key_to_dir(cachedir, self.func, argument_hash) if metadata is not None: self.metadata = metadata else: self.metadata = {} # No error is relevant here. try: with open(os.path.join(self._output_dir, 'metadata.json'), 'rb') as f: self.metadata = json.load(f) except: pass self.duration = self.metadata.get('duration', None) self.verbose = verbose self.timestamp = timestamp def get(self): """Read value from cache and return it.""" return _load_output(self._output_dir, _get_func_fullname(self.func), timestamp=self.timestamp, metadata=self.metadata, mmap_mode=self.mmap_mode, verbose=self.verbose) def clear(self): """Clear value from cache""" shutil.rmtree(self._output_dir, ignore_errors=True) def __repr__(self): return ('{class_name}(cachedir="{cachedir}", func="{func}", ' 'argument_hash="{argument_hash}")'.format( class_name=self.__class__.__name__, cachedir=self.cachedir, func=self.func, argument_hash=self.argument_hash )) def __reduce__(self): return (self.__class__, (self.cachedir, self.func, self.argument_hash), {'mmap_mode': self.mmap_mode}) class NotMemorizedResult(object): """Class representing an arbitrary value. This class is a replacement for MemorizedResult when there is no cache. """ __slots__ = ('value', 'valid') def __init__(self, value): self.value = value self.valid = True def get(self): if self.valid: return self.value else: raise KeyError("No value stored.") def clear(self): self.valid = False self.value = None def __repr__(self): if self.valid: return '{class_name}({value})'.format( class_name=self.__class__.__name__, value=pformat(self.value) ) else: return self.__class__.__name__ + ' with no value' # __getstate__ and __setstate__ are required because of __slots__ def __getstate__(self): return {"valid": self.valid, "value": self.value} def __setstate__(self, state): self.valid = state["valid"] self.value = state["value"] ############################################################################### # class `NotMemorizedFunc` ############################################################################### class NotMemorizedFunc(object): """No-op object decorating a function. This class replaces MemorizedFunc when there is no cache. It provides an identical API but does not write anything on disk. Attributes ---------- func: callable Original undecorated function. """ # Should be a light as possible (for speed) def __init__(self, func): self.func = func def __call__(self, *args, **kwargs): return self.func(*args, **kwargs) def call_and_shelve(self, *args, **kwargs): return NotMemorizedResult(self.func(*args, **kwargs)) def __reduce__(self): return (self.__class__, (self.func,)) def __repr__(self): return '%s(func=%s)' % ( self.__class__.__name__, self.func ) def clear(self, warn=True): # Argument "warn" is for compatibility with MemorizedFunc.clear pass ############################################################################### # class `MemorizedFunc` ############################################################################### class MemorizedFunc(Logger): """ Callable object decorating a function for caching its return value each time it is called. All values are cached on the filesystem, in a deep directory structure. Methods are provided to inspect the cache or clean it. Attributes ---------- func: callable The original, undecorated, function. cachedir: string Path to the base cache directory of the memory context. ignore: list or None List of variable names to ignore when choosing whether to recompute. mmap_mode: {None, 'r+', 'r', 'w+', 'c'} The memmapping mode used when loading from cache numpy arrays. See numpy.load for the meaning of the different values. compress: boolean, or integer Whether to zip the stored data on disk. If an integer is given, it should be between 1 and 9, and sets the amount of compression. Note that compressed arrays cannot be read by memmapping. verbose: int, optional The verbosity flag, controls messages that are issued as the function is evaluated. """ #------------------------------------------------------------------------- # Public interface #------------------------------------------------------------------------- def __init__(self, func, cachedir, ignore=None, mmap_mode=None, compress=False, verbose=1, timestamp=None): """ Parameters ---------- func: callable The function to decorate cachedir: string The path of the base directory to use as a data store ignore: list or None List of variable names to ignore. mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional The memmapping mode used when loading from cache numpy arrays. See numpy.load for the meaning of the arguments. compress : boolean, or integer Whether to zip the stored data on disk. If an integer is given, it should be between 1 and 9, and sets the amount of compression. Note that compressed arrays cannot be read by memmapping. verbose: int, optional Verbosity flag, controls the debug messages that are issued as functions are evaluated. The higher, the more verbose timestamp: float, optional The reference time from which times in tracing messages are reported. """ Logger.__init__(self) self.mmap_mode = mmap_mode self.func = func if ignore is None: ignore = [] self.ignore = ignore self._verbose = verbose self.cachedir = cachedir self.compress = compress if compress and self.mmap_mode is not None: warnings.warn('Compressed results cannot be memmapped', stacklevel=2) if timestamp is None: timestamp = time.time() self.timestamp = timestamp mkdirp(self.cachedir) try: functools.update_wrapper(self, func) except: " Objects like ufunc don't like that " if inspect.isfunction(func): doc = pydoc.TextDoc().document(func) # Remove blank line doc = doc.replace('\n', '\n\n', 1) # Strip backspace-overprints for compatibility with autodoc doc = re.sub('\x08.', '', doc) else: # Pydoc does a poor job on other objects doc = func.__doc__ self.__doc__ = 'Memoized version of %s' % doc def _cached_call(self, args, kwargs): """Call wrapped function and cache result, or read cache if available. This function returns the wrapped function output and some metadata. Returns ------- output: value or tuple what is returned by wrapped function argument_hash: string hash of function arguments metadata: dict some metadata about wrapped function call (see _persist_input()) """ # Compare the function code with the previous to see if the # function code has changed output_dir, argument_hash = self._get_output_dir(*args, **kwargs) metadata = None output_pickle_path = os.path.join(output_dir, 'output.pkl') # FIXME: The statements below should be try/excepted if not (self._check_previous_func_code(stacklevel=4) and os.path.isfile(output_pickle_path)): if self._verbose > 10: _, name = get_func_name(self.func) self.warn('Computing func %s, argument hash %s in ' 'directory %s' % (name, argument_hash, output_dir)) out, metadata = self.call(*args, **kwargs) if self.mmap_mode is not None: # Memmap the output at the first call to be consistent with # later calls out = _load_output(output_dir, _get_func_fullname(self.func), timestamp=self.timestamp, mmap_mode=self.mmap_mode, verbose=self._verbose) else: try: t0 = time.time() out = _load_output(output_dir, _get_func_fullname(self.func), timestamp=self.timestamp, metadata=metadata, mmap_mode=self.mmap_mode, verbose=self._verbose) if self._verbose > 4: t = time.time() - t0 _, name = get_func_name(self.func) msg = '%s cache loaded - %s' % (name, format_time(t)) print(max(0, (80 - len(msg))) * '_' + msg) except Exception: # XXX: Should use an exception logger _, signature = format_signature(self.func, *args, **kwargs) self.warn('Exception while loading results for ' '{}\n {}'.format( signature, traceback.format_exc())) out, metadata = self.call(*args, **kwargs) argument_hash = None return (out, argument_hash, metadata) def call_and_shelve(self, *args, **kwargs): """Call wrapped function, cache result and return a reference. This method returns a reference to the cached result instead of the result itself. The reference object is small and pickeable, allowing to send or store it easily. Call .get() on reference object to get result. Returns ------- cached_result: MemorizedResult or NotMemorizedResult reference to the value returned by the wrapped function. The class "NotMemorizedResult" is used when there is no cache activated (e.g. cachedir=None in Memory). """ _, argument_hash, metadata = self._cached_call(args, kwargs) return MemorizedResult(self.cachedir, self.func, argument_hash, metadata=metadata, verbose=self._verbose - 1, timestamp=self.timestamp) def __call__(self, *args, **kwargs): return self._cached_call(args, kwargs)[0] def __reduce__(self): """ We don't store the timestamp when pickling, to avoid the hash depending from it. In addition, when unpickling, we run the __init__ """ return (self.__class__, (self.func, self.cachedir, self.ignore, self.mmap_mode, self.compress, self._verbose)) #------------------------------------------------------------------------- # Private interface #------------------------------------------------------------------------- def _get_argument_hash(self, *args, **kwargs): return hashing.hash(filter_args(self.func, self.ignore, args, kwargs), coerce_mmap=(self.mmap_mode is not None)) def _get_output_dir(self, *args, **kwargs): """ Return the directory in which are persisted the result of the function called with the given arguments. """ argument_hash = self._get_argument_hash(*args, **kwargs) output_dir = os.path.join(self._get_func_dir(self.func), argument_hash) return output_dir, argument_hash get_output_dir = _get_output_dir # backward compatibility def _get_func_dir(self, mkdir=True): """ Get the directory corresponding to the cache for the function. """ func_dir = _cache_key_to_dir(self.cachedir, self.func, None) if mkdir: mkdirp(func_dir) return func_dir def _hash_func(self): """Hash a function to key the online cache""" func_code_h = hash(getattr(self.func, '__code__', None)) return id(self.func), hash(self.func), func_code_h def _write_func_code(self, filename, func_code, first_line): """ Write the function code and the filename to a file. """ # We store the first line because the filename and the function # name is not always enough to identify a function: people # sometimes have several functions named the same way in a # file. This is bad practice, but joblib should be robust to bad # practice. func_code = u'%s %i\n%s' % (FIRST_LINE_TEXT, first_line, func_code) with io.open(filename, 'w', encoding="UTF-8") as out: out.write(func_code) # Also store in the in-memory store of function hashes is_named_callable = False if PY3_OR_LATER: is_named_callable = (hasattr(self.func, '__name__') and self.func.__name__ != '') else: is_named_callable = (hasattr(self.func, 'func_name') and self.func.func_name != '') if is_named_callable: # Don't do this for lambda functions or strange callable # objects, as it ends up being too fragile func_hash = self._hash_func() try: _FUNCTION_HASHES[self.func] = func_hash except TypeError: # Some callable are not hashable pass def _check_previous_func_code(self, stacklevel=2): """ stacklevel is the depth a which this function is called, to issue useful warnings to the user. """ # First check if our function is in the in-memory store. # Using the in-memory store not only makes things faster, but it # also renders us robust to variations of the files when the # in-memory version of the code does not vary try: if self.func in _FUNCTION_HASHES: # We use as an identifier the id of the function and its # hash. This is more likely to falsely change than have hash # collisions, thus we are on the safe side. func_hash = self._hash_func() if func_hash == _FUNCTION_HASHES[self.func]: return True except TypeError: # Some callables are not hashable pass # Here, we go through some effort to be robust to dynamically # changing code and collision. We cannot inspect.getsource # because it is not reliable when using IPython's magic "%run". func_code, source_file, first_line = get_func_code(self.func) func_dir = self._get_func_dir() func_code_file = os.path.join(func_dir, 'func_code.py') try: with io.open(func_code_file, encoding="UTF-8") as infile: old_func_code, old_first_line = \ extract_first_line(infile.read()) except IOError: self._write_func_code(func_code_file, func_code, first_line) return False if old_func_code == func_code: return True # We have differing code, is this because we are referring to # different functions, or because the function we are referring to has # changed? _, func_name = get_func_name(self.func, resolv_alias=False, win_characters=False) if old_first_line == first_line == -1 or func_name == '': if not first_line == -1: func_description = '%s (%s:%i)' % (func_name, source_file, first_line) else: func_description = func_name warnings.warn(JobLibCollisionWarning( "Cannot detect name collisions for function '%s'" % func_description), stacklevel=stacklevel) # Fetch the code at the old location and compare it. If it is the # same than the code store, we have a collision: the code in the # file has not changed, but the name we have is pointing to a new # code block. if not old_first_line == first_line and source_file is not None: possible_collision = False if os.path.exists(source_file): _, func_name = get_func_name(self.func, resolv_alias=False) num_lines = len(func_code.split('\n')) with open_py_source(source_file) as f: on_disk_func_code = f.readlines()[ old_first_line - 1:old_first_line - 1 + num_lines - 1] on_disk_func_code = ''.join(on_disk_func_code) possible_collision = (on_disk_func_code.rstrip() == old_func_code.rstrip()) else: possible_collision = source_file.startswith(' 10: _, func_name = get_func_name(self.func, resolv_alias=False) self.warn("Function %s (stored in %s) has changed." % (func_name, func_dir)) self.clear(warn=True) return False def clear(self, warn=True): """ Empty the function's cache. """ func_dir = self._get_func_dir(mkdir=False) if self._verbose > 0 and warn: self.warn("Clearing cache %s" % func_dir) if os.path.exists(func_dir): shutil.rmtree(func_dir, ignore_errors=True) mkdirp(func_dir) func_code, _, first_line = get_func_code(self.func) func_code_file = os.path.join(func_dir, 'func_code.py') self._write_func_code(func_code_file, func_code, first_line) def call(self, *args, **kwargs): """ Force the execution of the function with the given arguments and persist the output values. """ start_time = time.time() output_dir, _ = self._get_output_dir(*args, **kwargs) if self._verbose > 0: print(format_call(self.func, args, kwargs)) output = self.func(*args, **kwargs) self._persist_output(output, output_dir) duration = time.time() - start_time metadata = self._persist_input(output_dir, duration, args, kwargs) if self._verbose > 0: _, name = get_func_name(self.func) msg = '%s - %s' % (name, format_time(duration)) print(max(0, (80 - len(msg))) * '_' + msg) return output, metadata # Make public def _persist_output(self, output, dir): """ Persist the given output tuple in the directory. """ try: filename = os.path.join(dir, 'output.pkl') mkdirp(dir) write_func = functools.partial(numpy_pickle.dump, compress=self.compress) concurrency_safe_write(output, filename, write_func) if self._verbose > 10: print('Persisting in %s' % dir) except OSError: " Race condition in the creation of the directory " def _persist_input(self, output_dir, duration, args, kwargs, this_duration_limit=0.5): """ Save a small summary of the call using json format in the output directory. output_dir: string directory where to write metadata. duration: float time taken by hashing input arguments, calling the wrapped function and persisting its output. args, kwargs: list and dict input arguments for wrapped function this_duration_limit: float Max execution time for this function before issuing a warning. """ start_time = time.time() argument_dict = filter_args(self.func, self.ignore, args, kwargs) input_repr = dict((k, repr(v)) for k, v in argument_dict.items()) # This can fail due to race-conditions with multiple # concurrent joblibs removing the file or the directory metadata = {"duration": duration, "input_args": input_repr} try: mkdirp(output_dir) filename = os.path.join(output_dir, 'metadata.json') def write_func(output, dest_filename): with open(dest_filename, 'w') as f: json.dump(output, f) concurrency_safe_write(metadata, filename, write_func) except Exception: pass this_duration = time.time() - start_time if this_duration > this_duration_limit: # This persistence should be fast. It will not be if repr() takes # time and its output is large, because json.dump will have to # write a large file. This should not be an issue with numpy arrays # for which repr() always output a short representation, but can # be with complex dictionaries. Fixing the problem should be a # matter of replacing repr() above by something smarter. warnings.warn("Persisting input arguments took %.2fs to run.\n" "If this happens often in your code, it can cause " "performance problems \n" "(results will be correct in all cases). \n" "The reason for this is probably some large input " "arguments for a wrapped\n" " function (e.g. large strings).\n" "THIS IS A JOBLIB ISSUE. If you can, kindly provide " "the joblib's team with an\n" " example so that they can fix the problem." % this_duration, stacklevel=5) return metadata # XXX: Need a method to check if results are available. #------------------------------------------------------------------------- # Private `object` interface #------------------------------------------------------------------------- def __repr__(self): return '%s(func=%s, cachedir=%s)' % ( self.__class__.__name__, self.func, repr(self.cachedir), ) ############################################################################### # class `Memory` ############################################################################### class Memory(Logger): """ A context object for caching a function's return value each time it is called with the same input arguments. All values are cached on the filesystem, in a deep directory structure. see :ref:`memory_reference` """ #------------------------------------------------------------------------- # Public interface #------------------------------------------------------------------------- def __init__(self, cachedir, mmap_mode=None, compress=False, verbose=1, bytes_limit=None): """ Parameters ---------- cachedir: string or None The path of the base directory to use as a data store or None. If None is given, no caching is done and the Memory object is completely transparent. mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional The memmapping mode used when loading from cache numpy arrays. See numpy.load for the meaning of the arguments. compress: boolean, or integer Whether to zip the stored data on disk. If an integer is given, it should be between 1 and 9, and sets the amount of compression. Note that compressed arrays cannot be read by memmapping. verbose: int, optional Verbosity flag, controls the debug messages that are issued as functions are evaluated. bytes_limit: int, optional Limit in bytes of the size of the cache """ # XXX: Bad explanation of the None value of cachedir Logger.__init__(self) self._verbose = verbose self.mmap_mode = mmap_mode self.timestamp = time.time() self.compress = compress self.bytes_limit = bytes_limit if compress and mmap_mode is not None: warnings.warn('Compressed results cannot be memmapped', stacklevel=2) if cachedir is None: self.cachedir = None else: self.cachedir = os.path.join(cachedir, 'joblib') mkdirp(self.cachedir) def cache(self, func=None, ignore=None, verbose=None, mmap_mode=False): """ Decorates the given function func to only compute its return value for input arguments not cached on disk. Parameters ---------- func: callable, optional The function to be decorated ignore: list of strings A list of arguments name to ignore in the hashing verbose: integer, optional The verbosity mode of the function. By default that of the memory object is used. mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional The memmapping mode used when loading from cache numpy arrays. See numpy.load for the meaning of the arguments. By default that of the memory object is used. Returns ------- decorated_func: MemorizedFunc object The returned object is a MemorizedFunc object, that is callable (behaves like a function), but offers extra methods for cache lookup and management. See the documentation for :class:`joblib.memory.MemorizedFunc`. """ if func is None: # Partial application, to be able to specify extra keyword # arguments in decorators return functools.partial(self.cache, ignore=ignore, verbose=verbose, mmap_mode=mmap_mode) if self.cachedir is None: return NotMemorizedFunc(func) if verbose is None: verbose = self._verbose if mmap_mode is False: mmap_mode = self.mmap_mode if isinstance(func, MemorizedFunc): func = func.func return MemorizedFunc(func, cachedir=self.cachedir, mmap_mode=mmap_mode, ignore=ignore, compress=self.compress, verbose=verbose, timestamp=self.timestamp) def clear(self, warn=True): """ Erase the complete cache directory. """ if warn: self.warn('Flushing completely the cache') if self.cachedir is not None: rm_subdirs(self.cachedir) def reduce_size(self): """Remove cache folders to make cache size fit in ``bytes_limit``.""" if self.cachedir is not None and self.bytes_limit is not None: cache_items_to_delete = _get_cache_items_to_delete( self.cachedir, self.bytes_limit) for cache_item in cache_items_to_delete: if self._verbose > 10: print('Deleting cache item {}'.format(cache_item)) try: shutil.rmtree(cache_item.path, ignore_errors=True) except OSError: # Even with ignore_errors=True can shutil.rmtree # can raise OSErrror with [Errno 116] Stale file # handle if another process has deleted the folder # already. pass def eval(self, func, *args, **kwargs): """ Eval function func with arguments `*args` and `**kwargs`, in the context of the memory. This method works similarly to the builtin `apply`, except that the function is called only if the cache is not up to date. """ if self.cachedir is None: return func(*args, **kwargs) return self.cache(func)(*args, **kwargs) #------------------------------------------------------------------------- # Private `object` interface #------------------------------------------------------------------------- def __repr__(self): return '%s(cachedir=%s)' % ( self.__class__.__name__, repr(self.cachedir), ) def __reduce__(self): """ We don't store the timestamp when pickling, to avoid the hash depending from it. In addition, when unpickling, we run the __init__ """ # We need to remove 'joblib' from the end of cachedir cachedir = self.cachedir[:-7] if self.cachedir is not None else None return (self.__class__, (cachedir, self.mmap_mode, self.compress, self._verbose)) joblib-0.11/joblib/my_exceptions.py000066400000000000000000000074031305577265600174210ustar00rootroot00000000000000""" Exceptions """ # Author: Gael Varoquaux < gael dot varoquaux at normalesup dot org > # Copyright: 2010, Gael Varoquaux # License: BSD 3 clause import sys from ._compat import PY3_OR_LATER class JoblibException(Exception): """A simple exception with an error message that you can get to.""" def __init__(self, *args): # We need to implement __init__ so that it is picked in the # multiple heritance hierarchy in the class created in # _mk_exception. Note: in Python 2, if you implement __init__ # in your exception class you need to set .args correctly, # otherwise you can dump an exception instance with pickle but # not load it (at load time an empty .args will be passed to # the constructor). Also we want to be explicit and not use # 'super' here. Using 'super' can cause a sibling class method # to be called and we have no control the sibling class method # constructor signature in the exception returned by # _mk_exception. Exception.__init__(self, *args) def __repr__(self): if hasattr(self, 'args') and len(self.args) > 0: message = self.args[0] else: message = '' name = self.__class__.__name__ return '%s\n%s\n%s\n%s' % (name, 75 * '_', message, 75 * '_') __str__ = __repr__ class TransportableException(JoblibException): """An exception containing all the info to wrap an original exception and recreate it. """ def __init__(self, message, etype): # The next line set the .args correctly. This is needed to # make the exception loadable with pickle JoblibException.__init__(self, message, etype) self.message = message self.etype = etype class WorkerInterrupt(Exception): """ An exception that is not KeyboardInterrupt to allow subprocesses to be interrupted. """ pass _exception_mapping = dict() def _mk_exception(exception, name=None): # Create an exception inheriting from both JoblibException # and that exception if name is None: name = exception.__name__ this_name = 'Joblib%s' % name if this_name in _exception_mapping: # Avoid creating twice the same exception this_exception = _exception_mapping[this_name] else: if exception is Exception: # JoblibException is already a subclass of Exception. No # need to use multiple inheritance return JoblibException, this_name try: this_exception = type( this_name, (JoblibException, exception), {}) _exception_mapping[this_name] = this_exception except TypeError: # This happens if "Cannot create a consistent method # resolution order", e.g. because 'exception' is a # subclass of JoblibException or 'exception' is not an # acceptable base class this_exception = JoblibException return this_exception, this_name def _mk_common_exceptions(): namespace = dict() if PY3_OR_LATER: import builtins as _builtin_exceptions common_exceptions = filter( lambda x: x.endswith('Error'), dir(_builtin_exceptions)) else: import exceptions as _builtin_exceptions common_exceptions = dir(_builtin_exceptions) for name in common_exceptions: obj = getattr(_builtin_exceptions, name) if isinstance(obj, type) and issubclass(obj, BaseException): this_obj, this_name = _mk_exception(obj, name=name) namespace[this_name] = this_obj return namespace # Updating module locals so that the exceptions pickle right. AFAIK this # works only at module-creation time locals().update(_mk_common_exceptions()) joblib-0.11/joblib/numpy_pickle.py000066400000000000000000000553051305577265600172360ustar00rootroot00000000000000"""Utilities for fast persistence of big data, with optional compression.""" # Author: Gael Varoquaux # Copyright (c) 2009 Gael Varoquaux # License: BSD Style, 3 clauses. import pickle import os import sys import warnings try: from pathlib import Path except ImportError: Path = None from .numpy_pickle_utils import _COMPRESSORS from .numpy_pickle_utils import BinaryZlibFile from .numpy_pickle_utils import Unpickler, Pickler from .numpy_pickle_utils import _read_fileobject, _write_fileobject from .numpy_pickle_utils import _read_bytes, BUFFER_SIZE from .numpy_pickle_compat import load_compatibility from .numpy_pickle_compat import NDArrayWrapper # For compatibility with old versions of joblib, we need ZNDArrayWrapper # to be visible in the current namespace. # Explicitly skipping next line from flake8 as it triggers an F401 warning # which we don't care. from .numpy_pickle_compat import ZNDArrayWrapper # noqa from ._compat import _basestring, PY3_OR_LATER from .backports import make_memmap ############################################################################### # Utility objects for persistence. class NumpyArrayWrapper(object): """An object to be persisted instead of numpy arrays. This object is used to hack into the pickle machinery and read numpy array data from our custom persistence format. More precisely, this object is used for: * carrying the information of the persisted array: subclass, shape, order, dtype. Those ndarray metadata are used to correctly reconstruct the array with low level numpy functions. * determining if memmap is allowed on the array. * reading the array bytes from a file. * reading the array using memorymap from a file. * writing the array bytes to a file. Attributes ---------- subclass: numpy.ndarray subclass Determine the subclass of the wrapped array. shape: numpy.ndarray shape Determine the shape of the wrapped array. order: {'C', 'F'} Determine the order of wrapped array data. 'C' is for C order, 'F' is for fortran order. dtype: numpy.ndarray dtype Determine the data type of the wrapped array. allow_mmap: bool Determine if memory mapping is allowed on the wrapped array. Default: False. """ def __init__(self, subclass, shape, order, dtype, allow_mmap=False): """Constructor. Store the useful information for later.""" self.subclass = subclass self.shape = shape self.order = order self.dtype = dtype self.allow_mmap = allow_mmap def write_array(self, array, pickler): """Write array bytes to pickler file handle. This function is an adaptation of the numpy write_array function available in version 1.10.1 in numpy/lib/format.py. """ # Set buffer size to 16 MiB to hide the Python loop overhead. buffersize = max(16 * 1024 ** 2 // array.itemsize, 1) if array.dtype.hasobject: # We contain Python objects so we cannot write out the data # directly. Instead, we will pickle it out with version 2 of the # pickle protocol. pickle.dump(array, pickler.file_handle, protocol=2) else: for chunk in pickler.np.nditer(array, flags=['external_loop', 'buffered', 'zerosize_ok'], buffersize=buffersize, order=self.order): pickler.file_handle.write(chunk.tostring('C')) def read_array(self, unpickler): """Read array from unpickler file handle. This function is an adaptation of the numpy read_array function available in version 1.10.1 in numpy/lib/format.py. """ if len(self.shape) == 0: count = 1 else: count = unpickler.np.multiply.reduce(self.shape) # Now read the actual data. if self.dtype.hasobject: # The array contained Python objects. We need to unpickle the data. array = pickle.load(unpickler.file_handle) else: if (not PY3_OR_LATER and unpickler.np.compat.isfileobj(unpickler.file_handle)): # In python 2, gzip.GzipFile is considered as a file so one # can use numpy.fromfile(). # For file objects, use np.fromfile function. # This function is faster than the memory-intensive # method below. array = unpickler.np.fromfile(unpickler.file_handle, dtype=self.dtype, count=count) else: # This is not a real file. We have to read it the # memory-intensive way. # crc32 module fails on reads greater than 2 ** 32 bytes, # breaking large reads from gzip streams. Chunk reads to # BUFFER_SIZE bytes to avoid issue and reduce memory overhead # of the read. In non-chunked case count < max_read_count, so # only one read is performed. max_read_count = BUFFER_SIZE // min(BUFFER_SIZE, self.dtype.itemsize) array = unpickler.np.empty(count, dtype=self.dtype) for i in range(0, count, max_read_count): read_count = min(max_read_count, count - i) read_size = int(read_count * self.dtype.itemsize) data = _read_bytes(unpickler.file_handle, read_size, "array data") array[i:i + read_count] = \ unpickler.np.frombuffer(data, dtype=self.dtype, count=read_count) del data if self.order == 'F': array.shape = self.shape[::-1] array = array.transpose() else: array.shape = self.shape return array def read_mmap(self, unpickler): """Read an array using numpy memmap.""" offset = unpickler.file_handle.tell() if unpickler.mmap_mode == 'w+': unpickler.mmap_mode = 'r+' marray = make_memmap(unpickler.filename, dtype=self.dtype, shape=self.shape, order=self.order, mode=unpickler.mmap_mode, offset=offset) # update the offset so that it corresponds to the end of the read array unpickler.file_handle.seek(offset + marray.nbytes) return marray def read(self, unpickler): """Read the array corresponding to this wrapper. Use the unpickler to get all information to correctly read the array. Parameters ---------- unpickler: NumpyUnpickler Returns ------- array: numpy.ndarray """ # When requested, only use memmap mode if allowed. if unpickler.mmap_mode is not None and self.allow_mmap: array = self.read_mmap(unpickler) else: array = self.read_array(unpickler) # Manage array subclass case if (hasattr(array, '__array_prepare__') and self.subclass not in (unpickler.np.ndarray, unpickler.np.memmap)): # We need to reconstruct another subclass new_array = unpickler.np.core.multiarray._reconstruct( self.subclass, (0,), 'b') return new_array.__array_prepare__(array) else: return array ############################################################################### # Pickler classes class NumpyPickler(Pickler): """A pickler to persist big data efficiently. The main features of this object are: * persistence of numpy arrays in a single file. * optional compression with a special care on avoiding memory copies. Attributes ---------- fp: file File object handle used for serializing the input object. protocol: int Pickle protocol used. Default is pickle.DEFAULT_PROTOCOL under python 3, pickle.HIGHEST_PROTOCOL otherwise. """ dispatch = Pickler.dispatch.copy() def __init__(self, fp, protocol=None): self.file_handle = fp self.buffered = isinstance(self.file_handle, BinaryZlibFile) # By default we want a pickle protocol that only changes with # the major python version and not the minor one if protocol is None: protocol = (pickle.DEFAULT_PROTOCOL if PY3_OR_LATER else pickle.HIGHEST_PROTOCOL) Pickler.__init__(self, self.file_handle, protocol=protocol) # delayed import of numpy, to avoid tight coupling try: import numpy as np except ImportError: np = None self.np = np def _create_array_wrapper(self, array): """Create and returns a numpy array wrapper from a numpy array.""" order = 'F' if (array.flags.f_contiguous and not array.flags.c_contiguous) else 'C' allow_mmap = not self.buffered and not array.dtype.hasobject wrapper = NumpyArrayWrapper(type(array), array.shape, order, array.dtype, allow_mmap=allow_mmap) return wrapper def save(self, obj): """Subclass the Pickler `save` method. This is a total abuse of the Pickler class in order to use the numpy persistence function `save` instead of the default pickle implementation. The numpy array is replaced by a custom wrapper in the pickle persistence stack and the serialized array is written right after in the file. Warning: the file produced does not follow the pickle format. As such it can not be read with `pickle.load`. """ if self.np is not None and type(obj) in (self.np.ndarray, self.np.matrix, self.np.memmap): if type(obj) is self.np.memmap: # Pickling doesn't work with memmapped arrays obj = self.np.asanyarray(obj) # The array wrapper is pickled instead of the real array. wrapper = self._create_array_wrapper(obj) Pickler.save(self, wrapper) # A framer was introduced with pickle protocol 4 and we want to # ensure the wrapper object is written before the numpy array # buffer in the pickle file. # See https://www.python.org/dev/peps/pep-3154/#framing to get # more information on the framer behavior. if self.proto >= 4: self.framer.commit_frame(force=True) # And then array bytes are written right after the wrapper. wrapper.write_array(obj, self) return return Pickler.save(self, obj) class NumpyUnpickler(Unpickler): """A subclass of the Unpickler to unpickle our numpy pickles. Attributes ---------- mmap_mode: str The memorymap mode to use for reading numpy arrays. file_handle: file_like File object to unpickle from. filename: str Name of the file to unpickle from. It should correspond to file_handle. This parameter is required when using mmap_mode. np: module Reference to numpy module if numpy is installed else None. """ dispatch = Unpickler.dispatch.copy() def __init__(self, filename, file_handle, mmap_mode=None): # The next line is for backward compatibility with pickle generated # with joblib versions less than 0.10. self._dirname = os.path.dirname(filename) self.mmap_mode = mmap_mode self.file_handle = file_handle # filename is required for numpy mmap mode. self.filename = filename self.compat_mode = False Unpickler.__init__(self, self.file_handle) try: import numpy as np except ImportError: np = None self.np = np def load_build(self): """Called to set the state of a newly created object. We capture it to replace our place-holder objects, NDArrayWrapper or NumpyArrayWrapper, by the array we are interested in. We replace them directly in the stack of pickler. NDArrayWrapper is used for backward compatibility with joblib <= 0.9. """ Unpickler.load_build(self) # For backward compatibility, we support NDArrayWrapper objects. if isinstance(self.stack[-1], (NDArrayWrapper, NumpyArrayWrapper)): if self.np is None: raise ImportError("Trying to unpickle an ndarray, " "but numpy didn't import correctly") array_wrapper = self.stack.pop() # If any NDArrayWrapper is found, we switch to compatibility mode, # this will be used to raise a DeprecationWarning to the user at # the end of the unpickling. if isinstance(array_wrapper, NDArrayWrapper): self.compat_mode = True self.stack.append(array_wrapper.read(self)) # Be careful to register our new method. if PY3_OR_LATER: dispatch[pickle.BUILD[0]] = load_build else: dispatch[pickle.BUILD] = load_build ############################################################################### # Utility functions def dump(value, filename, compress=0, protocol=None, cache_size=None): """Persist an arbitrary Python object into one file. Parameters ----------- value: any Python object The object to store to disk. filename: str or pathlib.Path The path of the file in which it is to be stored. The compression method corresponding to one of the supported filename extensions ('.z', '.gz', '.bz2', '.xz' or '.lzma') will be used automatically. compress: int from 0 to 9 or bool or 2-tuple, optional Optional compression level for the data. 0 or False is no compression. Higher value means more compression, but also slower read and write times. Using a value of 3 is often a good compromise. See the notes for more details. If compress is True, the compression level used is 3. If compress is a 2-tuple, the first element must correspond to a string between supported compressors (e.g 'zlib', 'gzip', 'bz2', 'lzma' 'xz'), the second element must be an integer from 0 to 9, corresponding to the compression level. protocol: positive int Pickle protocol, see pickle.dump documentation for more details. cache_size: positive int, optional This option is deprecated in 0.10 and has no effect. Returns ------- filenames: list of strings The list of file names in which the data is stored. If compress is false, each array is stored in a different file. See Also -------- joblib.load : corresponding loader Notes ----- Memmapping on load cannot be used for compressed files. Thus using compression can significantly slow down loading. In addition, compressed files take extra extra memory during dump and load. """ if Path is not None and isinstance(filename, Path): filename = str(filename) is_filename = isinstance(filename, _basestring) is_fileobj = hasattr(filename, "write") compress_method = 'zlib' # zlib is the default compression method. if compress is True: # By default, if compress is enabled, we want to be using 3 by default compress_level = 3 elif isinstance(compress, tuple): # a 2-tuple was set in compress if len(compress) != 2: raise ValueError( 'Compress argument tuple should contain exactly 2 elements: ' '(compress method, compress level), you passed {}' .format(compress)) compress_method, compress_level = compress else: compress_level = compress if compress_level is not False and compress_level not in range(10): # Raising an error if a non valid compress level is given. raise ValueError( 'Non valid compress level given: "{}". Possible values are ' '{}.'.format(compress_level, list(range(10)))) if compress_method not in _COMPRESSORS: # Raising an error if an unsupported compression method is given. raise ValueError( 'Non valid compression method given: "{}". Possible values are ' '{}.'.format(compress_method, _COMPRESSORS)) if not is_filename and not is_fileobj: # People keep inverting arguments, and the resulting error is # incomprehensible raise ValueError( 'Second argument should be a filename or a file-like object, ' '%s (type %s) was given.' % (filename, type(filename)) ) if is_filename and not isinstance(compress, tuple): # In case no explicit compression was requested using both compression # method and level in a tuple and the filename has an explicit # extension, we select the corresponding compressor. if filename.endswith('.z'): compress_method = 'zlib' elif filename.endswith('.gz'): compress_method = 'gzip' elif filename.endswith('.bz2'): compress_method = 'bz2' elif filename.endswith('.lzma'): compress_method = 'lzma' elif filename.endswith('.xz'): compress_method = 'xz' else: # no matching compression method found, we unset the variable to # be sure no compression level is set afterwards. compress_method = None if compress_method in _COMPRESSORS and compress_level == 0: # we choose a default compress_level of 3 in case it was not given # as an argument (using compress). compress_level = 3 if not PY3_OR_LATER and compress_method in ('lzma', 'xz'): raise NotImplementedError("{} compression is only available for " "python version >= 3.3. You are using " "{}.{}".format(compress_method, sys.version_info[0], sys.version_info[1])) if cache_size is not None: # Cache size is deprecated starting from version 0.10 warnings.warn("Please do not set 'cache_size' in joblib.dump, " "this parameter has no effect and will be removed. " "You used 'cache_size={}'".format(cache_size), DeprecationWarning, stacklevel=2) if compress_level != 0: with _write_fileobject(filename, compress=(compress_method, compress_level)) as f: NumpyPickler(f, protocol=protocol).dump(value) elif is_filename: with open(filename, 'wb') as f: NumpyPickler(f, protocol=protocol).dump(value) else: NumpyPickler(filename, protocol=protocol).dump(value) # If the target container is a file object, nothing is returned. if is_fileobj: return # For compatibility, the list of created filenames (e.g with one element # after 0.10.0) is returned by default. return [filename] def _unpickle(fobj, filename="", mmap_mode=None): """Internal unpickling function.""" # We are careful to open the file handle early and keep it open to # avoid race-conditions on renames. # That said, if data is stored in companion files, which can be # the case with the old persistence format, moving the directory # will create a race when joblib tries to access the companion # files. unpickler = NumpyUnpickler(filename, fobj, mmap_mode=mmap_mode) obj = None try: obj = unpickler.load() if unpickler.compat_mode: warnings.warn("The file '%s' has been generated with a " "joblib version less than 0.10. " "Please regenerate this pickle file." % filename, DeprecationWarning, stacklevel=3) except UnicodeDecodeError as exc: # More user-friendly error message if PY3_OR_LATER: new_exc = ValueError( 'You may be trying to read with ' 'python 3 a joblib pickle generated with python 2. ' 'This feature is not supported by joblib.') new_exc.__cause__ = exc raise new_exc # Reraise exception with Python 2 raise return obj def load(filename, mmap_mode=None): """Reconstruct a Python object from a file persisted with joblib.dump. Parameters ----------- filename: str or pathlib.Path The path of the file from which to load the object mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional If not None, the arrays are memory-mapped from the disk. This mode has no effect for compressed files. Note that in this case the reconstructed object might not longer match exactly the originally pickled object. Returns ------- result: any Python object The object stored in the file. See Also -------- joblib.dump : function to save an object Notes ----- This function can load numpy array files saved separately during the dump. If the mmap_mode argument is given, it is passed to np.load and arrays are loaded as memmaps. As a consequence, the reconstructed object might not match the original pickled object. Note that if the file was saved with compression, the arrays cannot be memmaped. """ if Path is not None and isinstance(filename, Path): filename = str(filename) if hasattr(filename, "read"): fobj = filename filename = getattr(fobj, 'name', '') with _read_fileobject(fobj, filename, mmap_mode) as fobj: obj = _unpickle(fobj) else: with open(filename, 'rb') as f: with _read_fileobject(f, filename, mmap_mode) as fobj: if isinstance(fobj, _basestring): # if the returned file object is a string, this means we # try to load a pickle file generated with an version of # Joblib so we load it with joblib compatibility function. return load_compatibility(fobj) obj = _unpickle(fobj, filename, mmap_mode) return obj joblib-0.11/joblib/numpy_pickle_compat.py000066400000000000000000000203671305577265600206010ustar00rootroot00000000000000"""Numpy pickle compatibility functions.""" import pickle import os import zlib from io import BytesIO from ._compat import PY3_OR_LATER from .numpy_pickle_utils import _ZFILE_PREFIX from .numpy_pickle_utils import Unpickler def hex_str(an_int): """Convert an int to an hexadecimal string.""" return '{:#x}'.format(an_int) if PY3_OR_LATER: def asbytes(s): if isinstance(s, bytes): return s return s.encode('latin1') else: asbytes = str _MAX_LEN = len(hex_str(2 ** 64)) _CHUNK_SIZE = 64 * 1024 def read_zfile(file_handle): """Read the z-file and return the content as a string. Z-files are raw data compressed with zlib used internally by joblib for persistence. Backward compatibility is not guaranteed. Do not use for external purposes. """ file_handle.seek(0) header_length = len(_ZFILE_PREFIX) + _MAX_LEN length = file_handle.read(header_length) length = length[len(_ZFILE_PREFIX):] length = int(length, 16) # With python2 and joblib version <= 0.8.4 compressed pickle header is one # character wider so we need to ignore an additional space if present. # Note: the first byte of the zlib data is guaranteed not to be a # space according to # https://tools.ietf.org/html/rfc6713#section-2.1 next_byte = file_handle.read(1) if next_byte != b' ': # The zlib compressed data has started and we need to go back # one byte file_handle.seek(header_length) # We use the known length of the data to tell Zlib the size of the # buffer to allocate. data = zlib.decompress(file_handle.read(), 15, length) assert len(data) == length, ( "Incorrect data length while decompressing %s." "The file could be corrupted." % file_handle) return data def write_zfile(file_handle, data, compress=1): """Write the data in the given file as a Z-file. Z-files are raw data compressed with zlib used internally by joblib for persistence. Backward compatibility is not guarantied. Do not use for external purposes. """ file_handle.write(_ZFILE_PREFIX) length = hex_str(len(data)) # Store the length of the data file_handle.write(asbytes(length.ljust(_MAX_LEN))) file_handle.write(zlib.compress(asbytes(data), compress)) ############################################################################### # Utility objects for persistence. class NDArrayWrapper(object): """An object to be persisted instead of numpy arrays. The only thing this object does, is to carry the filename in which the array has been persisted, and the array subclass. """ def __init__(self, filename, subclass, allow_mmap=True): """Constructor. Store the useful information for later.""" self.filename = filename self.subclass = subclass self.allow_mmap = allow_mmap def read(self, unpickler): """Reconstruct the array.""" filename = os.path.join(unpickler._dirname, self.filename) # Load the array from the disk # use getattr instead of self.allow_mmap to ensure backward compat # with NDArrayWrapper instances pickled with joblib < 0.9.0 allow_mmap = getattr(self, 'allow_mmap', True) memmap_kwargs = ({} if not allow_mmap else {'mmap_mode': unpickler.mmap_mode}) array = unpickler.np.load(filename, **memmap_kwargs) # Reconstruct subclasses. This does not work with old # versions of numpy if (hasattr(array, '__array_prepare__') and self.subclass not in (unpickler.np.ndarray, unpickler.np.memmap)): # We need to reconstruct another subclass new_array = unpickler.np.core.multiarray._reconstruct( self.subclass, (0,), 'b') return new_array.__array_prepare__(array) else: return array class ZNDArrayWrapper(NDArrayWrapper): """An object to be persisted instead of numpy arrays. This object store the Zfile filename in which the data array has been persisted, and the meta information to retrieve it. The reason that we store the raw buffer data of the array and the meta information, rather than array representation routine (tostring) is that it enables us to use completely the strided model to avoid memory copies (a and a.T store as fast). In addition saving the heavy information separately can avoid creating large temporary buffers when unpickling data with large arrays. """ def __init__(self, filename, init_args, state): """Constructor. Store the useful information for later.""" self.filename = filename self.state = state self.init_args = init_args def read(self, unpickler): """Reconstruct the array from the meta-information and the z-file.""" # Here we a simply reproducing the unpickling mechanism for numpy # arrays filename = os.path.join(unpickler._dirname, self.filename) array = unpickler.np.core.multiarray._reconstruct(*self.init_args) with open(filename, 'rb') as f: data = read_zfile(f) state = self.state + (data,) array.__setstate__(state) return array class ZipNumpyUnpickler(Unpickler): """A subclass of the Unpickler to unpickle our numpy pickles.""" dispatch = Unpickler.dispatch.copy() def __init__(self, filename, file_handle, mmap_mode=None): """Constructor.""" self._filename = os.path.basename(filename) self._dirname = os.path.dirname(filename) self.mmap_mode = mmap_mode self.file_handle = self._open_pickle(file_handle) Unpickler.__init__(self, self.file_handle) try: import numpy as np except ImportError: np = None self.np = np def _open_pickle(self, file_handle): return BytesIO(read_zfile(file_handle)) def load_build(self): """Set the state of a newly created object. We capture it to replace our place-holder objects, NDArrayWrapper, by the array we are interested in. We replace them directly in the stack of pickler. """ Unpickler.load_build(self) if isinstance(self.stack[-1], NDArrayWrapper): if self.np is None: raise ImportError("Trying to unpickle an ndarray, " "but numpy didn't import correctly") nd_array_wrapper = self.stack.pop() array = nd_array_wrapper.read(self) self.stack.append(array) # Be careful to register our new method. if PY3_OR_LATER: dispatch[pickle.BUILD[0]] = load_build else: dispatch[pickle.BUILD] = load_build def load_compatibility(filename): """Reconstruct a Python object from a file persisted with joblib.dump. This function ensures the compatibility with joblib old persistence format (<= 0.9.3). Parameters ----------- filename: string The name of the file from which to load the object Returns ------- result: any Python object The object stored in the file. See Also -------- joblib.dump : function to save an object Notes ----- This function can load numpy array files saved separately during the dump. """ with open(filename, 'rb') as file_handle: # We are careful to open the file handle early and keep it open to # avoid race-conditions on renames. That said, if data is stored in # companion files, moving the directory will create a race when # joblib tries to access the companion files. unpickler = ZipNumpyUnpickler(filename, file_handle=file_handle) try: obj = unpickler.load() except UnicodeDecodeError as exc: # More user-friendly error message if PY3_OR_LATER: new_exc = ValueError( 'You may be trying to read with ' 'python 3 a joblib pickle generated with python 2. ' 'This feature is not supported by joblib.') new_exc.__cause__ = exc raise new_exc finally: if hasattr(unpickler, 'file_handle'): unpickler.file_handle.close() return obj joblib-0.11/joblib/numpy_pickle_utils.py000066400000000000000000000557321305577265600204620ustar00rootroot00000000000000"""Utilities for fast persistence of big data, with optional compression.""" # Author: Gael Varoquaux # Copyright (c) 2009 Gael Varoquaux # License: BSD Style, 3 clauses. import pickle import sys import io import zlib import gzip import warnings import contextlib from contextlib import closing from ._compat import PY3_OR_LATER, PY27, _basestring try: from threading import RLock except ImportError: from dummy_threading import RLock if PY3_OR_LATER: Unpickler = pickle._Unpickler Pickler = pickle._Pickler xrange = range else: Unpickler = pickle.Unpickler Pickler = pickle.Pickler try: import numpy as np except ImportError: np = None try: import lzma except ImportError: lzma = None try: # The python standard library can be built without bz2 so we make bz2 # usage optional. # see https://github.com/scikit-learn/scikit-learn/issues/7526 for more # details. import bz2 except ImportError: bz2 = None # Magic numbers of supported compression file formats. ' _ZFILE_PREFIX = b'ZF' # used with pickle files created before 0.9.3. _ZLIB_PREFIX = b'\x78' _GZIP_PREFIX = b'\x1f\x8b' _BZ2_PREFIX = b'BZ' _XZ_PREFIX = b'\xfd\x37\x7a\x58\x5a' _LZMA_PREFIX = b'\x5d\x00' # Supported compressors _COMPRESSORS = ('zlib', 'bz2', 'lzma', 'xz', 'gzip') _COMPRESSOR_CLASSES = [gzip.GzipFile] if bz2 is not None: _COMPRESSOR_CLASSES.append(bz2.BZ2File) if lzma is not None: _COMPRESSOR_CLASSES.append(lzma.LZMAFile) # The max magic number length of supported compression file types. _MAX_PREFIX_LEN = max(len(prefix) for prefix in (_ZFILE_PREFIX, _GZIP_PREFIX, _BZ2_PREFIX, _XZ_PREFIX, _LZMA_PREFIX)) # Buffer size used in io.BufferedReader and io.BufferedWriter _IO_BUFFER_SIZE = 1024 ** 2 def _is_raw_file(fileobj): """Check if fileobj is a raw file object, e.g created with open.""" if PY3_OR_LATER: fileobj = getattr(fileobj, 'raw', fileobj) return isinstance(fileobj, io.FileIO) else: return isinstance(fileobj, file) # noqa ############################################################################### # Cache file utilities def _detect_compressor(fileobj): """Return the compressor matching fileobj. Parameters ---------- fileobj: file object Returns ------- str in {'zlib', 'gzip', 'bz2', 'lzma', 'xz', 'compat', 'not-compressed'} """ # Read the magic number in the first bytes of the file. if hasattr(fileobj, 'peek'): # Peek allows to read those bytes without moving the cursor in the # file whic. first_bytes = fileobj.peek(_MAX_PREFIX_LEN) else: # Fallback to seek if the fileobject is not peekable. first_bytes = fileobj.read(_MAX_PREFIX_LEN) fileobj.seek(0) if first_bytes.startswith(_ZLIB_PREFIX): return "zlib" elif first_bytes.startswith(_GZIP_PREFIX): return "gzip" elif first_bytes.startswith(_BZ2_PREFIX): return "bz2" elif first_bytes.startswith(_LZMA_PREFIX): return "lzma" elif first_bytes.startswith(_XZ_PREFIX): return "xz" elif first_bytes.startswith(_ZFILE_PREFIX): return "compat" return "not-compressed" def _buffered_read_file(fobj): """Return a buffered version of a read file object.""" if PY27 and bz2 is not None and isinstance(fobj, bz2.BZ2File): # Python 2.7 doesn't work with BZ2File through a buffer: "no # attribute 'readable'" error. return fobj else: return io.BufferedReader(fobj, buffer_size=_IO_BUFFER_SIZE) def _buffered_write_file(fobj): """Return a buffered version of a write file object.""" if PY27 and bz2 is not None and isinstance(fobj, bz2.BZ2File): # Python 2.7 doesn't work with BZ2File through a buffer: no attribute # 'writable'. # BZ2File doesn't implement the file object context manager in python 2 # so we wrap the fileobj using `closing`. return closing(fobj) else: return io.BufferedWriter(fobj, buffer_size=_IO_BUFFER_SIZE) @contextlib.contextmanager def _read_fileobject(fileobj, filename, mmap_mode=None): """Utility function opening the right fileobject from a filename. The magic number is used to choose between the type of file object to open: * regular file object (default) * zlib file object * gzip file object * bz2 file object * lzma file object (for xz and lzma compressor) Parameters ---------- fileobj: file object compressor: str in {'zlib', 'gzip', 'bz2', 'lzma', 'xz', 'compat', 'not-compressed'} filename: str filename path corresponding to the fileobj parameter. mmap_mode: str memory map mode that should be used to open the pickle file. This parameter is useful to verify that the user is not trying to one with compression. Default: None. Returns ------- a file like object """ # Detect if the fileobj contains compressed data. compressor = _detect_compressor(fileobj) if compressor == 'compat': # Compatibility with old pickle mode: simply return the input # filename "as-is" and let the compatibility function be called by the # caller. warnings.warn("The file '%s' has been generated with a joblib " "version less than 0.10. " "Please regenerate this pickle file." % filename, DeprecationWarning, stacklevel=2) yield filename else: # based on the compressor detected in the file, we open the # correct decompressor file object, wrapped in a buffer. if compressor == 'zlib': fileobj = _buffered_read_file(BinaryZlibFile(fileobj, 'rb')) elif compressor == 'gzip': fileobj = _buffered_read_file(BinaryGzipFile(fileobj, 'rb')) elif compressor == 'bz2' and bz2 is not None: if PY3_OR_LATER: fileobj = _buffered_read_file(bz2.BZ2File(fileobj, 'rb')) else: # In python 2, BZ2File doesn't support a fileobj opened in # binary mode. In this case, we pass the filename. fileobj = _buffered_read_file(bz2.BZ2File(fileobj.name, 'rb')) elif (compressor == 'lzma' or compressor == 'xz'): if PY3_OR_LATER and lzma is not None: # We support lzma only in python 3 because in python 2 users # may have installed the pyliblzma package, which also provides # the lzma module, but that unfortunately doesn't fully support # the buffer interface required by joblib. # See https://github.com/joblib/joblib/issues/403 for details. fileobj = _buffered_read_file(lzma.LZMAFile(fileobj, 'rb')) else: raise NotImplementedError("Lzma decompression is not " "supported for this version of " "python ({}.{})" .format(sys.version_info[0], sys.version_info[1])) # Checking if incompatible load parameters with the type of file: # mmap_mode cannot be used with compressed file or in memory buffers # such as io.BytesIO. if mmap_mode is not None: if isinstance(fileobj, io.BytesIO): warnings.warn('In memory persistence is not compatible with ' 'mmap_mode "%(mmap_mode)s" flag passed. ' 'mmap_mode option will be ignored.' % locals(), stacklevel=2) elif compressor != 'not-compressed': warnings.warn('mmap_mode "%(mmap_mode)s" is not compatible ' 'with compressed file %(filename)s. ' '"%(mmap_mode)s" flag will be ignored.' % locals(), stacklevel=2) elif not _is_raw_file(fileobj): warnings.warn('"%(fileobj)r" is not a raw file, mmap_mode ' '"%(mmap_mode)s" flag will be ignored.' % locals(), stacklevel=2) yield fileobj def _write_fileobject(filename, compress=("zlib", 3)): """Return the right compressor file object in write mode.""" compressmethod = compress[0] compresslevel = compress[1] if compressmethod == "gzip": return _buffered_write_file(BinaryGzipFile(filename, 'wb', compresslevel=compresslevel)) elif compressmethod == "bz2" and bz2 is not None: return _buffered_write_file(bz2.BZ2File(filename, 'wb', compresslevel=compresslevel)) elif lzma is not None and compressmethod == "xz": return _buffered_write_file(lzma.LZMAFile(filename, 'wb', check=lzma.CHECK_NONE, preset=compresslevel)) elif lzma is not None and compressmethod == "lzma": return _buffered_write_file(lzma.LZMAFile(filename, 'wb', preset=compresslevel, format=lzma.FORMAT_ALONE)) else: return _buffered_write_file(BinaryZlibFile(filename, 'wb', compresslevel=compresslevel)) ############################################################################### # Joblib zlib compression file object definition _MODE_CLOSED = 0 _MODE_READ = 1 _MODE_READ_EOF = 2 _MODE_WRITE = 3 _BUFFER_SIZE = 8192 class BinaryZlibFile(io.BufferedIOBase): """A file object providing transparent zlib (de)compression. A BinaryZlibFile can act as a wrapper for an existing file object, or refer directly to a named file on disk. Note that BinaryZlibFile provides only a *binary* file interface: data read is returned as bytes, and data to be written should be given as bytes. This object is an adaptation of the BZ2File object and is compatible with versions of python >= 2.7. If filename is a str or bytes object, it gives the name of the file to be opened. Otherwise, it should be a file object, which will be used to read or write the compressed data. mode can be 'rb' for reading (default) or 'wb' for (over)writing If mode is 'wb', compresslevel can be a number between 1 and 9 specifying the level of compression: 1 produces the least compression, and 9 (default) produces the most compression. """ wbits = zlib.MAX_WBITS def __init__(self, filename, mode="rb", compresslevel=9): # This lock must be recursive, so that BufferedIOBase's # readline(), readlines() and writelines() don't deadlock. self._lock = RLock() self._fp = None self._closefp = False self._mode = _MODE_CLOSED self._pos = 0 self._size = -1 if not isinstance(compresslevel, int) or not (1 <= compresslevel <= 9): raise ValueError("'compresslevel' must be an integer " "between 1 and 9. You provided 'compresslevel={}'" .format(compresslevel)) if mode == "rb": mode_code = _MODE_READ self._decompressor = zlib.decompressobj(self.wbits) self._buffer = b"" self._buffer_offset = 0 elif mode == "wb": mode_code = _MODE_WRITE self._compressor = zlib.compressobj(compresslevel, zlib.DEFLATED, self.wbits, zlib.DEF_MEM_LEVEL, 0) else: raise ValueError("Invalid mode: %r" % (mode,)) if isinstance(filename, _basestring): self._fp = io.open(filename, mode) self._closefp = True self._mode = mode_code elif hasattr(filename, "read") or hasattr(filename, "write"): self._fp = filename self._mode = mode_code else: raise TypeError("filename must be a str or bytes object, " "or a file") def close(self): """Flush and close the file. May be called more than once without error. Once the file is closed, any other operation on it will raise a ValueError. """ with self._lock: if self._mode == _MODE_CLOSED: return try: if self._mode in (_MODE_READ, _MODE_READ_EOF): self._decompressor = None elif self._mode == _MODE_WRITE: self._fp.write(self._compressor.flush()) self._compressor = None finally: try: if self._closefp: self._fp.close() finally: self._fp = None self._closefp = False self._mode = _MODE_CLOSED self._buffer = b"" self._buffer_offset = 0 @property def closed(self): """True if this file is closed.""" return self._mode == _MODE_CLOSED def fileno(self): """Return the file descriptor for the underlying file.""" self._check_not_closed() return self._fp.fileno() def seekable(self): """Return whether the file supports seeking.""" return self.readable() and self._fp.seekable() def readable(self): """Return whether the file was opened for reading.""" self._check_not_closed() return self._mode in (_MODE_READ, _MODE_READ_EOF) def writable(self): """Return whether the file was opened for writing.""" self._check_not_closed() return self._mode == _MODE_WRITE # Mode-checking helper functions. def _check_not_closed(self): if self.closed: fname = getattr(self._fp, 'name', None) msg = "I/O operation on closed file" if fname is not None: msg += " {}".format(fname) msg += "." raise ValueError(msg) def _check_can_read(self): if self._mode not in (_MODE_READ, _MODE_READ_EOF): self._check_not_closed() raise io.UnsupportedOperation("File not open for reading") def _check_can_write(self): if self._mode != _MODE_WRITE: self._check_not_closed() raise io.UnsupportedOperation("File not open for writing") def _check_can_seek(self): if self._mode not in (_MODE_READ, _MODE_READ_EOF): self._check_not_closed() raise io.UnsupportedOperation("Seeking is only supported " "on files open for reading") if not self._fp.seekable(): raise io.UnsupportedOperation("The underlying file object " "does not support seeking") # Fill the readahead buffer if it is empty. Returns False on EOF. def _fill_buffer(self): if self._mode == _MODE_READ_EOF: return False # Depending on the input data, our call to the decompressor may not # return any data. In this case, try again after reading another block. while self._buffer_offset == len(self._buffer): try: rawblock = (self._decompressor.unused_data or self._fp.read(_BUFFER_SIZE)) if not rawblock: raise EOFError except EOFError: # End-of-stream marker and end of file. We're good. self._mode = _MODE_READ_EOF self._size = self._pos return False else: self._buffer = self._decompressor.decompress(rawblock) self._buffer_offset = 0 return True # Read data until EOF. # If return_data is false, consume the data without returning it. def _read_all(self, return_data=True): # The loop assumes that _buffer_offset is 0. Ensure that this is true. self._buffer = self._buffer[self._buffer_offset:] self._buffer_offset = 0 blocks = [] while self._fill_buffer(): if return_data: blocks.append(self._buffer) self._pos += len(self._buffer) self._buffer = b"" if return_data: return b"".join(blocks) # Read a block of up to n bytes. # If return_data is false, consume the data without returning it. def _read_block(self, n_bytes, return_data=True): # If we have enough data buffered, return immediately. end = self._buffer_offset + n_bytes if end <= len(self._buffer): data = self._buffer[self._buffer_offset: end] self._buffer_offset = end self._pos += len(data) return data if return_data else None # The loop assumes that _buffer_offset is 0. Ensure that this is true. self._buffer = self._buffer[self._buffer_offset:] self._buffer_offset = 0 blocks = [] while n_bytes > 0 and self._fill_buffer(): if n_bytes < len(self._buffer): data = self._buffer[:n_bytes] self._buffer_offset = n_bytes else: data = self._buffer self._buffer = b"" if return_data: blocks.append(data) self._pos += len(data) n_bytes -= len(data) if return_data: return b"".join(blocks) def read(self, size=-1): """Read up to size uncompressed bytes from the file. If size is negative or omitted, read until EOF is reached. Returns b'' if the file is already at EOF. """ with self._lock: self._check_can_read() if size == 0: return b"" elif size < 0: return self._read_all() else: return self._read_block(size) def readinto(self, b): """Read up to len(b) bytes into b. Returns the number of bytes read (0 for EOF). """ with self._lock: return io.BufferedIOBase.readinto(self, b) def write(self, data): """Write a byte string to the file. Returns the number of uncompressed bytes written, which is always len(data). Note that due to buffering, the file on disk may not reflect the data written until close() is called. """ with self._lock: self._check_can_write() # Convert data type if called by io.BufferedWriter. if isinstance(data, memoryview): data = data.tobytes() compressed = self._compressor.compress(data) self._fp.write(compressed) self._pos += len(data) return len(data) # Rewind the file to the beginning of the data stream. def _rewind(self): self._fp.seek(0, 0) self._mode = _MODE_READ self._pos = 0 self._decompressor = zlib.decompressobj(self.wbits) self._buffer = b"" self._buffer_offset = 0 def seek(self, offset, whence=0): """Change the file position. The new position is specified by offset, relative to the position indicated by whence. Values for whence are: 0: start of stream (default); offset must not be negative 1: current stream position 2: end of stream; offset must not be positive Returns the new file position. Note that seeking is emulated, so depending on the parameters, this operation may be extremely slow. """ with self._lock: self._check_can_seek() # Recalculate offset as an absolute file position. if whence == 0: pass elif whence == 1: offset = self._pos + offset elif whence == 2: # Seeking relative to EOF - we need to know the file's size. if self._size < 0: self._read_all(return_data=False) offset = self._size + offset else: raise ValueError("Invalid value for whence: %s" % (whence,)) # Make it so that offset is the number of bytes to skip forward. if offset < self._pos: self._rewind() else: offset -= self._pos # Read and discard data until we reach the desired position. self._read_block(offset, return_data=False) return self._pos def tell(self): """Return the current file position.""" with self._lock: self._check_not_closed() return self._pos class BinaryGzipFile(BinaryZlibFile): """A file object providing transparent gzip (de)compression. If filename is a str or bytes object, it gives the name of the file to be opened. Otherwise, it should be a file object, which will be used to read or write the compressed data. mode can be 'rb' for reading (default) or 'wb' for (over)writing If mode is 'wb', compresslevel can be a number between 1 and 9 specifying the level of compression: 1 produces the least compression, and 9 (default) produces the most compression. """ wbits = 31 # zlib compressor/decompressor wbits value for gzip format. # Utility functions/variables from numpy required for writing arrays. # We need at least the functions introduced in version 1.9 of numpy. Here, # we use the ones from numpy 1.10.2. BUFFER_SIZE = 2 ** 18 # size of buffer for reading npz files in bytes def _read_bytes(fp, size, error_template="ran out of data"): """Read from file-like object until size bytes are read. Raises ValueError if not EOF is encountered before size bytes are read. Non-blocking objects only supported if they derive from io objects. Required as e.g. ZipExtFile in python 2.6 can return less data than requested. This function was taken from numpy/lib/format.py in version 1.10.2. Parameters ---------- fp: file-like object size: int error_template: str Returns ------- a bytes object The data read in bytes. """ data = bytes() while True: # io files (default in python3) return None or raise on # would-block, python2 file will truncate, probably nothing can be # done about that. note that regular files can't be non-blocking try: r = fp.read(size - len(data)) data += r if len(r) == 0 or len(data) == size: break except io.BlockingIOError: pass if len(data) != size: msg = "EOF: reading %s, expected %d bytes got %d" raise ValueError(msg % (error_template, size, len(data))) else: return data joblib-0.11/joblib/parallel.py000066400000000000000000001004701305577265600163250ustar00rootroot00000000000000""" Helpers for embarrassingly parallel code. """ # Author: Gael Varoquaux < gael dot varoquaux at normalesup dot org > # Copyright: 2010, Gael Varoquaux # License: BSD 3 clause from __future__ import division import os import sys from math import sqrt import functools import time import threading import itertools from numbers import Integral from contextlib import contextmanager import warnings try: import cPickle as pickle except ImportError: import pickle from ._multiprocessing_helpers import mp from .format_stack import format_outer_frames from .logger import Logger, short_format_time from .my_exceptions import TransportableException, _mk_exception from .disk import memstr_to_bytes from ._parallel_backends import (FallbackToBackend, MultiprocessingBackend, ThreadingBackend, SequentialBackend) from ._compat import _basestring # Make sure that those two classes are part of the public joblib.parallel API # so that 3rd party backend implementers can import them from here. from ._parallel_backends import AutoBatchingMixin # noqa from ._parallel_backends import ParallelBackendBase # noqa BACKENDS = { 'multiprocessing': MultiprocessingBackend, 'threading': ThreadingBackend, 'sequential': SequentialBackend, } # name of the backend used by default by Parallel outside of any context # managed by ``parallel_backend``. DEFAULT_BACKEND = 'multiprocessing' DEFAULT_N_JOBS = 1 # Thread local value that can be overriden by the ``parallel_backend`` context # manager _backend = threading.local() def get_active_backend(): """Return the active default backend""" active_backend_and_jobs = getattr(_backend, 'backend_and_jobs', None) if active_backend_and_jobs is not None: return active_backend_and_jobs # We are outside of the scope of any parallel_backend context manager, # create the default backend instance now active_backend = BACKENDS[DEFAULT_BACKEND]() return active_backend, DEFAULT_N_JOBS @contextmanager def parallel_backend(backend, n_jobs=-1, **backend_params): """Change the default backend used by Parallel inside a with block. If ``backend`` is a string it must match a previously registered implementation using the ``register_parallel_backend`` function. Alternatively backend can be passed directly as an instance. By default all available workers will be used (``n_jobs=-1``) unless the caller passes an explicit value for the ``n_jobs`` parameter. This is an alternative to passing a ``backend='backend_name'`` argument to the ``Parallel`` class constructor. It is particularly useful when calling into library code that uses joblib internally but does not expose the backend argument in its own API. >>> from operator import neg >>> with parallel_backend('threading'): ... print(Parallel()(delayed(neg)(i + 1) for i in range(5))) ... [-1, -2, -3, -4, -5] Warning: this function is experimental and subject to change in a future version of joblib. .. versionadded:: 0.10 """ if isinstance(backend, _basestring): backend = BACKENDS[backend](**backend_params) old_backend_and_jobs = getattr(_backend, 'backend_and_jobs', None) try: _backend.backend_and_jobs = (backend, n_jobs) # return the backend instance to make it easier to write tests yield backend, n_jobs finally: if old_backend_and_jobs is None: if getattr(_backend, 'backend_and_jobs', None) is not None: del _backend.backend_and_jobs else: _backend.backend_and_jobs = old_backend_and_jobs # Under Linux or OS X the default start method of multiprocessing # can cause third party libraries to crash. Under Python 3.4+ it is possible # to set an environment variable to switch the default start method from # 'fork' to 'forkserver' or 'spawn' to avoid this issue albeit at the cost # of causing semantic changes and some additional pool instantiation overhead. if hasattr(mp, 'get_context'): method = os.environ.get('JOBLIB_START_METHOD', '').strip() or None DEFAULT_MP_CONTEXT = mp.get_context(method=method) else: DEFAULT_MP_CONTEXT = None class BatchedCalls(object): """Wrap a sequence of (func, args, kwargs) tuples as a single callable""" def __init__(self, iterator_slice): self.items = list(iterator_slice) self._size = len(self.items) def __call__(self): return [func(*args, **kwargs) for func, args, kwargs in self.items] def __len__(self): return self._size ############################################################################### # CPU count that works also when multiprocessing has been disabled via # the JOBLIB_MULTIPROCESSING environment variable def cpu_count(): """Return the number of CPUs.""" if mp is None: return 1 return mp.cpu_count() ############################################################################### # For verbosity def _verbosity_filter(index, verbose): """ Returns False for indices increasingly apart, the distance depending on the value of verbose. We use a lag increasing as the square of index """ if not verbose: return True elif verbose > 10: return False if index == 0: return False verbose = .5 * (11 - verbose) ** 2 scale = sqrt(index / verbose) next_scale = sqrt((index + 1) / verbose) return (int(next_scale) == int(scale)) ############################################################################### def delayed(function, check_pickle=True): """Decorator used to capture the arguments of a function. Pass `check_pickle=False` when: - performing a possibly repeated check is too costly and has been done already once outside of the call to delayed. - when used in conjunction `Parallel(backend='threading')`. """ # Try to pickle the input function, to catch the problems early when # using with multiprocessing: if check_pickle: pickle.dumps(function) def delayed_function(*args, **kwargs): return function, args, kwargs try: delayed_function = functools.wraps(function)(delayed_function) except AttributeError: " functools.wraps fails on some callable objects " return delayed_function ############################################################################### class BatchCompletionCallBack(object): """Callback used by joblib.Parallel's multiprocessing backend. This callable is executed by the parent process whenever a worker process has returned the results of a batch of tasks. It is used for progress reporting, to update estimate of the batch processing duration and to schedule the next batch of tasks to be processed. """ def __init__(self, dispatch_timestamp, batch_size, parallel): self.dispatch_timestamp = dispatch_timestamp self.batch_size = batch_size self.parallel = parallel def __call__(self, out): self.parallel.n_completed_tasks += self.batch_size this_batch_duration = time.time() - self.dispatch_timestamp self.parallel._backend.batch_completed(self.batch_size, this_batch_duration) self.parallel.print_progress() if self.parallel._original_iterator is not None: self.parallel.dispatch_next() ############################################################################### def register_parallel_backend(name, factory, make_default=False): """Register a new Parallel backend factory. The new backend can then be selected by passing its name as the backend argument to the Parallel class. Moreover, the default backend can be overwritten globally by setting make_default=True. The factory can be any callable that takes no argument and return an instance of ``ParallelBackendBase``. Warning: this function is experimental and subject to change in a future version of joblib. .. versionadded:: 0.10 """ BACKENDS[name] = factory if make_default: global DEFAULT_BACKEND DEFAULT_BACKEND = name def effective_n_jobs(n_jobs=-1): """Determine the number of jobs that can actually run in parallel n_jobs is the is the number of workers requested by the callers. Passing n_jobs=-1 means requesting all available workers for instance matching the number of CPU cores on the worker host(s). This method should return a guesstimate of the number of workers that can actually perform work concurrently with the currently enabled default backend. The primary use case is to make it possible for the caller to know in how many chunks to slice the work. In general working on larger data chunks is more efficient (less scheduling overhead and better use of CPU cache prefetching heuristics) as long as all the workers have enough work to do. Warning: this function is experimental and subject to change in a future version of joblib. .. versionadded:: 0.10 """ backend, _ = get_active_backend() return backend.effective_n_jobs(n_jobs=n_jobs) ############################################################################### class Parallel(Logger): ''' Helper class for readable parallel mapping. Parameters ----------- n_jobs: int, default: 1 The maximum number of concurrently running jobs, such as the number of Python worker processes when backend="multiprocessing" or the size of the thread-pool when backend="threading". If -1 all CPUs are used. If 1 is given, no parallel computing code is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. backend: str, ParallelBackendBase instance or None, \ default: 'multiprocessing' Specify the parallelization backend implementation. Supported backends are: - "multiprocessing" used by default, can induce some communication and memory overhead when exchanging input and output data with the worker Python processes. - "threading" is a very low-overhead backend but it suffers from the Python Global Interpreter Lock if the called function relies a lot on Python objects. "threading" is mostly useful when the execution bottleneck is a compiled extension that explicitly releases the GIL (for instance a Cython loop wrapped in a "with nogil" block or an expensive call to a library such as NumPy). - finally, you can register backends by calling register_parallel_backend. This will allow you to implement a backend of your liking. verbose: int, optional The verbosity level: if non zero, progress messages are printed. Above 50, the output is sent to stdout. The frequency of the messages increases with the verbosity level. If it more than 10, all iterations are reported. timeout: float, optional Timeout limit for each task to complete. If any task takes longer a TimeOutError will be raised. Only applied when n_jobs != 1 pre_dispatch: {'all', integer, or expression, as in '3*n_jobs'} The number of batches (of tasks) to be pre-dispatched. Default is '2*n_jobs'. When batch_size="auto" this is reasonable default and the multiprocessing workers should never starve. batch_size: int or 'auto', default: 'auto' The number of atomic tasks to dispatch at once to each worker. When individual evaluations are very fast, multiprocessing can be slower than sequential computation because of the overhead. Batching fast computations together can mitigate this. The ``'auto'`` strategy keeps track of the time it takes for a batch to complete, and dynamically adjusts the batch size to keep the time on the order of half a second, using a heuristic. The initial batch size is 1. ``batch_size="auto"`` with ``backend="threading"`` will dispatch batches of a single task at a time as the threading backend has very little overhead and using larger batch size has not proved to bring any gain in that case. temp_folder: str, optional Folder to be used by the pool for memmaping large arrays for sharing memory with worker processes. If None, this will try in order: - a folder pointed by the JOBLIB_TEMP_FOLDER environment variable, - /dev/shm if the folder exists and is writable: this is a RAMdisk filesystem available by default on modern Linux distributions, - the default system temporary folder that can be overridden with TMP, TMPDIR or TEMP environment variables, typically /tmp under Unix operating systems. Only active when backend="multiprocessing". max_nbytes int, str, or None, optional, 1M by default Threshold on the size of arrays passed to the workers that triggers automated memory mapping in temp_folder. Can be an int in Bytes, or a human-readable string, e.g., '1M' for 1 megabyte. Use None to disable memmaping of large arrays. Only active when backend="multiprocessing". mmap_mode: {None, 'r+', 'r', 'w+', 'c'} Memmapping mode for numpy arrays passed to workers. See 'max_nbytes' parameter documentation for more details. Notes ----- This object uses the multiprocessing module to compute in parallel the application of a function to many different arguments. The main functionality it brings in addition to using the raw multiprocessing API are (see examples for details): * More readable code, in particular since it avoids constructing list of arguments. * Easier debugging: - informative tracebacks even when the error happens on the client side - using 'n_jobs=1' enables to turn off parallel computing for debugging without changing the codepath - early capture of pickling errors * An optional progress meter. * Interruption of multiprocesses jobs with 'Ctrl-C' * Flexible pickling control for the communication to and from the worker processes. * Ability to use shared memory efficiently with worker processes for large numpy-based datastructures. Examples -------- A simple example: >>> from math import sqrt >>> from joblib import Parallel, delayed >>> Parallel(n_jobs=1)(delayed(sqrt)(i**2) for i in range(10)) [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0] Reshaping the output when the function has several return values: >>> from math import modf >>> from joblib import Parallel, delayed >>> r = Parallel(n_jobs=1)(delayed(modf)(i/2.) for i in range(10)) >>> res, i = zip(*r) >>> res (0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5) >>> i (0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0) The progress meter: the higher the value of `verbose`, the more messages: >>> from time import sleep >>> from joblib import Parallel, delayed >>> r = Parallel(n_jobs=2, verbose=5)(delayed(sleep)(.1) for _ in range(10)) #doctest: +SKIP [Parallel(n_jobs=2)]: Done 1 out of 10 | elapsed: 0.1s remaining: 0.9s [Parallel(n_jobs=2)]: Done 3 out of 10 | elapsed: 0.2s remaining: 0.5s [Parallel(n_jobs=2)]: Done 6 out of 10 | elapsed: 0.3s remaining: 0.2s [Parallel(n_jobs=2)]: Done 9 out of 10 | elapsed: 0.5s remaining: 0.1s [Parallel(n_jobs=2)]: Done 10 out of 10 | elapsed: 0.5s finished Traceback example, note how the line of the error is indicated as well as the values of the parameter passed to the function that triggered the exception, even though the traceback happens in the child process: >>> from heapq import nlargest >>> from joblib import Parallel, delayed >>> Parallel(n_jobs=2)(delayed(nlargest)(2, n) for n in (range(4), 'abcde', 3)) #doctest: +SKIP #... --------------------------------------------------------------------------- Sub-process traceback: --------------------------------------------------------------------------- TypeError Mon Nov 12 11:37:46 2012 PID: 12934 Python 2.7.3: /usr/bin/python ........................................................................... /usr/lib/python2.7/heapq.pyc in nlargest(n=2, iterable=3, key=None) 419 if n >= size: 420 return sorted(iterable, key=key, reverse=True)[:n] 421 422 # When key is none, use simpler decoration 423 if key is None: --> 424 it = izip(iterable, count(0,-1)) # decorate 425 result = _nlargest(n, it) 426 return map(itemgetter(0), result) # undecorate 427 428 # General case, slowest method TypeError: izip argument #1 must support iteration ___________________________________________________________________________ Using pre_dispatch in a producer/consumer situation, where the data is generated on the fly. Note how the producer is first called 3 times before the parallel loop is initiated, and then called to generate new data on the fly. In this case the total number of iterations cannot be reported in the progress messages: >>> from math import sqrt >>> from joblib import Parallel, delayed >>> def producer(): ... for i in range(6): ... print('Produced %s' % i) ... yield i >>> out = Parallel(n_jobs=2, verbose=100, pre_dispatch='1.5*n_jobs')( ... delayed(sqrt)(i) for i in producer()) #doctest: +SKIP Produced 0 Produced 1 Produced 2 [Parallel(n_jobs=2)]: Done 1 jobs | elapsed: 0.0s Produced 3 [Parallel(n_jobs=2)]: Done 2 jobs | elapsed: 0.0s Produced 4 [Parallel(n_jobs=2)]: Done 3 jobs | elapsed: 0.0s Produced 5 [Parallel(n_jobs=2)]: Done 4 jobs | elapsed: 0.0s [Parallel(n_jobs=2)]: Done 5 out of 6 | elapsed: 0.0s remaining: 0.0s [Parallel(n_jobs=2)]: Done 6 out of 6 | elapsed: 0.0s finished ''' def __init__(self, n_jobs=1, backend=None, verbose=0, timeout=None, pre_dispatch='2 * n_jobs', batch_size='auto', temp_folder=None, max_nbytes='1M', mmap_mode='r'): active_backend, default_n_jobs = get_active_backend() if backend is None and n_jobs == 1: # If we are under a parallel_backend context manager, look up # the default number of jobs and use that instead: n_jobs = default_n_jobs self.n_jobs = n_jobs self.verbose = verbose self.timeout = timeout self.pre_dispatch = pre_dispatch if isinstance(max_nbytes, _basestring): max_nbytes = memstr_to_bytes(max_nbytes) self._backend_args = dict( max_nbytes=max_nbytes, mmap_mode=mmap_mode, temp_folder=temp_folder, verbose=max(0, self.verbose - 50), ) if DEFAULT_MP_CONTEXT is not None: self._backend_args['context'] = DEFAULT_MP_CONTEXT if backend is None: backend = active_backend elif isinstance(backend, ParallelBackendBase): # Use provided backend as is pass elif hasattr(backend, 'Pool') and hasattr(backend, 'Lock'): # Make it possible to pass a custom multiprocessing context as # backend to change the start method to forkserver or spawn or # preload modules on the forkserver helper process. self._backend_args['context'] = backend backend = MultiprocessingBackend() else: try: backend_factory = BACKENDS[backend] except KeyError: raise ValueError("Invalid backend: %s, expected one of %r" % (backend, sorted(BACKENDS.keys()))) backend = backend_factory() if (batch_size == 'auto' or isinstance(batch_size, Integral) and batch_size > 0): self.batch_size = batch_size else: raise ValueError( "batch_size must be 'auto' or a positive integer, got: %r" % batch_size) self._backend = backend self._output = None self._jobs = list() self._managed_backend = False # This lock is used coordinate the main thread of this process with # the async callback thread of our the pool. self._lock = threading.Lock() def __enter__(self): self._managed_backend = True self._initialize_backend() return self def __exit__(self, exc_type, exc_value, traceback): self._terminate_backend() self._managed_backend = False def _initialize_backend(self): """Build a process or thread pool and return the number of workers""" try: n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self, **self._backend_args) if self.timeout is not None and not self._backend.supports_timeout: warnings.warn( 'The backend class {!r} does not support timeout. ' "You have set 'timeout={}' in Parallel but " "the 'timeout' parameter will not be used.".format( self._backend.__class__.__name__, self.timeout)) except FallbackToBackend as e: # Recursively initialize the backend in case of requested fallback. self._backend = e.backend n_jobs = self._initialize_backend() return n_jobs def _effective_n_jobs(self): if self._backend: return self._backend.effective_n_jobs(self.n_jobs) return 1 def _terminate_backend(self): if self._backend is not None: self._backend.terminate() def _dispatch(self, batch): """Queue the batch for computing, with or without multiprocessing WARNING: this method is not thread-safe: it should be only called indirectly via dispatch_one_batch. """ # If job.get() catches an exception, it closes the queue: if self._aborting: return self.n_dispatched_tasks += len(batch) self.n_dispatched_batches += 1 dispatch_timestamp = time.time() cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self) job = self._backend.apply_async(batch, callback=cb) self._jobs.append(job) def dispatch_next(self): """Dispatch more data for parallel processing This method is meant to be called concurrently by the multiprocessing callback. We rely on the thread-safety of dispatch_one_batch to protect against concurrent consumption of the unprotected iterator. """ if not self.dispatch_one_batch(self._original_iterator): self._iterating = False self._original_iterator = None def dispatch_one_batch(self, iterator): """Prefetch the tasks for the next batch and dispatch them. The effective size of the batch is computed here. If there are no more jobs to dispatch, return False, else return True. The iterator consumption and dispatching is protected by the same lock so calling this function should be thread safe. """ if self.batch_size == 'auto': batch_size = self._backend.compute_batch_size() else: # Fixed batch size strategy batch_size = self.batch_size with self._lock: tasks = BatchedCalls(itertools.islice(iterator, batch_size)) if len(tasks) == 0: # No more tasks available in the iterator: tell caller to stop. return False else: self._dispatch(tasks) return True def _print(self, msg, msg_args): """Display the message on stout or stderr depending on verbosity""" # XXX: Not using the logger framework: need to # learn to use logger better. if not self.verbose: return if self.verbose < 50: writer = sys.stderr.write else: writer = sys.stdout.write msg = msg % msg_args writer('[%s]: %s\n' % (self, msg)) def print_progress(self): """Display the process of the parallel execution only a fraction of time, controlled by self.verbose. """ if not self.verbose: return elapsed_time = time.time() - self._start_time # Original job iterator becomes None once it has been fully # consumed : at this point we know the total number of jobs and we are # able to display an estimation of the remaining time based on already # completed jobs. Otherwise, we simply display the number of completed # tasks. if self._original_iterator is not None: if _verbosity_filter(self.n_dispatched_batches, self.verbose): return self._print('Done %3i tasks | elapsed: %s', (self.n_completed_tasks, short_format_time(elapsed_time), )) else: index = self.n_completed_tasks # We are finished dispatching total_tasks = self.n_dispatched_tasks # We always display the first loop if not index == 0: # Display depending on the number of remaining items # A message as soon as we finish dispatching, cursor is 0 cursor = (total_tasks - index + 1 - self._pre_dispatch_amount) frequency = (total_tasks // self.verbose) + 1 is_last_item = (index + 1 == total_tasks) if (is_last_item or cursor % frequency): return remaining_time = (elapsed_time / index) * \ (self.n_dispatched_tasks - index * 1.0) # only display status if remaining time is greater or equal to 0 self._print('Done %3i out of %3i | elapsed: %s remaining: %s', (index, total_tasks, short_format_time(elapsed_time), short_format_time(remaining_time), )) def retrieve(self): self._output = list() while self._iterating or len(self._jobs) > 0: if len(self._jobs) == 0: # Wait for an async callback to dispatch new jobs time.sleep(0.01) continue # We need to be careful: the job list can be filling up as # we empty it and Python list are not thread-safe by default hence # the use of the lock with self._lock: job = self._jobs.pop(0) try: if getattr(self._backend, 'supports_timeout', False): self._output.extend(job.get(timeout=self.timeout)) else: self._output.extend(job.get()) except BaseException as exception: # Note: we catch any BaseException instead of just Exception # instances to also include KeyboardInterrupt. # Stop dispatching any new job in the async callback thread self._aborting = True # If the backend allows it, cancel or kill remaining running # tasks without waiting for the results as we will raise # the exception we got back to the caller instead of returning # any result. backend = self._backend if (backend is not None and hasattr(backend, 'abort_everything')): # If the backend is managed externally we need to make sure # to leave it in a working state to allow for future jobs # scheduling. ensure_ready = self._managed_backend backend.abort_everything(ensure_ready=ensure_ready) if not isinstance(exception, TransportableException): raise else: # Capture exception to add information on the local # stack in addition to the distant stack this_report = format_outer_frames(context=10, stack_start=1) report = """Multiprocessing exception: %s --------------------------------------------------------------------------- Sub-process traceback: --------------------------------------------------------------------------- %s""" % (this_report, exception.message) # Convert this to a JoblibException exception_type = _mk_exception(exception.etype)[0] exception = exception_type(report) raise exception def __call__(self, iterable): if self._jobs: raise ValueError('This Parallel instance is already running') # A flag used to abort the dispatching of jobs in case an # exception is found self._aborting = False if not self._managed_backend: n_jobs = self._initialize_backend() else: n_jobs = self._effective_n_jobs() iterator = iter(iterable) pre_dispatch = self.pre_dispatch if pre_dispatch == 'all' or n_jobs == 1: # prevent further dispatch via multiprocessing callback thread self._original_iterator = None self._pre_dispatch_amount = 0 else: self._original_iterator = iterator if hasattr(pre_dispatch, 'endswith'): pre_dispatch = eval(pre_dispatch) self._pre_dispatch_amount = pre_dispatch = int(pre_dispatch) # The main thread will consume the first pre_dispatch items and # the remaining items will later be lazily dispatched by async # callbacks upon task completions. iterator = itertools.islice(iterator, pre_dispatch) self._start_time = time.time() self.n_dispatched_batches = 0 self.n_dispatched_tasks = 0 self.n_completed_tasks = 0 try: # Only set self._iterating to True if at least a batch # was dispatched. In particular this covers the edge # case of Parallel used with an exhausted iterator. while self.dispatch_one_batch(iterator): self._iterating = True else: self._iterating = False if pre_dispatch == "all" or n_jobs == 1: # The iterable was consumed all at once by the above for loop. # No need to wait for async callbacks to trigger to # consumption. self._iterating = False self.retrieve() # Make sure that we get a last message telling us we are done elapsed_time = time.time() - self._start_time self._print('Done %3i out of %3i | elapsed: %s finished', (len(self._output), len(self._output), short_format_time(elapsed_time))) finally: if not self._managed_backend: self._terminate_backend() self._jobs = list() output = self._output self._output = None return output def __repr__(self): return '%s(n_jobs=%s)' % (self.__class__.__name__, self.n_jobs) joblib-0.11/joblib/pool.py000066400000000000000000000610731305577265600155070ustar00rootroot00000000000000"""Custom implementation of multiprocessing.Pool with custom pickler. This module provides efficient ways of working with data stored in shared memory with numpy.memmap arrays without inducing any memory copy between the parent and child processes. This module should not be imported if multiprocessing is not available as it implements subclasses of multiprocessing Pool that uses a custom alternative to SimpleQueue. """ # Author: Olivier Grisel # Copyright: 2012, Olivier Grisel # License: BSD 3 clause from mmap import mmap import errno import os import stat import sys import threading import atexit import tempfile import shutil import warnings from time import sleep try: WindowsError except NameError: WindowsError = type(None) from pickle import whichmodule try: # Python 2 compat from cPickle import loads from cPickle import dumps except ImportError: from pickle import loads from pickle import dumps import copyreg # Customizable pure Python pickler in Python 2 # customizable C-optimized pickler under Python 3.3+ from pickle import Pickler from pickle import HIGHEST_PROTOCOL from io import BytesIO from ._multiprocessing_helpers import mp, assert_spawning # We need the class definition to derive from it not the multiprocessing.Pool # factory function from multiprocessing.pool import Pool try: import numpy as np from numpy.lib.stride_tricks import as_strided except ImportError: np = None from .numpy_pickle import load from .numpy_pickle import dump from .hashing import hash from .backports import make_memmap # Some system have a ramdisk mounted by default, we can use it instead of /tmp # as the default folder to dump big arrays to share with subprocesses SYSTEM_SHARED_MEM_FS = '/dev/shm' # Folder and file permissions to chmod temporary files generated by the # memmaping pool. Only the owner of the Python process can access the # temporary files and folder. FOLDER_PERMISSIONS = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR FILE_PERMISSIONS = stat.S_IRUSR | stat.S_IWUSR ############################################################################### # Support for efficient transient pickling of numpy data structures def _get_backing_memmap(a): """Recursively look up the original np.memmap instance base if any.""" b = getattr(a, 'base', None) if b is None: # TODO: check scipy sparse datastructure if scipy is installed # a nor its descendants do not have a memmap base return None elif isinstance(b, mmap): # a is already a real memmap instance. return a else: # Recursive exploration of the base ancestry return _get_backing_memmap(b) def has_shareable_memory(a): """Return True if a is backed by some mmap buffer directly or not.""" return _get_backing_memmap(a) is not None def _strided_from_memmap(filename, dtype, mode, offset, order, shape, strides, total_buffer_len): """Reconstruct an array view on a memory mapped file.""" if mode == 'w+': # Do not zero the original data when unpickling mode = 'r+' if strides is None: # Simple, contiguous memmap return make_memmap(filename, dtype=dtype, shape=shape, mode=mode, offset=offset, order=order) else: # For non-contiguous data, memmap the total enclosing buffer and then # extract the non-contiguous view with the stride-tricks API base = make_memmap(filename, dtype=dtype, shape=total_buffer_len, mode=mode, offset=offset, order=order) return as_strided(base, shape=shape, strides=strides) def _reduce_memmap_backed(a, m): """Pickling reduction for memmap backed arrays. a is expected to be an instance of np.ndarray (or np.memmap) m is expected to be an instance of np.memmap on the top of the ``base`` attribute ancestry of a. ``m.base`` should be the real python mmap object. """ # offset that comes from the striding differences between a and m a_start, a_end = np.byte_bounds(a) m_start = np.byte_bounds(m)[0] offset = a_start - m_start # offset from the backing memmap offset += m.offset if m.flags['F_CONTIGUOUS']: order = 'F' else: # The backing memmap buffer is necessarily contiguous hence C if not # Fortran order = 'C' if a.flags['F_CONTIGUOUS'] or a.flags['C_CONTIGUOUS']: # If the array is a contiguous view, no need to pass the strides strides = None total_buffer_len = None else: # Compute the total number of items to map from which the strided # view will be extracted. strides = a.strides total_buffer_len = (a_end - a_start) // a.itemsize return (_strided_from_memmap, (m.filename, a.dtype, m.mode, offset, order, a.shape, strides, total_buffer_len)) def reduce_memmap(a): """Pickle the descriptors of a memmap instance to reopen on same file.""" m = _get_backing_memmap(a) if m is not None: # m is a real mmap backed memmap instance, reduce a preserving striding # information return _reduce_memmap_backed(a, m) else: # This memmap instance is actually backed by a regular in-memory # buffer: this can happen when using binary operators on numpy.memmap # instances return (loads, (dumps(np.asarray(a), protocol=HIGHEST_PROTOCOL),)) class ArrayMemmapReducer(object): """Reducer callable to dump large arrays to memmap files. Parameters ---------- max_nbytes: int Threshold to trigger memmaping of large arrays to files created a folder. temp_folder: str Path of a folder where files for backing memmaped arrays are created. mmap_mode: 'r', 'r+' or 'c' Mode for the created memmap datastructure. See the documentation of numpy.memmap for more details. Note: 'w+' is coerced to 'r+' automatically to avoid zeroing the data on unpickling. verbose: int, optional, 0 by default If verbose > 0, memmap creations are logged. If verbose > 1, both memmap creations, reuse and array pickling are logged. prewarm: bool, optional, False by default. Force a read on newly memmaped array to make sure that OS pre-cache it memory. This can be useful to avoid concurrent disk access when the same data array is passed to different worker processes. """ def __init__(self, max_nbytes, temp_folder, mmap_mode, verbose=0, context_id=None, prewarm=True): self._max_nbytes = max_nbytes self._temp_folder = temp_folder self._mmap_mode = mmap_mode self.verbose = int(verbose) self._prewarm = prewarm if context_id is not None: warnings.warn('context_id is deprecated and ignored in joblib' ' 0.9.4 and will be removed in 0.11', DeprecationWarning) def __call__(self, a): m = _get_backing_memmap(a) if m is not None: # a is already backed by a memmap file, let's reuse it directly return _reduce_memmap_backed(a, m) if (not a.dtype.hasobject and self._max_nbytes is not None and a.nbytes > self._max_nbytes): # check that the folder exists (lazily create the pool temp folder # if required) try: os.makedirs(self._temp_folder) os.chmod(self._temp_folder, FOLDER_PERMISSIONS) except OSError as e: if e.errno != errno.EEXIST: raise e # Find a unique, concurrent safe filename for writing the # content of this array only once. basename = "%d-%d-%s.pkl" % ( os.getpid(), id(threading.current_thread()), hash(a)) filename = os.path.join(self._temp_folder, basename) # In case the same array with the same content is passed several # times to the pool subprocess children, serialize it only once # XXX: implement an explicit reference counting scheme to make it # possible to delete temporary files as soon as the workers are # done processing this data. if not os.path.exists(filename): if self.verbose > 0: print("Memmaping (shape=%r, dtype=%s) to new file %s" % ( a.shape, a.dtype, filename)) for dumped_filename in dump(a, filename): os.chmod(dumped_filename, FILE_PERMISSIONS) if self._prewarm: # Warm up the data to avoid concurrent disk access in # multiple children processes load(filename, mmap_mode=self._mmap_mode).max() elif self.verbose > 1: print("Memmaping (shape=%s, dtype=%s) to old file %s" % ( a.shape, a.dtype, filename)) # The worker process will use joblib.load to memmap the data return (load, (filename, self._mmap_mode)) else: # do not convert a into memmap, let pickler do its usual copy with # the default system pickler if self.verbose > 1: print("Pickling array (shape=%r, dtype=%s)." % ( a.shape, a.dtype)) return (loads, (dumps(a, protocol=HIGHEST_PROTOCOL),)) ############################################################################### # Enable custom pickling in Pool queues class CustomizablePickler(Pickler): """Pickler that accepts custom reducers. HIGHEST_PROTOCOL is selected by default as this pickler is used to pickle ephemeral datastructures for interprocess communication hence no backward compatibility is required. `reducers` is expected to be a dictionary with key/values being `(type, callable)` pairs where `callable` is a function that give an instance of `type` will return a tuple `(constructor, tuple_of_objects)` to rebuild an instance out of the pickled `tuple_of_objects` as would return a `__reduce__` method. See the standard library documentation on pickling for more details. """ # We override the pure Python pickler as its the only way to be able to # customize the dispatch table without side effects in Python 2.7 # to 3.2. For Python 3.3+ leverage the new dispatch_table # feature from http://bugs.python.org/issue14166 that makes it possible # to use the C implementation of the Pickler which is faster. def __init__(self, writer, reducers=None, protocol=HIGHEST_PROTOCOL): Pickler.__init__(self, writer, protocol=protocol) if reducers is None: reducers = {} if hasattr(Pickler, 'dispatch'): # Make the dispatch registry an instance level attribute instead of # a reference to the class dictionary under Python 2 self.dispatch = Pickler.dispatch.copy() else: # Under Python 3 initialize the dispatch table with a copy of the # default registry self.dispatch_table = copyreg.dispatch_table.copy() for type, reduce_func in reducers.items(): self.register(type, reduce_func) def register(self, type, reduce_func): """Attach a reducer function to a given type in the dispatch table.""" if hasattr(Pickler, 'dispatch'): # Python 2 pickler dispatching is not explicitly customizable. # Let us use a closure to workaround this limitation. def dispatcher(self, obj): reduced = reduce_func(obj) self.save_reduce(obj=obj, *reduced) self.dispatch[type] = dispatcher else: self.dispatch_table[type] = reduce_func class CustomizablePicklingQueue(object): """Locked Pipe implementation that uses a customizable pickler. This class is an alternative to the multiprocessing implementation of SimpleQueue in order to make it possible to pass custom pickling reducers, for instance to avoid memory copy when passing memory mapped datastructures. `reducers` is expected to be a dict with key / values being `(type, callable)` pairs where `callable` is a function that, given an instance of `type`, will return a tuple `(constructor, tuple_of_objects)` to rebuild an instance out of the pickled `tuple_of_objects` as would return a `__reduce__` method. See the standard library documentation on pickling for more details. """ def __init__(self, context, reducers=None): self._reducers = reducers self._reader, self._writer = context.Pipe(duplex=False) self._rlock = context.Lock() if sys.platform == 'win32': self._wlock = None else: self._wlock = context.Lock() self._make_methods() def __getstate__(self): assert_spawning(self) return (self._reader, self._writer, self._rlock, self._wlock, self._reducers) def __setstate__(self, state): (self._reader, self._writer, self._rlock, self._wlock, self._reducers) = state self._make_methods() def empty(self): return not self._reader.poll() def _make_methods(self): self._recv = recv = self._reader.recv racquire, rrelease = self._rlock.acquire, self._rlock.release def get(): racquire() try: return recv() finally: rrelease() self.get = get if self._reducers: def send(obj): buffer = BytesIO() CustomizablePickler(buffer, self._reducers).dump(obj) self._writer.send_bytes(buffer.getvalue()) self._send = send else: self._send = send = self._writer.send if self._wlock is None: # writes to a message oriented win32 pipe are atomic self.put = send else: wlock_acquire, wlock_release = ( self._wlock.acquire, self._wlock.release) def put(obj): wlock_acquire() try: return send(obj) finally: wlock_release() self.put = put class PicklingPool(Pool): """Pool implementation with customizable pickling reducers. This is useful to control how data is shipped between processes and makes it possible to use shared memory without useless copies induces by the default pickling methods of the original objects passed as arguments to dispatch. `forward_reducers` and `backward_reducers` are expected to be dictionaries with key/values being `(type, callable)` pairs where `callable` is a function that, given an instance of `type`, will return a tuple `(constructor, tuple_of_objects)` to rebuild an instance out of the pickled `tuple_of_objects` as would return a `__reduce__` method. See the standard library documentation about pickling for more details. """ def __init__(self, processes=None, forward_reducers=None, backward_reducers=None, **kwargs): if forward_reducers is None: forward_reducers = dict() if backward_reducers is None: backward_reducers = dict() self._forward_reducers = forward_reducers self._backward_reducers = backward_reducers poolargs = dict(processes=processes) poolargs.update(kwargs) super(PicklingPool, self).__init__(**poolargs) def _setup_queues(self): context = getattr(self, '_ctx', mp) self._inqueue = CustomizablePicklingQueue(context, self._forward_reducers) self._outqueue = CustomizablePicklingQueue(context, self._backward_reducers) self._quick_put = self._inqueue._send self._quick_get = self._outqueue._recv def delete_folder(folder_path): """Utility function to cleanup a temporary folder if still existing.""" try: if os.path.exists(folder_path): shutil.rmtree(folder_path) except WindowsError: warnings.warn("Failed to clean temporary folder: %s" % folder_path) class MemmapingPool(PicklingPool): """Process pool that shares large arrays to avoid memory copy. This drop-in replacement for `multiprocessing.pool.Pool` makes it possible to work efficiently with shared memory in a numpy context. Existing instances of numpy.memmap are preserved: the child suprocesses will have access to the same shared memory in the original mode except for the 'w+' mode that is automatically transformed as 'r+' to avoid zeroing the original data upon instantiation. Furthermore large arrays from the parent process are automatically dumped to a temporary folder on the filesystem such as child processes to access their content via memmaping (file system backed shared memory). Note: it is important to call the terminate method to collect the temporary folder used by the pool. Parameters ---------- processes: int, optional Number of worker processes running concurrently in the pool. initializer: callable, optional Callable executed on worker process creation. initargs: tuple, optional Arguments passed to the initializer callable. temp_folder: str, optional Folder to be used by the pool for memmaping large arrays for sharing memory with worker processes. If None, this will try in order: - a folder pointed by the JOBLIB_TEMP_FOLDER environment variable, - /dev/shm if the folder exists and is writable: this is a RAMdisk filesystem available by default on modern Linux distributions, - the default system temporary folder that can be overridden with TMP, TMPDIR or TEMP environment variables, typically /tmp under Unix operating systems. max_nbytes int or None, optional, 1e6 by default Threshold on the size of arrays passed to the workers that triggers automated memory mapping in temp_folder. Use None to disable memmaping of large arrays. mmap_mode: {'r+', 'r', 'w+', 'c'} Memmapping mode for numpy arrays passed to workers. See 'max_nbytes' parameter documentation for more details. forward_reducers: dictionary, optional Reducers used to pickle objects passed from master to worker processes: see below. backward_reducers: dictionary, optional Reducers used to pickle return values from workers back to the master process. verbose: int, optional Make it possible to monitor how the communication of numpy arrays with the subprocess is handled (pickling or memmaping) prewarm: bool or str, optional, "auto" by default. If True, force a read on newly memmaped array to make sure that OS pre- cache it in memory. This can be useful to avoid concurrent disk access when the same data array is passed to different worker processes. If "auto" (by default), prewarm is set to True, unless the Linux shared memory partition /dev/shm is available and used as temp_folder. `forward_reducers` and `backward_reducers` are expected to be dictionaries with key/values being `(type, callable)` pairs where `callable` is a function that give an instance of `type` will return a tuple `(constructor, tuple_of_objects)` to rebuild an instance out of the pickled `tuple_of_objects` as would return a `__reduce__` method. See the standard library documentation on pickling for more details. """ def __init__(self, processes=None, temp_folder=None, max_nbytes=1e6, mmap_mode='r', forward_reducers=None, backward_reducers=None, verbose=0, context_id=None, prewarm=False, **kwargs): if forward_reducers is None: forward_reducers = dict() if backward_reducers is None: backward_reducers = dict() if context_id is not None: warnings.warn('context_id is deprecated and ignored in joblib' ' 0.9.4 and will be removed in 0.11', DeprecationWarning) # Prepare a sub-folder name for the serialization of this particular # pool instance (do not create in advance to spare FS write access if # no array is to be dumped): use_shared_mem = False pool_folder_name = "joblib_memmaping_pool_%d_%d" % ( os.getpid(), id(self)) if temp_folder is None: temp_folder = os.environ.get('JOBLIB_TEMP_FOLDER', None) if temp_folder is None: if os.path.exists(SYSTEM_SHARED_MEM_FS): try: temp_folder = SYSTEM_SHARED_MEM_FS pool_folder = os.path.join(temp_folder, pool_folder_name) if not os.path.exists(pool_folder): os.makedirs(pool_folder) use_shared_mem = True except IOError: # Missing rights in the the /dev/shm partition, # fallback to regular temp folder. temp_folder = None if temp_folder is None: # Fallback to the default tmp folder, typically /tmp temp_folder = tempfile.gettempdir() temp_folder = os.path.abspath(os.path.expanduser(temp_folder)) pool_folder = os.path.join(temp_folder, pool_folder_name) self._temp_folder = pool_folder # Register the garbage collector at program exit in case caller forgets # to call terminate explicitly: note we do not pass any reference to # self to ensure that this callback won't prevent garbage collection of # the pool instance and related file handler resources such as POSIX # semaphores and pipes pool_module_name = whichmodule(delete_folder, 'delete_folder') def _cleanup(): # In some cases the Python runtime seems to set delete_folder to # None just before exiting when accessing the delete_folder # function from the closure namespace. So instead we reimport # the delete_folder function explicitly. # https://github.com/joblib/joblib/issues/328 # We cannot just use from 'joblib.pool import delete_folder' # because joblib should only use relative imports to allow # easy vendoring. delete_folder = __import__( pool_module_name, fromlist=['delete_folder']).delete_folder delete_folder(pool_folder) atexit.register(_cleanup) if np is not None: # Register smart numpy.ndarray reducers that detects memmap backed # arrays and that is alse able to dump to memmap large in-memory # arrays over the max_nbytes threshold if prewarm == "auto": prewarm = not use_shared_mem forward_reduce_ndarray = ArrayMemmapReducer( max_nbytes, pool_folder, mmap_mode, verbose, prewarm=prewarm) forward_reducers[np.ndarray] = forward_reduce_ndarray forward_reducers[np.memmap] = reduce_memmap # Communication from child process to the parent process always # pickles in-memory numpy.ndarray without dumping them as memmap # to avoid confusing the caller and make it tricky to collect the # temporary folder backward_reduce_ndarray = ArrayMemmapReducer( None, pool_folder, mmap_mode, verbose) backward_reducers[np.ndarray] = backward_reduce_ndarray backward_reducers[np.memmap] = reduce_memmap poolargs = dict( processes=processes, forward_reducers=forward_reducers, backward_reducers=backward_reducers) poolargs.update(kwargs) super(MemmapingPool, self).__init__(**poolargs) def terminate(self): n_retries = 10 for i in range(n_retries): try: super(MemmapingPool, self).terminate() break except OSError as e: if isinstance(e, WindowsError): # Workaround occasional "[Error 5] Access is denied" issue # when trying to terminate a process under windows. sleep(0.1) if i + 1 == n_retries: warnings.warn("Failed to terminate worker processes in" " multiprocessing pool: %r" % e) delete_folder(self._temp_folder) joblib-0.11/joblib/test/000077500000000000000000000000001305577265600151345ustar00rootroot00000000000000joblib-0.11/joblib/test/__init__.py000066400000000000000000000001111305577265600172360ustar00rootroot00000000000000from joblib.test import test_memory from joblib.test import test_hashing joblib-0.11/joblib/test/common.py000066400000000000000000000057651305577265600170130ustar00rootroot00000000000000""" Small utilities for testing. """ import threading import signal import time import os import sys import gc from joblib._multiprocessing_helpers import mp from joblib.testing import SkipTest, skipif # A decorator to run tests only when numpy is available try: import numpy as np def with_numpy(func): """A decorator to skip tests requiring numpy.""" return func except ImportError: def with_numpy(func): """A decorator to skip tests requiring numpy.""" def my_func(): raise SkipTest('Test requires numpy') return my_func np = None # TODO: Turn this back on after refactoring yield based tests in test_hashing # with_numpy = skipif(not np, reason='Test requires numpy.') # we use memory_profiler library for memory consumption checks try: from memory_profiler import memory_usage def with_memory_profiler(func): """A decorator to skip tests requiring memory_profiler.""" return func def memory_used(func, *args, **kwargs): """Compute memory usage when executing func.""" gc.collect() mem_use = memory_usage((func, args, kwargs), interval=.001) return max(mem_use) - min(mem_use) except ImportError: def with_memory_profiler(func): """A decorator to skip tests requiring memory_profiler.""" def dummy_func(): raise SkipTest('Test requires memory_profiler.') return dummy_func memory_usage = memory_used = None # A utility to kill the test runner in case a multiprocessing assumption # triggers an infinite wait on a pipe by the master process for one of its # failed workers _KILLER_THREADS = dict() def setup_autokill(module_name, timeout=30): """Timeout based suiciding thread to kill the test runner process If some subprocess dies in an unexpected way we don't want the parent process to block indefinitely. """ if "NO_AUTOKILL" in os.environ or "--pdb" in sys.argv: # Do not install the autokiller return # Renew any previous contract under that name by first cancelling the # previous version (that should normally not happen in practice) teardown_autokill(module_name) def autokill(): pid = os.getpid() print("Timeout exceeded: terminating stalled process: %d" % pid) os.kill(pid, signal.SIGTERM) # If were are still there ask the OS to kill ourself for real time.sleep(0.5) print("Timeout exceeded: killing stalled process: %d" % pid) os.kill(pid, signal.SIGKILL) _KILLER_THREADS[module_name] = t = threading.Timer(timeout, autokill) t.start() def teardown_autokill(module_name): """Cancel a previously started killer thread""" killer = _KILLER_THREADS.get(module_name) if killer is not None: killer.cancel() with_multiprocessing = skipif( mp is None, reason='Needs multiprocessing to run.') with_dev_shm = skipif( not os.path.exists('/dev/shm'), reason='This test requires the /dev/shm shared memory fs.') joblib-0.11/joblib/test/data/000077500000000000000000000000001305577265600160455ustar00rootroot00000000000000joblib-0.11/joblib/test/data/__init__.py000066400000000000000000000000001305577265600201440ustar00rootroot00000000000000joblib-0.11/joblib/test/data/create_numpy_pickle.py000066400000000000000000000070311305577265600224420ustar00rootroot00000000000000""" This script is used to generate test data for joblib/test/test_numpy_pickle.py """ import sys import re # pytest needs to be able to import this module even when numpy is # not installed try: import numpy as np except ImportError: np = None import joblib def get_joblib_version(joblib_version=joblib.__version__): """Normalize joblib version by removing suffix. >>> get_joblib_version('0.8.4') '0.8.4' >>> get_joblib_version('0.8.4b1') '0.8.4' >>> get_joblib_version('0.9.dev0') '0.9' """ matches = [re.match(r'(\d+).*', each) for each in joblib_version.split('.')] return '.'.join([m.group(1) for m in matches if m is not None]) def write_test_pickle(to_pickle, args): kwargs = {} compress = args.compress method = args.method joblib_version = get_joblib_version() py_version = '{0[0]}{0[1]}'.format(sys.version_info) numpy_version = ''.join(np.__version__.split('.')[:2]) # The game here is to generate the right filename according to the options. body = '_compressed' if (compress and method == 'zlib') else '' if compress: if method == 'zlib': kwargs['compress'] = True extension = '.gz' else: kwargs['compress'] = (method, 3) extension = '.pkl.{}'.format(method) if args.cache_size: kwargs['cache_size'] = 0 body += '_cache_size' else: extension = '.pkl' pickle_filename = 'joblib_{}{}_pickle_py{}_np{}{}'.format( joblib_version, body, py_version, numpy_version, extension) try: joblib.dump(to_pickle, pickle_filename, **kwargs) except Exception as e: # With old python version (=< 3.3.), we can arrive there when # dumping compressed pickle with LzmaFile. print("Error: cannot generate file '{}' with arguments '{}'. " "Error was: {}".format(pickle_filename, kwargs, e)) else: print("File '{}' generated successfuly.".format(pickle_filename)) if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description="Joblib pickle data " "generator.") parser.add_argument('--cache_size', action="store_true", help="Force creation of companion numpy " "files for pickled arrays.") parser.add_argument('--compress', action="store_true", help="Generate compress pickles.") parser.add_argument('--method', type=str, default='zlib', choices=['zlib', 'gzip', 'bz2', 'xz', 'lzma'], help="Set compression method.") # We need to be specific about dtypes in particular endianness # because the pickles can be generated on one architecture and # the tests run on another one. See # https://github.com/joblib/joblib/issues/279. to_pickle = [np.arange(5, dtype=np.dtype('77P4 4f,P:|Ba S, E N *:UéI`7Jffegd [ 1=JJC-(X*T(PRB̭K;K _[S ⒢B0cfh-d eL*dn/d *dfΆ}L@q@Tu7x:c y2'&%k2& x3f0{3 &0_@PHXDTL\BRJZFVN^AQIYEUM]CSK[GWO? 0(8$4,<"2*:&6.>耔Դ̬ܼ¢ҲʪںƦֶή޾ 'M2u3g͞3w -^t+W^v 7m޲u;w޳w>r'O>s /]r7oݾs=~/_~?}?B%pP-T%`þPM,)ʬA_ .(USy?0SKr,9RAP=U`kjoblib-0.11/joblib/test/data/joblib_0.10.0_compressed_pickle_py27_np17.gz000066400000000000000000000013651305577265600257520ustar00rootroot00000000000000x^k`-dHOL+-/LIq+Ë R 5 j 5BYSJ* R YZ77P4 4f,P:|Ba S, E N *:UéI`7Jffegd [ 1=JJC-(X*T(PRB̭K;K _[S ⒢B0cfh-d eL*dn/d *dfΆ}L@q@Tu7x:c y2'&%k2& x3f0{3 &0_@PHXDTL\BRJZFVN^AQIYEUM]CSK[GWO? 0(8$4,<"2*:&6.>耔Դ̬ܼ¢ҲʪںƦֶή޾ 'M2u3g͞3w -^t+W^v 7m޲u;w޳w>r'O>s /]r7oݾs=~/_~?}?B%pP-T%`þPM,)ʬA_ .(USy?0SKr,9RAP=UwTjoblib-0.11/joblib/test/data/joblib_0.10.0_compressed_pickle_py33_np18.gz000066400000000000000000000014301305577265600257410ustar00rootroot00000000000000x^eSexPn]&a+Vn0VtCpÒ({MPB'nx)#=3:mt S<{)RbPQGV,+rY +dȍ+Ci$0/Yu#NF qi>"md҂(ǴP8iAZ YI* v +.Gུ_"Tvڃ1Ngh)؝ALRe`ATL\L²WJOpCR^*IZ2DPR3*F ‚CB9sEΓ7_ .UX%J)[TBJTV=FZԍW?AF4m7oan٪Mv;;u i1{8#x{دAI >|Qnj7~IL6}Y̝7E,X|U׬]~Mlݶ}]ݗC=vSϜ=wK\v[ܽwG<}W߼}O|_% OQG'!jE g W|ԨyJdGG3!23"$c _joblib-0.11/joblib/test/data/joblib_0.10.0_compressed_pickle_py34_np19.gz000066400000000000000000000014321305577265600257450ustar00rootroot00000000000000x^eSexPn]&a+Vn0e I Zo@Moju D :I%{g]}8()y)iÆ9jO8iӦϘ9k/Xh̥˖Xjoظi۶عkY8xǎ8y_x׮߸y?xϞx߾ QRA1ytfLR}AF(U+Jdk#Ege YQP1:_joblib-0.11/joblib/test/data/joblib_0.10.0_compressed_pickle_py35_np19.gz000066400000000000000000000014261305577265600257510ustar00rootroot00000000000000x^uSe|P]I;؆+C76tEd$ ʥ)00dw)2>K/w/E2=A0V`wۜfPQgY<9ŝNE@^z (*T kXPcr~; !҈CSGh@P*OQc4 ? C#a6m( ].y\`@ !` foYy?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~q"h)q#}q$(hhh KKq%h hhcnumpy.matrixlib.defmatrix matrix q&hubXC'est l'été !q'e.joblib-0.11/joblib/test/data/joblib_0.10.0_pickle_py27_np16.pkl.bz2000066400000000000000000000017451305577265600243710ustar00rootroot00000000000000BZh31AY&SYrG(\1EU4&&4LL1=``L4`40IdzLɓzG0j4i '2H FLmFh !4ɠdi)4B1SOI4id 4 LOSF@4 @C#FhL& hE4hm hh4 4ѠF@ɡhh = aL"ğ Bj ԑ@f ܰg(-Kd#V@E P4 bvaF*C2!ql8PfᩝÙ DAB (gL:iӎuH]w7&8NOw%2WgdNl "ne{Pr'{@_U߯Qr%YD!!k)0 [aCa1ˁ>&[D@Ph CRW V(Z O7opqr%tu k |-Uy p`CEő-Z䐥W:(uJ3V[38H&4"!d&4]dLċȗ̴IV8dH2jx-+7E y<4;;$(% ri3]z&H -a4z[Rg%)VkB NBPBh9*`PE V${n6qE#sE={p93:ͦ2F2Uyzny*'&MsJ ,'!@,\26:>BFJNRV[/13579;=?ACEGIK GMOUWY[]_ac~lVv,Q"/o0p1q2ryy:Zz]m}sw7$p@.p!Pjoblib-0.11/joblib/test/data/joblib_0.10.0_pickle_py27_np16.pkl.gzip000066400000000000000000000014011305577265600246320ustar00rootroot00000000000000k`-dHOL+-/LIq+Ë R 5 j 5BYSJ* R YZ77P4 4f,P:|Ba S, E N *:UéI`7Jffegd [ 1=JJC-(X*T(PRB̭K;K _[S ⒢B0cfh-d eL*dn/d *dfΆ}L@q@Tu7x:c y2'&%k2& x3f0{3 &0_@PHXDTL\BRJZFVN^AQIYEUM]CSK[GWO? 0(8$4,<"2*:&6.>耔Դ̬ܼ¢ҲʪںƦֶή޾ 'M2u3g͞3w -^t+W^v 7m޲u;w޳w>r'O>s /]r7oݾs=~/_~?}?B%pP-T%`þPM,)ʬA_ .(USy?0SKr,9RAP=U`kjoblib-0.11/joblib/test/data/joblib_0.10.0_pickle_py27_np17.pkl000066400000000000000000000017321305577265600236720ustar00rootroot00000000000000]q(cjoblib.numpy_pickle NumpyArrayWrapper q)q}q(Udtypeqcnumpy dtype qUi8qKKqRq(KU?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~q"h)q#}q$(hhh KKq%h hhcnumpy.matrixlib.defmatrix matrix q&hubXC'est l'été !q'e.joblib-0.11/joblib/test/data/joblib_0.10.0_pickle_py27_np17.pkl.bz2000066400000000000000000000017451305577265600243720ustar00rootroot00000000000000BZh91AY&SYrG(\1EU4&&4LL1=``L4`40IdzLɓzG0j4i '2H FLmFh !4ɠdi)4B1SOI4id 4 LOSF@4 @C#FhL& hE4hm hh4 4ѠF@ɡhh = aL"ğ Bj ԑ@f ܰg(-Kd#V@E P4 bvaF*C2!ql8PfᩝÙ DAB (gL:iӎuH]w7&8NOw%2WgdNl "ne{Pr'{@_U߯Qr%YD!!k)0 [aCa1ˁ>&[D@Ph CRW V(Z O7opqr%tu k |-Uy p`CEő-Z䐥W:(uJ3V[38H&4"!d&4]dLċȗ̴IV8dH2jx-+7E y<4;;$(% ri3]z&H -a4z[Rg%)VkB NBPBh9*`PE V${n6qE#sE={p93:ͦ2F2Uyzny*'&MsJ ,'!@,\26:>BFJNRV[/13579;=?ACEGIK GMOUWY[]_ac~lVv,Q"/o0p1q2ryy:Zz]m}sw7$p@.p!Pjoblib-0.11/joblib/test/data/joblib_0.10.0_pickle_py27_np17.pkl.gzip000066400000000000000000000014361305577265600246430ustar00rootroot00000000000000zVjoblib_0.10.0_pickle_py27_np17.pklk`-dHOL+-/LIq+Ë R 5 j 5BYSJ* R YZ77P4 4f,P:|Ba S, E N *:UéI`7Jffegd [  JJC-(X*T(PRB̭K;K _[S ⒢B0cfh-d eL*dn/d *dfΆ}`q@Tu7x:c y2'&%k2& x3f0{3 &酀_@PHXDTL\BRJZFVN^AQIYEUM]CSK[GWO? 0(8$4,<"2*:&6.>耔Դ̬ܼ¢ҲʪںƦֶή޾ 'M2u3g͞3w -^t+W^v 7m޲u;w޳w>r'O>s /]r7oݾs=~/_~?}?B%pP-T%`þPM,)ʬA_ .(U/G g+KTP,TO`kjoblib-0.11/joblib/test/data/joblib_0.10.0_pickle_py27_np17.pkl.lzma000066400000000000000000000012241305577265600246300ustar00rootroot00000000000000]@I׀qm&˻1)'vQɨ˗)߀y=@-kMeqgͅܧ?Iu?^hѭ(}: );(lHnnACN>5Xgsw)DEZ4̒̀娅aq"u|#T9 _DIOU`B}s;_3y&R(~ BcXiNKG! -Ɂu"S'FϤ yH?9m![E3]U{W E#5(R:ѽT6E- Tc~Z/r2/\KJmA-%;{C?E ǖ0Racf[VpR5qtCQG^5Xgsw)DEZ4̒̀娅aq"u|#T9 _DIOU`B}s;_3y&R(~ BcXiNKG! -Ɂu"S'FϤ yH?9m![E3]U{W E#5(R:ѽT6E- Tc~Z/r2/\KJmA-%;{C?E ǖ0Racf[VpR5qtCQG^׳0gYZjoblib-0.11/joblib/test/data/joblib_0.10.0_pickle_py33_np18.pkl000066400000000000000000000020541305577265600236660ustar00rootroot00000000000000]q(cjoblib.numpy_pickle NumpyArrayWrapper q)q}q(XshapeqKqX allow_mmapqXsubclassqcnumpy ndarray qXdtypeq cnumpy dtype q Xi8q KKq Rq (KX?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~q!h)q"}q#(hKKq$hhcnumpy.matrixlib.defmatrix matrix q%h h hhubXC'est l'été !q&e.joblib-0.11/joblib/test/data/joblib_0.10.0_pickle_py33_np18.pkl.bz2000066400000000000000000000017501305577265600243640ustar00rootroot00000000000000BZh91AY&SY|u*B12i0 LM04U= &dF&H& f` 2b``&LLidɦ")h䧚Q3H hj=GG  5 SOjiCO4h44Aeީae٥ppJ(S,b lB , .TB*(2ĴB &P c{6&BFJNRVZ^bfjnrvz~ j+k,l-m.n/o֡0 0p1q2r3s4t5uw{g_xC]BBn\joblib-0.11/joblib/test/data/joblib_0.10.0_pickle_py33_np18.pkl.gzip000066400000000000000000000014771305577265600246460ustar00rootroot00000000000000zVjoblib_0.10.0_pickle_py33_np18.pkluexPvݺL oV0tK@"$i TN 0݇pwwwwZ{MQ%#{:Jn3촒 5,'w`qbQPJT:AkX1)v}Pc.wRτ88m6 LFX&Hwy݂ Pq o O=MJ~A)2 Zٵ `04ɇB 7ȡU)JVYiXaȣ}h5-5?:h_hP)^%ȍDI4$(Ҡ m$ha#w͍N G(;ӓ g.RX7ɡ} I%i&t]h >28gǁ:|#0`͝@z[c=JP~/ ?8c 7A'&ST}4h@`P6$4,OxD| ,THѨbKՕ*]l9} q+URZ|5kծX^ 5nҴּe6m۵`ةsn սmًN`]w :xд̬aG5zq'L4yig̜5{y,\xIeW\zu7lܴymwܵ{ޜ}Uغ:YP-"l/蔥NcKO}xZspK8YyY-Ktr|ǚQ͂~1>`ᘻ˳19{VZXiv`mKF aA[.'cm3`B٥g$T|}nkϏ-Oneed iD1׽zJ[B}`7 ĠsOgk@Q**r};ufמ3]WWgƬ:Y (zp,'5Jlj 3/5S/֒Q[)(9c EiqDƊՉ^xi=٠ qZE}a!zud!a~3*q_lAljDAL ۥٵ%^*v֐bt]e0J]%& nTPޟ 7 z(q$ =/`֩䃢S~} ,)p7g¹CX\OTz%T%$BF7ъg44/%Ҕ0^D:,䇝BVթU]7y-[F ܹ 6%0(ɫE,jJ ¬Z|nF=vvbtyw噉)<[w@9CTnh4ZuzhgYZjoblib-0.11/joblib/test/data/joblib_0.10.0_pickle_py34_np19.pkl000066400000000000000000000020541305577265600236700ustar00rootroot00000000000000]q(cjoblib.numpy_pickle NumpyArrayWrapper q)q}q(Xdtypeqcnumpy dtype qXi8qKKqRq(KX?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~q!h)q"}q#(hhh cnumpy.matrixlib.defmatrix matrix q$h hhhKKq%ubXC'est l'été !q&e.joblib-0.11/joblib/test/data/joblib_0.10.0_pickle_py34_np19.pkl.bz2000066400000000000000000000017751305577265600243750ustar00rootroot00000000000000BZh91AY&SYz!|QyL4&L I`# 10  4L10A!=CІ#z"dƚ'j4='iO&SɄ=FG FѠLM0&#!M#&@4hC*z)43S&M4=@A2@@4ԂH4M3SMhh4d @4 (,F`6ia-\;KȈb/?+EPB K , &$'#eTˤ l&it$ xF1B.-Lq5v:"vCS\,z"zD}/ɡD0=QhIeѬ̛.eTv "-:<>89:gv]ADd@Љ1r|}~QTHFxO$\\?RP J DvD ` 6 (6Mdqnd bFIק#xL#3((ja#W}$TXhȃT1OBfӤbL:5y0֜83{`!VdeŸyp*lq kQ{5-J2Р!aȀ& OZ4;" MA2@D0l\'V&L'%yhc~~ UR"$+YfD2]42*nJB(cTX⑔`େLyKInrmPcO n0IH@B$!!"b#c$d%eLMNOPQ*5=EMU]emu}zG #'+/37;?CGHN'?0?G󧫯L;)„ joblib-0.11/joblib/test/data/joblib_0.10.0_pickle_py34_np19.pkl.gzip000066400000000000000000000014771305577265600246500ustar00rootroot00000000000000zVjoblib_0.10.0_pickle_py34_np19.pkluu|@S]I7pٰ+C7(P,(IH!_ 0݇pwwwwO ۇ#rwwUNFJlTOFMF]#sEs((J& *C!)pAMQ GAkNJsBᨠ:{[v4q!\4cd@+ tk-q A)R! z׮!& !\*eUɪm+r5aKcDGaL#ECeT*/cJ$:YCbEh^/.@UɐE4P(k"tR:3#h6) GSN<oZ I%eVrWAAƵosTM h :30/wTY%Afm]zZ/0!4Ɖa**FG!9rʝ'o *Yh% %K.SX|*WZ-zj׉W~7i5knjѲu:vU`ugl=zv'o >t#G3v 'M2u3g͞3w -Xt+W^v 7m޲u;wޓw>r'O>s /]r7oݾs=~/_~?})^"^(*9[#K T(N @q>pFn>33""m e,joblib-0.11/joblib/test/data/joblib_0.10.0_pickle_py34_np19.pkl.lzma000066400000000000000000000012711305577265600246320ustar00rootroot00000000000000],@I׀qm&˻1)'vQɨ˗)߀y=@-kMe5aKKN&5Pu\0FU֚9F,񷊶ѬwRb&$Y\L~~}HƸjx.߆Za3h7 a9.c(=Lnku,LCU%p=%(6r}w;ﱢ) Q}OTvy%Y,V*K= ێ*բI]jL5fӎ]%6G+3=ϭ3Yoez=zj(S\}ў䋗\2Q)6C*b\ %4mO2AtنԦe${P"x4hDD!g\ wI&$7,"櫻S:g_珸|d3әEZOZG# > Γdd@ L{=A!0yXpLQ˘u!t8qLz^tq2Us')PZTGyŲ/ZRr<=':j?/F NOT?^󐓆Pz@n= A}]2hgdm Aψ:8X5 ͏ub\V83+G?zpjoblib-0.11/joblib/test/data/joblib_0.10.0_pickle_py34_np19.pkl.xz000066400000000000000000000013601305577265600243270ustar00rootroot000000000000007zXZִF!t/+]@I׀qm&˻1)'vQɨ˗)߀y=@-kMe4clSb9x 4*_ae{"d"GoOӴ$^ 6b/D ֫)8*d6FRfn2!={ZXR Dv!YȺz˒B-c7x"YйѮ~}(4̞>vi@J Ӷ!Z@Wo^#5IY?~2 uAjTs/Zzɒ'Hy5hfXT2nyhTM)d! d뵜uPlWLCV&/ML@α]t"-&14qRPpB^W?=.hʐ-;(%1Ձ jMhO!A#<{p^yhAY1oгx/ŸmB s c-}/wX q܎zQD(+Ost/lG5 aB98VD$=3P-m1}&2c`C5s# K4Iv<;a"(i |i$7}LϨ MOH XbϠ k-7L&;#"iEE' {Z䳠^# Z^\ZQ?밤C o.x :Agr1dgYZjoblib-0.11/joblib/test/data/joblib_0.10.0_pickle_py35_np19.pkl000066400000000000000000000020541305577265600236710ustar00rootroot00000000000000]q(cjoblib.numpy_pickle NumpyArrayWrapper q)q}q(X allow_mmapqXshapeqKqXorderqXCqXsubclassq cnumpy ndarray q Xdtypeq cnumpy dtype q Xi8q KKqRq(KX?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~q!h)q"}q#(hhKKq$hhh cnumpy.matrixlib.defmatrix matrix q%h hubXC'est l'été !q&e.joblib-0.11/joblib/test/data/joblib_0.10.0_pickle_py35_np19.pkl.bz2000066400000000000000000000017551305577265600243740ustar00rootroot00000000000000BZh91AY&SYG|Q4L&0&d=b `&M0!M@mT1=G4' )zO&`@@dhb2dA&M240FL 4р"5cRh3SL@d@ 44~z6i4Mz f![#NB8GȈ.?*@( k 6- l5m[mLL%9DPR $p*l2PIT9=hzQe?IZIbKT0L$2&LH#33Cf0lfYч3&MfXsÛ0X=Hq؄@N.[m6QtuM燒G(O!T%DUդ%{|YH| 6"d&LB@:ċ@P‡iQ~)3FT0%P7 =V:rsEnW l!*|͠7`\%?`Cx@I-'g9%@!)p!PpX Q A7C. BpT Xb.!T (UFBY̿?2A!;l/E zC%^(C" ʤ+iq<ZLhLQy2RN6=,oehpQ@Φ C@MY((($hqY9*.|z30Ȏ 4CPN0Bf&A. )yB(bjCi'Shu>z?? 0(8$4W}yHI8hpZzƐ̡Æ9jO8iӦϘ9k/Xhq֒˖Xjoظi۶عk8xǎ8y_x׮߸y?xϞx߾(yPTl Jz, ;PLbP|n>ⳳ"m Rޫ_,joblib-0.11/joblib/test/data/joblib_0.10.0_pickle_py35_np19.pkl.lzma000066400000000000000000000012751305577265600246370ustar00rootroot00000000000000],@I׀qm&˻1)'vQɨ˗)߀y=@-kMe4clSb9x 4*_ae{"d"GoOӴ$^ 6b/D ֫)! yTkEqtF 0WȞɁXpI秓HYimI$f><j~x<>͞py~s `{ 4ALa5bY}x^z& J'7m(D_񒘟L^[- CRS} *w*c)y,}v\p]_EK/7{s9"e$*QOqHM½3\&RE~ f#yYtT]u{8N#V^~Y@1$UVDDyN}GE8ߍ] RKl'Rv?ճ573ԿQ3cMsptspv45iV@Q|"D."0g:{"`  j˩4 [9O:2yy-70V-⁢AթP,hh(.0Aɿ'C3TJ~E VY3Kݴ"ZJ\ *L;, 8@~n 4)XU@>&M GsE!TF"\( IѲ: *7#@eT˓7@`P! .RX%K(S6RU|JTVFZѵԭWALløF4mּ޲uv;tL :u5;`azdzs6 Kp/dA) >?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~q!h)q"}q#(hcnumpy.matrixlib.defmatrix matrix q$hKKq%hh h hhubXC'est l'été !q&e.joblib-0.11/joblib/test/data/joblib_0.11.0_pickle_py36_np111.pkl.bz2000066400000000000000000000017371305577265600244470ustar00rootroot00000000000000BZh31AY&SYf|sFqѠ000" 4ژLHhɚz =#&10OH @! LM04i05)zSOJ2i@ =@4@m@zhhh2h 4h4C 44@Ѧ=FESQ!|+ *l舔.?*FThB͹ Bp^ 4rʖP (# 2`$3+y`-% EdV4, 6$2-p&$Le͆  Bb&C gØ93J'1*>f^QQOPox& .Aeimquy}B̬L`"4h-M|cMP`\" #Z ,%R:.>+ջEkAhSPYM mHaA >? ؉QjC[O"M+0@D1897P?056.O@cxI̓P.\MDDd! @$g[fdKab'k:XR2MPϸhtt lwrt;w ':X NV@6;ZY!#%')+-X[]_acegikmoqsuwy{}Au5ug>[&/dd? i"(H3T@joblib-0.11/joblib/test/data/joblib_0.11.0_pickle_py36_np111.pkl.gzip000066400000000000000000000014401305577265600247120ustar00rootroot00000000000000mSu\AcUB8[1AWO=5[ӽx{{*) vw7vwcwwww 7f}ߛM:z⬔9N+ݛc0hdRO: 2j0<:BrS4G\B1Zq% T#h~R"$C7 B[hȋ+pe:3WKAhl4 )C[$9m6 AH&M)5^ϊe@=aD8ձ~,LzDC̤ !& S쿠r71^X>0V-⁢AթP,hh(.0Aɿ'C3TJ~E VY3Kݴ"ZJ\ *L;, 8@~n 4)XU@>&M GsE!TF"\( IѲ: *7#@eT˓7@`P! .RX%K(S6RU|JTVFZѵԭWALløF4mּ޲uv;tL :u5;`azdzs6 Kp/dA) >͚iCtUёT؟8=:!2V0gQ;Ywǖ+Zv“.Fg<5~6+Us9\/u>It='G8@^u{L\?Q 9IQw¾sbV44edB-_U>'UVÉG9C E boUEعwO90`{:`1 < |+A̬a/ղ3"jtmTi7 }}id#-MQAU[;y}(#R2zi%>kK1I1;. ~07lŽV˩W̻2-!l>aCD[㴲YH& ?cBy^$ qa)B70҅RS`l gcC B"rXa-k!c54EcM-[+͚iCtUёT؟8=:!2V0gQ;Ywǖ+Zv“.Fg<5~6+Us9\/u>It='G8@^u{L\?Q 9IQw¾sbV44edB-_U>'UVÉG9C E boUEعwO90`{:`1 < |+A̬a/ղ3"jtmTi7 }}id#-MQAU[;y}(#R2zi%>kK1I1;. ~07lŽV˩W̻2-!l>aCD[㴲YH& ?cBy^$ qa)B70҅RS`l gcC B"rXa-k!c54EcM-[+?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~qh)q}q(hhcnumpy.matrixlib.defmatrix matrix qhU(joblib_0.9.2_pickle_py27_np16.pkl_04.npyqubXC'est l'été !qe.joblib-0.11/joblib/test/data/joblib_0.9.2_pickle_py27_np16.pkl_01.npy000066400000000000000000000001701305577265600247230ustar00rootroot00000000000000NUMPYF{'descr': '?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~qh)q}q(hhcnumpy.matrixlib.defmatrix matrix qhU(joblib_0.9.2_pickle_py27_np17.pkl_04.npyqubXC'est l'été !qe.joblib-0.11/joblib/test/data/joblib_0.9.2_pickle_py27_np17.pkl_01.npy000066400000000000000000000001701305577265600247240ustar00rootroot00000000000000NUMPYF{'descr': '?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~qh)q}q(hhcnumpy.matrixlib.defmatrix matrix qhX(joblib_0.9.2_pickle_py33_np18.pkl_04.npyqubXC'est l'été !qe.joblib-0.11/joblib/test/data/joblib_0.9.2_pickle_py33_np18.pkl_01.npy000066400000000000000000000001701305577265600247220ustar00rootroot00000000000000NUMPYF{'descr': '?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~qh)q}q(hX(joblib_0.9.2_pickle_py34_np19.pkl_04.npyqhhcnumpy.matrixlib.defmatrix matrix qubXC'est l'été !qe.joblib-0.11/joblib/test/data/joblib_0.9.2_pickle_py34_np19.pkl_01.npy000066400000000000000000000001701305577265600247240ustar00rootroot00000000000000NUMPYF{'descr': '?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~qh)q}q(hcnumpy.matrixlib.defmatrix matrix qhhX(joblib_0.9.2_pickle_py35_np19.pkl_04.npyqubXC'est l'été !qe.joblib-0.11/joblib/test/data/joblib_0.9.2_pickle_py35_np19.pkl_01.npy000066400000000000000000000001701305577265600247250ustar00rootroot00000000000000NUMPYF{'descr': 'oB~gƤBB̜ԼB x=K=2܂Ԕ̪T *M -ҫ * K2@~-ȀP,4Bq$ %;`d+AA](W TҜLH&&pCrbP([ ^(TxE5P ?5c_R[  ĤBBm p|&x3Yތ%zIN F&fV6vN.n^>~A!aQ1q I)iY9yE%eU5u M-m]=}C#cS3s K+k[;{G'gW7wO/o_?аȨظCRR32sr KJ+*kj[Z;:{z'L4yig̜5{y,\xeW\zu7lܴymwܵ{} # Lars Buitinck # Copyright (c) 2010 Gael Varoquaux # License: BSD Style, 3 clauses. from __future__ import with_statement import array import os from joblib.disk import disk_used, memstr_to_bytes, mkdirp from joblib.testing import parametrize, raises ############################################################################### def test_disk_used(tmpdir): cachedir = tmpdir.strpath # Not write a file that is 1M big in this directory, and check the # size. The reason we use such a big file is that it makes us robust # to errors due to block allocation. a = array.array('i') sizeof_i = a.itemsize target_size = 1024 n = int(target_size * 1024 / sizeof_i) a = array.array('i', n * (1,)) with open(os.path.join(cachedir, 'test'), 'wb') as output: a.tofile(output) assert disk_used(cachedir) >= target_size assert disk_used(cachedir) < target_size + 12 @parametrize('text,value', [('80G', 80 * 1024 ** 3), ('1.4M', int(1.4 * 1024 ** 2)), ('120M', 120 * 1024 ** 2), ('53K', 53 * 1024)]) def test_memstr_to_bytes(text, value): assert memstr_to_bytes(text) == value @parametrize('text,exception,regex', [('fooG', ValueError, r'Invalid literal for size.*fooG.*'), ('1.4N', ValueError, r'Invalid literal for size.*1.4N.*')]) def test_memstr_to_bytes_exception(text, exception, regex): with raises(exception) as excinfo: memstr_to_bytes(text) assert excinfo.match(regex) def test_mkdirp(tmpdir): mkdirp(os.path.join(tmpdir.strpath, 'ham')) mkdirp(os.path.join(tmpdir.strpath, 'ham')) mkdirp(os.path.join(tmpdir.strpath, 'spam', 'spam')) # Not all OSErrors are ignored with raises(OSError): mkdirp('') joblib-0.11/joblib/test/test_format_stack.py000066400000000000000000000100531305577265600212210ustar00rootroot00000000000000""" Unit tests for the stack formatting utilities """ # Author: Gael Varoquaux # Copyright (c) 2010 Gael Varoquaux # License: BSD Style, 3 clauses. import imp import os import re import sys from joblib.format_stack import safe_repr, _fixed_getframes, format_records from joblib.format_stack import format_exc from joblib.test.common import with_numpy, np ############################################################################### class Vicious(object): def __repr__(self): raise ValueError def test_safe_repr(): safe_repr(Vicious()) def _change_file_extensions_to_pyc(record): _1, filename, _2, _3, _4, _5 = record if filename.endswith('.py'): filename += 'c' return _1, filename, _2, _3, _4, _5 def _raise_exception(a, b): """Function that raises with a non trivial call stack """ def helper(a, b): raise ValueError('Nope, this can not work') helper(a, b) def test_format_records(): try: _raise_exception('a', 42) except ValueError: etb = sys.exc_info()[2] records = _fixed_getframes(etb) # Modify filenames in traceback records from .py to .pyc pyc_records = [_change_file_extensions_to_pyc(record) for record in records] formatted_records = format_records(pyc_records) # Check that the .py file and not the .pyc one is listed in # the traceback for fmt_rec in formatted_records: assert 'test_format_stack.py in' in fmt_rec # Check exception stack arrow_regex = r'^-+>\s+\d+\s+' assert re.search(arrow_regex + "_raise_exception\('a', 42\)", formatted_records[0], re.MULTILINE) assert re.search(arrow_regex + r'helper\(a, b\)', formatted_records[1], re.MULTILINE) assert "a = 'a'" in formatted_records[1] assert 'b = 42' in formatted_records[1] assert re.search(arrow_regex + "raise ValueError\('Nope, this can not work'\)", formatted_records[2], re.MULTILINE) def test_format_records_file_with_less_lines_than_context(tmpdir): # See https://github.com/joblib/joblib/issues/420 filename = os.path.join(tmpdir.strpath, 'small_file.py') code_lines = ['def func():', ' 1/0'] code = '\n'.join(code_lines) open(filename, 'w').write(code) small_file = imp.load_source('small_file', filename) try: small_file.func() except ZeroDivisionError: etb = sys.exc_info()[2] records = _fixed_getframes(etb, context=10) # Check that if context is bigger than the number of lines in # the file you do not get padding frame, tb_filename, line, func_name, context, _ = records[-1] assert [l.rstrip() for l in context] == code_lines formatted_records = format_records(records) # 2 lines for header in the traceback: lines of ...... + # filename with function len_header = 2 nb_lines_formatted_records = len(formatted_records[1].splitlines()) assert (nb_lines_formatted_records == len_header + len(code_lines)) # Check exception stack arrow_regex = r'^-+>\s+\d+\s+' assert re.search(arrow_regex + r'1/0', formatted_records[1], re.MULTILINE) @with_numpy def test_format_exc_with_compiled_code(): # Trying to tokenize compiled C code raise SyntaxError. # See https://github.com/joblib/joblib/issues/101 for more details. try: np.random.uniform('invalid_value') except Exception: exc_type, exc_value, exc_traceback = sys.exc_info() formatted_exc = format_exc(exc_type, exc_value, exc_traceback, context=10) # The name of the extension can be something like # mtrand.cpython-33m.so pattern = 'mtrand[a-z0-9._-]*\.(so|pyd)' assert re.search(pattern, formatted_exc) joblib-0.11/joblib/test/test_func_inspect.py000066400000000000000000000206251305577265600212320ustar00rootroot00000000000000""" Test the func_inspect module. """ # Author: Gael Varoquaux # Copyright (c) 2009 Gael Varoquaux # License: BSD Style, 3 clauses. import functools from joblib.func_inspect import filter_args, get_func_name, get_func_code from joblib.func_inspect import _clean_win_chars, format_signature from joblib.memory import Memory from joblib.test.common import with_numpy from joblib.testing import fixture, parametrize, raises from joblib._compat import PY3_OR_LATER ############################################################################### # Module-level functions and fixture, for tests def f(x, y=0): pass def g(x): pass def h(x, y=0, *args, **kwargs): pass def i(x=1): pass def j(x, y, **kwargs): pass def k(*args, **kwargs): pass @fixture(scope='module') def cached_func(tmpdir_factory): # Create a Memory object to test decorated functions. # We should be careful not to call the decorated functions, so that # cache directories are not created in the temp dir. cachedir = tmpdir_factory.mktemp("joblib_test_func_inspect") mem = Memory(cachedir.strpath) @mem.cache def cached_func_inner(x): return x return cached_func_inner class Klass(object): def f(self, x): return x ############################################################################### # Tests @parametrize('func,args,filtered_args', [(f, [[], (1, )], {'x': 1, 'y': 0}), (f, [['x'], (1, )], {'y': 0}), (f, [['y'], (0, )], {'x': 0}), (f, [['y'], (0, ), {'y': 1}], {'x': 0}), (f, [['x', 'y'], (0, )], {}), (f, [[], (0,), {'y': 1}], {'x': 0, 'y': 1}), (f, [['y'], (), {'x': 2, 'y': 1}], {'x': 2}), (g, [[], (), {'x': 1}], {'x': 1}), (i, [[], (2, )], {'x': 2})]) def test_filter_args(func, args, filtered_args): assert filter_args(func, *args) == filtered_args def test_filter_args_method(): obj = Klass() assert filter_args(obj.f, [], (1, )) == {'x': 1, 'self': obj} @parametrize('func,args,filtered_args', [(h, [[], (1, )], {'x': 1, 'y': 0, '*': [], '**': {}}), (h, [[], (1, 2, 3, 4)], {'x': 1, 'y': 2, '*': [3, 4], '**': {}}), (h, [[], (1, 25), {'ee': 2}], {'x': 1, 'y': 25, '*': [], '**': {'ee': 2}}), (h, [['*'], (1, 2, 25), {'ee': 2}], {'x': 1, 'y': 2, '**': {'ee': 2}})]) def test_filter_varargs(func, args, filtered_args): assert filter_args(func, *args) == filtered_args @parametrize('func,args,filtered_args', [(k, [[], (1, 2), {'ee': 2}], {'*': [1, 2], '**': {'ee': 2}}), (k, [[], (3, 4)], {'*': [3, 4], '**': {}})]) def test_filter_kwargs(func, args, filtered_args): assert filter_args(func, *args) == filtered_args def test_filter_args_2(): assert (filter_args(j, [], (1, 2), {'ee': 2}) == {'x': 1, 'y': 2, '**': {'ee': 2}}) ff = functools.partial(f, 1) # filter_args has to special-case partial assert filter_args(ff, [], (1, )) == {'*': [1], '**': {}} assert filter_args(ff, ['y'], (1, )) == {'*': [1], '**': {}} @parametrize('func,funcname', [(f, 'f'), (g, 'g'), (cached_func, 'cached_func')]) def test_func_name(func, funcname): # Check that we are not confused by decoration # here testcase 'cached_func' is the function itself assert get_func_name(func)[1] == funcname def test_func_name_on_inner_func(cached_func): # Check that we are not confused by decoration # here testcase 'cached_func' is the 'cached_func_inner' function # returned by 'cached_func' fixture assert get_func_name(cached_func)[1] == 'cached_func_inner' def test_func_inspect_errors(): # Check that func_inspect is robust and will work on weird objects assert get_func_name('a'.lower)[-1] == 'lower' assert get_func_code('a'.lower)[1:] == (None, -1) ff = lambda x: x assert get_func_name(ff, win_characters=False)[-1] == '' assert get_func_code(ff)[1] == __file__.replace('.pyc', '.py') # Simulate a function defined in __main__ ff.__module__ = '__main__' assert get_func_name(ff, win_characters=False)[-1] == '' assert get_func_code(ff)[1] == __file__.replace('.pyc', '.py') def func_with_kwonly_args(a, b, kw1='kw1', kw2='kw2'): pass def func_with_signature(a, b): pass if PY3_OR_LATER: exec(""" def func_with_kwonly_args(a, b, *, kw1='kw1', kw2='kw2'): pass def func_with_signature(a: int, b: int) -> None: pass """) def test_filter_args_python_3(): assert ( filter_args(func_with_kwonly_args, [], (1, 2), {'kw1': 3, 'kw2': 4}) == {'a': 1, 'b': 2, 'kw1': 3, 'kw2': 4}) # filter_args doesn't care about keyword-only arguments so you # can pass 'kw1' into *args without any problem with raises(ValueError) as excinfo: filter_args(func_with_kwonly_args, [], (1, 2, 3), {'kw2': 2}) excinfo.match("Keyword-only parameter 'kw1' was passed as positional " "parameter") assert ( filter_args(func_with_kwonly_args, ['b', 'kw2'], (1, 2), {'kw1': 3, 'kw2': 4}) == {'a': 1, 'kw1': 3}) assert (filter_args(func_with_signature, ['b'], (1, 2)) == {'a': 1}) def test_bound_methods(): """ Make sure that calling the same method on two different instances of the same class does resolv to different signatures. """ a = Klass() b = Klass() assert filter_args(a.f, [], (1, )) != filter_args(b.f, [], (1, )) @parametrize('exception,regex,func,args', [(ValueError, 'ignore_lst must be a list of parameters to ignore', f, ['bar', (None, )]), (ValueError, 'Ignore list: argument \'(.*)\' is not defined', g, [['bar'], (None, )]), (ValueError, 'Wrong number of arguments', h, [[]])]) def test_filter_args_error_msg(exception, regex, func, args): """ Make sure that filter_args returns decent error messages, for the sake of the user. """ with raises(exception) as excinfo: filter_args(func, *args) excinfo.match(regex) def test_clean_win_chars(): string = r'C:\foo\bar\main.py' mangled_string = _clean_win_chars(string) for char in ('\\', ':', '<', '>', '!'): assert char not in mangled_string @parametrize('func,args,kwargs,sgn_expected', [(g, [list(range(5))], {}, 'g([0, 1, 2, 3, 4])'), (k, [1, 2, (3, 4)], {'y': True}, 'k(1, 2, (3, 4), y=True)')]) def test_format_signature(func, args, kwargs, sgn_expected): # Test signature formatting. path, sgn_result = format_signature(func, *args, **kwargs) assert sgn_result == sgn_expected def test_format_signature_long_arguments(): shortening_threshold = 1500 # shortening gets it down to 700 characters but there is the name # of the function in the signature and a few additional things # like dots for the ellipsis shortening_target = 700 + 10 arg = 'a' * shortening_threshold _, signature = format_signature(h, arg) assert len(signature) < shortening_target nb_args = 5 args = [arg for _ in range(nb_args)] _, signature = format_signature(h, *args) assert len(signature) < shortening_target * nb_args kwargs = {str(i): arg for i, arg in enumerate(args)} _, signature = format_signature(h, **kwargs) assert len(signature) < shortening_target * nb_args _, signature = format_signature(h, *args, **kwargs) assert len(signature) < shortening_target * 2 * nb_args @with_numpy def test_format_signature_numpy(): """ Test the format signature formatting with numpy. """ def test_special_source_encoding(): from joblib.test.test_func_inspect_special_encoding import big5_f func_code, source_file, first_line = get_func_code(big5_f) assert first_line == 5 assert "def big5_f():" in func_code assert "test_func_inspect_special_encoding" in source_file def _get_code(): from joblib.test.test_func_inspect_special_encoding import big5_f return get_func_code(big5_f)[0] def test_func_code_consistency(): from joblib.parallel import Parallel, delayed codes = Parallel(n_jobs=2)(delayed(_get_code)() for _ in range(5)) assert len(set(codes)) == 1 joblib-0.11/joblib/test/test_func_inspect_special_encoding.py000066400000000000000000000002221305577265600245670ustar00rootroot00000000000000# -*- coding: big5 -*- # Some Traditional Chinese characters: @Ǥr def big5_f(): """Ωժ """ # return 0 joblib-0.11/joblib/test/test_hashing.py000066400000000000000000000353231305577265600201740ustar00rootroot00000000000000""" Test the hashing module. """ # Author: Gael Varoquaux # Copyright (c) 2009 Gael Varoquaux # License: BSD Style, 3 clauses. import time import hashlib import sys import gc import io import collections import itertools import pickle import random from decimal import Decimal from joblib.hashing import hash from joblib.func_inspect import filter_args from joblib.memory import Memory from joblib.testing import raises, skipif, fixture, parametrize from joblib.test.common import np, with_numpy from joblib.my_exceptions import TransportableException from joblib._compat import PY3_OR_LATER try: # Python 2/Python 3 compat unicode('str') except NameError: unicode = lambda s: s ############################################################################### # Helper functions for the tests def time_func(func, *args): """ Time function func on *args. """ times = list() for _ in range(3): t1 = time.time() func(*args) times.append(time.time() - t1) return min(times) def relative_time(func1, func2, *args): """ Return the relative time between func1 and func2 applied on *args. """ time_func1 = time_func(func1, *args) time_func2 = time_func(func2, *args) relative_diff = 0.5 * (abs(time_func1 - time_func2) / (time_func1 + time_func2)) return relative_diff class Klass(object): def f(self, x): return x class KlassWithCachedMethod(object): def __init__(self, cachedir): mem = Memory(cachedir=cachedir) self.f = mem.cache(self.f) def f(self, x): return x ############################################################################### # Tests input_list = [1, 2, 1., 2., 1 + 1j, 2. + 1j, 'a', 'b', (1,), (1, 1,), [1, ], [1, 1, ], {1: 1}, {1: 2}, {2: 1}, None, gc.collect, [1, ].append, # Next 2 sets have unorderable elements in python 3. set(('a', 1)), set(('a', 1, ('a', 1))), # Next 2 dicts have unorderable type of keys in python 3. {'a': 1, 1: 2}, {'a': 1, 1: 2, 'd': {'a': 1}}] @parametrize('obj1', input_list) @parametrize('obj2', input_list) def test_trivial_hash(obj1, obj2): """Smoke test hash on various types.""" # Check that 2 objects have the same hash only if they are the same. are_hashes_equal = hash(obj1) == hash(obj2) are_objs_identical = obj1 is obj2 assert are_hashes_equal == are_objs_identical def test_hash_methods(): # Check that hashing instance methods works a = io.StringIO(unicode('a')) assert hash(a.flush) == hash(a.flush) a1 = collections.deque(range(10)) a2 = collections.deque(range(9)) assert hash(a1.extend) != hash(a2.extend) @fixture(scope='function') @with_numpy def three_np_arrays(): rnd = np.random.RandomState(0) arr1 = rnd.random_sample((10, 10)) arr2 = arr1.copy() arr3 = arr2.copy() arr3[0] += 1 return arr1, arr2, arr3 def test_hash_numpy_arrays(three_np_arrays): arr1, arr2, arr3 = three_np_arrays for obj1, obj2 in itertools.product(three_np_arrays, repeat=2): are_hashes_equal = hash(obj1) == hash(obj2) are_arrays_equal = np.all(obj1 == obj2) assert are_hashes_equal == are_arrays_equal assert hash(arr1) != hash(arr1.T) def test_hash_numpy_dict_of_arrays(three_np_arrays): arr1, arr2, arr3 = three_np_arrays d1 = {1: arr1, 2: arr2} d2 = {1: arr2, 2: arr1} d3 = {1: arr2, 2: arr3} assert hash(d1) == hash(d2) assert hash(d1) != hash(d3) @with_numpy @parametrize('dtype', ['datetime64[s]', 'timedelta64[D]']) def test_numpy_datetime_array(dtype): # memoryview is not supported for some dtypes e.g. datetime64 # see https://github.com/joblib/joblib/issues/188 for more details a_hash = hash(np.arange(10)) array = np.arange(0, 10, dtype=dtype) assert hash(array) != a_hash @with_numpy def test_hash_numpy_noncontiguous(): a = np.asarray(np.arange(6000).reshape((1000, 2, 3)), order='F')[:, :1, :] b = np.ascontiguousarray(a) assert hash(a) != hash(b) c = np.asfortranarray(a) assert hash(a) != hash(c) @with_numpy @parametrize('coerce_mmap', [True, False]) def test_hash_memmap(tmpdir, coerce_mmap): """Check that memmap and arrays hash identically if coerce_mmap is True.""" filename = tmpdir.join('memmap_temp').strpath try: m = np.memmap(filename, shape=(10, 10), mode='w+') a = np.asarray(m) are_hashes_equal = (hash(a, coerce_mmap=coerce_mmap) == hash(m, coerce_mmap=coerce_mmap)) assert are_hashes_equal == coerce_mmap finally: if 'm' in locals(): del m # Force a garbage-collection cycle, to be certain that the # object is delete, and we don't run in a problem under # Windows with a file handle still open. gc.collect() @with_numpy @skipif(sys.platform == 'win32', reason='This test is not stable under windows' ' for some reason') def test_hash_numpy_performance(): """ Check the performance of hashing numpy arrays: In [22]: a = np.random.random(1000000) In [23]: %timeit hashlib.md5(a).hexdigest() 100 loops, best of 3: 20.7 ms per loop In [24]: %timeit hashlib.md5(pickle.dumps(a, protocol=2)).hexdigest() 1 loops, best of 3: 73.1 ms per loop In [25]: %timeit hashlib.md5(cPickle.dumps(a, protocol=2)).hexdigest() 10 loops, best of 3: 53.9 ms per loop In [26]: %timeit hash(a) 100 loops, best of 3: 20.8 ms per loop """ rnd = np.random.RandomState(0) a = rnd.random_sample(1000000) if hasattr(np, 'getbuffer'): # Under python 3, there is no getbuffer getbuffer = np.getbuffer else: getbuffer = memoryview md5_hash = lambda x: hashlib.md5(getbuffer(x)).hexdigest() relative_diff = relative_time(md5_hash, hash, a) assert relative_diff < 0.3 # Check that hashing an tuple of 3 arrays takes approximately # 3 times as much as hashing one array time_hashlib = 3 * time_func(md5_hash, a) time_hash = time_func(hash, (a, a, a)) relative_diff = 0.5 * (abs(time_hash - time_hashlib) / (time_hash + time_hashlib)) assert relative_diff < 0.3 def test_bound_methods_hash(): """ Make sure that calling the same method on two different instances of the same class does resolve to the same hashes. """ a = Klass() b = Klass() assert (hash(filter_args(a.f, [], (1, ))) == hash(filter_args(b.f, [], (1, )))) def test_bound_cached_methods_hash(tmpdir): """ Make sure that calling the same _cached_ method on two different instances of the same class does resolve to the same hashes. """ a = KlassWithCachedMethod(tmpdir.strpath) b = KlassWithCachedMethod(tmpdir.strpath) assert (hash(filter_args(a.f.func, [], (1, ))) == hash(filter_args(b.f.func, [], (1, )))) @with_numpy def test_hash_object_dtype(): """ Make sure that ndarrays with dtype `object' hash correctly.""" a = np.array([np.arange(i) for i in range(6)], dtype=object) b = np.array([np.arange(i) for i in range(6)], dtype=object) assert hash(a) == hash(b) @with_numpy def test_numpy_scalar(): # Numpy scalars are built from compiled functions, and lead to # strange pickling paths explored, that can give hash collisions a = np.float64(2.0) b = np.float64(3.0) assert hash(a) != hash(b) def test_dict_hash(tmpdir): # Check that dictionaries hash consistently, eventhough the ordering # of the keys is not garanteed k = KlassWithCachedMethod(tmpdir.strpath) d = {'#s12069__c_maps.nii.gz': [33], '#s12158__c_maps.nii.gz': [33], '#s12258__c_maps.nii.gz': [33], '#s12277__c_maps.nii.gz': [33], '#s12300__c_maps.nii.gz': [33], '#s12401__c_maps.nii.gz': [33], '#s12430__c_maps.nii.gz': [33], '#s13817__c_maps.nii.gz': [33], '#s13903__c_maps.nii.gz': [33], '#s13916__c_maps.nii.gz': [33], '#s13981__c_maps.nii.gz': [33], '#s13982__c_maps.nii.gz': [33], '#s13983__c_maps.nii.gz': [33]} a = k.f(d) b = k.f(a) assert hash(a) == hash(b) def test_set_hash(tmpdir): # Check that sets hash consistently, even though their ordering # is not guaranteed k = KlassWithCachedMethod(tmpdir.strpath) s = set(['#s12069__c_maps.nii.gz', '#s12158__c_maps.nii.gz', '#s12258__c_maps.nii.gz', '#s12277__c_maps.nii.gz', '#s12300__c_maps.nii.gz', '#s12401__c_maps.nii.gz', '#s12430__c_maps.nii.gz', '#s13817__c_maps.nii.gz', '#s13903__c_maps.nii.gz', '#s13916__c_maps.nii.gz', '#s13981__c_maps.nii.gz', '#s13982__c_maps.nii.gz', '#s13983__c_maps.nii.gz']) a = k.f(s) b = k.f(a) assert hash(a) == hash(b) def test_set_decimal_hash(): # Check that sets containing decimals hash consistently, even though # ordering is not guaranteed assert (hash(set([Decimal(0), Decimal('NaN')])) == hash(set([Decimal('NaN'), Decimal(0)]))) def test_string(): # Test that we obtain the same hash for object owning several strings, # whatever the past of these strings (which are immutable in Python) string = 'foo' a = {string: 'bar'} b = {string: 'bar'} c = pickle.loads(pickle.dumps(b)) assert hash([a, b]) == hash([a, c]) @with_numpy def test_dtype(): # Test that we obtain the same hash for object owning several dtype, # whatever the past of these dtypes. Catter for cache invalidation with # complex dtype a = np.dtype([('f1', np.uint), ('f2', np.int32)]) b = a c = pickle.loads(pickle.dumps(a)) assert hash([a, c]) == hash([a, b]) @parametrize('to_hash,expected', [('This is a string to hash', {'py2': '80436ada343b0d79a99bfd8883a96e45', 'py3': '71b3f47df22cb19431d85d92d0b230b2'}), (u"C'est l\xe9t\xe9", {'py2': '2ff3a25200eb6219f468de2640913c2d', 'py3': '2d8d189e9b2b0b2e384d93c868c0e576'}), ((123456, 54321, -98765), {'py2': '50d81c80af05061ac4dcdc2d5edee6d6', 'py3': 'e205227dd82250871fa25aa0ec690aa3'}), ([random.Random(42).random() for _ in range(5)], {'py2': '1a36a691b2e2ba3a9df72de3dccf17ea', 'py3': 'a11ffad81f9682a7d901e6edc3d16c84'}), ([3, 'abc', None, TransportableException('foo', ValueError)], {'py2': 'adb6ba84990ee5e462dc138383f11802', 'py3': '994f663c64ba5e64b2a85ebe75287829'}), ({'abcde': 123, 'sadfas': [-9999, 2, 3]}, {'py2': 'fc9314a39ff75b829498380850447047', 'py3': 'aeda150553d4bb5c69f0e69d51b0e2ef'})]) def test_hashes_stay_the_same(to_hash, expected): # We want to make sure that hashes don't change with joblib # version. For end users, that would mean that they have to # regenerate their cache from scratch, which potentially means # lengthy recomputations. # Expected results have been generated with joblib 0.9.2 py_version_str = 'py3' if PY3_OR_LATER else 'py2' assert hash(to_hash) == expected[py_version_str] @with_numpy def test_hashes_are_different_between_c_and_fortran_contiguous_arrays(): # We want to be sure that the c-contiguous and f-contiguous versions of the # same array produce 2 different hashes. rng = np.random.RandomState(0) arr_c = rng.random_sample((10, 10)) arr_f = np.asfortranarray(arr_c) assert hash(arr_c) != hash(arr_f) @with_numpy def test_0d_array(): hash(np.array(0)) @with_numpy def test_0d_and_1d_array_hashing_is_different(): assert hash(np.array(0)) != hash(np.array([0])) @with_numpy def test_hashes_stay_the_same_with_numpy_objects(): # We want to make sure that hashes don't change with joblib # version. For end users, that would mean that they have to # regenerate their cache from scratch, which potentially means # lengthy recomputations. rng = np.random.RandomState(42) # Being explicit about dtypes in order to avoid # architecture-related differences. Also using 'f4' rather than # 'f8' for float arrays because 'f8' arrays generated by # rng.random.randn don't seem to be bit-identical on 32bit and # 64bit machines. to_hash_list = [ rng.randint(-1000, high=1000, size=50).astype(' # Copyright (c) 2009 Gael Varoquaux # License: BSD Style, 3 clauses. import re from joblib.logger import PrintTime try: # Python 2/Python 3 compat unicode('str') except NameError: unicode = lambda s: s def test_print_time(tmpdir, capsys): # A simple smoke test for PrintTime. logfile = tmpdir.join('test.log').strpath print_time = PrintTime(logfile=logfile) print_time(unicode('Foo')) # Create a second time, to smoke test log rotation. print_time = PrintTime(logfile=logfile) print_time(unicode('Foo')) # And a third time print_time = PrintTime(logfile=logfile) print_time(unicode('Foo')) out_printed_text, err_printed_text = capsys.readouterr() # Use regexps to be robust to time variations match = r"Foo: 0\..s, 0\..min\nFoo: 0\..s, 0..min\nFoo: " + \ r".\..s, 0..min\n" if not re.match(match, err_printed_text): raise AssertionError('Excepted %s, got %s' % (match, err_printed_text)) joblib-0.11/joblib/test/test_memory.py000066400000000000000000000650501305577265600200630ustar00rootroot00000000000000""" Test the memory module. """ # Author: Gael Varoquaux # Copyright (c) 2009 Gael Varoquaux # License: BSD Style, 3 clauses. import shutil import os import os.path import sys import time import datetime import pickle try: # Python 2.7: use the C pickle to speed up # test_concurrency_safe_write which pickles big python objects import cPickle as cpickle except ImportError: import pickle as cpickle import functools from joblib.memory import Memory, MemorizedFunc, NotMemorizedFunc from joblib.memory import MemorizedResult, NotMemorizedResult, _FUNCTION_HASHES from joblib.memory import _get_cache_items, _get_cache_items_to_delete from joblib.memory import _load_output, _get_func_fullname from joblib.memory import JobLibCollisionWarning from joblib.memory import concurrency_safe_write from joblib.parallel import Parallel, delayed from joblib.test.common import with_numpy, np from joblib.test.common import with_multiprocessing from joblib.testing import parametrize, raises, warns from joblib._compat import PY3_OR_LATER ############################################################################### # Module-level variables for the tests def f(x, y=1): """ A module-level function for testing purposes. """ return x ** 2 + y ############################################################################### # Helper function for the tests def check_identity_lazy(func, accumulator, cachedir): """ Given a function and an accumulator (a list that grows every time the function is called), check that the function can be decorated by memory to be a lazy identity. """ # Call each function with several arguments, and check that it is # evaluated only once per argument. memory = Memory(cachedir=cachedir, verbose=0) func = memory.cache(func) for i in range(3): for _ in range(2): assert func(i) == i assert len(accumulator) == i + 1 ############################################################################### # Tests def test_memory_integration(tmpdir): """ Simple test of memory lazy evaluation. """ accumulator = list() # Rmk: this function has the same name than a module-level function, # thus it serves as a test to see that both are identified # as different. def f(l): accumulator.append(1) return l check_identity_lazy(f, accumulator, tmpdir.strpath) # Now test clearing for compress in (False, True): for mmap_mode in ('r', None): memory = Memory(cachedir=tmpdir.strpath, verbose=10, mmap_mode=mmap_mode, compress=compress) # First clear the cache directory, to check that our code can # handle that # NOTE: this line would raise an exception, as the database file is # still open; we ignore the error since we want to test what # happens if the directory disappears shutil.rmtree(tmpdir.strpath, ignore_errors=True) g = memory.cache(f) g(1) g.clear(warn=False) current_accumulator = len(accumulator) out = g(1) assert len(accumulator) == current_accumulator + 1 # Also, check that Memory.eval works similarly assert memory.eval(f, 1) == out assert len(accumulator) == current_accumulator + 1 # Now do a smoke test with a function defined in __main__, as the name # mangling rules are more complex f.__module__ = '__main__' memory = Memory(cachedir=tmpdir.strpath, verbose=0) memory.cache(f)(1) def test_no_memory(): """ Test memory with cachedir=None: no memoize """ accumulator = list() def ff(l): accumulator.append(1) return l memory = Memory(cachedir=None, verbose=0) gg = memory.cache(ff) for _ in range(4): current_accumulator = len(accumulator) gg(1) assert len(accumulator) == current_accumulator + 1 def test_memory_kwarg(tmpdir): " Test memory with a function with keyword arguments." accumulator = list() def g(l=None, m=1): accumulator.append(1) return l check_identity_lazy(g, accumulator, tmpdir.strpath) memory = Memory(cachedir=tmpdir.strpath, verbose=0) g = memory.cache(g) # Smoke test with an explicit keyword argument: assert g(l=30, m=2) == 30 def test_memory_lambda(tmpdir): " Test memory with a function with a lambda." accumulator = list() def helper(x): """ A helper function to define l as a lambda. """ accumulator.append(1) return x l = lambda x: helper(x) check_identity_lazy(l, accumulator, tmpdir.strpath) def test_memory_name_collision(tmpdir): " Check that name collisions with functions will raise warnings" memory = Memory(cachedir=tmpdir.strpath, verbose=0) @memory.cache def name_collision(x): """ A first function called name_collision """ return x a = name_collision @memory.cache def name_collision(x): """ A second function called name_collision """ return x b = name_collision with warns(JobLibCollisionWarning) as warninfo: a(1) b(1) assert len(warninfo) == 1 assert "collision" in str(warninfo[0].message) def test_memory_warning_lambda_collisions(tmpdir): # Check that multiple use of lambda will raise collisions memory = Memory(cachedir=tmpdir.strpath, verbose=0) a = lambda x: x a = memory.cache(a) b = lambda x: x + 1 b = memory.cache(b) with warns(JobLibCollisionWarning) as warninfo: assert a(0) == 0 assert b(1) == 2 assert a(1) == 1 # In recent Python versions, we can retrieve the code of lambdas, # thus nothing is raised assert len(warninfo) == 4 def test_memory_warning_collision_detection(tmpdir): # Check that collisions impossible to detect will raise appropriate # warnings. memory = Memory(cachedir=tmpdir.strpath, verbose=0) a1 = eval('lambda x: x') a1 = memory.cache(a1) b1 = eval('lambda x: x+1') b1 = memory.cache(b1) with warns(JobLibCollisionWarning) as warninfo: a1(1) b1(1) a1(0) assert len(warninfo) == 2 assert "cannot detect" in str(warninfo[0].message).lower() def test_memory_partial(tmpdir): " Test memory with functools.partial." accumulator = list() def func(x, y): """ A helper function to define l as a lambda. """ accumulator.append(1) return y import functools function = functools.partial(func, 1) check_identity_lazy(function, accumulator, tmpdir.strpath) def test_memory_eval(tmpdir): " Smoke test memory with a function with a function defined in an eval." memory = Memory(cachedir=tmpdir.strpath, verbose=0) m = eval('lambda x: x') mm = memory.cache(m) assert mm(1) == 1 def count_and_append(x=[]): """ A function with a side effect in its arguments. Return the lenght of its argument and append one element. """ len_x = len(x) x.append(None) return len_x def test_argument_change(tmpdir): """ Check that if a function has a side effect in its arguments, it should use the hash of changing arguments. """ memory = Memory(cachedir=tmpdir.strpath, verbose=0) func = memory.cache(count_and_append) # call the function for the first time, is should cache it with # argument x=[] assert func() == 0 # the second time the argument is x=[None], which is not cached # yet, so the functions should be called a second time assert func() == 1 @with_numpy @parametrize('mmap_mode', [None, 'r']) def test_memory_numpy(tmpdir, mmap_mode): " Test memory with a function with numpy arrays." accumulator = list() def n(l=None): accumulator.append(1) return l memory = Memory(cachedir=tmpdir.strpath, mmap_mode=mmap_mode, verbose=0) cached_n = memory.cache(n) rnd = np.random.RandomState(0) for i in range(3): a = rnd.random_sample((10, 10)) for _ in range(3): assert np.all(cached_n(a) == a) assert len(accumulator) == i + 1 @with_numpy def test_memory_numpy_check_mmap_mode(tmpdir): """Check that mmap_mode is respected even at the first call""" memory = Memory(cachedir=tmpdir.strpath, mmap_mode='r', verbose=0) @memory.cache() def twice(a): return a * 2 a = np.ones(3) b = twice(a) c = twice(a) assert isinstance(c, np.memmap) assert c.mode == 'r' assert isinstance(b, np.memmap) assert b.mode == 'r' def test_memory_exception(tmpdir): """ Smoketest the exception handling of Memory. """ memory = Memory(cachedir=tmpdir.strpath, verbose=0) class MyException(Exception): pass @memory.cache def h(exc=0): if exc: raise MyException # Call once, to initialise the cache h() for _ in range(3): # Call 3 times, to be sure that the Exception is always raised with raises(MyException): h(1) def test_memory_ignore(tmpdir): " Test the ignore feature of memory " memory = Memory(cachedir=tmpdir.strpath, verbose=0) accumulator = list() @memory.cache(ignore=['y']) def z(x, y=1): accumulator.append(1) assert z.ignore == ['y'] z(0, y=1) assert len(accumulator) == 1 z(0, y=1) assert len(accumulator) == 1 z(0, y=2) assert len(accumulator) == 1 @parametrize('ignore, verbose, mmap_mode', [(['x'], 100, 'r'), ([], 10, None)]) def test_partial_decoration(tmpdir, ignore, verbose, mmap_mode): "Check cache may be called with kwargs before decorating" memory = Memory(cachedir=tmpdir.strpath, verbose=0) @memory.cache(ignore=ignore, verbose=verbose, mmap_mode=mmap_mode) def z(x): pass assert z.ignore == ignore assert z._verbose == verbose assert z.mmap_mode == mmap_mode def test_func_dir(tmpdir): # Test the creation of the memory cache directory for the function. memory = Memory(cachedir=tmpdir.strpath, verbose=0) path = __name__.split('.') path.append('f') path = tmpdir.join('joblib', *path).strpath g = memory.cache(f) # Test that the function directory is created on demand assert g._get_func_dir() == path assert os.path.exists(path) # Test that the code is stored. # For the following test to be robust to previous execution, we clear # the in-memory store _FUNCTION_HASHES.clear() assert not g._check_previous_func_code() assert os.path.exists(os.path.join(path, 'func_code.py')) assert g._check_previous_func_code() # Test the robustness to failure of loading previous results. dir, _ = g.get_output_dir(1) a = g(1) assert os.path.exists(dir) os.remove(os.path.join(dir, 'output.pkl')) assert a == g(1) def test_persistence(tmpdir): # Test the memorized functions can be pickled and restored. memory = Memory(cachedir=tmpdir.strpath, verbose=0) g = memory.cache(f) output = g(1) h = pickle.loads(pickle.dumps(g)) output_dir, _ = h.get_output_dir(1) func_name = _get_func_fullname(f) assert output == _load_output(output_dir, func_name) memory2 = pickle.loads(pickle.dumps(memory)) assert memory.cachedir == memory2.cachedir # Smoke test that pickling a memory with cachedir=None works memory = Memory(cachedir=None, verbose=0) pickle.loads(pickle.dumps(memory)) g = memory.cache(f) gp = pickle.loads(pickle.dumps(g)) gp(1) def test_call_and_shelve(tmpdir): """Test MemorizedFunc outputting a reference to cache. """ for func, Result in zip((MemorizedFunc(f, tmpdir.strpath), NotMemorizedFunc(f), Memory(cachedir=tmpdir.strpath, verbose=0).cache(f), Memory(cachedir=None).cache(f), ), (MemorizedResult, NotMemorizedResult, MemorizedResult, NotMemorizedResult)): assert func(2) == 5 result = func.call_and_shelve(2) assert isinstance(result, Result) assert result.get() == 5 result.clear() with raises(KeyError): result.get() result.clear() # Do nothing if there is no cache. def test_memorized_pickling(tmpdir): for func in (MemorizedFunc(f, tmpdir.strpath), NotMemorizedFunc(f)): filename = tmpdir.join('pickling_test.dat').strpath result = func.call_and_shelve(2) with open(filename, 'wb') as fp: pickle.dump(result, fp) with open(filename, 'rb') as fp: result2 = pickle.load(fp) assert result2.get() == result.get() os.remove(filename) def test_memorized_repr(tmpdir): func = MemorizedFunc(f, tmpdir.strpath) result = func.call_and_shelve(2) func2 = MemorizedFunc(f, tmpdir.strpath) result2 = func2.call_and_shelve(2) assert result.get() == result2.get() assert repr(func) == repr(func2) # Smoke test with NotMemorizedFunc func = NotMemorizedFunc(f) repr(func) repr(func.call_and_shelve(2)) # Smoke test for message output (increase code coverage) func = MemorizedFunc(f, tmpdir.strpath, verbose=11, timestamp=time.time()) result = func.call_and_shelve(11) result.get() func = MemorizedFunc(f, tmpdir.strpath, verbose=11) result = func.call_and_shelve(11) result.get() func = MemorizedFunc(f, tmpdir.strpath, verbose=5, timestamp=time.time()) result = func.call_and_shelve(11) result.get() func = MemorizedFunc(f, tmpdir.strpath, verbose=5) result = func.call_and_shelve(11) result.get() def test_memory_file_modification(capsys, tmpdir, monkeypatch): # Test that modifying a Python file after loading it does not lead to # Recomputation dir_name = tmpdir.mkdir('tmp_import').strpath filename = os.path.join(dir_name, 'tmp_joblib_.py') content = 'def f(x):\n print(x)\n return x\n' with open(filename, 'w') as module_file: module_file.write(content) # Load the module: monkeypatch.syspath_prepend(dir_name) import tmp_joblib_ as tmp memory = Memory(cachedir=tmpdir.strpath, verbose=0) f = memory.cache(tmp.f) # First call f a few times f(1) f(2) f(1) # Now modify the module where f is stored without modifying f with open(filename, 'w') as module_file: module_file.write('\n\n' + content) # And call f a couple more times f(1) f(1) # Flush the .pyc files shutil.rmtree(dir_name) os.mkdir(dir_name) # Now modify the module where f is stored, modifying f content = 'def f(x):\n print("x=%s" % x)\n return x\n' with open(filename, 'w') as module_file: module_file.write(content) # And call f more times prior to reloading: the cache should not be # invalidated at this point as the active function definition has not # changed in memory yet. f(1) f(1) # Now reload sys.stdout.write('Reloading\n') sys.modules.pop('tmp_joblib_') import tmp_joblib_ as tmp f = memory.cache(tmp.f) # And call f more times f(1) f(1) out, err = capsys.readouterr() assert out == '1\n2\nReloading\nx=1\n' def _function_to_cache(a, b): # Just a place holder function to be mutated by tests pass def _sum(a, b): return a + b def _product(a, b): return a * b def test_memory_in_memory_function_code_change(tmpdir): _function_to_cache.__code__ = _sum.__code__ memory = Memory(cachedir=tmpdir.strpath, verbose=0) f = memory.cache(_function_to_cache) assert f(1, 2) == 3 assert f(1, 2) == 3 with warns(JobLibCollisionWarning): # Check that inline function modification triggers a cache invalidation _function_to_cache.__code__ = _product.__code__ assert f(1, 2) == 2 assert f(1, 2) == 2 def test_clear_memory_with_none_cachedir(): memory = Memory(cachedir=None) memory.clear() if PY3_OR_LATER: exec(""" def func_with_kwonly_args(a, b, *, kw1='kw1', kw2='kw2'): return a, b, kw1, kw2 def func_with_signature(a: int, b: float) -> float: return a + b """) def test_memory_func_with_kwonly_args(tmpdir): memory = Memory(cachedir=tmpdir.strpath, verbose=0) func_cached = memory.cache(func_with_kwonly_args) assert func_cached(1, 2, kw1=3) == (1, 2, 3, 'kw2') # Making sure that providing a keyword-only argument by # position raises an exception with raises(ValueError) as excinfo: func_cached(1, 2, 3, kw2=4) excinfo.match("Keyword-only parameter 'kw1' was passed as positional " "parameter") # Keyword-only parameter passed by position with cached call # should still raise ValueError func_cached(1, 2, kw1=3, kw2=4) with raises(ValueError) as excinfo: func_cached(1, 2, 3, kw2=4) excinfo.match("Keyword-only parameter 'kw1' was passed as positional " "parameter") # Test 'ignore' parameter func_cached = memory.cache(func_with_kwonly_args, ignore=['kw2']) assert func_cached(1, 2, kw1=3, kw2=4) == (1, 2, 3, 4) assert func_cached(1, 2, kw1=3, kw2='ignored') == (1, 2, 3, 4) def test_memory_func_with_signature(tmpdir): memory = Memory(cachedir=tmpdir.strpath, verbose=0) func_cached = memory.cache(func_with_signature) assert func_cached(1, 2.) == 3. def _setup_toy_cache(tmpdir, num_inputs=10): memory = Memory(cachedir=tmpdir.strpath, verbose=0) @memory.cache() def get_1000_bytes(arg): return 'a' * 1000 inputs = list(range(num_inputs)) for arg in inputs: get_1000_bytes(arg) hash_dirnames = [get_1000_bytes._get_output_dir(arg)[0] for arg in inputs] full_hashdirs = [os.path.join(get_1000_bytes.cachedir, dirname) for dirname in hash_dirnames] return memory, full_hashdirs, get_1000_bytes def test__get_cache_items(tmpdir): memory, expected_hash_cachedirs, _ = _setup_toy_cache(tmpdir) cachedir = memory.cachedir cache_items = _get_cache_items(cachedir) hash_cachedirs = [ci.path for ci in cache_items] assert set(hash_cachedirs) == set(expected_hash_cachedirs) def get_files_size(directory): full_paths = [os.path.join(directory, fn) for fn in os.listdir(directory)] return sum(os.path.getsize(fp) for fp in full_paths) expected_hash_cache_sizes = [get_files_size(hash_dir) for hash_dir in hash_cachedirs] hash_cache_sizes = [ci.size for ci in cache_items] assert hash_cache_sizes == expected_hash_cache_sizes output_filenames = [os.path.join(hash_dir, 'output.pkl') for hash_dir in hash_cachedirs] expected_last_accesses = [ datetime.datetime.fromtimestamp(os.path.getatime(fn)) for fn in output_filenames] last_accesses = [ci.last_access for ci in cache_items] assert last_accesses == expected_last_accesses def test__get_cache_items_to_delete(tmpdir): memory, expected_hash_cachedirs, _ = _setup_toy_cache(tmpdir) cachedir = memory.cachedir cache_items = _get_cache_items(cachedir) # bytes_limit set to keep only one cache item (each hash cache # folder is about 1000 bytes + metadata) cache_items_to_delete = _get_cache_items_to_delete(cachedir, '2K') nb_hashes = len(expected_hash_cachedirs) assert set.issubset(set(cache_items_to_delete), set(cache_items)) assert len(cache_items_to_delete) == nb_hashes - 1 # Sanity check bytes_limit=2048 is the same as bytes_limit='2K' cache_items_to_delete_2048b = _get_cache_items_to_delete(cachedir, 2048) assert sorted(cache_items_to_delete) == sorted(cache_items_to_delete_2048b) # bytes_limit greater than the size of the cache cache_items_to_delete_empty = _get_cache_items_to_delete(cachedir, '1M') assert cache_items_to_delete_empty == [] # All the cache items need to be deleted bytes_limit_too_small = 500 cache_items_to_delete_500b = _get_cache_items_to_delete( cachedir, bytes_limit_too_small) assert set(cache_items_to_delete_500b), set(cache_items) # Test LRU property: surviving cache items should all have a more # recent last_access that the ones that have been deleted cache_items_to_delete_6000b = _get_cache_items_to_delete(cachedir, 6000) surviving_cache_items = set(cache_items).difference( cache_items_to_delete_6000b) assert (max(ci.last_access for ci in cache_items_to_delete_6000b) <= min(ci.last_access for ci in surviving_cache_items)) def test_memory_reduce_size(tmpdir): memory, _, _ = _setup_toy_cache(tmpdir) cachedir = memory.cachedir ref_cache_items = _get_cache_items(cachedir) # By default memory.bytes_limit is None and reduce_size is a noop memory.reduce_size() cache_items = _get_cache_items(cachedir) assert sorted(ref_cache_items) == sorted(cache_items) # No cache items deleted if bytes_limit greater than the size of # the cache memory.bytes_limit = '1M' memory.reduce_size() cache_items = _get_cache_items(cachedir) assert sorted(ref_cache_items) == sorted(cache_items) # bytes_limit is set so that only two cache items are kept memory.bytes_limit = '3K' memory.reduce_size() cache_items = _get_cache_items(cachedir) assert set.issubset(set(cache_items), set(ref_cache_items)) assert len(cache_items) == 2 # bytes_limit set so that no cache item is kept bytes_limit_too_small = 500 memory.bytes_limit = bytes_limit_too_small memory.reduce_size() cache_items = _get_cache_items(cachedir) assert cache_items == [] def test_memory_clear(tmpdir): memory, _, _ = _setup_toy_cache(tmpdir) memory.clear() assert os.listdir(memory.cachedir) == [] def fast_func_with_complex_output(): complex_obj = ['a' * 1000] * 1000 return complex_obj def fast_func_with_conditional_complex_output(complex_output=True): complex_obj = {str(i): i for i in range(int(1e5))} return complex_obj if complex_output else 'simple output' @with_multiprocessing def test_cached_function_race_condition_when_persisting_output(tmpdir, capfd): # Test race condition where multiple processes are writing into # the same output.pkl. See # https://github.com/joblib/joblib/issues/490 for more details. memory = Memory(cachedir=tmpdir.strpath) func_cached = memory.cache(fast_func_with_complex_output) Parallel(n_jobs=2)(delayed(func_cached)() for i in range(3)) stdout, stderr = capfd.readouterr() # Checking both stdout and stderr (ongoing PR #434 may change # logging destination) to make sure there is no exception while # loading the results exception_msg = 'Exception while loading results' assert exception_msg not in stdout assert exception_msg not in stderr @with_multiprocessing def test_cached_function_race_condition_when_persisting_output_2(tmpdir, capfd): # Test race condition in first attempt at solving # https://github.com/joblib/joblib/issues/490. The race condition # was due to the delay between seeing the cache directory created # (interpreted as the result being cached) and the output.pkl being # pickled. memory = Memory(cachedir=tmpdir.strpath) func_cached = memory.cache(fast_func_with_conditional_complex_output) Parallel(n_jobs=2)(delayed(func_cached)(True if i % 2 == 0 else False) for i in range(3)) stdout, stderr = capfd.readouterr() # Checking both stdout and stderr (ongoing PR #434 may change # logging destination) to make sure there is no exception while # loading the results exception_msg = 'Exception while loading results' assert exception_msg not in stdout assert exception_msg not in stderr def write_func(output, filename): with open(filename, 'wb') as f: cpickle.dump(output, f) def load_func(expected, filename): for i in range(10): try: with open(filename, 'rb') as f: reloaded = cpickle.load(f) break except OSError: # On Windows you can have WindowsError ([Error 5] Access # is denied) when reading the file, probably because a # writer process has a lock on the file time.sleep(0.1) else: raise assert expected == reloaded @with_multiprocessing @parametrize('backend', ['multiprocessing', 'threading']) def test_concurrency_safe_write(tmpdir, backend): filename = tmpdir.join('test.pkl').strpath obj = {str(i): i for i in range(int(1e5))} funcs = [functools.partial(concurrency_safe_write, write_func=write_func) if i % 3 != 2 else load_func for i in range(12)] Parallel(n_jobs=2, backend=backend)( delayed(func)(obj, filename) for func in funcs) def test_memory_recomputes_after_an_error_why_loading_results(tmpdir, monkeypatch): memory = Memory(tmpdir.strpath) def func(arg): # This makes sure that the timestamp returned by two calls of # func are different. This is needed on Windows where # time.time resolution may not be accurate enough time.sleep(0.01) return arg, time.time() cached_func = memory.cache(func) input_arg = 'arg' arg, timestamp = cached_func(input_arg) # Make sure the function is correctly cached assert arg == input_arg # Corrupting output.pkl to make sure that an error happens when # loading the cached result single_cache_item, = _get_cache_items(memory.cachedir) output_filename = os.path.join(single_cache_item.path, 'output.pkl') with open(output_filename, 'w') as f: f.write('garbage') recorded_warnings = [] def append_to_record(item): recorded_warnings.append(item) # Make sure that corrupting the file causes recomputation and that # a warning is issued. Need monkeypatch because pytest does not # capture stdlib logging output (see # https://github.com/pytest-dev/pytest/issues/2079) monkeypatch.setattr(cached_func, 'warn', append_to_record) recomputed_arg, recomputed_timestamp = cached_func(arg) assert len(recorded_warnings) == 1 exception_msg = 'Exception while loading results' assert exception_msg in recorded_warnings[0] assert recomputed_arg == arg assert recomputed_timestamp > timestamp joblib-0.11/joblib/test/test_my_exceptions.py000066400000000000000000000045231305577265600214370ustar00rootroot00000000000000""" Test my automatically generate exceptions """ from joblib import my_exceptions class CustomException(Exception): def __init__(self, a, b, c, d): self.a, self.b, self.c, self.d = a, b, c, d class CustomException2(Exception): """A custom exception with a .args attribute Just to check that the JoblibException created from it has it args set correctly """ def __init__(self, a, *args): self.a = a self.args = args def test_inheritance(): assert isinstance(my_exceptions.JoblibNameError(), NameError) assert isinstance(my_exceptions.JoblibNameError(), my_exceptions.JoblibException) assert (my_exceptions.JoblibNameError is my_exceptions._mk_exception(NameError)[0]) def test_inheritance_special_cases(): # _mk_exception should transform Exception to JoblibException assert (my_exceptions._mk_exception(Exception)[0] is my_exceptions.JoblibException) # Subclasses of JoblibException should be mapped to # JoblibException by _mk_exception for exception in [my_exceptions.JoblibException, my_exceptions.TransportableException]: assert (my_exceptions._mk_exception(exception)[0] is my_exceptions.JoblibException) # Non-inheritable exception classes should be mapped to # JoblibException by _mk_exception. That can happen with classes # generated with SWIG. See # https://github.com/joblib/joblib/issues/269 for a concrete # example. non_inheritable_classes = [type(lambda: None), bool] for exception in non_inheritable_classes: assert (my_exceptions._mk_exception(exception)[0] is my_exceptions.JoblibException) def test__mk_exception(): # Check that _mk_exception works on a bunch of different exceptions for klass in (Exception, TypeError, SyntaxError, ValueError, ImportError, CustomException, CustomException2): message = 'This message should be in the exception repr' exc = my_exceptions._mk_exception(klass)[0]( message, 'some', 'other', 'args', 'that are not', 'in the repr') exc_repr = repr(exc) assert isinstance(exc, klass) assert isinstance(exc, my_exceptions.JoblibException) assert exc.__class__.__name__ in exc_repr assert message in exc_repr joblib-0.11/joblib/test/test_numpy_pickle.py000066400000000000000000001014001305577265600212400ustar00rootroot00000000000000"""Test the numpy pickler as a replacement of the standard pickler.""" import copy import os import random import sys import re import io import warnings import gzip import zlib import bz2 import pickle import socket from contextlib import closing import mmap from joblib.test.common import np, with_numpy from joblib.test.common import with_memory_profiler, memory_used from joblib.testing import parametrize, raises, SkipTest, warns # numpy_pickle is not a drop-in replacement of pickle, as it takes # filenames instead of open files as arguments. from joblib import numpy_pickle from joblib.test import data from joblib._compat import PY3_OR_LATER from joblib.numpy_pickle_utils import _IO_BUFFER_SIZE, BinaryZlibFile from joblib.numpy_pickle_utils import _detect_compressor, _COMPRESSORS ############################################################################### # Define a list of standard types. # Borrowed from dill, initial author: Micheal McKerns: # http://dev.danse.us/trac/pathos/browser/dill/dill_test2.py typelist = [] # testing types _none = None typelist.append(_none) _type = type typelist.append(_type) _bool = bool(1) typelist.append(_bool) _int = int(1) typelist.append(_int) try: _long = long(1) typelist.append(_long) except NameError: # long is not defined in python 3 pass _float = float(1) typelist.append(_float) _complex = complex(1) typelist.append(_complex) _string = str(1) typelist.append(_string) try: _unicode = unicode(1) typelist.append(_unicode) except NameError: # unicode is not defined in python 3 pass _tuple = () typelist.append(_tuple) _list = [] typelist.append(_list) _dict = {} typelist.append(_dict) try: _file = file typelist.append(_file) except NameError: pass # file does not exists in Python 3 try: _buffer = buffer typelist.append(_buffer) except NameError: # buffer does not exists in Python 3 pass _builtin = len typelist.append(_builtin) def _function(x): yield x class _class: def _method(self): pass class _newclass(object): def _method(self): pass typelist.append(_function) typelist.append(_class) typelist.append(_newclass) # _instance = _class() typelist.append(_instance) _object = _newclass() typelist.append(_object) # ############################################################################### # Tests @parametrize('compress', [0, 1]) @parametrize('member', typelist) def test_standard_types(tmpdir, compress, member): # Test pickling and saving with standard types. filename = tmpdir.join('test.pkl').strpath numpy_pickle.dump(member, filename, compress=compress) _member = numpy_pickle.load(filename) # We compare the pickled instance to the reloaded one only if it # can be compared to a copied one if member == copy.deepcopy(member): assert member == _member def test_value_error(): # Test inverting the input arguments to dump with raises(ValueError): numpy_pickle.dump('foo', dict()) @parametrize('wrong_compress', [-1, 10, 'wrong']) def test_compress_level_error(wrong_compress): # Verify that passing an invalid compress argument raises an error. exception_msg = ('Non valid compress level given: ' '"{0}"'.format(wrong_compress)) with raises(ValueError) as excinfo: numpy_pickle.dump('dummy', 'foo', compress=wrong_compress) excinfo.match(exception_msg) @with_numpy @parametrize('compress', [False, True, 0, 3]) def test_numpy_persistence(tmpdir, compress): filename = tmpdir.join('test.pkl').strpath rnd = np.random.RandomState(0) a = rnd.random_sample((10, 2)) # We use 'a.T' to have a non C-contiguous array. for index, obj in enumerate(((a,), (a.T,), (a, a), [a, a, a])): filenames = numpy_pickle.dump(obj, filename, compress=compress) # All is cached in one file assert len(filenames) == 1 # Check that only one file was created assert filenames[0] == filename # Check that this file does exist assert os.path.exists(filenames[0]) # Unpickle the object obj_ = numpy_pickle.load(filename) # Check that the items are indeed arrays for item in obj_: assert isinstance(item, np.ndarray) # And finally, check that all the values are equal. np.testing.assert_array_equal(np.array(obj), np.array(obj_)) # Now test with array subclasses for obj in (np.matrix(np.zeros(10)), np.memmap(filename + 'mmap', mode='w+', shape=4, dtype=np.float)): filenames = numpy_pickle.dump(obj, filename, compress=compress) # All is cached in one file assert len(filenames) == 1 obj_ = numpy_pickle.load(filename) if (type(obj) is not np.memmap and hasattr(obj, '__array_prepare__')): # We don't reconstruct memmaps assert isinstance(obj_, type(obj)) np.testing.assert_array_equal(obj_, obj) # Test with an object containing multiple numpy arrays obj = ComplexTestObject() filenames = numpy_pickle.dump(obj, filename, compress=compress) # All is cached in one file assert len(filenames) == 1 obj_loaded = numpy_pickle.load(filename) assert isinstance(obj_loaded, type(obj)) np.testing.assert_array_equal(obj_loaded.array_float, obj.array_float) np.testing.assert_array_equal(obj_loaded.array_int, obj.array_int) np.testing.assert_array_equal(obj_loaded.array_obj, obj.array_obj) @with_numpy def test_numpy_persistence_bufferred_array_compression(tmpdir): big_array = np.ones((_IO_BUFFER_SIZE + 100), dtype=np.uint8) filename = tmpdir.join('test.pkl').strpath numpy_pickle.dump(big_array, filename, compress=True) arr_reloaded = numpy_pickle.load(filename) np.testing.assert_array_equal(big_array, arr_reloaded) @with_numpy def test_memmap_persistence(tmpdir): rnd = np.random.RandomState(0) a = rnd.random_sample(10) filename = tmpdir.join('test1.pkl').strpath numpy_pickle.dump(a, filename) b = numpy_pickle.load(filename, mmap_mode='r') assert isinstance(b, np.memmap) # Test with an object containing multiple numpy arrays filename = tmpdir.join('test2.pkl').strpath obj = ComplexTestObject() numpy_pickle.dump(obj, filename) obj_loaded = numpy_pickle.load(filename, mmap_mode='r') assert isinstance(obj_loaded, type(obj)) assert isinstance(obj_loaded.array_float, np.memmap) assert not obj_loaded.array_float.flags.writeable assert isinstance(obj_loaded.array_int, np.memmap) assert not obj_loaded.array_int.flags.writeable # Memory map not allowed for numpy object arrays assert not isinstance(obj_loaded.array_obj, np.memmap) np.testing.assert_array_equal(obj_loaded.array_float, obj.array_float) np.testing.assert_array_equal(obj_loaded.array_int, obj.array_int) np.testing.assert_array_equal(obj_loaded.array_obj, obj.array_obj) # Test we can write in memmaped arrays obj_loaded = numpy_pickle.load(filename, mmap_mode='r+') assert obj_loaded.array_float.flags.writeable obj_loaded.array_float[0:10] = 10.0 assert obj_loaded.array_int.flags.writeable obj_loaded.array_int[0:10] = 10 obj_reloaded = numpy_pickle.load(filename, mmap_mode='r') np.testing.assert_array_equal(obj_reloaded.array_float, obj_loaded.array_float) np.testing.assert_array_equal(obj_reloaded.array_int, obj_loaded.array_int) # Test w+ mode is caught and the mode has switched to r+ numpy_pickle.load(filename, mmap_mode='w+') assert obj_loaded.array_int.flags.writeable assert obj_loaded.array_int.mode == 'r+' assert obj_loaded.array_float.flags.writeable assert obj_loaded.array_float.mode == 'r+' @with_numpy def test_memmap_persistence_mixed_dtypes(tmpdir): # loading datastructures that have sub-arrays with dtype=object # should not prevent memmaping on fixed size dtype sub-arrays. rnd = np.random.RandomState(0) a = rnd.random_sample(10) b = np.array([1, 'b'], dtype=object) construct = (a, b) filename = tmpdir.join('test.pkl').strpath numpy_pickle.dump(construct, filename) a_clone, b_clone = numpy_pickle.load(filename, mmap_mode='r') # the floating point array has been memory mapped assert isinstance(a_clone, np.memmap) # the object-dtype array has been loaded in memory assert not isinstance(b_clone, np.memmap) @with_numpy def test_masked_array_persistence(tmpdir): # The special-case picker fails, because saving masked_array # not implemented, but it just delegates to the standard pickler. rnd = np.random.RandomState(0) a = rnd.random_sample(10) a = np.ma.masked_greater(a, 0.5) filename = tmpdir.join('test.pkl').strpath numpy_pickle.dump(a, filename) b = numpy_pickle.load(filename, mmap_mode='r') assert isinstance(b, np.ma.masked_array) @with_numpy def test_compress_mmap_mode_warning(tmpdir): # Test the warning in case of compress + mmap_mode rnd = np.random.RandomState(0) a = rnd.random_sample(10) this_filename = tmpdir.join('test.pkl').strpath numpy_pickle.dump(a, this_filename, compress=1) with warns(UserWarning) as warninfo: numpy_pickle.load(this_filename, mmap_mode='r+') assert len(warninfo) == 1 assert (str(warninfo[0].message) == 'mmap_mode "%(mmap_mode)s" is not compatible with compressed ' 'file %(filename)s. "%(mmap_mode)s" flag will be ignored.' % {'filename': this_filename, 'mmap_mode': 'r+'}) @with_numpy @parametrize('cache_size', [None, 0, 10]) def test_cache_size_warning(tmpdir, cache_size): # Check deprecation warning raised when cache size is not None filename = tmpdir.join('test.pkl').strpath rnd = np.random.RandomState(0) a = rnd.random_sample((10, 2)) warnings.simplefilter("always") with warns(None) as warninfo: numpy_pickle.dump(a, filename, cache_size=cache_size) expected_nb_warnings = 1 if cache_size is not None else 0 assert len(warninfo) == expected_nb_warnings for w in warninfo: assert w.category == DeprecationWarning assert (str(w.message) == "Please do not set 'cache_size' in joblib.dump, this " "parameter has no effect and will be removed. You " "used 'cache_size={0}'".format(cache_size)) @with_numpy @with_memory_profiler @parametrize('compress', [True, False]) def test_memory_usage(tmpdir, compress): # Verify memory stays within expected bounds. filename = tmpdir.join('test.pkl').strpath small_array = np.ones((10, 10)) big_array = np.ones(shape=100 * int(1e6), dtype=np.uint8) small_matrix = np.matrix(small_array) big_matrix = np.matrix(big_array) for obj in (small_array, big_array, small_matrix, big_matrix): size = obj.nbytes / 1e6 obj_filename = filename + str(np.random.randint(0, 1000)) mem_used = memory_used(numpy_pickle.dump, obj, obj_filename, compress=compress) # The memory used to dump the object shouldn't exceed the buffer # size used to write array chunks (16MB). write_buf_size = _IO_BUFFER_SIZE + 16 * 1024 ** 2 / 1e6 assert mem_used <= write_buf_size mem_used = memory_used(numpy_pickle.load, obj_filename) # memory used should be less than array size + buffer size used to # read the array chunk by chunk. read_buf_size = 32 + _IO_BUFFER_SIZE # MiB assert mem_used < size + read_buf_size @with_numpy def test_compressed_pickle_dump_and_load(tmpdir): expected_list = [np.arange(5, dtype=np.dtype('i8')), np.arange(5, dtype=np.dtype('f8')), np.array([1, 'abc', {'a': 1, 'b': 2}], dtype='O'), # .tostring actually returns bytes and is a # compatibility alias for .tobytes which was # added in 1.9.0 np.arange(256, dtype=np.uint8).tostring(), # np.matrix is a subclass of np.ndarray, here we want # to verify this type of object is correctly unpickled # among versions. np.matrix([0, 1, 2], dtype=np.dtype('i8')), u"C'est l'\xe9t\xe9 !"] fname = tmpdir.join('temp.pkl.gz').strpath dumped_filenames = numpy_pickle.dump(expected_list, fname, compress=1) assert len(dumped_filenames) == 1 result_list = numpy_pickle.load(fname) for result, expected in zip(result_list, expected_list): if isinstance(expected, np.ndarray): assert result.dtype == expected.dtype np.testing.assert_equal(result, expected) else: assert result == expected def _check_pickle(filename, expected_list): """Helper function to test joblib pickle content. Note: currently only pickles containing an iterable are supported by this function. """ if (not PY3_OR_LATER and (filename.endswith('.xz') or filename.endswith('.lzma'))): # lzma is not supported for python versions < 3.3 with raises(NotImplementedError): numpy_pickle.load(filename) return version_match = re.match(r'.+py(\d)(\d).+', filename) py_version_used_for_writing = int(version_match.group(1)) py_version_used_for_reading = sys.version_info[0] py_version_to_default_pickle_protocol = {2: 2, 3: 3} pickle_reading_protocol = py_version_to_default_pickle_protocol.get( py_version_used_for_reading, 4) pickle_writing_protocol = py_version_to_default_pickle_protocol.get( py_version_used_for_writing, 4) if pickle_reading_protocol >= pickle_writing_protocol: try: with warns(None) as warninfo: warnings.simplefilter('always') warnings.filterwarnings( 'ignore', module='numpy', message='The compiler package is deprecated') result_list = numpy_pickle.load(filename) filename_base = os.path.basename(filename) expected_nb_warnings = 1 if ("_0.9" in filename_base or "_0.8.4" in filename_base) else 0 assert len(warninfo) == expected_nb_warnings for w in warninfo: assert w.category == DeprecationWarning assert (str(w.message) == "The file '{0}' has been generated with a joblib " "version less than 0.10. Please regenerate this " "pickle file.".format(filename)) for result, expected in zip(result_list, expected_list): if isinstance(expected, np.ndarray): assert result.dtype == expected.dtype np.testing.assert_equal(result, expected) else: assert result == expected except Exception as exc: # When trying to read with python 3 a pickle generated # with python 2 we expect a user-friendly error if (py_version_used_for_reading == 3 and py_version_used_for_writing == 2): assert isinstance(exc, ValueError) message = ('You may be trying to read with ' 'python 3 a joblib pickle generated with python 2.') assert message in str(exc) else: raise else: # Pickle protocol used for writing is too high. We expect a # "unsupported pickle protocol" error message try: numpy_pickle.load(filename) raise AssertionError('Numpy pickle loading should ' 'have raised a ValueError exception') except ValueError as e: message = 'unsupported pickle protocol: {0}'.format( pickle_writing_protocol) assert message in str(e.args) @with_numpy def test_joblib_pickle_across_python_versions(): # We need to be specific about dtypes in particular endianness # because the pickles can be generated on one architecture and # the tests run on another one. See # https://github.com/joblib/joblib/issues/279. expected_list = [np.arange(5, dtype=np.dtype('= 3.3 msg = "{} compression is only available".format(cmethod) with raises(NotImplementedError) as excinfo: numpy_pickle.dump(obj, dump_filename, compress=(cmethod, compress)) excinfo.match(msg) else: numpy_pickle.dump(obj, dump_filename, compress=(cmethod, compress)) # Verify the file contains the right magic number with open(dump_filename, 'rb') as f: assert _detect_compressor(f) == cmethod # Verify the reloaded object is correct obj_reloaded = numpy_pickle.load(dump_filename) assert isinstance(obj_reloaded, type(obj)) if isinstance(obj, np.ndarray): np.testing.assert_array_equal(obj_reloaded, obj) else: assert obj_reloaded == obj def _gzip_file_decompress(source_filename, target_filename): """Decompress a gzip file.""" with closing(gzip.GzipFile(source_filename, "rb")) as fo: buf = fo.read() with open(target_filename, "wb") as fo: fo.write(buf) def _zlib_file_decompress(source_filename, target_filename): """Decompress a zlib file.""" with open(source_filename, 'rb') as fo: buf = zlib.decompress(fo.read()) with open(target_filename, 'wb') as fo: fo.write(buf) @parametrize('extension,decompress', [('.z', _zlib_file_decompress), ('.gz', _gzip_file_decompress)]) def test_load_externally_decompressed_files(tmpdir, extension, decompress): # Test that BinaryZlibFile generates valid gzip and zlib compressed files. obj = "a string to persist" filename_raw = tmpdir.join('test.pkl').strpath filename_compressed = filename_raw + extension # Use automatic extension detection to compress with the right method. numpy_pickle.dump(obj, filename_compressed) # Decompress with the corresponding method decompress(filename_compressed, filename_raw) # Test that the uncompressed pickle can be loaded and # that the result is correct. obj_reloaded = numpy_pickle.load(filename_raw) assert obj == obj_reloaded @parametrize('extension,cmethod', # valid compressor extensions [('.z', 'zlib'), ('.gz', 'gzip'), ('.bz2', 'bz2'), ('.lzma', 'lzma'), ('.xz', 'xz'), # invalid compressor extensions ('.pkl', 'not-compressed'), ('', 'not-compressed')]) def test_compression_using_file_extension(tmpdir, extension, cmethod): # test that compression method corresponds to the given filename extension. filename = tmpdir.join('test.pkl').strpath obj = "object to dump" dump_fname = filename + extension if not PY3_OR_LATER and cmethod in ('xz', 'lzma'): # Lzma module only available for python >= 3.3 msg = "{} compression is only available".format(cmethod) with raises(NotImplementedError) as excinfo: numpy_pickle.dump(obj, dump_fname) excinfo.match(msg) else: numpy_pickle.dump(obj, dump_fname) # Verify the file contains the right magic number with open(dump_fname, 'rb') as f: assert _detect_compressor(f) == cmethod # Verify the reloaded object is correct obj_reloaded = numpy_pickle.load(dump_fname) assert isinstance(obj_reloaded, type(obj)) assert obj_reloaded == obj @with_numpy def test_file_handle_persistence(tmpdir): objs = [np.random.random((10, 10)), "some data", np.matrix([0, 1, 2])] fobjs = [bz2.BZ2File, gzip.GzipFile] if PY3_OR_LATER: import lzma fobjs += [lzma.LZMAFile] filename = tmpdir.join('test.pkl').strpath for obj in objs: for fobj in fobjs: with fobj(filename, 'wb') as f: numpy_pickle.dump(obj, f) # using the same decompressor prevents from internally # decompress again. with fobj(filename, 'rb') as f: obj_reloaded = numpy_pickle.load(f) # when needed, the correct decompressor should be used when # passing a raw file handle. with open(filename, 'rb') as f: obj_reloaded_2 = numpy_pickle.load(f) if isinstance(obj, np.ndarray): np.testing.assert_array_equal(obj_reloaded, obj) np.testing.assert_array_equal(obj_reloaded_2, obj) else: assert obj_reloaded == obj assert obj_reloaded_2 == obj @with_numpy def test_in_memory_persistence(): objs = [np.random.random((10, 10)), "some data", np.matrix([0, 1, 2])] for obj in objs: f = io.BytesIO() numpy_pickle.dump(obj, f) obj_reloaded = numpy_pickle.load(f) if isinstance(obj, np.ndarray): np.testing.assert_array_equal(obj_reloaded, obj) else: assert obj_reloaded == obj @with_numpy def test_file_handle_persistence_mmap(tmpdir): obj = np.random.random((10, 10)) filename = tmpdir.join('test.pkl').strpath with open(filename, 'wb') as f: numpy_pickle.dump(obj, f) with open(filename, 'rb') as f: obj_reloaded = numpy_pickle.load(f, mmap_mode='r+') np.testing.assert_array_equal(obj_reloaded, obj) @with_numpy def test_file_handle_persistence_compressed_mmap(tmpdir): obj = np.random.random((10, 10)) filename = tmpdir.join('test.pkl').strpath with open(filename, 'wb') as f: numpy_pickle.dump(obj, f, compress=('gzip', 3)) with closing(gzip.GzipFile(filename, 'rb')) as f: with warns(UserWarning) as warninfo: numpy_pickle.load(f, mmap_mode='r+') assert len(warninfo) == 1 assert (str(warninfo[0].message) == '"%(fileobj)r" is not a raw file, mmap_mode "%(mmap_mode)s" ' 'flag will be ignored.' % {'fileobj': f, 'mmap_mode': 'r+'}) @with_numpy def test_file_handle_persistence_in_memory_mmap(): obj = np.random.random((10, 10)) buf = io.BytesIO() numpy_pickle.dump(obj, buf) with warns(UserWarning) as warninfo: numpy_pickle.load(buf, mmap_mode='r+') assert len(warninfo) == 1 assert (str(warninfo[0].message) == 'In memory persistence is not compatible with mmap_mode ' '"%(mmap_mode)s" flag passed. mmap_mode option will be ' 'ignored.' % {'mmap_mode': 'r+'}) @parametrize('data', [b'a little data as bytes.', # More bytes 10000 * "{}".format( random.randint(0, 1000) * 1000).encode('latin-1')]) @parametrize('compress_level', [1, 3, 9]) def test_binary_zlibfile(tmpdir, data, compress_level): filename = tmpdir.join('test.pkl').strpath # Regular cases with open(filename, 'wb') as f: with BinaryZlibFile(f, 'wb', compresslevel=compress_level) as fz: assert fz.writable() fz.write(data) assert fz.fileno() == f.fileno() with raises(io.UnsupportedOperation): fz._check_can_read() with raises(io.UnsupportedOperation): fz._check_can_seek() assert fz.closed with raises(ValueError): fz._check_not_closed() with open(filename, 'rb') as f: with BinaryZlibFile(f) as fz: assert fz.readable() if PY3_OR_LATER: assert fz.seekable() assert fz.fileno() == f.fileno() assert fz.read() == data with raises(io.UnsupportedOperation): fz._check_can_write() if PY3_OR_LATER: # io.BufferedIOBase doesn't have seekable() method in # python 2 assert fz.seekable() fz.seek(0) assert fz.tell() == 0 assert fz.closed # Test with a filename as input with BinaryZlibFile(filename, 'wb', compresslevel=compress_level) as fz: assert fz.writable() fz.write(data) with BinaryZlibFile(filename, 'rb') as fz: assert fz.read() == data assert fz.seekable() # Test without context manager fz = BinaryZlibFile(filename, 'wb', compresslevel=compress_level) assert fz.writable() fz.write(data) fz.close() fz = BinaryZlibFile(filename, 'rb') assert fz.read() == data fz.close() @parametrize('bad_value', [-1, 10, 15, 'a', (), {}]) def test_binary_zlibfile_bad_compression_levels(tmpdir, bad_value): filename = tmpdir.join('test.pkl').strpath with raises(ValueError) as excinfo: BinaryZlibFile(filename, 'wb', compresslevel=bad_value) pattern = re.escape("'compresslevel' must be an integer between 1 and 9. " "You provided 'compresslevel={}'".format(bad_value)) excinfo.match(pattern) @parametrize('bad_mode', ['a', 'x', 'r', 'w', 1, 2]) def test_binary_zlibfile_invalid_modes(tmpdir, bad_mode): filename = tmpdir.join('test.pkl').strpath with raises(ValueError) as excinfo: BinaryZlibFile(filename, bad_mode) excinfo.match("Invalid mode") @parametrize('bad_file', [1, (), {}]) def test_binary_zlibfile_invalid_filename_type(bad_file): with raises(TypeError) as excinfo: BinaryZlibFile(bad_file, 'rb') excinfo.match("filename must be a str or bytes object, or a file") ############################################################################### # Test dumping array subclasses if np is not None: class SubArray(np.ndarray): def __reduce__(self): return _load_sub_array, (np.asarray(self), ) def _load_sub_array(arr): d = SubArray(arr.shape) d[:] = arr return d class ComplexTestObject: """A complex object containing numpy arrays as attributes.""" def __init__(self): self.array_float = np.arange(100, dtype='float64') self.array_int = np.ones(100, dtype='int32') self.array_obj = np.array(['a', 10, 20.0], dtype='object') @with_numpy def test_numpy_subclass(tmpdir): filename = tmpdir.join('test.pkl').strpath a = SubArray((10,)) numpy_pickle.dump(a, filename) c = numpy_pickle.load(filename) assert isinstance(c, SubArray) np.testing.assert_array_equal(c, a) def test_pathlib(tmpdir): try: from pathlib import Path except ImportError: pass else: filename = tmpdir.join('test.pkl').strpath value = 123 numpy_pickle.dump(value, Path(filename)) assert numpy_pickle.load(filename) == value numpy_pickle.dump(value, filename) assert numpy_pickle.load(Path(filename)) == value @with_numpy def test_non_contiguous_array_pickling(tmpdir): filename = tmpdir.join('test.pkl').strpath for array in [ # Array that triggers a contiguousness issue with nditer, # see https://github.com/joblib/joblib/pull/352 and see # https://github.com/joblib/joblib/pull/353 np.asfortranarray([[1, 2], [3, 4]])[1:], # Non contiguous array with works fine with nditer np.ones((10, 50, 20), order='F')[:, :1, :]]: assert not array.flags.c_contiguous assert not array.flags.f_contiguous numpy_pickle.dump(array, filename) array_reloaded = numpy_pickle.load(filename) np.testing.assert_array_equal(array_reloaded, array) @with_numpy def test_pickle_highest_protocol(tmpdir): # ensure persistence of a numpy array is valid even when using # the pickle HIGHEST_PROTOCOL. # see https://github.com/joblib/joblib/issues/362 filename = tmpdir.join('test.pkl').strpath test_array = np.zeros(10) numpy_pickle.dump(test_array, filename, protocol=pickle.HIGHEST_PROTOCOL) array_reloaded = numpy_pickle.load(filename) np.testing.assert_array_equal(array_reloaded, test_array) @with_numpy def test_pickle_in_socket(): # test that joblib can pickle in sockets if not PY3_OR_LATER: raise SkipTest("Cannot peek or seek in socket in python 2.") test_array = np.arange(10) _ADDR = ("localhost", 12345) listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM) listener.bind(_ADDR) listener.listen(1) client = socket.create_connection(_ADDR) server, client_addr = listener.accept() with server.makefile("wb") as sf: numpy_pickle.dump(test_array, sf) with client.makefile("rb") as cf: array_reloaded = numpy_pickle.load(cf) np.testing.assert_array_equal(array_reloaded, test_array) @with_numpy def test_load_memmap_with_big_offset(tmpdir): # Test that numpy memmap offset is set correctly if greater than # mmap.ALLOCATIONGRANULARITY, see # https://github.com/joblib/joblib/issues/451 and # https://github.com/numpy/numpy/pull/8443 for more details. fname = tmpdir.join('test.mmap').strpath size = mmap.ALLOCATIONGRANULARITY obj = [np.zeros(size, dtype='uint8'), np.ones(size, dtype='uint8')] numpy_pickle.dump(obj, fname) memmaps = numpy_pickle.load(fname, mmap_mode='r') assert isinstance(memmaps[1], np.memmap) assert memmaps[1].offset > size np.testing.assert_array_equal(obj, memmaps) joblib-0.11/joblib/test/test_numpy_pickle_compat.py000066400000000000000000000011601305577265600226050ustar00rootroot00000000000000"""Test the old numpy pickler, compatibility version.""" import random # numpy_pickle is not a drop-in replacement of pickle, as it takes # filenames instead of open files as arguments. from joblib import numpy_pickle_compat def test_z_file(tmpdir): # Test saving and loading data with Zfiles. filename = tmpdir.join('test.pkl').strpath data = numpy_pickle_compat.asbytes('Foo, \n Bar, baz, \n\nfoobar') with open(filename, 'wb') as f: numpy_pickle_compat.write_zfile(f, data) with open(filename, 'rb') as f: data_read = numpy_pickle_compat.read_zfile(f) assert data == data_read joblib-0.11/joblib/test/test_numpy_pickle_utils.py000066400000000000000000000006341305577265600224670ustar00rootroot00000000000000from joblib import numpy_pickle_utils from joblib.testing import parametrize @parametrize('filename', ['test', u'test']) # testing str and unicode names def test_binary_zlib_file(tmpdir, filename): """Testing creation of files depending on the type of the filenames.""" binary_file = numpy_pickle_utils.BinaryZlibFile( tmpdir.join(filename).strpath, mode='wb') binary_file.close() joblib-0.11/joblib/test/test_parallel.py000066400000000000000000000640271305577265600203520ustar00rootroot00000000000000""" Test the parallel module. """ # Author: Gael Varoquaux # Copyright (c) 2010-2011 Gael Varoquaux # License: BSD Style, 3 clauses. import time import sys import os from math import sqrt import threading from multiprocessing import TimeoutError from time import sleep import mmap from joblib import dump, load from joblib import parallel from joblib.test.common import np, with_numpy from joblib.test.common import with_multiprocessing from joblib.testing import (parametrize, raises, check_subprocess_call, SkipTest, warns) from joblib._compat import PY3_OR_LATER try: import cPickle as pickle PickleError = TypeError except ImportError: import pickle PickleError = pickle.PicklingError if PY3_OR_LATER: PickleError = pickle.PicklingError try: # Python 2/Python 3 compat unicode('str') except NameError: unicode = lambda s: s try: from queue import Queue except ImportError: # Backward compat from Queue import Queue try: import posix except ImportError: posix = None from joblib._parallel_backends import SequentialBackend from joblib._parallel_backends import ThreadingBackend from joblib._parallel_backends import MultiprocessingBackend from joblib._parallel_backends import SafeFunction from joblib._parallel_backends import WorkerInterrupt from joblib.parallel import Parallel, delayed from joblib.parallel import register_parallel_backend, parallel_backend from joblib.parallel import mp, cpu_count, BACKENDS, effective_n_jobs from joblib.my_exceptions import JoblibException ALL_VALID_BACKENDS = [None] + sorted(BACKENDS.keys()) # Add instances of backend classes deriving from ParallelBackendBase ALL_VALID_BACKENDS += [BACKENDS[backend_str]() for backend_str in BACKENDS] if hasattr(mp, 'get_context'): # Custom multiprocessing context in Python 3.4+ ALL_VALID_BACKENDS.append(mp.get_context('spawn')) def division(x, y): return x / y def square(x): return x ** 2 class MyExceptionWithFinickyInit(Exception): """An exception class with non trivial __init__ """ def __init__(self, a, b, c, d): pass def exception_raiser(x, custom_exception=False): if x == 7: raise (MyExceptionWithFinickyInit('a', 'b', 'c', 'd') if custom_exception else ValueError) return x def interrupt_raiser(x): time.sleep(.05) raise KeyboardInterrupt def f(x, y=0, z=0): """ A module-level function so that it can be spawn with multiprocessing. """ return x ** 2 + y + z def _active_backend_type(): return type(parallel.get_active_backend()[0]) def parallel_func(inner_n_jobs): return Parallel(n_jobs=inner_n_jobs)(delayed(square)(i) for i in range(3)) ############################################################################### def test_cpu_count(): assert cpu_count() > 0 def test_effective_n_jobs(): assert effective_n_jobs() > 0 ############################################################################### # Test parallel @parametrize('backend', ALL_VALID_BACKENDS) @parametrize('n_jobs', [1, 2, -1, -2]) @parametrize('verbose', [2, 11, 100]) def test_simple_parallel(backend, n_jobs, verbose): assert ([square(x) for x in range(5)] == Parallel(n_jobs=n_jobs, backend=backend, verbose=verbose)( delayed(square)(x) for x in range(5))) @parametrize('backend', ALL_VALID_BACKENDS) def test_main_thread_renamed_no_warning(backend, monkeypatch): # Check that no default backend relies on the name of the main thread: # https://github.com/joblib/joblib/issues/180#issuecomment-253266247 # Some programs use a different name for the main thread. This is the case # for uWSGI apps for instance. monkeypatch.setattr(target=threading.current_thread(), name='name', value='some_new_name_for_the_main_thread') with warns(None) as warninfo: results = Parallel(n_jobs=2, backend=backend)( delayed(square)(x) for x in range(3)) assert results == [0, 1, 4] # The multiprocessing backend will raise a warning when detecting that is # started from the non-main thread. Let's check that there is no false # positive because of the name change. assert len(warninfo) == 0 def nested_loop(backend): Parallel(n_jobs=2, backend=backend)( delayed(square)(.01) for _ in range(2)) @parametrize('parent_backend', BACKENDS) @parametrize('child_backend', BACKENDS) def test_nested_loop(parent_backend, child_backend): Parallel(n_jobs=2, backend=parent_backend)( delayed(nested_loop)(child_backend) for _ in range(2)) def test_mutate_input_with_threads(): """Input is mutable when using the threading backend""" q = Queue(maxsize=5) Parallel(n_jobs=2, backend="threading")( delayed(q.put, check_pickle=False)(1) for _ in range(5)) assert q.full() @parametrize('n_jobs', [1, 2, 3]) def test_parallel_kwargs(n_jobs): """Check the keyword argument processing of pmap.""" lst = range(10) assert ([f(x, y=1) for x in lst] == Parallel(n_jobs=n_jobs)(delayed(f)(x, y=1) for x in lst)) @parametrize('backend', ['multiprocessing', 'threading']) def test_parallel_as_context_manager(backend): lst = range(10) expected = [f(x, y=1) for x in lst] with Parallel(n_jobs=4, backend=backend) as p: # Internally a pool instance has been eagerly created and is managed # via the context manager protocol managed_backend = p._backend if mp is not None: assert managed_backend is not None assert managed_backend._pool is not None # We make call with the managed parallel object several times inside # the managed block: assert expected == p(delayed(f)(x, y=1) for x in lst) assert expected == p(delayed(f)(x, y=1) for x in lst) # Those calls have all used the same pool instance: if mp is not None: assert managed_backend._pool is p._backend._pool # As soon as we exit the context manager block, the pool is terminated and # no longer referenced from the parallel object: if mp is not None: assert p._backend._pool is None # It's still possible to use the parallel instance in non-managed mode: assert expected == p(delayed(f)(x, y=1) for x in lst) if mp is not None: assert p._backend._pool is None def test_parallel_pickling(): """ Check that pmap captures the errors when it is passed an object that cannot be pickled. """ def g(x): return x ** 2 try: # pickling a local function always fail but the exception # raised is a PickleError for python <= 3.4 and AttributeError # for python >= 3.5 pickle.dumps(g) except Exception as exc: exception_class = exc.__class__ with raises(exception_class): Parallel()(delayed(g)(x) for x in range(10)) @parametrize('backend', ['multiprocessing', 'threading']) def test_parallel_timeout_success(backend): # Check that timeout isn't thrown when function is fast enough assert len(Parallel(n_jobs=2, backend=backend, timeout=10)( delayed(sleep)(0.001) for x in range(10))) == 10 @with_multiprocessing @parametrize('backend', ['multiprocessing', 'threading']) def test_parallel_timeout_fail(backend): # Check that timeout properly fails when function is too slow with raises(TimeoutError): Parallel(n_jobs=2, backend=backend, timeout=0.01)( delayed(sleep)(10) for x in range(10)) def test_error_capture(): # Check that error are captured, and that correct exceptions # are raised. if mp is not None: # A JoblibException will be raised only if there is indeed # multiprocessing with raises(JoblibException): Parallel(n_jobs=2)( [delayed(division)(x, y) for x, y in zip((0, 1), (1, 0))]) with raises(WorkerInterrupt): Parallel(n_jobs=2)( [delayed(interrupt_raiser)(x) for x in (1, 0)]) # Try again with the context manager API with Parallel(n_jobs=2) as parallel: assert parallel._backend._pool is not None original_pool = parallel._backend._pool with raises(JoblibException): parallel([delayed(division)(x, y) for x, y in zip((0, 1), (1, 0))]) # The managed pool should still be available and be in a working # state despite the previously raised (and caught) exception assert parallel._backend._pool is not None # The pool should have been interrupted and restarted: assert parallel._backend._pool is not original_pool assert ([f(x, y=1) for x in range(10)] == parallel(delayed(f)(x, y=1) for x in range(10))) original_pool = parallel._backend._pool with raises(WorkerInterrupt): parallel([delayed(interrupt_raiser)(x) for x in (1, 0)]) # The pool should still be available despite the exception assert parallel._backend._pool is not None # The pool should have been interrupted and restarted: assert parallel._backend._pool is not original_pool assert ([f(x, y=1) for x in range(10)] == parallel(delayed(f)(x, y=1) for x in range(10))) # Check that the inner pool has been terminated when exiting the # context manager assert parallel._backend._pool is None else: with raises(KeyboardInterrupt): Parallel(n_jobs=2)( [delayed(interrupt_raiser)(x) for x in (1, 0)]) # wrapped exceptions should inherit from the class of the original # exception to make it easy to catch them with raises(ZeroDivisionError): Parallel(n_jobs=2)( [delayed(division)(x, y) for x, y in zip((0, 1), (1, 0))]) with raises(MyExceptionWithFinickyInit): Parallel(n_jobs=2, verbose=0)( (delayed(exception_raiser)(i, custom_exception=True) for i in range(30))) try: # JoblibException wrapping is disabled in sequential mode: ex = JoblibException() Parallel(n_jobs=1)( delayed(division)(x, y) for x, y in zip((0, 1), (1, 0))) except Exception as ex: assert not isinstance(ex, JoblibException) def consumer(queue, item): queue.append('Consumed %s' % item) @parametrize('backend', BACKENDS) @parametrize('batch_size, expected_queue', [(1, ['Produced 0', 'Consumed 0', 'Produced 1', 'Consumed 1', 'Produced 2', 'Consumed 2', 'Produced 3', 'Consumed 3', 'Produced 4', 'Consumed 4', 'Produced 5', 'Consumed 5']), (4, [ # First Batch 'Produced 0', 'Produced 1', 'Produced 2', 'Produced 3', 'Consumed 0', 'Consumed 1', 'Consumed 2', 'Consumed 3', # Second batch 'Produced 4', 'Produced 5', 'Consumed 4', 'Consumed 5'])]) def test_dispatch_one_job(backend, batch_size, expected_queue): """ Test that with only one job, Parallel does act as a iterator. """ queue = list() def producer(): for i in range(6): queue.append('Produced %i' % i) yield i Parallel(n_jobs=1, batch_size=batch_size, backend=backend)( delayed(consumer)(queue, x) for x in producer()) assert queue == expected_queue assert len(queue) == 12 @with_multiprocessing @parametrize('backend', ['multiprocessing', 'threading']) def test_dispatch_multiprocessing(backend): """ Check that using pre_dispatch Parallel does indeed dispatch items lazily. """ manager = mp.Manager() queue = manager.list() def producer(): for i in range(6): queue.append('Produced %i' % i) yield i Parallel(n_jobs=2, batch_size=1, pre_dispatch=3, backend=backend)( delayed(consumer)(queue, 'any') for _ in producer()) # Only 3 tasks are dispatched out of 6. The 4th task is dispatched only # after any of the first 3 jobs have completed. first_four = list(queue)[:4] # The the first consumption event can sometimes happen before the end of # the dispatching, hence, pop it before introspecting the "Produced" events first_four.remove('Consumed any') assert first_four == ['Produced 0', 'Produced 1', 'Produced 2'] assert len(queue) == 12 def test_batching_auto_threading(): # batching='auto' with the threading backend leaves the effective batch # size to 1 (no batching) as it has been found to never be beneficial with # this low-overhead backend. with Parallel(n_jobs=2, batch_size='auto', backend='threading') as p: p(delayed(id)(i) for i in range(5000)) # many very fast tasks assert p._backend.compute_batch_size() == 1 def test_batching_auto_multiprocessing(): with Parallel(n_jobs=2, batch_size='auto', backend='multiprocessing') as p: p(delayed(id)(i) for i in range(5000)) # many very fast tasks # It should be strictly larger than 1 but as we don't want heisen # failures on clogged CI worker environment be safe and only check that # it's a strictly positive number. assert p._backend.compute_batch_size() > 0 def test_exception_dispatch(): "Make sure that exception raised during dispatch are indeed captured" with raises(ValueError): Parallel(n_jobs=2, pre_dispatch=16, verbose=0)( delayed(exception_raiser)(i) for i in range(30)) def test_nested_exception_dispatch(): # Ensure TransportableException objects for nested joblib cases gets # propagated. with raises(JoblibException): Parallel(n_jobs=2, pre_dispatch=16, verbose=0)( delayed(SafeFunction(exception_raiser))(i) for i in range(30)) def _reload_joblib(): # Retrieve the path of the parallel module in a robust way joblib_path = Parallel.__module__.split(os.sep) joblib_path = joblib_path[:1] joblib_path.append('parallel.py') joblib_path = '/'.join(joblib_path) module = __import__(joblib_path) # Reload the module. This should trigger a fail reload(module) def test_multiple_spawning(): # Test that attempting to launch a new Python after spawned # subprocesses will raise an error, to avoid infinite loops on # systems that do not support fork if not int(os.environ.get('JOBLIB_MULTIPROCESSING', 1)): raise SkipTest() with raises(ImportError): Parallel(n_jobs=2, pre_dispatch='all')( [delayed(_reload_joblib)() for i in range(10)]) class FakeParallelBackend(SequentialBackend): """Pretends to run concurrently while running sequentially.""" def configure(self, n_jobs=1, parallel=None, **backend_args): self.n_jobs = self.effective_n_jobs(n_jobs) self.parallel = parallel return n_jobs def effective_n_jobs(self, n_jobs=1): if n_jobs < 0: n_jobs = max(mp.cpu_count() + 1 + n_jobs, 1) return n_jobs def test_invalid_backend(): with raises(ValueError): Parallel(backend='unit-testing') def test_register_parallel_backend(): try: register_parallel_backend("test_backend", FakeParallelBackend) assert "test_backend" in BACKENDS assert BACKENDS["test_backend"] == FakeParallelBackend finally: del BACKENDS["test_backend"] def test_overwrite_default_backend(): assert _active_backend_type() == MultiprocessingBackend try: register_parallel_backend("threading", BACKENDS["threading"], make_default=True) assert _active_backend_type() == ThreadingBackend finally: # Restore the global default manually parallel.DEFAULT_BACKEND = 'multiprocessing' assert _active_backend_type() == MultiprocessingBackend def check_backend_context_manager(backend_name): with parallel_backend(backend_name, n_jobs=3): active_backend, active_n_jobs = parallel.get_active_backend() assert active_n_jobs == 3 assert effective_n_jobs(3) == 3 p = Parallel() assert p.n_jobs == 3 if backend_name == 'multiprocessing': assert type(active_backend) == MultiprocessingBackend assert type(p._backend) == MultiprocessingBackend elif backend_name == 'threading': assert type(active_backend) == ThreadingBackend assert type(p._backend) == ThreadingBackend elif backend_name.startswith('test_'): assert type(active_backend) == FakeParallelBackend assert type(p._backend) == FakeParallelBackend all_backends_for_context_manager = ['multiprocessing', 'threading'] + \ ['test_backend_%d' % i for i in range(3)] @with_multiprocessing @parametrize('backend', all_backends_for_context_manager) def test_backend_context_manager(monkeypatch, backend): if backend not in BACKENDS: monkeypatch.setitem(BACKENDS, backend, FakeParallelBackend) assert _active_backend_type() == MultiprocessingBackend # check that this possible to switch parallel backends sequentially check_backend_context_manager(backend) # The default backend is retored assert _active_backend_type() == MultiprocessingBackend # Check that context manager switching is thread safe: Parallel(n_jobs=2, backend='threading')( delayed(check_backend_context_manager)(b) for b in all_backends_for_context_manager if not b) # The default backend is again retored assert _active_backend_type() == MultiprocessingBackend class ParameterizedParallelBackend(SequentialBackend): """Pretends to run conncurrently while running sequentially.""" def __init__(self, param=None): if param is None: raise ValueError('param should not be None') self.param = param def test_parameterized_backend_context_manager(monkeypatch): monkeypatch.setitem(BACKENDS, 'param_backend', ParameterizedParallelBackend) assert _active_backend_type() == MultiprocessingBackend with parallel_backend('param_backend', param=42, n_jobs=3): active_backend, active_n_jobs = parallel.get_active_backend() assert type(active_backend) == ParameterizedParallelBackend assert active_backend.param == 42 assert active_n_jobs == 3 p = Parallel() assert p.n_jobs == 3 assert p._backend is active_backend results = p(delayed(sqrt)(i) for i in range(5)) assert results == [sqrt(i) for i in range(5)] # The default backend is again restored assert _active_backend_type() == MultiprocessingBackend def test_direct_parameterized_backend_context_manager(): assert _active_backend_type() == MultiprocessingBackend # Check that it's possible to pass a backend instance directly, # without registration with parallel_backend(ParameterizedParallelBackend(param=43), n_jobs=5): active_backend, active_n_jobs = parallel.get_active_backend() assert type(active_backend) == ParameterizedParallelBackend assert active_backend.param == 43 assert active_n_jobs == 5 p = Parallel() assert p.n_jobs == 5 assert p._backend is active_backend results = p(delayed(sqrt)(i) for i in range(5)) assert results == [sqrt(i) for i in range(5)] # The default backend is again retored assert _active_backend_type() == MultiprocessingBackend ############################################################################### # Test helpers def test_joblib_exception(): # Smoke-test the custom exception e = JoblibException('foobar') # Test the repr repr(e) # Test the pickle pickle.dumps(e) def test_safe_function(): safe_division = SafeFunction(division) with raises(JoblibException): safe_division(1, 0) @parametrize('batch_size', [0, -1, 1.42]) def test_invalid_batch_size(batch_size): with raises(ValueError): Parallel(batch_size=batch_size) @parametrize('n_tasks, n_jobs, pre_dispatch, batch_size', [(2, 2, 'all', 'auto'), (2, 2, 'n_jobs', 'auto'), (10, 2, 'n_jobs', 'auto'), (517, 2, 'n_jobs', 'auto'), (10, 2, 'n_jobs', 'auto'), (10, 4, 'n_jobs', 'auto'), (25, 4, '2 * n_jobs', 1), (25, 4, 'all', 1), (25, 4, '2 * n_jobs', 7), (10, 4, '2 * n_jobs', 'auto')]) def test_dispatch_race_condition(n_tasks, n_jobs, pre_dispatch, batch_size): # Check that using (async-)dispatch does not yield a race condition on the # iterable generator that is not thread-safe natively. # This is a non-regression test for the "Pool seems closed" class of error params = {'n_jobs': n_jobs, 'pre_dispatch': pre_dispatch, 'batch_size': batch_size} expected = [square(i) for i in range(n_tasks)] results = Parallel(**params)(delayed(square)(i) for i in range(n_tasks)) assert results == expected @with_multiprocessing def test_default_mp_context(): p = Parallel(n_jobs=2, backend='multiprocessing') context = p._backend_args.get('context') if sys.version_info >= (3, 4): start_method = context.get_start_method() # Under Python 3.4+ the multiprocessing context can be configured # by an environment variable env_method = os.environ.get('JOBLIB_START_METHOD', '').strip() or None if env_method is None: # Check the default behavior if sys.platform == 'win32': assert start_method == 'spawn' else: assert start_method == 'fork' else: assert start_method == env_method else: assert context is None @with_multiprocessing @with_numpy def test_no_blas_crash_or_freeze_with_multiprocessing(): if sys.version_info < (3, 4): raise SkipTest('multiprocessing can cause BLAS freeze on old Python') # Use the spawn backend that is both robust and available on all platforms spawn_backend = mp.get_context('spawn') # Check that on recent Python version, the 'spawn' start method can make # it possible to use multiprocessing in conjunction of any BLAS # implementation that happens to be used by numpy with causing a freeze or # a crash rng = np.random.RandomState(42) # call BLAS DGEMM to force the initialization of the internal thread-pool # in the main process a = rng.randn(1000, 1000) np.dot(a, a.T) # check that the internal BLAS thread-pool is not in an inconsistent state # in the worker processes managed by multiprocessing Parallel(n_jobs=2, backend=spawn_backend)( delayed(np.dot)(a, a.T) for i in range(2)) def test_parallel_with_interactively_defined_functions(): # When functions are defined interactively in a python/IPython # session, we want to be able to use them with joblib.Parallel if posix is None: # This test pass only when fork is the process start method raise SkipTest('Not a POSIX platform') code = '\n\n'.join([ 'from joblib import Parallel, delayed', 'def square(x): return x**2', 'print(Parallel(n_jobs=2)(delayed(square)(i) for i in range(5)))']) check_subprocess_call([sys.executable, '-c', code], stdout_regex=r'\[0, 1, 4, 9, 16\]') def test_parallel_with_exhausted_iterator(): exhausted_iterator = iter([]) assert Parallel(n_jobs=2)(exhausted_iterator) == [] def check_memmap(a): if not isinstance(a, np.memmap): raise TypeError('Expected np.memmap instance, got %r', type(a)) return a.copy() # return a regular array instead of a memmap @with_numpy @with_multiprocessing def test_auto_memmap_on_arrays_from_generator(): # Non-regression test for a problem with a bad interaction between the # GC collecting arrays recently created during iteration inside the # parallel dispatch loop and the auto-memmap feature of Parallel. # See: https://github.com/joblib/joblib/pull/294 def generate_arrays(n): for i in range(n): yield np.ones(10, dtype=np.float32) * i # Use max_nbytes=1 to force the use of memory-mapping even for small # arrays results = Parallel(n_jobs=2, max_nbytes=1)( delayed(check_memmap)(a) for a in generate_arrays(100)) for result, expected in zip(results, generate_arrays(len(results))): np.testing.assert_array_equal(expected, result) @with_multiprocessing def test_nested_parallel_warnings(capfd): if posix is None: # This test pass only when fork is the process start method raise SkipTest('Not a POSIX platform') # no warnings if inner_n_jobs=1 Parallel(n_jobs=2)(delayed(parallel_func)(inner_n_jobs=1) for _ in range(5)) out, err = capfd.readouterr() assert err == '' # warnings if inner_n_jobs != 1 Parallel(n_jobs=2)(delayed(parallel_func)(inner_n_jobs=2) for _ in range(5)) out, err = capfd.readouterr() assert 'Multiprocessing-backed parallel loops cannot be nested' in err def identity(arg): return arg @with_numpy @with_multiprocessing def test_memmap_with_big_offset(tmpdir): fname = tmpdir.join('test.mmap').strpath size = mmap.ALLOCATIONGRANULARITY obj = [np.zeros(size, dtype='uint8'), np.ones(size, dtype='uint8')] dump(obj, fname) memmap = load(fname, mmap_mode='r') result, = Parallel(n_jobs=2)(delayed(identity)(memmap) for _ in [0]) assert isinstance(memmap[1], np.memmap) assert memmap[1].offset > size np.testing.assert_array_equal(obj, result) def test_warning_about_timeout_not_supported_by_backend(): with warns(None) as warninfo: Parallel(timeout=1)(delayed(square)(i) for i in range(10)) assert len(warninfo) == 1 w = warninfo[0] assert isinstance(w.message, UserWarning) assert str(w.message) == ( "The backend class 'SequentialBackend' does not support timeout. " "You have set 'timeout=1' in Parallel but the 'timeout' parameter " "will not be used.") joblib-0.11/joblib/test/test_pool.py000066400000000000000000000407331305577265600175250ustar00rootroot00000000000000import os import mmap from joblib.test.common import with_numpy, np from joblib.test.common import setup_autokill from joblib.test.common import teardown_autokill from joblib.test.common import with_multiprocessing from joblib.test.common import with_dev_shm from joblib.testing import raises from joblib.backports import make_memmap from joblib.pool import MemmapingPool from joblib.pool import has_shareable_memory from joblib.pool import ArrayMemmapReducer from joblib.pool import reduce_memmap from joblib.pool import _strided_from_memmap from joblib.pool import _get_backing_memmap def setup_module(): setup_autokill(__name__, timeout=300) def teardown_module(): teardown_autokill(__name__) def check_array(args): """Dummy helper function to be executed in subprocesses Check that the provided array has the expected values in the provided range. """ data, position, expected = args np.testing.assert_array_equal(data[position], expected) def inplace_double(args): """Dummy helper function to be executed in subprocesses Check that the input array has the right values in the provided range and perform an inplace modification to double the values in the range by two. """ data, position, expected = args assert data[position] == expected data[position] *= 2 np.testing.assert_array_equal(data[position], 2 * expected) @with_numpy @with_multiprocessing def test_memmap_based_array_reducing(tmpdir): """Check that it is possible to reduce a memmap backed array""" assert_array_equal = np.testing.assert_array_equal filename = tmpdir.join('test.mmap').strpath # Create a file larger than what will be used by a buffer = np.memmap(filename, dtype=np.float64, shape=500, mode='w+') # Fill the original buffer with negative markers to detect over of # underflow in case of test failures buffer[:] = - 1.0 * np.arange(buffer.shape[0], dtype=buffer.dtype) buffer.flush() # Memmap a 2D fortran array on a offseted subsection of the previous # buffer a = np.memmap(filename, dtype=np.float64, shape=(3, 5, 4), mode='r+', order='F', offset=4) a[:] = np.arange(60).reshape(a.shape) # Build various views that share the buffer with the original memmap # b is an memmap sliced view on an memmap instance b = a[1:-1, 2:-1, 2:4] # c and d are array views c = np.asarray(b) d = c.T # Array reducer with auto dumping disabled reducer = ArrayMemmapReducer(None, tmpdir.strpath, 'c') def reconstruct_array(x): cons, args = reducer(x) return cons(*args) def reconstruct_memmap(x): cons, args = reduce_memmap(x) return cons(*args) # Reconstruct original memmap a_reconstructed = reconstruct_memmap(a) assert has_shareable_memory(a_reconstructed) assert isinstance(a_reconstructed, np.memmap) assert_array_equal(a_reconstructed, a) # Reconstruct strided memmap view b_reconstructed = reconstruct_memmap(b) assert has_shareable_memory(b_reconstructed) assert_array_equal(b_reconstructed, b) # Reconstruct arrays views on memmap base c_reconstructed = reconstruct_array(c) assert not isinstance(c_reconstructed, np.memmap) assert has_shareable_memory(c_reconstructed) assert_array_equal(c_reconstructed, c) d_reconstructed = reconstruct_array(d) assert not isinstance(d_reconstructed, np.memmap) assert has_shareable_memory(d_reconstructed) assert_array_equal(d_reconstructed, d) # Test graceful degradation on fake memmap instances with in-memory # buffers a3 = a * 3 assert not has_shareable_memory(a3) a3_reconstructed = reconstruct_memmap(a3) assert not has_shareable_memory(a3_reconstructed) assert not isinstance(a3_reconstructed, np.memmap) assert_array_equal(a3_reconstructed, a * 3) # Test graceful degradation on arrays derived from fake memmap instances b3 = np.asarray(a3) assert not has_shareable_memory(b3) b3_reconstructed = reconstruct_array(b3) assert isinstance(b3_reconstructed, np.ndarray) assert not has_shareable_memory(b3_reconstructed) assert_array_equal(b3_reconstructed, b3) @with_numpy @with_multiprocessing def test_high_dimension_memmap_array_reducing(tmpdir): assert_array_equal = np.testing.assert_array_equal filename = tmpdir.join('test.mmap').strpath # Create a high dimensional memmap a = np.memmap(filename, dtype=np.float64, shape=(100, 15, 15, 3), mode='w+') a[:] = np.arange(100 * 15 * 15 * 3).reshape(a.shape) # Create some slices/indices at various dimensions b = a[0:10] c = a[:, 5:10] d = a[:, :, :, 0] e = a[1:3:4] def reconstruct_memmap(x): cons, args = reduce_memmap(x) res = cons(*args) return res a_reconstructed = reconstruct_memmap(a) assert has_shareable_memory(a_reconstructed) assert isinstance(a_reconstructed, np.memmap) assert_array_equal(a_reconstructed, a) b_reconstructed = reconstruct_memmap(b) assert has_shareable_memory(b_reconstructed) assert_array_equal(b_reconstructed, b) c_reconstructed = reconstruct_memmap(c) assert has_shareable_memory(c_reconstructed) assert_array_equal(c_reconstructed, c) d_reconstructed = reconstruct_memmap(d) assert has_shareable_memory(d_reconstructed) assert_array_equal(d_reconstructed, d) e_reconstructed = reconstruct_memmap(e) assert has_shareable_memory(e_reconstructed) assert_array_equal(e_reconstructed, e) @with_numpy @with_multiprocessing def test_pool_with_memmap(tmpdir): """Check that subprocess can access and update shared memory memmap""" assert_array_equal = np.testing.assert_array_equal # Fork the subprocess before allocating the objects to be passed pool_temp_folder = tmpdir.mkdir('pool').strpath p = MemmapingPool(10, max_nbytes=2, temp_folder=pool_temp_folder) try: filename = tmpdir.join('test.mmap').strpath a = np.memmap(filename, dtype=np.float32, shape=(3, 5), mode='w+') a.fill(1.0) p.map(inplace_double, [(a, (i, j), 1.0) for i in range(a.shape[0]) for j in range(a.shape[1])]) assert_array_equal(a, 2 * np.ones(a.shape)) # Open a copy-on-write view on the previous data b = np.memmap(filename, dtype=np.float32, shape=(5, 3), mode='c') p.map(inplace_double, [(b, (i, j), 2.0) for i in range(b.shape[0]) for j in range(b.shape[1])]) # Passing memmap instances to the pool should not trigger the creation # of new files on the FS assert os.listdir(pool_temp_folder) == [] # the original data is untouched assert_array_equal(a, 2 * np.ones(a.shape)) assert_array_equal(b, 2 * np.ones(b.shape)) # readonly maps can be read but not updated c = np.memmap(filename, dtype=np.float32, shape=(10,), mode='r', offset=5 * 4) with raises(AssertionError): p.map(check_array, [(c, i, 3.0) for i in range(c.shape[0])]) # depending on the version of numpy one can either get a RuntimeError # or a ValueError with raises((RuntimeError, ValueError)): p.map(inplace_double, [(c, i, 2.0) for i in range(c.shape[0])]) finally: # Clean all filehandlers held by the pool p.terminate() del p @with_numpy @with_multiprocessing def test_pool_with_memmap_array_view(tmpdir): """Check that subprocess can access and update shared memory array""" assert_array_equal = np.testing.assert_array_equal # Fork the subprocess before allocating the objects to be passed pool_temp_folder = tmpdir.mkdir('pool').strpath p = MemmapingPool(10, max_nbytes=2, temp_folder=pool_temp_folder) try: filename = tmpdir.join('test.mmap').strpath a = np.memmap(filename, dtype=np.float32, shape=(3, 5), mode='w+') a.fill(1.0) # Create an ndarray view on the memmap instance a_view = np.asarray(a) assert not isinstance(a_view, np.memmap) assert has_shareable_memory(a_view) p.map(inplace_double, [(a_view, (i, j), 1.0) for i in range(a.shape[0]) for j in range(a.shape[1])]) # Both a and the a_view have been updated assert_array_equal(a, 2 * np.ones(a.shape)) assert_array_equal(a_view, 2 * np.ones(a.shape)) # Passing memmap array view to the pool should not trigger the # creation of new files on the FS assert os.listdir(pool_temp_folder) == [] finally: p.terminate() del p @with_numpy @with_multiprocessing def test_memmaping_pool_for_large_arrays(tmpdir): """Check that large arrays are not copied in memory""" # Check that the tempfolder is empty assert os.listdir(tmpdir.strpath) == [] # Build an array reducers that automaticaly dump large array content # to filesystem backed memmap instances to avoid memory explosion p = MemmapingPool(3, max_nbytes=40, temp_folder=tmpdir.strpath) try: # The temporary folder for the pool is not provisioned in advance assert os.listdir(tmpdir.strpath) == [] assert not os.path.exists(p._temp_folder) small = np.ones(5, dtype=np.float32) assert small.nbytes == 20 p.map(check_array, [(small, i, 1.0) for i in range(small.shape[0])]) # Memory has been copied, the pool filesystem folder is unused assert os.listdir(tmpdir.strpath) == [] # Try with a file larger than the memmap threshold of 40 bytes large = np.ones(100, dtype=np.float64) assert large.nbytes == 800 p.map(check_array, [(large, i, 1.0) for i in range(large.shape[0])]) # The data has been dumped in a temp folder for subprocess to share it # without per-child memory copies assert os.path.isdir(p._temp_folder) dumped_filenames = os.listdir(p._temp_folder) assert len(dumped_filenames) == 1 # Check that memory mapping is not triggered for arrays with # dtype='object' objects = np.array(['abc'] * 100, dtype='object') results = p.map(has_shareable_memory, [objects]) assert not results[0] finally: # check FS garbage upon pool termination p.terminate() assert not os.path.exists(p._temp_folder) del p @with_numpy @with_multiprocessing def test_memmaping_pool_for_large_arrays_disabled(tmpdir): """Check that large arrays memmaping can be disabled""" # Set max_nbytes to None to disable the auto memmaping feature p = MemmapingPool(3, max_nbytes=None, temp_folder=tmpdir.strpath) try: # Check that the tempfolder is empty assert os.listdir(tmpdir.strpath) == [] # Try with a file largish than the memmap threshold of 40 bytes large = np.ones(100, dtype=np.float64) assert large.nbytes == 800 p.map(check_array, [(large, i, 1.0) for i in range(large.shape[0])]) # Check that the tempfolder is still empty assert os.listdir(tmpdir.strpath) == [] finally: # Cleanup open file descriptors p.terminate() del p @with_numpy @with_multiprocessing @with_dev_shm def test_memmaping_on_dev_shm(): """Check that MemmapingPool uses /dev/shm when possible""" p = MemmapingPool(3, max_nbytes=10) try: # Check that the pool has correctly detected the presence of the # shared memory filesystem. pool_temp_folder = p._temp_folder folder_prefix = '/dev/shm/joblib_memmaping_pool_' assert pool_temp_folder.startswith(folder_prefix) assert os.path.exists(pool_temp_folder) # Try with a file larger than the memmap threshold of 10 bytes a = np.ones(100, dtype=np.float64) assert a.nbytes == 800 p.map(id, [a] * 10) # a should have been memmaped to the pool temp folder: the joblib # pickling procedure generate one .pkl file: assert len(os.listdir(pool_temp_folder)) == 1 # create a new array with content that is different from 'a' so that # it is mapped to a different file in the temporary folder of the # pool. b = np.ones(100, dtype=np.float64) * 2 assert b.nbytes == 800 p.map(id, [b] * 10) # A copy of both a and b are now stored in the shared memory folder assert len(os.listdir(pool_temp_folder)) == 2 finally: # Cleanup open file descriptors p.terminate() del p # The temp folder is cleaned up upon pool termination assert not os.path.exists(pool_temp_folder) @with_numpy @with_multiprocessing def test_memmaping_pool_for_large_arrays_in_return(tmpdir): """Check that large arrays are not copied in memory in return""" assert_array_equal = np.testing.assert_array_equal # Build an array reducers that automaticaly dump large array content # but check that the returned datastructure are regular arrays to avoid # passing a memmap array pointing to a pool controlled temp folder that # might be confusing to the user # The MemmapingPool user can always return numpy.memmap object explicitly # to avoid memory copy p = MemmapingPool(3, max_nbytes=10, temp_folder=tmpdir.strpath) try: res = p.apply_async(np.ones, args=(1000,)) large = res.get() assert not has_shareable_memory(large) assert_array_equal(large, np.ones(1000)) finally: p.terminate() del p def _worker_multiply(a, n_times): """Multiplication function to be executed by subprocess""" assert has_shareable_memory(a) return a * n_times @with_numpy @with_multiprocessing def test_workaround_against_bad_memmap_with_copied_buffers(tmpdir): """Check that memmaps with a bad buffer are returned as regular arrays Unary operations and ufuncs on memmap instances return a new memmap instance with an in-memory buffer (probably a numpy bug). """ assert_array_equal = np.testing.assert_array_equal p = MemmapingPool(3, max_nbytes=10, temp_folder=tmpdir.strpath) try: # Send a complex, large-ish view on a array that will be converted to # a memmap in the worker process a = np.asarray(np.arange(6000).reshape((1000, 2, 3)), order='F')[:, :1, :] # Call a non-inplace multiply operation on the worker and memmap and # send it back to the parent. b = p.apply_async(_worker_multiply, args=(a, 3)).get() assert not has_shareable_memory(b) assert_array_equal(b, 3 * a) finally: p.terminate() del p @with_numpy def test__strided_from_memmap(tmpdir): fname = tmpdir.join('test.mmap').strpath size = 5 * mmap.ALLOCATIONGRANULARITY offset = mmap.ALLOCATIONGRANULARITY + 1 # This line creates the mmap file that is reused later memmap_obj = np.memmap(fname, mode='w+', shape=size + offset) # filename, dtype, mode, offset, order, shape, strides, total_buffer_len memmap_obj = _strided_from_memmap(fname, dtype='uint8', mode='r', offset=offset, order='C', shape=size, strides=None, total_buffer_len=None) assert isinstance(memmap_obj, np.memmap) assert memmap_obj.offset == offset memmap_backed_obj = _strided_from_memmap(fname, dtype='uint8', mode='r', offset=offset, order='C', shape=(size // 2,), strides=(2,), total_buffer_len=size) assert _get_backing_memmap(memmap_backed_obj).offset == offset def identity(arg): return arg @with_numpy @with_multiprocessing def test_pool_memmap_with_big_offset(tmpdir): # Test that numpy memmap offset is set correctly if greater than # mmap.ALLOCATIONGRANULARITY, see # https://github.com/joblib/joblib/issues/451 and # https://github.com/numpy/numpy/pull/8443 for more details. fname = tmpdir.join('test.mmap').strpath size = 5 * mmap.ALLOCATIONGRANULARITY offset = mmap.ALLOCATIONGRANULARITY + 1 obj = make_memmap(fname, mode='w+', shape=size, dtype='uint8', offset=offset) p = MemmapingPool(2, temp_folder=tmpdir.strpath) result = p.apply_async(identity, args=(obj,)).get() assert isinstance(result, np.memmap) assert result.offset == offset np.testing.assert_array_equal(obj, result) joblib-0.11/joblib/test/test_testing.py000066400000000000000000000046421305577265600202300ustar00rootroot00000000000000import sys import re from joblib.testing import raises, check_subprocess_call def test_check_subprocess_call(): code = '\n'.join(['result = 1 + 2 * 3', 'print(result)', 'my_list = [1, 2, 3]', 'print(my_list)']) check_subprocess_call([sys.executable, '-c', code]) # Now checking stdout with a regex check_subprocess_call([sys.executable, '-c', code], # Regex needed for platform-specific line endings stdout_regex=r'7\s{1,2}\[1, 2, 3\]') def test_check_subprocess_call_non_matching_regex(): code = '42' non_matching_pattern = '_no_way_this_matches_anything_' with raises(ValueError) as excinfo: check_subprocess_call([sys.executable, '-c', code], stdout_regex=non_matching_pattern) excinfo.match('Unexpected stdout.+{}'.format(non_matching_pattern)) def test_check_subprocess_call_wrong_command(): wrong_command = '_a_command_that_does_not_exist_' with raises(OSError): check_subprocess_call([wrong_command]) def test_check_subprocess_call_non_zero_return_code(): code_with_non_zero_exit = '\n'.join([ 'import sys', 'print("writing on stdout")', 'sys.stderr.write("writing on stderr")', 'sys.exit(123)']) pattern = re.compile('Non-zero return code: 123.+' 'Stdout:\nwriting on stdout.+' 'Stderr:\nwriting on stderr', re.DOTALL) with raises(ValueError) as excinfo: check_subprocess_call([sys.executable, '-c', code_with_non_zero_exit]) excinfo.match(pattern) def test_check_subprocess_call_timeout(): code_timing_out = '\n'.join([ 'import time', 'import sys', 'print("before sleep on stdout")', 'sys.stdout.flush()', 'sys.stderr.write("before sleep on stderr")', 'sys.stderr.flush()', 'time.sleep(1.1)', 'print("process should have be killed before")', 'sys.stdout.flush()']) pattern = re.compile('Non-zero return code:.+' 'Stdout:\nbefore sleep on stdout\s+' 'Stderr:\nbefore sleep on stderr', re.DOTALL) with raises(ValueError) as excinfo: check_subprocess_call([sys.executable, '-c', code_timing_out], timeout=1) excinfo.match(pattern) joblib-0.11/joblib/testing.py000066400000000000000000000041061305577265600162050ustar00rootroot00000000000000""" Helper for testing. """ import sys import warnings import os.path import re import subprocess import threading import pytest import _pytest from joblib._compat import PY3_OR_LATER raises = pytest.raises warns = pytest.warns SkipTest = _pytest.runner.Skipped skipif = pytest.mark.skipif fixture = pytest.fixture parametrize = pytest.mark.parametrize def warnings_to_stdout(): """ Redirect all warnings to stdout. """ showwarning_orig = warnings.showwarning def showwarning(msg, cat, fname, lno, file=None, line=0): showwarning_orig(msg, cat, os.path.basename(fname), line, sys.stdout) warnings.showwarning = showwarning # warnings.simplefilter('always') def check_subprocess_call(cmd, timeout=1, stdout_regex=None, stderr_regex=None): """Runs a command in a subprocess with timeout in seconds. Also checks returncode is zero, stdout if stdout_regex is set, and stderr if stderr_regex is set. """ proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) def kill_process(): proc.kill() timer = threading.Timer(timeout, kill_process) try: timer.start() stdout, stderr = proc.communicate() if PY3_OR_LATER: stdout, stderr = stdout.decode(), stderr.decode() if proc.returncode != 0: message = ( 'Non-zero return code: {}.\nStdout:\n{}\n' 'Stderr:\n{}').format( proc.returncode, stdout, stderr) raise ValueError(message) if (stdout_regex is not None and not re.search(stdout_regex, stdout)): raise ValueError( "Unexpected stdout: {!r} does not match:\n{!r}".format( stdout_regex, stdout)) if (stderr_regex is not None and not re.search(stderr_regex, stderr)): raise ValueError( "Unexpected stderr: {!r} does not match:\n{!r}".format( stderr_regex, stderr)) finally: timer.cancel() joblib-0.11/readthedocs.yml000066400000000000000000000002421305577265600157220ustar00rootroot00000000000000python: # make sure joblib is installed in the virtualenv (it is imported in # conf.py) pip_install: true requirements_file: .readthedocs-requirements.txt joblib-0.11/setup.cfg000066400000000000000000000012111305577265600145300ustar00rootroot00000000000000[aliases] release = egg_info -RDb '' # Make sure the sphinx docs are built each time we do a dist. bdist = build_sphinx bdist sdist = build_sphinx sdist # Make sure the docs are uploaded when we do an upload upload = upload upload_docs --upload-dir build/sphinx/html [bdist_rpm] doc-files = doc [wheel] universal=1 [tool:pytest] addopts = --doctest-glob="doc/*.rst" --doctest-modules testpaths = joblib [flake8] # For PEP8 error codes see # http://pep8.readthedocs.org/en/latest/intro.html#error-codes # E402: module level import not at top of file ignore=E402 [metadata] license_file = LICENSE.txt [build_sphinx] warning-is-error = 1 joblib-0.11/setup.py000077500000000000000000000043601305577265600144340ustar00rootroot00000000000000#!/usr/bin/env python from distutils.core import setup import sys import joblib # For some commands, use setuptools if len(set(('develop', 'sdist', 'release', 'bdist', 'bdist_egg', 'bdist_dumb', 'bdist_rpm', 'bdist_wheel', 'bdist_wininst', 'install_egg_info', 'build_sphinx', 'egg_info', 'easy_install', 'upload', )).intersection(sys.argv)) > 0: import setuptools extra_setuptools_args = {} if __name__ == '__main__': setup(name='joblib', version=joblib.__version__, author='Gael Varoquaux', author_email='gael.varoquaux@normalesup.org', url='http://pythonhosted.org/joblib/', description=("Lightweight pipelining: using Python functions " "as pipeline jobs."), long_description=joblib.__doc__, license='BSD', classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Console', 'Intended Audience :: Developers', 'Intended Audience :: Science/Research', 'Intended Audience :: Education', 'License :: OSI Approved :: BSD License', 'Operating System :: OS Independent', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Topic :: Scientific/Engineering', 'Topic :: Utilities', 'Topic :: Software Development :: Libraries', ], platforms='any', package_data={'joblib.test': ['data/*.gz', 'data/*.gzip', 'data/*.bz2', 'data/*.xz', 'data/*.lzma', 'data/*.pkl', 'data/*.npy', 'data/*.npy.z']}, packages=['joblib', 'joblib.test', 'joblib.test.data'], **extra_setuptools_args)