././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1617903144.1240816 partd-1.2.0/0000755000076500000240000000000000000000000011607 5ustar00jamesstaff././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322844.0 partd-1.2.0/LICENSE.txt0000644000076500000240000000274100000000000013436 0ustar00jamesstaffCopyright (c) 2015, Continuum Analytics, Inc. and contributors All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. Neither the name of Continuum Analytics nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322851.0 partd-1.2.0/MANIFEST.in0000644000076500000240000000026300000000000013346 0ustar00jamesstaffrecursive-include partd *.py include requirements.txt include setup.py include README.rst include LICENSE.txt include MANIFEST.in include versioneer.py include partd/_version.py ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1617903144.1243186 partd-1.2.0/PKG-INFO0000644000076500000240000001241600000000000012710 0ustar00jamesstaffMetadata-Version: 2.1 Name: partd Version: 1.2.0 Summary: Appendable key-value storage Home-page: http://github.com/dask/partd/ Maintainer: Matthew Rocklin Maintainer-email: mrocklin@gmail.com License: BSD Description: PartD ===== |Build Status| |Version Status| Key-value byte store with appendable values Partd stores key-value pairs. Values are raw bytes. We append on old values. Partd excels at shuffling operations. Operations ---------- PartD has two main operations, ``append`` and ``get``. Example ------- 1. Create a Partd backed by a directory:: >>> import partd >>> p = partd.File('/path/to/new/dataset/') 2. Append key-byte pairs to dataset:: >>> p.append({'x': b'Hello ', 'y': b'123'}) >>> p.append({'x': b'world!', 'y': b'456'}) 3. Get bytes associated to keys:: >>> p.get('x') # One key b'Hello world!' >>> p.get(['y', 'x']) # List of keys [b'123456', b'Hello world!'] 4. Destroy partd dataset:: >>> p.drop() That's it. Implementations --------------- We can back a partd by an in-memory dictionary:: >>> p = Dict() For larger amounts of data or to share data between processes we back a partd by a directory of files. This uses file-based locks for consistency.:: >>> p = File('/path/to/dataset/') However this can fail for many small writes. In these cases you may wish to buffer one partd with another, keeping a fixed maximum of data in the buffering partd. 
This writes the larger elements of the first partd to the second partd when space runs low:: >>> p = Buffer(Dict(), File(), available_memory=2e9) # 2GB memory buffer You might also want to have many distributed process write to a single partd consistently. This can be done with a server * Server Process:: >>> p = Buffer(Dict(), File(), available_memory=2e9) # 2GB memory buffer >>> s = Server(p, address='ipc://server') * Worker processes:: >>> p = Client('ipc://server') # Client machine talks to remote server Encodings and Compression ------------------------- Once we can robustly and efficiently append bytes to a partd we consider compression and encodings. This is generally available with the ``Encode`` partd, which accepts three functions, one to apply on bytes as they are written, one to apply to bytes as they are read, and one to join bytestreams. Common configurations already exist for common data and compression formats. We may wish to compress and decompress data transparently as we interact with a partd. Objects like ``BZ2``, ``Blosc``, ``ZLib`` and ``Snappy`` exist and take another partd as an argument.:: >>> p = File(...) >>> p = ZLib(p) These work exactly as before, the (de)compression happens automatically. Common data formats like Python lists, numpy arrays, and pandas dataframes are also supported out of the box.:: >>> p = File(...) >>> p = NumPy(p) >>> p.append({'x': np.array([...])}) This lets us forget about bytes and think instead in our normal data types. Composition ----------- In principle we want to compose all of these choices together 1. Write policy: ``Dict``, ``File``, ``Buffer``, ``Client`` 2. Encoding: ``Pickle``, ``Numpy``, ``Pandas``, ... 3. Compression: ``Blosc``, ``Snappy``, ... Partd objects compose by nesting. 
Here we make a partd that writes pickle encoded BZ2 compressed bytes directly to disk:: >>> p = Pickle(BZ2(File('foo'))) We could construct more complex systems that include compression, serialization, buffering, and remote access.:: >>> server = Server(Buffer(Dict(), File(), available_memory=2e0)) >>> client = Pickle(Snappy(Client(server.address))) >>> client.append({'x': [1, 2, 3]}) .. |Build Status| image:: https://github.com/dask/partd/workflows/CI/badge.svg :target: https://github.com/dask/partd/actions?query=workflow%3ACI .. |Version Status| image:: https://img.shields.io/pypi/v/partd.svg :target: https://pypi.python.org/pypi/partd/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Programming Language :: Python :: 3.5 Classifier: Programming Language :: Python :: 3.6 Classifier: Programming Language :: Python :: 3.7 Classifier: Programming Language :: Python :: 3.8 Requires-Python: >=3.5 Provides-Extra: complete ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322851.0 partd-1.2.0/README.rst0000644000076500000240000000736000000000000013304 0ustar00jamesstaffPartD ===== |Build Status| |Version Status| Key-value byte store with appendable values Partd stores key-value pairs. Values are raw bytes. We append on old values. Partd excels at shuffling operations. Operations ---------- PartD has two main operations, ``append`` and ``get``. Example ------- 1. Create a Partd backed by a directory:: >>> import partd >>> p = partd.File('/path/to/new/dataset/') 2. Append key-byte pairs to dataset:: >>> p.append({'x': b'Hello ', 'y': b'123'}) >>> p.append({'x': b'world!', 'y': b'456'}) 3. Get bytes associated to keys:: >>> p.get('x') # One key b'Hello world!' >>> p.get(['y', 'x']) # List of keys [b'123456', b'Hello world!'] 4. Destroy partd dataset:: >>> p.drop() That's it. 
Implementations --------------- We can back a partd by an in-memory dictionary:: >>> p = Dict() For larger amounts of data or to share data between processes we back a partd by a directory of files. This uses file-based locks for consistency.:: >>> p = File('/path/to/dataset/') However this can fail for many small writes. In these cases you may wish to buffer one partd with another, keeping a fixed maximum of data in the buffering partd. This writes the larger elements of the first partd to the second partd when space runs low:: >>> p = Buffer(Dict(), File(), available_memory=2e9) # 2GB memory buffer You might also want to have many distributed process write to a single partd consistently. This can be done with a server * Server Process:: >>> p = Buffer(Dict(), File(), available_memory=2e9) # 2GB memory buffer >>> s = Server(p, address='ipc://server') * Worker processes:: >>> p = Client('ipc://server') # Client machine talks to remote server Encodings and Compression ------------------------- Once we can robustly and efficiently append bytes to a partd we consider compression and encodings. This is generally available with the ``Encode`` partd, which accepts three functions, one to apply on bytes as they are written, one to apply to bytes as they are read, and one to join bytestreams. Common configurations already exist for common data and compression formats. We may wish to compress and decompress data transparently as we interact with a partd. Objects like ``BZ2``, ``Blosc``, ``ZLib`` and ``Snappy`` exist and take another partd as an argument.:: >>> p = File(...) >>> p = ZLib(p) These work exactly as before, the (de)compression happens automatically. Common data formats like Python lists, numpy arrays, and pandas dataframes are also supported out of the box.:: >>> p = File(...) >>> p = NumPy(p) >>> p.append({'x': np.array([...])}) This lets us forget about bytes and think instead in our normal data types. 
Composition ----------- In principle we want to compose all of these choices together 1. Write policy: ``Dict``, ``File``, ``Buffer``, ``Client`` 2. Encoding: ``Pickle``, ``Numpy``, ``Pandas``, ... 3. Compression: ``Blosc``, ``Snappy``, ... Partd objects compose by nesting. Here we make a partd that writes pickle encoded BZ2 compressed bytes directly to disk:: >>> p = Pickle(BZ2(File('foo'))) We could construct more complex systems that include compression, serialization, buffering, and remote access.:: >>> server = Server(Buffer(Dict(), File(), available_memory=2e0)) >>> client = Pickle(Snappy(Client(server.address))) >>> client.append({'x': [1, 2, 3]}) .. |Build Status| image:: https://github.com/dask/partd/workflows/CI/badge.svg :target: https://github.com/dask/partd/actions?query=workflow%3ACI .. |Version Status| image:: https://img.shields.io/pypi/v/partd.svg :target: https://pypi.python.org/pypi/partd/ ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1617903144.1256127 partd-1.2.0/partd/0000755000076500000240000000000000000000000012721 5ustar00jamesstaff././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617902584.0 partd-1.2.0/partd/__init__.py0000644000076500000240000000103600000000000015032 0ustar00jamesstafffrom __future__ import absolute_import from .file import File from .dict import Dict from .buffer import Buffer from .encode import Encode from .pickle import Pickle from .python import Python from .compressed import * from .utils import ignoring with ignoring(ImportError): from .numpy import Numpy with ignoring(ImportError): from .pandas import PandasColumns, PandasBlocks with ignoring(ImportError): from .zmq import Client, Server from ._version import get_versions __version__ = get_versions()['version'] del get_versions ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1617903144.1257348 
partd-1.2.0/partd/_version.py0000644000076500000240000000076100000000000015123 0ustar00jamesstaff # This file was generated by 'versioneer.py' (0.18) from # revision-control system data, or from the parent directory name of an # unpacked source archive. Distribution tarballs contain a pre-generated copy # of this file. import json version_json = ''' { "date": "2021-04-08T12:31:19-0500", "dirty": false, "error": null, "full-revisionid": "9c9ba0a3a91b6b1eeb560615114a1df81fc427c1", "version": "1.2.0" } ''' # END VERSION_JSON def get_versions(): return json.loads(version_json) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617902584.0 partd-1.2.0/partd/buffer.py0000644000076500000240000000706500000000000014554 0ustar00jamesstafffrom .core import Interface from threading import Lock from toolz import merge_with, topk, accumulate, pluck from operator import add from bisect import bisect from collections import defaultdict from .compatibility import Queue, Empty def zero(): return 0 class Buffer(Interface): def __init__(self, fast, slow, available_memory=1e9): self.lock = Lock() self.fast = fast self.slow = slow self.available_memory = available_memory self.lengths = defaultdict(zero) self.memory_usage = 0 Interface.__init__(self) def __getstate__(self): return {'fast': self.fast, 'slow': self.slow, 'memory_usage': self.memory_usage, 'lengths': self.lengths, 'available_memory': self.available_memory} def __setstate__(self, state): Interface.__setstate__(self, state) self.lock = Lock() self.__dict__.update(state) def append(self, data, lock=True, **kwargs): if lock: self.lock.acquire() try: for k, v in data.items(): self.lengths[k] += len(v) self.memory_usage += len(v) self.fast.append(data, lock=False, **kwargs) while self.memory_usage > self.available_memory: keys = keys_to_flush(self.lengths, 0.1, maxcount=20) self.flush(keys) finally: if lock: self.lock.release() def _get(self, keys, lock=True, **kwargs): if lock: self.lock.acquire() 
try: result = list(map(add, self.fast.get(keys, lock=False), self.slow.get(keys, lock=False))) finally: if lock: self.lock.release() return result def _iset(self, key, value, lock=True): """ Idempotent set """ if lock: self.lock.acquire() try: self.fast.iset(key, value, lock=False) finally: if lock: self.lock.release() def _delete(self, keys, lock=True): if lock: self.lock.acquire() try: self.fast.delete(keys, lock=False) self.slow.delete(keys, lock=False) finally: if lock: self.lock.release() def drop(self): self._iset_seen.clear() self.fast.drop() self.slow.drop() def __exit__(self, *args): self.drop() def flush(self, keys=None, block=None): """ Flush keys to disk Parameters ---------- keys: list or None list of keys to flush block: bool (defaults to None) Whether or not to block until all writing is complete If no keys are given then flush all keys """ if keys is None: keys = list(self.lengths) self.slow.append(dict(zip(keys, self.fast.get(keys)))) self.fast.delete(keys) for key in keys: self.memory_usage -= self.lengths[key] del self.lengths[key] def keys_to_flush(lengths, fraction=0.1, maxcount=100000): """ Which keys to remove >>> lengths = {'a': 20, 'b': 10, 'c': 15, 'd': 15, ... 
'e': 10, 'f': 25, 'g': 5} >>> keys_to_flush(lengths, 0.5) ['f', 'a'] """ top = topk(max(len(lengths) // 2, 1), lengths.items(), key=1) total = sum(lengths.values()) cutoff = min(maxcount, max(1, bisect(list(accumulate(add, pluck(1, top))), total * fraction))) result = [k for k, v in top[:cutoff]] assert result return result ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617902584.0 partd-1.2.0/partd/compatibility.py0000644000076500000240000000051200000000000016142 0ustar00jamesstafffrom __future__ import absolute_import import sys if sys.version_info[0] == 3: from io import StringIO unicode = str import pickle from queue import Queue, Empty if sys.version_info[0] == 2: from StringIO import StringIO unicode = unicode import cPickle as pickle from Queue import Queue, Empty ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322844.0 partd-1.2.0/partd/compressed.py0000644000076500000240000000222100000000000015434 0ustar00jamesstafffrom .utils import ignoring from .encode import Encode from functools import partial __all__ = [] def bytes_concat(L): return b''.join(L) with ignoring(ImportError, AttributeError): # In case snappy is not installed, or another package called snappy that does not implement compress / decompress. 
# For example, SnapPy (https://pypi.org/project/snappy/) import snappy Snappy = partial(Encode, snappy.compress, snappy.decompress, bytes_concat) __all__.append('Snappy') with ignoring(ImportError): import zlib ZLib = partial(Encode, zlib.compress, zlib.decompress, bytes_concat) __all__.append('ZLib') with ignoring(ImportError): import bz2 BZ2 = partial(Encode, bz2.compress, bz2.decompress, bytes_concat) __all__.append('BZ2') with ignoring(ImportError): import blosc Blosc = partial(Encode, blosc.compress, blosc.decompress, bytes_concat) __all__.append('Blosc') ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617902584.0 partd-1.2.0/partd/core.py0000644000076500000240000000440300000000000014224 0ustar00jamesstafffrom __future__ import absolute_import import os import shutil import locket import string from toolz import memoize from contextlib import contextmanager from .utils import nested_get, flatten # http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename-in-python valid_chars = "-_.() " + string.ascii_letters + string.digits + os.path.sep def escape_filename(fn): """ Escape text so that it is a valid filename >>> escape_filename('Foo!bar?') 'Foobar' """ return ''.join(filter(valid_chars.__contains__, fn)) def filename(path, key): return os.path.join(path, escape_filename(token(key))) def token(key): """ >>> token('hello') 'hello' >>> token(('hello', 'world')) # doctest: +SKIP 'hello/world' """ if isinstance(key, str): return key elif isinstance(key, tuple): return os.path.join(*map(token, key)) else: return str(key) class Interface(object): def __init__(self): self._iset_seen = set() def __setstate__(self, state): self.__dict__.update(state) self._iset_seen = set() def iset(self, key, value, **kwargs): if key in self._iset_seen: return else: self._iset(key, value, **kwargs) self._iset_seen.add(key) def __enter__(self): return self def __exit__(self, type, value, traceback): self.drop() def iget(self, key): 
return self._get([key], lock=False)[0] def get(self, keys, **kwargs): if not isinstance(keys, list): return self.get([keys], **kwargs)[0] elif any(isinstance(key, list) for key in keys): # nested case flatkeys = list(flatten(keys)) result = self.get(flatkeys, **kwargs) return nested_get(keys, dict(zip(flatkeys, result))) else: return self._get(keys, **kwargs) def delete(self, keys, **kwargs): if not isinstance(keys, list): return self._delete([keys], **kwargs) else: return self._delete(keys, **kwargs) def pop(self, keys, **kwargs): with self.partd.lock: result = self.partd.get(keys, lock=False) self.partd.delete(keys, lock=False) return result ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322844.0 partd-1.2.0/partd/dict.py0000644000076500000240000000324500000000000014222 0ustar00jamesstafffrom .core import Interface from threading import Lock class Dict(Interface): def __init__(self): self.lock = Lock() self.data = dict() Interface.__init__(self) def __getstate__(self): return {'data': self.data} def __setstate__(self, state): Interface.__setstate__(self, state) Dict.__init__(self) self.data = state['data'] def append(self, data, lock=True, **kwargs): if lock: self.lock.acquire() try: for k, v in data.items(): if k not in self.data: self.data[k] = [] self.data[k].append(v) finally: if lock: self.lock.release() def _get(self, keys, lock=True, **kwargs): assert isinstance(keys, (list, tuple, set)) if lock: self.lock.acquire() try: result = [b''.join(self.data.get(key, [])) for key in keys] finally: if lock: self.lock.release() return result def _iset(self, key, value, lock=True): """ Idempotent set """ if lock: self.lock.acquire() try: self.data[key] = [value] finally: if lock: self.lock.release() def _delete(self, keys, lock=True): if lock: self.lock.acquire() try: for key in keys: if key in self.data: del self.data[key] finally: if lock: self.lock.release() def drop(self): self._iset_seen.clear() self.data.clear() def 
__exit__(self, *args): self.drop() ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322844.0 partd-1.2.0/partd/encode.py0000644000076500000240000000240700000000000014533 0ustar00jamesstafffrom .core import Interface from .file import File from toolz import valmap from .utils import frame, framesplit class Encode(Interface): def __init__(self, encode, decode, join, partd=None): if not partd or isinstance(partd, str): partd = File(partd) self.partd = partd self.encode = encode self.decode = decode self.join = join Interface.__init__(self) def __getstate__(self): return self.__dict__ __setstate__ = Interface.__setstate__ def append(self, data, **kwargs): data = valmap(self.encode, data) data = valmap(frame, data) self.partd.append(data, **kwargs) def _get(self, keys, **kwargs): raw = self.partd._get(keys, **kwargs) return [self.join([self.decode(frame) for frame in framesplit(chunk)]) for chunk in raw] def delete(self, keys, **kwargs): return self.partd.delete(keys, **kwargs) def _iset(self, key, value, **kwargs): return self.partd.iset(key, frame(self.encode(value)), **kwargs) def drop(self): return self.partd.drop() @property def lock(self): return self.partd.lock def __exit__(self, *args): self.drop() self.partd.__exit__(*args) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617902584.0 partd-1.2.0/partd/file.py0000644000076500000240000000762600000000000014225 0ustar00jamesstafffrom __future__ import absolute_import import atexit import os import shutil import string import tempfile from .core import Interface import locket from .utils import ignoring class File(Interface): def __init__(self, path=None, dir=None): if not path: path = tempfile.mkdtemp(suffix='.partd', dir=dir) cleanup_files.append(path) self._explicitly_given_path = False else: self._explicitly_given_path = True self.path = path if not os.path.exists(path): with ignoring(OSError): os.makedirs(path) self.lock = 
locket.lock_file(self.filename('.lock')) Interface.__init__(self) def __getstate__(self): return {'path': self.path} def __setstate__(self, state): Interface.__setstate__(self, state) File.__init__(self, state['path']) def append(self, data, lock=True, fsync=False, **kwargs): if lock: self.lock.acquire() try: for k, v in data.items(): fn = self.filename(k) if not os.path.exists(os.path.dirname(fn)): os.makedirs(os.path.dirname(fn)) with open(fn, 'ab') as f: f.write(v) if fsync: os.fsync(f) finally: if lock: self.lock.release() def _get(self, keys, lock=True, **kwargs): assert isinstance(keys, (list, tuple, set)) if lock: self.lock.acquire() try: result = [] for key in keys: try: with open(self.filename(key), 'rb') as f: result.append(f.read()) except IOError: result.append(b'') finally: if lock: self.lock.release() return result def _iset(self, key, value, lock=True): """ Idempotent set """ fn = self.filename(key) if not os.path.exists(os.path.dirname(fn)): os.makedirs(os.path.dirname(fn)) if lock: self.lock.acquire() try: with open(self.filename(key), 'wb') as f: f.write(value) finally: if lock: self.lock.release() def _delete(self, keys, lock=True): if lock: self.lock.acquire() try: for key in keys: path = filename(self.path, key) if os.path.exists(path): os.remove(path) finally: if lock: self.lock.release() def drop(self): if os.path.exists(self.path): shutil.rmtree(self.path) self._iset_seen.clear() os.mkdir(self.path) def filename(self, key): return filename(self.path, key) def __exit__(self, *args): self.drop() os.rmdir(self.path) def __del__(self): if not self._explicitly_given_path: self.drop() os.rmdir(self.path) def filename(path, key): return os.path.join(path, escape_filename(token(key))) # http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename-in-python valid_chars = "-_.() " + string.ascii_letters + string.digits + os.path.sep def escape_filename(fn): """ Escape text so that it is a valid filename >>> escape_filename('Foo!bar?') 
'Foobar' """ return ''.join(filter(valid_chars.__contains__, fn)) def token(key): """ >>> token('hello') 'hello' >>> token(('hello', 'world')) # doctest: +SKIP 'hello/world' """ if isinstance(key, str): return key elif isinstance(key, tuple): return os.path.join(*map(token, key)) else: return str(key) cleanup_files = list() @atexit.register def cleanup(): for fn in cleanup_files: if os.path.exists(fn): shutil.rmtree(fn) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617902584.0 partd-1.2.0/partd/numpy.py0000644000076500000240000001017100000000000014443 0ustar00jamesstaff""" Store arrays We put arrays on disk as raw bytes, extending along the first dimension. Alongside each array x we ensure the value x.dtype which stores the string description of the array's dtype. """ from __future__ import absolute_import import numpy as np from toolz import valmap, identity, partial from .compatibility import pickle from .core import Interface from .file import File from .utils import frame, framesplit, suffix, ignoring def serialize_dtype(dt): """ Serialize dtype to bytes >>> serialize_dtype(np.dtype('i4')) b'>> serialize_dtype(np.dtype('M8[us]')) b'>> parse_dtype(b'i4') dtype('int32') >>> parse_dtype(b"[('a', 'i4')]") dtype([('a', '= (0, 5, 2): unpack_kwargs = {'raw': False} else: unpack_kwargs = {'encoding': 'utf-8'} blocks = [msgpack.unpackb(f, **unpack_kwargs) for f in framesplit(bytes)] except Exception: blocks = [pickle.loads(f) for f in framesplit(bytes)] result = np.empty(sum(map(len, blocks)), dtype='O') i = 0 for block in blocks: result[i:i + len(block)] = block i += len(block) return result else: result = np.frombuffer(bytes, dtype) if copy: result = result.copy() return result compress_text = identity decompress_text = identity compress_bytes = lambda bytes, itemsize: bytes decompress_bytes = identity with ignoring(ImportError): import blosc blosc.set_nthreads(1) compress_bytes = blosc.compress decompress_bytes = blosc.decompress 
compress_text = partial(blosc.compress, typesize=1) decompress_text = blosc.decompress with ignoring(ImportError): from snappy import compress as compress_text from snappy import decompress as decompress_text def compress(bytes, dtype): if dtype == 'O': return compress_text(bytes) else: return compress_bytes(bytes, dtype.itemsize) def decompress(bytes, dtype): if dtype == 'O': return decompress_text(bytes) else: return decompress_bytes(bytes) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617902584.0 partd-1.2.0/partd/pandas.py0000644000076500000240000001564600000000000014555 0ustar00jamesstafffrom __future__ import absolute_import from functools import partial import numpy as np import pandas as pd from pandas.core.internals import create_block_manager_from_blocks, make_block from . import numpy as pnp from .core import Interface from .compatibility import pickle from .encode import Encode from .utils import extend, framesplit, frame try: # pandas >= 0.24.0 from pandas.api.types import is_extension_array_dtype except ImportError: def is_extension_array_dtype(dtype): return False try: # Some `ExtensionArray`s can have a `.dtype` which is not a `ExtensionDtype` # (e.g. they can be backed by a NumPy dtype). For these cases we check # whether the instance is a `ExtensionArray`. # https://github.com/dask/partd/issues/48 from pandas.api.extensions import ExtensionArray def is_extension_array(x): return isinstance(x, ExtensionArray) except ImportError: def is_extension_array(x): return False dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL) class PandasColumns(Interface): def __init__(self, partd=None): self.partd = pnp.Numpy(partd) Interface.__init__(self) def append(self, data, **kwargs): for k, df in data.items(): self.iset(extend(k, '.columns'), dumps(list(df.columns))) self.iset(extend(k, '.index-name'), dumps(df.index.name)) # TODO: don't use values, it does some work. 
Look at _blocks instead # pframe/cframe do this well arrays = dict((extend(k, col), df[col].values) for k, df in data.items() for col in df.columns) arrays.update(dict((extend(k, '.index'), df.index.values) for k, df in data.items())) # TODO: handle categoricals self.partd.append(arrays, **kwargs) def _get(self, keys, columns=None, **kwargs): if columns is None: columns = self.partd.partd.get([extend(k, '.columns') for k in keys], **kwargs) columns = list(map(pickle.loads, columns)) else: columns = [columns] * len(keys) index_names = self.partd.partd.get([extend(k, '.index-name') for k in keys], **kwargs) index_names = map(pickle.loads, index_names) keys = [[extend(k, '.index'), [extend(k, col) for col in cols]] for k, cols in zip(keys, columns)] arrays = self.partd.get(keys, **kwargs) return [pd.DataFrame(dict(zip(cols, arrs)), columns=cols, index=pd.Index(index, name=iname)) for iname, (index, arrs), cols in zip(index_names, arrays, columns)] def __getstate__(self): return {'partd': self.partd} def _iset(self, key, value): return self.partd._iset(key, value) def drop(self): return self.partd.drop() @property def lock(self): return self.partd.partd.lock def __exit__(self, *args): self.drop() self.partd.__exit__(self, *args) def __del__(self): self.partd.__del__() def index_to_header_bytes(ind): # These have special `__reduce__` methods, just use pickle if isinstance(ind, (pd.DatetimeIndex, pd.MultiIndex, pd.RangeIndex)): return None, dumps(ind) if isinstance(ind, pd.CategoricalIndex): cat = (ind.ordered, ind.categories) values = ind.codes else: cat = None values = ind.values header = (type(ind), ind._get_attributes_dict(), values.dtype, cat) bytes = pnp.compress(pnp.serialize(values), values.dtype) return header, bytes def index_from_header_bytes(header, bytes): if header is None: return pickle.loads(bytes) typ, attr, dtype, cat = header data = pnp.deserialize(pnp.decompress(bytes, dtype), dtype, copy=True) if cat: data = pd.Categorical.from_codes(data, cat[1], 
ordered=cat[0]) return typ.__new__(typ, data=data, **attr) def block_to_header_bytes(block): values = block.values try: # pandas >= 0.19 from pandas.api.types import is_datetime64tz_dtype except ImportError: from pandas.core.common import is_datetime64tz_dtype if isinstance(values, pd.Categorical): extension = ('categorical_type', (values.ordered, values.categories)) values = values.codes elif is_datetime64tz_dtype(block): extension = ('datetime64_tz_type', (block.values.tzinfo,)) values = values.view('i8') elif is_extension_array_dtype(block.dtype) or is_extension_array(values): extension = ("other", ()) else: extension = ('numpy_type', ()) header = (block.mgr_locs.as_array, values.dtype, values.shape, extension) if extension == ("other", ()): bytes = pickle.dumps(values) else: bytes = pnp.compress(pnp.serialize(values), values.dtype) return header, bytes def block_from_header_bytes(header, bytes): placement, dtype, shape, (extension_type, extension_values) = header if extension_type == "other": values = pickle.loads(bytes) else: values = pnp.deserialize(pnp.decompress(bytes, dtype), dtype, copy=True).reshape(shape) if extension_type == 'categorical_type': values = pd.Categorical.from_codes(values, extension_values[1], ordered=extension_values[0]) elif extension_type == 'datetime64_tz_type': tz_info = extension_values[0] values = pd.DatetimeIndex(values).tz_localize('utc').tz_convert( tz_info) return make_block(values, placement=placement) def serialize(df): """ Serialize and compress a Pandas DataFrame Uses Pandas blocks, snappy, and blosc to deconstruct an array into bytes """ col_header, col_bytes = index_to_header_bytes(df.columns) ind_header, ind_bytes = index_to_header_bytes(df.index) headers = [col_header, ind_header] bytes = [col_bytes, ind_bytes] for block in df._data.blocks: h, b = block_to_header_bytes(block) headers.append(h) bytes.append(b) frames = [dumps(headers)] + bytes return b''.join(map(frame, frames)) def deserialize(bytes): """ Deserialize 
and decompress bytes back to a pandas DataFrame """ frames = list(framesplit(bytes)) headers = pickle.loads(frames[0]) bytes = frames[1:] axes = [index_from_header_bytes(headers[0], bytes[0]), index_from_header_bytes(headers[1], bytes[1])] blocks = [block_from_header_bytes(h, b) for (h, b) in zip(headers[2:], bytes[2:])] return pd.DataFrame(create_block_manager_from_blocks(blocks, axes)) def join(dfs): if not dfs: return pd.DataFrame() else: return pd.concat(dfs) PandasBlocks = partial(Encode, serialize, deserialize, join) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617902584.0 partd-1.2.0/partd/pickle.py0000644000076500000240000000064400000000000014546 0ustar00jamesstaff""" get/put functions that consume/produce Python lists using Pickle to serialize """ from __future__ import absolute_import from .compatibility import pickle from .encode import Encode from functools import partial def concat(lists): return sum(lists, []) Pickle = partial(Encode, partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL), pickle.loads, concat) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617902584.0 partd-1.2.0/partd/python.py0000644000076500000240000000170100000000000014613 0ustar00jamesstaff""" get/put functions that consume/produce Python lists using msgpack or pickle to serialize. First we try msgpack (it's faster). If that fails then we default to pickle. 
""" from __future__ import absolute_import from .compatibility import pickle try: from pandas import msgpack except ImportError: try: import msgpack except ImportError: msgpack = False from .encode import Encode from functools import partial def dumps(x): try: return msgpack.packb(x, use_bin_type=True) except: return pickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL) def loads(x): try: if msgpack.version >= (0, 5, 2): unpack_kwargs = {'raw': False} else: unpack_kwargs = {'encoding': 'utf-8'} return msgpack.unpackb(x, **unpack_kwargs) except: return pickle.loads(x) def concat(lists): return sum(lists, []) Python = partial(Encode, dumps, loads, concat) ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1617903144.1235414 partd-1.2.0/partd/tests/0000755000076500000240000000000000000000000014063 5ustar00jamesstaff././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322844.0 partd-1.2.0/partd/tests/test_buffer.py0000644000076500000240000000303000000000000016741 0ustar00jamesstafffrom partd.dict import Dict from partd.file import File from partd.buffer import Buffer, keys_to_flush import pickle import shutil import os def test_partd(): a = Dict() b = Dict() with Buffer(a, b, available_memory=10) as p: p.append({'x': b'Hello', 'y': b'abc'}) assert a.get(['x', 'y']) == [b'Hello', b'abc'] p.append({'x': b'World!', 'y': b'def'}) assert a.get(['x', 'y']) == [b'', b'abcdef'] assert b.get(['x', 'y']) == [b'HelloWorld!', b''] result = p.get(['y', 'x']) assert result == [b'abcdef', b'HelloWorld!'] assert p.get('z') == b'' with p.lock: # uh oh, possible deadlock result = p.get(['x'], lock=False) def test_keys_to_flush(): lengths = {'a': 20, 'b': 10, 'c': 15, 'd': 15, 'e': 10, 'f': 25, 'g': 5} assert keys_to_flush(lengths, 0.5) == ['f', 'a'] def test_pickle(): with Dict() as a: with File() as b: c = Buffer(a, b) c.append({'x': b'123'}) d = pickle.loads(pickle.dumps(c)) assert d.get('x') == c.get('x') pickled_attrs = 
('memory_usage', 'lengths', 'available_memory') for attr in pickled_attrs: assert hasattr(d, attr) assert getattr(d, attr) == getattr(c, attr) # special case Dict and File -- some attrs do not pickle assert hasattr(d, 'fast') assert d.fast.data == c.fast.data assert hasattr(d, 'slow') assert d.slow.path == c.slow.path ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322844.0 partd-1.2.0/partd/tests/test_compressed.py0000644000076500000240000000133600000000000017643 0ustar00jamesstafffrom partd.compressed import ZLib import shutil import os import pickle def test_partd(): with ZLib() as p: p.append({'x': b'Hello', 'y': b'abc'}) p.append({'x': b'World!', 'y': b'def'}) assert os.path.exists(p.partd.filename('x')) assert os.path.exists(p.partd.filename('y')) result = p.get(['y', 'x']) assert result == [b'abcdef', b'HelloWorld!'] assert p.get('z') == b'' with p.lock: # uh oh, possible deadlock result = p.get(['x'], lock=False) assert not os.path.exists(p.partd.path) def test_pickle(): with ZLib() as p: p.append({'x': b'123'}) q = pickle.loads(pickle.dumps(p)) assert q.get('x') == b'123' ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322844.0 partd-1.2.0/partd/tests/test_dict.py0000644000076500000240000000165400000000000016425 0ustar00jamesstafffrom partd.dict import Dict import shutil import os def test_partd(): with Dict() as p: p.append({'x': b'Hello', 'y': b'abc'}) p.append({'x': b'World!', 'y': b'def'}) result = p.get(['y', 'x']) assert result == [b'abcdef', b'HelloWorld!'] assert p.get('z') == b'' with p.lock: # uh oh, possible deadlock result = p.get(['x'], lock=False) def test_key_tuple(): with Dict() as p: p.append({('a', 'b'): b'123'}) assert p.get(('a', 'b')) == b'123' def test_iset(): with Dict() as p: p.iset('x', b'123') assert 'x' in p._iset_seen assert 'y' not in p._iset_seen p.iset('x', b'123') p.iset('x', b'123') assert p.get('x') == b'123' def test_delete_non_existent_key(): with 
Dict() as p: p.append({'x': b'123'}) p.delete(['x', 'y']) assert p.get(['x', 'y']) == [b'', b''] ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322844.0 partd-1.2.0/partd/tests/test_encode.py0000644000076500000240000000127400000000000016735 0ustar00jamesstafffrom partd.file import File from partd.encode import Encode import zlib import shutil import os def test_partd(): with Encode(zlib.compress, zlib.decompress, b''.join) as p: p.append({'x': b'Hello', 'y': b'abc'}) p.append({'x': b'World!', 'y': b'def'}) result = p.get(['y', 'x']) assert result == [b'abcdef', b'HelloWorld!'] assert p.get('z') == b'' with p.lock: # uh oh, possible deadlock result = p.get(['x'], lock=False) def test_ensure(): with Encode(zlib.compress, zlib.decompress, b''.join) as p: p.iset('x', b'123') p.iset('x', b'123') p.iset('x', b'123') assert p.get('x') == b'123' ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322844.0 partd-1.2.0/partd/tests/test_file.py0000644000076500000240000000341000000000000016411 0ustar00jamesstafffrom partd.file import File import shutil import os def test_partd(): with File() as p: p.append({'x': b'Hello', 'y': b'abc'}) p.append({'x': b'World!', 'y': b'def'}) assert os.path.exists(p.filename('x')) assert os.path.exists(p.filename('y')) result = p.get(['y', 'x']) assert result == [b'abcdef', b'HelloWorld!'] assert p.get('z') == b'' with p.lock: # uh oh, possible deadlock result = p.get(['x'], lock=False) assert not os.path.exists(p.path) def test_key_tuple(): with File() as p: p.append({('a', 'b'): b'123'}) assert os.path.exists(p.filename(('a', 'b'))) def test_iset(): with File() as p: p.iset('x', b'123') assert 'x' in p._iset_seen assert 'y' not in p._iset_seen p.iset('x', b'123') p.iset('x', b'123') assert p.get('x') == b'123' def test_nested_get(): with File() as p: p.append({'x': b'1', 'y': b'2', 'z': b'3'}) assert p.get(['x', ['y', 'z']]) == [b'1', [b'2', b'3']] def test_drop(): with 
File() as p: p.append({'x': b'123'}) p.iset('y', b'abc') assert p.get('x') == b'123' assert p.get('y') == b'abc' p.drop() assert p.get('x') == b'' assert p.get('y') == b'' p.append({'x': b'123'}) p.iset('y', b'def') assert p.get('x') == b'123' assert p.get('y') == b'def' def test_del(): f = File() assert f.path assert os.path.exists(f.path) f.__del__() assert not os.path.exists(f.path) with File('Foo') as p: p.__del__() assert os.path.exists(p.path) def test_specify_dirname(): with File(dir=os.getcwd()) as f: assert os.getcwd() in f.path ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617902584.0 partd-1.2.0/partd/tests/test_numpy.py0000644000076500000240000000445300000000000016652 0ustar00jamesstafffrom __future__ import absolute_import import pytest np = pytest.importorskip('numpy') # noqa import pickle import partd from partd.numpy import Numpy def test_numpy(): dt = np.dtype([('a', 'i4'), ('b', 'i2'), ('c', 'f8')]) with Numpy() as p: p.append({'a': np.array([10, 20, 30], dtype=dt['a']), 'b': np.array([ 1, 2, 3], dtype=dt['b']), 'c': np.array([.1, .2, .3], dtype=dt['c'])}) p.append({'a': np.array([70, 80, 90], dtype=dt['a']), 'b': np.array([ 7, 8, 9], dtype=dt['b']), 'c': np.array([.7, .8, .9], dtype=dt['c'])}) result = p.get(['a', 'c']) assert (result[0] == np.array([10, 20, 30, 70, 80, 90],dtype=dt['a'])).all() assert (result[1] == np.array([.1, .2, .3, .7, .8, .9],dtype=dt['c'])).all() with p.lock: # uh oh, possible deadlock result = p.get(['a'], lock=False) def test_nested(): with Numpy() as p: p.append({'x': np.array([1, 2, 3]), ('y', 1): np.array([4, 5, 6]), ('z', 'a', 3): np.array([.1, .2, .3])}) assert (p.get(('z', 'a', 3)) == np.array([.1, .2, .3])).all() def test_serialization(): with Numpy() as p: p.append({'x': np.array([1, 2, 3])}) q = pickle.loads(pickle.dumps(p)) assert (q.get('x') == [1, 2, 3]).all() array_of_lists = np.empty(3, dtype='O') array_of_lists[:] = [[1, 2], [3, 4], [5, 6]] @pytest.mark.parametrize('x', 
[np.array(['Alice', 'Bob', 'Charlie'], dtype='O'), array_of_lists]) def test_object_dtype(x): with Numpy() as p: p.append({'x': x}) p.append({'x': x}) assert isinstance(p.get('x'), np.ndarray) assert (p.get('x') == np.concatenate([x, x])).all() def test_datetime_types(): x = np.array(['2014-01-01T12:00:00'], dtype='M8[us]') y = np.array(['2014-01-01T12:00:00'], dtype='M8[s]') with Numpy() as p: p.append({'x': x, 'y': y}) assert p.get('x').dtype == x.dtype assert p.get('y').dtype == y.dtype def test_non_utf8_bytes(): a = np.array([b'\xc3\x28', b'\xa0\xa1', b'\xe2\x28\xa1', b'\xe2\x82\x28', b'\xf0\x28\x8c\xbc'], dtype='O') s = partd.numpy.serialize(a) assert (partd.numpy.deserialize(s, 'O') == a).all() ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617902584.0 partd-1.2.0/partd/tests/test_pandas.py0000644000076500000240000000761300000000000016751 0ustar00jamesstafffrom __future__ import absolute_import import pytest pytest.importorskip('pandas') # noqa import numpy as np import pandas as pd import pandas.util.testing as tm import os from partd.pandas import PandasColumns, PandasBlocks, serialize, deserialize df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [1., 2., 3.], 'c': ['x', 'y', 'x']}, columns=['a', 'b', 'c'], index=pd.Index([1, 2, 3], name='myindex')) df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [10., 20., 30.], 'c': ['X', 'Y', 'X']}, columns=['a', 'b', 'c'], index=pd.Index([10, 20, 30], name='myindex')) def test_PandasColumns(): with PandasColumns() as p: assert os.path.exists(p.partd.partd.path) p.append({'x': df1, 'y': df2}) p.append({'x': df2, 'y': df1}) assert os.path.exists(p.partd.partd.filename('x')) assert os.path.exists(p.partd.partd.filename(('x', 'a'))) assert os.path.exists(p.partd.partd.filename(('x', '.index'))) assert os.path.exists(p.partd.partd.filename('y')) result = p.get(['y', 'x']) tm.assert_frame_equal(result[0], pd.concat([df2, df1])) tm.assert_frame_equal(result[1], pd.concat([df1, df2])) with p.lock: # uh 
oh, possible deadlock result = p.get(['x'], lock=False) assert not os.path.exists(p.partd.partd.path) def test_column_selection(): with PandasColumns('foo') as p: p.append({'x': df1, 'y': df2}) p.append({'x': df2, 'y': df1}) result = p.get('x', columns=['c', 'b']) tm.assert_frame_equal(result, pd.concat([df1, df2])[['c', 'b']]) def test_PandasBlocks(): with PandasBlocks() as p: assert os.path.exists(p.partd.path) p.append({'x': df1, 'y': df2}) p.append({'x': df2, 'y': df1}) assert os.path.exists(p.partd.filename('x')) assert os.path.exists(p.partd.filename('y')) result = p.get(['y', 'x']) tm.assert_frame_equal(result[0], pd.concat([df2, df1])) tm.assert_frame_equal(result[1], pd.concat([df1, df2])) with p.lock: # uh oh, possible deadlock result = p.get(['x'], lock=False) assert not os.path.exists(p.partd.path) @pytest.mark.parametrize('ordered', [False, True]) def test_serialize_categoricals(ordered): frame = pd.DataFrame({'x': [1, 2, 3, 4], 'y': pd.Categorical(['c', 'a', 'b', 'a'], ordered=ordered)}, index=pd.Categorical(['x', 'y', 'z', 'x'], ordered=ordered)) frame.index.name = 'foo' frame.columns.name = 'bar' for ind, df in [(0, frame), (1, frame.T)]: df2 = deserialize(serialize(df)) tm.assert_frame_equal(df, df2) def test_serialize_multi_index(): df = pd.DataFrame({'x': ['a', 'b', 'c', 'a', 'b', 'c'], 'y': [1, 2, 3, 4, 5, 6], 'z': [7., 8, 9, 10, 11, 12]}) df = df.groupby([df.x, df.y]).sum() df.index.name = 'foo' df.columns.name = 'bar' df2 = deserialize(serialize(df)) tm.assert_frame_equal(df, df2) @pytest.mark.parametrize('base', [ pd.Timestamp('1987-03-3T01:01:01+0001'), pd.Timestamp('1987-03-03 01:01:01-0600', tz='US/Central'), ]) def test_serialize(base): df = pd.DataFrame({'x': [ base + pd.Timedelta(seconds=i) for i in np.random.randint(0, 1000, size=10)], 'y': list(range(10)), 'z': pd.date_range('2017', periods=10)}) df2 = deserialize(serialize(df)) tm.assert_frame_equal(df, df2) def test_other_extension_types(): pytest.importorskip("pandas", 
minversion="0.25.0") a = pd.array([pd.Period("2000"), pd.Period("2001")]) df = pd.DataFrame({"A": a}) df2 = deserialize(serialize(df)) tm.assert_frame_equal(df, df2) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322844.0 partd-1.2.0/partd/tests/test_partd.py0000644000076500000240000000240100000000000016603 0ustar00jamesstafffrom partd import File from partd.core import token, escape_filename, filename from partd import core import os import shutil from contextlib import contextmanager def test_partd(): path = 'tmp.partd' with File(path) as p: p.append({'x': b'Hello', 'y': b'abc'}) p.append({'x': b'World!', 'y': b'def'}) assert os.path.exists(p.filename('x')) assert os.path.exists(p.filename('y')) result = p.get(['y', 'x']) assert result == [b'abcdef', b'HelloWorld!'] assert p.get('z') == b'' with p.lock: # uh oh, possible deadlock result = p.get(['x'], lock=False) assert not os.path.exists(path) def test_key_tuple(): with File('foo') as p: p.append({('a', 'b'): b'123'}) assert os.path.exists(os.path.join(p.path, 'a', 'b')) def test_ensure(): with File('foo') as p: p.iset('x', b'123') p.iset('x', b'123') p.iset('x', b'123') assert p.get('x') == b'123' def test_filenames(): assert token('hello') == 'hello' assert token(('hello', 'world')) == os.path.join('hello', 'world') assert escape_filename(os.path.join('a', 'b')) == os.path.join('a', 'b') assert filename('dir', ('a', 'b')) == os.path.join('dir', 'a', 'b') ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322844.0 partd-1.2.0/partd/tests/test_pickle.py0000644000076500000240000000137200000000000016746 0ustar00jamesstafffrom partd.pickle import Pickle import os import shutil def test_pickle(): with Pickle() as p: p.append({'x': ['Hello', 'World!'], 'y': [1, 2, 3]}) p.append({'x': ['Alice', 'Bob!'], 'y': [4, 5, 6]}) assert os.path.exists(p.partd.filename('x')) assert os.path.exists(p.partd.filename('y')) result = p.get(['y', 'x']) assert 
result == [[1, 2, 3, 4, 5, 6], ['Hello', 'World!', 'Alice', 'Bob!']] with p.lock: # uh oh, possible deadlock result = p.get(['x'], lock=False) assert not os.path.exists(p.partd.path) def test_ensure(): with Pickle() as p: p.iset('x', [1, 2, 3]) p.iset('x', [1, 2, 3]) assert p.get('x') == [1, 2, 3] ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322844.0 partd-1.2.0/partd/tests/test_python.py0000644000076500000240000000037000000000000017015 0ustar00jamesstafffrom partd.python import dumps, loads import os import shutil from math import sin def test_pack_unpack(): data = [1, 2, b'Hello', 'Hello'] assert loads(dumps(data)) == data data = [1, 2, sin] assert loads(dumps(data)) == data ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322844.0 partd-1.2.0/partd/tests/test_utils.py0000644000076500000240000000040200000000000016630 0ustar00jamesstafffrom partd.utils import frame, framesplit import struct def test_frame(): assert frame(b'Hello') == struct.pack('Q', 5) + b'Hello' def test_framesplit(): L = [b'Hello', b'World!', b'123'] assert list(framesplit(b''.join(map(frame, L)))) == L ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322844.0 partd-1.2.0/partd/tests/test_zmq.py0000644000076500000240000000640500000000000016310 0ustar00jamesstaffimport pytest pytest.importorskip('zmq') from partd.zmq import Server, keys_to_flush, File, Client from partd import core, Dict from threading import Thread from time import sleep from contextlib import contextmanager import pickle import os import shutil def test_server(): s = Server() try: s.start() s.append({'x': b'abc', 'y': b'1234'}) s.append({'x': b'def', 'y': b'5678'}) assert s.get(['x']) == [b'abcdef'] assert s.get(['x', 'y']) == [b'abcdef', b'12345678'] assert s.get(['x']) == [b'abcdef'] finally: s.close() def dont_test_flow_control(): path = 'bar' if os.path.exists('bar'): shutil.rmtree('bar') s = Server('bar', 
available_memory=1, n_outstanding_writes=3, start=False) p = Client(s.address) try: listen_thread = Thread(target=s.listen) listen_thread.start() """ Don't start these threads self._write_to_disk_thread = Thread(target=self._write_to_disk) self._write_to_disk_thread.start() self._free_frozen_sockets_thread = Thread(target=self._free_frozen_sockets) self._free_frozen_sockets_thread.start() """ p.append({'x': b'12345'}) sleep(0.1) assert s._out_disk_buffer.qsize() == 1 p.append({'x': b'12345'}) p.append({'x': b'12345'}) sleep(0.1) assert s._out_disk_buffer.qsize() == 3 held_append = Thread(target=p.append, args=({'x': b'123'},)) held_append.start() sleep(0.1) assert held_append.isAlive() # held! assert not s._frozen_sockets.empty() write_to_disk_thread = Thread(target=s._write_to_disk) write_to_disk_thread.start() free_frozen_sockets_thread = Thread(target=s._free_frozen_sockets) free_frozen_sockets_thread.start() sleep(0.2) assert not held_append.isAlive() assert s._frozen_sockets.empty() finally: s.close() @contextmanager def partd_server(**kwargs): with Server(**kwargs) as server: with Client(server.address) as p: yield (p, server) def test_partd_object(): with partd_server() as (p, server): p.append({'x': b'Hello', 'y': b'abc'}) p.append({'x': b'World!', 'y': b'def'}) result = p.get(['y', 'x']) assert result == [b'abcdef', b'HelloWorld!'] def test_delete(): with partd_server() as (p, server): p.append({'x': b'Hello'}) assert p.get('x') == b'Hello' p.delete(['x']) assert p.get('x') == b'' def test_iset(): with partd_server() as (p, server): p.iset('x', b'111') p.iset('x', b'111') assert p.get('x') == b'111' def test_tuple_keys(): with partd_server() as (p, server): p.append({('x', 'y'): b'123'}) assert p.get(('x', 'y')) == b'123' def test_serialization(): with partd_server() as (p, server): p.append({'x': b'123'}) q = pickle.loads(pickle.dumps(p)) assert q.get('x') == b'123' def test_drop(): with partd_server() as (p, server): p.append({'x': b'123'}) p.drop() 
assert p.get('x') == b'' def dont_test_server_autocreation(): with Client() as p: p.append({'x': b'123'}) assert p.get('x') == b'123' ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322851.0 partd-1.2.0/partd/utils.py0000644000076500000240000000731100000000000014435 0ustar00jamesstafffrom contextlib import contextmanager import os import shutil import tempfile import struct def raises(exc, lamda): try: lamda() return False except exc: return True @contextmanager def tmpfile(extension=''): extension = '.' + extension.lstrip('.') handle, filename = tempfile.mkstemp(extension) os.close(handle) os.remove(filename) try: yield filename finally: if os.path.exists(filename): if os.path.isdir(filename): shutil.rmtree(filename) else: os.remove(filename) def frame(bytes): """ Pack the length of the bytes in front of the bytes TODO: This does a full copy. This should maybe be inlined somehow wherever this gets used instead. My laptop shows a data bandwidth of 2GB/s """ return struct.pack('Q', len(bytes)) + bytes def framesplit(bytes): """ Split buffer into frames of concatenated chunks >>> data = frame(b'Hello') + frame(b'World') >>> list(framesplit(data)) # doctest: +SKIP [b'Hello', b'World'] """ i = 0; n = len(bytes) chunks = list() while i < n: nbytes = struct.unpack('Q', bytes[i:i+8])[0] i += 8 yield bytes[i: i + nbytes] i += nbytes def partition_all(n, bytes): """ Partition bytes into evenly sized blocks The final block holds the remainder and so may not be of equal size >>> list(partition_all(2, b'Hello')) [b'He', b'll', b'o'] See Also: toolz.partition_all """ if len(bytes) < n: # zero copy fast common case yield bytes else: for i in range(0, len(bytes), n): yield bytes[i: i+n] @contextmanager def ignoring(*exc): try: yield except exc: pass @contextmanager def do_nothing(*args, **kwargs): yield def nested_get(ind, coll, lazy=False): """ Get nested index from collection Examples -------- >>> nested_get(1, 'abc') 'b' >>> nested_get([1, 
0], 'abc') ['b', 'a'] >>> nested_get([[1, 0], [0, 1]], 'abc') [['b', 'a'], ['a', 'b']] """ if isinstance(ind, list): if lazy: return (nested_get(i, coll, lazy=lazy) for i in ind) else: return [nested_get(i, coll, lazy=lazy) for i in ind] else: return coll[ind] def flatten(seq): """ >>> list(flatten([1])) [1] >>> list(flatten([[1, 2], [1, 2]])) [1, 2, 1, 2] >>> list(flatten([[[1], [2]], [[1], [2]]])) [1, 2, 1, 2] >>> list(flatten(((1, 2), (1, 2)))) # Don't flatten tuples [(1, 2), (1, 2)] >>> list(flatten((1, 2, [3, 4]))) # support heterogeneous [1, 2, 3, 4] """ for item in seq: if isinstance(item, list): for item2 in flatten(item): yield item2 else: yield item def suffix(key, term): """ suffix a key with a suffix Works if they key is a string or a tuple >>> suffix('x', '.dtype') 'x.dtype' >>> suffix(('a', 'b', 'c'), '.dtype') ('a', 'b', 'c.dtype') """ if isinstance(key, str): return key + term elif isinstance(key, tuple): return key[:-1] + (suffix(key[-1], term),) else: return suffix(str(key), term) def extend(key, term): """ extend a key with a another element in a tuple Works if they key is a string or a tuple >>> extend('x', '.dtype') ('x', '.dtype') >>> extend(('a', 'b', 'c'), '.dtype') ('a', 'b', 'c', '.dtype') """ if isinstance(term, tuple): pass elif isinstance(term, str): term = (term,) else: term = (str(term),) if not isinstance(key, tuple): key = (key,) return key + term ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617902584.0 partd-1.2.0/partd/zmq.py0000644000076500000240000002277100000000000014113 0ustar00jamesstafffrom __future__ import absolute_import, print_function import zmq import logging from itertools import chain from bisect import bisect import socket from operator import add from time import sleep, time from toolz import accumulate, topk, pluck, merge, keymap import uuid from collections import defaultdict from contextlib import contextmanager from threading import Thread, Lock from datetime import datetime 
from multiprocessing import Process import traceback import sys from .dict import Dict from .file import File from .buffer import Buffer from . import core from .compatibility import Queue, Empty, unicode from .utils import ignoring tuple_sep = b'-|-' logger = logging.getLogger(__name__) @contextmanager def logerrors(): try: yield except Exception as e: logger.exception(e) raise class Server(object): def __init__(self, partd=None, bind=None, start=True, block=False, hostname=None): self.context = zmq.Context() if partd is None: partd = Buffer(Dict(), File()) self.partd = partd self.socket = self.context.socket(zmq.ROUTER) if hostname is None: hostname = socket.gethostname() if isinstance(bind, unicode): bind = bind.encode() if bind is None: port = self.socket.bind_to_random_port('tcp://*') else: self.socket.bind(bind) port = int(bind.split(':')[-1].rstrip('/')) self.address = ('tcp://%s:%d' % (hostname, port)).encode() self.status = 'created' self.partd.lock.acquire() self._lock = Lock() self._socket_lock = Lock() if start: self.start() if block: self.block() def start(self): if self.status != 'run': self.status = 'run' self._listen_thread = Thread(target=self.listen) self._listen_thread.start() logger.debug('Start server at %s', self.address) def block(self): """ Block until all threads close """ try: self._listen_thread.join() except AttributeError: pass def listen(self): with logerrors(): logger.debug('Start listening %s', self.address) while self.status != 'closed': if not self.socket.poll(100): continue with self._socket_lock: payload = self.socket.recv_multipart() address, command, payload = payload[0], payload[1], payload[2:] logger.debug('Server receives %s %s', address, command) if command == b'close': logger.debug('Server closes') self.ack(address) self.status = 'closed' break # self.close() elif command == b'append': keys, values = payload[::2], payload[1::2] keys = list(map(deserialize_key, keys)) data = dict(zip(keys, values)) self.partd.append(data, 
lock=False) logger.debug('Server appends %d keys', len(data)) self.ack(address) elif command == b'iset': key, value = payload key = deserialize_key(key) self.partd.iset(key, value, lock=False) self.ack(address) elif command == b'get': keys = list(map(deserialize_key, payload)) logger.debug('get %s', keys) result = self.get(keys) self.send_to_client(address, result) self.ack(address, flow_control=False) elif command == b'delete': keys = list(map(deserialize_key, payload)) logger.debug('delete %s', keys) self.partd.delete(keys, lock=False) self.ack(address, flow_control=False) elif command == b'syn': self.ack(address) elif command == b'drop': self.drop() self.ack(address) else: logger.debug("Unknown command: %s", command) raise ValueError("Unknown command: " + command) def send_to_client(self, address, result): with logerrors(): if not isinstance(result, list): result = [result] with self._socket_lock: self.socket.send_multipart([address] + result) def ack(self, address, flow_control=True): with logerrors(): logger.debug('Server sends ack') self.send_to_client(address, b'ack') def append(self, data): self.partd.append(data, lock=False) logger.debug('Server appends %d keys', len(data)) def drop(self): with logerrors(): self.partd.drop() def get(self, keys): with logerrors(): logger.debug('Server gets keys: %s', keys) with self._lock: result = self.partd.get(keys, lock=False) return result def close(self): logger.debug('Server closes') self.status = 'closed' self.block() with ignoring(zmq.error.ZMQError): self.socket.close(1) with ignoring(zmq.error.ZMQError): self.context.destroy(3) self.partd.lock.release() def __enter__(self): self.start() return self def __exit__(self, *args): self.close() self.partd.__exit__(*args) def keys_to_flush(lengths, fraction=0.1, maxcount=100000): """ Which keys to remove >>> lengths = {'a': 20, 'b': 10, 'c': 15, 'd': 15, ... 
'e': 10, 'f': 25, 'g': 5} >>> keys_to_flush(lengths, 0.5) ['f', 'a'] """ top = topk(max(len(lengths) // 2, 1), lengths.items(), key=1) total = sum(lengths.values()) cutoff = min(maxcount, max(1, bisect(list(accumulate(add, pluck(1, top))), total * fraction))) result = [k for k, v in top[:cutoff]] assert result return result def serialize_key(key): """ >>> serialize_key('x') b'x' >>> serialize_key(('a', 'b', 1)) b'a-|-b-|-1' """ if isinstance(key, tuple): return tuple_sep.join(map(serialize_key, key)) if isinstance(key, bytes): return key if isinstance(key, str): return key.encode() return str(key).encode() def deserialize_key(text): """ >>> deserialize_key(b'x') b'x' >>> deserialize_key(b'a-|-b-|-1') (b'a', b'b', b'1') """ if tuple_sep in text: return tuple(text.split(tuple_sep)) else: return text from .core import Interface from .file import File class Client(Interface): def __init__(self, address=None, create_server=False, **kwargs): self.address = address self.context = zmq.Context() self.socket = self.context.socket(zmq.DEALER) logger.debug('Client connects to %s', address) self.socket.connect(address) self.send(b'syn', [], ack_required=False) self.lock = NotALock() # Server sequentializes everything Interface.__init__(self) def __getstate__(self): return {'address': self.address} def __setstate__(self, state): self.__init__(state['address']) logger.debug('Reconstruct client from pickled state') def send(self, command, payload, recv=False, ack_required=True): if ack_required: ack = self.socket.recv_multipart() assert ack == [b'ack'] logger.debug('Client sends command: %s', command) self.socket.send_multipart([command] + payload) if recv: result = self.socket.recv_multipart() else: result = None return result def _get(self, keys, lock=None): """ Lock argument is ignored. 
Everything is sequential (I think) """ logger.debug('Client gets %s %s', self.address, keys) keys = list(map(serialize_key, keys)) return self.send(b'get', keys, recv=True) def append(self, data, lock=None): logger.debug('Client appends %s %s', self.address, str(len(data)) + ' keys') data = keymap(serialize_key, data) payload = list(chain.from_iterable(data.items())) self.send(b'append', payload) def _delete(self, keys, lock=None): logger.debug('Client deletes %s %s', self.address, str(len(keys)) + ' keys') keys = list(map(serialize_key, keys)) self.send(b'delete', keys) def _iset(self, key, value): self.send(b'iset', [serialize_key(key), value]) def drop(self): self.send(b'drop', []) sleep(0.05) def close_server(self): self.send(b'close', []) def close(self): if hasattr(self, 'server_process'): with ignoring(zmq.error.ZMQError): self.close_server() self.server_process.join() with ignoring(zmq.error.ZMQError): self.socket.close(1) with ignoring(zmq.error.ZMQError): self.context.destroy(1) def __exit__(self, type, value, traceback): self.drop() self.close() def __del__(self): self.close() class NotALock(object): def acquire(self): pass def release(self): pass def __enter__(self): return self def __exit__(self, *args): pass ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1617903144.1112034 partd-1.2.0/partd.egg-info/0000755000076500000240000000000000000000000014413 5ustar00jamesstaff././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617903143.0 partd-1.2.0/partd.egg-info/PKG-INFO0000644000076500000240000001241600000000000015514 0ustar00jamesstaffMetadata-Version: 2.1 Name: partd Version: 1.2.0 Summary: Appendable key-value storage Home-page: http://github.com/dask/partd/ Maintainer: Matthew Rocklin Maintainer-email: mrocklin@gmail.com License: BSD Description: PartD ===== |Build Status| |Version Status| Key-value byte store with appendable values Partd stores key-value pairs. Values are raw bytes. 
We append on old values. Partd excels at shuffling operations. Operations ---------- PartD has two main operations, ``append`` and ``get``. Example ------- 1. Create a Partd backed by a directory:: >>> import partd >>> p = partd.File('/path/to/new/dataset/') 2. Append key-byte pairs to dataset:: >>> p.append({'x': b'Hello ', 'y': b'123'}) >>> p.append({'x': b'world!', 'y': b'456'}) 3. Get bytes associated to keys:: >>> p.get('x') # One key b'Hello world!' >>> p.get(['y', 'x']) # List of keys [b'123456', b'Hello world!'] 4. Destroy partd dataset:: >>> p.drop() That's it. Implementations --------------- We can back a partd by an in-memory dictionary:: >>> p = Dict() For larger amounts of data or to share data between processes we back a partd by a directory of files. This uses file-based locks for consistency.:: >>> p = File('/path/to/dataset/') However this can fail for many small writes. In these cases you may wish to buffer one partd with another, keeping a fixed maximum of data in the buffering partd. This writes the larger elements of the first partd to the second partd when space runs low:: >>> p = Buffer(Dict(), File(), available_memory=2e9) # 2GB memory buffer You might also want to have many distributed process write to a single partd consistently. This can be done with a server * Server Process:: >>> p = Buffer(Dict(), File(), available_memory=2e9) # 2GB memory buffer >>> s = Server(p, address='ipc://server') * Worker processes:: >>> p = Client('ipc://server') # Client machine talks to remote server Encodings and Compression ------------------------- Once we can robustly and efficiently append bytes to a partd we consider compression and encodings. This is generally available with the ``Encode`` partd, which accepts three functions, one to apply on bytes as they are written, one to apply to bytes as they are read, and one to join bytestreams. Common configurations already exist for common data and compression formats. 
We may wish to compress and decompress data transparently as we interact with a partd. Objects like ``BZ2``, ``Blosc``, ``ZLib`` and ``Snappy`` exist and take another partd as an argument.:: >>> p = File(...) >>> p = ZLib(p) These work exactly as before, the (de)compression happens automatically. Common data formats like Python lists, numpy arrays, and pandas dataframes are also supported out of the box.:: >>> p = File(...) >>> p = NumPy(p) >>> p.append({'x': np.array([...])}) This lets us forget about bytes and think instead in our normal data types. Composition ----------- In principle we want to compose all of these choices together 1. Write policy: ``Dict``, ``File``, ``Buffer``, ``Client`` 2. Encoding: ``Pickle``, ``Numpy``, ``Pandas``, ... 3. Compression: ``Blosc``, ``Snappy``, ... Partd objects compose by nesting. Here we make a partd that writes pickle encoded BZ2 compressed bytes directly to disk:: >>> p = Pickle(BZ2(File('foo'))) We could construct more complex systems that include compression, serialization, buffering, and remote access.:: >>> server = Server(Buffer(Dict(), File(), available_memory=2e0)) >>> client = Pickle(Snappy(Client(server.address))) >>> client.append({'x': [1, 2, 3]}) .. |Build Status| image:: https://github.com/dask/partd/workflows/CI/badge.svg :target: https://github.com/dask/partd/actions?query=workflow%3ACI .. 
|Version Status| image:: https://img.shields.io/pypi/v/partd.svg :target: https://pypi.python.org/pypi/partd/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Programming Language :: Python :: 3.5 Classifier: Programming Language :: Python :: 3.6 Classifier: Programming Language :: Python :: 3.7 Classifier: Programming Language :: Python :: 3.8 Requires-Python: >=3.5 Provides-Extra: complete ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617903143.0 partd-1.2.0/partd.egg-info/SOURCES.txt0000644000076500000240000000146200000000000016302 0ustar00jamesstaffLICENSE.txt MANIFEST.in README.rst requirements.txt setup.cfg setup.py versioneer.py partd/__init__.py partd/_version.py partd/buffer.py partd/compatibility.py partd/compressed.py partd/core.py partd/dict.py partd/encode.py partd/file.py partd/numpy.py partd/pandas.py partd/pickle.py partd/python.py partd/utils.py partd/zmq.py partd.egg-info/PKG-INFO partd.egg-info/SOURCES.txt partd.egg-info/dependency_links.txt partd.egg-info/not-zip-safe partd.egg-info/requires.txt partd.egg-info/top_level.txt partd/tests/test_buffer.py partd/tests/test_compressed.py partd/tests/test_dict.py partd/tests/test_encode.py partd/tests/test_file.py partd/tests/test_numpy.py partd/tests/test_pandas.py partd/tests/test_partd.py partd/tests/test_pickle.py partd/tests/test_python.py partd/tests/test_utils.py partd/tests/test_zmq.py././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617903143.0 partd-1.2.0/partd.egg-info/dependency_links.txt0000644000076500000240000000000100000000000020461 0ustar00jamesstaff ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617903143.0 partd-1.2.0/partd.egg-info/not-zip-safe0000644000076500000240000000000100000000000016641 0ustar00jamesstaff ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617903143.0 
partd-1.2.0/partd.egg-info/requires.txt0000644000076500000240000000010100000000000017003 0ustar00jamesstafflocket toolz [complete] numpy>=1.9.0 pandas>=0.19.0 pyzmq blosc ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617903143.0 partd-1.2.0/partd.egg-info/top_level.txt0000644000076500000240000000000600000000000017141 0ustar00jamesstaffpartd ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322844.0 partd-1.2.0/requirements.txt0000644000076500000240000000001500000000000015067 0ustar00jamesstafflocket toolz ././@PaxHeader0000000000000000000000000000003300000000000010211 xustar0027 mtime=1617903144.125133 partd-1.2.0/setup.cfg0000644000076500000240000000030200000000000013423 0ustar00jamesstaff[versioneer] vcs = git style = pep440 versionfile_source = partd/_version.py versionfile_build = partd/_version.py tag_prefix = parentdir_prefix = partd- [egg_info] tag_build = tag_date = 0 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617808846.0 partd-1.2.0/setup.py0000755000076500000240000000212200000000000013321 0ustar00jamesstaff#!/usr/bin/env python from os.path import exists from setuptools import setup import versioneer setup(name='partd', version=versioneer.get_version(), cmdclass=versioneer.get_cmdclass(), description='Appendable key-value storage', url='http://github.com/dask/partd/', maintainer='Matthew Rocklin', maintainer_email='mrocklin@gmail.com', license='BSD', keywords='', packages=['partd'], install_requires=list(open('requirements.txt').read().strip().split('\n')), python_requires=">=3.5", classifiers=[ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", ], long_description=(open('README.rst').read() if exists('README.rst') else ''), extras_require={'complete': [ 'numpy >= 1.9.0', 'pandas >=0.19.0', 
'pyzmq', 'blosc', ]}, zip_safe=False) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1617322851.0 partd-1.2.0/versioneer.py0000644000076500000240000020577700000000000014364 0ustar00jamesstaff # Version: 0.18 """The Versioneer - like a rocketeer, but for versions. The Versioneer ============== * like a rocketeer, but for versions! * https://github.com/warner/python-versioneer * Brian Warner * License: Public Domain * Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, and pypy * [![Latest Version] (https://pypip.in/version/versioneer/badge.svg?style=flat) ](https://pypi.python.org/pypi/versioneer/) * [![Build Status] (https://travis-ci.org/warner/python-versioneer.png?branch=master) ](https://travis-ci.org/warner/python-versioneer) This is a tool for managing a recorded version number in distutils-based python projects. The goal is to remove the tedious and error-prone "update the embedded version string" step from your release process. Making a new release should be as easy as recording a new tag in your version-control system, and maybe making new tarballs. ## Quick Install * `pip install versioneer` to somewhere to your $PATH * add a `[versioneer]` section to your setup.cfg (see below) * run `versioneer install` in your source tree, commit the results ## Version Identifiers Source trees come from a variety of places: * a version-control system checkout (mostly used by developers) * a nightly tarball, produced by build automation * a snapshot tarball, produced by a web-based VCS browser, like github's "tarball from tag" feature * a release tarball, produced by "setup.py sdist", distributed through PyPI Within each source tree, the version identifier (either a string or a number, this tool is format-agnostic) can come from a variety of places: * ask the VCS tool itself, e.g. 
"git describe" (for checkouts), which knows about recent "tags" and an absolute revision-id * the name of the directory into which the tarball was unpacked * an expanded VCS keyword ($Id$, etc) * a `_version.py` created by some earlier build step For released software, the version identifier is closely related to a VCS tag. Some projects use tag names that include more than just the version string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool needs to strip the tag prefix to extract the version identifier. For unreleased software (between tags), the version identifier should provide enough information to help developers recreate the same tree, while also giving them an idea of roughly how old the tree is (after version 1.2, before version 1.3). Many VCS systems can report a description that captures this, for example `git describe --tags --dirty --always` reports things like "0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the 0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has uncommitted changes. The version identifier is used for multiple purposes: * to allow the module to self-identify its version: `myproject.__version__` * to choose a name and prefix for a 'setup.py sdist' tarball ## Theory of Operation Versioneer works by adding a special `_version.py` file into your source tree, where your `__init__.py` can import it. This `_version.py` knows how to dynamically ask the VCS tool for version information at import time. `_version.py` also contains `$Revision$` markers, and the installation process marks `_version.py` to have this marker rewritten with a tag name during the `git archive` command. As a result, generated tarballs will contain enough information to get the proper version. To allow `setup.py` to compute a version too, a `versioneer.py` is added to the top level of your source tree, next to `setup.py` and the `setup.cfg` that configures it. 
This overrides several distutils/setuptools commands to compute the version when invoked, and changes `setup.py build` and `setup.py sdist` to replace `_version.py` with a small static file that contains just the generated version data. ## Installation See [INSTALL.md](./INSTALL.md) for detailed installation instructions. ## Version-String Flavors Code which uses Versioneer can learn about its version string at runtime by importing `_version` from your main `__init__.py` file and running the `get_versions()` function. From the "outside" (e.g. in `setup.py`), you can import the top-level `versioneer.py` and run `get_versions()`. Both functions return a dictionary with different flavors of version information: * `['version']`: A condensed version string, rendered using the selected style. This is the most commonly used value for the project's version string. The default "pep440" style yields strings like `0.11`, `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section below for alternative styles. * `['full-revisionid']`: detailed revision identifier. For Git, this is the full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". * `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the commit date in ISO 8601 format. This will be None if the date is not available. * `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that this is only accurate if run in a VCS checkout, otherwise it is likely to be False or None * `['error']`: if the version string could not be computed, this will be set to a string describing the problem, otherwise it will be None. It may be useful to throw an exception in setup.py if this is set, to avoid e.g. creating tarballs with a version string of "unknown". Some variants are more useful than others. 
Including `full-revisionid` in a bug report should allow developers to reconstruct the exact code being tested (or indicate the presence of local changes that should be shared with the developers). `version` is suitable for display in an "about" box or a CLI `--version` output: it can be easily compared against release notes and lists of bugs fixed in various releases. The installer adds the following text to your `__init__.py` to place a basic version in `YOURPROJECT.__version__`: from ._version import get_versions __version__ = get_versions()['version'] del get_versions ## Styles The setup.cfg `style=` configuration controls how the VCS information is rendered into a version string. The default style, "pep440", produces a PEP440-compliant string, equal to the un-prefixed tag name for actual releases, and containing an additional "local version" section with more detail for in-between builds. For Git, this is TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags --dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and that this commit is two revisions ("+2") beyond the "0.11" tag. For released software (exactly equal to a known tag), the identifier will only contain the stripped tag, e.g. "0.11". Other styles are available. See [details.md](details.md) in the Versioneer source tree for descriptions. ## Debugging Versioneer tries to avoid fatal errors: if something goes wrong, it will tend to return a version of "0+unknown". To investigate the problem, run `setup.py version`, which will run the version-lookup code in a verbose mode, and will display the full contents of `get_versions()` (including the `error` string, which may help identify what went wrong). ## Known Limitations Some situations are known to cause problems for Versioneer. This details the most significant ones. 
More can be found on the GitHub [issues page](https://github.com/warner/python-versioneer/issues). ### Subprojects Versioneer has limited support for source trees in which `setup.py` is not in the root directory (e.g. `setup.py` and `.git/` are *not* siblings). There are two common reasons why `setup.py` might not be in the root: * Source trees which contain multiple subprojects, such as [Buildbot](https://github.com/buildbot/buildbot), which contains both "master" and "slave" subprojects, each with their own `setup.py`, `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI distributions (and upload multiple independently-installable tarballs). * Source trees whose main purpose is to contain a C library, but which also provide bindings to Python (and perhaps other languages) in subdirectories. Versioneer will look for `.git` in parent directories, and most operations should get the right version string. However `pip` and `setuptools` have bugs and implementation details which frequently cause `pip install .` from a subproject directory to fail to find a correct version string (so it usually defaults to `0+unknown`). `pip install --editable .` should work correctly. `setup.py install` might work too. Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in some later version. [Bug #38](https://github.com/warner/python-versioneer/issues/38) is tracking this issue. The discussion in [PR #61](https://github.com/warner/python-versioneer/pull/61) describes the issue from the Versioneer side in more detail. [pip PR#3176](https://github.com/pypa/pip/pull/3176) and [pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve pip to let Versioneer work correctly. Versioneer-0.16 and earlier only looked for a `.git` directory next to the `setup.cfg`, so subprojects were completely unsupported with those releases. 
### Editable installs with setuptools <= 18.5 `setup.py develop` and `pip install --editable .` allow you to install a project into a virtualenv once, then continue editing the source code (and test) without re-installing after every change. "Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a convenient way to specify executable scripts that should be installed along with the python package. These both work as expected when using modern setuptools. When using setuptools-18.5 or earlier, however, certain operations will cause `pkg_resources.DistributionNotFound` errors when running the entrypoint script, which must be resolved by re-installing the package. This happens when the install happens with one version, then the egg_info data is regenerated while a different version is checked out. Many setup.py commands cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into a different virtualenv), so this can be surprising. [Bug #83](https://github.com/warner/python-versioneer/issues/83) describes this one, but upgrading to a newer version of setuptools should probably resolve it. ### Unicode version strings While Versioneer works (and is continually tested) with both Python 2 and Python 3, it is not entirely consistent with bytes-vs-unicode distinctions. Newer releases probably generate unicode version strings on py2. It's not clear that this is wrong, but it may be surprising for applications which then write these strings to a network connection or include them in bytes-oriented APIs like cryptographic checksums. [Bug #71](https://github.com/warner/python-versioneer/issues/71) investigates this question. ## Updating Versioneer To upgrade your project to a new release of Versioneer, do the following: * install the new Versioneer (`pip install -U versioneer` or equivalent) * edit `setup.cfg`, if necessary, to include any new configuration settings indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. 
* re-run `versioneer install` in your source tree, to replace `SRC/_version.py` * commit any changed files ## Future Directions This tool is designed to make it easily extended to other version-control systems: all VCS-specific components are in separate directories like src/git/ . The top-level `versioneer.py` script is assembled from these components by running make-versioneer.py . In the future, make-versioneer.py will take a VCS name as an argument, and will construct a version of `versioneer.py` that is specific to the given VCS. It might also take the configuration arguments that are currently provided manually during installation by editing setup.py . Alternatively, it might go the other direction and include code from all supported VCS systems, reducing the number of intermediate scripts. ## License To make Versioneer easier to embed, all its code is dedicated to the public domain. The `_version.py` that it creates is also in the public domain. Specifically, both are released under the Creative Commons "Public Domain Dedication" license (CC0-1.0), as described in https://creativecommons.org/publicdomain/zero/1.0/ . """ from __future__ import print_function try: import configparser except ImportError: import ConfigParser as configparser import errno import json import os import re import subprocess import sys class VersioneerConfig: """Container for Versioneer configuration parameters.""" def get_root(): """Get the project root directory. We require that all commands are run from the project root, i.e. the directory that contains setup.py, setup.cfg, and versioneer.py . 
""" root = os.path.realpath(os.path.abspath(os.getcwd())) setup_py = os.path.join(root, "setup.py") versioneer_py = os.path.join(root, "versioneer.py") if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): # allow 'python path/to/setup.py COMMAND' root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) setup_py = os.path.join(root, "setup.py") versioneer_py = os.path.join(root, "versioneer.py") if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): err = ("Versioneer was unable to run the project root directory. " "Versioneer requires setup.py to be executed from " "its immediate directory (like 'python setup.py COMMAND'), " "or in a way that lets it use sys.argv[0] to find the root " "(like 'python path/to/setup.py COMMAND').") raise VersioneerBadRootError(err) try: # Certain runtime workflows (setup.py install/develop in a setuptools # tree) execute all dependencies in a single python process, so # "versioneer" may be imported multiple times, and python's shared # module-import table will cache the first one. So we can't use # os.path.dirname(__file__), as that will find whichever # versioneer.py was first imported, even in later projects. me = os.path.realpath(os.path.abspath(__file__)) me_dir = os.path.normcase(os.path.splitext(me)[0]) vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) if me_dir != vsr_dir: print("Warning: build in %s is using versioneer.py from %s" % (os.path.dirname(me), versioneer_py)) except NameError: pass return root def get_config_from_root(root): """Read the project setup.cfg file to determine Versioneer config.""" # This might raise EnvironmentError (if setup.cfg is missing), or # configparser.NoSectionError (if it lacks a [versioneer] section), or # configparser.NoOptionError (if it lacks "VCS="). See the docstring at # the top of versioneer.py for instructions on writing your setup.cfg . 
setup_cfg = os.path.join(root, "setup.cfg") parser = configparser.SafeConfigParser() with open(setup_cfg, "r") as f: parser.readfp(f) VCS = parser.get("versioneer", "VCS") # mandatory def get(parser, name): if parser.has_option("versioneer", name): return parser.get("versioneer", name) return None cfg = VersioneerConfig() cfg.VCS = VCS cfg.style = get(parser, "style") or "" cfg.versionfile_source = get(parser, "versionfile_source") cfg.versionfile_build = get(parser, "versionfile_build") cfg.tag_prefix = get(parser, "tag_prefix") if cfg.tag_prefix in ("''", '""'): cfg.tag_prefix = "" cfg.parentdir_prefix = get(parser, "parentdir_prefix") cfg.verbose = get(parser, "verbose") return cfg class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" # these dictionaries contain VCS-specific tools LONG_VERSION_PY = {} HANDLERS = {} def register_vcs_handler(vcs, method): # decorator """Decorator to mark a method as the handler for a particular VCS.""" def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f return decorate def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert isinstance(commands, list) p = None for c in commands: try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git p = subprocess.Popen([c] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None)) break except EnvironmentError: e = sys.exc_info()[1] if e.errno == errno.ENOENT: continue if verbose: print("unable to run %s" % dispcmd) print(e) return None, None else: if verbose: print("unable to find command, tried %s" % (commands,)) return None, None stdout = p.communicate()[0].strip() if sys.version_info[0] >= 3: stdout = stdout.decode() if p.returncode != 0: if verbose: print("unable to run %s (error)" % dispcmd) print("stdout 
was %s" % stdout) return None, p.returncode return stdout, p.returncode LONG_VERSION_PY['git'] = ''' # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. # This file is released into the public domain. Generated by # versioneer-0.18 (https://github.com/warner/python-versioneer) """Git implementation of _version.py.""" import errno import os import re import subprocess import sys def get_keywords(): """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} return keywords class VersioneerConfig: """Container for Versioneer configuration parameters.""" def get_config(): """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py cfg = VersioneerConfig() cfg.VCS = "git" cfg.style = "%(STYLE)s" cfg.tag_prefix = "%(TAG_PREFIX)s" cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" cfg.verbose = False return cfg class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" LONG_VERSION_PY = {} HANDLERS = {} def register_vcs_handler(vcs, method): # decorator """Decorator to mark a method as the handler for a particular VCS.""" def decorate(f): """Store f in HANDLERS[vcs][method].""" if 
vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f return decorate def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert isinstance(commands, list) p = None for c in commands: try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git p = subprocess.Popen([c] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None)) break except EnvironmentError: e = sys.exc_info()[1] if e.errno == errno.ENOENT: continue if verbose: print("unable to run %%s" %% dispcmd) print(e) return None, None else: if verbose: print("unable to find command, tried %%s" %% (commands,)) return None, None stdout = p.communicate()[0].strip() if sys.version_info[0] >= 3: stdout = stdout.decode() if p.returncode != 0: if verbose: print("unable to run %%s (error)" %% dispcmd) print("stdout was %%s" %% stdout) return None, p.returncode return stdout, p.returncode def versions_from_parentdir(parentdir_prefix, root, verbose): """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both the project name and a version string. 
We will also support searching up two directory levels for an appropriately named parent directory """ rootdirs = [] for i in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): return {"version": dirname[len(parentdir_prefix):], "full-revisionid": None, "dirty": False, "error": None, "date": None} else: rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: print("Tried directories %%s but none started with prefix %%s" %% (str(rootdirs), parentdir_prefix)) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs): """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. keywords = {} try: f = open(versionfile_abs, "r") for line in f.readlines(): if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["refnames"] = mo.group(1) if line.strip().startswith("git_full ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["full"] = mo.group(1) if line.strip().startswith("git_date ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["date"] = mo.group(1) f.close() except EnvironmentError: pass return keywords @register_vcs_handler("git", "keywords") def git_versions_from_keywords(keywords, tag_prefix, verbose): """Get version information from git keywords.""" if not keywords: raise NotThisMethod("no keywords at all, weird") date = keywords.get("date") if date is not None: # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant # datestamp. 
However we prefer "%%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because # it's been around since git-1.5.3, and it's too difficult to # discover which version we're using, or to work around using an # older one. date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") refs = set([r.strip() for r in refnames.strip("()").split(",")]) # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %%d # expansion behaves like git log --decorate=short and strips out the # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "main". tags = set([r for r in refs if re.search(r'\d', r)]) if verbose: print("discarding '%%s', no digits" %% ",".join(refs - tags)) if verbose: print("likely tags: %%s" %% ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix):] if verbose: print("picking %%s" %% r) return {"version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None, "date": date} # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") return {"version": "0+unknown", "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags", "date": None} @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* expanded, and _version.py hasn't already been rewritten with a short version string, meaning we're inside a checked out source tree. """ GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %%s not under git control" %% root) raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", "--always", "--long", "--match", "%%s*" %% tag_prefix], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. 
git_describe = describe_out # look for -dirty suffix dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: git_describe = git_describe[:git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? pieces["error"] = ("unable to parse git-describe output: '%%s'" %% describe_out) return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: fmt = "tag '%%s' doesn't start with prefix '%%s'" print(fmt %% (full_tag, tag_prefix)) pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" %% (full_tag, tag_prefix)) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) # commit: short hex revision ID pieces["short"] = mo.group(3) else: # HEX: no tags pieces["closest-tag"] = None count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], cwd=root)[0].strip() pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces def plus_or_dot(pieces): """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" def render_pep440(pieces): """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty Exceptions: 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_pre(pieces): """TAG[.post.devDISTANCE] -- No -dirty. Exceptions: 1: no tags. 0.post.devDISTANCE """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += ".post.dev%%d" %% pieces["distance"] else: # exception #1 rendered = "0.post.dev%%d" %% pieces["distance"] return rendered def render_pep440_post(pieces): """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards (a dirty tree will appear "older" than the corresponding clean one), but you shouldn't be releasing software with -dirty anyways. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%%s" %% pieces["short"] else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += "+g%%s" %% pieces["short"] return rendered def render_pep440_old(pieces): """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. Eexceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" return rendered def render_git_describe(pieces): """TAG[-DISTANCE-gHEX][-dirty]. 
Like 'git describe --tags --dirty --always'. Exceptions: 1: no tags. HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render_git_describe_long(pieces): """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. The distance/hash is unconditional. Exceptions: 1: no tags. HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: return {"version": "unknown", "full-revisionid": pieces.get("long"), "dirty": None, "error": pieces["error"], "date": None} if not style or style == "default": style = "pep440" # the default if style == "pep440": rendered = render_pep440(pieces) elif style == "pep440-pre": rendered = render_pep440_pre(pieces) elif style == "pep440-post": rendered = render_pep440_post(pieces) elif style == "pep440-old": rendered = render_pep440_old(pieces) elif style == "git-describe": rendered = render_git_describe(pieces) elif style == "git-describe-long": rendered = render_git_describe_long(pieces) else: raise ValueError("unknown style '%%s'" %% style) return {"version": rendered, "full-revisionid": pieces["long"], "dirty": pieces["dirty"], "error": None, "date": pieces.get("date")} def get_versions(): """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. 
Some # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which # case we can only use expanded keywords. cfg = get_config() verbose = cfg.verbose try: return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass try: root = os.path.realpath(__file__) # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. for i in cfg.versionfile_source.split('/'): root = os.path.dirname(root) except NameError: return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to find root of source tree", "date": None} try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) return render(pieces, cfg.style) except NotThisMethod: pass try: if cfg.parentdir_prefix: return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) except NotThisMethod: pass return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to compute version", "date": None} ''' @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs): """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. 
keywords = {} try: f = open(versionfile_abs, "r") for line in f.readlines(): if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["refnames"] = mo.group(1) if line.strip().startswith("git_full ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["full"] = mo.group(1) if line.strip().startswith("git_date ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["date"] = mo.group(1) f.close() except EnvironmentError: pass return keywords @register_vcs_handler("git", "keywords") def git_versions_from_keywords(keywords, tag_prefix, verbose): """Get version information from git keywords.""" if not keywords: raise NotThisMethod("no keywords at all, weird") date = keywords.get("date") if date is not None: # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because # it's been around since git-1.5.3, and it's too difficult to # discover which version we're using, or to work around using an # older one. date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") refs = set([r.strip() for r in refnames.strip("()").split(",")]) # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d # expansion behaves like git log --decorate=short and strips out the # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. 
By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "main". tags = set([r for r in refs if re.search(r'\d', r)]) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: print("likely tags: %s" % ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix):] if verbose: print("picking %s" % r) return {"version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None, "date": date} # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") return {"version": "0+unknown", "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags", "date": None} @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* expanded, and _version.py hasn't already been rewritten with a short version string, meaning we're inside a checked out source tree. 
""" GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %s not under git control" % root) raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", "--always", "--long", "--match", "%s*" % tag_prefix], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out # look for -dirty suffix dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: git_describe = git_describe[:git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
pieces["error"] = ("unable to parse git-describe output: '%s'" % describe_out) return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" % (full_tag, tag_prefix)) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) # commit: short hex revision ID pieces["short"] = mo.group(3) else: # HEX: no tags pieces["closest-tag"] = None count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces def do_vcs_install(manifest_in, versionfile_source, ipy): """Git-specific installation logic for Versioneer. For Git, this means creating/changing .gitattributes to mark _version.py for export-subst keyword substitution. 
""" GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] files = [manifest_in, versionfile_source] if ipy: files.append(ipy) try: me = __file__ if me.endswith(".pyc") or me.endswith(".pyo"): me = os.path.splitext(me)[0] + ".py" versioneer_file = os.path.relpath(me) except NameError: versioneer_file = "versioneer.py" files.append(versioneer_file) present = False try: f = open(".gitattributes", "r") for line in f.readlines(): if line.strip().startswith(versionfile_source): if "export-subst" in line.strip().split()[1:]: present = True f.close() except EnvironmentError: pass if not present: f = open(".gitattributes", "a+") f.write("%s export-subst\n" % versionfile_source) f.close() files.append(".gitattributes") run_command(GITS, ["add", "--"] + files) def versions_from_parentdir(parentdir_prefix, root, verbose): """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both the project name and a version string. We will also support searching up two directory levels for an appropriately named parent directory """ rootdirs = [] for i in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): return {"version": dirname[len(parentdir_prefix):], "full-revisionid": None, "dirty": False, "error": None, "date": None} else: rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: print("Tried directories %s but none started with prefix %s" % (str(rootdirs), parentdir_prefix)) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") SHORT_VERSION_PY = """ # This file was generated by 'versioneer.py' (0.18) from # revision-control system data, or from the parent directory name of an # unpacked source archive. Distribution tarballs contain a pre-generated copy # of this file. 
import json version_json = ''' %s ''' # END VERSION_JSON def get_versions(): return json.loads(version_json) """ def versions_from_file(filename): """Try to determine the version from _version.py if present.""" try: with open(filename) as f: contents = f.read() except EnvironmentError: raise NotThisMethod("unable to read _version.py") mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S) if not mo: mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S) if not mo: raise NotThisMethod("no version_json in _version.py") return json.loads(mo.group(1)) def write_to_version_file(filename, versions): """Write the given version number to the given _version.py file.""" os.unlink(filename) contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) with open(filename, "w") as f: f.write(SHORT_VERSION_PY % contents) print("set %s to '%s'" % (filename, versions["version"])) def plus_or_dot(pieces): """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" def render_pep440(pieces): """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty Exceptions: 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_pre(pieces): """TAG[.post.devDISTANCE] -- No -dirty. Exceptions: 1: no tags. 
0.post.devDISTANCE """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += ".post.dev%d" % pieces["distance"] else: # exception #1 rendered = "0.post.dev%d" % pieces["distance"] return rendered def render_pep440_post(pieces): """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards (a dirty tree will appear "older" than the corresponding clean one), but you shouldn't be releasing software with -dirty anyways. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%s" % pieces["short"] else: # exception #1 rendered = "0.post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += "+g%s" % pieces["short"] return rendered def render_pep440_old(pieces): """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. Eexceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" else: # exception #1 rendered = "0.post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" return rendered def render_git_describe(pieces): """TAG[-DISTANCE-gHEX][-dirty]. Like 'git describe --tags --dirty --always'. Exceptions: 1: no tags. HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render_git_describe_long(pieces): """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. The distance/hash is unconditional. Exceptions: 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: return {"version": "unknown", "full-revisionid": pieces.get("long"), "dirty": None, "error": pieces["error"], "date": None} if not style or style == "default": style = "pep440" # the default if style == "pep440": rendered = render_pep440(pieces) elif style == "pep440-pre": rendered = render_pep440_pre(pieces) elif style == "pep440-post": rendered = render_pep440_post(pieces) elif style == "pep440-old": rendered = render_pep440_old(pieces) elif style == "git-describe": rendered = render_git_describe(pieces) elif style == "git-describe-long": rendered = render_git_describe_long(pieces) else: raise ValueError("unknown style '%s'" % style) return {"version": rendered, "full-revisionid": pieces["long"], "dirty": pieces["dirty"], "error": None, "date": pieces.get("date")} class VersioneerBadRootError(Exception): """The project root directory is unknown or missing key files.""" def get_versions(verbose=False): """Get the project version from whatever source is available. Returns dict with two keys: 'version' and 'full'. 
""" if "versioneer" in sys.modules: # see the discussion in cmdclass.py:get_cmdclass() del sys.modules["versioneer"] root = get_root() cfg = get_config_from_root(root) assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS verbose = verbose or cfg.verbose assert cfg.versionfile_source is not None, \ "please set versioneer.versionfile_source" assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) # extract version from first of: _version.py, VCS command (e.g. 'git # describe'), parentdir. This is meant to work for developers using a # source checkout, for users of a tarball created by 'setup.py sdist', # and for users of a tarball/zipball created by 'git archive' or github's # download-from-tag feature or the equivalent in other VCSes. get_keywords_f = handlers.get("get_keywords") from_keywords_f = handlers.get("keywords") if get_keywords_f and from_keywords_f: try: keywords = get_keywords_f(versionfile_abs) ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) if verbose: print("got version from expanded keyword %s" % ver) return ver except NotThisMethod: pass try: ver = versions_from_file(versionfile_abs) if verbose: print("got version from file %s %s" % (versionfile_abs, ver)) return ver except NotThisMethod: pass from_vcs_f = handlers.get("pieces_from_vcs") if from_vcs_f: try: pieces = from_vcs_f(cfg.tag_prefix, root, verbose) ver = render(pieces, cfg.style) if verbose: print("got version from VCS %s" % ver) return ver except NotThisMethod: pass try: if cfg.parentdir_prefix: ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) if verbose: print("got version from parentdir %s" % ver) return ver except NotThisMethod: pass if verbose: print("unable to compute version") return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to compute 
version", "date": None} def get_version(): """Get the short version string for this project.""" return get_versions()["version"] def get_cmdclass(): """Get the custom setuptools/distutils subclasses used by Versioneer.""" if "versioneer" in sys.modules: del sys.modules["versioneer"] # this fixes the "python setup.py develop" case (also 'install' and # 'easy_install .'), in which subdependencies of the main project are # built (using setup.py bdist_egg) in the same python process. Assume # a main project A and a dependency B, which use different versions # of Versioneer. A's setup.py imports A's Versioneer, leaving it in # sys.modules by the time B's setup.py is executed, causing B to run # with the wrong versioneer. Setuptools wraps the sub-dep builds in a # sandbox that restores sys.modules to it's pre-build state, so the # parent is protected against the child's "import versioneer". By # removing ourselves from sys.modules here, before the child build # happens, we protect the child from the parent's versioneer too. # Also see https://github.com/warner/python-versioneer/issues/52 cmds = {} # we add "version" to both distutils and setuptools from distutils.core import Command class cmd_version(Command): description = "report generated version string" user_options = [] boolean_options = [] def initialize_options(self): pass def finalize_options(self): pass def run(self): vers = get_versions(verbose=True) print("Version: %s" % vers["version"]) print(" full-revisionid: %s" % vers.get("full-revisionid")) print(" dirty: %s" % vers.get("dirty")) print(" date: %s" % vers.get("date")) if vers["error"]: print(" error: %s" % vers["error"]) cmds["version"] = cmd_version # we override "build_py" in both distutils and setuptools # # most invocation pathways end up running build_py: # distutils/build -> build_py # distutils/install -> distutils/build ->.. # setuptools/bdist_wheel -> distutils/install ->.. 
# setuptools/bdist_egg -> distutils/install_lib -> build_py # setuptools/install -> bdist_egg ->.. # setuptools/develop -> ? # pip install: # copies source tree to a tempdir before running egg_info/etc # if .git isn't copied too, 'git describe' will fail # then does setup.py bdist_wheel, or sometimes setup.py install # setup.py egg_info -> ? # we override different "build_py" commands for both environments if "setuptools" in sys.modules: from setuptools.command.build_py import build_py as _build_py else: from distutils.command.build_py import build_py as _build_py class cmd_build_py(_build_py): def run(self): root = get_root() cfg = get_config_from_root(root) versions = get_versions() _build_py.run(self) # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) cmds["build_py"] = cmd_build_py if "cx_Freeze" in sys.modules: # cx_freeze enabled? from cx_Freeze.dist import build_exe as _build_exe # nczeczulin reports that py2exe won't like the pep440-style string # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. # setup(console=[{ # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION # "product_version": versioneer.get_version(), # ... 
class cmd_build_exe(_build_exe): def run(self): root = get_root() cfg = get_config_from_root(root) versions = get_versions() target_versionfile = cfg.versionfile_source print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) _build_exe.run(self) os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write(LONG % {"DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, }) cmds["build_exe"] = cmd_build_exe del cmds["build_py"] if 'py2exe' in sys.modules: # py2exe enabled? try: from py2exe.distutils_buildexe import py2exe as _py2exe # py3 except ImportError: from py2exe.build_exe import py2exe as _py2exe # py2 class cmd_py2exe(_py2exe): def run(self): root = get_root() cfg = get_config_from_root(root) versions = get_versions() target_versionfile = cfg.versionfile_source print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) _py2exe.run(self) os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write(LONG % {"DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, }) cmds["py2exe"] = cmd_py2exe # we override different "sdist" commands for both environments if "setuptools" in sys.modules: from setuptools.command.sdist import sdist as _sdist else: from distutils.command.sdist import sdist as _sdist class cmd_sdist(_sdist): def run(self): versions = get_versions() self._versioneer_generated_versions = versions # unless we update this, the command will keep using the old # version self.distribution.metadata.version = versions["version"] return _sdist.run(self) def make_release_tree(self, base_dir, files): root = get_root() cfg = get_config_from_root(root) _sdist.make_release_tree(self, base_dir, 
files) # now locate _version.py in the new base_dir directory # (remembering that it may be a hardlink) and replace it with an # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, self._versioneer_generated_versions) cmds["sdist"] = cmd_sdist return cmds CONFIG_ERROR = """ setup.cfg is missing the necessary Versioneer configuration. You need a section like: [versioneer] VCS = git style = pep440 versionfile_source = src/myproject/_version.py versionfile_build = myproject/_version.py tag_prefix = parentdir_prefix = myproject- You will also need to edit your setup.py to use the results: import versioneer setup(version=versioneer.get_version(), cmdclass=versioneer.get_cmdclass(), ...) Please read the docstring in ./versioneer.py for configuration instructions, edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. """ SAMPLE_CONFIG = """ # See the docstring in versioneer.py for instructions. Note that you must # re-run 'versioneer.py setup' after changing this section, and commit the # resulting files. 
[versioneer] #VCS = git #style = pep440 #versionfile_source = #versionfile_build = #tag_prefix = #parentdir_prefix = """ INIT_PY_SNIPPET = """ from ._version import get_versions __version__ = get_versions()['version'] del get_versions """ def do_setup(): """Main VCS-independent setup function for installing Versioneer.""" root = get_root() try: cfg = get_config_from_root(root) except (EnvironmentError, configparser.NoSectionError, configparser.NoOptionError) as e: if isinstance(e, (EnvironmentError, configparser.NoSectionError)): print("Adding sample versioneer config to setup.cfg", file=sys.stderr) with open(os.path.join(root, "setup.cfg"), "a") as f: f.write(SAMPLE_CONFIG) print(CONFIG_ERROR, file=sys.stderr) return 1 print(" creating %s" % cfg.versionfile_source) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write(LONG % {"DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, }) ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") if os.path.exists(ipy): try: with open(ipy, "r") as f: old = f.read() except EnvironmentError: old = "" if INIT_PY_SNIPPET not in old: print(" appending to %s" % ipy) with open(ipy, "a") as f: f.write(INIT_PY_SNIPPET) else: print(" %s unmodified" % ipy) else: print(" %s doesn't exist, ok" % ipy) ipy = None # Make sure both the top-level "versioneer.py" and versionfile_source # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so # they'll be copied into source distributions. Pip won't be able to # install the package without this. 
manifest_in = os.path.join(root, "MANIFEST.in") simple_includes = set() try: with open(manifest_in, "r") as f: for line in f: if line.startswith("include "): for include in line.split()[1:]: simple_includes.add(include) except EnvironmentError: pass # That doesn't cover everything MANIFEST.in can do # (http://docs.python.org/2/distutils/sourcedist.html#commands), so # it might give some false negatives. Appending redundant 'include' # lines is safe, though. if "versioneer.py" not in simple_includes: print(" appending 'versioneer.py' to MANIFEST.in") with open(manifest_in, "a") as f: f.write("include versioneer.py\n") else: print(" 'versioneer.py' already in MANIFEST.in") if cfg.versionfile_source not in simple_includes: print(" appending versionfile_source ('%s') to MANIFEST.in" % cfg.versionfile_source) with open(manifest_in, "a") as f: f.write("include %s\n" % cfg.versionfile_source) else: print(" versionfile_source already in MANIFEST.in") # Make VCS-specific changes. For git, this means creating/changing # .gitattributes to mark _version.py for export-subst keyword # substitution. do_vcs_install(manifest_in, cfg.versionfile_source, ipy) return 0 def scan_setup_py(): """Validate the contents of setup.py against Versioneer's expectations.""" found = set() setters = False errors = 0 with open("setup.py", "r") as f: for line in f.readlines(): if "import versioneer" in line: found.add("import") if "versioneer.get_cmdclass()" in line: found.add("cmdclass") if "versioneer.get_version()" in line: found.add("get_version") if "versioneer.VCS" in line: setters = True if "versioneer.versionfile_source" in line: setters = True if len(found) != 3: print("") print("Your setup.py appears to be missing some important items") print("(but I might be wrong). 
Please make sure it has something") print("roughly like the following:") print("") print(" import versioneer") print(" setup( version=versioneer.get_version(),") print(" cmdclass=versioneer.get_cmdclass(), ...)") print("") errors += 1 if setters: print("You should remove lines like 'versioneer.VCS = ' and") print("'versioneer.versionfile_source = ' . This configuration") print("now lives in setup.cfg, and should be removed from setup.py") print("") errors += 1 return errors if __name__ == "__main__": cmd = sys.argv[1] if cmd == "setup": errors = do_setup() errors += scan_setup_py() if errors: sys.exit(1)