pax_global_header 0000666 0000000 0000000 00000000064 13360664062 0014520 g ustar 00root root 0000000 0000000 52 comment=d15f7314f7c899b4c91532a07604a6728bc8d45d
preshed-2.0.1/ 0000775 0000000 0000000 00000000000 13360664062 0013152 5 ustar 00root root 0000000 0000000 preshed-2.0.1/.appveyor.yml 0000664 0000000 0000000 00000003313 13360664062 0015620 0 ustar 00root root 0000000 0000000 environment:
matrix:
# For Python versions available on Appveyor, see
# http://www.appveyor.com/docs/installed-software#python
# The list here is complete (excluding Python 2.6, which
# isn't covered by this document) at the time of writing.
- PYTHON: "C:\\Python27"
- PYTHON: "C:\\Python35"
- PYTHON: "C:\\Python27-x64"
- PYTHON: "C:\\Python35-x64"
- PYTHON: "C:\\Python36-x64"
install:
# We need wheel installed to build wheels
- "%PYTHON%\\python.exe -m pip install wheel"
- "%PYTHON%\\python.exe -m pip install cython"
- "%PYTHON%\\python.exe -m pip install -r requirements.txt"
- "%PYTHON%\\python.exe -m pip install -e ."
build: off
test_script:
# Put your test command here.
# The template's "build.cmd" wrapper has already been dropped from the
# command below, since this project does not need it to build C extensions
# on 64-bit Python 3.3/3.4.
# Note that you must use the environment variable %PYTHON% to refer to
# the interpreter you're using - Appveyor does not do anything special
# to put the Python version you want to use on PATH.
- "%PYTHON%\\python.exe -m pytest preshed/"
after_test:
# This step builds your wheels.
# Again, you only need build.cmd if you're building C extensions for
# 64-bit Python 3.3/3.4. And you need to use %PYTHON% to get the correct
# interpreter
- "%PYTHON%\\python.exe setup.py bdist_wheel"
artifacts:
# bdist_wheel puts your built wheel in the dist directory
- path: dist\*
#on_success:
# You can use this step to upload your artifacts to a public website.
# See Appveyor's documentation for more details. Or you can simply
# access your wheels from the Appveyor "artifacts" tab for your build.
preshed-2.0.1/.gitignore 0000664 0000000 0000000 00000000221 13360664062 0015135 0 ustar 00root root 0000000 0000000 *.egg
*.egg-info
preshed/.maps.pxd.swm
preshed/.maps.pyx.swl
*.sw[a-z]
*.so
*.pyc
*.swp
*.swo
*.html
*.c
*.cpp
.env/
.denv
MANIFEST
build/
dist/
preshed-2.0.1/.travis.yml 0000664 0000000 0000000 00000000347 13360664062 0015267 0 ustar 00root root 0000000 0000000 language: python
sudo: false
dist: trusty
group: edge
python:
- "2.7"
- "3.5"
- "3.6"
- "3.7-dev"
install:
- pip install cython
- pip install -r requirements.txt
- pip install -e .
script:
- python -m pytest preshed
preshed-2.0.1/LICENSE 0000664 0000000 0000000 00000002163 13360664062 0014161 0 ustar 00root root 0000000 0000000 The MIT License (MIT)
Copyright (c) 2014 Matthew Honnibal
2016 ExplosionAI UG (haftungsbeschränkt)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
preshed-2.0.1/MANIFEST.in 0000664 0000000 0000000 00000000127 13360664062 0014710 0 ustar 00root root 0000000 0000000 recursive-include include *.h
include buildbot.json
include LICENSE
include README.rst
preshed-2.0.1/README.rst 0000664 0000000 0000000 00000002264 13360664062 0014645 0 ustar 00root root 0000000 0000000 preshed: Cython Hash Table for Pre-Hashed Keys
**********************************************
Simple but high performance Cython hash table mapping pre-randomized keys to void* values. Inspired by `Jeff Preshing <http://preshing.com/20130107/this-hash-table-is-faster-than-a-judy-array/>`_
.. image:: https://img.shields.io/travis/explosion/preshed/master.svg?style=flat-square&logo=travis
:target: https://travis-ci.org/explosion/preshed
:alt: Build Status
.. image:: https://img.shields.io/appveyor/ci/explosion/preshed/master.svg?style=flat-square&logo=appveyor
:target: https://ci.appveyor.com/project/explosion/preshed
:alt: Appveyor Build Status
.. image:: https://img.shields.io/pypi/v/preshed.svg?style=flat-square
:target: https://pypi.python.org/pypi/preshed
:alt: pypi Version
.. image:: https://img.shields.io/conda/vn/conda-forge/preshed.svg?style=flat-square
:target: https://anaconda.org/conda-forge/preshed
:alt: conda Version
.. image:: https://img.shields.io/badge/wheels-%E2%9C%93-4c1.svg?longCache=true&style=flat-square&logo=python&logoColor=white
:target: https://github.com/explosion/wheelwright/releases
:alt: Python wheels
preshed-2.0.1/bin/ 0000775 0000000 0000000 00000000000 13360664062 0013722 5 ustar 00root root 0000000 0000000 preshed-2.0.1/bin/cythonize.py 0000775 0000000 0000000 00000010462 13360664062 0016316 0 ustar 00root root 0000000 0000000 #!/usr/bin/env python
""" cythonize.py
Cythonize pyx files into C++ files as needed.
Usage: cythonize.py [root]
Checks pyx files to see if they have been changed relative to their
corresponding C++ files. If they have, then runs cython on these files to
recreate the C++ files.
Additionally, checks pxd files and setup.py if they have been changed. If
they have, rebuilds everything.
Change detection based on file hashes stored in JSON format.
For now, this script should be run by developers when changing Cython files
and the resulting C++ files checked in, so that end-users (and Python-only
developers) do not get the Cython dependencies.
Based upon:
https://raw.github.com/dagss/private-scipy-refactor/cythonize/cythonize.py
https://raw.githubusercontent.com/numpy/numpy/master/tools/cythonize.py
Note: this script does not check any of the dependent C++ libraries.
"""
from __future__ import print_function
import os
import sys
import json
import hashlib
import subprocess
import argparse
HASH_FILE = 'cythonize.json'
def process_pyx(fromfile, tofile):
    """Run Cython on `fromfile`, writing the generated source to `tofile`."""
    print('Processing %s' % fromfile)
    try:
        # Fail early on Cython versions known to be too old.
        from Cython.Compiler.Version import version as cython_version
        from distutils.version import LooseVersion
        if LooseVersion(cython_version) < LooseVersion('0.19'):
            raise Exception('Require Cython >= 0.19')
    except ImportError:
        # The package may be absent but a `cython` executable could still
        # exist on PATH; the subprocess calls below will find out.
        pass

    flags = ['--fast-fail']
    if tofile.endswith('.cpp'):
        flags += ['--cplus']

    try:
        try:
            ret = subprocess.call(['cython'] + flags + ['-o', tofile, fromfile])
            if ret != 0:
                raise Exception('Cython failed')
        except OSError:
            # Some installs provide the Cython package without a `cython`
            # executable on the path (see gh-2397); fall back to running the
            # compiler through the current interpreter.
            ret = subprocess.call(
                [sys.executable, '-c',
                 'import sys; from Cython.Compiler.Main import '
                 'setuptools_main as main; sys.exit(main())']
                + flags + ['-o', tofile, fromfile])
            if ret != 0:
                raise Exception('Cython failed')
    except OSError:
        raise OSError('Cython needs to be installed')
def preserve_cwd(path, func, *args):
    """Call ``func(*args)`` with `path` as the working directory, restoring
    the original working directory afterwards, even on error."""
    original = os.getcwd()
    os.chdir(path)
    try:
        func(*args)
    finally:
        os.chdir(original)
def load_hashes(filename):
    """Return the hash database stored as JSON in `filename`.

    Returns an empty dict when the file is missing or not valid JSON, so a
    first run (no hash file yet) simply rebuilds everything.
    """
    try:
        # Use a context manager so the file handle is closed promptly
        # (the original leaked the handle from `json.load(open(...))`).
        with open(filename) as f:
            return json.load(f)
    except (ValueError, IOError):
        return {}
def save_hashes(hash_db, filename):
    """Serialize the hash database `hash_db` to `filename` as JSON."""
    with open(filename, 'w') as f:
        json.dump(hash_db, f)
def get_hash(path):
return hashlib.md5(open(path, 'rb').read()).hexdigest()
def hash_changed(base, path, db):
    """True if the file's current hash differs from the one recorded in `db`
    (including when the file has never been recorded)."""
    full_path = os.path.normpath(os.path.join(base, path))
    return db.get(full_path) != get_hash(full_path)
def hash_add(base, path, db):
    """Record the current hash of ``base/path`` in `db`, keyed by the
    normalized path."""
    full_path = os.path.normpath(os.path.join(base, path))
    db[full_path] = get_hash(full_path)
def process(base, filename, db):
    """Regenerate the C++ for a Cython file when it (or its output) is stale."""
    root, ext = os.path.splitext(filename)
    if ext not in ('.pyx', '.cpp'):
        return
    cpp_path = os.path.join(base, root + '.cpp')
    # Rebuild when either the tracked hash changed or the output is missing.
    if hash_changed(base, filename, db) or not os.path.isfile(cpp_path):
        preserve_cwd(base, process_pyx, root + '.pyx', root + '.cpp')
        hash_add(base, root + '.cpp', db)
        hash_add(base, root + '.pyx', db)
def check_changes(root, db):
    """Return True when setup.py or any .pxd under `root` has changed.

    When a change is detected, `db` is replaced in place with the fresh
    hashes, which forces every .pyx to be regenerated by `process`.
    """
    changed = False
    new_db = {}
    hash_add('.', 'setup.py', new_db)
    if hash_changed('.', 'setup.py', db):
        changed = True
    for base, _, files in os.walk(root):
        for filename in files:
            if filename.endswith('.pxd'):
                hash_add(base, filename, new_db)
                if hash_changed(base, filename, db):
                    changed = True
    if changed:
        db.clear()
        db.update(new_db)
    return changed
def run(root):
    """Cythonize everything under `root`, persisting file hashes between runs."""
    db = load_hashes(HASH_FILE)
    try:
        check_changes(root, db)
        for base, _, files in os.walk(root):
            for filename in files:
                process(base, filename, db)
    finally:
        # Always persist whatever work completed, even after a failure.
        save_hashes(db, HASH_FILE)
if __name__ == '__main__':
    # Command-line entry point: cythonize everything under the given root.
    parser = argparse.ArgumentParser(
        description='Cythonize pyx files into C++ files as needed')
    parser.add_argument('root', help='root directory')
    run(parser.parse_args().root)
preshed-2.0.1/buildbot.json 0000664 0000000 0000000 00000000750 13360664062 0015653 0 ustar 00root root 0000000 0000000 {
"build": {
"sdist": [
"pip install -r requirements.txt",
"python setup.py sdist"
],
"install": [
"pip install -v source.tar.gz"
],
"wheel": [
"python untar.py source.tar.gz .",
"python setup.py bdist_wheel",
"python cpdist.py dist"
]
},
"test": {
"after": ["install", "wheel"],
"package": "preshed",
"args": "--tb=native"
}
}
preshed-2.0.1/fabfile.py 0000664 0000000 0000000 00000003142 13360664062 0015114 0 ustar 00root root 0000000 0000000 from fabric.api import local, run, lcd, cd, env
import os
from os import path
from os.path import exists as file_exists
from fabtools.python import virtualenv
PWD = path.dirname(__file__)
VENV_DIR = path.join(PWD, '.env')
DEV_ENV_DIR = path.join(PWD, '.denv')
def dev():
    """Create the persistent dev virtualenv and install requirements."""
    # Allow this to persist, since we aren't as rigorous about keeping state clean
    if not path.exists('.denv'):
        local('virtualenv .denv')
    with virtualenv(DEV_ENV_DIR):
        local('pip install -r requirements.txt')
def sdist():
    """Build a fresh source distribution into dist/."""
    if path.exists('dist/'):
        local('rm -rf dist/')
    local('mkdir dist')
    with virtualenv(VENV_DIR):
        local('python setup.py sdist')
def publish():
    """Register the package and upload the built sdist to PyPI via twine."""
    with virtualenv(VENV_DIR):
        local('python setup.py register')
        local('twine upload dist/*.tar.gz')
def setup():
    """Recreate the build virtualenv from scratch."""
    if path.exists('.env'):
        local('rm -rf .env')
    local('rm -rf *.egg')
    local('virtualenv .env')
def install():
    """Install the built sdist plus test dependencies into the virtualenv."""
    with virtualenv(VENV_DIR):
        local('pip install --upgrade setuptools')
        local('pip install dist/*.tar.gz')
        local('pip install pytest')
def make():
    """Compile the extension modules in the dev environment."""
    with virtualenv(DEV_ENV_DIR):
        with lcd(path.dirname(__file__)):
            local('python setup.py build')
def clean():
    """Remove build artifacts, both outside and inside the dev environment."""
    here = os.path.dirname(__file__)
    with lcd(here):
        local('python setup.py clean --all')
    with virtualenv(DEV_ENV_DIR):
        with lcd(here):
            local('python setup.py clean --all')
def test():
    """Run the test suite, stopping at the first failure."""
    with virtualenv(VENV_DIR):
        local('python -m pytest -x')
def travis():
    """Open the Travis CI dashboard for this project in a browser."""
    local('open https://travis-ci.org/spacy-io/preshed')
preshed-2.0.1/include/ 0000775 0000000 0000000 00000000000 13360664062 0014575 5 ustar 00root root 0000000 0000000 preshed-2.0.1/include/msvc9/ 0000775 0000000 0000000 00000000000 13360664062 0015636 5 ustar 00root root 0000000 0000000 preshed-2.0.1/include/msvc9/stdint.h 0000664 0000000 0000000 00000017645 13360664062 0017331 0 ustar 00root root 0000000 0000000 // ISO C9x compliant stdint.h for Microsoft Visual Studio
// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
//
// Copyright (c) 2006-2013 Alexander Chemeris
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the product nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
///////////////////////////////////////////////////////////////////////////////
#ifndef _MSC_VER // [
#error "Use this header only with Microsoft Visual C++ compilers!"
#endif // _MSC_VER ]
#ifndef _MSC_STDINT_H_ // [
#define _MSC_STDINT_H_
#if _MSC_VER > 1000
#pragma once
#endif
#if _MSC_VER >= 1600 // [
#include
#else // ] _MSC_VER >= 1600 [
#include
// For Visual Studio 6 in C++ mode and for many Visual Studio versions when
// compiling for ARM we should wrap include with 'extern "C++" {}'
// or compiler give many errors like this:
// error C2733: second C linkage of overloaded function 'wmemchr' not allowed
#ifdef __cplusplus
extern "C" {
#endif
# include
#ifdef __cplusplus
}
#endif
// Define _W64 macros to mark types changing their size, like intptr_t.
#ifndef _W64
# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300
# define _W64 __w64
# else
# define _W64
# endif
#endif
// 7.18.1 Integer types
// 7.18.1.1 Exact-width integer types
// Visual Studio 6 and Embedded Visual C++ 4 doesn't
// realize that, e.g. char has the same size as __int8
// so we give up on __intX for them.
#if (_MSC_VER < 1300)
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
#else
typedef signed __int8 int8_t;
typedef signed __int16 int16_t;
typedef signed __int32 int32_t;
typedef unsigned __int8 uint8_t;
typedef unsigned __int16 uint16_t;
typedef unsigned __int32 uint32_t;
#endif
typedef signed __int64 int64_t;
typedef unsigned __int64 uint64_t;
// 7.18.1.2 Minimum-width integer types
typedef int8_t int_least8_t;
typedef int16_t int_least16_t;
typedef int32_t int_least32_t;
typedef int64_t int_least64_t;
typedef uint8_t uint_least8_t;
typedef uint16_t uint_least16_t;
typedef uint32_t uint_least32_t;
typedef uint64_t uint_least64_t;
// 7.18.1.3 Fastest minimum-width integer types
typedef int8_t int_fast8_t;
typedef int16_t int_fast16_t;
typedef int32_t int_fast32_t;
typedef int64_t int_fast64_t;
typedef uint8_t uint_fast8_t;
typedef uint16_t uint_fast16_t;
typedef uint32_t uint_fast32_t;
typedef uint64_t uint_fast64_t;
// 7.18.1.4 Integer types capable of holding object pointers
#ifdef _WIN64 // [
typedef signed __int64 intptr_t;
typedef unsigned __int64 uintptr_t;
#else // _WIN64 ][
typedef _W64 signed int intptr_t;
typedef _W64 unsigned int uintptr_t;
#endif // _WIN64 ]
// 7.18.1.5 Greatest-width integer types
typedef int64_t intmax_t;
typedef uint64_t uintmax_t;
// 7.18.2 Limits of specified-width integer types
#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259
// 7.18.2.1 Limits of exact-width integer types
#define INT8_MIN ((int8_t)_I8_MIN)
#define INT8_MAX _I8_MAX
#define INT16_MIN ((int16_t)_I16_MIN)
#define INT16_MAX _I16_MAX
#define INT32_MIN ((int32_t)_I32_MIN)
#define INT32_MAX _I32_MAX
#define INT64_MIN ((int64_t)_I64_MIN)
#define INT64_MAX _I64_MAX
#define UINT8_MAX _UI8_MAX
#define UINT16_MAX _UI16_MAX
#define UINT32_MAX _UI32_MAX
#define UINT64_MAX _UI64_MAX
// 7.18.2.2 Limits of minimum-width integer types
#define INT_LEAST8_MIN INT8_MIN
#define INT_LEAST8_MAX INT8_MAX
#define INT_LEAST16_MIN INT16_MIN
#define INT_LEAST16_MAX INT16_MAX
#define INT_LEAST32_MIN INT32_MIN
#define INT_LEAST32_MAX INT32_MAX
#define INT_LEAST64_MIN INT64_MIN
#define INT_LEAST64_MAX INT64_MAX
#define UINT_LEAST8_MAX UINT8_MAX
#define UINT_LEAST16_MAX UINT16_MAX
#define UINT_LEAST32_MAX UINT32_MAX
#define UINT_LEAST64_MAX UINT64_MAX
// 7.18.2.3 Limits of fastest minimum-width integer types
#define INT_FAST8_MIN INT8_MIN
#define INT_FAST8_MAX INT8_MAX
#define INT_FAST16_MIN INT16_MIN
#define INT_FAST16_MAX INT16_MAX
#define INT_FAST32_MIN INT32_MIN
#define INT_FAST32_MAX INT32_MAX
#define INT_FAST64_MIN INT64_MIN
#define INT_FAST64_MAX INT64_MAX
#define UINT_FAST8_MAX UINT8_MAX
#define UINT_FAST16_MAX UINT16_MAX
#define UINT_FAST32_MAX UINT32_MAX
#define UINT_FAST64_MAX UINT64_MAX
// 7.18.2.4 Limits of integer types capable of holding object pointers
#ifdef _WIN64 // [
# define INTPTR_MIN INT64_MIN
# define INTPTR_MAX INT64_MAX
# define UINTPTR_MAX UINT64_MAX
#else // _WIN64 ][
# define INTPTR_MIN INT32_MIN
# define INTPTR_MAX INT32_MAX
# define UINTPTR_MAX UINT32_MAX
#endif // _WIN64 ]
// 7.18.2.5 Limits of greatest-width integer types
#define INTMAX_MIN INT64_MIN
#define INTMAX_MAX INT64_MAX
#define UINTMAX_MAX UINT64_MAX
// 7.18.3 Limits of other integer types
#ifdef _WIN64 // [
# define PTRDIFF_MIN _I64_MIN
# define PTRDIFF_MAX _I64_MAX
#else // _WIN64 ][
# define PTRDIFF_MIN _I32_MIN
# define PTRDIFF_MAX _I32_MAX
#endif // _WIN64 ]
#define SIG_ATOMIC_MIN INT_MIN
#define SIG_ATOMIC_MAX INT_MAX
#ifndef SIZE_MAX // [
# ifdef _WIN64 // [
# define SIZE_MAX _UI64_MAX
# else // _WIN64 ][
# define SIZE_MAX _UI32_MAX
# endif // _WIN64 ]
#endif // SIZE_MAX ]
// WCHAR_MIN and WCHAR_MAX are also defined in
#ifndef WCHAR_MIN // [
# define WCHAR_MIN 0
#endif // WCHAR_MIN ]
#ifndef WCHAR_MAX // [
# define WCHAR_MAX _UI16_MAX
#endif // WCHAR_MAX ]
#define WINT_MIN 0
#define WINT_MAX _UI16_MAX
#endif // __STDC_LIMIT_MACROS ]
// 7.18.4 Limits of other integer types
#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260
// 7.18.4.1 Macros for minimum-width integer constants
#define INT8_C(val) val##i8
#define INT16_C(val) val##i16
#define INT32_C(val) val##i32
#define INT64_C(val) val##i64
#define UINT8_C(val) val##ui8
#define UINT16_C(val) val##ui16
#define UINT32_C(val) val##ui32
#define UINT64_C(val) val##ui64
// 7.18.4.2 Macros for greatest-width integer constants
// These #ifndef's are needed to prevent collisions with .
// Check out Issue 9 for the details.
#ifndef INTMAX_C // [
# define INTMAX_C INT64_C
#endif // INTMAX_C ]
#ifndef UINTMAX_C // [
# define UINTMAX_C UINT64_C
#endif // UINTMAX_C ]
#endif // __STDC_CONSTANT_MACROS ]
#endif // _MSC_VER >= 1600 ]
#endif // _MSC_STDINT_H_ ]
preshed-2.0.1/preshed/ 0000775 0000000 0000000 00000000000 13360664062 0014604 5 ustar 00root root 0000000 0000000 preshed-2.0.1/preshed/__init__.pxd 0000664 0000000 0000000 00000000000 13360664062 0017046 0 ustar 00root root 0000000 0000000 preshed-2.0.1/preshed/__init__.py 0000664 0000000 0000000 00000000025 13360664062 0016712 0 ustar 00root root 0000000 0000000 from .about import *
preshed-2.0.1/preshed/about.py 0000664 0000000 0000000 00000000673 13360664062 0016276 0 ustar 00root root 0000000 0000000 # inspired from:
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'preshed'
__version__ = '2.0.1'
__summary__ = 'Cython hash table that trusts the keys are pre-hashed'
__uri__ = 'https://github.com/explosion/preshed'
__author__ = 'Matthew Honnibal'
__email__ = 'matt@explosion.ai'
__license__ = 'MIT'
__release__ = True
preshed-2.0.1/preshed/counter.pxd 0000664 0000000 0000000 00000000602 13360664062 0016776 0 ustar 00root root 0000000 0000000 from libc.stdint cimport int64_t
from cymem.cymem cimport Pool
from .maps cimport MapStruct
from .maps cimport map_init, map_get, map_set
from .maps cimport key_t
ctypedef int64_t count_t
cdef class PreshCounter:
    # Declaration only; see counter.pyx for the implementation.
    cdef Pool mem                 # pool that owns the map's allocations
    cdef MapStruct* c_map         # open-addressing hash map: key -> count
    cdef public object smoother   # optional GaleSmoother installed by smooth()
    cdef readonly count_t total   # running sum of all increments
    cpdef int inc(self, key_t key, count_t inc) except -1
preshed-2.0.1/preshed/counter.pyx 0000664 0000000 0000000 00000014001 13360664062 0017021 0 ustar 00root root 0000000 0000000 """Count occurrences of uint64-valued keys."""
from __future__ import division
cimport cython
from libc.math cimport log, exp, sqrt
cdef class PreshCounter:
    """Counter mapping pre-hashed uint64 keys to int64 counts.

    NOTE(review): several `<type>` casts appear to have been stripped from
    this file in transit (e.g. on mem.alloc results and on values passed
    to/from the void* map) -- compare against upstream before compiling.
    """
    def __init__(self, initial_size=8):
        # The table size must be a non-zero power of two, because the map
        # probes with `& (size - 1)` masking.
        assert initial_size != 0
        assert initial_size & (initial_size - 1) == 0
        self.mem = Pool()
        self.c_map = self.mem.alloc(1, sizeof(MapStruct))
        map_init(self.mem, self.c_map, initial_size)
        self.smoother = None
        self.total = 0

    property length:
        # Number of allocated cells (capacity), not number of distinct keys.
        def __get__(self):
            return self.c_map.length

    def __len__(self):
        # NOTE(review): returns the table capacity, not the number of
        # distinct keys -- confirm this is intentional.
        return self.c_map.length

    def __iter__(self):
        # Yield (key, count) for every occupied cell.
        cdef int i
        for i in range(self.c_map.length):
            if self.c_map.cells[i].key != 0:
                yield (self.c_map.cells[i].key, self.c_map.cells[i].value)

    def __getitem__(self, key_t key):
        # Missing keys come back as 0 (the map returns NULL).
        return map_get(self.c_map, key)

    cpdef int inc(self, key_t key, count_t inc) except -1:
        """Add `inc` to the count for `key`; return the new count."""
        cdef count_t c = map_get(self.c_map, key)
        c += inc
        map_set(self.mem, self.c_map, key, c)
        self.total += inc
        return c

    def prob(self, key_t key):
        """Relative frequency of `key`; Good-Turing smoothed after smooth()."""
        cdef GaleSmoother smoother
        cdef void* value = map_get(self.c_map, key)
        if self.smoother is not None:
            smoother = self.smoother
            r_star = self.smoother(value)
            return r_star / self.smoother.total
        elif value == NULL:
            # Unseen and unsmoothed: probability zero.
            return 0
        else:
            return value / self.total

    def smooth(self):
        # Install simple Good-Turing (Gale & Sampson) smoothing.
        self.smoother = GaleSmoother(self)
cdef class GaleSmoother:
    """Simple Good-Turing smoother (Gale & Sampson style).

    Uses exact Turing estimates for small, contiguous counts and a
    log-linear fit of the frequency-of-frequencies for larger counts,
    switching between the two at `cutoff`.
    """
    cdef Pool mem
    cdef count_t* Nr              # Nr[i]: number of keys seen sorted_r[i] times
    cdef double gradient          # slope of the log-linear Zr fit
    cdef double intercept         # intercept of the log-linear Zr fit
    cdef readonly count_t cutoff  # switch point between Turing and Gale estimates
    cdef count_t Nr0              # extrapolated number of unseen events
    cdef readonly double total    # total smoothed mass, used for normalization

    def __init__(self, PreshCounter counts):
        # Build the frequency-of-frequencies table from the raw counter.
        count_counts = PreshCounter()
        cdef double total = 0
        for _, count in counts:
            count_counts.inc(count, 1)
            total += count
        # If we have no items seen 1 or 2 times, this doesn't work. But, this
        # won't be true in real data...
        assert count_counts[1] != 0 and count_counts[2] != 0, "Cannot smooth your weird data"
        # Extrapolate Nr0 from Nr1 and Nr2.
        self.Nr0 = count_counts[1] + (count_counts[1] - count_counts[2])
        self.mem = Pool()
        cdef double[2] mb
        cdef int n_counts = 0
        for _ in count_counts:
            n_counts += 1
        sorted_r = count_counts.mem.alloc(n_counts, sizeof(count_t))
        self.Nr = self.mem.alloc(n_counts, sizeof(count_t))
        # Fill sorted_r / Nr with (count, count-of-count) in ascending order.
        for i, (count, count_count) in enumerate(sorted(count_counts)):
            sorted_r[i] = count
            self.Nr[i] = count_count
        _fit_loglinear_model(mb, sorted_r, self.Nr, n_counts)
        self.cutoff = _find_when_to_switch(sorted_r, self.Nr, mb[0], mb[1],
                                           n_counts)
        self.gradient = mb[0]
        self.intercept = mb[1]
        # Total smoothed mass, including mass reserved for unseen events.
        self.total = self(0) * self.Nr0
        for count, count_count in count_counts:
            self.total += self(count) * count_count

    def __call__(self, count_t r):
        """Smoothed (discounted) count r* for a raw count r."""
        if r == 0:
            # NOTE(review): Nr[1] indexes the second-smallest observed count;
            # the usual unseen-mass formula is N1/N0, i.e. Nr[0] -- confirm
            # against upstream.
            return self.Nr[1] / self.Nr0
        elif r < self.cutoff:
            # Below the cutoff, counts are contiguous so Nr[r-1] is N_r.
            return turing_estimate_of_r(r, self.Nr[r-1], self.Nr[r])
        else:
            return gale_estimate_of_r(r, self.gradient, self.intercept)

    def count_count(self, count_t r):
        """How many distinct keys were seen exactly r times (N_r)."""
        if r == 0:
            return self.Nr0
        else:
            return self.Nr[r-1]
@cython.cdivision(True)
cdef double turing_estimate_of_r(double r, double Nr, double Nr1) except -1:
    """Good-Turing adjusted count: r* = (r + 1) * N_{r+1} / N_r."""
    return ((r + 1) * Nr1) / Nr
@cython.cdivision(True)
cdef double gale_estimate_of_r(double r, double gradient, double intercept) except -1:
    """Smoothed Good-Turing estimate using the fitted log-linear model of Nr."""
    # Expected Nr values from the log-linear fit: E[Nr] = exp(m*log(r) + b).
    cdef double e_nr = exp(gradient * log(r) + intercept)
    cdef double e_nr1 = exp(gradient * log(r+1) + intercept)
    return (r + 1) * (e_nr1 / e_nr)
@cython.cdivision(True)
cdef void _fit_loglinear_model(double* output, count_t* sorted_r, count_t* Nr,
                               int length) except *:
    """Least-squares fit of log(Zr) against log(r).

    Writes the gradient to output[0] and the intercept to output[1].
    """
    cdef double x_mean = 0.0
    cdef double y_mean = 0.0
    cdef Pool mem = Pool()
    x = mem.alloc(length, sizeof(double))
    y = mem.alloc(length, sizeof(double))
    cdef int i
    for i in range(length):
        r = sorted_r[i]
        x[i] = log(r)
        # Zr smooths Nr over the gap to the neighbouring observed counts.
        y[i] = log(_get_zr(i, sorted_r, Nr[i], length))
        x_mean += x[i]
        y_mean += y[i]
    x_mean /= length
    y_mean /= length
    cdef double ss_xy = 0.0
    cdef double ss_xx = 0.0
    for i in range(length):
        x_dist = x[i] - x_mean
        y_dist = y[i] - y_mean
        # SS_xy = sum the product of the distances from the mean
        ss_xy += x_dist * y_dist
        # SS_xx = sum the squares of the x distance
        ss_xx += x_dist * x_dist
    # Gradient
    output[0] = ss_xy / ss_xx
    # Intercept
    output[1] = y_mean - output[0] * x_mean
@cython.cdivision(True)
cdef double _get_zr(int j, count_t* sorted_r, count_t Nr_j, int n_counts) except -1:
    """Zr: Nr_j averaged over the gap between the neighbouring counts."""
    cdef double r_i = sorted_r[j-1] if j >= 1 else 0
    cdef double r_j = sorted_r[j]
    # NOTE(review): for the last bucket some presentations of this estimator
    # use 2*r_j - r_i for the upper neighbour -- confirm against upstream.
    cdef double r_k = sorted_r[j+1] if (j+1) < n_counts else (2 * r_i - 1)
    return 2 * Nr_j / (r_k - r_i)
@cython.cdivision(True)
cdef double _variance(double r, double Nr, double Nr1) nogil:
    # ~95% confidence width (1.96 sigma) for the Turing estimate at count r.
    return 1.96 * sqrt((r+1)**2 * (Nr1 / Nr**2) * (1.0 + (Nr1 / Nr)))
@cython.cdivision(True)
cdef count_t _find_when_to_switch(count_t* sorted_r, count_t* Nr, double m, double b,
                                  int length) except -1:
    """Lowest count at which to switch from Turing to Gale estimates.

    Switch as soon as the observed counts stop being contiguous, or once the
    two estimators agree to within the Turing estimate's variance.
    """
    cdef int i
    cdef count_t r
    for i in range(length-1):
        r = sorted_r[i]
        if sorted_r[i+1] != r+1:
            return r
        g_r = gale_estimate_of_r(r, m, b)
        t_r = turing_estimate_of_r(r, Nr[i], Nr[i+1])
        if abs(t_r - g_r) <= _variance(r, Nr[i], Nr[i+1]):
            return r
    else:
        # Loop completed without finding a switch point.
        return length - 1
preshed-2.0.1/preshed/maps.pxd 0000664 0000000 0000000 00000002162 13360664062 0016262 0 ustar 00root root 0000000 0000000 from libc.stdint cimport uint64_t
from cymem.cymem cimport Pool
ctypedef uint64_t key_t
# One slot in the open-addressing table.
cdef struct Cell:
    key_t key     # 0 = empty, 1 = deleted tombstone (see maps.pyx)
    void* value

cdef struct MapStruct:
    Cell* cells                 # power-of-two sized array of slots
    void* value_for_empty_key   # side slot for the reserved key 0
    void* value_for_del_key     # side slot for the reserved key 1
    key_t length                # number of allocated cells
    key_t filled                # number of occupied cells
    bint is_empty_key_set       # whether key 0 currently holds a value
    bint is_del_key_set         # whether key 1 currently holds a value

# C-level API implemented in maps.pyx.
cdef void* map_bulk_get(const MapStruct* map_, const key_t* keys, void** values,
                        int n) nogil
cdef void* map_get(const MapStruct* map_, const key_t key) nogil
cdef void map_set(Pool mem, MapStruct* map_, key_t key, void* value) except *
cdef void map_init(Pool mem, MapStruct* pmap, size_t length) except *
cdef bint map_iter(const MapStruct* map_, int* i, key_t* key, void** value) nogil
cdef class PreshMap:
    # Declaration only; see maps.pyx for the implementation.
    cdef MapStruct* c_map   # the single underlying table
    cdef Pool mem           # owns c_map and its cell array
    cdef inline void* get(self, key_t key) nogil
    cdef void set(self, key_t key, void* value) except *
cdef class PreshMapArray:
    # Declaration only; see maps.pyx for the implementation.
    cdef Pool mem           # owns `maps` and every table's cells
    cdef MapStruct* maps    # `length` tables, selected by the first arg of get/set
    cdef size_t length
    cdef inline void* get(self, size_t i, key_t key) nogil
    cdef void set(self, size_t i, key_t key, void* value) except *
preshed-2.0.1/preshed/maps.pyx 0000664 0000000 0000000 00000015052 13360664062 0016311 0 ustar 00root root 0000000 0000000 # cython: infer_types=True
# cython: cdivision=True
#
cimport cython
DEF EMPTY_KEY = 0
DEF DELETED_KEY = 1
cdef class PreshMap:
    """Hash map that assumes keys come pre-hashed. Maps uint64_t --> uint64_t.
    Uses open addressing with linear probing.

    Usage
        map = PreshMap() # Create a table
        map = PreshMap(initial_size=1024) # Create with initial size (efficiency)
        map[key] = value # Set a value to a key
        value = map[key] # Get a value given a key
        for key, value in map.items(): # Iterate over items
        len(map) # Get number of inserted keys

    NOTE(review): several `<type>` casts (e.g. on mem.alloc results and on
    void* values returned to Python) appear to have been lost when this file
    was extracted -- compare against upstream before compiling.
    """
    def __init__(self, size_t initial_size=8):
        # Size must be power of two
        if initial_size == 0:
            initial_size = 8
        if initial_size & (initial_size - 1) != 0:
            # Round a non-power-of-two request up to the next power of two,
            # since _find_cell relies on `& (size - 1)` masking.
            power = 1
            while power < initial_size:
                power *= 2
            initial_size = power
        self.mem = Pool()
        self.c_map = self.mem.alloc(1, sizeof(MapStruct))
        map_init(self.mem, self.c_map, initial_size)

    property capacity:
        # Number of allocated cells, not the number of filled entries.
        def __get__(self):
            return self.c_map.length

    def items(self):
        # Yield (key, value) for every filled cell, including the special
        # empty/deleted keys when set.
        cdef key_t key
        cdef void* value
        cdef int i = 0
        while map_iter(self.c_map, &i, &key, &value):
            yield key, value

    def keys(self):
        for key, _ in self.items():
            yield key

    def values(self):
        for _, value in self.items():
            yield value

    def pop(self, key_t key, default=None):
        # Read, then clear; `default` is returned when the stored value is NULL.
        cdef void* value = map_get(self.c_map, key)
        map_clear(self.c_map, key)
        return value if value != NULL else default

    def __getitem__(self, key_t key):
        cdef void* value = map_get(self.c_map, key)
        return value if value != NULL else None

    def __setitem__(self, key_t key, size_t value):
        map_set(self.mem, self.c_map, key, value)

    def __delitem__(self, key_t key):
        map_clear(self.c_map, key)

    def __len__(self):
        return self.c_map.filled

    def __contains__(self, key_t key):
        # NOTE: a key stored with value 0 (NULL) is reported as absent.
        cdef void* value = map_get(self.c_map, key)
        return True if value != NULL else False

    def __iter__(self):
        for key in self.keys():
            yield key

    cdef inline void* get(self, key_t key) nogil:
        return map_get(self.c_map, key)

    cdef void set(self, key_t key, void* value) except *:
        map_set(self.mem, self.c_map, key, value)
cdef class PreshMapArray:
    """An array of hash tables that assume keys come pre-hashed. Each table
    uses open addressing with linear probing.
    """
    def __init__(self, size_t length, size_t initial_size=8):
        self.mem = Pool()
        self.length = length
        self.maps = self.mem.alloc(length, sizeof(MapStruct))
        for i in range(length):
            map_init(self.mem, &self.maps[i], initial_size)

    cdef inline void* get(self, size_t i, key_t key) nogil:
        # Look up `key` in the i-th table (no bounds check on i).
        return map_get(&self.maps[i], key)

    cdef void set(self, size_t i, key_t key, void* value) except *:
        # Insert/update `key` in the i-th table (no bounds check on i).
        map_set(self.mem, &self.maps[i], key, value)
cdef void map_init(Pool mem, MapStruct* map_, size_t length) except *:
    """Initialise `map_` with `length` zeroed cells allocated from `mem`."""
    map_.length = length
    map_.filled = 0
    map_.cells = mem.alloc(length, sizeof(Cell))
cdef void map_set(Pool mem, MapStruct* map_, key_t key, void* value) except *:
    """Insert or update `key`, resizing once the load factor nears 60%.

    The reserved EMPTY_KEY (0) and DELETED_KEY (1) are stored in dedicated
    side slots rather than in the cell array.

    NOTE(review): there is no `return` after handling the reserved keys, so
    control falls through into the normal insertion path with key 0 or 1 --
    confirm against upstream whether early returns were lost in extraction.
    """
    if key == EMPTY_KEY:
        map_.value_for_empty_key = value
        map_.is_empty_key_set = True
    elif key == DELETED_KEY:
        map_.value_for_del_key = value
        map_.is_del_key_set = True
    cdef Cell* cell
    cell = _find_cell(map_.cells, map_.length, key)
    if cell.key == EMPTY_KEY:
        # Claiming a fresh cell: record the key and bump the fill count.
        cell.key = key
        map_.filled += 1
    cell.value = value
    if (map_.filled + 1) * 5 >= (map_.length * 3):
        # Keep the load factor below 3/5 so linear probe chains stay short.
        _resize(mem, map_)
cdef void* map_get(const MapStruct* map_, const key_t key) nogil:
    """Return the value stored for `key`, or NULL if the key is absent."""
    if key == EMPTY_KEY:
        return map_.value_for_empty_key
    elif key == DELETED_KEY:
        return map_.value_for_del_key
    cdef Cell* cell = _find_cell(map_.cells, map_.length, key)
    return cell.value
cdef void* map_clear(MapStruct* map_, const key_t key) nogil:
    """Remove `key`, returning the value it held.

    Ordinary cells are tombstoned with DELETED_KEY rather than emptied, so
    probe chains running through them stay intact.

    NOTE(review): the else-branch assumes `key` is present; clearing an
    absent key tombstones an empty cell and decrements `filled` anyway --
    confirm callers guarantee presence.
    """
    if key == EMPTY_KEY:
        value = map_.value_for_empty_key if map_.is_empty_key_set else NULL
        map_.is_empty_key_set = False
        return value
    elif key == DELETED_KEY:
        value = map_.value_for_del_key if map_.is_del_key_set else NULL
        map_.is_del_key_set = False
        return value
    else:
        cell = _find_cell(map_.cells, map_.length, key)
        cell.key = DELETED_KEY
        map_.filled -= 1
        return cell.value
cdef void* map_bulk_get(const MapStruct* map_, const key_t* keys, void** values,
                        int n) nogil:
    """Look up `n` keys, writing each result into the `values` array.

    NOTE(review): declared as returning void* but has no return statement;
    the return value is unused by code visible here.
    """
    cdef int i
    for i in range(n):
        values[i] = map_get(map_, keys[i])
cdef bint map_iter(const MapStruct* map_, int* i, key_t* key, void** value) nogil:
    '''Iterate over the filled items, setting the current place in i, and the
    key and value. Return False when iteration finishes.

    The cursor `i` runs 0..length-1 over the cell array, then uses two extra
    positions (length, length+1) for the special empty/deleted key slots.
    '''
    cdef const Cell* cell
    while i[0] < map_.length:
        cell = &map_.cells[i[0]]
        i[0] += 1
        if cell[0].key != EMPTY_KEY and cell[0].key != DELETED_KEY:
            key[0] = cell[0].key
            value[0] = cell[0].value
            return True
    # Remember to check for cells keyed by the special empty and deleted keys
    if i[0] == map_.length:
        i[0] += 1
        if map_.is_empty_key_set:
            key[0] = EMPTY_KEY
            value[0] = map_.value_for_empty_key
            return True
    if i[0] == map_.length + 1:
        i[0] += 1
        if map_.is_del_key_set:
            key[0] = DELETED_KEY
            value[0] = map_.value_for_del_key
            return True
    return False
@cython.cdivision
cdef inline Cell* _find_cell(Cell* cells, const key_t size, const key_t key) nogil:
    # Modulo for powers-of-two via bitwise &
    cdef key_t i = (key & (size - 1))
    # Linear probing: stop at the key itself or at the first truly empty
    # cell. DELETED_KEY (1) tombstones do not terminate the scan, and are
    # not reused for insertion.
    while cells[i].key != 0 and cells[i].key != key:
        i = (i + 1) & (size - 1)
    return &cells[i]
cdef void _resize(Pool mem, MapStruct* map_) except *:
    """Double the cell array and re-insert every live entry."""
    cdef size_t new_size = map_.length * 2
    cdef Cell* old_cells = map_.cells
    cdef size_t old_size = map_.length
    map_.length = new_size
    map_.filled = 0  # map_set below recounts the live entries
    map_.cells = mem.alloc(new_size, sizeof(Cell))
    cdef size_t i
    cdef size_t slot
    for i in range(old_size):
        # Tombstones are dropped during the rehash, not carried over.
        if old_cells[i].key != EMPTY_KEY and old_cells[i].key != DELETED_KEY:
            map_set(mem, map_, old_cells[i].key, old_cells[i].value)
    mem.free(old_cells)
preshed-2.0.1/preshed/tests/ 0000775 0000000 0000000 00000000000 13360664062 0015746 5 ustar 00root root 0000000 0000000 preshed-2.0.1/preshed/tests/__init__.py 0000664 0000000 0000000 00000000000 13360664062 0020045 0 ustar 00root root 0000000 0000000 preshed-2.0.1/preshed/tests/test_counter.py 0000664 0000000 0000000 00000003720 13360664062 0021040 0 ustar 00root root 0000000 0000000 from __future__ import division
import pytest
from preshed.counter import PreshCounter
def test_count():
    """Incrementing keys accumulates counts; unseen keys read as zero."""
    c = PreshCounter()
    assert c[12] == 0
    c.inc(12, 1)
    assert c[12] == 1
    c.inc(14, 10)
    c.inc(9, 10)
    c.inc(12, 4)
    for key, expected in ((12, 5), (14, 10), (9, 10)):
        assert c[key] == expected
def test_unsmooth_prob():
    """Unsmoothed prob() is the raw relative frequency of the event."""
    c = PreshCounter()
    assert c.prob(12) == 0.0
    c.inc(12, 1)
    # With a single observation, event 12 takes all the mass.
    assert c.prob(12) == 1.0
    c.inc(14, 10)
    # Total is now 11; mass splits 10/11 vs 1/11.
    assert c.prob(14) == 10 / 11
    assert c.prob(12) == 1.0 / 11
def test_smooth_prob():
    """smooth() discounts observed counts and frees mass for unseen events."""
    p = PreshCounter()
    # Frequency-of-frequency distribution:
    #   10 keys with count 1, 6 with 2, 4 with 3, 2 with 5, 1 with 8.
    # Keys count down from `start` so they never collide across groups.
    for start, n_items, freq in ((100, 10, 1), (90, 6, 2), (80, 4, 3),
                                 (70, 2, 5), (60, 1, 8)):
        for offset in range(n_items):
            p.inc(start - offset, freq)
    assert p.total == (10 * 1) + (6 * 2) + (4 * 3) + (2 * 5) + (1 * 8)
    assert p.prob(100) == 1.0 / p.total
    assert p.prob(200) == 0.0
    assert p.prob(60) == 8.0 / p.total
    p.smooth()
    # Smoothed counts must be strictly discounted...
    assert p.smoother(1) < 1.0
    assert p.smoother(8) < 8.0
    # ...and an unseen event gets less mass than a seen one.
    assert p.prob(1000) < p.prob(100)
    for event, count in reversed(sorted(p, key=lambda it: it[1])):
        assert p.smoother(count) < count
import os
def test_large_freqs():
    """Smoothing gives unseen events a small but positive probability.

    Runs only when TEST_FILE_LOC names a file whose lines start with a
    whitespace-separated frequency; otherwise the test silently no-ops.
    """
    if 'TEST_FILE_LOC' in os.environ:
        loc = os.environ['TEST_FILE_LOC']
    else:
        return None
    counts = PreshCounter()
    # Fix: use a context manager so the file handle is closed instead of
    # being leaked by a bare open() in the loop header.
    with open(loc) as f:
        for i, line in enumerate(f):
            line = line.strip()
            if not line:
                continue
            freq = int(line.split()[0])
            counts.inc(i + 1, freq)
    # One past the largest assigned key: guaranteed out-of-vocabulary.
    oov = i + 2
    assert counts.prob(oov) == 0.0
    assert counts.prob(1) < 0.1
    counts.smooth()
    assert counts.prob(oov) > 0
    assert counts.prob(oov) < counts.prob(i)
preshed-2.0.1/preshed/tests/test_hashing.py 0000664 0000000 0000000 00000001615 13360664062 0021003 0 ustar 00root root 0000000 0000000 import pytest
from preshed.maps import PreshMap
import random
def test_insert():
    """Missing keys read as None; inserted entries are independent."""
    table = PreshMap()
    assert table[1] is None
    table[1] = 5
    assert table[1] == 5
    table[2] = 6
    # The second insert must not disturb the first.
    assert table[1] == 5
    assert table[2] == 6
def test_resize():
    """An early entry survives the table growing far past its initial size."""
    table = PreshMap(4)
    table[4] = 12
    # Force several resizes of the 4-slot table.
    for key in range(10, 100):
        table[key] = int(key * (random.random() + 1))
    assert table[4] == 12
def test_zero_key():
    """Key 0 must behave like any other key, before and after many inserts."""
    table = PreshMap()
    table[0] = 6
    table[5] = 12
    assert table[0] == 6
    assert table[5] == 12
    # NOTE(review): these values are floats — presumably PreshMap coerces
    # them; the assertions only rely on keys 0 and 5 surviving the churn.
    for key in range(500, 1000):
        table[key] = key * random.random()
    assert table[0] == 6
    assert table[5] == 12
def test_iter():
    """items() visits every inserted pair exactly once (checked via sums)."""
    table = PreshMap()
    keys = list(range(56, 24, -3))
    for key in keys:
        table[key] = key * 2
    key_sum = sum(keys)
    val_sum = sum(key * 2 for key in keys)
    # Subtract everything items() yields; both sums must return to zero.
    for key, value in table.items():
        key_sum -= key
        val_sum -= value
    assert key_sum == 0
    assert val_sum == 0
preshed-2.0.1/preshed/tests/test_pop.py 0000664 0000000 0000000 00000000317 13360664062 0020156 0 ustar 00root root 0000000 0000000 from ..maps import PreshMap
def test_pop1():
    """pop() removes the given key without disturbing other entries."""
    table = PreshMap()
    for key, value in ((10, 20), (30, 25)):
        table[key] = value
    assert table[10] == 20
    assert table[30] == 25
    table.pop(30)
    assert table[10] == 20
preshed-2.0.1/requirements.txt 0000664 0000000 0000000 00000000050 13360664062 0016431 0 ustar 00root root 0000000 0000000 cymem>=2.0.2,<2.1.0
cython>=0.28
pytest
preshed-2.0.1/setup.py 0000775 0000000 0000000 00000011324 13360664062 0014670 0 ustar 00root root 0000000 0000000 #!/usr/bin/env python
from __future__ import print_function
import os
import subprocess
import sys
import contextlib
from distutils.command.build_ext import build_ext
from distutils.sysconfig import get_python_inc
from distutils import ccompiler, msvccompiler
try:
from setuptools import Extension, setup
except ImportError:
from distutils.core import Extension, setup
# Python packages included in the distribution.
PACKAGES = [
    'preshed',
    'preshed.tests']
# Cython extension modules as dotted names; each maps to a generated C++
# source file at the corresponding path (see setup_package below).
MOD_NAMES = [
    'preshed.maps',
    'preshed.counter']
# By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options
# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
# Per-compiler-type flag tables: 'msvc' for Microsoft's compiler, 'other'
# for everything else (gcc/clang-style flags).
compile_options = {'msvc' : ['/Ox', '/EHsc'],
                   'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']}
link_options = {'msvc' : [],
                'other' : []}
class build_ext_options:
    # Mixin that applies the per-compiler-type flag tables once the actual
    # compiler is known (i.e. from inside build_extensions).
    def build_options(self):
        compiler_type = self.compiler.compiler_type
        for ext in self.extensions:
            ext.extra_compile_args = compile_options.get(
                compiler_type, compile_options['other'])
            ext.extra_link_args = link_options.get(
                compiler_type, link_options['other'])
class build_ext_subclass(build_ext, build_ext_options):
    """distutils build_ext that injects per-compiler flags before building."""
    def build_extensions(self):
        # self.compiler is finalized by now, so the flag tables can be applied.
        build_ext_options.build_options(self)
        build_ext.build_extensions(self)
def generate_cython(root, source):
    """Run the bundled bin/cythonize.py helper over *source*.

    Raises RuntimeError if the subprocess exits non-zero.
    """
    print('Cythonizing sources')
    script = os.path.join(root, 'bin', 'cythonize.py')
    status = subprocess.call([sys.executable, script, source])
    if status != 0:
        raise RuntimeError('Running cythonize failed')
def is_source_release(path):
    """Return True if *path* looks like an sdist (contains PKG-INFO)."""
    pkg_info = os.path.join(path, 'PKG-INFO')
    return os.path.exists(pkg_info)
def clean(path):
    """Delete generated build artefacts for every extension module."""
    artefact_exts = ('.so', '.html', '.cpp', '.c')
    for name in MOD_NAMES:
        base = os.path.join(path, name.replace('.', '/'))
        for ext in artefact_exts:
            target = base + ext
            if os.path.exists(target):
                os.unlink(target)
@contextlib.contextmanager
def chdir(new_dir):
    """Temporarily cd into *new_dir* and put it at the front of sys.path.

    Both the working directory and sys.path are restored on exit.

    Fix: the original wrapped the setup in the ``try`` as well, so a
    failing ``os.chdir(new_dir)`` still ran ``del sys.path[0]`` in the
    ``finally`` and removed an unrelated sys.path entry.  Cleanup now only
    runs once setup has fully succeeded.
    """
    old_dir = os.getcwd()
    os.chdir(new_dir)
    sys.path.insert(0, new_dir)
    try:
        yield
    finally:
        del sys.path[0]
        os.chdir(old_dir)
def setup_package():
    """Collect metadata, cythonize when needed, and invoke setup().

    Handles three modes: ``setup.py clean`` (remove generated files),
    building from a source release (pre-generated .cpp files used as-is),
    and building from a checkout (Cython is run first).
    """
    root = os.path.abspath(os.path.dirname(__file__))
    # 'clean' is intercepted here so Cython-generated sources are removed too.
    if len(sys.argv) > 1 and sys.argv[1] == 'clean':
        return clean(root)
    with chdir(root):
        # about.py holds plain assignments (__title__, __version__, ...);
        # exec'ing it avoids importing the not-yet-built package.
        with open(os.path.join(root, 'preshed', 'about.py')) as f:
            about = {}
            exec(f.read(), about)
        with open(os.path.join(root, 'README.rst')) as f:
            readme = f.read()
        include_dirs = [
            get_python_inc(plat_specific=True),
        ]
        # MSVC build version 9 (VS 2008, used for Python 2.7 on Windows)
        # gets the bundled compatibility headers — presumably stdint.h
        # replacements; confirm against include/msvc9.
        if (ccompiler.new_compiler().compiler_type == 'msvc'
            and msvccompiler.get_build_version() == 9):
            include_dirs.append(os.path.join(root, 'include', 'msvc9'))
        ext_modules = []
        for mod_name in MOD_NAMES:
            # Each extension compiles from its Cython-generated C++ source.
            mod_path = mod_name.replace('.', '/') + '.cpp'
            ext_modules.append(
                Extension(mod_name, [mod_path],
                    language='c++', include_dirs=include_dirs))
        # Source releases (identified by PKG-INFO) ship the .cpp files;
        # only regenerate from .pyx when building from a checkout.
        if not is_source_release(root):
            generate_cython(root, 'preshed')
        setup(
            name=about['__title__'],
            zip_safe=False,
            packages=PACKAGES,
            package_data={'': ['*.pyx', '*.pxd']},
            description=about['__summary__'],
            long_description=readme,
            author=about['__author__'],
            author_email=about['__email__'],
            version=about['__version__'],
            url=about['__uri__'],
            license=about['__license__'],
            ext_modules=ext_modules,
            setup_requires=['wheel>=0.32.0,<0.33.0'],
            install_requires=['cymem>=2.0.2,<2.1.0'],
            classifiers=[
                'Environment :: Console',
                'Intended Audience :: Developers',
                'Intended Audience :: Science/Research',
                'License :: OSI Approved :: MIT License',
                'Operating System :: POSIX :: Linux',
                'Operating System :: MacOS :: MacOS X',
                'Operating System :: Microsoft :: Windows',
                'Programming Language :: Cython',
                'Programming Language :: Python :: 2.7',
                'Programming Language :: Python :: 3.5',
                'Programming Language :: Python :: 3.6',
                'Programming Language :: Python :: 3.7',
                'Topic :: Scientific/Engineering'],
            cmdclass = {
                'build_ext': build_ext_subclass},
        )
# Script entry point: run the full build/installation flow.
if __name__ == '__main__':
    setup_package()
| |