pax_global_header00006660000000000000000000000064136075576150014531gustar00rootroot0000000000000052 comment=27b52d57067967415830c47ab3a83cd84b6f8887 tinyalign-0.2/000077500000000000000000000000001360755761500133705ustar00rootroot00000000000000tinyalign-0.2/.gitignore000066400000000000000000000002201360755761500153520ustar00rootroot00000000000000__pycache__ /.cache/ /venv/ /build/ /.pytest_cache/ /MANIFEST /dist/ /src/*/_*.c /src/*/*.so /src/*.egg-info/ /src/tinyalign/_version.py /.tox/ tinyalign-0.2/.travis.yml000066400000000000000000000012321360755761500154770ustar00rootroot00000000000000language: python dist: xenial cache: directories: - $HOME/.cache/pip python: - "3.6" - "3.7" - "3.8-dev" install: - pip install . script: - python setup.py --version # Detect encoding problems - python -m pytest env: global: - TWINE_USERNAME=marcelm jobs: include: - stage: deploy services: - docker python: "3.6" install: python3 -m pip install Cython setuptools_scm twine if: tag IS present script: - | python3 setup.py sdist && ./buildwheels.sh && ls -l dist/ && python3 -m twine upload dist/* allowed_failures: - python: "3.8-dev" tinyalign-0.2/LICENSE000066400000000000000000000021041360755761500143720ustar00rootroot00000000000000Copyright (c) 2019-2020 Marcel Martin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. tinyalign-0.2/MANIFEST.in000066400000000000000000000001361360755761500151260ustar00rootroot00000000000000include README.md include LICENSE include src/tinyalign/*.c include src/tinyalign/_version.py tinyalign-0.2/README.md000066400000000000000000000022051360755761500146460ustar00rootroot00000000000000[![Travis](https://travis-ci.org/marcelm/tinyalign.svg?branch=master)](https://travis-ci.org/marcelm/tinyalign) # tinyalign A small Python module providing edit distance (aka Levenshtein distance, that is, counting insertions, deletions and substitutions) and Hamming distance computation. Its main purpose is to speed up computation of edit distance by allowing to specify a maximum number of differences `maxdiff` (banding). If that parameter is provided, the returned edit distance is anly accurate up to `maxdiff`. That is, if the actual edit distance is higher than `maxdiff`, a value larger than `maxdiff` is returned, but not necessarily the actual edit distance. For computing regular edit distances or if your *maxdiff* is less than 4, you should prefer [https://github.com/fujimotos/polyleven](polyleven), as that is faster in that case. When `maxdiff` is 4 or more, but not too close to the length of the shortest string, this module is faster. ``` >>> from tinyalign import edit_distance, hamming_distance >>> edit_distance("banana", "ananas") 2 >>> hamming_distance("hello", "yello") 1 >>> edit_distance("hello", "world", maxdiff=2) 3 ``` tinyalign-0.2/buildwheels.sh000077500000000000000000000024111360755761500162340ustar00rootroot00000000000000#!/bin/bash # # Build manylinux1 wheels. Based on the example at # # # Run this within the repository root: # docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/buildwheels.sh # # The wheels will be put into the wheelhouse/ subdirectory. # # For interactive tests: # docker run -it -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /bin/bash set -xeuo pipefail # For convenience, if this script is called from outside of a docker container, # it starts a container and runs itself inside of it. if ! grep -q docker /proc/1/cgroup; then # We are not inside a container docker pull quay.io/pypa/manylinux1_x86_64 exec docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/$0 fi # Strip binaries (copied from multibuild) STRIP_FLAGS=${STRIP_FLAGS:-"-Wl,-strip-all"} export CFLAGS="${CFLAGS:-$STRIP_FLAGS}" export CXXFLAGS="${CXXFLAGS:-$STRIP_FLAGS}" for PYBIN in /opt/python/cp3[67]-*/bin; do ${PYBIN}/pip wheel /io/ -w wheelhouse/ done # Bundle external shared libraries into the wheels for whl in wheelhouse/tinyalign-*.whl; do auditwheel repair "$whl" -w repaired/ done # Created files are owned by root, so fix permissions. chown -R --reference=/io/setup.py repaired/ mv repaired/*.whl /io/dist/ tinyalign-0.2/pyproject.toml000066400000000000000000000001161360755761500163020ustar00rootroot00000000000000[build-system] requires = ["setuptools", "wheel", "setuptools_scm", "Cython"] tinyalign-0.2/setup.py000066400000000000000000000054451360755761500151120ustar00rootroot00000000000000import sys import os.path from setuptools import setup, Extension, find_packages from distutils.command.sdist import sdist as _sdist from distutils.command.build_ext import build_ext as _build_ext if sys.version_info[:2] < (3, 4): sys.stdout.write("Python 3.4 or later is required\n") sys.exit(1) def no_cythonize(extensions, **_ignore): """Change .pyx to .c or .cpp (copied from Cython documentation)""" for extension in extensions: sources = [] for sfile in extension.sources: path, ext = os.path.splitext(sfile) if ext in (".pyx", ".py"): if extension.language == "c++": ext = ".cpp" else: ext = ".c" sfile = path + ext sources.append(sfile) extension.sources[:] = sources extensions = [ Extension("tinyalign._core", sources=["src/tinyalign/_core.pyx"]), ] class BuildExt(_build_ext): def run(self): # If we encounter a PKG-INFO file, then this is likely a .tar.gz/.zip # file retrieved from PyPI that already includes the pre-cythonized # extension modules, and then we do not need to run cythonize(). if os.path.exists("PKG-INFO"): no_cythonize(extensions) else: # Otherwise, this is a 'developer copy' of the code, and then the # only sensible thing is to require Cython to be installed. from Cython.Build import cythonize self.extensions = cythonize(self.extensions) super().run() class SDist(_sdist): def run(self): # Make sure the compiled Cython files in the distribution are up-to-date from Cython.Build import cythonize cythonize(extensions) super().run() with open("README.md", encoding="utf-8") as f: long_description = f.read() setup( name="tinyalign", setup_requires=["setuptools_scm"], # Support pip versions that don't know about pyproject.toml use_scm_version={"write_to": "src/tinyalign/_version.py"}, author="Marcel Martin", author_email="marcel.martin@scilifelab.se", url="https://github.com/marcelm/tinyalign/", description="Fast banded edit distance", long_description=long_description, long_description_content_type="text/markdown", license="MIT", packages=find_packages("src"), package_dir={"": "src"}, ext_modules=extensions, cmdclass={"build_ext": BuildExt, "sdist": SDist}, classifiers=[ "Development Status :: 3 - Alpha", "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", "Natural Language :: English", "Programming Language :: Cython", "Programming Language :: Python :: 3", "Topic :: Scientific/Engineering :: Bio-Informatics", ], ) tinyalign-0.2/src/000077500000000000000000000000001360755761500141575ustar00rootroot00000000000000tinyalign-0.2/src/tinyalign/000077500000000000000000000000001360755761500161555ustar00rootroot00000000000000tinyalign-0.2/src/tinyalign/__init__.py000066400000000000000000000001401360755761500202610ustar00rootroot00000000000000from ._core import edit_distance, hamming_distance from ._version import version as __version__ tinyalign-0.2/src/tinyalign/_core.pyx000066400000000000000000000075241360755761500200160ustar00rootroot00000000000000# cython: language_level=3 from cpython.mem cimport PyMem_Malloc, PyMem_Free import cython @cython.wraparound(False) @cython.boundscheck(False) def edit_distance(s, t, int maxdiff=-1): """ Return the edit distance between the strings s and t. The edit distance is the sum of the numbers of insertions, deletions, and mismatches that is minimally necessary to transform one string into the other. If maxdiff is not -1, then a banded alignment is performed. In that case, the true edit distance is returned if and only if it is maxdiff or less. Otherwise, a value is returned that is guaranteed to be greater than maxdiff, but which is not necessarily the true edit distance. """ cdef: unsigned int m = len(s) # index: i unsigned int n = len(t) # index: j int e = maxdiff unsigned int i, j, start, stop, c, smallest unsigned int prev bint match bytes s_bytes, t_bytes char* sv char* tv # Return early if string lengths are too different cdef unsigned int absdiff = m - n if m > n else n - m if e != -1 and absdiff > e: return absdiff s_bytes = s.encode() if isinstance(s, unicode) else s t_bytes = t.encode() if isinstance(t, unicode) else t sv = s_bytes tv = t_bytes # Skip identical prefixes while m > 0 and n > 0 and sv[0] == tv[0]: sv += 1 tv += 1 m -= 1 n -= 1 # Skip identical suffixes while m > 0 and n > 0 and sv[m-1] == tv[n-1]: m -= 1 n -= 1 cdef unsigned int result cdef unsigned int* costs = PyMem_Malloc((m + 1) * sizeof(unsigned int)) if not costs: raise MemoryError() with nogil: for i in range(m + 1): costs[i] = i if e == -1: # Regular (unbanded) global alignment prev = 0 for j in range(1, n + 1): prev = costs[0] costs[0] += 1 for i in range(1, m+1): match = sv[i-1] == tv[j-1] c = 1 + min( prev - match, costs[i], costs[i-1], ) prev = costs[i] costs[i] = c result = costs[m] else: # Banded alignment smallest = 0 for j in range(1, n + 1): stop = min(j + e + 1, m + 1) if j <= e: prev = costs[0] costs[0] += 1 smallest = costs[0] start = 1 else: start = j - e prev = costs[start - 1] smallest = maxdiff + 1 for i in range(start, stop): match = sv[i-1] == tv[j-1] c = 1 + min( prev - match, costs[i], costs[i-1], ) prev = costs[i] costs[i] = c smallest = min(smallest, c) if smallest > maxdiff: break if smallest > maxdiff: result = smallest else: result = costs[m] PyMem_Free(costs) return result def hamming_distance(unicode s, unicode t): """ Compute hamming distance between two strings. If they do not have the same length, an IndexError is raised. Return the number of differences between the strings. """ cdef Py_ssize_t m = len(s) cdef Py_ssize_t n = len(t) if m != n: raise IndexError("sequences must have the same length") cdef Py_ssize_t e = 0 cdef Py_ssize_t i for i in range(m): if s[i] != t[i]: e += 1 return e tinyalign-0.2/tests/000077500000000000000000000000001360755761500145325ustar00rootroot00000000000000tinyalign-0.2/tests/test_core.py000066400000000000000000000054321360755761500170770ustar00rootroot00000000000000from tinyalign import edit_distance, hamming_distance import random import pytest STRING_PAIRS = [ ('', ''), ('', 'A'), ('A', 'A'), ('AB', ''), ('AB', 'ABC'), ('TGAATCCC', 'CCTGAATC'), ('ANANAS', 'BANANA'), ('SISSI', 'MISSISSIPPI'), ('GGAATCCC', 'TGAGGGATAAATATTTAGAATTTAGTAGTAGTGTT'), ('TCTGTTCCCTCCCTGTCTCA', 'TTTTAGGAAATACGCC'), ('TGAGACACGCAACATGGGAAAGGCAAGGCACACAGGGGATAGG', 'AATTTATTTTATTGTGATTTTTTGGAGGTTTGGAAGCCACTAAGCTATACTGAGACACGCAACAGGGGAAAGGCAAGGCACA'), ('TCCATCTCATCCCTGCGTGTCCCATCTGTTCCCTCCCTGTCTCA', 'TTTTAGGAAATACGCCTGGTGGGGTTTGGAGTATAGTGAAAGATAGGTGAGTTGGTCGGGTG'), ('A', 'TCTGCTCCTGGCCCATGATCGTATAACTTTCAAATTT'), ('GCGCGGACT', 'TAAATCCTGG'), ] def py_edit_distance(s, t): """ Pure-Python edit distance """ m = len(s) n = len(t) costs = list(range(m + 1)) for j in range(1, n + 1): prev = costs[0] costs[0] += 1 for i in range(1, m + 1): c = min( prev + int(s[i-1] != t[j-1]), costs[i] + 1, costs[i-1] + 1, ) prev = costs[i] costs[i] = c return costs[-1] def random_string(): return ''.join(random.choice('AC') for _ in range(random.randint(0, 20))) RANDOM_STRING_PAIRS = [(random_string(), random_string()) for _ in range(10000)] def test_edit_distance(): assert edit_distance('', '') == 0 assert edit_distance('', 'A') == 1 assert edit_distance('A', 'B') == 1 assert edit_distance('A', 'A') == 0 assert edit_distance('A', 'AB') == 1 assert edit_distance('BA', 'AB') == 2 for s, t in STRING_PAIRS + RANDOM_STRING_PAIRS: assert edit_distance(s, '') == len(s) assert edit_distance('', s) == len(s) assert edit_distance(s, t) == edit_distance(t, s) assert edit_distance(s, t) == py_edit_distance(s, t) def assert_banded(s, t, maxdiff): banded_dist = edit_distance(s, t, maxdiff=maxdiff) true_dist = edit_distance(s, t) if true_dist > maxdiff: assert banded_dist > maxdiff else: assert banded_dist == true_dist def test_edit_distance_banded(): for maxdiff in range(5): assert_banded('ABC', '', maxdiff) for s, t in STRING_PAIRS: assert_banded(s, '', maxdiff) assert_banded('', s, maxdiff) assert_banded(s, t, maxdiff) assert_banded(t, s, maxdiff) def test_hamming_distance(): assert hamming_distance('', '') == 0 assert hamming_distance('A', 'A') == 0 assert hamming_distance('HELLO', 'HELLO') == 0 assert hamming_distance('ABC', 'DEF') == 3 assert hamming_distance('ABCXDEF', 'ABCYDEF') == 1 def test_hamming_distance_incorrect_length(): with pytest.raises(IndexError): hamming_distance('A', 'BC')