pax_global_header00006660000000000000000000000064141474205340014516gustar00rootroot0000000000000052 comment=8a0e148f28f3fe641cf92663644dba1643fe9683 tinyalign-0.2.1/000077500000000000000000000000001414742053400135145ustar00rootroot00000000000000tinyalign-0.2.1/.github/000077500000000000000000000000001414742053400150545ustar00rootroot00000000000000tinyalign-0.2.1/.github/workflows/000077500000000000000000000000001414742053400171115ustar00rootroot00000000000000tinyalign-0.2.1/.github/workflows/ci.yml000066400000000000000000000035311414742053400202310ustar00rootroot00000000000000name: CI on: [push] jobs: test: timeout-minutes: 5 runs-on: ubuntu-latest strategy: matrix: python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: python -m pip install pytest Cython - name: Install run: python -m pip install -e . - name: Test run: python -m pytest deploy: timeout-minutes: 10 runs-on: ubuntu-latest needs: [test] if: startsWith(github.ref, 'refs/tags') steps: - uses: actions/checkout@v2 with: fetch-depth: 0 # required for setuptools_scm - name: Set up Python uses: actions/setup-python@v2 with: python-version: 3.7 - name: Build wheels uses: joerick/cibuildwheel@v2.2.2 with: output-dir: dist/ env: CIBW_BUILD: "cp3*-*" CIBW_SKIP: "*-manylinux_i686 *musllinux*" # CIBW_MANYLINUX_X86_64_IMAGE: manylinux1 CIBW_ENVIRONMENT: "CFLAGS=-g0" CIBW_TEST_REQUIRES: "pytest" CIBW_TEST_COMMAND: | cd {project} pytest - name: Make sdist run: | python -m pip install build python -m build --sdist ls -l dist/ - name: Publish dev release to test PyPI if: contains(github.ref, '.dev') uses: pypa/gh-action-pypi-publish@v1.4.2 with: user: __token__ password: ${{ secrets.test_pypi_password }} repository_url: https://test.pypi.org/legacy/ - name: Publish to PyPI if: "!contains(github.ref, '.dev')" uses: pypa/gh-action-pypi-publish@v1.4.2 with: user: __token__ password: ${{ secrets.pypi_password }} tinyalign-0.2.1/.gitignore000066400000000000000000000002201414742053400154760ustar00rootroot00000000000000__pycache__ /.cache/ /venv/ /build/ /.pytest_cache/ /MANIFEST /dist/ /src/*/_*.c /src/*/*.so /src/*.egg-info/ /src/tinyalign/_version.py /.tox/ tinyalign-0.2.1/LICENSE000066400000000000000000000021041414742053400145160ustar00rootroot00000000000000Copyright (c) 2019-2020 Marcel Martin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. tinyalign-0.2.1/MANIFEST.in000066400000000000000000000001361414742053400152520ustar00rootroot00000000000000include README.md include LICENSE include src/tinyalign/*.c include src/tinyalign/_version.py tinyalign-0.2.1/README.md000066400000000000000000000024211414742053400147720ustar00rootroot00000000000000[![CI](https://github.com/marcelm/tinyalign/actions/workflows/ci.yml/badge.svg)](https://github.com/marcelm/tinyalign/actions/workflows/ci.yml) [![PyPI](https://img.shields.io/pypi/v/tinyalign.svg?branch=main)](https://pypi.python.org/pypi/tinyalign) # tinyalign A small Python module providing edit distance (aka Levenshtein distance, that is, counting insertions, deletions and substitutions) and Hamming distance computation. Its main purpose is to speed up computation of edit distance by allowing to specify a maximum number of differences `maxdiff` (banding). If that parameter is provided, the returned edit distance is anly accurate up to `maxdiff`. That is, if the actual edit distance is higher than `maxdiff`, a value larger than `maxdiff` is returned, but not necessarily the actual edit distance. For computing regular edit distances or if your *maxdiff* is less than 4, you should prefer [https://github.com/fujimotos/polyleven](polyleven), as that is faster in that case. When `maxdiff` is 4 or more, but not too close to the length of the shortest string, this module is faster. ``` >>> from tinyalign import edit_distance, hamming_distance >>> edit_distance("banana", "ananas") 2 >>> hamming_distance("hello", "yello") 1 >>> edit_distance("hello", "world", maxdiff=2) 3 ``` tinyalign-0.2.1/buildwheels.sh000077500000000000000000000024521414742053400163650ustar00rootroot00000000000000#!/bin/bash # # Build manylinux wheels. Based on the example at # # # It is best to run this in a fresh clone of the repository! # # Run this within the repository root: # ./buildwheels.sh # # The wheels will be put into the dist/ subdirectory. set -xeuo pipefail manylinux=quay.io/pypa/manylinux2010_x86_64 # For convenience, if this script is called from outside of a docker container, # it starts a container and runs itself inside of it. if ! grep -q docker /proc/1/cgroup; then # We are not inside a container docker pull ${manylinux} exec docker run --rm -v $(pwd):/io ${manylinux} /io/$0 fi if ! test -d /io/dist; then mkdir /io/dist chown --reference=/io/setup.py /io/dist fi # Strip binaries (copied from multibuild) STRIP_FLAGS=${STRIP_FLAGS:-"-Wl,-strip-all"} export CFLAGS="${CFLAGS:-$STRIP_FLAGS}" export CXXFLAGS="${CXXFLAGS:-$STRIP_FLAGS}" for PYBIN in /opt/python/cp3[678]-*/bin; do ${PYBIN}/pip wheel --no-deps /io/ -w wheelhouse/ done ls wheelhouse/ # Bundle external shared libraries into the wheels for whl in wheelhouse/*.whl; do auditwheel repair "$whl" --plat manylinux1_x86_64 -w repaired/ done # Created files are owned by root, so fix permissions. chown -R --reference=/io/setup.py repaired/ mv repaired/*.whl /io/dist/ tinyalign-0.2.1/pyproject.toml000066400000000000000000000001161414742053400164260ustar00rootroot00000000000000[build-system] requires = ["setuptools", "wheel", "setuptools_scm", "Cython"] tinyalign-0.2.1/setup.py000066400000000000000000000053251414742053400152330ustar00rootroot00000000000000import sys import os.path from setuptools import setup, Extension, find_packages from distutils.command.sdist import sdist as _sdist from distutils.command.build_ext import build_ext as _build_ext def no_cythonize(extensions, **_ignore): """Change .pyx to .c or .cpp (copied from Cython documentation)""" for extension in extensions: sources = [] for sfile in extension.sources: path, ext = os.path.splitext(sfile) if ext in (".pyx", ".py"): if extension.language == "c++": ext = ".cpp" else: ext = ".c" sfile = path + ext sources.append(sfile) extension.sources[:] = sources extensions = [ Extension("tinyalign._core", sources=["src/tinyalign/_core.pyx"]), ] class BuildExt(_build_ext): def run(self): # If we encounter a PKG-INFO file, then this is likely a .tar.gz/.zip # file retrieved from PyPI that already includes the pre-cythonized # extension modules, and then we do not need to run cythonize(). if os.path.exists("PKG-INFO"): no_cythonize(extensions) else: # Otherwise, this is a 'developer copy' of the code, and then the # only sensible thing is to require Cython to be installed. from Cython.Build import cythonize self.extensions = cythonize(self.extensions) super().run() class SDist(_sdist): def run(self): # Make sure the compiled Cython files in the distribution are up-to-date from Cython.Build import cythonize cythonize(extensions) super().run() with open("README.md", encoding="utf-8") as f: long_description = f.read() setup( name="tinyalign", setup_requires=["setuptools_scm"], # Support pip versions that don't know about pyproject.toml use_scm_version={"write_to": "src/tinyalign/_version.py"}, author="Marcel Martin", author_email="marcel.martin@scilifelab.se", url="https://github.com/marcelm/tinyalign/", description="Fast banded edit distance", long_description=long_description, long_description_content_type="text/markdown", license="MIT", packages=find_packages("src"), package_dir={"": "src"}, ext_modules=extensions, cmdclass={"build_ext": BuildExt, "sdist": SDist}, python_requires='>=3.6', classifiers=[ "Development Status :: 3 - Alpha", "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", "Natural Language :: English", "Programming Language :: Cython", "Programming Language :: Python :: 3", "Topic :: Scientific/Engineering :: Bio-Informatics", ], ) tinyalign-0.2.1/src/000077500000000000000000000000001414742053400143035ustar00rootroot00000000000000tinyalign-0.2.1/src/tinyalign/000077500000000000000000000000001414742053400163015ustar00rootroot00000000000000tinyalign-0.2.1/src/tinyalign/__init__.py000066400000000000000000000001401414742053400204050ustar00rootroot00000000000000from ._core import edit_distance, hamming_distance from ._version import version as __version__ tinyalign-0.2.1/src/tinyalign/_core.pyx000066400000000000000000000075241414742053400201420ustar00rootroot00000000000000# cython: language_level=3 from cpython.mem cimport PyMem_Malloc, PyMem_Free import cython @cython.wraparound(False) @cython.boundscheck(False) def edit_distance(s, t, int maxdiff=-1): """ Return the edit distance between the strings s and t. The edit distance is the sum of the numbers of insertions, deletions, and mismatches that is minimally necessary to transform one string into the other. If maxdiff is not -1, then a banded alignment is performed. In that case, the true edit distance is returned if and only if it is maxdiff or less. Otherwise, a value is returned that is guaranteed to be greater than maxdiff, but which is not necessarily the true edit distance. """ cdef: unsigned int m = len(s) # index: i unsigned int n = len(t) # index: j int e = maxdiff unsigned int i, j, start, stop, c, smallest unsigned int prev bint match bytes s_bytes, t_bytes char* sv char* tv # Return early if string lengths are too different cdef unsigned int absdiff = m - n if m > n else n - m if e != -1 and absdiff > e: return absdiff s_bytes = s.encode() if isinstance(s, unicode) else s t_bytes = t.encode() if isinstance(t, unicode) else t sv = s_bytes tv = t_bytes # Skip identical prefixes while m > 0 and n > 0 and sv[0] == tv[0]: sv += 1 tv += 1 m -= 1 n -= 1 # Skip identical suffixes while m > 0 and n > 0 and sv[m-1] == tv[n-1]: m -= 1 n -= 1 cdef unsigned int result cdef unsigned int* costs = PyMem_Malloc((m + 1) * sizeof(unsigned int)) if not costs: raise MemoryError() with nogil: for i in range(m + 1): costs[i] = i if e == -1: # Regular (unbanded) global alignment prev = 0 for j in range(1, n + 1): prev = costs[0] costs[0] += 1 for i in range(1, m+1): match = sv[i-1] == tv[j-1] c = 1 + min( prev - match, costs[i], costs[i-1], ) prev = costs[i] costs[i] = c result = costs[m] else: # Banded alignment smallest = 0 for j in range(1, n + 1): stop = min(j + e + 1, m + 1) if j <= e: prev = costs[0] costs[0] += 1 smallest = costs[0] start = 1 else: start = j - e prev = costs[start - 1] smallest = maxdiff + 1 for i in range(start, stop): match = sv[i-1] == tv[j-1] c = 1 + min( prev - match, costs[i], costs[i-1], ) prev = costs[i] costs[i] = c smallest = min(smallest, c) if smallest > maxdiff: break if smallest > maxdiff: result = smallest else: result = costs[m] PyMem_Free(costs) return result def hamming_distance(unicode s, unicode t): """ Compute hamming distance between two strings. If they do not have the same length, an IndexError is raised. Return the number of differences between the strings. """ cdef Py_ssize_t m = len(s) cdef Py_ssize_t n = len(t) if m != n: raise IndexError("sequences must have the same length") cdef Py_ssize_t e = 0 cdef Py_ssize_t i for i in range(m): if s[i] != t[i]: e += 1 return e tinyalign-0.2.1/tests/000077500000000000000000000000001414742053400146565ustar00rootroot00000000000000tinyalign-0.2.1/tests/test_core.py000066400000000000000000000054321414742053400172230ustar00rootroot00000000000000from tinyalign import edit_distance, hamming_distance import random import pytest STRING_PAIRS = [ ('', ''), ('', 'A'), ('A', 'A'), ('AB', ''), ('AB', 'ABC'), ('TGAATCCC', 'CCTGAATC'), ('ANANAS', 'BANANA'), ('SISSI', 'MISSISSIPPI'), ('GGAATCCC', 'TGAGGGATAAATATTTAGAATTTAGTAGTAGTGTT'), ('TCTGTTCCCTCCCTGTCTCA', 'TTTTAGGAAATACGCC'), ('TGAGACACGCAACATGGGAAAGGCAAGGCACACAGGGGATAGG', 'AATTTATTTTATTGTGATTTTTTGGAGGTTTGGAAGCCACTAAGCTATACTGAGACACGCAACAGGGGAAAGGCAAGGCACA'), ('TCCATCTCATCCCTGCGTGTCCCATCTGTTCCCTCCCTGTCTCA', 'TTTTAGGAAATACGCCTGGTGGGGTTTGGAGTATAGTGAAAGATAGGTGAGTTGGTCGGGTG'), ('A', 'TCTGCTCCTGGCCCATGATCGTATAACTTTCAAATTT'), ('GCGCGGACT', 'TAAATCCTGG'), ] def py_edit_distance(s, t): """ Pure-Python edit distance """ m = len(s) n = len(t) costs = list(range(m + 1)) for j in range(1, n + 1): prev = costs[0] costs[0] += 1 for i in range(1, m + 1): c = min( prev + int(s[i-1] != t[j-1]), costs[i] + 1, costs[i-1] + 1, ) prev = costs[i] costs[i] = c return costs[-1] def random_string(): return ''.join(random.choice('AC') for _ in range(random.randint(0, 20))) RANDOM_STRING_PAIRS = [(random_string(), random_string()) for _ in range(10000)] def test_edit_distance(): assert edit_distance('', '') == 0 assert edit_distance('', 'A') == 1 assert edit_distance('A', 'B') == 1 assert edit_distance('A', 'A') == 0 assert edit_distance('A', 'AB') == 1 assert edit_distance('BA', 'AB') == 2 for s, t in STRING_PAIRS + RANDOM_STRING_PAIRS: assert edit_distance(s, '') == len(s) assert edit_distance('', s) == len(s) assert edit_distance(s, t) == edit_distance(t, s) assert edit_distance(s, t) == py_edit_distance(s, t) def assert_banded(s, t, maxdiff): banded_dist = edit_distance(s, t, maxdiff=maxdiff) true_dist = edit_distance(s, t) if true_dist > maxdiff: assert banded_dist > maxdiff else: assert banded_dist == true_dist def test_edit_distance_banded(): for maxdiff in range(5): assert_banded('ABC', '', maxdiff) for s, t in STRING_PAIRS: assert_banded(s, '', maxdiff) assert_banded('', s, maxdiff) assert_banded(s, t, maxdiff) assert_banded(t, s, maxdiff) def test_hamming_distance(): assert hamming_distance('', '') == 0 assert hamming_distance('A', 'A') == 0 assert hamming_distance('HELLO', 'HELLO') == 0 assert hamming_distance('ABC', 'DEF') == 3 assert hamming_distance('ABCXDEF', 'ABCYDEF') == 1 def test_hamming_distance_incorrect_length(): with pytest.raises(IndexError): hamming_distance('A', 'BC')