pax_global_header00006660000000000000000000000064144743171170014523gustar00rootroot0000000000000052 comment=4114f544d3022e7d865e56fe03b06a532345eb67 tinyalign-0.2.2/000077500000000000000000000000001447431711700135225ustar00rootroot00000000000000tinyalign-0.2.2/.github/000077500000000000000000000000001447431711700150625ustar00rootroot00000000000000tinyalign-0.2.2/.github/workflows/000077500000000000000000000000001447431711700171175ustar00rootroot00000000000000tinyalign-0.2.2/.github/workflows/ci.yml000066400000000000000000000031571447431711700202430ustar00rootroot00000000000000name: CI on: [push] jobs: test: timeout-minutes: 5 runs-on: ubuntu-latest strategy: matrix: python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: python -m pip install pytest Cython - name: Install run: python -m pip install -e . - name: Test run: python -m pytest deploy: timeout-minutes: 10 runs-on: ubuntu-latest needs: [test] if: startsWith(github.ref, 'refs/tags') steps: - uses: actions/checkout@v3 with: fetch-depth: 0 # required for setuptools_scm - name: Set up Python uses: actions/setup-python@v4 with: python-version: "3.11" - name: Build wheels uses: pypa/cibuildwheel@v2.13 with: output-dir: dist/ env: CIBW_BUILD: "cp3*-manylinux_x86_64" - name: Make sdist run: | python -m pip install build python -m build --sdist ls -l dist/ - name: Publish dev release to test PyPI if: contains(github.ref, '.dev') uses: pypa/gh-action-pypi-publish@v1.8.7 with: user: __token__ password: ${{ secrets.test_pypi_password }} repository_url: https://test.pypi.org/legacy/ - name: Publish to PyPI if: "!contains(github.ref, '.dev')" uses: pypa/gh-action-pypi-publish@v1.8.7 with: user: __token__ password: ${{ secrets.pypi_password }} tinyalign-0.2.2/.gitignore000066400000000000000000000002201447431711700155040ustar00rootroot00000000000000__pycache__ /.cache/ /venv/ /build/ /.pytest_cache/ /MANIFEST /dist/ /src/*/_*.c /src/*/*.so /src/*.egg-info/ /src/tinyalign/_version.py /.tox/ tinyalign-0.2.2/LICENSE000066400000000000000000000020771447431711700145350ustar00rootroot00000000000000Copyright (c) 2019 Marcel Martin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. tinyalign-0.2.2/README.md000066400000000000000000000024761447431711700150120ustar00rootroot00000000000000[![CI](https://github.com/marcelm/tinyalign/actions/workflows/ci.yml/badge.svg)](https://github.com/marcelm/tinyalign/actions/workflows/ci.yml) [![PyPI](https://img.shields.io/pypi/v/tinyalign.svg?branch=main)](https://pypi.python.org/pypi/tinyalign) # tinyalign A small Python module providing edit distance (aka Levenshtein distance, that is, counting insertions, deletions and substitutions) and Hamming distance computation. Its main purpose is to speed up computation of edit distance by allowing to specify a maximum number of differences `maxdiff` (banding). If that parameter is provided, the returned edit distance is anly accurate up to `maxdiff`. That is, if the actual edit distance is higher than `maxdiff`, a value larger than `maxdiff` is returned, but not necessarily the actual edit distance. For computing regular edit distances or if your *maxdiff* is less than 4, you should prefer [https://github.com/fujimotos/polyleven](polyleven), as that is faster in that case. When `maxdiff` is 4 or more, but not too close to the length of the shortest string, this module is faster. ``` >>> from tinyalign import edit_distance, hamming_distance >>> edit_distance("banana", "ananas") 2 >>> hamming_distance("hello", "yello") 1 >>> edit_distance("hello", "world", maxdiff=2) 3 ``` ## Changes ### v0.2.2 * Added type hints tinyalign-0.2.2/pyproject.toml000066400000000000000000000017661447431711700164500ustar00rootroot00000000000000[build-system] requires = ["setuptools >= 63", "setuptools_scm[toml] >= 6.2", "Cython >= 0.29.20"] build-backend = "setuptools.build_meta" [project] name = "tinyalign" authors = [ {name = "Marcel Martin", email = "marcel.martin@scilifelab.se"} ] description = "Fast banded edit distance" readme = "README.md" license = {text = "MIT"} classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", "Natural Language :: English", "Programming Language :: Cython", "Programming Language :: Python :: 3", "Topic :: Scientific/Engineering :: Bio-Informatics", ] requires-python = ">=3.7" dynamic = ["version"] [project.urls] "Homepage" = "https://github.com/marcelm/tinyalign/" [tool.setuptools.exclude-package-data] tinyalign = ["*.pyx"] [tool.setuptools_scm] write_to = "src/tinyalign/_version.py" [tool.cibuildwheel] environment = "CFLAGS=-g0" test-command = ["cd {project}; pytest"] test-requires = "pytest" tinyalign-0.2.2/setup.py000066400000000000000000000002671447431711700152410ustar00rootroot00000000000000from setuptools import setup, Extension import setuptools_scm # noqa Ensure it’s installed setup(ext_modules=[Extension("tinyalign._core", sources=["src/tinyalign/_core.pyx"])]) tinyalign-0.2.2/src/000077500000000000000000000000001447431711700143115ustar00rootroot00000000000000tinyalign-0.2.2/src/tinyalign/000077500000000000000000000000001447431711700163075ustar00rootroot00000000000000tinyalign-0.2.2/src/tinyalign/__init__.py000066400000000000000000000001401447431711700204130ustar00rootroot00000000000000from ._core import edit_distance, hamming_distance from ._version import version as __version__ tinyalign-0.2.2/src/tinyalign/_core.pyi000066400000000000000000000001631447431711700201210ustar00rootroot00000000000000def edit_distance(s: str, t: str, maxdiff: int = ...) -> int: ... def hamming_distance(s: str, t: str) -> int: ... tinyalign-0.2.2/src/tinyalign/_core.pyx000066400000000000000000000075241447431711700201500ustar00rootroot00000000000000# cython: language_level=3 from cpython.mem cimport PyMem_Malloc, PyMem_Free import cython @cython.wraparound(False) @cython.boundscheck(False) def edit_distance(s, t, int maxdiff=-1): """ Return the edit distance between the strings s and t. The edit distance is the sum of the numbers of insertions, deletions, and mismatches that is minimally necessary to transform one string into the other. If maxdiff is not -1, then a banded alignment is performed. In that case, the true edit distance is returned if and only if it is maxdiff or less. Otherwise, a value is returned that is guaranteed to be greater than maxdiff, but which is not necessarily the true edit distance. """ cdef: unsigned int m = len(s) # index: i unsigned int n = len(t) # index: j int e = maxdiff unsigned int i, j, start, stop, c, smallest unsigned int prev bint match bytes s_bytes, t_bytes char* sv char* tv # Return early if string lengths are too different cdef unsigned int absdiff = m - n if m > n else n - m if e != -1 and absdiff > e: return absdiff s_bytes = s.encode() if isinstance(s, unicode) else s t_bytes = t.encode() if isinstance(t, unicode) else t sv = s_bytes tv = t_bytes # Skip identical prefixes while m > 0 and n > 0 and sv[0] == tv[0]: sv += 1 tv += 1 m -= 1 n -= 1 # Skip identical suffixes while m > 0 and n > 0 and sv[m-1] == tv[n-1]: m -= 1 n -= 1 cdef unsigned int result cdef unsigned int* costs = PyMem_Malloc((m + 1) * sizeof(unsigned int)) if not costs: raise MemoryError() with nogil: for i in range(m + 1): costs[i] = i if e == -1: # Regular (unbanded) global alignment prev = 0 for j in range(1, n + 1): prev = costs[0] costs[0] += 1 for i in range(1, m+1): match = sv[i-1] == tv[j-1] c = 1 + min( prev - match, costs[i], costs[i-1], ) prev = costs[i] costs[i] = c result = costs[m] else: # Banded alignment smallest = 0 for j in range(1, n + 1): stop = min(j + e + 1, m + 1) if j <= e: prev = costs[0] costs[0] += 1 smallest = costs[0] start = 1 else: start = j - e prev = costs[start - 1] smallest = maxdiff + 1 for i in range(start, stop): match = sv[i-1] == tv[j-1] c = 1 + min( prev - match, costs[i], costs[i-1], ) prev = costs[i] costs[i] = c smallest = min(smallest, c) if smallest > maxdiff: break if smallest > maxdiff: result = smallest else: result = costs[m] PyMem_Free(costs) return result def hamming_distance(unicode s, unicode t): """ Compute hamming distance between two strings. If they do not have the same length, an IndexError is raised. Return the number of differences between the strings. """ cdef Py_ssize_t m = len(s) cdef Py_ssize_t n = len(t) if m != n: raise IndexError("sequences must have the same length") cdef Py_ssize_t e = 0 cdef Py_ssize_t i for i in range(m): if s[i] != t[i]: e += 1 return e tinyalign-0.2.2/src/tinyalign/_version.pyi000066400000000000000000000002241447431711700206540ustar00rootroot00000000000000# The _version.py file is generated on installation. By including this stub, # we can run mypy without having to install the package. version: str tinyalign-0.2.2/src/tinyalign/py.typed000066400000000000000000000000001447431711700177740ustar00rootroot00000000000000tinyalign-0.2.2/tests/000077500000000000000000000000001447431711700146645ustar00rootroot00000000000000tinyalign-0.2.2/tests/test_core.py000066400000000000000000000054321447431711700172310ustar00rootroot00000000000000from tinyalign import edit_distance, hamming_distance import random import pytest STRING_PAIRS = [ ('', ''), ('', 'A'), ('A', 'A'), ('AB', ''), ('AB', 'ABC'), ('TGAATCCC', 'CCTGAATC'), ('ANANAS', 'BANANA'), ('SISSI', 'MISSISSIPPI'), ('GGAATCCC', 'TGAGGGATAAATATTTAGAATTTAGTAGTAGTGTT'), ('TCTGTTCCCTCCCTGTCTCA', 'TTTTAGGAAATACGCC'), ('TGAGACACGCAACATGGGAAAGGCAAGGCACACAGGGGATAGG', 'AATTTATTTTATTGTGATTTTTTGGAGGTTTGGAAGCCACTAAGCTATACTGAGACACGCAACAGGGGAAAGGCAAGGCACA'), ('TCCATCTCATCCCTGCGTGTCCCATCTGTTCCCTCCCTGTCTCA', 'TTTTAGGAAATACGCCTGGTGGGGTTTGGAGTATAGTGAAAGATAGGTGAGTTGGTCGGGTG'), ('A', 'TCTGCTCCTGGCCCATGATCGTATAACTTTCAAATTT'), ('GCGCGGACT', 'TAAATCCTGG'), ] def py_edit_distance(s, t): """ Pure-Python edit distance """ m = len(s) n = len(t) costs = list(range(m + 1)) for j in range(1, n + 1): prev = costs[0] costs[0] += 1 for i in range(1, m + 1): c = min( prev + int(s[i-1] != t[j-1]), costs[i] + 1, costs[i-1] + 1, ) prev = costs[i] costs[i] = c return costs[-1] def random_string(): return ''.join(random.choice('AC') for _ in range(random.randint(0, 20))) RANDOM_STRING_PAIRS = [(random_string(), random_string()) for _ in range(10000)] def test_edit_distance(): assert edit_distance('', '') == 0 assert edit_distance('', 'A') == 1 assert edit_distance('A', 'B') == 1 assert edit_distance('A', 'A') == 0 assert edit_distance('A', 'AB') == 1 assert edit_distance('BA', 'AB') == 2 for s, t in STRING_PAIRS + RANDOM_STRING_PAIRS: assert edit_distance(s, '') == len(s) assert edit_distance('', s) == len(s) assert edit_distance(s, t) == edit_distance(t, s) assert edit_distance(s, t) == py_edit_distance(s, t) def assert_banded(s, t, maxdiff): banded_dist = edit_distance(s, t, maxdiff=maxdiff) true_dist = edit_distance(s, t) if true_dist > maxdiff: assert banded_dist > maxdiff else: assert banded_dist == true_dist def test_edit_distance_banded(): for maxdiff in range(5): assert_banded('ABC', '', maxdiff) for s, t in STRING_PAIRS: assert_banded(s, '', maxdiff) assert_banded('', s, maxdiff) assert_banded(s, t, maxdiff) assert_banded(t, s, maxdiff) def test_hamming_distance(): assert hamming_distance('', '') == 0 assert hamming_distance('A', 'A') == 0 assert hamming_distance('HELLO', 'HELLO') == 0 assert hamming_distance('ABC', 'DEF') == 3 assert hamming_distance('ABCXDEF', 'ABCYDEF') == 1 def test_hamming_distance_incorrect_length(): with pytest.raises(IndexError): hamming_distance('A', 'BC')