alignlib-0.1.1/0000755000175000017500000000000013504513702013152 5ustar moellermoelleralignlib-0.1.1/README.md0000644000175000017500000000013413504513665014437 0ustar moellermoeller# alignlib A small Python module providing edit distance and Hamming distance computation. alignlib-0.1.1/src/0000755000175000017500000000000013504513702013741 5ustar moellermoelleralignlib-0.1.1/src/alignlib/0000755000175000017500000000000013526300232015516 5ustar moellermoelleralignlib-0.1.1/src/alignlib/__init__.py0000644000175000017500000000014013504513665017636 0ustar moellermoellerfrom ._core import edit_distance, hamming_distance from ._version import version as __version__ alignlib-0.1.1/src/alignlib/_version.py0000644000175000017500000000016413504513700017717 0ustar moellermoeller# coding: utf-8 # file generated by setuptools_scm # don't change, don't track in version control version = '0.1.1' alignlib-0.1.1/src/alignlib/_core.pyx0000644000175000017500000000714613504513665017373 0ustar moellermoellerfrom cython.view cimport array as cvarray import cython @cython.boundscheck(False) def edit_distance(s, t, int maxdiff=-1): """ Return the edit distance between the strings s and t. The edit distance is the sum of the numbers of insertions, deletions, and mismatches that is minimally necessary to transform one string into the other. If maxdiff is not -1, then a banded alignment is performed. In that case, the true edit distance is returned if and only if it is maxdiff or less. Otherwise, a value is returned that is guaranteed to be greater than maxdiff, but which is not necessarily the true edit distance. """ cdef int m = len(s) # index: i cdef int n = len(t) # index: j cdef int e = maxdiff cdef int i, j, start, stop, c, prev, smallest cdef bint match cdef bytes s_bytes, t_bytes cdef char* sv cdef char* tv # Return early if string lengths are too different if e != -1 and abs(m - n) > e: return abs(m - n) s_bytes = s.encode() if isinstance(s, unicode) else s t_bytes = t.encode() if isinstance(t, unicode) else t sv = s_bytes tv = t_bytes # Skip identical prefixes while m > 0 and n > 0 and sv[0] == tv[0]: sv += 1 tv += 1 m -= 1 n -= 1 # Skip identical suffixes while m > 0 and n > 0 and sv[m-1] == tv[n-1]: m -= 1 n -= 1 cdef int[:] costs = cvarray(shape=(m+1,), itemsize=sizeof(int), format="i") if e == -1: # Regular (unbanded) global alignment with nogil: for i in range(m + 1): costs[i] = i # compute columns of the alignment matrix (using unit costs) prev = 0 for j in range(1, n+1): prev = costs[0] costs[0] += 1 for i in range(1, m+1): match = sv[i-1] == tv[j-1] c = min( prev + 1 - match, costs[i] + 1, costs[i-1] + 1) prev = costs[i] costs[i] = c else: # Banded alignment with nogil: for i in range(m + 1): costs[i] = i smallest = 0 for j in range(1, n + 1): stop = min(j + e + 1, m + 1) if j <= e: prev = costs[0] costs[0] += 1 smallest = costs[0] start = 1 else: start = j - e prev = costs[start - 1] smallest = maxdiff + 1 for i in range(start, stop): match = sv[i-1] == tv[j-1] c = min( prev + 1 - match, costs[i] + 1, costs[i-1] + 1) prev = costs[i] costs[i] = c smallest = min(smallest, c) if smallest > maxdiff: break if smallest > maxdiff: return smallest return costs[m] def hamming_distance(unicode s, unicode t): """ Compute hamming distance between two strings. If they do not have the same length, an IndexError is raised. Return the number of differences between the strings. """ cdef Py_ssize_t m = len(s) cdef Py_ssize_t n = len(t) if m != n: raise IndexError("sequences must have the same length") cdef Py_ssize_t e = 0 cdef Py_ssize_t i for i in range(m): if s[i] != t[i]: e += 1 return e alignlib-0.1.1/PKG-INFO0000644000175000017500000000135213504513702014250 0ustar moellermoellerMetadata-Version: 2.1 Name: alignlib Version: 0.1.1 Summary: Some sequence alignment routines Home-page: https://github.com/marcelm/alignlib/ Author: Marcel Martin Author-email: marcel.martin@scilifelab.se License: MIT Description: # alignlib A small Python module providing edit distance and Hamming distance computation. Platform: UNKNOWN Classifier: Development Status :: 3 - Alpha Classifier: Intended Audience :: Science/Research Classifier: License :: OSI Approved :: MIT License Classifier: Natural Language :: English Classifier: Programming Language :: Cython Classifier: Programming Language :: Python :: 3 Classifier: Topic :: Scientific/Engineering :: Bio-Informatics Description-Content-Type: text/markdown alignlib-0.1.1/setup.py0000644000175000017500000000547713504513665014711 0ustar moellermoellerimport sys import os.path from setuptools import setup, Extension, find_packages from distutils.command.sdist import sdist as _sdist from distutils.command.build_ext import build_ext as _build_ext if sys.version_info[:2] < (3, 4): sys.stdout.write('Python 3.4 or later is required\n') sys.exit(1) def no_cythonize(extensions, **_ignore): """Change .pyx to .c or .cpp (copied from Cython documentation)""" for extension in extensions: sources = [] for sfile in extension.sources: path, ext = os.path.splitext(sfile) if ext in ('.pyx', '.py'): if extension.language == 'c++': ext = '.cpp' else: ext = '.c' sfile = path + ext sources.append(sfile) extension.sources[:] = sources extensions = [ Extension('alignlib._core', sources=['src/alignlib/_core.pyx']), ] class BuildExt(_build_ext): def run(self): # If we encounter a PKG-INFO file, then this is likely a .tar.gz/.zip # file retrieved from PyPI that already includes the pre-cythonized # extension modules, and then we do not need to run cythonize(). if os.path.exists('PKG-INFO'): no_cythonize(extensions) else: # Otherwise, this is a 'developer copy' of the code, and then the # only sensible thing is to require Cython to be installed. from Cython.Build import cythonize self.extensions = cythonize(self.extensions) super().run() class SDist(_sdist): def run(self): # Make sure the compiled Cython files in the distribution are up-to-date from Cython.Build import cythonize cythonize(extensions) super().run() with open('README.md', encoding='utf-8') as f: long_description = f.read() setup( name='alignlib', setup_requires=['setuptools_scm'], # Support pip versions that don't know about pyproject.toml use_scm_version={'write_to': 'src/alignlib/_version.py'}, author='Marcel Martin', author_email='marcel.martin@scilifelab.se', url='https://github.com/marcelm/alignlib/', description='Some sequence alignment routines', long_description=long_description, long_description_content_type='text/markdown', license='MIT', packages=find_packages('src'), package_dir={'': 'src'}, ext_modules=extensions, cmdclass={'build_ext': BuildExt, 'sdist': SDist}, classifiers=[ "Development Status :: 3 - Alpha", "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", "Natural Language :: English", "Programming Language :: Cython", "Programming Language :: Python :: 3", "Topic :: Scientific/Engineering :: Bio-Informatics" ] ) alignlib-0.1.1/LICENSE0000644000175000017500000000207713504513665014175 0ustar moellermoellerCopyright (c) 2019 Marcel Martin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.