distance-master/0000755000175000017500000000000012243645633012243 5ustar jdgjdg
distance-master/MANIFEST0000644000175000017500000000013612243645633013374 0ustar jdgjdg
# file GENERATED by distutils, do NOT edit
setup.py
distance/__init__.py
distance/distance.py
distance-master/README.md0000644000175000017500000001031712243645633013524 0ustar jdgjdg
distance - Utilities for comparing sequences
============================================

This package provides helpers for computing similarities between arbitrary sequences. Included metrics are Levenshtein, Hamming, Jaccard, and Sorensen distance, plus some bonuses. All distance computations are implemented in pure Python, and most of them are also implemented in C.


Installation
------------

If you don't want or need to use the C extension, just unpack the archive and run, as root:

    # python setup.py install

For the C extension to work, you need the Python development headers and a C compiler (typically Microsoft Visual C++ 2010 on Windows, and GCC on Mac and Linux). On a Debian-like system, you can get all of these with:

    # apt-get install gcc pythonX.X-dev

where X.X is the number of your Python version. Then you should type:

    # python setup.py install --with-c

Note the use of the `--with-c` switch.


Usage
-----

A common use case for this module is to compare single words for similarity:

    >>> distance.levenshtein("lenvestein", "levenshtein")
    3
    >>> distance.hamming("hamming", "hamning")
    1

If there is not a one-to-one mapping between sounds and glyphs in your language, or if you want to compare not glyphs but syllables or phonemes, you can pass in tuples of characters:

    >>> t1 = ("de", "ci", "si", "ve")
    >>> t2 = ("de", "ri", "si", "ve")
    >>> distance.levenshtein(t1, t2)
    1

Comparing lists of strings can also be useful for computing similarities between sentences, paragraphs, etc.:

    >>> sent1 = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
    >>> sent2 = ['the', 'lazy', 'fox', 'jumps', 'over', 'the', 'crazy', 'dog']
    >>> distance.levenshtein(sent1, sent2)
    3

Hamming and Levenshtein distance can be normalized, so that the results of several distance measures can be meaningfully compared. Two strategies are available for Levenshtein: either the length of the shortest alignment between the sequences is taken as factor, or the length of the longest one. Example uses:

    >>> distance.hamming("fat", "cat", normalized=True)
    0.3333333333333333
    >>> distance.nlevenshtein("abc", "acd", method=1)  # shortest alignment
    0.6666666666666666
    >>> distance.nlevenshtein("abc", "acd", method=2)  # longest alignment
    0.5

`jaccard` and `sorensen` return a normalized value by default:

    >>> distance.sorensen("decide", "resize")
    0.5555555555555556
    >>> distance.jaccard("decide", "resize")
    0.7142857142857143

As for the bonuses, there is a `fast_comp` function, which computes the distance between two strings up to a maximum of 2 inclusive. If the distance between the strings is higher than that, -1 is returned. This function is of limited use, but it is faster than `levenshtein` by an order of magnitude.

There is also a `lcsubstrings` function which can be used to find the longest common substrings in two sequences.

Finally, two convenience iterators `ilevenshtein` and `ifast_comp` are provided, which are intended to be used for filtering, from a long list of sequences, the ones that are close to a reference one. They both return a series of tuples (distance, sequence).
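For instance, you could build a small spelling-suggestion helper on top of `ifast_comp`. A minimal sketch (the `suggest` helper and the word list are illustrative, not part of the package):

    import heapq
    import distance

    def suggest(word, vocabulary, n=3):
        # ifast_comp already drops anything with a distance above 2;
        # keep the n candidates closest to `word`
        return heapq.nsmallest(n, distance.ifast_comp(word, vocabulary))

    >>> suggest("fo", ["foo", "bar", "of", "fob", "f"])
    [(1, 'f'), (1, 'fob'), (1, 'foo')]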
Basic usage of the iterators:

    >>> tokens = ["fo", "bar", "foob", "foo", "fooba", "foobar"]
    >>> sorted(distance.ifast_comp("foo", tokens))
    [(0, 'foo'), (1, 'fo'), (1, 'foob'), (2, 'fooba')]
    >>> sorted(distance.ilevenshtein("foo", tokens, max_dist=1))
    [(0, 'foo'), (1, 'fo'), (1, 'foob')]

`ifast_comp` is particularly efficient, and can handle 1 million tokens without a problem.

For more information, see the functions' documentation (`help(funcname)`).

Have fun!


Changelog
---------

20/11/13:
* Switched back to using the to-be-deprecated Python unicode API. The good news is that this makes the C extension compatible with Python 2.7+, and that distance computations on unicode strings are now much faster.
* Added a C version of `lcsubstrings`.
* Added a new method for computing normalized Levenshtein distance.
* Added some tests.

12/11/13: Expanded `fast_comp` (formerly `quick_levenshtein`) so that it can handle transpositions. Fixed a pair of swapped variables in (C) `levenshtein` which sometimes produced strange results.

10/11/13: Added `quick_levenshtein` and `iquick_levenshtein`.

05/11/13: Added Sorensen and Jaccard metrics, fixed memory issue in Levenshtein.
distance-master/setup.py0000644000175000017500000001016112243645633013754 0ustar jdgjdg
# -*- coding: utf-8 -*-

# Distance - Utilities for comparing sequences
# Copyright (C) 2013 Michaël Meyer

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
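# Typical invocations (documented in the README above; listed here for
# convenience):
#
#   python setup.py install            # pure-Python install
#   python setup.py install --with-c   # additionally build the C extension
#   python setup.py prepare            # regenerate cdistance/includes.h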
import os, sys, ast, _ast, re from distutils.core import setup, Extension this_dir = os.path.dirname(os.path.abspath(__file__)) pkg_dir = os.path.join(this_dir, "distance") cpkg_dir = os.path.join(this_dir, "cdistance") ctypes = ["unicode", "byte", "array"] cfunctions = { "levenshtein": ["levenshtein", "nlevenshtein"], "hamming": ["hamming"], "lcsubstrings": ["lcsubstrings"], "fastcomp": ["fastcomp"], } sequence_compare = """\ #define SEQUENCE_COMPARE(s1, i1, s2, i2) \\ (PyObject_RichCompareBool( \\ PySequence_Fast_GET_ITEM((s1), (i1)), \\ PySequence_Fast_GET_ITEM((s2), (i2)), \\ Py_EQ) \\ ) """ def make_c_doc(): buff = [] py_sources = [f for f in os.listdir(pkg_dir) if f.endswith('.py')] for file in py_sources: with open(os.path.join(pkg_dir, file)) as f: content = f.read() tree = ast.parse(content) for doc_string in parse_tree(tree, content): buff.append(doc_string) join_str = 2 * '\n' return join_str.join(buff) + '\n' def parse_tree(tree, content): for node in ast.iter_child_nodes(tree): if not isinstance(node, _ast.FunctionDef): continue doc_string = ast.get_docstring(node) if not doc_string: continue func_def = re.findall("def\s%s\s*(.+?)\s*:" % node.name, content) assert func_def and len(func_def) == 1 func_def = node.name + func_def[0] + 2 * '\\n\\\n' doc_string = doc_string.replace('\n', '\\n\\\n').replace('"', '\\"') doc_string = doc_string.replace('\n' + 8 * ' ', '\n' + 4 * ' ') doc_string = '#define %s_doc \\\n"%s%s"\n' % (node.name, func_def, doc_string) yield doc_string def format_header(): yield sequence_compare for cfile, cfuncs in cfunctions.items(): for ctype in ctypes: if ctype == "array": yield("#define SEQUENCE_COMP SEQUENCE_COMPARE") yield('#define unicode %(type)s' % dict(type=ctype)) for cfunc in cfuncs: yield("#define %(function)s %(tcode)s%(function)s" % dict(function=cfunc, tcode=ctype[0])) yield('#include "%(file)s.c"' % dict(file=cfile)) yield("#undef unicode") for cfunc in cfuncs: yield("#undef %(function)s" % dict(function=cfunc)) if ctype == "array": yield("#undef SEQUENCE_COMP") yield("") def prepare(): with open(os.path.join(cpkg_dir, "includes.h"), "w") as f: f.write(make_c_doc()) f.write(4 * '\n') f.write('\n'.join(format_header())) args = sys.argv[1:] if "prepare" in args: prepare() sys.exit() if "--with-c" in args: args.remove("--with-c") ext_modules = [Extension('distance.cdistance', sources=["cdistance/distance.c"])] else: sys.stderr.write("notice: no C support available\n") ext_modules = [] with open(os.path.join(this_dir, "README.md")) as f: long_description = f.read() setup ( name = 'Distance', version = '0.1.3', description = 'Utilities for comparing sequences', long_description = long_description, author='Michaël Meyer', author_email='michaelnm.meyer@gmail.com', url='https://github.com/doukremt/distance', ext_modules = ext_modules, script_args = args, packages = ['distance'], classifiers=( 'Intended Audience :: Developers', 'Natural Language :: English', 'License :: OSI Approved :: GNU General Public License (GPL)', 'Operating System :: OS Independent', 'Topic :: Software Development :: Libraries :: Python Modules', 'Programming Language :: C', 'Programming Language :: Python', 'Programming Language :: Python :: 3.3', ) ) distance-master/distance/0000755000175000017500000000000012243645633014035 5ustar jdgjdgdistance-master/distance/_fastcomp.py0000644000175000017500000000360712243645633016370 0ustar jdgjdg# -*- coding: utf-8 -*- def fast_comp(seq1, seq2, transpositions=False): """Compute the distance between the two sequences `seq1` and `seq2` 
up to a maximum of 2 inclusive, and return it. If the edit distance between
    the two sequences is higher than that, -1 is returned.

    If `transpositions` is `True`, transpositions will be taken into account
    for the computation of the distance. This can make a difference, e.g.:

        >>> fast_comp("abc", "bac", transpositions=False)
        2
        >>> fast_comp("abc", "bac", transpositions=True)
        1

    This is faster than `levenshtein` by an order of magnitude, but, on the
    other hand, is of limited use.

    The algorithm comes from `http://writingarchives.sakura.ne.jp/fastcomp`.
    I've added transpositions support to the original code.
    """
    replace, insert, delete = "r", "i", "d"

    L1, L2 = len(seq1), len(seq2)
    if L1 < L2:
        L1, L2 = L2, L1
        seq1, seq2 = seq2, seq1

    ldiff = L1 - L2
    if ldiff == 0:
        models = (insert + delete, delete + insert, replace + replace)
    elif ldiff == 1:
        models = (delete + replace, replace + delete)
    elif ldiff == 2:
        models = (delete + delete,)
    else:
        return -1

    res = 3
    for model in models:
        i = j = c = 0
        while (i < L1) and (j < L2):
            if seq1[i] != seq2[j]:
                c = c + 1
                if 2 < c:
                    break
                if transpositions and ldiff != 2 \
                        and i < L1 - 1 and j < L2 - 1 \
                        and seq1[i + 1] == seq2[j] and seq1[i] == seq2[j + 1]:
                    i, j = i + 2, j + 2
                else:
                    cmd = model[c - 1]
                    if cmd == delete:
                        i = i + 1
                    elif cmd == insert:
                        j = j + 1
                    else:
                        assert cmd == replace
                        i, j = i + 1, j + 1
            else:
                i, j = i + 1, j + 1

        if 2 < c:
            continue
        elif i < L1:
            if L1 - i <= model[c:].count(delete):
                c = c + (L1 - i)
            else:
                continue
        elif j < L2:
            if L2 - j <= model[c:].count(insert):
                c = c + (L2 - j)
            else:
                continue

        if c < res:
            res = c

    if res == 3:
        res = -1
    return res
distance-master/distance/_lcsubstrings.py0000644000175000017500000000224412243645633017272 0ustar jdgjdg
# -*- coding: utf-8 -*-

from array import array


def lcsubstrings(seq1, seq2, positions=False):
    """Find the longest common substring(s) in the sequences `seq1` and `seq2`.

    If `positions` evaluates to `True`, only their positions will be returned,
    together with their length, in a tuple:

        (length, [(start pos in seq1, start pos in seq2)..])

    Otherwise, the substrings themselves will be returned, in a set.

    Example:

        >>> lcsubstrings("sedentar", "dentist")
        {'dent'}
        >>> lcsubstrings("sedentar", "dentist", positions=True)
        (4, [(2, 0)])
    """
    L1, L2 = len(seq1), len(seq2)
    ms = []
    mlen = last = 0
    if L1 < L2:
        seq1, seq2 = seq2, seq1
        L1, L2 = L2, L1

    column = array('L', range(L2))

    for i in range(L1):
        for j in range(L2):
            old = column[j]
            if seq1[i] == seq2[j]:
                if i == 0 or j == 0:
                    column[j] = 1
                else:
                    column[j] = last + 1
                if column[j] > mlen:
                    mlen = column[j]
                    ms = [(i, j)]
                elif column[j] == mlen:
                    ms.append((i, j))
            else:
                column[j] = 0
            last = old

    if positions:
        return (mlen, tuple((i - mlen + 1, j - mlen + 1) for i, j in ms))
    return set(seq1[i - mlen + 1:i + 1] for i, _ in ms)
distance-master/distance/_pyimports.py0000644000175000017500000000021012243645633016605 0ustar jdgjdg
from ._fastcomp import *
from ._lcsubstrings import *
from ._levenshtein import *
from ._simpledists import *
from ._iterators import *
distance-master/distance/_levenshtein.py0000644000175000017500000001045012243645633017072 0ustar jdgjdg
# -*- coding: utf-8 -*-

from array import array


def levenshtein(seq1, seq2, normalized=False, max_dist=-1):
    """Compute the absolute Levenshtein distance between the two sequences
    `seq1` and `seq2`.

    The Levenshtein distance is the minimum number of edit operations
    necessary for transforming one sequence into the other. The edit
    operations allowed are:

        * deletion:     ABC -> BC, AC, AB
        * insertion:    ABC -> ABCD, EABC, AEBC..
        * substitution: ABC -> ABE, ADC, FBC..
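    For example, the classic "kitten" / "sitting" pair needs two
    substitutions and one insertion:

        >>> levenshtein("kitten", "sitting")
        3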
The `max_dist` parameter controls at which moment we should stop computing the distance between the provided sequences. If it is a negative integer, the distance will be computed until the sequences are exhausted; otherwise, the computation will stop at the moment the calculated distance is higher than `max_dist`, and then return -1. For example: >>> levenshtein("abc", "abcd", max_dist=1) # dist = 1 1 >>> levenshtein("abc", "abcde", max_dist=1) # dist = 2 -1 This can be a time saver if you're not interested in the exact distance, but only need to check if the distance between the given sequences is below a given threshold. The `normalized` parameter is here for backward compatibility; providing it will result in a call to `nlevenshtein`, which should be used directly instead. """ if normalized: return nlevenshtein(seq1, seq2, method=1) if seq1 == seq2: return 0 len1, len2 = len(seq1), len(seq2) if max_dist >= 0 and abs(len1 - len2) > max_dist: return -1 if len1 == 0: return len2 if len2 == 0: return len1 if len1 < len2: len1, len2 = len2, len1 seq1, seq2 = seq2, seq1 column = array('L', range(len2 + 1)) for x in range(1, len1 + 1): column[0] = x last = x - 1 for y in range(1, len2 + 1): old = column[y] cost = int(seq1[x - 1] != seq2[y - 1]) column[y] = min(column[y] + 1, column[y - 1] + 1, last + cost) last = old if max_dist >= 0 and min(column) > max_dist: return -1 if max_dist >= 0 and column[len2] > max_dist: # stay consistent, even if we have the exact distance return -1 return column[len2] def nlevenshtein(seq1, seq2, method=1): """Compute the normalized Levenshtein distance between `seq1` and `seq2`. Two normalization methods are provided. For both of them, the normalized distance will be a float between 0 and 1, where 0 means equal and 1 completely different. The computation obeys the following patterns: 0.0 if seq1 == seq2 1.0 if len(seq1) == 0 or len(seq2) == 0 edit distance / factor otherwise The `method` parameter specifies which normalization factor should be used. It can have the value 1 or 2, which correspond to the following: 1: the length of the shortest alignment between the sequences (that is, the length of the longest sequence) 2: the length of the longest alignment between the sequences Which normalization factor should be chosen is a matter of taste. The first one is cheap to compute. The second one is more costly, but it accounts better than the first one for parallelisms of symbols between the sequences. For the rationale behind the use of the second method, see: Heeringa, "Measuring Dialect Pronunciation Differences using Levenshtein Distance", 2004, p. 
130 sq., which is available online at:
    http://www.let.rug.nl/~heeringa/dialectology/thesis/thesis.pdf
    """
    if seq1 == seq2:
        return 0.0
    len1, len2 = len(seq1), len(seq2)
    if len1 == 0 or len2 == 0:
        return 1.0
    if len1 < len2:  # minimize the arrays size
        len1, len2 = len2, len1
        seq1, seq2 = seq2, seq1

    if method == 1:
        return levenshtein(seq1, seq2) / float(len1)
    if method != 2:
        raise ValueError("expected either 1 or 2 for `method` parameter")

    column = array('L', range(len2 + 1))
    length = array('L', range(len2 + 1))

    for x in range(1, len1 + 1):
        column[0] = length[0] = x
        last = llast = x - 1
        for y in range(1, len2 + 1):
            # dist
            old = column[y]
            ic = column[y - 1] + 1
            dc = column[y] + 1
            rc = last + (seq1[x - 1] != seq2[y - 1])
            column[y] = min(ic, dc, rc)
            last = old

            # length
            lold = length[y]
            lic = length[y - 1] + 1 if ic == column[y] else 0
            ldc = length[y] + 1 if dc == column[y] else 0
            lrc = llast + 1 if rc == column[y] else 0
            length[y] = max(ldc, lic, lrc)
            llast = lold

    return column[len2] / float(length[len2])
distance-master/distance/__init__.py0000644000175000017500000000073112243645633016147 0ustar jdgjdg
"Utilities for comparing sequences"

__all__ = ["hamming", "levenshtein", "nlevenshtein", "jaccard", "sorensen",
           "fast_comp", "lcsubstrings", "ilevenshtein", "ifast_comp"]

try:
    from .cdistance import *
except ImportError:
    from ._pyimports import *
from ._pyimports import jaccard, sorensen


def quick_levenshtein(str1, str2):
    return fast_comp(str1, str2, transpositions=False)


def iquick_levenshtein(str1, strs):
    return ifast_comp(str1, strs, transpositions=False)
distance-master/distance/_simpledists.py0000644000175000017500000000317012243645633017107 0ustar jdgjdg
# -*- coding: utf-8 -*-

def hamming(seq1, seq2, normalized=False):
    """Compute the Hamming distance between the two sequences `seq1` and
    `seq2`. The Hamming distance is the number of differing items in two
    ordered sequences of the same length. If the sequences submitted do not
    have the same length, an error will be raised.

    If `normalized` evaluates to `False`, the return value will be an integer
    between 0 and the length of the sequences provided, edge values included;
    otherwise, it will be a float between 0 and 1 inclusive, where 0 means
    equal, and 1 totally different. The normalized Hamming distance is
    computed as:

        0.0                         if len(seq1) == 0
        hamming_dist / len(seq1)    otherwise
    """
    L = len(seq1)
    if L != len(seq2):
        raise ValueError("expected two strings of the same length")
    if L == 0:
        return 0.0 if normalized else 0  # equal
    dist = sum(c1 != c2 for c1, c2 in zip(seq1, seq2))
    if normalized:
        return dist / float(L)
    return dist


def jaccard(seq1, seq2):
    """Compute the Jaccard distance between the two sequences `seq1` and
    `seq2`. They should contain hashable items. The return value is a float
    between 0 and 1, where 0 means equal, and 1 totally different.
    """
    set1, set2 = set(seq1), set(seq2)
    return 1 - len(set1 & set2) / float(len(set1 | set2))


def sorensen(seq1, seq2):
    """Compute the Sorensen distance between the two sequences `seq1` and
    `seq2`. They should contain hashable items. The return value is a float
    between 0 and 1, where 0 means equal, and 1 totally different.
    """
    set1, set2 = set(seq1), set(seq2)
    return 1 - (2 * len(set1 & set2) / float(len(set1) + len(set2)))
distance-master/distance/_iterators.py0000644000175000017500000000303112243645633016557 0ustar jdgjdg
from ._pyimports import levenshtein, fast_comp

def ilevenshtein(seq1, seqs, max_dist=-1):
    """Compute the Levenshtein distance between the sequence `seq1` and the
    series of sequences `seqs`.
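    For example, filtering a few candidates down to those at most two edits
    away:

        >>> sorted(ilevenshtein("foo", ["foo", "fob", "barbar"], max_dist=2))
        [(0, 'foo'), (1, 'fob')]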
    `seq1`: the reference sequence
    `seqs`: a series of sequences (can be a generator)
    `max_dist`: if provided and > 0, only the sequences whose distance from
    the reference sequence is less than or equal to this value will be
    returned.

    The return value is a series of pairs (distance, sequence).

    In the C implementation, the sequence objects in `seqs` are expected to
    be of the same kind as the reference sequence; the same holds true for
    `ifast_comp`.
    """
    for seq2 in seqs:
        dist = levenshtein(seq1, seq2, max_dist=max_dist)
        if dist != -1:
            yield dist, seq2


def ifast_comp(seq1, seqs, transpositions=False):
    """Return an iterator over all the sequences in `seqs` whose distance
    from `seq1` is less than or equal to 2. The sequences whose distance from
    the reference sequence is higher than that are dropped.

    `seq1`: the reference sequence.
    `seqs`: a series of sequences (can be a generator)
    `transpositions` has the same sense as in `fast_comp`.

    The return value is a series of pairs (distance, sequence).

    You might want to call `sorted()` on the iterator to get the results in a
    meaningful order:

        >>> g = ifast_comp("foo", ["fo", "bar", "foob", "foo", "foobaz"])
        >>> sorted(g)
        [(0, 'foo'), (1, 'fo'), (1, 'foob')]
    """
    for seq2 in seqs:
        dist = fast_comp(seq1, seq2, transpositions)
        if dist != -1:
            yield dist, seq2
distance-master/tests/0000755000175000017500000000000012243645633013405 5ustar jdgjdg
distance-master/tests/tests.py0000644000175000017500000001331412243645633015123 0ustar jdgjdg
import os, sys
from array import array

try:
    from distance import cdistance
except ImportError:
    cdistance = None
from distance import _pyimports as pydistance

if sys.version_info.major < 3:
    t_unicode = unicode
    t_bytes = lambda s: s
else:
    t_unicode = lambda s: s
    t_bytes = lambda s: s.encode()

all_types = [
    ("unicode", t_unicode),
    ("bytes", t_bytes),
    ("list", list),
    ("tuple", tuple),
]

def hamming(func, t, **kwargs):
    # types; only for C
    if kwargs["lang"] == "C":
        try:
            func(1, t("foo"))
        except ValueError:
            pass
        try:
            func(t("foo"), 1)
        except ValueError:
            pass
    # empty string
    assert func(t(""), t("")) == 0
    # common
    assert func(t("abc"), t("abc")) == 0
    assert func(t("abc"), t("abd")) == 1
    # wrong length
    try:
        func(t("foo"), t("foobar"))
    except ValueError:
        pass
    try:
        func(t(""), t("foo"))
    except ValueError:
        pass
    # normalization
    assert func(t(""), t(""), normalized=True) == 0.0
    assert func(t("abc"), t("abc"), normalized=True) == 0.0
    assert func(t("ab"), t("ac"), normalized=True) == 0.5
    assert func(t("abc"), t("def"), normalized=True) == 1.0

def fast_comp(func, t, **kwargs):
    # types; only for C
    if kwargs["lang"] == "C":
        try:
            func(1, t("foo"))
        except ValueError:
            pass
        try:
            func(t("foo"), 1)
        except ValueError:
            pass
    # empty strings
    assert func(t(""), t("")) == 0
    assert func(t(""), t("a")) == func(t("a"), t("")) == 1
    # edit ops
    assert func(t("aa"), t("aa")) == 0
    assert func(t("ab"), t("aa")) == 1
    assert func(t("ab"), t("a")) == 1
    assert func(t("ab"), t("abc")) == 1
    # dist limit
    assert func(t("a"), t("bcd")) == func(t("bcd"), t("a")) == -1
    # transpositions
    assert func(t("abc"), t("bac"), transpositions=True) == \
        func(t("bac"), t("abc"), transpositions=True) == 1

def levenshtein(func, t, **kwargs):
    # types; only for C
    if kwargs["lang"] == "C":
        try:
            func(1, t("foo"))
        except ValueError:
            pass
        try:
            func(t("foo"), 1)
        except ValueError:
            pass
    # empty strings
    assert func(t(""), t("")) == 0
    assert func(t(""), t("abcd")) == func(t("abcd"), t("")) == 4
    # edit ops
    assert func(t("aa"), t("aa")) == 0
    assert func(t("ab"), t("aa")) == 1
    assert func(t("ab"), t("a")) == 1
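    # extra sanity check (added example): one deletion plus one insertion
    # ("flaw" -> "law" -> "lawn")
    assert func(t("flaw"), t("lawn")) == 2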
assert func(t("ab"), t("abc")) == 1 # dist limit assert func(t("a"), t("b"), max_dist=0) == -1 assert func(t("a"), t("b"), max_dist=1) == 1 assert func(t("foo"), t("bar"), max_dist=-1) == 3 def nlevenshtein(func, t, **kwargs): # types; only for c if kwargs["lang"] == "C": try: func(1, t("foo")) except ValueError: pass try: func(t("foo"), 1) except ValueError: pass # empty strings assert func(t(""), t(""), 1) == func(t(""), t(""), 2) == 0.0 assert func(t(""), t("foo"), 1) == func(t("foo"), t(""), 1) == \ func(t(""), t("foo"), 2) == func(t("foo"), t(""), 2) == 1.0 assert func(t("aa"), t("aa"), 1) == func(t("aa"), t("aa"), 2) == 0.0 assert func(t("ab"), t("aa"), 1) == func(t("ab"), t("aa"), 2) == 0.5 assert func(t("ab"), t("a"), 1) == func(t("ab"), t("a"), 2) == 0.5 assert func(t("ab"), t("abc"), 1) == func(t("ab"), t("abc"), 2) == 0.3333333333333333 # multiple alignments assert func(t("abc"), t("adb"), 1) == 0.6666666666666666 assert func(t("abc"), t("adb"), 2) == 0.5 def lcsubstrings(func, t, **kwargs): # types; only for c if kwargs["lang"] == "C": try: func(1, t("foo")) except ValueError: pass try: func(t("foo"), 1) except ValueError: pass # empty strings try: assert func(t(""), t(""), False) == set() except TypeError: if t is not list: raise assert func(t(""), t(""), True) == (0, ()) try: assert func(t(""), t("foo"), False) == func(t("foo"), t(""), False) == set() except TypeError: if t is not list: raise assert func(t(""), t("foo"), True) == func(t("foo"), t(""), True) == (0, ()) # common try: assert func(t("abcd"), t("cdba"), False) == {t('cd')} except TypeError: if t is not list: raise assert func(t("abcd"), t("cdba"), True) == (2, ((2, 0),)) # reverse try: assert func(t("abcdef"), t("cdba"), False) == func(t("cdba"), t("abcdef"), False) except TypeError: if t is not list: raise assert func(t("abcdef"), t("cdba"), True) == func(t("cdba"), t("abcdef"), True) def itors_common(func, t, **kwargs): if kwargs["lang"] == "C": # types check; only need to do it for C impl to avoid an eventual segfaults. try: func(1, t("foo")) except ValueError: pass itor = func(t("foo"), [t("foo"), 3333]) next(itor) try: next(itor) except ValueError: pass # values drop itor = func(t("aa"), [t("aa"), t("abcd"), t("ba")]) assert next(itor) == (0, t("aa")) assert next(itor) == (1, t("ba")) def ilevenshtein(func, t, **kwargs): itors_common(lambda a, b: func(a, b, max_dist=2), t, **kwargs) def ifast_comp(func, t, **kwargs): itors_common(func, t, **kwargs) #transpositions g = func(t("abc"), [t("bac")], transpositions=False) assert next(g) == (2, t('bac')) g = func(t("abc"), [t("bac")], transpositions=True) assert next(g) == (1, t("bac")) write = lambda s: sys.stderr.write(s + '\n') tests = ["hamming", "fast_comp", "levenshtein", "lcsubstrings", "nlevenshtein", "ilevenshtein", "ifast_comp"] def run_test(name): if cdistance: cfunc = getattr(cdistance, name) run_lang_test(name, cfunc, "C") write("") pyfunc = getattr(pydistance, name) run_lang_test(name, pyfunc, "py") if cdistance is None: write("skipped C tests") write("") def run_lang_test(name, func, lang): print("%s (%s)..." 
% (name, lang)) for tname, typ in all_types: write("type: %s" % tname) globals()[name](func, typ, lang=lang) if __name__ == "__main__": args = sys.argv[1:] if not args: for test in tests: run_test(test) sys.exit() for name in args: if name in tests: run_test(name) else: write("no such test: %s" % name) sys.exit(1) distance-master/.gitignore0000644000175000017500000000047312243645633014237 0ustar jdgjdg*.py[cod] # C extensions *.so # Packages *.egg *.egg-info dist build eggs parts bin var sdist develop-eggs .installed.cfg lib lib64 __pycache__ # Installer logs pip-log.txt # Unit test / coverage reports .coverage .tox nosetests.xml # Translations *.mo # Mr Developer .mr.developer.cfg .project .pydevproject distance-master/LICENSE0000644000175000017500000004543212243645633013260 0ustar jdgjdgdistance license ================ Copyright (C) 2013 Michaël Meyer GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. 
To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. 
But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. 
Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. 
If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. {description} Copyright (C) {year} {fullname} This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 
This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. {signature of Ty Coon}, 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. fastcomp license ================ MIT LICENSE Copyright (c) 2012 Fujimoto Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
distance-master/cdistance/0000755000175000017500000000000012243645633014200 5ustar jdgjdgdistance-master/cdistance/distance.c0000644000175000017500000004310712243645633016143 0ustar jdgjdg#include "distance.h" #include "includes.h" static unicode * get_unicode(PyObject *obj, Py_ssize_t *len) { unicode *u; if ((u = PyUnicode_AS_UNICODE(obj)) == NULL) { PyErr_Format(PyExc_RuntimeError, "failed to get unicode representation of object"); return NULL; } *len = PyUnicode_GET_LENGTH(obj); return u; } static byte * get_byte(PyObject *obj, Py_ssize_t *len) { byte *b; b = PyBytes_AS_STRING(obj); *len = PyBytes_GET_SIZE(obj); return b; } static array * get_array(PyObject *obj, Py_ssize_t *len) { array *a; if ((a = PySequence_Fast(obj, "we got a problem")) == NULL) return NULL; *len = PySequence_Fast_GET_SIZE(a); return a; } static char get_sequence(PyObject *obj, sequence *seq, Py_ssize_t *len, char type) { char t = '\0'; if (PyUnicode_Check(obj)) { t = 'u'; if ((seq->u = get_unicode(obj, len)) == NULL) return '\0'; } else if (PyBytes_Check(obj)) { t = 'b'; if ((seq->b = get_byte(obj, len)) == NULL) return '\0'; } else if (PySequence_Check(obj)) { t = 'a'; if ((seq->a = get_array(obj, len)) == NULL) return '\0'; } if (!t) { PyErr_SetString(PyExc_ValueError, "expected a sequence object as first argument"); return '\0'; } if (type && t != type) { PyErr_SetString(PyExc_ValueError, "type mismatch between the " "value provided as left argument and one of the elements in " "the right one, can't process the later"); if (t == 'a') Py_DECREF(seq->a); return '\0'; } return t; } static char get_sequences(PyObject *arg1, PyObject *arg2, sequence *seq1, sequence *seq2, Py_ssize_t *len1, Py_ssize_t *len2) { if (PyUnicode_Check(arg1) && PyUnicode_Check(arg2)) { if ((seq1->u = get_unicode(arg1, len1)) == NULL) return '\0'; if ((seq2->u = get_unicode(arg2, len2)) == NULL) return '\0'; return 'u'; } else if (PyBytes_Check(arg1) && PyBytes_Check(arg2)) { if ((seq1->b = get_byte(arg1, len1)) == NULL) return '\0'; if ((seq2->b = get_byte(arg2, len2)) == NULL) return '\0'; return 'b'; } else if (PySequence_Check(arg1) && PySequence_Check(arg2)) { if ((seq1->a = get_array(arg1, len1)) == NULL) return '\0'; if ((seq2->a = get_array(arg2, len2)) == NULL) { Py_DECREF(seq1->a); /* warning ! 
*/ return '\0'; } return 'a'; } PyErr_SetString(PyExc_ValueError, "expected two sequence objects"); return '\0'; } static PyObject * hamming_py(PyObject *self, PyObject *args, PyObject *kwargs) { PyObject *arg1, *arg2, *odo_normalize = NULL; int do_normalize = 0; static char *keywords[] = {"seq1", "seq2", "normalized", NULL}; char type; sequence seq1, seq2; Py_ssize_t len1, len2; Py_ssize_t dist; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|O:hamming", keywords, &arg1, &arg2, &odo_normalize)) return NULL; if (odo_normalize && (do_normalize = PyObject_IsTrue(odo_normalize)) == -1) return NULL; if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0') return NULL; if (len1 != len2) { PyErr_SetString(PyExc_ValueError, "expected two objects of the same length"); if (type == 'a') { Py_DECREF(seq1.a); Py_DECREF(seq2.a); } return NULL; } switch(type) { case 'u': dist = uhamming(seq1.u, seq2.u, len1); break; case 'b': dist = bhamming(seq1.b, seq2.b, len1); break; default: dist = ahamming(seq1.a, seq2.a, len1); Py_DECREF(seq1.a); Py_DECREF(seq2.a); } if (dist == -1) // comparison failed return NULL; if (do_normalize) { if (len1 == 0) return Py_BuildValue("f", 0.0f); return Py_BuildValue("d", dist / (double)len1); } return Py_BuildValue("n", dist); } static PyObject * lcsubstrings_py_make_set(PyObject *arg1, PyObject *arg2, UT_array *stack, Py_ssize_t mlen) { PyObject *set, *ss; struct pair_t *pair; if ((set = PySet_New(NULL)) == NULL) { utarray_free(stack); return NULL; } for (pair = (struct pair_t*)utarray_front(stack); pair != NULL; pair = (struct pair_t*)utarray_next(stack, pair)) { ss = PySequence_GetSlice(arg2, pair->j - mlen + 1, pair->j + 1); if (ss == NULL) goto On_Error; if ((PySet_Add(set, ss)) == -1) goto On_Error; } utarray_free(stack); return set; On_Error: PySet_Clear(set); Py_DECREF(set); utarray_free(stack); return NULL; } static PyObject * lcsubstrings_py_make_tuple(PyObject *arg1, PyObject *arg2, UT_array *stack, Py_ssize_t mlen) { PyObject *tp, *stp; Py_ssize_t i; struct pair_t *pair; if ((stp = PyTuple_New(utarray_len(stack))) == NULL) { utarray_free(stack); return NULL; } for (i = 0, pair = (struct pair_t*)utarray_front(stack); pair != NULL; ++i, pair = (struct pair_t*)utarray_next(stack, pair)) { PyTuple_SET_ITEM(stp, i, Py_BuildValue("(nn)", pair->i - mlen + 1, pair->j - mlen + 1)); } if ((tp = PyTuple_New(2)) == NULL) { utarray_free(stack); Py_DECREF(stp); return NULL; } PyTuple_SET_ITEM(tp, 0, Py_BuildValue("n", mlen)); PyTuple_SET_ITEM(tp, 1, stp); utarray_free(stack); return tp; } static PyObject * lcsubstrings_py(PyObject *self, PyObject *args, PyObject *kwargs) { PyObject *arg1, *arg2, *opos = NULL; int positions = 0; static char *keywords[] = {"seq1", "seq2", "positions", NULL}; char type; sequence seq1, seq2; Py_ssize_t len1, len2; UT_array *stack; Py_ssize_t mlen = -1; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|O:lcsubstrings", keywords, &arg1, &arg2, &opos)) return NULL; if (opos && (positions = PyObject_IsTrue(opos)) == -1) return NULL; if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0') return NULL; // special case if (type == 'a' && (!positions) && (PyList_Check(arg1) || PyList_Check(arg2))) { Py_DECREF(seq1.a); Py_DECREF(seq2.a); PyErr_SetString(PyExc_TypeError, "can't hash lists, pass in tuples instead"); return NULL; } if (len1 < len2) { SWAP(PyObject *, arg1, arg2); SWAP(sequence, seq1, seq2); SWAP(Py_ssize_t, len1, len2); } switch(type) { case 'u': stack = ulcsubstrings(seq1.u, seq2.u, len1, len2, 
&mlen); break;
        case 'b':
            stack = blcsubstrings(seq1.b, seq2.b, len1, len2, &mlen);
            break;
        default:
            stack = alcsubstrings(seq1.a, seq2.a, len1, len2, &mlen);
            Py_DECREF(seq1.a);
            Py_DECREF(seq2.a);
    }

    if (stack == NULL) {
        /* memory allocation failed */
        return PyErr_NoMemory();
    }
    if (positions)
        return lcsubstrings_py_make_tuple(arg1, arg2, stack, mlen);
    return lcsubstrings_py_make_set(arg1, arg2, stack, mlen);
}


static PyObject *
nlevenshtein_py(PyObject *self, PyObject *args, PyObject *kwargs)
{
    PyObject *arg1, *arg2;
    short method = 1;
    static char *keywords[] = {"seq1", "seq2", "method", NULL};
    char type;
    sequence seq1, seq2;
    Py_ssize_t len1, len2;
    double dist;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|h:nlevenshtein", keywords,
        &arg1, &arg2, &method))
        return NULL;
    if (method != 1 && method != 2) {
        PyErr_SetString(PyExc_ValueError, "expected either 1 or 2 for `method` parameter");
        return NULL;
    }

    if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0')
        return NULL;

    if (len1 < len2) {
        SWAP(sequence, seq1, seq2);
        SWAP(Py_ssize_t, len1, len2);
    }

    switch(type) {
        case 'u':
            dist = unlevenshtein(seq1.u, seq2.u, len1, len2, method);
            break;
        case 'b':
            dist = bnlevenshtein(seq1.b, seq2.b, len1, len2, method);
            break;
        default:
            dist = anlevenshtein(seq1.a, seq2.a, len1, len2, method);
            Py_DECREF(seq1.a);
            Py_DECREF(seq2.a);
    }

    if (dist < 0) {
        if (dist == -1)  // memory allocation failed
            return PyErr_NoMemory();
        return NULL;  // comparison failed
    }
    return Py_BuildValue("d", dist);
}


static PyObject *
levenshtein_py(PyObject *self, PyObject *args, PyObject *kwargs)
{
    PyObject *arg1, *arg2, *onorm = NULL;
    Py_ssize_t dist = -1;
    Py_ssize_t max_dist = -1;
    int normalized = 0;
    static char *keywords[] = {"seq1", "seq2", "normalized", "max_dist", NULL};
    char type;
    sequence seq1, seq2;
    Py_ssize_t len1, len2;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|On:levenshtein", keywords,
        &arg1, &arg2, &onorm, &max_dist))
        return NULL;
    if (onorm && (normalized = PyObject_IsTrue(onorm)) == -1)
        return NULL;
    if (normalized) {
        onorm = NULL;
        return nlevenshtein_py(self, args, onorm);
    }

    if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0')
        return NULL;

    switch(type) {
        case 'u':
            dist = ulevenshtein(seq1.u, seq2.u, len1, len2, max_dist);
            break;
        case 'b':
            dist = blevenshtein(seq1.b, seq2.b, len1, len2, max_dist);
            break;
        default:
            dist = alevenshtein(seq1.a, seq2.a, len1, len2, max_dist);
            Py_DECREF(seq1.a);
            Py_DECREF(seq2.a);
    }

    if (dist < -1) {
        if (dist == -2)
            return PyErr_NoMemory();  // memory allocation failed
        return NULL;  // comparison failed
    }
    return Py_BuildValue("n", dist);
}


static PyObject *
fastcomp_py(PyObject *self, PyObject *args, PyObject *kwargs)
{
    PyObject *arg1, *arg2, *otr = NULL;
    int transpositions = 0;
    static char *keywords[] = {"seq1", "seq2", "transpositions", NULL};
    char type;
    sequence seq1, seq2;
    Py_ssize_t len1, len2;
    short dist;

    /* parse the `transpositions` keyword into the object pointer `otr`, not
       directly into the int flag: the object pointer was previously written
       over `transpositions`, and `otr` was left unused */
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|O:fast_comp", keywords,
        &arg1, &arg2, &otr))
        return NULL;
    if (otr && (transpositions = PyObject_IsTrue(otr)) == -1)
        return NULL;

    if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0')
        return NULL;

    if (len1 < len2) {
        SWAP(sequence, seq1, seq2);
        SWAP(Py_ssize_t, len1, len2);
    }

    switch(type) {
        case 'u':
            dist = ufastcomp(seq1.u, seq2.u, len1, len2, transpositions);
            break;
        case 'b':
            dist = bfastcomp(seq1.b, seq2.b, len1, len2, transpositions);
            break;
        default:
            dist = afastcomp(seq1.a, seq2.a, len1, len2, transpositions);
            Py_DECREF(seq1.a);
            Py_DECREF(seq2.a);
    }

    if (dist == -2)  // comparison failed
        return NULL;
    return Py_BuildValue("h", dist);
}


// Iterators (for levenshtein and fastcomp). They share the same structure.

typedef struct {
    PyObject_HEAD
    PyObject *itor;
    char seqtype;         // type of the sequence ('u', 'b', 'a')
    sequence seq1;        // the sequence itself
    Py_ssize_t len1;      // its length
    PyObject *object;     // the corresponding pyobject
    int transpos;         // only used by fastcomp
    Py_ssize_t max_dist;  // only for levenshtein
} ItorState;


static void
itor_dealloc(ItorState *state)
{
    // we hold two references for tuples and lists: one for the original
    // python object, and one returned by `PySequence_Fast`
    if (state->seqtype == 'a')
        Py_XDECREF(state->seq1.a);
    Py_XDECREF(state->object);
    Py_XDECREF(state->itor);
    Py_TYPE(state)->tp_free(state);
}


static PyObject *
ifastcomp_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
    PyObject *arg1, *arg2, *itor, *otr = NULL;
    int transpositions = 0;
    static char *keywords[] = {"seq1", "seqs", "transpositions", NULL};
    char seqtype;
    sequence seq1;
    Py_ssize_t len1;

    /* `otr` was previously used without being declared or parsed; parse the
       `transpositions` keyword into it, as in `fastcomp_py` above */
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|O:ifast_comp", keywords,
        &arg1, &arg2, &otr))
        return NULL;
    if (otr && (transpositions = PyObject_IsTrue(otr)) == -1)
        return NULL;

    if ((seqtype = get_sequence(arg1, &seq1, &len1, '\0')) == '\0')
        return NULL;

    if ((itor = PyObject_GetIter(arg2)) == NULL) {
        PyErr_SetString(PyExc_ValueError, "expected an iterable as second argument");
        return NULL;
    }

    ItorState *state = (ItorState *)type->tp_alloc(type, 0);
    if (state == NULL) {
        Py_DECREF(itor);
        return NULL;
    }

    Py_INCREF(arg1);
    state->itor = itor;
    state->seqtype = seqtype;
    state->seq1 = seq1;
    state->object = arg1;
    state->len1 = len1;
    state->transpos = transpositions;

    return (PyObject *)state;
}


static PyObject *
ilevenshtein_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
    PyObject *arg1, *arg2, *itor;
    Py_ssize_t max_dist = -1;
    static char *keywords[] = {"seq1", "seqs", "max_dist", NULL};
    char seqtype;
    sequence seq1;
    Py_ssize_t len1;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|n:ilevenshtein", keywords,
        &arg1, &arg2, &max_dist))
        return NULL;

    if ((seqtype = get_sequence(arg1, &seq1, &len1, '\0')) == '\0')
        return NULL;

    if ((itor = PyObject_GetIter(arg2)) == NULL) {
        PyErr_SetString(PyExc_ValueError, "expected an iterable as second argument");
        return NULL;
    }

    ItorState *state = (ItorState *)type->tp_alloc(type, 0);
    if (state == NULL) {
        Py_DECREF(itor);
        return NULL;
    }

    Py_INCREF(arg1);
    state->itor = itor;
    state->seqtype = seqtype;
    state->seq1 = seq1;
    state->object = arg1;
    state->len1 = len1;
    state->max_dist = max_dist;

    return (PyObject *)state;
}


static PyObject *
ilevenshtein_next(ItorState *state)
{
    PyObject *arg2;
    sequence seq1, seq2;
    Py_ssize_t len2;
    Py_ssize_t dist = -1;
    PyObject *rv;

    seq1 = state->seq1;

    while ((arg2 = PyIter_Next(state->itor)) != NULL) {
        if (get_sequence(arg2, &seq2, &len2, state->seqtype) == '\0') {
            Py_DECREF(arg2);
            return NULL;
        }
        switch(state->seqtype) {
            case 'u':
                dist = ulevenshtein(seq1.u, seq2.u, state->len1, len2, state->max_dist);
                break;
            case 'b':
                dist = blevenshtein(seq1.b, seq2.b, state->len1, len2, state->max_dist);
                break;
            default:
                dist = alevenshtein(seq1.a, seq2.a, state->len1, len2, state->max_dist);
                Py_DECREF(seq2.a);
        }
        if (dist < -1) {
            Py_DECREF(arg2);
            if (dist == -2)
                return PyErr_NoMemory();  // memory allocation failed
            return NULL;  // comparison failed
        }
        if (dist != -1) {
            rv = Py_BuildValue("(nO)", dist, arg2);
            Py_DECREF(arg2);
            return rv;
        }
        Py_DECREF(arg2);
    }

    return NULL;
}


static PyObject *
ifastcomp_next(ItorState
*state) { PyObject *arg2; sequence seq1, seq2; Py_ssize_t len2; short dist = -1; PyObject *rv; seq1 = state->seq1; while ((arg2 = PyIter_Next(state->itor)) != NULL) { if (get_sequence(arg2, &seq2, &len2, state->seqtype) == '\0') { Py_DECREF(arg2); return NULL; } switch(state->seqtype) { case 'u': dist = ufastcomp(seq1.u, seq2.u, state->len1, len2, state->transpos); break; case 'b': dist = bfastcomp(seq1.b, seq2.b, state->len1, len2, state->transpos); break; default: dist = afastcomp(seq1.a, seq2.a, state->len1, len2, state->transpos); Py_DECREF(seq2.a); } if (dist == -2) { // comparison failed Py_DECREF(arg2); return NULL; } if (dist != -1) { rv = Py_BuildValue("(hO)", dist, arg2); Py_DECREF(arg2); return rv; } Py_DECREF(arg2); } return NULL; } PyTypeObject IFastComp_Type = { PyVarObject_HEAD_INIT(&PyType_Type, 0) "distance.ifast_comp", /* tp_name */ sizeof(ItorState), /* tp_basicsize */ 0, /* tp_itemsize */ (destructor)itor_dealloc, /* tp_dealloc */ 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ 0, /* tp_reserved */ 0, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ 0, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT, /* tp_flags */ ifast_comp_doc, /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ PyObject_SelfIter, /* tp_iter */ (iternextfunc)ifastcomp_next, /* tp_iternext */ 0, /* tp_methods */ 0, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ 0, /* tp_init */ PyType_GenericAlloc, /* tp_alloc */ ifastcomp_new, /* tp_new */ }; PyTypeObject ILevenshtein_Type = { PyVarObject_HEAD_INIT(&PyType_Type, 0) "distance.ilevenshtein", /* tp_name */ sizeof(ItorState), /* tp_basicsize */ 0, /* tp_itemsize */ (destructor)itor_dealloc, /* tp_dealloc */ 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ 0, /* tp_reserved */ 0, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ 0, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT, /* tp_flags */ ilevenshtein_doc, /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ PyObject_SelfIter, /* tp_iter */ (iternextfunc)ilevenshtein_next, /* tp_iternext */ 0, /* tp_methods */ 0, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ 0, /* tp_init */ PyType_GenericAlloc, /* tp_alloc */ ilevenshtein_new, /* tp_new */ }; static PyMethodDef CDistanceMethods[] = { {"hamming", (PyCFunction)hamming_py, METH_VARARGS | METH_KEYWORDS, hamming_doc}, {"levenshtein", (PyCFunction)levenshtein_py, METH_VARARGS | METH_KEYWORDS, levenshtein_doc}, {"nlevenshtein", (PyCFunction)nlevenshtein_py, METH_VARARGS | METH_KEYWORDS, nlevenshtein_doc}, {"lcsubstrings", (PyCFunction)lcsubstrings_py, METH_VARARGS | METH_KEYWORDS, lcsubstrings_doc}, {"fast_comp", (PyCFunction)fastcomp_py, METH_VARARGS | METH_KEYWORDS, fast_comp_doc}, {NULL, NULL, 0, NULL} }; #if PY_MAJOR_VERSION >= 3 static struct PyModuleDef cdistancemodule = { PyModuleDef_HEAD_INIT, "cdistance", NULL, -1, CDistanceMethods }; #endif #if PY_MAJOR_VERSION >= 3 PyMODINIT_FUNC PyInit_cdistance(void) #else PyMODINIT_FUNC initcdistance(void) #endif { PyObject *module; #if PY_MAJOR_VERSION >= 3 if ((module = 
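/* Under Python 3 the module is created from the PyModuleDef declared above; the Python 2 branch below uses Py_InitModule with the same method table. */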
PyModule_Create(&cdistancemodule)) == NULL) return NULL; #else if ((module = Py_InitModule("cdistance", CDistanceMethods)) == NULL) return; #endif if (PyType_Ready(&IFastComp_Type) != 0 || PyType_Ready(&ILevenshtein_Type) != 0) #if PY_MAJOR_VERSION >= 3 return NULL; #else return; #endif Py_INCREF((PyObject *)&IFastComp_Type); Py_INCREF((PyObject *)&ILevenshtein_Type); PyModule_AddObject(module, "ifast_comp", (PyObject *)&IFastComp_Type); PyModule_AddObject(module, "ilevenshtein", (PyObject *)&ILevenshtein_Type); #if PY_MAJOR_VERSION >= 3 return module; #endif } distance-master/cdistance/fastcomp.c0000644000175000017500000000527112243645633016165 0ustar jdgjdg#include "distance.h" static short fastcomp(unicode *seq1, unicode *seq2, Py_ssize_t len1, Py_ssize_t len2, int transpositions) { char *models[3]; short m, cnt, res = 3; Py_ssize_t i, j, c, ldiff; #ifdef SEQUENCE_COMP int comp; #endif if (len1 < len2) { SWAP(unicode *, seq1, seq2); SWAP(Py_ssize_t, len1, len2); } ldiff = len1 - len2; switch (ldiff) { case 0: models[2] = "id"; models[1] = "di"; models[0] = "rr"; m = 2; break; case 1: models[1] = "dr"; models[0] = "rd"; m = 1; break; case 2: models[0] = "dd"; m = 0; break; default: return -1; } for (; m >= 0; m--) { i = j = c = 0; while (i < len1 && j < len2) { #ifdef SEQUENCE_COMP comp = SEQUENCE_COMP(seq1, i, seq2, j); if (comp == -1) return -2; if (!comp) { #else if (seq1[i] != seq2[j]) { #endif c++; if (c > 2) break; /* Transpositions handling. `ldiff`, which is the absolute difference between the length of the sequences `seq1` and `seq2`, should not be equal to 2 because in this case only deletions can happen (given that the distance between the two sequences should not be higher than 2, this is the shortest path). We do a lookahead to check if a transposition is possible between the current position and the next one, and, if so, we systematically choose this path over the other alternative edit operations. We act like so because the cost of a transposition is always the lowest one in such situations. */ #ifdef SEQUENCE_COMP if (transpositions && ldiff != 2 && i < (len1 - 1) && j < (len2 - 1)) { comp = SEQUENCE_COMP(seq1, i + 1, seq2, j); if (comp == -1) return -2; else if (comp) { comp = SEQUENCE_COMP(seq1, i, seq2, j + 1); if (comp == -1) return -2; else if (comp) { i = i + 2; j = j + 2; continue; } } } #else if (transpositions && ldiff != 2 && i < (len1 - 1) && j < (len2 - 1) && \ seq1[i + 1] == seq2[j] && \ seq1[i] == seq2[j + 1]) { i = i + 2; j = j + 2; continue; } #endif if (models[m][c - 1] == 'd') i++; else if (models[m][c - 1] == 'i') j++; else { i++; j++; } } else { i++; j++; } } if (c > 2) continue; else if (i < len1) { if (c == 1) cnt = (models[m][1] == 'd'); else cnt = (models[m][0] == 'd') + (models[m][1] == 'd'); if (len1 - i <= cnt) { c = c + (len1 - i); } else continue; } else if (j < len2) { if (len2 - j <= (models[m][c] == 'i')) c = c + (len2 - j); else continue; } if (c < res) { res = c; } } if (res == 3) res = -1; return res; } distance-master/cdistance/includes.h0000644000175000017500000002111112243645633016153 0ustar jdgjdg#define hamming_doc \ "hamming(seq1, seq2, normalized=False)\n\ \n\ Compute the Hamming distance between the two sequences `seq1` and `seq2`.\n\ The Hamming distance is the number of differing items in two ordered\n\ sequences of the same length. 
If the sequences submitted do not have the\n\ same length, an error will be raised.\n\ \n\ If `normalized` evaluates to `False`, the return value will be an integer\n\ between 0 and the length of the sequences provided, edge values included;\n\ otherwise, it will be a float between 0 and 1 inclusive, where 0 means\n\ equal, and 1 completely different. The normalized Hamming distance is computed as:\n\ \n\ 0.0 if len(seq1) == 0\n\ hamming_dist / len(seq1) otherwise" #define jaccard_doc \ "jaccard(seq1, seq2)\n\ \n\ Compute the Jaccard distance between the two sequences `seq1` and `seq2`.\n\ They should contain hashable items.\n\ \n\ The return value is a float between 0 and 1, where 0 means equal, and 1 completely different." #define sorensen_doc \ "sorensen(seq1, seq2)\n\ \n\ Compute the Sorensen distance between the two sequences `seq1` and `seq2`.\n\ They should contain hashable items.\n\ \n\ The return value is a float between 0 and 1, where 0 means equal, and 1 completely different." #define lcsubstrings_doc \ "lcsubstrings(seq1, seq2, positions=False)\n\ \n\ Find the longest common substring(s) in the sequences `seq1` and `seq2`.\n\ \n\ If `positions` evaluates to `True`, only their positions will be returned,\n\ together with their length, in a tuple:\n\ \n\ (length, [(start pos in seq1, start pos in seq2)..])\n\ \n\ Otherwise, the substrings themselves will be returned, in a set.\n\ \n\ Example:\n\ \n\ >>> lcsubstrings(\"sedentar\", \"dentist\")\n\ {'dent'}\n\ >>> lcsubstrings(\"sedentar\", \"dentist\", positions=True)\n\ (4, [(2, 0)])" #define ilevenshtein_doc \ "ilevenshtein(seq1, seqs, max_dist=-1)\n\ \n\ Compute the Levenshtein distance between the sequence `seq1` and the series\n\ of sequences `seqs`.\n\ \n\ `seq1`: the reference sequence\n\ `seqs`: a series of sequences (can be a generator)\n\ `max_dist`: if provided and >= 0, only the sequences whose distance from\n\ the reference sequence is lower than or equal to this value will be returned.\n\ \n\ The return value is a series of pairs (distance, sequence).\n\ \n\ The sequence objects in `seqs` are expected to be of the same kind as\n\ the reference sequence in the C implementation; the same holds true for\n\ `ifast_comp`." #define ifast_comp_doc \ "ifast_comp(seq1, seqs, transpositions=False)\n\ \n\ Return an iterator over all the sequences in `seqs` whose distance from\n\ `seq1` is lower than or equal to 2. The sequences whose distance from the\n\ reference sequence is higher than that are dropped.\n\ \n\ `seq1`: the reference sequence.\n\ `seqs`: a series of sequences (can be a generator)\n\ `transpositions` has the same meaning as in `fast_comp`.\n\ \n\ The return value is a series of pairs (distance, sequence).\n\ \n\ You might want to call `sorted()` on the iterator to get the results in a\n\ meaningful order:\n\ \n\ >>> g = ifast_comp(\"foo\", [\"fo\", \"bar\", \"foob\", \"foo\", \"foobaz\"])\n\ >>> sorted(g)\n\ [(0, 'foo'), (1, 'fo'), (1, 'foob')]" #define fast_comp_doc \ "fast_comp(seq1, seq2, transpositions=False)\n\ \n\ Compute the distance between the two sequences `seq1` and `seq2` up to a\n\ maximum of 2 inclusive, and return it. If the edit distance between the two\n\ sequences is higher than that, -1 is returned.\n\ \n\ If `transpositions` is `True`, transpositions will be taken into account for\n\ the computation of the distance.
This can make a difference, e.g.:\n\ \n\ >>> fast_comp(\"abc\", \"bac\", transpositions=False)\n\ 2\n\ >>> fast_comp(\"abc\", \"bac\", transpositions=True)\n\ 1\n\ \n\ This is faster than `levenshtein` by an order of magnitude, but on the\n\ other hand is of limited use.\n\ \n\ The algorithm comes from `http://writingarchives.sakura.ne.jp/fastcomp`.\n\ I've added transpositions support to the original code." #define levenshtein_doc \ "levenshtein(seq1, seq2, max_dist=-1, normalized=False)\n\ \n\ Compute the absolute Levenshtein distance between the two sequences\n\ `seq1` and `seq2`.\n\ \n\ The Levenshtein distance is the minimum number of edit operations necessary\n\ for transforming one sequence into the other. The edit operations allowed are:\n\ \n\ * deletion: ABC -> BC, AC, AB\n\ * insertion: ABC -> ABCD, EABC, AEBC..\n\ * substitution: ABC -> ABE, ADC, FBC..\n\ \n\ The `max_dist` parameter controls when to stop computing the\n\ distance between the provided sequences. If it is a negative integer, the\n\ distance will be computed until the sequences are exhausted; otherwise, the\n\ computation will stop as soon as the calculated distance exceeds\n\ `max_dist`, and -1 will be returned. For example:\n\ \n\ >>> levenshtein(\"abc\", \"abcd\", max_dist=1) # dist = 1\n\ 1\n\ >>> levenshtein(\"abc\", \"abcde\", max_dist=1) # dist = 2\n\ -1\n\ \n\ This can be a time saver if you're not interested in the exact distance, but\n\ only need to check if the distance between the given sequences is below a\n\ given threshold.\n\ \n\ The `normalized` parameter is here for backward compatibility; providing\n\ it will result in a call to `nlevenshtein`, which should be used directly\n\ instead." #define nlevenshtein_doc \ "nlevenshtein(seq1, seq2, method=1)\n\ \n\ Compute the normalized Levenshtein distance between `seq1` and `seq2`.\n\ \n\ Two normalization methods are provided. For both of them, the normalized\n\ distance will be a float between 0 and 1, where 0 means equal and 1\n\ completely different. The computation obeys the following pattern:\n\ \n\ 0.0 if seq1 == seq2\n\ 1.0 if len(seq1) == 0 or len(seq2) == 0\n\ edit distance / factor otherwise\n\ \n\ The `method` parameter specifies which normalization factor should be used.\n\ It can have the value 1 or 2, which correspond to the following:\n\ \n\ 1: the length of the shortest alignment between the sequences\n\ (that is, the length of the longest sequence)\n\ 2: the length of the longest alignment between the sequences\n\ \n\ Which normalization factor should be chosen is a matter of taste. The first\n\ one is cheap to compute. The second one is more costly, but it accounts\n\ better than the first one for parallelisms of symbols between the sequences.\n\ \n\ For the rationale behind the use of the second method, see:\n\ Heeringa, \"Measuring Dialect Pronunciation Differences using Levenshtein\n\ Distance\", 2004, p.
130 sq, which is available online at:\n\ http://www.let.rug.nl/~heeringa/dialectology/thesis/thesis.pdf" #define SEQUENCE_COMPARE(s1, i1, s2, i2) \ (PyObject_RichCompareBool( \ PySequence_Fast_GET_ITEM((s1), (i1)), \ PySequence_Fast_GET_ITEM((s2), (i2)), \ Py_EQ) \ ) #define unicode unicode #define hamming uhamming #include "hamming.c" #undef unicode #undef hamming #define unicode byte #define hamming bhamming #include "hamming.c" #undef unicode #undef hamming #define SEQUENCE_COMP SEQUENCE_COMPARE #define unicode array #define hamming ahamming #include "hamming.c" #undef unicode #undef hamming #undef SEQUENCE_COMP #define unicode unicode #define levenshtein ulevenshtein #define nlevenshtein unlevenshtein #include "levenshtein.c" #undef unicode #undef levenshtein #undef nlevenshtein #define unicode byte #define levenshtein blevenshtein #define nlevenshtein bnlevenshtein #include "levenshtein.c" #undef unicode #undef levenshtein #undef nlevenshtein #define SEQUENCE_COMP SEQUENCE_COMPARE #define unicode array #define levenshtein alevenshtein #define nlevenshtein anlevenshtein #include "levenshtein.c" #undef unicode #undef levenshtein #undef nlevenshtein #undef SEQUENCE_COMP #define unicode unicode #define lcsubstrings ulcsubstrings #include "lcsubstrings.c" #undef unicode #undef lcsubstrings #define unicode byte #define lcsubstrings blcsubstrings #include "lcsubstrings.c" #undef unicode #undef lcsubstrings #define SEQUENCE_COMP SEQUENCE_COMPARE #define unicode array #define lcsubstrings alcsubstrings #include "lcsubstrings.c" #undef unicode #undef lcsubstrings #undef SEQUENCE_COMP #define unicode unicode #define fastcomp ufastcomp #include "fastcomp.c" #undef unicode #undef fastcomp #define unicode byte #define fastcomp bfastcomp #include "fastcomp.c" #undef unicode #undef fastcomp #define SEQUENCE_COMP SEQUENCE_COMPARE #define unicode array #define fastcomp afastcomp #include "fastcomp.c" #undef unicode #undef fastcomp #undef SEQUENCE_COMP distance-master/cdistance/utarray.h0000644000175000017500000003037512243645633016050 0ustar jdgjdg/* Copyright (c) 2008-2013, Troy D. Hanson http://troydhanson.github.com/uthash/ All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /* a dynamic array implementation using macros */ #ifndef UTARRAY_H #define UTARRAY_H #define UTARRAY_VERSION 1.9.8 #ifdef __GNUC__ #define _UNUSED_ __attribute__ ((__unused__)) #else #define _UNUSED_ #endif #include /* size_t */ #include /* memset, etc */ #include /* exit */ #define oom() exit(-1) typedef void (ctor_f)(void *dst, const void *src); typedef void (dtor_f)(void *elt); typedef void (init_f)(void *elt); typedef struct { size_t sz; init_f *init; ctor_f *copy; dtor_f *dtor; } UT_icd; typedef struct { unsigned i,n;/* i: index of next available slot, n: num slots */ UT_icd icd; /* initializer, copy and destructor functions */ char *d; /* n slots of size icd->sz*/ } UT_array; #define utarray_init(a,_icd) do { \ memset(a,0,sizeof(UT_array)); \ (a)->icd=*_icd; \ } while(0) #define utarray_done(a) do { \ if ((a)->n) { \ if ((a)->icd.dtor) { \ size_t _ut_i; \ for(_ut_i=0; _ut_i < (a)->i; _ut_i++) { \ (a)->icd.dtor(utarray_eltptr(a,_ut_i)); \ } \ } \ free((a)->d); \ } \ (a)->n=0; \ } while(0) #define utarray_new(a,_icd) do { \ a=(UT_array*)malloc(sizeof(UT_array)); \ utarray_init(a,_icd); \ } while(0) #define utarray_free(a) do { \ utarray_done(a); \ free(a); \ } while(0) #define utarray_reserve(a,by) do { \ if (((a)->i+by) > ((a)->n)) { \ while(((a)->i+by) > ((a)->n)) { (a)->n = ((a)->n ? (2*(a)->n) : 8); } \ if ( ((a)->d=(char*)realloc((a)->d, (a)->n*(a)->icd.sz)) == NULL) oom(); \ } \ } while(0) #define utarray_push_back(a,p) do { \ utarray_reserve(a,1); \ if ((a)->icd.copy) { (a)->icd.copy( _utarray_eltptr(a,(a)->i++), p); } \ else { memcpy(_utarray_eltptr(a,(a)->i++), p, (a)->icd.sz); }; \ } while(0) #define utarray_pop_back(a) do { \ if ((a)->icd.dtor) { (a)->icd.dtor( _utarray_eltptr(a,--((a)->i))); } \ else { (a)->i--; } \ } while(0) #define utarray_extend_back(a) do { \ utarray_reserve(a,1); \ if ((a)->icd.init) { (a)->icd.init(_utarray_eltptr(a,(a)->i)); } \ else { memset(_utarray_eltptr(a,(a)->i),0,(a)->icd.sz); } \ (a)->i++; \ } while(0) #define utarray_len(a) ((a)->i) #define utarray_eltptr(a,j) (((j) < (a)->i) ? 
_utarray_eltptr(a,j) : NULL) #define _utarray_eltptr(a,j) ((char*)((a)->d + ((a)->icd.sz*(j) ))) #define utarray_insert(a,p,j) do { \ if (j > (a)->i) utarray_resize(a,j); \ utarray_reserve(a,1); \ if ((j) < (a)->i) { \ memmove( _utarray_eltptr(a,(j)+1), _utarray_eltptr(a,j), \ ((a)->i - (j))*((a)->icd.sz)); \ } \ if ((a)->icd.copy) { (a)->icd.copy( _utarray_eltptr(a,j), p); } \ else { memcpy(_utarray_eltptr(a,j), p, (a)->icd.sz); }; \ (a)->i++; \ } while(0) #define utarray_inserta(a,w,j) do { \ if (utarray_len(w) == 0) break; \ if (j > (a)->i) utarray_resize(a,j); \ utarray_reserve(a,utarray_len(w)); \ if ((j) < (a)->i) { \ memmove(_utarray_eltptr(a,(j)+utarray_len(w)), \ _utarray_eltptr(a,j), \ ((a)->i - (j))*((a)->icd.sz)); \ } \ if ((a)->icd.copy) { \ size_t _ut_i; \ for(_ut_i=0;_ut_i<(w)->i;_ut_i++) { \ (a)->icd.copy(_utarray_eltptr(a,j+_ut_i), _utarray_eltptr(w,_ut_i)); \ } \ } else { \ memcpy(_utarray_eltptr(a,j), _utarray_eltptr(w,0), \ utarray_len(w)*((a)->icd.sz)); \ } \ (a)->i += utarray_len(w); \ } while(0) #define utarray_resize(dst,num) do { \ size_t _ut_i; \ if (dst->i > (size_t)(num)) { \ if ((dst)->icd.dtor) { \ for(_ut_i=num; _ut_i < dst->i; _ut_i++) { \ (dst)->icd.dtor(utarray_eltptr(dst,_ut_i)); \ } \ } \ } else if (dst->i < (size_t)(num)) { \ utarray_reserve(dst,num-dst->i); \ if ((dst)->icd.init) { \ for(_ut_i=dst->i; _ut_i < num; _ut_i++) { \ (dst)->icd.init(utarray_eltptr(dst,_ut_i)); \ } \ } else { \ memset(_utarray_eltptr(dst,dst->i),0,(dst)->icd.sz*(num-dst->i)); \ } \ } \ dst->i = num; \ } while(0) #define utarray_concat(dst,src) do { \ utarray_inserta((dst),(src),utarray_len(dst)); \ } while(0) #define utarray_erase(a,pos,len) do { \ if ((a)->icd.dtor) { \ size_t _ut_i; \ for(_ut_i=0; _ut_i < len; _ut_i++) { \ (a)->icd.dtor(utarray_eltptr((a),pos+_ut_i)); \ } \ } \ if ((a)->i > (pos+len)) { \ memmove( _utarray_eltptr((a),pos), _utarray_eltptr((a),pos+len), \ (((a)->i)-(pos+len))*((a)->icd.sz)); \ } \ (a)->i -= (len); \ } while(0) #define utarray_renew(a,u) do { \ if (a) utarray_clear(a); \ else utarray_new((a),(u)); \ } while(0) #define utarray_clear(a) do { \ if ((a)->i > 0) { \ if ((a)->icd.dtor) { \ size_t _ut_i; \ for(_ut_i=0; _ut_i < (a)->i; _ut_i++) { \ (a)->icd.dtor(utarray_eltptr(a,_ut_i)); \ } \ } \ (a)->i = 0; \ } \ } while(0) #define utarray_sort(a,cmp) do { \ qsort((a)->d, (a)->i, (a)->icd.sz, cmp); \ } while(0) #define utarray_find(a,v,cmp) bsearch((v),(a)->d,(a)->i,(a)->icd.sz,cmp) #define utarray_front(a) (((a)->i) ? (_utarray_eltptr(a,0)) : NULL) #define utarray_next(a,e) (((e)==NULL) ? utarray_front(a) : ((((a)->i) > (utarray_eltidx(a,e)+1)) ? _utarray_eltptr(a,utarray_eltidx(a,e)+1) : NULL)) #define utarray_prev(a,e) (((e)==NULL) ? utarray_back(a) : ((utarray_eltidx(a,e) > 0) ? _utarray_eltptr(a,utarray_eltidx(a,e)-1) : NULL)) #define utarray_back(a) (((a)->i) ? (_utarray_eltptr(a,(a)->i-1)) : NULL) #define utarray_eltidx(a,e) (((char*)(e) >= (char*)((a)->d)) ? (((char*)(e) - (char*)((a)->d))/(ssize_t)(a)->icd.sz) : -1) /* last we pre-define a few icd for common utarrays of ints and strings */ static void utarray_str_cpy(void *dst, const void *src) { char **_src = (char**)src, **_dst = (char**)dst; *_dst = (*_src == NULL) ? 
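/* string element copy constructor: deep-copy with strdup unless the source is NULL */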
NULL : strdup(*_src); } static void utarray_str_dtor(void *elt) { char **eltc = (char**)elt; if (*eltc) free(*eltc); } static const UT_icd ut_str_icd _UNUSED_ = {sizeof(char*),NULL,utarray_str_cpy,utarray_str_dtor}; static const UT_icd ut_int_icd _UNUSED_ = {sizeof(int),NULL,NULL,NULL}; static const UT_icd ut_ptr_icd _UNUSED_ = {sizeof(void*),NULL,NULL,NULL}; #endif /* UTARRAY_H */ distance-master/cdistance/distance.h0000644000175000017500000000177012243645633016150 0ustar jdgjdg#ifndef DISTANCE_H #define DISTANCE_H #include "Python.h" #include "utarray.h" // Debugging. This kills the interpreter if an assertion fails. #ifdef DISTANCE_DEBUG #undef NDEBUG #include #endif // Compatibility Python 2 && 3 #if PY_MAJOR_VERSION < 3 #define PyBytes_Check PyString_Check #define PyBytes_AS_STRING PyString_AS_STRING #define PyBytes_GET_SIZE PyString_GET_SIZE #define PyUnicode_GET_LENGTH PyUnicode_GET_SIZE #endif // Aliases for each sequence type typedef Py_UNICODE unicode; typedef char byte; typedef PyObject array; typedef union { unicode *u; byte *b; array *a; } sequence; // Used in distance.c and some other files #define SWAP(type, a, b) \ do { \ type a##_tmp = a; \ a = b; \ b = a##_tmp; \ } while (0) // Used in lcsubstrings.c and distance.c for dynamic array struct pair_t { Py_ssize_t i; Py_ssize_t j; }; UT_icd pair_icd = {sizeof(struct pair_t), NULL, NULL, NULL}; #endif distance-master/cdistance/lcsubstrings.c0000644000175000017500000000242712243645633017073 0ustar jdgjdg#include "distance.h" static UT_array * lcsubstrings(unicode *seq1, unicode *seq2, Py_ssize_t len1, Py_ssize_t len2, Py_ssize_t *max_len) { Py_ssize_t i, j, mlen = -1; Py_ssize_t old, last, *column; UT_array *stack = NULL; struct pair_t pos; #ifdef SEQUENCE_COMP int comp; #endif assert(len1 >= len2); utarray_new(stack, &pair_icd); if (len2 == 0) { *max_len = 0; return stack; } if ((column = (Py_ssize_t *)malloc((len2 + 1) * sizeof(Py_ssize_t))) == NULL) goto On_Error; last = 0; for (j = 0; j < len2; j++) column[j] = j; for (i = 0; i < len1; i++) { for (j = 0; j < len2; j++) { old = column[j]; #ifdef SEQUENCE_COMP comp = SEQUENCE_COMP(seq1, i, seq2, j); if (comp == -1) goto On_Error; if (comp) { #else if (seq1[i] == seq2[j]) { #endif column[j] = ((i == 0 || j == 0) ? 1 : (last + 1)); if (column[j] > mlen) { mlen = column[j]; pos.i = i; pos.j = j; utarray_clear(stack); utarray_push_back(stack, &pos); } else if (column[j] == mlen) { pos.i = i; pos.j = j; utarray_push_back(stack, &pos); } } else column[j] = 0; last = old; } } free(column); *max_len = mlen; return stack; On_Error: free(column); utarray_free(stack); return NULL; } distance-master/cdistance/hamming.c0000644000175000017500000000056212243645633015767 0ustar jdgjdg#include "distance.h" static Py_ssize_t hamming(unicode *seq1, unicode *seq2, Py_ssize_t len) { Py_ssize_t i, dist = 0; #ifdef SEQUENCE_COMP int comp; #endif for (i = 0; i < len; i++) { #ifdef SEQUENCE_COMP comp = SEQUENCE_COMP(seq1, i, seq2, i); if (comp == -1) return -1; if (!comp) #else if (seq1[i] != seq2[i]) #endif dist++; } return dist; } distance-master/cdistance/levenshtein.c0000644000175000017500000000667412243645633016705 0ustar jdgjdg#include "distance.h" #define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c))) #define MAX3(a, b, c) ((a) > (b) ? ((a) > (c) ? (a) : (c)) : ((b) > (c) ? 
(b) : (c))) #ifndef LEVENSHTEIN_C #define LEVENSHTEIN_C /* return the smallest value in `column`; used for the early-exit check when max_dist is set */ static Py_ssize_t minimum(const Py_ssize_t *column, Py_ssize_t len) { Py_ssize_t min; assert(len > 0); min = column[--len]; while (len-- > 0) { if (column[len] < min) min = column[len]; } return min; } #endif static Py_ssize_t levenshtein(unicode *seq1, unicode *seq2, Py_ssize_t len1, Py_ssize_t len2, Py_ssize_t max_dist) { Py_ssize_t i, j; Py_ssize_t last, old; Py_ssize_t cost, dist = -2; Py_ssize_t *column; #ifdef SEQUENCE_COMP int comp; #endif if (len1 < len2) { SWAP(unicode *, seq1, seq2); SWAP(Py_ssize_t, len1, len2); } if (max_dist >= 0 && (len1 - len2) > max_dist) return -1; else { if (len1 == 0) return len2; if (len2 == 0) return len1; } if ((column = (Py_ssize_t *) malloc((len2 + 1) * sizeof(Py_ssize_t))) == NULL) return -2; for (j = 1 ; j <= len2; j++) column[j] = j; for (i = 1 ; i <= len1; i++) { column[0] = i; for (j = 1, last = i - 1; j <= len2; j++) { old = column[j]; #ifdef SEQUENCE_COMP comp = SEQUENCE_COMP(seq1, i - 1, seq2, j - 1); if (comp == -1) { free(column); return -3; } cost = (!comp); #else cost = (seq1[i - 1] != seq2[j - 1]); #endif column[j] = MIN3( column[j] + 1, column[j - 1] + 1, last + cost ); last = old; } if (max_dist >= 0 && minimum(column, len2 + 1) > max_dist) { free(column); return -1; } } dist = column[len2]; free(column); if (max_dist >= 0 && dist > max_dist) return -1; return dist; } static double nlevenshtein(unicode *seq1, unicode *seq2, Py_ssize_t len1, Py_ssize_t len2, short method) { Py_ssize_t i, j; // distance Py_ssize_t ic, dc, rc; Py_ssize_t last, old; Py_ssize_t *column; Py_ssize_t fdist; // length Py_ssize_t lic, ldc, lrc; Py_ssize_t llast, lold; Py_ssize_t *length; Py_ssize_t flen; #ifdef SEQUENCE_COMP int comp; #endif assert(len1 >= len2); if (len1 == 0) // len2 is 0 too, so the two sequences are identical return 0.0; if (len2 == 0) // completely different return 1.0; if (method == 1) { fdist = levenshtein(seq1, seq2, len1, len2, -1); if (fdist == -2) /* memory allocation failed */ return -1; if (fdist < 0) /* comparison failed */ return -2; return fdist / (double)len1; } if ((column = (Py_ssize_t *)malloc((len2 + 1) * sizeof(Py_ssize_t))) == NULL) return -1; if ((length = (Py_ssize_t *)malloc((len2 + 1) * sizeof(Py_ssize_t))) == NULL) { free(column); return -1; } for (j = 1 ; j <= len2; j++) column[j] = length[j] = j; for (i = 1 ; i <= len1; i++) { column[0] = length[0] = i; for (j = 1, last = llast = i - 1; j <= len2; j++) { // distance old = column[j]; ic = column[j - 1] + 1; dc = column[j] + 1; #ifdef SEQUENCE_COMP comp = SEQUENCE_COMP(seq1, i - 1, seq2, j - 1); if (comp == -1) { free(column); free(length); return -2; } rc = last + (!comp); #else rc = last + (seq1[i - 1] != seq2[j - 1]); #endif column[j] = MIN3(ic, dc, rc); last = old; // length lold = length[j]; lic = (ic == column[j] ? length[j - 1] + 1 : 0); ldc = (dc == column[j] ? length[j] + 1 : 0); lrc = (rc == column[j] ? llast + 1 : 0); length[j] = MAX3(lic, ldc, lrc); llast = lold; } } fdist = column[len2]; flen = length[len2]; free(column); free(length); return fdist / (double)flen; }
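/* Illustrative example of the two normalizations computed above (a sketch following from the definitions, not a quote from the test suite): for seq1 = "ab" and seq2 = "ba" the edit distance is 2 -- either two substitutions, or one deletion plus one insertion. Method 1 divides by the length of the longest sequence: 2 / 2 = 1.0. Method 2 divides by the length of the longest alignment realizing that distance: the deletion/insertion path aligns "ab-" with "-ba", of length 3, so the result is 2 / 3. */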