==> distance-master/MANIFEST <==
# file GENERATED by distutils, do NOT edit
setup.py
distance/__init__.py
distance/distance.py

==> distance-master/README.md <==
distance - Utilities for comparing sequences
============================================
This package provides helpers for computing similarities between arbitrary sequences. Included metrics are Levenshtein, Hamming, Jaccard, and Sorensen distance, plus some bonuses. All distance computations are implemented in pure Python, and most of them are also implemented in C.
Installation
------------
If you don't want or need to use the C extension, just unpack the archive and run, as root:
# python setup.py install
For the C extension to work, you need the Python development headers and a C compiler (typically Microsoft Visual C++ 2010 on Windows, and GCC on Mac and Linux). On a Debian-like system, you can get both with:
# apt-get install gcc pythonX.X-dev
where X.X is your Python version.
Then you should type:
# python setup.py install --with-c
Note the use of the `--with-c` switch.
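If the build succeeds, the compiled module is importable as `distance.cdistance` (the extension name declared in setup.py). A quick, optional way to check that it is actually available is:
# python -c "import distance.cdistance"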
Usage
-----
A common use case for this module is to compare single words for similarity:
>>> import distance
>>> distance.levenshtein("lenvestein", "levenshtein")
3
>>> distance.hamming("hamming", "hamning")
1
If there is not a one-to-one mapping between sounds and glyphs in your language, or if you want to compare syllables or phonemes rather than glyphs, you can pass in tuples of strings:
>>> t1 = ("de", "ci", "si", "ve")
>>> t2 = ("de", "ri", "si", "ve")
>>> distance.levenshtein(t1, t2)
1
Comparing lists of strings can also be useful for computing similarities between sentences, paragraphs, etc.:
>>> sent1 = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
>>> sent2 = ['the', 'lazy', 'fox', 'jumps', 'over', 'the', 'crazy', 'dog']
>>> distance.levenshtein(sent1, sent2)
3
Hamming and Levenshtein distance can be normalized, so that the results of several distance measures can be meaningfully compared. Two strategies are available for Levenshtein: either the length of the shortest alignment between the sequences is used as the normalization factor, or the length of the longest one. Example uses:
>>> distance.hamming("fat", "cat", normalized=True)
0.3333333333333333
>>> distance.nlevenshtein("abc", "acd", method=1) # shortest alignment
0.6666666666666666
>>> distance.nlevenshtein("abc", "acd", method=2) # longest alignment
0.5
`jaccard` and `sorensen` return a normalized value by default:
>>> distance.sorensen("decide", "resize")
0.5555555555555556
>>> distance.jaccard("decide", "resize")
0.7142857142857143
As for the bonuses, there is a `fast_comp` function, which computes the distance between two strings up to and including a value of 2. If the distance between the strings is higher than that, -1 is returned. This function is of limited use, but on the other hand it is considerably faster than `levenshtein`. There is also a `lcsubstrings` function which can be used to find the longest common substrings in two sequences.
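For example, as documented in its docstring:
>>> distance.lcsubstrings("sedentar", "dentist")
{'dent'}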
Finally, two convenience iterators `ilevenshtein` and `ifast_comp` are provided. They are intended for filtering a long list of sequences down to the ones that are close to a reference sequence, and they both yield a series of tuples (distance, sequence). Example:
>>> tokens = ["fo", "bar", "foob", "foo", "fooba", "foobar"]
>>> sorted(distance.ifast_comp("foo", tokens))
[(0, 'foo'), (1, 'fo'), (1, 'foob'), (2, 'fooba')]
>>> sorted(distance.ilevenshtein("foo", tokens, max_dist=1))
[(0, 'foo'), (1, 'fo'), (1, 'foob')]
`ifast_comp` is particularly efficient, and can handle 1 million tokens without a problem.
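A minimal sketch of that use case, assuming a hypothetical file `words.txt` with one token per line:
>>> tokens = (line.strip() for line in open("words.txt"))
>>> matches = [seq for dist, seq in distance.ifast_comp("foo", tokens) if dist <= 1]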
For more information, see the function docstrings (`help(funcname)`).
Have fun!
Changelog
---------
20/11/13:
* Switched back to using the to-be-deprecated Python unicode API. The good news is that this makes
the C extension compatible with Python 2.7+, and that distance computations on unicode strings are
now much faster.
* Added a C version of `lcsubstrings`.
* Added a new method for computing normalized Levenshtein distance.
* Added some tests.
12/11/13:
Expanded `fast_comp` (formerly `quick_levenshtein`) so that it can handle transpositions.
Fixed variable mix-ups in the C `levenshtein` which sometimes produced strange results.
10/11/13:
Added `quick_levenshtein` and `iquick_levenshtein`.
05/11/13:
Added Sorensen and Jaccard metrics, fixed memory issue in Levenshtein.

==> distance-master/setup.py <==
# -*- coding: utf-8 -*-
# Distance - Utilities for comparing sequences
# Copyright (C) 2013 Michaël Meyer
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os, sys, ast, _ast, re
from distutils.core import setup, Extension
this_dir = os.path.dirname(os.path.abspath(__file__))
pkg_dir = os.path.join(this_dir, "distance")
cpkg_dir = os.path.join(this_dir, "cdistance")
ctypes = ["unicode", "byte", "array"]
cfunctions = {
    "levenshtein": ["levenshtein", "nlevenshtein"],
    "hamming": ["hamming"],
    "lcsubstrings": ["lcsubstrings"],
    "fastcomp": ["fastcomp"],
}

sequence_compare = """\
#define SEQUENCE_COMPARE(s1, i1, s2, i2) \\
    (PyObject_RichCompareBool( \\
        PySequence_Fast_GET_ITEM((s1), (i1)), \\
        PySequence_Fast_GET_ITEM((s2), (i2)), \\
        Py_EQ) \\
    )
"""
def make_c_doc():
    buff = []
    py_sources = [f for f in os.listdir(pkg_dir) if f.endswith('.py')]
    for file in py_sources:
        with open(os.path.join(pkg_dir, file)) as f:
            content = f.read()
        tree = ast.parse(content)
        for doc_string in parse_tree(tree, content):
            buff.append(doc_string)
    join_str = 2 * '\n'
    return join_str.join(buff) + '\n'

def parse_tree(tree, content):
    for node in ast.iter_child_nodes(tree):
        if not isinstance(node, _ast.FunctionDef):
            continue
        doc_string = ast.get_docstring(node)
        if not doc_string:
            continue
        # recover the exact signature from the source text
        func_def = re.findall(r"def\s%s\s*(.+?)\s*:" % node.name, content)
        assert func_def and len(func_def) == 1
        func_def = node.name + func_def[0] + 2 * '\\n\\\n'
        doc_string = doc_string.replace('\n', '\\n\\\n').replace('"', '\\"')
        doc_string = doc_string.replace('\n' + 8 * ' ', '\n' + 4 * ' ')
        doc_string = '#define %s_doc \\\n"%s%s"\n' % (node.name, func_def, doc_string)
        yield doc_string
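# For illustration (hypothetical input): a Python definition such as
#     def hamming(seq1, seq2, normalized=False):
#         """Compute the Hamming distance ..."""
# is rendered by parse_tree() as a C doc-string macro of the form
#     #define hamming_doc \
#     "hamming(seq1, seq2, normalized=False)\n\
#     \n\
#     Compute the Hamming distance ..."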
def format_header():
    yield sequence_compare
    for cfile, cfuncs in cfunctions.items():
        for ctype in ctypes:
            if ctype == "array":
                yield("#define SEQUENCE_COMP SEQUENCE_COMPARE")
            yield('#define unicode %(type)s' % dict(type=ctype))
            for cfunc in cfuncs:
                yield("#define %(function)s %(tcode)s%(function)s" % dict(function=cfunc, tcode=ctype[0]))
            yield('#include "%(file)s.c"' % dict(file=cfile))
            yield("#undef unicode")
            for cfunc in cfuncs:
                yield("#undef %(function)s" % dict(function=cfunc))
            if ctype == "array":
                yield("#undef SEQUENCE_COMP")
            yield("")
def prepare():
    with open(os.path.join(cpkg_dir, "includes.h"), "w") as f:
        f.write(make_c_doc())
        f.write(4 * '\n')
        f.write('\n'.join(format_header()))

args = sys.argv[1:]

if "prepare" in args:
    prepare()
    sys.exit()

if "--with-c" in args:
    args.remove("--with-c")
    ext_modules = [Extension('distance.cdistance', sources=["cdistance/distance.c"])]
else:
    sys.stderr.write("notice: no C support available\n")
    ext_modules = []

with open(os.path.join(this_dir, "README.md")) as f:
    long_description = f.read()
setup(
    name = 'Distance',
    version = '0.1.3',
    description = 'Utilities for comparing sequences',
    long_description = long_description,
    author = 'Michaël Meyer',
    author_email = 'michaelnm.meyer@gmail.com',
    url = 'https://github.com/doukremt/distance',
    ext_modules = ext_modules,
    script_args = args,
    packages = ['distance'],
    classifiers = (
        'Intended Audience :: Developers',
        'Natural Language :: English',
        'License :: OSI Approved :: GNU General Public License (GPL)',
        'Operating System :: OS Independent',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'Programming Language :: C',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3.3',
    )
)

==> distance-master/distance/_fastcomp.py <==
# -*- coding: utf-8 -*-
def fast_comp(seq1, seq2, transpositions=False):
    """Compute the distance between the two sequences `seq1` and `seq2` up to a
    maximum of 2 included, and return it. If the edit distance between the two
    sequences is higher than that, -1 is returned.

    If `transpositions` is `True`, transpositions will be taken into account for
    the computation of the distance. This can make a difference, e.g.:

    >>> fast_comp("abc", "bac", transpositions=False)
    2
    >>> fast_comp("abc", "bac", transpositions=True)
    1

    This is faster than `levenshtein` by an order of magnitude, but on the
    other hand is of limited use.

    The algorithm comes from `http://writingarchives.sakura.ne.jp/fastcomp`.
    I've added transpositions support to the original code.
    """
    replace, insert, delete = "r", "i", "d"

    L1, L2 = len(seq1), len(seq2)
    if L1 < L2:
        L1, L2 = L2, L1
        seq1, seq2 = seq2, seq1

    ldiff = L1 - L2
    # each model is a sequence of at most two edit operations which could
    # turn seq2 into seq1, given the length difference between the sequences
    if ldiff == 0:
        models = (insert+delete, delete+insert, replace+replace)
    elif ldiff == 1:
        models = (delete+replace, replace+delete)
    elif ldiff == 2:
        models = (delete+delete,)
    else:
        return -1

    res = 3
    for model in models:
        i = j = c = 0
        while (i < L1) and (j < L2):
            if seq1[i] != seq2[j]:
                c = c+1
                if 2 < c:
                    break
                if transpositions and ldiff != 2 \
                        and i < L1 - 1 and j < L2 - 1 \
                        and seq1[i+1] == seq2[j] and seq1[i] == seq2[j+1]:
                    i, j = i+2, j+2
                else:
                    cmd = model[c-1]
                    if cmd == delete:
                        i = i+1
                    elif cmd == insert:
                        j = j+1
                    else:
                        assert cmd == replace
                        i, j = i+1, j+1
            else:
                i, j = i+1, j+1

        if 2 < c:
            continue
        elif i < L1:
            # trailing items of seq1 must be consumed by the remaining deletions
            if L1-i <= model[c:].count(delete):
                c = c + (L1-i)
            else:
                continue
        elif j < L2:
            if L2-j <= model[c:].count(insert):
                c = c + (L2-j)
            else:
                continue

        if c < res:
            res = c

    if res == 3:
        res = -1
    return res

==> distance-master/distance/_lcsubstrings.py <==
# -*- coding: utf-8 -*-
from array import array
def lcsubstrings(seq1, seq2, positions=False):
    """Find the longest common substring(s) in the sequences `seq1` and `seq2`.

    If positions evaluates to `True` only their positions will be returned,
    together with their length, in a tuple:

        (length, ((start pos in seq1, start pos in seq2)..))

    Otherwise, the substrings themselves will be returned, in a set.

    Example:

    >>> lcsubstrings("sedentar", "dentist")
    {'dent'}
    >>> lcsubstrings("sedentar", "dentist", positions=True)
    (4, ((2, 0),))
    """
    L1, L2 = len(seq1), len(seq2)
    ms = []
    mlen = last = 0
    if L1 < L2:
        seq1, seq2 = seq2, seq1
        L1, L2 = L2, L1

    column = array('L', range(L2))

    for i in range(L1):
        for j in range(L2):
            old = column[j]
            if seq1[i] == seq2[j]:
                if i == 0 or j == 0:
                    column[j] = 1
                else:
                    column[j] = last + 1
                if column[j] > mlen:
                    mlen = column[j]
                    ms = [(i, j)]
                elif column[j] == mlen:
                    ms.append((i, j))
            else:
                column[j] = 0
            last = old

    if positions:
        return (mlen, tuple((i - mlen + 1, j - mlen + 1) for i, j in ms if ms))
    return set(seq1[i - mlen + 1:i + 1] for i, _ in ms if ms)

==> distance-master/distance/_pyimports.py <==
from ._fastcomp import *
from ._lcsubstrings import *
from ._levenshtein import *
from ._simpledists import *
from ._iterators import *

==> distance-master/distance/_levenshtein.py <==
# -*- coding: utf-8 -*-
from array import array
def levenshtein(seq1, seq2, normalized=False, max_dist=-1):
    """Compute the absolute Levenshtein distance between the two sequences
    `seq1` and `seq2`.

    The Levenshtein distance is the minimum number of edit operations necessary
    for transforming one sequence into the other. The edit operations allowed are:

        * deletion:     ABC -> BC, AC, AB
        * insertion:    ABC -> ABCD, EABC, AEBC..
        * substitution: ABC -> ABE, ADC, FBC..

    The `max_dist` parameter controls at which moment we should stop computing the
    distance between the provided sequences. If it is a negative integer, the
    distance will be computed until the sequences are exhausted; otherwise, the
    computation will stop at the moment the calculated distance is higher than
    `max_dist`, and then return -1. For example:

    >>> levenshtein("abc", "abcd", max_dist=1)  # dist = 1
    1
    >>> levenshtein("abc", "abcde", max_dist=1) # dist = 2
    -1

    This can be a time saver if you're not interested in the exact distance, but
    only need to check if the distance between the given sequences is below a
    given threshold.

    The `normalized` parameter is here for backward compatibility; providing
    it will result in a call to `nlevenshtein`, which should be used directly
    instead.
    """
    if normalized:
        return nlevenshtein(seq1, seq2, method=1)

    if seq1 == seq2:
        return 0
    len1, len2 = len(seq1), len(seq2)
    if max_dist >= 0 and abs(len1 - len2) > max_dist:
        return -1
    if len1 == 0:
        return len2
    if len2 == 0:
        return len1
    if len1 < len2:
        len1, len2 = len2, len1
        seq1, seq2 = seq2, seq1

    column = array('L', range(len2 + 1))

    for x in range(1, len1 + 1):
        column[0] = x
        last = x - 1
        for y in range(1, len2 + 1):
            old = column[y]
            cost = int(seq1[x - 1] != seq2[y - 1])
            column[y] = min(column[y] + 1, column[y - 1] + 1, last + cost)
            last = old
        if max_dist >= 0 and min(column) > max_dist:
            return -1

    if max_dist >= 0 and column[len2] > max_dist:
        # stay consistent, even if we have the exact distance
        return -1
    return column[len2]

def nlevenshtein(seq1, seq2, method=1):
    """Compute the normalized Levenshtein distance between `seq1` and `seq2`.

    Two normalization methods are provided. For both of them, the normalized
    distance will be a float between 0 and 1, where 0 means equal and 1
    completely different. The computation obeys the following patterns:

        0.0                     if seq1 == seq2
        1.0                     if len(seq1) == 0 or len(seq2) == 0
        edit distance / factor  otherwise

    The `method` parameter specifies which normalization factor should be used.
    It can have the value 1 or 2, which correspond to the following:

        1: the length of the shortest alignment between the sequences
           (that is, the length of the longest sequence)
        2: the length of the longest alignment between the sequences

    Which normalization factor should be chosen is a matter of taste. The first
    one is cheap to compute. The second one is more costly, but it accounts
    better than the first one for parallelisms of symbols between the sequences.

    For the rationale behind the use of the second method, see:
    Heeringa, "Measuring Dialect Pronunciation Differences using Levenshtein
    Distance", 2004, p. 130 sq, which is available online at:
    http://www.let.rug.nl/~heeringa/dialectology/thesis/thesis.pdf
    """
    if seq1 == seq2:
        return 0.0
    len1, len2 = len(seq1), len(seq2)
    if len1 == 0 or len2 == 0:
        return 1.0
    if len1 < len2:  # minimize the arrays size
        len1, len2 = len2, len1
        seq1, seq2 = seq2, seq1

    if method == 1:
        return levenshtein(seq1, seq2) / float(len1)
    if method != 2:
        raise ValueError("expected either 1 or 2 for `method` parameter")

    column = array('L', range(len2 + 1))
    length = array('L', range(len2 + 1))

    for x in range(1, len1 + 1):
        column[0] = length[0] = x
        last = llast = x - 1
        for y in range(1, len2 + 1):
            # dist
            old = column[y]
            ic = column[y - 1] + 1
            dc = column[y] + 1
            rc = last + (seq1[x - 1] != seq2[y - 1])
            column[y] = min(ic, dc, rc)
            last = old
            # length
            lold = length[y]
            lic = length[y - 1] + 1 if ic == column[y] else 0
            ldc = length[y] + 1 if dc == column[y] else 0
            lrc = llast + 1 if rc == column[y] else 0
            length[y] = max(ldc, lic, lrc)
            llast = lold

    return column[y] / float(length[y])

==> distance-master/distance/__init__.py <==
"Utilities for comparing sequences"
__all__ = ["hamming", "levenshtein", "nlevenshtein", "jaccard", "sorensen",
           "fast_comp", "lcsubstrings", "ilevenshtein", "ifast_comp"]

try:
    from .cdistance import *
except ImportError:
    from ._pyimports import *

from ._pyimports import jaccard, sorensen

def quick_levenshtein(str1, str2):
    return fast_comp(str1, str2, transpositions=False)

def iquick_levenshtein(str1, strs):
    return ifast_comp(str1, strs, transpositions=False)
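# NB: `quick_levenshtein` and `iquick_levenshtein` are kept for backward
# compatibility; per the changelog, `fast_comp` is the former `quick_levenshtein`.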

==> distance-master/distance/_simpledists.py <==
# -*- coding: utf-8 -*-
def hamming(seq1, seq2, normalized=False):
    """Compute the Hamming distance between the two sequences `seq1` and `seq2`.
    The Hamming distance is the number of differing items in two ordered
    sequences of the same length. If the sequences submitted do not have the
    same length, an error will be raised.

    If `normalized` evaluates to `False`, the return value will be an integer
    between 0 and the length of the sequences provided, edge values included;
    otherwise, it will be a float between 0 and 1 included, where 0 means
    equal, and 1 totally different. Normalized hamming distance is computed as:

        0.0                         if len(seq1) == 0
        hamming_dist / len(seq1)    otherwise
    """
    L = len(seq1)
    if L != len(seq2):
        raise ValueError("expected two strings of the same length")
    if L == 0:
        return 0.0 if normalized else 0  # equal
    dist = sum(c1 != c2 for c1, c2 in zip(seq1, seq2))
    if normalized:
        return dist / float(L)
    return dist

def jaccard(seq1, seq2):
    """Compute the Jaccard distance between the two sequences `seq1` and `seq2`.
    They should contain hashable items.

    The return value is a float between 0 and 1, where 0 means equal, and 1 totally different.
    """
    set1, set2 = set(seq1), set(seq2)
    return 1 - len(set1 & set2) / float(len(set1 | set2))

def sorensen(seq1, seq2):
    """Compute the Sorensen distance between the two sequences `seq1` and `seq2`.
    They should contain hashable items.

    The return value is a float between 0 and 1, where 0 means equal, and 1 totally different.
    """
    set1, set2 = set(seq1), set(seq2)
    return 1 - (2 * len(set1 & set2) / float(len(set1) + len(set2)))
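# A worked example (added for illustration; the results match the README):
#   set("decide") == {'d', 'e', 'c', 'i'}       -> 4 items
#   set("resize") == {'r', 'e', 's', 'i', 'z'}  -> 5 items
#   the intersection {'e', 'i'} has 2 items, the union has 7, so
#   jaccard("decide", "resize")  == 1 - 2/7       == 0.7142857142857143
#   sorensen("decide", "resize") == 1 - 2*2/(4+5) == 0.5555555555555556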

==> distance-master/distance/_iterators.py <==
from ._pyimports import levenshtein, fast_comp
def ilevenshtein(seq1, seqs, max_dist=-1):
    """Compute the Levenshtein distance between the sequence `seq1` and the series
    of sequences `seqs`.

    `seq1`: the reference sequence
    `seqs`: a series of sequences (can be a generator)
    `max_dist`: if provided and > 0, only the sequences whose distance from
    the reference sequence is lower than or equal to this value will be returned.

    The return value is a series of pairs (distance, sequence).

    In the C implementation, the sequence objects in `seqs` are expected to be
    of the same kind as the reference sequence; the same holds true for
    `ifast_comp`.
    """
    for seq2 in seqs:
        dist = levenshtein(seq1, seq2, max_dist=max_dist)
        if dist != -1:
            yield dist, seq2

def ifast_comp(seq1, seqs, transpositions=False):
    """Return an iterator over all the sequences in `seqs` whose distance from
    `seq1` is lower than or equal to 2. The sequences whose distance from the
    reference sequence is higher than that are dropped.

    `seq1`: the reference sequence.
    `seqs`: a series of sequences (can be a generator)
    `transpositions` has the same sense as in `fast_comp`.

    The return value is a series of pairs (distance, sequence).

    You might want to call `sorted()` on the iterator to get the results in a
    meaningful order:

    >>> g = ifast_comp("foo", ["fo", "bar", "foob", "foo", "foobaz"])
    >>> sorted(g)
    [(0, 'foo'), (1, 'fo'), (1, 'foob')]
    """
    for seq2 in seqs:
        dist = fast_comp(seq1, seq2, transpositions)
        if dist != -1:
            yield dist, seq2

==> distance-master/tests/tests.py <==
import os, sys
from array import array

try:
    from distance import cdistance
except ImportError:
    cdistance = None
from distance import _pyimports as pydistance

if sys.version_info.major < 3:
    t_unicode = unicode
    t_bytes = lambda s: s
else:
    t_unicode = lambda s: s
    t_bytes = lambda s: s.encode()

all_types = [
    ("unicode", t_unicode),
    ("bytes", t_bytes),
    ("list", list),
    ("tuple", tuple),
]

def hamming(func, t, **kwargs):
    # types; only for c
    if kwargs["lang"] == "C":
        try:
            func(1, t("foo"))
        except ValueError:
            pass
        try:
            func(t("foo"), 1)
        except ValueError:
            pass
    # empty string
    assert func(t(""), t("")) == 0
    # common
    assert func(t("abc"), t("abc")) == 0
    assert func(t("abc"), t("abd")) == 1
    # wrong length
    try:
        func(t("foo"), t("foobar"))
    except ValueError:
        pass
    try:
        func(t(""), t("foo"))
    except ValueError:
        pass
    # normalization
    assert func(t(""), t(""), normalized=True) == 0.0
    assert func(t("abc"), t("abc"), normalized=True) == 0.0
    assert func(t("ab"), t("ac"), normalized=True) == 0.5
    assert func(t("abc"), t("def"), normalized=True) == 1.0

def fast_comp(func, t, **kwargs):
    # types; only for c
    if kwargs["lang"] == "C":
        try:
            func(1, t("foo"))
        except ValueError:
            pass
        try:
            func(t("foo"), 1)
        except ValueError:
            pass
    # empty strings
    assert func(t(""), t("")) == 0
    assert func(t(""), t("a")) == func(t("a"), t("")) == 1
    # edit ops
    assert func(t("aa"), t("aa")) == 0
    assert func(t("ab"), t("aa")) == 1
    assert func(t("ab"), t("a")) == 1
    assert func(t("ab"), t("abc")) == 1
    # dist limit
    assert func(t("a"), t("bcd")) == func(t("bcd"), t("a")) == -1
    # transpositions
    assert func(t("abc"), t("bac"), transpositions=True) == \
        func(t("bac"), t("abc"), transpositions=True) == 1

def levenshtein(func, t, **kwargs):
    # types; only for c
    if kwargs["lang"] == "C":
        try:
            func(1, t("foo"))
        except ValueError:
            pass
        try:
            func(t("foo"), 1)
        except ValueError:
            pass
    # empty strings
    assert func(t(""), t("")) == 0
    assert func(t(""), t("abcd")) == func(t("abcd"), t("")) == 4
    # edit ops
    assert func(t("aa"), t("aa")) == 0
    assert func(t("ab"), t("aa")) == 1
    assert func(t("ab"), t("a")) == 1
    assert func(t("ab"), t("abc")) == 1
    # dist limit
    assert func(t("a"), t("b"), max_dist=0) == -1
    assert func(t("a"), t("b"), max_dist=1) == 1
    assert func(t("foo"), t("bar"), max_dist=-1) == 3

def nlevenshtein(func, t, **kwargs):
    # types; only for c
    if kwargs["lang"] == "C":
        try:
            func(1, t("foo"))
        except ValueError:
            pass
        try:
            func(t("foo"), 1)
        except ValueError:
            pass
    # empty strings
    assert func(t(""), t(""), 1) == func(t(""), t(""), 2) == 0.0
    assert func(t(""), t("foo"), 1) == func(t("foo"), t(""), 1) == \
        func(t(""), t("foo"), 2) == func(t("foo"), t(""), 2) == 1.0
    assert func(t("aa"), t("aa"), 1) == func(t("aa"), t("aa"), 2) == 0.0
    assert func(t("ab"), t("aa"), 1) == func(t("ab"), t("aa"), 2) == 0.5
    assert func(t("ab"), t("a"), 1) == func(t("ab"), t("a"), 2) == 0.5
    assert func(t("ab"), t("abc"), 1) == func(t("ab"), t("abc"), 2) == 0.3333333333333333
    # multiple alignments
    assert func(t("abc"), t("adb"), 1) == 0.6666666666666666
    assert func(t("abc"), t("adb"), 2) == 0.5

def lcsubstrings(func, t, **kwargs):
    # types; only for c
    if kwargs["lang"] == "C":
        try:
            func(1, t("foo"))
        except ValueError:
            pass
        try:
            func(t("foo"), 1)
        except ValueError:
            pass
    # empty strings
    try:
        assert func(t(""), t(""), False) == set()
    except TypeError:
        if t is not list: raise
    assert func(t(""), t(""), True) == (0, ())
    try:
        assert func(t(""), t("foo"), False) == func(t("foo"), t(""), False) == set()
    except TypeError:
        if t is not list: raise
    assert func(t(""), t("foo"), True) == func(t("foo"), t(""), True) == (0, ())
    # common
    try:
        assert func(t("abcd"), t("cdba"), False) == {t('cd')}
    except TypeError:
        if t is not list: raise
    assert func(t("abcd"), t("cdba"), True) == (2, ((2, 0),))
    # reverse
    try:
        assert func(t("abcdef"), t("cdba"), False) == func(t("cdba"), t("abcdef"), False)
    except TypeError:
        if t is not list: raise
    assert func(t("abcdef"), t("cdba"), True) == func(t("cdba"), t("abcdef"), True)

def itors_common(func, t, **kwargs):
    if kwargs["lang"] == "C":
        # types check; only needed for the C impl, to avoid eventual segfaults
        try: func(1, t("foo"))
        except ValueError: pass
        itor = func(t("foo"), [t("foo"), 3333])
        next(itor)
        try: next(itor)
        except ValueError: pass
    # values drop
    itor = func(t("aa"), [t("aa"), t("abcd"), t("ba")])
    assert next(itor) == (0, t("aa"))
    assert next(itor) == (1, t("ba"))

def ilevenshtein(func, t, **kwargs):
    itors_common(lambda a, b: func(a, b, max_dist=2), t, **kwargs)

def ifast_comp(func, t, **kwargs):
    itors_common(func, t, **kwargs)
    # transpositions
    g = func(t("abc"), [t("bac")], transpositions=False)
    assert next(g) == (2, t('bac'))
    g = func(t("abc"), [t("bac")], transpositions=True)
    assert next(g) == (1, t("bac"))

write = lambda s: sys.stderr.write(s + '\n')

tests = ["hamming", "fast_comp", "levenshtein", "lcsubstrings", "nlevenshtein", "ilevenshtein", "ifast_comp"]

def run_test(name):
    if cdistance:
        cfunc = getattr(cdistance, name)
        run_lang_test(name, cfunc, "C")
        write("")
    pyfunc = getattr(pydistance, name)
    run_lang_test(name, pyfunc, "py")
    if cdistance is None:
        write("skipped C tests")
    write("")

def run_lang_test(name, func, lang):
    print("%s (%s)..." % (name, lang))
    for tname, typ in all_types:
        write("type: %s" % tname)
        globals()[name](func, typ, lang=lang)

if __name__ == "__main__":
    args = sys.argv[1:]
    if not args:
        for test in tests:
            run_test(test)
        sys.exit()
    for name in args:
        if name in tests:
            run_test(name)
        else:
            write("no such test: %s" % name)
            sys.exit(1)
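# Usage (inferred from the __main__ block above): run the whole suite with
#     python tests/tests.py
# or run selected tests by name, e.g.
#     python tests/tests.py hamming levenshtein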

==> distance-master/.gitignore <==
*.py[cod]
# C extensions
*.so
# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64
__pycache__
# Installer logs
pip-log.txt
# Unit test / coverage reports
.coverage
.tox
nosetests.xml
# Translations
*.mo
# Mr Developer
.mr.developer.cfg
.project
.pydevproject

==> distance-master/LICENSE <==
distance license
================
Copyright (C) 2013 Michaël Meyer
GNU GENERAL PUBLIC LICENSE
Version 2, June 1991
Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
License is intended to guarantee your freedom to share and change free
software--to make sure the software is free for all its users. This
General Public License applies to most of the Free Software
Foundation's software and to any other program whose authors commit to
using it. (Some other Free Software Foundation software is covered by
the GNU Lesser General Public License instead.) You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
this service if you wish), that you receive source code or can get it
if you want it, that you can change the software or use pieces of it
in new free programs; and that you know you can do these things.
To protect your rights, we need to make restrictions that forbid
anyone to deny you these rights or to ask you to surrender the rights.
These restrictions translate to certain responsibilities for you if you
distribute copies of the software, or if you modify it.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must give the recipients all the rights that
you have. You must make sure that they, too, receive or can get the
source code. And you must show them these terms so they know their
rights.
We protect your rights with two steps: (1) copyright the software, and
(2) offer you this license which gives you legal permission to copy,
distribute and/or modify the software.
Also, for each author's protection and ours, we want to make certain
that everyone understands that there is no warranty for this free
software. If the software is modified by someone else and passed on, we
want its recipients to know that what they have is not the original, so
that any problems introduced by others will not reflect on the original
authors' reputations.
Finally, any free program is threatened constantly by software
patents. We wish to avoid the danger that redistributors of a free
program will individually obtain patent licenses, in effect making the
program proprietary. To prevent this, we have made it clear that any
patent must be licensed for everyone's free use or not licensed at all.
The precise terms and conditions for copying, distribution and
modification follow.
GNU GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License applies to any program or other work which contains
a notice placed by the copyright holder saying it may be distributed
under the terms of this General Public License. The "Program", below,
refers to any such program or work, and a "work based on the Program"
means either the Program or any derivative work under copyright law:
that is to say, a work containing the Program or a portion of it,
either verbatim or with modifications and/or translated into another
language. (Hereinafter, translation is included without limitation in
the term "modification".) Each licensee is addressed as "you".
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running the Program is not restricted, and the output from the Program
is covered only if its contents constitute a work based on the
Program (independent of having been made by running the Program).
Whether that is true depends on what the Program does.
1. You may copy and distribute verbatim copies of the Program's
source code as you receive it, in any medium, provided that you
conspicuously and appropriately publish on each copy an appropriate
copyright notice and disclaimer of warranty; keep intact all the
notices that refer to this License and to the absence of any warranty;
and give any other recipients of the Program a copy of this License
along with the Program.
You may charge a fee for the physical act of transferring a copy, and
you may at your option offer warranty protection in exchange for a fee.
2. You may modify your copy or copies of the Program or any portion
of it, thus forming a work based on the Program, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) You must cause the modified files to carry prominent notices
stating that you changed the files and the date of any change.
b) You must cause any work that you distribute or publish, that in
whole or in part contains or is derived from the Program or any
part thereof, to be licensed as a whole at no charge to all third
parties under the terms of this License.
c) If the modified program normally reads commands interactively
when run, you must cause it, when started running for such
interactive use in the most ordinary way, to print or display an
announcement including an appropriate copyright notice and a
notice that there is no warranty (or else, saying that you provide
a warranty) and that users may redistribute the program under
these conditions, and telling the user how to view a copy of this
License. (Exception: if the Program itself is interactive but
does not normally print such an announcement, your work based on
the Program is not required to print an announcement.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Program,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Program, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Program.
In addition, mere aggregation of another work not based on the Program
with the Program (or with a work based on the Program) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may copy and distribute the Program (or a work based on it,
under Section 2) in object code or executable form under the terms of
Sections 1 and 2 above provided that you also do one of the following:
a) Accompany it with the complete corresponding machine-readable
source code, which must be distributed under the terms of Sections
1 and 2 above on a medium customarily used for software interchange; or,
b) Accompany it with a written offer, valid for at least three
years, to give any third party, for a charge no more than your
cost of physically performing source distribution, a complete
machine-readable copy of the corresponding source code, to be
distributed under the terms of Sections 1 and 2 above on a medium
customarily used for software interchange; or,
c) Accompany it with the information you received as to the offer
to distribute corresponding source code. (This alternative is
allowed only for noncommercial distribution and only if you
received the program in object code or executable form with such
an offer, in accord with Subsection b above.)
The source code for a work means the preferred form of the work for
making modifications to it. For an executable work, complete source
code means all the source code for all modules it contains, plus any
associated interface definition files, plus the scripts used to
control compilation and installation of the executable. However, as a
special exception, the source code distributed need not include
anything that is normally distributed (in either source or binary
form) with the major components (compiler, kernel, and so on) of the
operating system on which the executable runs, unless that component
itself accompanies the executable.
If distribution of executable or object code is made by offering
access to copy from a designated place, then offering equivalent
access to copy the source code from the same place counts as
distribution of the source code, even though third parties are not
compelled to copy the source along with the object code.
4. You may not copy, modify, sublicense, or distribute the Program
except as expressly provided under this License. Any attempt
otherwise to copy, modify, sublicense or distribute the Program is
void, and will automatically terminate your rights under this License.
However, parties who have received copies, or rights, from you under
this License will not have their licenses terminated so long as such
parties remain in full compliance.
5. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Program or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Program (or any work based on the
Program), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Program or works based on it.
6. Each time you redistribute the Program (or any work based on the
Program), the recipient automatically receives a license from the
original licensor to copy, distribute or modify the Program subject to
these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties to
this License.
7. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Program at all. For example, if a patent
license would not permit royalty-free redistribution of the Program by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Program.
If any portion of this section is held invalid or unenforceable under
any particular circumstance, the balance of the section is intended to
apply and the section as a whole is intended to apply in other
circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system, which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
8. If the distribution and/or use of the Program is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Program under this License
may add an explicit geographical distribution limitation excluding
those countries, so that distribution is permitted only in or among
countries not thus excluded. In such case, this License incorporates
the limitation as if written in the body of this License.
9. The Free Software Foundation may publish revised and/or new versions
of the General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the Program
specifies a version number of this License which applies to it and "any
later version", you have the option of following the terms and conditions
either of that version or of any later version published by the Free
Software Foundation. If the Program does not specify a version number of
this License, you may choose any version ever published by the Free Software
Foundation.
10. If you wish to incorporate parts of the Program into other free
programs whose distribution conditions are different, write to the author
to ask for permission. For software which is copyrighted by the Free
Software Foundation, write to the Free Software Foundation; we sometimes
make exceptions for this. Our decision will be guided by the two goals
of preserving the free status of all derivatives of our free software and
of promoting the sharing and reuse of software generally.
NO WARRANTY
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
REPAIR OR CORRECTION.
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGES.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
{description}
Copyright (C) {year} {fullname}
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
Also add information on how to contact you by electronic and paper mail.
If the program is interactive, make it output a short notice like this
when it starts in an interactive mode:
Gnomovision version 69, Copyright (C) year name of author
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, the commands you use may
be called something other than `show w' and `show c'; they could even be
mouse-clicks or menu items--whatever suits your program.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the program, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
`Gnomovision' (which makes passes at compilers) written by James Hacker.
{signature of Ty Coon}, 1 April 1989
Ty Coon, President of Vice
This General Public License does not permit incorporating your program into
proprietary programs. If your program is a subroutine library, you may
consider it more useful to permit linking proprietary applications with the
library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License.
fastcomp license
================
MIT LICENSE
Copyright (c) 2012 Fujimoto
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

==> distance-master/cdistance/distance.c <==
#include "distance.h"
#include "includes.h"
static unicode *
get_unicode(PyObject *obj, Py_ssize_t *len)
{
unicode *u;
if ((u = PyUnicode_AS_UNICODE(obj)) == NULL) {
PyErr_Format(PyExc_RuntimeError, "failed to get unicode representation of object");
return NULL;
}
*len = PyUnicode_GET_LENGTH(obj);
return u;
}
static byte *
get_byte(PyObject *obj, Py_ssize_t *len)
{
byte *b;
b = PyBytes_AS_STRING(obj);
*len = PyBytes_GET_SIZE(obj);
return b;
}
static array *
get_array(PyObject *obj, Py_ssize_t *len)
{
array *a;
if ((a = PySequence_Fast(obj, "we got a problem")) == NULL)
return NULL;
*len = PySequence_Fast_GET_SIZE(a);
return a;
}
static char
get_sequence(PyObject *obj, sequence *seq, Py_ssize_t *len, char type)
{
char t = '\0';
if (PyUnicode_Check(obj)) {
t = 'u';
if ((seq->u = get_unicode(obj, len)) == NULL)
return '\0';
} else if (PyBytes_Check(obj)) {
t = 'b';
if ((seq->b = get_byte(obj, len)) == NULL)
return '\0';
} else if (PySequence_Check(obj)) {
t = 'a';
if ((seq->a = get_array(obj, len)) == NULL)
return '\0';
}
if (!t) {
PyErr_SetString(PyExc_ValueError, "expected a sequence object as first argument");
return '\0';
}
if (type && t != type) {
PyErr_SetString(PyExc_ValueError, "type mismatch between the "
"value provided as left argument and one of the elements in "
"the right one, can't process the later");
if (t == 'a')
Py_DECREF(seq->a);
return '\0';
}
return t;
}
static char
get_sequences(PyObject *arg1, PyObject *arg2, sequence *seq1, sequence *seq2,
Py_ssize_t *len1, Py_ssize_t *len2)
{
if (PyUnicode_Check(arg1) && PyUnicode_Check(arg2)) {
if ((seq1->u = get_unicode(arg1, len1)) == NULL)
return '\0';
if ((seq2->u = get_unicode(arg2, len2)) == NULL)
return '\0';
return 'u';
} else if (PyBytes_Check(arg1) && PyBytes_Check(arg2)) {
if ((seq1->b = get_byte(arg1, len1)) == NULL)
return '\0';
if ((seq2->b = get_byte(arg2, len2)) == NULL)
return '\0';
return 'b';
} else if (PySequence_Check(arg1) && PySequence_Check(arg2)) {
if ((seq1->a = get_array(arg1, len1)) == NULL)
return '\0';
if ((seq2->a = get_array(arg2, len2)) == NULL) {
Py_DECREF(seq1->a); /* warning ! */
return '\0';
}
return 'a';
}
PyErr_SetString(PyExc_ValueError, "expected two sequence objects");
return '\0';
}
static PyObject *
hamming_py(PyObject *self, PyObject *args, PyObject *kwargs)
{
PyObject *arg1, *arg2, *odo_normalize = NULL;
int do_normalize = 0;
static char *keywords[] = {"seq1", "seq2", "normalized", NULL};
char type;
sequence seq1, seq2;
Py_ssize_t len1, len2;
Py_ssize_t dist;
if (!PyArg_ParseTupleAndKeywords(args, kwargs,
"OO|O:hamming", keywords, &arg1, &arg2, &odo_normalize))
return NULL;
if (odo_normalize && (do_normalize = PyObject_IsTrue(odo_normalize)) == -1)
return NULL;
if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0')
return NULL;
if (len1 != len2) {
PyErr_SetString(PyExc_ValueError, "expected two objects of the same length");
if (type == 'a') {
Py_DECREF(seq1.a);
Py_DECREF(seq2.a);
}
return NULL;
}
switch(type) {
case 'u':
dist = uhamming(seq1.u, seq2.u, len1);
break;
case 'b':
dist = bhamming(seq1.b, seq2.b, len1);
break;
default:
dist = ahamming(seq1.a, seq2.a, len1);
Py_DECREF(seq1.a);
Py_DECREF(seq2.a);
}
if (dist == -1) // comparison failed
return NULL;
if (do_normalize) {
if (len1 == 0)
return Py_BuildValue("f", 0.0f);
return Py_BuildValue("d", dist / (double)len1);
}
return Py_BuildValue("n", dist);
}
static PyObject *
lcsubstrings_py_make_set(PyObject *arg1, PyObject *arg2, UT_array *stack, Py_ssize_t mlen)
{
PyObject *set, *ss;
struct pair_t *pair;
if ((set = PySet_New(NULL)) == NULL) {
utarray_free(stack);
return NULL;
}
for (pair = (struct pair_t*)utarray_front(stack);
pair != NULL;
pair = (struct pair_t*)utarray_next(stack, pair)) {
ss = PySequence_GetSlice(arg2, pair->j - mlen + 1, pair->j + 1);
if (ss == NULL)
goto On_Error;
if ((PySet_Add(set, ss)) == -1)
goto On_Error;
}
utarray_free(stack);
return set;
On_Error:
PySet_Clear(set);
Py_DECREF(set);
utarray_free(stack);
return NULL;
}
static PyObject *
lcsubstrings_py_make_tuple(PyObject *arg1, PyObject *arg2, UT_array *stack, Py_ssize_t mlen)
{
PyObject *tp, *stp;
Py_ssize_t i;
struct pair_t *pair;
if ((stp = PyTuple_New(utarray_len(stack))) == NULL) {
utarray_free(stack);
return NULL;
}
for (i = 0, pair = (struct pair_t*)utarray_front(stack);
pair != NULL;
++i, pair = (struct pair_t*)utarray_next(stack, pair)) {
PyTuple_SET_ITEM(stp, i, Py_BuildValue("(nn)", pair->i - mlen + 1, pair->j - mlen + 1));
}
if ((tp = PyTuple_New(2)) == NULL) {
utarray_free(stack);
Py_DECREF(stp);
return NULL;
}
PyTuple_SET_ITEM(tp, 0, Py_BuildValue("n", mlen));
PyTuple_SET_ITEM(tp, 1, stp);
utarray_free(stack);
return tp;
}
static PyObject *
lcsubstrings_py(PyObject *self, PyObject *args, PyObject *kwargs)
{
PyObject *arg1, *arg2, *opos = NULL;
int positions = 0;
static char *keywords[] = {"seq1", "seq2", "positions", NULL};
char type;
sequence seq1, seq2;
Py_ssize_t len1, len2;
UT_array *stack;
Py_ssize_t mlen = -1;
if (!PyArg_ParseTupleAndKeywords(args, kwargs,
"OO|O:lcsubstrings", keywords, &arg1, &arg2, &opos))
return NULL;
if (opos && (positions = PyObject_IsTrue(opos)) == -1)
return NULL;
if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0')
return NULL;
// special case
if (type == 'a' && (!positions) && (PyList_Check(arg1) || PyList_Check(arg2))) {
Py_DECREF(seq1.a);
Py_DECREF(seq2.a);
PyErr_SetString(PyExc_TypeError, "can't hash lists, pass in tuples instead");
return NULL;
}
if (len1 < len2) {
SWAP(PyObject *, arg1, arg2);
SWAP(sequence, seq1, seq2);
SWAP(Py_ssize_t, len1, len2);
}
switch(type) {
case 'u':
stack = ulcsubstrings(seq1.u, seq2.u, len1, len2, &mlen);
break;
case 'b':
stack = blcsubstrings(seq1.b, seq2.b, len1, len2, &mlen);
break;
default:
stack = alcsubstrings(seq1.a, seq2.a, len1, len2, &mlen);
Py_DECREF(seq1.a);
Py_DECREF(seq2.a);
}
if (stack == NULL) {
/* memory allocation failed */
return PyErr_NoMemory();
}
if (positions)
return lcsubstrings_py_make_tuple(arg1, arg2, stack, mlen);
return lcsubstrings_py_make_set(arg1, arg2, stack, mlen);
}
static PyObject *
nlevenshtein_py(PyObject *self, PyObject *args, PyObject *kwargs)
{
PyObject *arg1, *arg2;
short method = 1;
static char *keywords[] = {"seq1", "seq2", "method", NULL};
char type;
sequence seq1, seq2;
Py_ssize_t len1, len2;
double dist;
if (!PyArg_ParseTupleAndKeywords(args, kwargs,
"OO|h:nlevenshtein", keywords, &arg1, &arg2, &method))
return NULL;
if (method != 1 && method != 2) {
PyErr_SetString(PyExc_ValueError, "expected either 1 or 2 for `method` parameter");
return NULL;
}
if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0')
return NULL;
if (len1 < len2) {
SWAP(sequence, seq1, seq2);
SWAP(Py_ssize_t, len1, len2);
}
switch(type) {
case 'u':
dist = unlevenshtein(seq1.u, seq2.u, len1, len2, method);
break;
case 'b':
dist = bnlevenshtein(seq1.b, seq2.b, len1, len2, method);
break;
default:
dist = anlevenshtein(seq1.a, seq2.a, len1, len2, method);
Py_DECREF(seq1.a);
Py_DECREF(seq2.a);
}
if (dist < 0) {
if (dist == -1) // memory allocation failed
return PyErr_NoMemory();
return NULL; // comparison failed
}
return Py_BuildValue("d", dist);
}
static PyObject *
levenshtein_py(PyObject *self, PyObject *args, PyObject *kwargs)
{
PyObject *arg1, *arg2, *onorm = NULL;
Py_ssize_t dist = -1;
Py_ssize_t max_dist = -1;
int normalized = 0;
static char *keywords[] = {"seq1", "seq2", "normalized", "max_dist", NULL};
char type;
sequence seq1, seq2;
Py_ssize_t len1, len2;
if (!PyArg_ParseTupleAndKeywords(args, kwargs,
"OO|On:levenshtein", keywords, &arg1, &arg2, &onorm, &max_dist))
return NULL;
if (onorm && (normalized = PyObject_IsTrue(onorm)) == -1)
return NULL;
if (normalized) {
onorm = NULL;
return nlevenshtein_py(self, args, onorm);
}
if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0')
return NULL;
switch(type) {
case 'u':
dist = ulevenshtein(seq1.u, seq2.u, len1, len2, max_dist);
break;
case 'b':
dist = blevenshtein(seq1.b, seq2.b, len1, len2, max_dist);
break;
default:
dist = alevenshtein(seq1.a, seq2.a, len1, len2, max_dist);
Py_DECREF(seq1.a);
Py_DECREF(seq2.a);
}
if (dist < -1) {
if (dist == -2)
return PyErr_NoMemory(); // memory allocation failed
return NULL; // comparison failed
}
return Py_BuildValue("n", dist);
}
static PyObject *
fastcomp_py(PyObject *self, PyObject *args, PyObject *kwargs)
{
PyObject *arg1, *arg2, *otr = NULL;
int transpositions = 0;
static char *keywords[] = {"seq1", "seq2", "transpositions", NULL};
char type;
sequence seq1, seq2;
Py_ssize_t len1, len2;
short dist;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|O:fast_comp",
keywords, &arg1, &arg2, &transpositions))
return NULL;
if (otr && (transpositions = PyObject_IsTrue(otr)) == -1)
return NULL;
if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0')
return NULL;
if (len1 < len2) {
SWAP(sequence, seq1, seq2);
SWAP(Py_ssize_t, len1, len2);
}
switch(type) {
case 'u':
dist = ufastcomp(seq1.u, seq2.u, len1, len2, transpositions);
break;
case 'b':
dist = bfastcomp(seq1.b, seq2.b, len1, len2, transpositions);
break;
default:
dist = afastcomp(seq1.a, seq2.a, len1, len2, transpositions);
Py_DECREF(seq1.a);
Py_DECREF(seq2.a);
}
if (dist == -2) // comparison failed
return NULL;
return Py_BuildValue("h", dist);
}
// Iterators (for levenshtein and fastcomp). They share the same structure.
typedef struct {
PyObject_HEAD
PyObject *itor;
char seqtype; // type of the sequence ('u', 'b', 'a')
sequence seq1; // the sequence itself
Py_ssize_t len1; // its length
PyObject *object; // the corresponding pyobject
int transpos; // only valable for fastcomp
Py_ssize_t max_dist; // only for levenshtein
} ItorState;
static void itor_dealloc(ItorState *state)
{
// we got two references for tuples and lists, one for the original python object,
// and one returned by `PySequence_fast`
if (state->seqtype == 'a')
Py_XDECREF(state->seq1.a);
Py_XDECREF(state->object);
Py_XDECREF(state->itor);
Py_TYPE(state)->tp_free(state);
}
static PyObject *
ifastcomp_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
PyObject *arg1, *arg2, *itor, *otr = NULL;
int transpositions = 0;
static char *keywords[] = {"seq1", "seqs", "transpositions", NULL};
char seqtype;
sequence seq1;
Py_ssize_t len1;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|O:ifast_comp",
keywords, &arg1, &arg2, &transpositions))
return NULL;
if (otr && (transpositions = PyObject_IsTrue(otr)) == -1)
return NULL;
if ((seqtype = get_sequence(arg1, &seq1, &len1, '\0')) == '\0')
return NULL;
if ((itor = PyObject_GetIter(arg2)) == NULL) {
PyErr_SetString(PyExc_ValueError, "expected an iterable as second argument");
return NULL;
}
ItorState *state = (ItorState *)type->tp_alloc(type, 0);
if (state == NULL) {
Py_DECREF(itor);
return NULL;
}
Py_INCREF(arg1);
state->itor = itor;
state->seqtype = seqtype;
state->seq1 = seq1;
state->object = arg1;
state->len1 = len1;
state->transpos = transpositions;
return (PyObject *)state;
}
static PyObject *
ilevenshtein_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
PyObject *arg1, *arg2, *itor;
Py_ssize_t max_dist = -1;
static char *keywords[] = {"seq1", "seqs", "max_dist", NULL};
char seqtype;
sequence seq1;
Py_ssize_t len1;
if (!PyArg_ParseTupleAndKeywords(args, kwargs,
"OO|n:ilevenshtein", keywords, &arg1, &arg2, &max_dist))
return NULL;
if ((seqtype = get_sequence(arg1, &seq1, &len1, '\0')) == '\0')
return NULL;
if ((itor = PyObject_GetIter(arg2)) == NULL) {
PyErr_SetString(PyExc_ValueError, "expected an iterable as second argument");
return NULL;
}
ItorState *state = (ItorState *)type->tp_alloc(type, 0);
if (state == NULL) {
Py_DECREF(itor);
return NULL;
}
Py_INCREF(arg1);
state->itor = itor;
state->seqtype = seqtype;
state->seq1 = seq1;
state->object = arg1;
state->len1 = len1;
state->max_dist = max_dist;
return (PyObject *)state;
}
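// tp_iternext for both iterator types. Returning NULL with no exception
// set signals the end of the iteration (StopIteration); sequences whose
// distance exceeds the threshold are silently skipped.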
static PyObject *
ilevenshtein_next(ItorState *state)
{
PyObject *arg2;
sequence seq1, seq2;
Py_ssize_t len2;
Py_ssize_t dist = -1;
PyObject *rv;
seq1 = state->seq1;
while ((arg2 = PyIter_Next(state->itor)) != NULL) {
if (get_sequence(arg2, &seq2, &len2, state->seqtype) == '\0') {
Py_DECREF(arg2);
return NULL;
}
switch(state->seqtype) {
case 'u':
dist = ulevenshtein(seq1.u, seq2.u, state->len1, len2, state->max_dist);
break;
case 'b':
dist = blevenshtein(seq1.b, seq2.b, state->len1, len2, state->max_dist);
break;
default:
dist = alevenshtein(seq1.a, seq2.a, state->len1, len2, state->max_dist);
Py_DECREF(seq2.a);
}
if (dist < -1) {
Py_DECREF(arg2);
if (dist == -2)
return PyErr_NoMemory(); // memory allocation failed
return NULL; // comparison failed
}
if (dist != -1) {
rv = Py_BuildValue("(nO)", dist, arg2);
Py_DECREF(arg2);
return rv;
}
Py_DECREF(arg2);
}
return NULL;
}
static PyObject *
ifastcomp_next(ItorState *state)
{
PyObject *arg2;
sequence seq1, seq2;
Py_ssize_t len2;
short dist = -1;
PyObject *rv;
seq1 = state->seq1;
while ((arg2 = PyIter_Next(state->itor)) != NULL) {
if (get_sequence(arg2, &seq2, &len2, state->seqtype) == '\0') {
Py_DECREF(arg2);
return NULL;
}
switch(state->seqtype) {
case 'u':
dist = ufastcomp(seq1.u, seq2.u, state->len1, len2, state->transpos);
break;
case 'b':
dist = bfastcomp(seq1.b, seq2.b, state->len1, len2, state->transpos);
break;
default:
dist = afastcomp(seq1.a, seq2.a, state->len1, len2, state->transpos);
Py_DECREF(seq2.a);
}
if (dist == -2) { // comparison failed
Py_DECREF(arg2);
return NULL;
}
if (dist != -1) {
rv = Py_BuildValue("(hO)", dist, arg2);
Py_DECREF(arg2);
return rv;
}
Py_DECREF(arg2);
}
return NULL;
}
PyTypeObject IFastComp_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"distance.ifast_comp", /* tp_name */
sizeof(ItorState), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)itor_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT, /* tp_flags */
ifast_comp_doc, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
PyObject_SelfIter, /* tp_iter */
(iternextfunc)ifastcomp_next, /* tp_iternext */
0, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
PyType_GenericAlloc, /* tp_alloc */
ifastcomp_new, /* tp_new */
};
PyTypeObject ILevenshtein_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"distance.ilevenshtein", /* tp_name */
sizeof(ItorState), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)itor_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT, /* tp_flags */
ilevenshtein_doc, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
PyObject_SelfIter, /* tp_iter */
(iternextfunc)ilevenshtein_next, /* tp_iternext */
0, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
PyType_GenericAlloc, /* tp_alloc */
ilevenshtein_new, /* tp_new */
};
static PyMethodDef CDistanceMethods[] = {
{"hamming", (PyCFunction)hamming_py, METH_VARARGS | METH_KEYWORDS, hamming_doc},
{"levenshtein", (PyCFunction)levenshtein_py, METH_VARARGS | METH_KEYWORDS, levenshtein_doc},
{"nlevenshtein", (PyCFunction)nlevenshtein_py, METH_VARARGS | METH_KEYWORDS, nlevenshtein_doc},
{"lcsubstrings", (PyCFunction)lcsubstrings_py, METH_VARARGS | METH_KEYWORDS, lcsubstrings_doc},
{"fast_comp", (PyCFunction)fastcomp_py, METH_VARARGS | METH_KEYWORDS, fast_comp_doc},
{NULL, NULL, 0, NULL}
};
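// Module initialization, for both Python 2 (initcdistance) and
// Python 3 (PyInit_cdistance).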
#if PY_MAJOR_VERSION >= 3
static struct PyModuleDef cdistancemodule = {
PyModuleDef_HEAD_INIT, "cdistance", NULL, -1, CDistanceMethods
};
#endif
#if PY_MAJOR_VERSION >= 3
PyMODINIT_FUNC PyInit_cdistance(void)
#else
PyMODINIT_FUNC initcdistance(void)
#endif
{
PyObject *module;
#if PY_MAJOR_VERSION >= 3
if ((module = PyModule_Create(&cdistancemodule)) == NULL)
return NULL;
#else
if ((module = Py_InitModule("cdistance", CDistanceMethods)) == NULL)
return;
#endif
if (PyType_Ready(&IFastComp_Type) != 0 || PyType_Ready(&ILevenshtein_Type) != 0)
#if PY_MAJOR_VERSION >= 3
return NULL;
#else
return;
#endif
Py_INCREF((PyObject *)&IFastComp_Type);
Py_INCREF((PyObject *)&ILevenshtein_Type);
PyModule_AddObject(module, "ifast_comp", (PyObject *)&IFastComp_Type);
PyModule_AddObject(module, "ilevenshtein", (PyObject *)&ILevenshtein_Type);
#if PY_MAJOR_VERSION >= 3
return module;
#endif
}
distance-master/cdistance/fastcomp.c 0000644 0001750 0001750 00000005271 12243645633 016165 0 ustar jdg jdg #include "distance.h"
static short
fastcomp(unicode *seq1, unicode *seq2, Py_ssize_t len1, Py_ssize_t len2, int transpositions)
{
char *models[3];
short m, cnt, res = 3;
Py_ssize_t i, j, c, ldiff;
#ifdef SEQUENCE_COMP
int comp;
#endif
if (len1 < len2) {
SWAP(unicode *, seq1, seq2);
SWAP(Py_ssize_t, len1, len2);
}
ldiff = len1 - len2;
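/* Each model encodes a sequence of edit operations to try when a mismatch
is found: 'd' = deletion from seq1, 'i' = insertion into seq1 (i.e. skip
an item of seq2), 'r' = replacement. Since seq1 is the longer sequence,
a length difference of 1 forces at least one deletion, and a difference
of 2 exactly two; a difference above 2 means the distance is necessarily
higher than 2, so we bail out early.
*/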
switch (ldiff) {
case 0:
models[2] = "id";
models[1] = "di";
models[0] = "rr";
m = 2;
break;
case 1:
models[1] = "dr";
models[0] = "rd";
m = 1;
break;
case 2:
models[0] = "dd";
m = 0;
break;
default:
return -1;
}
for (; m >= 0; m--) {
i = j = c = 0;
while (i < len1 && j < len2)
{
#ifdef SEQUENCE_COMP
comp = SEQUENCE_COMP(seq1, i, seq2, j);
if (comp == -1)
return -2;
if (!comp) {
#else
if (seq1[i] != seq2[j]) {
#endif
c++;
if (c > 2)
break;
/* Transpositions handling. `ldiff`, the absolute difference between the
lengths of `seq1` and `seq2`, must not be equal to 2: in that case only
deletions can occur (given that the distance between the two sequences
may not exceed 2, this is the shortest path).
We look ahead to check whether a transposition is possible between the
current position and the next one and, if so, we systematically choose
that path over the alternative edit operations, because in such
situations the cost of a transposition is always the lowest.
*/
#ifdef SEQUENCE_COMP
if (transpositions && ldiff != 2 && i < (len1 - 1) && j < (len2 - 1)) {
comp = SEQUENCE_COMP(seq1, i + 1, seq2, j);
if (comp == -1)
return -2;
else if (comp) {
comp = SEQUENCE_COMP(seq1, i, seq2, j + 1);
if (comp == -1)
return -2;
else if (comp) {
i = i + 2;
j = j + 2;
continue;
}
}
}
#else
if (transpositions && ldiff != 2 && i < (len1 - 1) && j < (len2 - 1) && \
seq1[i + 1] == seq2[j] && \
seq1[i] == seq2[j + 1]) {
i = i + 2;
j = j + 2;
continue;
}
#endif
if (models[m][c - 1] == 'd')
i++;
else if (models[m][c - 1] == 'i')
j++;
else {
i++;
j++;
}
}
else {
i++;
j++;
}
}
if (c > 2)
continue;
else if (i < len1) {
if (c == 1)
cnt = (models[m][1] == 'd');
else
cnt = (models[m][0] == 'd') + (models[m][1] == 'd');
if (len1 - i <= cnt) {
c = c + (len1 - i);
}
else
continue;
}
else if (j < len2) {
if (len2 - j <= (models[m][c] == 'i'))
c = c + (len2 - j);
else
continue;
}
if (c < res) {
res = c;
}
}
if (res == 3)
res = -1;
return res;
}
distance-master/cdistance/includes.h 0000644 0001750 0001750 00000021111 12243645633 016153 0 ustar jdg jdg #define hamming_doc \
"hamming(seq1, seq2, normalized=False)\n\
\n\
Compute the Hamming distance between the two sequences `seq1` and `seq2`.\n\
The Hamming distance is the number of differing items in two ordered\n\
sequences of the same length. If the sequences submitted do not have the\n\
same length, an error will be raised.\n\
\n\
If `normalized` evaluates to `False`, the return value will be an integer\n\
between 0 and the length of the sequences provided, edge values included;\n\
otherwise, it will be a float between 0 and 1 included, where 0 means\n\
equal, and 1 totally different. The normalized Hamming distance is computed as:\n\
\n\
0.0 if len(seq1) == 0\n\
hamming_dist / len(seq1) otherwise"
#define jaccard_doc \
"jaccard(seq1, seq2)\n\
\n\
Compute the Jaccard distance between the two sequences `seq1` and `seq2`.\n\
They should contain hashable items.\n\
\n\
The return value is a float between 0 and 1, where 0 means equal, and 1 totally different."
#define sorensen_doc \
"sorensen(seq1, seq2)\n\
\n\
Compute the Sorensen distance between the two sequences `seq1` and `seq2`.\n\
They should contain hashable items.\n\
\n\
The return value is a float between 0 and 1, where 0 means equal, and 1 totally different."
#define lcsubstrings_doc \
"lcsubstrings(seq1, seq2, positions=False)\n\
\n\
Find the longest common substring(s) in the sequences `seq1` and `seq2`.\n\
\n\
If `positions` evaluates to `True`, only their positions will be returned,\n\
together with their length, in a tuple:\n\
\n\
(length, [(start pos in seq1, start pos in seq2)..])\n\
\n\
Otherwise, the substrings themselves will be returned, in a set.\n\
\n\
Example:\n\
\n\
>>> lcsubstrings(\"sedentar\", \"dentist\")\n\
{'dent'}\n\
>>> lcsubstrings(\"sedentar\", \"dentist\", positions=True)\n\
(4, [(2, 0)])"
#define ilevenshtein_doc \
"ilevenshtein(seq1, seqs, max_dist=-1)\n\
\n\
Compute the Levenshtein distance between the sequence `seq1` and the series\n\
of sequences `seqs`.\n\
\n\
`seq1`: the reference sequence\n\
`seqs`: a series of sequences (can be a generator)\n\
`max_dist`: if provided and > 0, only the sequences whose distance from\n\
the reference sequence is lower than or equal to this value will be returned.\n\
\n\
The return value is a series of pairs (distance, sequence).\n\
\n\
The sequence objects in `seqs` are expected to be of the same kind as\n\
the reference sequence in the C implementation; the same holds true for\n\
`ifast_comp`."
#define ifast_comp_doc \
"ifast_comp(seq1, seqs, transpositions=False)\n\
\n\
Return an iterator over all the sequences in `seqs` which distance from\n\
`seq1` is lower or equal to 2. The sequences which distance from the\n\
reference sequence is higher than that are dropped.\n\
\n\
`seq1`: the reference sequence.\n\
`seqs`: a series of sequences (can be a generator)\n\
`transpositions` has the same sense as in `fast_comp`.\n\
\n\
The return value is a series of pairs (distance, sequence).\n\
\n\
You might want to call `sorted()` on the iterator to get the results in a\n\
meaningful order:\n\
\n\
>>> g = ifast_comp(\"foo\", [\"fo\", \"bar\", \"foob\", \"foo\", \"foobaz\"])\n\
>>> sorted(g)\n\
[(0, 'foo'), (1, 'fo'), (1, 'foob')]"
#define fast_comp_doc \
"fast_comp(seq1, seq2, transpositions=False)\n\
\n\
Compute the distance between the two sequences `seq1` and `seq2` up to a\n\
maximum of 2 included, and return it. If the edit distance between the two\n\
sequences is higher than that, -1 is returned.\n\
\n\
If `transpositions` is `True`, transpositions will be taken into account for\n\
the computation of the distance. This can make a difference, e.g.:\n\
\n\
>>> fast_comp(\"abc\", \"bac\", transpositions=False)\n\
2\n\
>>> fast_comp(\"abc\", \"bac\", transpositions=True)\n\
1\n\
\n\
This is faster than `levenshtein` by an order of magnitude, but on the\n\
other hand is of limited use.\n\
\n\
The algorithm comes from `http://writingarchives.sakura.ne.jp/fastcomp`.\n\
I've added transpositions support to the original code."
#define levenshtein_doc \
"levenshtein(seq1, seq2, max_dist=-1, normalized=False)\n\
\n\
Compute the absolute Levenshtein distance between the two sequences\n\
`seq1` and `seq2`.\n\
\n\
The Levenshtein distance is the minimum number of edit operations necessary\n\
for transforming one sequence into the other. The edit operations allowed are:\n\
\n\
* deletion: ABC -> BC, AC, AB\n\
* insertion: ABC -> ABCD, EABC, AEBC..\n\
* substitution: ABC -> ABE, ADC, FBC..\n\
\n\
The `max_dist` parameter controls at which moment we should stop computing the\n\
distance between the provided sequences. If it is a negative integer, the\n\
distance will be computed until the sequences are exhausted; otherwise, the\n\
computation will stop as soon as the calculated distance exceeds\n\
`max_dist`, and -1 will be returned. For example:\n\
\n\
>>> levenshtein(\"abc\", \"abcd\", max_dist=1) # dist = 1\n\
1\n\
>>> levenshtein(\"abc\", \"abcde\", max_dist=1) # dist = 2\n\
-1\n\
\n\
This can be a time saver if you're not interested in the exact distance, but\n\
only need to check if the distance between the given sequences is below a\n\
given threshold.\n\
\n\
The `normalized` parameter is here for backward compatibility; providing\n\
it will result in a call to `nlevenshtein`, which should be used directly\n\
instead."
#define nlevenshtein_doc \
"nlevenshtein(seq1, seq2, method=1)\n\
\n\
Compute the normalized Levenshtein distance between `seq1` and `seq2`.\n\
\n\
Two normalization methods are provided. For both of them, the normalized\n\
distance will be a float between 0 and 1, where 0 means equal and 1\n\
completely different. The computation obeys the following patterns:\n\
\n\
0.0 if seq1 == seq2\n\
1.0 if len(seq1) == 0 or len(seq2) == 0\n\
edit distance / factor otherwise\n\
\n\
The `method` parameter specifies which normalization factor should be used.\n\
It can have the value 1 or 2, which correspond to the following:\n\
\n\
1: the length of the shortest alignment between the sequences\n\
(that is, the length of the longest sequence)\n\
2: the length of the longest alignment between the sequences\n\
\n\
Which normalization factor should be chosen is a matter of taste. The first\n\
one is cheap to compute. The second one is more costly, but it accounts\n\
better than the first one for parallelisms of symbols between the sequences.\n\
\n\
For the rationale behind the use of the second method, see:\n\
Heeringa, \"Measuring Dialect Pronunciation Differences using Levenshtein\n\
Distance\", 2004, p. 130 sq, which is available online at:\n\
http://www.let.rug.nl/~heeringa/dialectology/thesis/thesis.pdf"
#define SEQUENCE_COMPARE(s1, i1, s2, i2) \
(PyObject_RichCompareBool( \
PySequence_Fast_GET_ITEM((s1), (i1)), \
PySequence_Fast_GET_ITEM((s2), (i2)), \
Py_EQ) \
)
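/* Poor man's generics: each algorithm is written once against the
`unicode` type, then #included three times below with the type and the
function name remapped through macros: plain unicode strings (u*
prefix), byte strings (b* prefix), and arbitrary Python sequences (a*
prefix), the latter comparing items through SEQUENCE_COMP, i.e.
PyObject_RichCompareBool. */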
#define unicode unicode
#define hamming uhamming
#include "hamming.c"
#undef unicode
#undef hamming
#define unicode byte
#define hamming bhamming
#include "hamming.c"
#undef unicode
#undef hamming
#define SEQUENCE_COMP SEQUENCE_COMPARE
#define unicode array
#define hamming ahamming
#include "hamming.c"
#undef unicode
#undef hamming
#undef SEQUENCE_COMP
#define unicode unicode
#define levenshtein ulevenshtein
#define nlevenshtein unlevenshtein
#include "levenshtein.c"
#undef unicode
#undef levenshtein
#undef nlevenshtein
#define unicode byte
#define levenshtein blevenshtein
#define nlevenshtein bnlevenshtein
#include "levenshtein.c"
#undef unicode
#undef levenshtein
#undef nlevenshtein
#define SEQUENCE_COMP SEQUENCE_COMPARE
#define unicode array
#define levenshtein alevenshtein
#define nlevenshtein anlevenshtein
#include "levenshtein.c"
#undef unicode
#undef levenshtein
#undef nlevenshtein
#undef SEQUENCE_COMP
#define unicode unicode
#define lcsubstrings ulcsubstrings
#include "lcsubstrings.c"
#undef unicode
#undef lcsubstrings
#define unicode byte
#define lcsubstrings blcsubstrings
#include "lcsubstrings.c"
#undef unicode
#undef lcsubstrings
#define SEQUENCE_COMP SEQUENCE_COMPARE
#define unicode array
#define lcsubstrings alcsubstrings
#include "lcsubstrings.c"
#undef unicode
#undef lcsubstrings
#undef SEQUENCE_COMP
#define unicode unicode
#define fastcomp ufastcomp
#include "fastcomp.c"
#undef unicode
#undef fastcomp
#define unicode byte
#define fastcomp bfastcomp
#include "fastcomp.c"
#undef unicode
#undef fastcomp
#define SEQUENCE_COMP SEQUENCE_COMPARE
#define unicode array
#define fastcomp afastcomp
#include "fastcomp.c"
#undef unicode
#undef fastcomp
#undef SEQUENCE_COMP
distance-master/cdistance/utarray.h 0000644 0001750 0001750 00000030375 12243645633 016050 0 ustar jdg jdg /*
Copyright (c) 2008-2013, Troy D. Hanson http://troydhanson.github.com/uthash/
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* a dynamic array implementation using macros
*/
#ifndef UTARRAY_H
#define UTARRAY_H
#define UTARRAY_VERSION 1.9.8
#ifdef __GNUC__
#define _UNUSED_ __attribute__ ((__unused__))
#else
#define _UNUSED_
#endif
#include <stddef.h> /* size_t */
#include <string.h> /* memset, etc */
#include <stdlib.h> /* exit */
#define oom() exit(-1)
typedef void (ctor_f)(void *dst, const void *src);
typedef void (dtor_f)(void *elt);
typedef void (init_f)(void *elt);
typedef struct {
size_t sz;
init_f *init;
ctor_f *copy;
dtor_f *dtor;
} UT_icd;
typedef struct {
unsigned i,n;/* i: index of next available slot, n: num slots */
UT_icd icd; /* initializer, copy and destructor functions */
char *d; /* n slots of size icd->sz*/
} UT_array;
#define utarray_init(a,_icd) do { \
memset(a,0,sizeof(UT_array)); \
(a)->icd=*_icd; \
} while(0)
#define utarray_done(a) do { \
if ((a)->n) { \
if ((a)->icd.dtor) { \
size_t _ut_i; \
for(_ut_i=0; _ut_i < (a)->i; _ut_i++) { \
(a)->icd.dtor(utarray_eltptr(a,_ut_i)); \
} \
} \
free((a)->d); \
} \
(a)->n=0; \
} while(0)
#define utarray_new(a,_icd) do { \
a=(UT_array*)malloc(sizeof(UT_array)); \
utarray_init(a,_icd); \
} while(0)
#define utarray_free(a) do { \
utarray_done(a); \
free(a); \
} while(0)
#define utarray_reserve(a,by) do { \
if (((a)->i+by) > ((a)->n)) { \
while(((a)->i+by) > ((a)->n)) { (a)->n = ((a)->n ? (2*(a)->n) : 8); } \
if ( ((a)->d=(char*)realloc((a)->d, (a)->n*(a)->icd.sz)) == NULL) oom(); \
} \
} while(0)
#define utarray_push_back(a,p) do { \
utarray_reserve(a,1); \
if ((a)->icd.copy) { (a)->icd.copy( _utarray_eltptr(a,(a)->i++), p); } \
else { memcpy(_utarray_eltptr(a,(a)->i++), p, (a)->icd.sz); }; \
} while(0)
#define utarray_pop_back(a) do { \
if ((a)->icd.dtor) { (a)->icd.dtor( _utarray_eltptr(a,--((a)->i))); } \
else { (a)->i--; } \
} while(0)
#define utarray_extend_back(a) do { \
utarray_reserve(a,1); \
if ((a)->icd.init) { (a)->icd.init(_utarray_eltptr(a,(a)->i)); } \
else { memset(_utarray_eltptr(a,(a)->i),0,(a)->icd.sz); } \
(a)->i++; \
} while(0)
#define utarray_len(a) ((a)->i)
#define utarray_eltptr(a,j) (((j) < (a)->i) ? _utarray_eltptr(a,j) : NULL)
#define _utarray_eltptr(a,j) ((char*)((a)->d + ((a)->icd.sz*(j) )))
#define utarray_insert(a,p,j) do { \
if (j > (a)->i) utarray_resize(a,j); \
utarray_reserve(a,1); \
if ((j) < (a)->i) { \
memmove( _utarray_eltptr(a,(j)+1), _utarray_eltptr(a,j), \
((a)->i - (j))*((a)->icd.sz)); \
} \
if ((a)->icd.copy) { (a)->icd.copy( _utarray_eltptr(a,j), p); } \
else { memcpy(_utarray_eltptr(a,j), p, (a)->icd.sz); }; \
(a)->i++; \
} while(0)
#define utarray_inserta(a,w,j) do { \
if (utarray_len(w) == 0) break; \
if (j > (a)->i) utarray_resize(a,j); \
utarray_reserve(a,utarray_len(w)); \
if ((j) < (a)->i) { \
memmove(_utarray_eltptr(a,(j)+utarray_len(w)), \
_utarray_eltptr(a,j), \
((a)->i - (j))*((a)->icd.sz)); \
} \
if ((a)->icd.copy) { \
size_t _ut_i; \
for(_ut_i=0;_ut_i<(w)->i;_ut_i++) { \
(a)->icd.copy(_utarray_eltptr(a,j+_ut_i), _utarray_eltptr(w,_ut_i)); \
} \
} else { \
memcpy(_utarray_eltptr(a,j), _utarray_eltptr(w,0), \
utarray_len(w)*((a)->icd.sz)); \
} \
(a)->i += utarray_len(w); \
} while(0)
#define utarray_resize(dst,num) do { \
size_t _ut_i; \
if (dst->i > (size_t)(num)) { \
if ((dst)->icd.dtor) { \
for(_ut_i=num; _ut_i < dst->i; _ut_i++) { \
(dst)->icd.dtor(utarray_eltptr(dst,_ut_i)); \
} \
} \
} else if (dst->i < (size_t)(num)) { \
utarray_reserve(dst,num-dst->i); \
if ((dst)->icd.init) { \
for(_ut_i=dst->i; _ut_i < num; _ut_i++) { \
(dst)->icd.init(utarray_eltptr(dst,_ut_i)); \
} \
} else { \
memset(_utarray_eltptr(dst,dst->i),0,(dst)->icd.sz*(num-dst->i)); \
} \
} \
dst->i = num; \
} while(0)
#define utarray_concat(dst,src) do { \
utarray_inserta((dst),(src),utarray_len(dst)); \
} while(0)
#define utarray_erase(a,pos,len) do { \
if ((a)->icd.dtor) { \
size_t _ut_i; \
for(_ut_i=0; _ut_i < len; _ut_i++) { \
(a)->icd.dtor(utarray_eltptr((a),pos+_ut_i)); \
} \
} \
if ((a)->i > (pos+len)) { \
memmove( _utarray_eltptr((a),pos), _utarray_eltptr((a),pos+len), \
(((a)->i)-(pos+len))*((a)->icd.sz)); \
} \
(a)->i -= (len); \
} while(0)
#define utarray_renew(a,u) do { \
if (a) utarray_clear(a); \
else utarray_new((a),(u)); \
} while(0)
#define utarray_clear(a) do { \
if ((a)->i > 0) { \
if ((a)->icd.dtor) { \
size_t _ut_i; \
for(_ut_i=0; _ut_i < (a)->i; _ut_i++) { \
(a)->icd.dtor(utarray_eltptr(a,_ut_i)); \
} \
} \
(a)->i = 0; \
} \
} while(0)
#define utarray_sort(a,cmp) do { \
qsort((a)->d, (a)->i, (a)->icd.sz, cmp); \
} while(0)
#define utarray_find(a,v,cmp) bsearch((v),(a)->d,(a)->i,(a)->icd.sz,cmp)
#define utarray_front(a) (((a)->i) ? (_utarray_eltptr(a,0)) : NULL)
#define utarray_next(a,e) (((e)==NULL) ? utarray_front(a) : ((((a)->i) > (utarray_eltidx(a,e)+1)) ? _utarray_eltptr(a,utarray_eltidx(a,e)+1) : NULL))
#define utarray_prev(a,e) (((e)==NULL) ? utarray_back(a) : ((utarray_eltidx(a,e) > 0) ? _utarray_eltptr(a,utarray_eltidx(a,e)-1) : NULL))
#define utarray_back(a) (((a)->i) ? (_utarray_eltptr(a,(a)->i-1)) : NULL)
#define utarray_eltidx(a,e) (((char*)(e) >= (char*)((a)->d)) ? (((char*)(e) - (char*)((a)->d))/(ssize_t)(a)->icd.sz) : -1)
/* last we pre-define a few icd for common utarrays of ints and strings */
static void utarray_str_cpy(void *dst, const void *src) {
char **_src = (char**)src, **_dst = (char**)dst;
*_dst = (*_src == NULL) ? NULL : strdup(*_src);
}
static void utarray_str_dtor(void *elt) {
char **eltc = (char**)elt;
if (*eltc) free(*eltc);
}
static const UT_icd ut_str_icd _UNUSED_ = {sizeof(char*),NULL,utarray_str_cpy,utarray_str_dtor};
static const UT_icd ut_int_icd _UNUSED_ = {sizeof(int),NULL,NULL,NULL};
static const UT_icd ut_ptr_icd _UNUSED_ = {sizeof(void*),NULL,NULL,NULL};
#endif /* UTARRAY_H */
distance-master/cdistance/distance.h 0000644 0001750 0001750 00000001770 12243645633 016150 0 ustar jdg jdg #ifndef DISTANCE_H
#define DISTANCE_H
#include "Python.h"
#include "utarray.h"
// Debugging. This kills the interpreter if an assertion fails.
#ifdef DISTANCE_DEBUG
#undef NDEBUG
#include <assert.h>
#endif
// Compatibility Python 2 && 3
#if PY_MAJOR_VERSION < 3
#define PyBytes_Check PyString_Check
#define PyBytes_AS_STRING PyString_AS_STRING
#define PyBytes_GET_SIZE PyString_GET_SIZE
#define PyUnicode_GET_LENGTH PyUnicode_GET_SIZE
#endif
// Aliases for each sequence type
typedef Py_UNICODE unicode;
typedef char byte;
typedef PyObject array;
typedef union {
unicode *u;
byte *b;
array *a;
} sequence;
// Used in distance.c and some other files
#define SWAP(type, a, b) \
do { \
type a##_tmp = a; \
a = b; \
b = a##_tmp; \
} while (0)
// Used in lcsubstrings.c and distance.c for dynamic array
struct pair_t {
Py_ssize_t i;
Py_ssize_t j;
};
UT_icd pair_icd = {sizeof(struct pair_t), NULL, NULL, NULL};
#endif
distance-master/cdistance/lcsubstrings.c 0000644 0001750 0001750 00000002427 12243645633 017073 0 ustar jdg jdg #include "distance.h"
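// Longest common substrings via dynamic programming over a single column:
// column[j] holds the length of the common suffix of seq1[..i] and
// seq2[..j]. The positions of all substrings of maximal length are
// collected in a utarray of pair_t.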
static UT_array *
lcsubstrings(unicode *seq1, unicode *seq2,
Py_ssize_t len1, Py_ssize_t len2, Py_ssize_t *max_len)
{
Py_ssize_t i, j, mlen = -1;
Py_ssize_t old, last, *column;
UT_array *stack = NULL;
struct pair_t pos;
#ifdef SEQUENCE_COMP
int comp;
#endif
assert(len1 >= len2);
utarray_new(stack, &pair_icd);
if (len2 == 0) {
*max_len = 0;
return stack;
}
if ((column = (Py_ssize_t *)malloc((len2 + 1) * sizeof(Py_ssize_t))) == NULL)
goto On_Error;
last = 0;
for (j = 0; j < len2; j++)
column[j] = j;
for (i = 0; i < len1; i++) {
for (j = 0; j < len2; j++) {
old = column[j];
#ifdef SEQUENCE_COMP
comp = SEQUENCE_COMP(seq1, i, seq2, j);
if (comp == -1)
goto On_Error;
if (comp) {
#else
if (seq1[i] == seq2[j]) {
#endif
column[j] = ((i == 0 || j == 0) ? 1 : (last + 1));
if (column[j] > mlen) {
mlen = column[j];
pos.i = i;
pos.j = j;
utarray_clear(stack);
utarray_push_back(stack, &pos);
}
else if (column[j] == mlen) {
pos.i = i;
pos.j = j;
utarray_push_back(stack, &pos);
}
}
else
column[j] = 0;
last = old;
}
}
free(column);
*max_len = mlen;
return stack;
On_Error:
free(column);
utarray_free(stack);
return NULL;
}
distance-master/cdistance/hamming.c 0000644 0001750 0001750 00000000562 12243645633 015767 0 ustar jdg jdg #include "distance.h"
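// Count the positions at which two sequences of equal length differ.
// Returns -1 if an item comparison fails (array variant only).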
static Py_ssize_t
hamming(unicode *seq1, unicode *seq2, Py_ssize_t len)
{
Py_ssize_t i, dist = 0;
#ifdef SEQUENCE_COMP
int comp;
#endif
for (i = 0; i < len; i++) {
#ifdef SEQUENCE_COMP
comp = SEQUENCE_COMP(seq1, i, seq2, i);
if (comp == -1)
return -1;
if (!comp)
#else
if (seq1[i] != seq2[i])
#endif
dist++;
}
return dist;
}
distance-master/cdistance/levenshtein.c 0000644 0001750 0001750 00000006674 12243645633 016705 0 ustar jdg jdg #include "distance.h"
#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
#define MAX3(a, b, c) ((a) > (b) ? ((a) > (c) ? (a) : (c)) : ((b) > (c) ? (b) : (c)))
#ifndef LEVENSHTEIN_C
#define LEVENSHTEIN_C
static Py_ssize_t
minimum(const Py_ssize_t *column, Py_ssize_t len)
{
Py_ssize_t min;
assert(len > 0);
min = column[--len];
while (--len >= 0) {
if (column[len] < min)
min = column[len];
}
return min;
}
#endif
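// Wagner-Fischer with a single column of size min(len1, len2) + 1.
// When max_dist >= 0, the computation is aborted as soon as every cell
// of the current column exceeds max_dist, and -1 is returned. Returns
// -2 on memory allocation failure, -3 if an item comparison fails.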
static Py_ssize_t
levenshtein(unicode *seq1, unicode *seq2, Py_ssize_t len1, Py_ssize_t len2, Py_ssize_t max_dist)
{
Py_ssize_t i, j;
Py_ssize_t last, old;
Py_ssize_t cost, dist = -2;
Py_ssize_t *column;
#ifdef SEQUENCE_COMP
int comp;
#endif
if (len1 < len2) {
SWAP(unicode *, seq1, seq2);
SWAP(Py_ssize_t, len1, len2);
}
if (max_dist >= 0 && (len1 - len2) > max_dist)
return -1;
else {
if (len1 == 0)
return len2;
if (len2 == 0)
return len1;
}
if ((column = (Py_ssize_t *) malloc((len2 + 1) * sizeof(Py_ssize_t))) == NULL)
return -2;
for (j = 1 ; j <= len2; j++)
column[j] = j;
for (i = 1 ; i <= len1; i++) {
column[0] = i;
for (j = 1, last = i - 1; j <= len2; j++) {
old = column[j];
#ifdef SEQUENCE_COMP
comp = SEQUENCE_COMP(seq1, i - 1, seq2, j - 1);
if (comp == -1) {
free(column);
return -3;
}
cost = (!comp);
#else
cost = (seq1[i - 1] != seq2[j - 1]);
#endif
column[j] = MIN3(
column[j] + 1,
column[j - 1] + 1,
last + cost
);
last = old;
}
if (max_dist >= 0 && minimum(column, len2 + 1) > max_dist) {
free(column);
return -1;
}
}
dist = column[len2];
free(column);
if (max_dist >= 0 && dist > max_dist)
return -1;
return dist;
}
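// Normalized Levenshtein. method == 1 divides the plain distance by len1
// (the longer sequence, which is the length of the shortest alignment).
// method == 2 runs the same DP while also tracking, in `length`, the
// length of the longest alignment realizing each cell's distance, and
// divides by that.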
static double
nlevenshtein(unicode *seq1, unicode *seq2, Py_ssize_t len1, Py_ssize_t len2, short method)
{
Py_ssize_t i, j;
// distance
Py_ssize_t ic, dc, rc;
Py_ssize_t last, old;
Py_ssize_t *column;
Py_ssize_t fdist;
// length
Py_ssize_t lic, ldc, lrc;
Py_ssize_t llast, lold;
Py_ssize_t *length;
Py_ssize_t flen;
#ifdef SEQUENCE_COMP
int comp;
#endif
assert(len1 >= len2);
if (len1 == 0) // len2 is 0 too, so the two sequences are identical
return 0.0;
if (len2 == 0) // completely different
return 1.0;
if (method == 1) {
fdist = levenshtein(seq1, seq2, len1, len2, -1);
if (fdist < 0) // error
return fdist;
return fdist / (double)len1;
}
if ((column = (Py_ssize_t *)malloc((len2 + 1) * sizeof(Py_ssize_t))) == NULL)
return -1;
if ((length = (Py_ssize_t *)malloc((len2 + 1) * sizeof(Py_ssize_t))) == NULL) {
free(column);
return -1;
}
for (j = 1 ; j <= len2; j++)
column[j] = length[j] = j;
for (i = 1 ; i <= len1; i++) {
column[0] = length[0] = i;
for (j = 1, last = llast = i - 1; j <= len2; j++) {
// distance
old = column[j];
ic = column[j - 1] + 1;
dc = column[j] + 1;
#ifdef SEQUENCE_COMP
comp = SEQUENCE_COMP(seq1, i - 1, seq2, j - 1);
if (comp == -1) {
free(column);
free(length);
return -2;
}
rc = last + (!comp);
#else
rc = last + (seq1[i - 1] != seq2[j - 1]);
#endif
column[j] = MIN3(ic, dc, rc);
last = old;
// length
lold = length[j];
lic = (ic == column[j] ? length[j - 1] + 1 : 0);
ldc = (dc == column[j] ? length[j] + 1 : 0);
lrc = (rc == column[j] ? llast + 1 : 0);
length[j] = MAX3(lic, ldc, lrc);
llast = lold;
}
}
fdist = column[len2];
flen = length[len2];
free(column);
free(length);
return fdist / (double)flen;
}