text-metrics-0.3.2/0000755000000000000000000000000007346545000012335 5ustar0000000000000000text-metrics-0.3.2/CHANGELOG.md0000644000000000000000000000140107346545000014142 0ustar0000000000000000## Text Metrics 0.3.2 * Works with `text-2.0`. ## Text Metrics 0.3.1 * Fixed a bug in the implementation of Jaro-Winkler distance when two strings share a long prefix. [PR 21](https://github.com/mrkkrp/text-metrics/pull/21). * Dropped support for GHC 8.6 and older. ## Text Metrics 0.3.0 * All functions are now implemented in pure Haskell. * All functions return `Int` or `Ratio Int` instead of `Natural` and `Ratio Natural`. * Added `overlap` (returns overlap coefficient) and `jaccard` (returns Jaccard similarity coefficient). ## Text Metrics 0.2.0 * Made the `levenshtein`, `levenshteinNorm`, `damerauLevenshtein`, and `demerauLevenshtein` more efficient. * Added `jaro` and `jaroWinkler` functions. ## Text Metrics 0.1.0 * Initial release. text-metrics-0.3.2/Data/Text/0000755000000000000000000000000007346545000014132 5ustar0000000000000000text-metrics-0.3.2/Data/Text/Metrics.hs0000644000000000000000000003064607346545000016105 0ustar0000000000000000{-# LANGUAGE BangPatterns #-} {-# LANGUAGE MultiWayIf #-} -- | -- Module : Data.Text.Metrics -- Copyright : © 2016–present Mark Karpov -- License : BSD 3 clause -- -- Maintainer : Mark Karpov -- Stability : experimental -- Portability : portable -- -- The module provides efficient implementations of various strings metric -- algorithms. It works with strict 'Text' values. -- -- __Note__: before version /0.3.0/ the package used C implementations of -- the algorithms under the hood. Beginning from version /0.3.0/, the -- implementations are written in Haskell while staying almost as fast, see: -- -- module Data.Text.Metrics ( -- * Levenshtein variants levenshtein, levenshteinNorm, damerauLevenshtein, damerauLevenshteinNorm, -- * Treating inputs like sets overlap, jaccard, -- * Other hamming, jaro, jaroWinkler, ) where import Control.Monad import Control.Monad.ST import Data.Map.Strict (Map) import qualified Data.Map.Strict as M import Data.Ratio import Data.Text import qualified Data.Text as T import qualified Data.Text.Internal as T import qualified Data.Text.Unsafe as TU import qualified Data.Vector.Unboxed.Mutable as VUM import GHC.Exts (inline) ---------------------------------------------------------------------------- -- Levenshtein variants -- | Return the Levenshtein distance between two 'Text' values. The -- Levenshtein distance between two strings is the minimal number of -- operations necessary to transform one string into another. For the -- Levenshtein distance allowed operations are: deletion, insertion, and -- substitution. -- -- See also: . -- -- __Heads up__, before version /0.3.0/ this function returned -- 'Data.Numeric.Natural'. levenshtein :: Text -> Text -> Int levenshtein a b = fst (levenshtein_ a b) -- | Return the normalized Levenshtein distance between two 'Text' values. -- Result is a non-negative rational number (represented as @'Ratio' -- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the -- strings, while 1 means exact match. -- -- See also: . -- -- __Heads up__, before version /0.3.0/ this function returned @'Ratio' -- 'Data.Numeric.Natural'@. levenshteinNorm :: Text -> Text -> Ratio Int levenshteinNorm = norm levenshtein_ -- | An internal helper, returns the Levenshtein distance as the first -- element of the tuple and max length of the two inputs as the second -- element of the tuple. levenshtein_ :: Text -> Text -> (Int, Int) levenshtein_ a b | T.null a = (lenb, lenm) | T.null b = (lena, lenm) | otherwise = runST $ do let v_len = lenb + 1 v <- VUM.unsafeNew (v_len * 2) let gov !i = when (i < v_len) $ do VUM.unsafeWrite v i i gov (i + 1) goi !i !na !v0 !v1 = do let !(TU.Iter ai da) = TU.iter a na goj !j !nb = when (j < lenb) $ do let !(TU.Iter bj db) = TU.iter b nb cost = if ai == bj then 0 else 1 x <- (+ 1) <$> VUM.unsafeRead v (v1 + j) y <- (+ 1) <$> VUM.unsafeRead v (v0 + j + 1) z <- (+ cost) <$> VUM.unsafeRead v (v0 + j) VUM.unsafeWrite v (v1 + j + 1) (min x (min y z)) goj (j + 1) (nb + db) when (i < lena) $ do VUM.unsafeWrite v v1 (i + 1) goj 0 0 goi (i + 1) (na + da) v1 v0 gov 0 goi 0 0 0 v_len ld <- VUM.unsafeRead v (lenb + if even lena then 0 else v_len) return (ld, lenm) where lena = T.length a lenb = T.length b lenm = max lena lenb {-# INLINE levenshtein_ #-} -- | Return the Damerau-Levenshtein distance between two 'Text' values. The -- function works like 'levenshtein', but the collection of allowed -- operations also includes transposition of two /adjacent/ characters. -- -- See also: . -- -- __Heads up__, before version /0.3.0/ this function returned -- 'Data.Numeric.Natural'. damerauLevenshtein :: Text -> Text -> Int damerauLevenshtein a b = fst (damerauLevenshtein_ a b) -- | Return the normalized Damerau-Levenshtein distance between two 'Text' -- values. 0 signifies no similarity between the strings, while 1 means -- exact match. -- -- See also: . -- -- __Heads up__, before version /0.3.0/ this function returned @'Ratio' -- 'Data.Numeric.Natural'@. damerauLevenshteinNorm :: Text -> Text -> Ratio Int damerauLevenshteinNorm = norm damerauLevenshtein_ -- | An internal helper, returns the Damerau-Levenshtein distance as the -- first element of the tuple and max length of the two inputs as the second -- element of the tuple. damerauLevenshtein_ :: Text -> Text -> (Int, Int) damerauLevenshtein_ a b | T.null a = (lenb, lenm) | T.null b = (lena, lenm) | otherwise = runST $ do let v_len = lenb + 1 v <- VUM.unsafeNew (v_len * 3) let gov !i = when (i < v_len) $ do VUM.unsafeWrite v i i gov (i + 1) goi !i !na !ai_1 !v0 !v1 !v2 = do let !(TU.Iter ai da) = TU.iter a na goj !j !nb !bj_1 = when (j < lenb) $ do let !(TU.Iter bj db) = TU.iter b nb cost = if ai == bj then 0 else 1 x <- (+ 1) <$> VUM.unsafeRead v (v1 + j) y <- (+ 1) <$> VUM.unsafeRead v (v0 + j + 1) z <- (+ cost) <$> VUM.unsafeRead v (v0 + j) let g = min x (min y z) val <- (+ cost) <$> VUM.unsafeRead v (v2 + j - 1) VUM.unsafeWrite v (v1 + j + 1) $ if i > 0 && j > 0 && ai == bj_1 && ai_1 == bj && val < g then val else g goj (j + 1) (nb + db) bj when (i < lena) $ do VUM.unsafeWrite v v1 (i + 1) goj 0 0 'a' goi (i + 1) (na + da) ai v1 v2 v0 gov 0 goi 0 0 'a' 0 v_len (v_len * 2) ld <- VUM.unsafeRead v (lenb + (lena `mod` 3) * v_len) return (ld, lenm) where lena = T.length a lenb = T.length b lenm = max lena lenb {-# INLINE damerauLevenshtein_ #-} ---------------------------------------------------------------------------- -- Treating inputs like sets -- | Return the overlap coefficient for two 'Text' values. Returned value is -- in the range from 0 (no similarity) to 1 (exact match). Return 1 if both -- 'Text' values are empty. -- -- See also: . -- -- @since 0.3.0 overlap :: Text -> Text -> Ratio Int overlap a b = if d == 0 then 1 % 1 else intersectionSize (mkTextMap a) (mkTextMap b) % d where d = min (T.length a) (T.length b) -- | Return the Jaccard similarity coefficient for two 'Text' values. -- Returned value is in the range from 0 (no similarity) to 1 (exact match). -- Return 1 if both -- -- See also: -- -- @since 0.3.0 jaccard :: Text -> Text -> Ratio Int jaccard a b = if d == 0 then 1 % 1 else intersectionSize ma mb % d where ma = mkTextMap a mb = mkTextMap b d = unionSize ma mb -- | Make a map from 'Char' to 'Int' representing how many times the 'Char' -- appears in the input 'Text'. mkTextMap :: Text -> Map Char Int mkTextMap = T.foldl' f M.empty where f m ch = M.insertWith (+) ch 1 m {-# INLINE mkTextMap #-} -- | Return intersection size between two 'Text'-maps. intersectionSize :: Map Char Int -> Map Char Int -> Int intersectionSize a b = M.foldl' (+) 0 (M.intersectionWith min a b) {-# INLINE intersectionSize #-} -- | Return union size between two 'Text'-maps. unionSize :: Map Char Int -> Map Char Int -> Int unionSize a b = M.foldl' (+) 0 (M.unionWith max a b) {-# INLINE unionSize #-} ---------------------------------------------------------------------------- -- Other -- | /O(n)/ Return the Hamming distance between two 'Text' values. Hamming -- distance is defined as the number of positions at which the corresponding -- symbols are different. The input 'Text' values should be of equal length -- or 'Nothing' will be returned. -- -- See also: . -- -- __Heads up__, before version /0.3.0/ this function returned @'Maybe' -- 'Data.Numeric.Natural'@. hamming :: Text -> Text -> Maybe Int hamming a@(T.Text _ _ len) b = if T.length a == T.length b then Just (go 0 0 0) else Nothing where go !na !nb !r = let !(TU.Iter cha da) = TU.iter a na !(TU.Iter chb db) = TU.iter b nb in if | na == len -> r | cha /= chb -> go (na + da) (nb + db) (r + 1) | otherwise -> go (na + da) (nb + db) r -- | Return the Jaro distance between two 'Text' values. Returned value is -- in the range from 0 (no similarity) to 1 (exact match). -- -- While the algorithm is pretty clear for artificial examples (like those -- from the linked Wikipedia article), for /arbitrary/ strings, it may be -- hard to decide which of two strings should be considered as one having -- “reference” order of characters (order of matching characters in an -- essential part of the definition of the algorithm). This makes us -- consider the first string the “reference” string (with correct order of -- characters). Thus generally, -- -- > jaro a b ≠ jaro b a -- -- This asymmetry can be found in all implementations of the algorithm on -- the internet, AFAIK. -- -- See also: -- -- @since 0.2.0 -- -- __Heads up__, before version /0.3.0/ this function returned @'Ratio' -- 'Data.Numeric.Natural'@. jaro :: Text -> Text -> Ratio Int jaro a b = if T.null a || T.null b then 0 % 1 else runST $ do let lena = T.length a lenb = T.length b d = if lena >= 2 && lenb >= 2 then max lena lenb `quot` 2 - 1 else 0 v <- VUM.replicate lenb (0 :: Int) r <- VUM.replicate 3 (0 :: Int) -- tj, m, t let goi !i !na !fromb = do let !(TU.Iter ai da) = TU.iter a na (from, fromb') = if i >= d then (i - d, fromb + TU.iter_ b fromb) else (0, 0) to = min (i + d + 1) lenb goj !j !nb = when (j < to) $ do let !(TU.Iter bj db) = TU.iter b nb used <- (== 1) <$> VUM.unsafeRead v j if not used && ai == bj then do tj <- VUM.unsafeRead r 0 if j < tj then VUM.unsafeModify r (+ 1) 2 else VUM.unsafeWrite r 0 j VUM.unsafeWrite v j 1 VUM.unsafeModify r (+ 1) 1 else goj (j + 1) (nb + db) when (i < lena) $ do goj from fromb goi (i + 1) (na + da) fromb' goi 0 0 0 m <- VUM.unsafeRead r 1 t <- VUM.unsafeRead r 2 return $ if m == 0 then 0 % 1 else ( (m % lena) + (m % lenb) + ((m - t) % m) ) / 3 -- | Return the Jaro-Winkler distance between two 'Text' values. Returned -- value is in range from 0 (no similarity) to 1 (exact match). -- -- See also: -- -- @since 0.2.0 -- -- __Heads up__, before version /0.3.0/ this function returned @'Ratio' -- 'Data.Numeric.Natural'@. jaroWinkler :: Text -> Text -> Ratio Int jaroWinkler a b = dj + (1 % 10) * l * (1 - dj) where dj = inline (jaro a b) l = fromIntegral (min 4 (commonPrefix a b)) -- | Return the length of the common prefix two 'Text' values have. commonPrefix :: Text -> Text -> Int commonPrefix a b = case T.commonPrefixes a b of Nothing -> 0 Just (pref, _, _) -> T.length pref {-# INLINE commonPrefix #-} ---------------------------------------------------------------------------- -- Helpers norm :: (Text -> Text -> (Int, Int)) -> Text -> Text -> Ratio Int norm f a b = let (r, l) = f a b in if r == 0 then 1 % 1 else 1 % 1 - r % l {-# INLINE norm #-} text-metrics-0.3.2/LICENSE.md0000644000000000000000000000265607346545000013752 0ustar0000000000000000Copyright © 2016–present Mark Karpov All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name Mark Karpov nor the names of contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. text-metrics-0.3.2/README.md0000644000000000000000000000521107346545000013613 0ustar0000000000000000# Text Metrics [![License BSD3](https://img.shields.io/badge/license-BSD3-brightgreen.svg)](http://opensource.org/licenses/BSD-3-Clause) [![Hackage](https://img.shields.io/hackage/v/text-metrics.svg?style=flat)](https://hackage.haskell.org/package/text-metrics) [![Stackage Nightly](http://stackage.org/package/text-metrics/badge/nightly)](http://stackage.org/nightly/package/text-metrics) [![Stackage LTS](http://stackage.org/package/text-metrics/badge/lts)](http://stackage.org/lts/package/text-metrics) ![CI](https://github.com/mrkkrp/text-metrics/workflows/CI/badge.svg?branch=master) The library provides efficient implementations of various strings metric algorithms. It works with strict `Text` values. The current version of the package implements: * [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) * [Normalized Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) * [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) * [Normalized Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) * [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance) * [Jaro distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) * [Jaro-Winkler distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) * [Overlap coefficient](https://en.wikipedia.org/wiki/Overlap_coefficient) * [Jaccard similarity coefficient](https://en.wikipedia.org/wiki/Jaccard_index) ## Comparison with the `edit-distance` package There is [`edit-distance`](https://hackage.haskell.org/package/edit-distance) package whose scope overlaps with the scope of this package. The differences are: * `edit-distance` allows to specify costs for every operation when calculating Levenshtein distance (insertion, deletion, substitution, and transposition). This is rarely needed though in real-world applications, IMO. * `edit-distance` only provides Levenshtein distance, `text-metrics` aims to provide implementations of most string metrics algorithms. * `edit-distance` works on `Strings`, while `text-metrics` works on strict `Text` values. ## Implementation Although we originally used C for speed, currently all functions are pure Haskell tuned for performance. See [this blog post](https://markkarpov.com/post/migrating-text-metrics.html) for more info. ## Contribution Issues, bugs, and questions may be reported in [the GitHub issue tracker for this project](https://github.com/mrkkrp/text-metrics/issues). Pull requests are also welcome. ## License Copyright © 2016–present Mark Karpov Distributed under BSD 3 clause license. text-metrics-0.3.2/Setup.hs0000644000000000000000000000012707346545000013771 0ustar0000000000000000module Main (main) where import Distribution.Simple main :: IO () main = defaultMain text-metrics-0.3.2/bench/memory/0000755000000000000000000000000007346545000014724 5ustar0000000000000000text-metrics-0.3.2/bench/memory/Main.hs0000644000000000000000000000214107346545000016142 0ustar0000000000000000module Main (main) where import Control.DeepSeq import Control.Monad import Data.Text (Text) import qualified Data.Text as T import Data.Text.Metrics import Weigh main :: IO () main = mainWith $ do setColumns [Case, Allocated, GCs, Max] bmetric "levenshtein" levenshtein bmetric "levenshteinNorm" levenshteinNorm bmetric "damerauLevenshtein" damerauLevenshtein bmetric "damerauLevenshteinNorm" damerauLevenshteinNorm bmetric "overlap" overlap bmetric "jaccard" jaccard bmetric "hamming" hamming bmetric "jaro" jaro bmetric "jaroWinkler" jaroWinkler -- | Perform a series to measurements with the same metric function. bmetric :: NFData a => -- | Name of the benchmark group String -> -- | The function to benchmark (Text -> Text -> a) -> Weigh () bmetric name f = forM_ stdSeries $ \n -> func (name ++ "/" ++ show n) (uncurry f) (testData n, testData n) -- | The series of lengths to try with every function as part of 'btmetric'. stdSeries :: [Int] stdSeries = [5, 10, 20, 40, 80, 160] testData :: Int -> Text testData n = T.pack . take n . drop (n `mod` 4) . cycle $ ['a' .. 'z'] text-metrics-0.3.2/bench/speed/0000755000000000000000000000000007346545000014514 5ustar0000000000000000text-metrics-0.3.2/bench/speed/Main.hs0000644000000000000000000000203507346545000015734 0ustar0000000000000000module Main (main) where import Control.DeepSeq import Criterion.Main import Data.Text (Text) import qualified Data.Text as T import Data.Text.Metrics main :: IO () main = defaultMain [ btmetric "levenshtein" levenshtein, btmetric "levenshteinNorm" levenshteinNorm, btmetric "damerauLevenshtein" damerauLevenshtein, btmetric "damerauLevenshteinNorm" damerauLevenshteinNorm, btmetric "overlap" overlap, btmetric "jaccard" jaccard, btmetric "hamming" hamming, btmetric "jaro" jaro, btmetric "jaroWinkler" jaroWinkler ] -- | Produce benchmark group to test. btmetric :: NFData a => String -> (Text -> Text -> a) -> Benchmark btmetric name f = bgroup name (bs <$> stdSeries) where bs n = env (return (testData n, testData n)) (bench (show n) . nf (uncurry f)) -- | The series of lengths to try with every function as part of 'btmetric'. stdSeries :: [Int] stdSeries = [5, 10, 20, 40, 80, 160] testData :: Int -> Text testData n = T.pack . take n . drop (n `mod` 4) . cycle $ ['a' .. 'z'] text-metrics-0.3.2/tests/0000755000000000000000000000000007346545000013477 5ustar0000000000000000text-metrics-0.3.2/tests/Main.hs0000644000000000000000000001322207346545000014717 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} {-# OPTIONS_GHC -fno-warn-orphans #-} module Main (main) where import Data.Ratio import Data.Text (Text) import qualified Data.Text as T import Data.Text.Metrics import Test.Hspec import Test.QuickCheck instance Arbitrary Text where arbitrary = T.pack <$> arbitrary main :: IO () main = hspec spec spec :: Spec spec = do describe "levenshtein" $ do testSwap levenshtein context "with concrete examples" $ do testPair levenshtein "kitten" "sitting" 3 testPair levenshtein "cake" "drake" 2 testPair levenshtein "saturday" "sunday" 3 testPair levenshtein "red" "wax" 3 testPair levenshtein "a😀c" "abc" 1 testPair levenshtein "lucky" "lucky" 0 testPair levenshtein "" "" 0 describe "levenshteinNorm" $ do testSwap levenshteinNorm testPair levenshteinNorm "kitten" "sitting" (4 % 7) testPair levenshteinNorm "cake" "drake" (3 % 5) testPair levenshteinNorm "saturday" "sunday" (5 % 8) testPair levenshteinNorm "red" "wax" (0 % 1) testPair levenshteinNorm "a😀c" "abc" (2 % 3) testPair levenshteinNorm "lucky" "lucky" (1 % 1) testPair levenshteinNorm "" "" (1 % 1) describe "damerauLevenshtein" $ do testSwap damerauLevenshtein testPair damerauLevenshtein "veryvery long" "very long" 4 testPair damerauLevenshtein "thing" "think" 1 testPair damerauLevenshtein "nose" "ones" 2 testPair damerauLevenshtein "thing" "sign" 3 testPair damerauLevenshtein "red" "wax" 3 testPair damerauLevenshtein "a😀c" "abc" 1 testPair damerauLevenshtein "lucky" "lucky" 0 testPair damerauLevenshtein "" "" 0 describe "damerauLevenshteinNorm" $ do testSwap damerauLevenshteinNorm testPair damerauLevenshteinNorm "veryvery long" "very long" (9 % 13) testPair damerauLevenshteinNorm "thing" "think" (4 % 5) testPair damerauLevenshteinNorm "nose" "ones" (1 % 2) testPair damerauLevenshteinNorm "thing" "sign" (2 % 5) testPair damerauLevenshteinNorm "red" "wax" (0 % 1) testPair damerauLevenshteinNorm "a😀c" "abc" (2 % 3) testPair damerauLevenshteinNorm "lucky" "lucky" (1 % 1) testPair damerauLevenshteinNorm "" "" (1 % 1) describe "hamming" $ do testSwap hamming testPair hamming "karolin" "kathrin" (Just 3) testPair hamming "karolin" "kerstin" (Just 3) testPair hamming "1011101" "1001001" (Just 2) testPair hamming "2173896" "2233796" (Just 3) testPair hamming "toned" "roses" (Just 3) testPair hamming "red" "wax" (Just 3) testPair hamming "a😀c" "abc" (Just 1) testPair hamming "lucky" "lucky" (Just 0) testPair hamming "" "" (Just 0) testPair hamming "small" "big" Nothing describe "jaro" $ do testPair jaro "aa" "a" (5 % 6) testPair jaro "a" "aa" (5 % 6) testPair jaro "martha" "marhta" (17 % 18) testPair jaro "marhta" "martha" (17 % 18) testPair jaro "dwayne" "duane" (37 % 45) testPair jaro "duane" "dwayne" (37 % 45) testPair jaro "dixon" "dicksonx" (23 % 30) testPair jaro "dicksonx" "dixon" (23 % 30) testPair jaro "jones" "johnson" (83 % 105) testPair jaro "johnson" "jones" (83 % 105) testPair jaro "brain" "brian" (14 % 15) testPair jaro "brian" "brain" (14 % 15) testPair jaro "five" "ten" (0 % 1) testPair jaro "ten" "five" (0 % 1) testPair jaro "lucky" "lucky" (1 % 1) testPair jaro "a😀c" "abc" (7 % 9) testPair jaro "" "" (0 % 1) describe "jaroWinkler" $ do testPair jaroWinkler "aa" "a" (17 % 20) testPair jaroWinkler "a" "aa" (17 % 20) testPair jaroWinkler "martha" "marhta" (173 % 180) testPair jaroWinkler "marhta" "martha" (173 % 180) testPair jaroWinkler "dwayne" "duane" (21 % 25) testPair jaroWinkler "duane" "dwayne" (21 % 25) testPair jaroWinkler "dixon" "dicksonx" (61 % 75) testPair jaroWinkler "dicksonx" "dixon" (61 % 75) testPair jaroWinkler "jones" "johnson" (437 % 525) testPair jaroWinkler "johnson" "jones" (437 % 525) testPair jaroWinkler "brain" "brian" (71 % 75) testPair jaroWinkler "brian" "brain" (71 % 75) testPair jaroWinkler "five" "ten" (0 % 1) testPair jaroWinkler "ten" "five" (0 % 1) testPair jaroWinkler "lucky" "lucky" (1 % 1) testPair jaroWinkler "a😀c" "abc" (4 % 5) testPair jaroWinkler "" "" (0 % 1) testPair jaroWinkler "aaaaaaaaaab" "aaaaaaaaaa" (54 % 55) testPair jaroWinkler "aaaaaaaaaaaaaaaaaaaab" "aaaaaaaaaaaaaaaaaaaa" (104 % 105) describe "overlap" $ do testSwap overlap testPair overlap "fly" "butterfly" (1 % 1) testPair overlap "night" "nacht" (3 % 5) testPair overlap "context" "contact" (5 % 7) testPair overlap "red" "wax" (0 % 1) testPair overlap "a😀c" "abc" (2 % 3) testPair overlap "lucky" "lucky" (1 % 1) describe "jaccard" $ do testSwap jaccard testPair jaccard "xxx" "xyx" (1 % 2) testPair jaccard "night" "nacht" (3 % 7) testPair jaccard "context" "contact" (5 % 9) testPair overlap "a😀c" "abc" (2 % 3) testPair jaccard "lucky" "lucky" (1 % 1) -- | Test that given function returns the same results when order of -- arguments is swapped. testSwap :: (Eq a, Show a) => (Text -> Text -> a) -> SpecWith () testSwap f = context "if we swap the arguments" $ it "produces the same result" $ property $ \a b -> f a b === f b a -- | Create spec for given metric function applying it to two 'Text' values -- and comparing the result with expected one. testPair :: (Eq a, Show a) => -- | Function to test (Text -> Text -> a) -> -- | First input Text -> -- | Second input Text -> -- | Expected result a -> SpecWith () testPair f a b r = it ("‘" ++ T.unpack a ++ "’ and ‘" ++ T.unpack b ++ "’") $ f a b `shouldBe` r text-metrics-0.3.2/text-metrics.cabal0000644000000000000000000000500307346545000015747 0ustar0000000000000000cabal-version: 2.4 name: text-metrics version: 0.3.2 license: BSD-3-Clause license-file: LICENSE.md maintainer: Mark Karpov author: Mark Karpov tested-with: ghc ==8.10.7 ghc ==9.0.1 ghc ==9.2.1 homepage: https://github.com/mrkkrp/text-metrics bug-reports: https://github.com/mrkkrp/text-metrics/issues synopsis: Calculate various string metrics efficiently description: Calculate various string metrics efficiently. category: Text, Algorithms build-type: Simple extra-doc-files: CHANGELOG.md README.md source-repository head type: git location: https://github.com/mrkkrp/text-metrics.git flag dev description: Turn on development settings. default: False manual: True library exposed-modules: Data.Text.Metrics default-language: Haskell2010 build-depends: base >=4.13 && <5.0, containers >=0.5 && <0.7, text >=0.2 && <2.1, vector >=0.11 && <0.13 if flag(dev) ghc-options: -Wall -Werror else ghc-options: -O2 -Wall test-suite tests type: exitcode-stdio-1.0 main-is: Main.hs hs-source-dirs: tests default-language: Haskell2010 build-depends: QuickCheck >=2.8 && <3.0, base >=4.13 && <5.0, hspec >=2.0 && <3.0, text >=0.2 && <2.1, text-metrics if flag(dev) ghc-options: -Wall -Werror else ghc-options: -O2 -Wall if flag(dev) ghc-options: -Wcompat -Wincomplete-record-updates -Wincomplete-uni-patterns -Wnoncanonical-monad-instances benchmark bench-speed type: exitcode-stdio-1.0 main-is: Main.hs hs-source-dirs: bench/speed default-language: Haskell2010 build-depends: base >=4.13 && <5.0, criterion >=0.6.2.1 && <1.6, deepseq >=1.3 && <1.5, text >=0.2 && <2.1, text-metrics if flag(dev) ghc-options: -O2 -Wall -Werror else ghc-options: -O2 -Wall benchmark bench-memory type: exitcode-stdio-1.0 main-is: Main.hs hs-source-dirs: bench/memory default-language: Haskell2010 build-depends: base >=4.13 && <5.0, deepseq >=1.3 && <1.5, text >=0.2 && <2.1, text-metrics, weigh >=0.0.4 if flag(dev) ghc-options: -O2 -Wall -Werror else ghc-options: -O2 -Wall