pax_global_header00006660000000000000000000000064137124507260014521gustar00rootroot0000000000000052 comment=616b0685748089a3ee9b532ea6d73ed5a46267b4 golang-levenshtein-1.0.1/000077500000000000000000000000001371245072600153115ustar00rootroot00000000000000golang-levenshtein-1.0.1/.gitignore000066400000000000000000000003741371245072600173050ustar00rootroot00000000000000# Compiled Object files, Static and Dynamic libs (Shared Objects) *.o *.a *.so # Folders _obj _test # Architecture specific extensions/prefixes *.[568vq] [568vq].out *.cgo1.go *.cgo2.c _cgo_defun.c _cgo_gotypes.go _cgo_export.* _testmain.go *.exe golang-levenshtein-1.0.1/LICENSE000066400000000000000000000021101371245072600163100ustar00rootroot00000000000000The MIT License (MIT) Copyright (c) 2013 Kilian Evang and contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. golang-levenshtein-1.0.1/README.md000066400000000000000000000011241371245072600165660ustar00rootroot00000000000000golang-levenshtein ================== An implementation of the Levenshtein algorithm in Go. Provides edit distances, edit scripts and ratios for strings (slices of runes). Installation ------------ $ go get github.com/texttheater/golang-levenshtein/levenshtein Documentation ------------- The documentation can be viewed online here: https://godoc.org/github.com/texttheater/golang-levenshtein/levenshtein See also -------- For a package that is similar but more generic and provides more control, check out Daniël de Kok’s [editdistance](https://github.com/danieldk/editdistance). golang-levenshtein-1.0.1/go.mod000066400000000000000000000000721371245072600164160ustar00rootroot00000000000000module github.com/texttheater/golang-levenshtein go 1.13 golang-levenshtein-1.0.1/levenshtein/000077500000000000000000000000001371245072600176355ustar00rootroot00000000000000golang-levenshtein-1.0.1/levenshtein/levenshtein.go000066400000000000000000000177521371245072600225240ustar00rootroot00000000000000// This package implements the Levenshtein algorithm for computing the // similarity between two strings. The central function is MatrixForStrings, // which computes the Levenshtein matrix. The functions DistanceForMatrix, // EditScriptForMatrix and RatioForMatrix read various interesting properties // off the matrix. The package also provides the convenience functions // DistanceForStrings, EditScriptForStrings and RatioForStrings for going // directly from two strings to the property of interest. package levenshtein import ( "fmt" "io" "os" ) type EditOperation int const ( Ins = iota Del Sub Match ) type EditScript []EditOperation type MatchFunction func(rune, rune) bool // IdenticalRunes is the default MatchFunction: it checks whether two runes are // identical. func IdenticalRunes(a rune, b rune) bool { return a == b } type Options struct { InsCost int DelCost int SubCost int Matches MatchFunction } // DefaultOptions is the default options without substitution: insertion cost // is 1, deletion cost is 1, substitution cost is 2 (meaning insert and delete // will be used instead), and two runes match iff they are identical. var DefaultOptions Options = Options{ InsCost: 1, DelCost: 1, SubCost: 2, Matches: IdenticalRunes, } // DefaultOptionsWithSub is the default options with substitution: insertion // cost is 1, deletion cost is 1, substitution cost is 1, and two runes match // iff they are identical. var DefaultOptionsWithSub Options = Options{ InsCost: 1, DelCost: 1, SubCost: 1, Matches: IdenticalRunes, } func (operation EditOperation) String() string { if operation == Match { return "match" } else if operation == Ins { return "ins" } else if operation == Sub { return "sub" } return "del" } // DistanceForStrings returns the edit distance between source and target. // // It has a runtime proportional to len(source) * len(target) and memory use // proportional to len(target). func DistanceForStrings(source []rune, target []rune, op Options) int { // Note: This algorithm is a specialization of MatrixForStrings. // MatrixForStrings returns the full edit matrix. However, we only need a // single value (see DistanceForMatrix) and the main loop of the algorithm // only uses the current and previous row. As such we create a 2D matrix, // but with height 2 (enough to store current and previous row). height := len(source) + 1 width := len(target) + 1 matrix := make([][]int, 2) // Initialize trivial distances (from/to empty string). That is, fill // the left column and the top row with row/column indices multiplied // by deletion/insertion cost. for i := 0; i < 2; i++ { matrix[i] = make([]int, width) matrix[i][0] = i * op.DelCost } for j := 1; j < width; j++ { matrix[0][j] = j * op.InsCost } // Fill in the remaining cells: for each prefix pair, choose the // (edit history, operation) pair with the lowest cost. for i := 1; i < height; i++ { cur := matrix[i%2] prev := matrix[(i-1)%2] cur[0] = i * op.DelCost for j := 1; j < width; j++ { delCost := prev[j] + op.DelCost matchSubCost := prev[j-1] if !op.Matches(source[i-1], target[j-1]) { matchSubCost += op.SubCost } insCost := cur[j-1] + op.InsCost cur[j] = min(delCost, min(matchSubCost, insCost)) } } return matrix[(height-1)%2][width-1] } // DistanceForMatrix reads the edit distance off the given Levenshtein matrix. func DistanceForMatrix(matrix [][]int) int { return matrix[len(matrix)-1][len(matrix[0])-1] } // RatioForStrings returns the Levenshtein ratio for the given strings. The // ratio is computed as follows: // // (sourceLength + targetLength - distance) / (sourceLength + targetLength) func RatioForStrings(source []rune, target []rune, op Options) float64 { matrix := MatrixForStrings(source, target, op) return RatioForMatrix(matrix) } // RatioForMatrix returns the Levenshtein ratio for the given matrix. The ratio // is computed as follows: // // (sourceLength + targetLength - distance) / (sourceLength + targetLength) func RatioForMatrix(matrix [][]int) float64 { sourcelength := len(matrix) - 1 targetlength := len(matrix[0]) - 1 sum := sourcelength + targetlength if sum == 0 { return 0 } dist := DistanceForMatrix(matrix) return float64(sum-dist) / float64(sum) } // MatrixForStrings generates a 2-D array representing the dynamic programming // table used by the Levenshtein algorithm, as described e.g. here: // http://www.let.rug.nl/kleiweg/lev/ // The reason for putting the creation of the table into a separate function is // that it cannot only be used for reading of the edit distance between two // strings, but also e.g. to backtrace an edit script that provides an // alignment between the characters of both strings. func MatrixForStrings(source []rune, target []rune, op Options) [][]int { // Make a 2-D matrix. Rows correspond to prefixes of source, columns to // prefixes of target. Cells will contain edit distances. // Cf. http://www.let.rug.nl/~kleiweg/lev/levenshtein.html height := len(source) + 1 width := len(target) + 1 matrix := make([][]int, height) // Initialize trivial distances (from/to empty string). That is, fill // the left column and the top row with row/column indices multiplied // by deletion/insertion cost. for i := 0; i < height; i++ { matrix[i] = make([]int, width) matrix[i][0] = i * op.DelCost } for j := 1; j < width; j++ { matrix[0][j] = j * op.InsCost } // Fill in the remaining cells: for each prefix pair, choose the // (edit history, operation) pair with the lowest cost. for i := 1; i < height; i++ { for j := 1; j < width; j++ { delCost := matrix[i-1][j] + op.DelCost matchSubCost := matrix[i-1][j-1] if !op.Matches(source[i-1], target[j-1]) { matchSubCost += op.SubCost } insCost := matrix[i][j-1] + op.InsCost matrix[i][j] = min(delCost, min(matchSubCost, insCost)) } } //LogMatrix(source, target, matrix) return matrix } // EditScriptForStrings returns an optimal edit script to turn source into // target. func EditScriptForStrings(source []rune, target []rune, op Options) EditScript { return backtrace(len(source), len(target), MatrixForStrings(source, target, op), op) } // EditScriptForMatrix returns an optimal edit script based on the given // Levenshtein matrix. func EditScriptForMatrix(matrix [][]int, op Options) EditScript { return backtrace(len(matrix)-1, len(matrix[0])-1, matrix, op) } // WriteMatrix writes a visual representation of the given matrix for the given // strings to the given writer. func WriteMatrix(source []rune, target []rune, matrix [][]int, writer io.Writer) { fmt.Fprintf(writer, " ") for _, targetRune := range target { fmt.Fprintf(writer, " %c", targetRune) } fmt.Fprintf(writer, "\n") fmt.Fprintf(writer, " %2d", matrix[0][0]) for j, _ := range target { fmt.Fprintf(writer, " %2d", matrix[0][j+1]) } fmt.Fprintf(writer, "\n") for i, sourceRune := range source { fmt.Fprintf(writer, "%c %2d", sourceRune, matrix[i+1][0]) for j, _ := range target { fmt.Fprintf(writer, " %2d", matrix[i+1][j+1]) } fmt.Fprintf(writer, "\n") } } // LogMatrix writes a visual representation of the given matrix for the given // strings to os.Stderr. This function is deprecated, use // WriteMatrix(source, target, matrix, os.Stderr) instead. func LogMatrix(source []rune, target []rune, matrix [][]int) { WriteMatrix(source, target, matrix, os.Stderr) } func backtrace(i int, j int, matrix [][]int, op Options) EditScript { if i > 0 && matrix[i-1][j]+op.DelCost == matrix[i][j] { return append(backtrace(i-1, j, matrix, op), Del) } if j > 0 && matrix[i][j-1]+op.InsCost == matrix[i][j] { return append(backtrace(i, j-1, matrix, op), Ins) } if i > 0 && j > 0 && matrix[i-1][j-1]+op.SubCost == matrix[i][j] { return append(backtrace(i-1, j-1, matrix, op), Sub) } if i > 0 && j > 0 && matrix[i-1][j-1] == matrix[i][j] { return append(backtrace(i-1, j-1, matrix, op), Match) } return []EditOperation{} } func min(a int, b int) int { if b < a { return b } return a } func max(a int, b int) int { if b > a { return b } return a } golang-levenshtein-1.0.1/levenshtein/levenshtein_test.go000066400000000000000000000124721371245072600235550ustar00rootroot00000000000000package levenshtein import ( "fmt" "os" "testing" ) var testCases = []struct { source string target string options Options distance int ratio float64 script EditScript }{ { source: "", target: "a", options: DefaultOptions, distance: 1, ratio: 0.0, script: EditScript{Ins}, }, { source: "a", target: "aa", options: DefaultOptions, distance: 1, ratio: 0.6666666666666666, script: EditScript{Match, Ins}, }, { source: "a", target: "aaa", options: DefaultOptions, distance: 2, ratio: 0.5, script: EditScript{Match, Ins, Ins}, }, { source: "", target: "", options: DefaultOptions, distance: 0, ratio: 0, script: EditScript{}, }, { source: "a", target: "b", options: DefaultOptions, distance: 2, ratio: 0, script: EditScript{Ins, Del}, }, { source: "aaa", target: "aba", options: DefaultOptions, distance: 2, ratio: 0.6666666666666666, script: EditScript{Match, Ins, Match, Del}, }, { source: "aaa", target: "ab", options: DefaultOptions, distance: 3, ratio: 0.4, script: EditScript{Match, Ins, Del, Del}, }, { source: "a", target: "a", options: DefaultOptions, distance: 0, ratio: 1, script: EditScript{Match}, }, { source: "ab", target: "ab", options: DefaultOptions, distance: 0, ratio: 1, script: EditScript{Match, Match}, }, { source: "a", target: "", options: DefaultOptions, distance: 1, ratio: 0, script: EditScript{Del}, }, { source: "aa", target: "a", options: DefaultOptions, distance: 1, ratio: 0.6666666666666666, script: EditScript{Match, Del}, }, { source: "aaa", target: "a", options: DefaultOptions, distance: 2, ratio: 0.5, script: EditScript{Match, Del, Del}, }, { source: "kitten", target: "sitting", options: DefaultOptions, distance: 5, ratio: 0.6153846153846154, script: EditScript{ Ins, Del, Match, Match, Match, Ins, Del, Match, Ins, }, }, { source: "kitten", target: "sitting", options: DefaultOptionsWithSub, distance: 3, ratio: 0.7692307692307693, script: EditScript{ Sub, Match, Match, Match, Sub, Match, Ins, }, }, { source: "Orange", target: "Apple", options: DefaultOptionsWithSub, distance: 5, ratio: 0.5454545454545454, script: EditScript{ Sub, Sub, Sub, Sub, Del, Match, }, }, { source: "me", target: "meme", options: Options{ InsCost: 2, DelCost: 1, SubCost: 3, Matches: IdenticalRunes, }, distance: 4, ratio: 0.3333333333333333, script: EditScript{ Match, Match, Ins, Ins, }, }, } func TestDistanceForStrings(t *testing.T) { for _, testCase := range testCases { distance := DistanceForStrings( []rune(testCase.source), []rune(testCase.target), testCase.options) if distance != testCase.distance { t.Log( "Distance between", testCase.source, "and", testCase.target, "computed as", distance, ", should be", testCase.distance) t.Fail() } // DistanceForMatrix(MatrixForStrings()) should calculate the same // value as DistanceForStrings. distance = DistanceForMatrix(MatrixForStrings( []rune(testCase.source), []rune(testCase.target), testCase.options)) if distance != testCase.distance { t.Log( "Distance between", testCase.source, "and", testCase.target, "computed as", distance, ", should be", testCase.distance) t.Fail() } } } func TestRatio(t *testing.T) { for _, testCase := range testCases { ratio := RatioForStrings( []rune(testCase.source), []rune(testCase.target), testCase.options) if ratio != testCase.ratio { t.Log( "Ratio between", testCase.source, "and", testCase.target, "computed as", ratio, ", should be", testCase.ratio) t.Fail() } } } func TestEditScriptForStrings(t *testing.T) { for _, testCase := range testCases { script := EditScriptForStrings( []rune(testCase.source), []rune(testCase.target), testCase.options) if !equal(script, testCase.script) { t.Log( "Edit script from", testCase.source, "to", testCase.target, "computed as", script, ", should be", testCase.script) t.Fail() } } } func equal(a, b EditScript) bool { for i := range a { if a[i] != b[i] { return false } } return true } func ExampleDistanceForStrings() { source := "a" target := "aa" distance := DistanceForStrings([]rune(source), []rune(target), DefaultOptions) fmt.Printf(`Distance between "%s" and "%s" computed as %d`, source, target, distance) // Output: Distance between "a" and "aa" computed as 1 } func ExampleWriteMatrix() { source := []rune("neighbor") target := []rune("Neighbour") matrix := MatrixForStrings(source, target, DefaultOptions) WriteMatrix(source, target, matrix, os.Stdout) // Output: // N e i g h b o u r // 0 1 2 3 4 5 6 7 8 9 // n 1 2 3 4 5 6 7 8 9 10 // e 2 3 2 3 4 5 6 7 8 9 // i 3 4 3 2 3 4 5 6 7 8 // g 4 5 4 3 2 3 4 5 6 7 // h 5 6 5 4 3 2 3 4 5 6 // b 6 7 6 5 4 3 2 3 4 5 // o 7 8 7 6 5 4 3 2 3 4 // r 8 9 8 7 6 5 4 3 4 3 }