pax_global_header00006660000000000000000000000064141315661140014514gustar00rootroot0000000000000052 comment=d35c0e90fb4f4efae415a52d768590aa35018ae6 golang-github-derekparker-trie-0.0~git20200317.1fdf38b/000077500000000000000000000000001413156611400222155ustar00rootroot00000000000000golang-github-derekparker-trie-0.0~git20200317.1fdf38b/.deepsource.toml000066400000000000000000000003261413156611400253270ustar00rootroot00000000000000version = 1 test_patterns = ["*_test.go"] exclude_patterns = ["vendor/*"] [[analyzers]] name = "go" enabled = true [analyzers.meta] import_path = "github.com/derekparker/trie" dependencies_vendored = truegolang-github-derekparker-trie-0.0~git20200317.1fdf38b/.gitignore000066400000000000000000000000071413156611400242020ustar00rootroot00000000000000*.test golang-github-derekparker-trie-0.0~git20200317.1fdf38b/LICENSE000066400000000000000000000020671413156611400232270ustar00rootroot00000000000000The MIT License (MIT) Copyright (c) 2014 Derek Parker Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. golang-github-derekparker-trie-0.0~git20200317.1fdf38b/README.md000066400000000000000000000017101413156611400234730ustar00rootroot00000000000000[![GoDoc](https://godoc.org/github.com/derekparker/trie?status.svg)](https://godoc.org/github.com/derekparker/trie) # Trie Data structure and relevant algorithms for extremely fast prefix/fuzzy string searching. ## Usage Create a Trie with: ```Go t := trie.New() ``` Add Keys with: ```Go // Add can take in meta information which can be stored with the key. // i.e. you could store any information you would like to associate with // this particular key. t.Add("foobar", 1) ``` Find a key with: ```Go node, ok := t.Find("foobar") meta := node.Meta() // use meta with meta.(type) ``` Remove Keys with: ```Go t.Remove("foobar") ``` Prefix search with: ```Go t.PrefixSearch("foo") ``` Fast test for valid prefix: ```Go t.HasKeysWithPrefix("foo") ``` Fuzzy search with: ```Go t.FuzzySearch("fb") ``` ## Contributing Fork this repo and run tests with: go test Create a feature branch, write your tests and code and submit a pull request. ## License MIT golang-github-derekparker-trie-0.0~git20200317.1fdf38b/fixtures/000077500000000000000000000000001413156611400240665ustar00rootroot00000000000000golang-github-derekparker-trie-0.0~git20200317.1fdf38b/fixtures/test.txt000066400000000000000000000000141413156611400256010ustar00rootroot00000000000000foo bar baz golang-github-derekparker-trie-0.0~git20200317.1fdf38b/trie.go000066400000000000000000000133601413156611400235120ustar00rootroot00000000000000// Implementation of an R-Way Trie data structure. // // A Trie has a root Node which is the base of the tree. // Each subsequent Node has a letter and children, which are // nodes that have letter values associated with them. package trie import ( "sort" "sync" ) type Node struct { val rune path string term bool depth int meta interface{} mask uint64 parent *Node children map[rune]*Node termCount int } type Trie struct { mu sync.Mutex root *Node size int } type ByKeys []string func (a ByKeys) Len() int { return len(a) } func (a ByKeys) Swap(i, j int) { a[i], a[j] = a[j], a[i] } func (a ByKeys) Less(i, j int) bool { return len(a[i]) < len(a[j]) } const nul = 0x0 // Creates a new Trie with an initialized root Node. func New() *Trie { return &Trie{ root: &Node{children: make(map[rune]*Node), depth: 0}, size: 0, } } // Returns the root node for the Trie. func (t *Trie) Root() *Node { return t.root } // Adds the key to the Trie, including meta data. Meta data // is stored as `interface{}` and must be type cast by // the caller. func (t *Trie) Add(key string, meta interface{}) *Node { t.mu.Lock() t.size++ runes := []rune(key) bitmask := maskruneslice(runes) node := t.root node.mask |= bitmask node.termCount++ for i := range runes { r := runes[i] bitmask = maskruneslice(runes[i:]) if n, ok := node.children[r]; ok { node = n node.mask |= bitmask } else { node = node.NewChild(r, "", bitmask, nil, false) } node.termCount++ } node = node.NewChild(nul, key, 0, meta, true) t.mu.Unlock() return node } // Finds and returns meta data associated // with `key`. func (t *Trie) Find(key string) (*Node, bool) { node := findNode(t.Root(), []rune(key)) if node == nil { return nil, false } node, ok := node.Children()[nul] if !ok || !node.term { return nil, false } return node, true } func (t *Trie) HasKeysWithPrefix(key string) bool { node := findNode(t.Root(), []rune(key)) return node != nil } // Removes a key from the trie, ensuring that // all bitmasks up to root are appropriately recalculated. func (t *Trie) Remove(key string) { var ( i int rs = []rune(key) node = findNode(t.Root(), []rune(key)) ) t.mu.Lock() t.size-- for n := node.Parent(); n != nil; n = n.Parent() { i++ if len(n.Children()) > 1 { r := rs[len(rs)-i] n.RemoveChild(r) break } } t.mu.Unlock() } // Returns all the keys currently stored in the trie. func (t *Trie) Keys() []string { if t.size == 0 { return []string{} } return t.PrefixSearch("") } // Performs a fuzzy search against the keys in the trie. func (t Trie) FuzzySearch(pre string) []string { keys := fuzzycollect(t.Root(), []rune(pre)) sort.Sort(ByKeys(keys)) return keys } // Performs a prefix search against the keys in the trie. func (t Trie) PrefixSearch(pre string) []string { node := findNode(t.Root(), []rune(pre)) if node == nil { return nil } return collect(node) } // Creates and returns a pointer to a new child for the node. func (parent *Node) NewChild(val rune, path string, bitmask uint64, meta interface{}, term bool) *Node { node := &Node{ val: val, path: path, mask: bitmask, term: term, meta: meta, parent: parent, children: make(map[rune]*Node), depth: parent.depth + 1, } parent.children[node.val] = node parent.mask |= bitmask return node } func (n *Node) RemoveChild(r rune) { delete(n.children, r) for nd := n.parent; nd != nil; nd = nd.parent { nd.mask ^= nd.mask nd.mask |= uint64(1) << uint64(nd.val-'a') for _, c := range nd.children { nd.mask |= c.mask } } } // Returns the parent of this node. func (n Node) Parent() *Node { return n.parent } // Returns the meta information of this node. func (n Node) Meta() interface{} { return n.meta } // Returns the children of this node. func (n Node) Children() map[rune]*Node { return n.children } func (n Node) Terminating() bool { return n.term } func (n Node) Val() rune { return n.val } func (n Node) Depth() int { return n.depth } // Returns a uint64 representing the current // mask of this node. func (n Node) Mask() uint64 { return n.mask } func findNode(node *Node, runes []rune) *Node { if node == nil { return nil } if len(runes) == 0 { return node } n, ok := node.Children()[runes[0]] if !ok { return nil } var nrunes []rune if len(runes) > 1 { nrunes = runes[1:] } else { nrunes = runes[0:0] } return findNode(n, nrunes) } func maskruneslice(rs []rune) uint64 { var m uint64 for _, r := range rs { m |= uint64(1) << uint64(r-'a') } return m } func collect(node *Node) []string { var ( n *Node i int ) keys := make([]string, 0, node.termCount) nodes := make([]*Node, 1, len(node.children)) nodes[0] = node for l := len(nodes); l != 0; l = len(nodes) { i = l - 1 n = nodes[i] nodes = nodes[:i] for _, c := range n.children { nodes = append(nodes, c) } if n.term { word := n.path keys = append(keys, word) } } return keys } type potentialSubtree struct { idx int node *Node } func fuzzycollect(node *Node, partial []rune) []string { if len(partial) == 0 { return collect(node) } var ( m uint64 i int p potentialSubtree keys []string ) potential := []potentialSubtree{potentialSubtree{node: node, idx: 0}} for l := len(potential); l > 0; l = len(potential) { i = l - 1 p = potential[i] potential = potential[:i] m = maskruneslice(partial[p.idx:]) if (p.node.mask & m) != m { continue } if p.node.val == partial[p.idx] { p.idx++ if p.idx == len(partial) { keys = append(keys, collect(p.node)...) continue } } for _, c := range p.node.children { potential = append(potential, potentialSubtree{node: c, idx: p.idx}) } } return keys } golang-github-derekparker-trie-0.0~git20200317.1fdf38b/trie_test.go000066400000000000000000000164371413156611400245610ustar00rootroot00000000000000package trie import ( "bufio" "log" "os" "sort" "testing" ) func addFromFile(t *Trie, path string) { file, err := os.Open(path) if err != nil { log.Fatal(err) } reader := bufio.NewScanner(file) for reader.Scan() { t.Add(reader.Text(), nil) } if reader.Err() != nil { log.Fatal(err) } } func TestTrieAdd(t *testing.T) { trie := New() n := trie.Add("foo", 1) if n.Meta().(int) != 1 { t.Errorf("Expected 1, got: %d", n.Meta().(int)) } } func TestTrieFind(t *testing.T) { trie := New() trie.Add("foo", 1) n, ok := trie.Find("foo") if ok != true { t.Fatal("Could not find node") } if n.Meta().(int) != 1 { t.Errorf("Expected 1, got: %d", n.Meta().(int)) } } func TestTrieFindMissingWithSubtree(t *testing.T) { trie := New() trie.Add("fooish", 1) trie.Add("foobar", 1) n, ok := trie.Find("foo") if ok != false { t.Errorf("Expected ok to be false") } if n != nil { t.Errorf("Expected nil, got: %v", n) } } func TestTrieHasKeysWithPrefix(t *testing.T) { trie := New() trie.Add("fooish", 1) trie.Add("foobar", 1) testcases := []struct { key string expected bool }{ {"foobar", true}, {"foo", true}, {"fool", false}, } for _, testcase := range testcases { if trie.HasKeysWithPrefix(testcase.key) != testcase.expected { t.Errorf("HasKeysWithPrefix(\"%s\"): expected result to be %t", testcase.key, testcase.expected) } } } func TestTrieFindMissing(t *testing.T) { trie := New() n, ok := trie.Find("foo") if ok != false { t.Errorf("Expected ok to be false") } if n != nil { t.Errorf("Expected nil, got: %v", n) } } func TestRemove(t *testing.T) { trie := New() initial := []string{"football", "foostar", "foosball"} for _, key := range initial { trie.Add(key, nil) } trie.Remove("foosball") keys := trie.Keys() if len(keys) != 2 { t.Errorf("Expected 2 keys got %d", len(keys)) } for _, k := range keys { if k != "football" && k != "foostar" { t.Errorf("key was: %s", k) } } keys = trie.FuzzySearch("foo") if len(keys) != 2 { t.Errorf("Expected 2 keys got %d", len(keys)) } for _, k := range keys { if k != "football" && k != "foostar" { t.Errorf("Expected football got: %#v", k) } } } func TestTrieKeys(t *testing.T) { tableTests := []struct { name string expectedKeys []string }{ {"Two", []string{"bar", "foo"}}, {"One", []string{"foo"}}, {"Empty", []string{}}, } for _, test := range tableTests { t.Run(test.name, func(t *testing.T) { trie := New() for _, key := range test.expectedKeys { trie.Add(key, nil) } keys := trie.Keys() if len(keys) != len(test.expectedKeys) { t.Errorf("Expected %v keys, got %d, keys were: %v", len(test.expectedKeys), len(keys), trie.Keys()) } sort.Strings(keys) for i, key := range keys { if key != test.expectedKeys[i] { t.Errorf("Expected %#v, got %#v", test.expectedKeys[i], key) } } }) } } func TestPrefixSearch(t *testing.T) { trie := New() expected := []string{ "foo", "foosball", "football", "foreboding", "forementioned", "foretold", "foreverandeverandeverandever", "forbidden", } defer func() { r := recover() if r != nil { t.Error(r) } }() trie.Add("bar", nil) for _, key := range expected { trie.Add(key, nil) } tests := []struct { pre string expected []string length int }{ {"fo", expected, len(expected)}, {"foosbal", []string{"foosball"}, 1}, {"abc", []string{}, 0}, } for _, test := range tests { actual := trie.PrefixSearch(test.pre) sort.Strings(actual) sort.Strings(test.expected) if len(actual) != test.length { t.Errorf("Expected len(actual) to == %d for pre %s", test.length, test.pre) } for i, key := range actual { if key != test.expected[i] { t.Errorf("Expected %v got: %v", test.expected[i], key) } } } trie.PrefixSearch("fsfsdfasdf") } func TestFuzzySearch(t *testing.T) { setup := []string{ "foosball", "football", "bmerica", "ked", "kedlock", "frosty", "bfrza", "foo/bart/baz.go", } tests := []struct { partial string length int }{ {"fsb", 1}, {"footbal", 1}, {"football", 1}, {"fs", 2}, {"oos", 1}, {"kl", 1}, {"ft", 3}, {"fy", 1}, {"fz", 2}, {"a", 5}, {"", 8}, {"zzz", 0}, } trie := New() for _, key := range setup { trie.Add(key, nil) } for _, test := range tests { t.Run(test.partial, func(t *testing.T) { actual := trie.FuzzySearch(test.partial) if len(actual) != test.length { t.Errorf("Expected len(actual) to == %d, was %d for %s actual was %#v", test.length, len(actual), test.partial, actual) } }) } } func TestFuzzySearchSorting(t *testing.T) { trie := New() setup := []string{ "foosball", "football", "bmerica", "ked", "kedlock", "frosty", "bfrza", "foo/bart/baz.go", } for _, key := range setup { trie.Add(key, nil) } actual := trie.FuzzySearch("fz") expected := []string{"bfrza", "foo/bart/baz.go"} if len(actual) != len(expected) { t.Fatalf("expected len %d got %d", len(expected), len(actual)) } for i, v := range expected { if actual[i] != v { t.Errorf("Expected %s got %s", v, actual[i]) } } } func BenchmarkTieKeys(b *testing.B) { trie := New() keys := []string{"bar", "foo", "baz", "bur", "zum", "burzum", "bark", "barcelona", "football", "foosball", "footlocker"} for _, key := range keys { trie.Add(key, nil) } b.ResetTimer() for i := 0; i < b.N; i++ { trie.Keys() } } func BenchmarkPrefixSearch(b *testing.B) { trie := New() addFromFile(trie, "/usr/share/dict/words") b.ResetTimer() for i := 0; i < b.N; i++ { _ = trie.PrefixSearch("fo") } } func BenchmarkFuzzySearch(b *testing.B) { trie := New() addFromFile(trie, "/usr/share/dict/words") b.ResetTimer() for i := 0; i < b.N; i++ { _ = trie.FuzzySearch("fs") } } func BenchmarkBuildTree(b *testing.B) { for i := 0; i < b.N; i++ { trie := New() addFromFile(trie, "/usr/share/dict/words") } } func TestSupportChinese(t *testing.T) { trie := New() expected := []string{"苹果 沂水县", "苹果", "大蒜", "大豆"} for _, key := range expected { trie.Add(key, nil) } tests := []struct { pre string expected []string length int }{ {"苹", expected[:2], len(expected[:2])}, {"大", expected[2:], len(expected[2:])}, {"大蒜", []string{"大蒜"}, 1}, } for _, test := range tests { actual := trie.PrefixSearch(test.pre) sort.Strings(actual) sort.Strings(test.expected) if len(actual) != test.length { t.Errorf("Expected len(actual) to == %d for pre %s", test.length, test.pre) } for i, key := range actual { if key != test.expected[i] { t.Errorf("Expected %v got: %v", test.expected[i], key) } } } } func BenchmarkAdd(b *testing.B) { f, err := os.Open("/usr/share/dict/words") if err != nil { b.Fatal("couldn't open bag of words") } defer f.Close() scanner := bufio.NewScanner(f) var words []string for scanner.Scan() { word := scanner.Text() words = append(words, word) } b.ResetTimer() for i := 0; i < b.N; i++ { trie := New() for k := range words { trie.Add(words[k], nil) } } } func BenchmarkAddRemove(b *testing.B) { words := []string{"AAAA1", "AAAA2", "ABAA1", "AABA1", "ABAA2"} b.ResetTimer() for i := 0; i < b.N; i++ { trie := New() for k := range words { trie.Add(words[k], nil) } for k := range words { trie.Remove(words[k]) } } }