pax_global_header00006660000000000000000000000064132201221470014504gustar00rootroot0000000000000052 comment=5c06ee8586a1691fbfa230842170429e2b31cc05 minhash-lsh-1.0/000077500000000000000000000000001322012214700135575ustar00rootroot00000000000000minhash-lsh-1.0/.gitignore000066400000000000000000000004231322012214700155460ustar00rootroot00000000000000# Binaries for programs and plugins *.exe *.dll *.so *.dylib # Test binary, build with `go test -c` *.test # Output of the go coverage tool, specifically when used with LiteIDE *.out # Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736 .glide/ minhash-lsh-1.0/.travis.yml000066400000000000000000000001301322012214700156620ustar00rootroot00000000000000language: go go: - 1.6 - 1.7 - 1.8 - 1.9 - tip minhash-lsh-1.0/LICENSE000066400000000000000000000020511322012214700145620ustar00rootroot00000000000000MIT License Copyright (c) 2017 Eric Zhu Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. minhash-lsh-1.0/README.md000066400000000000000000000015671322012214700150470ustar00rootroot00000000000000# Minhash LSH in Golang [![Build Status](https://travis-ci.org/ekzhu/minhash-lsh.svg?branch=master)](https://travis-ci.org/ekzhu/minhash-lsh) [![GoDoc](https://godoc.org/github.com/ekzhu/minhash-lsh?status.svg)](https://godoc.org/github.com/ekzhu/minhash-lsh) [Documentation](https://godoc.org/github.com/ekzhu/minhash-lsh) Install: `go get github.com/ekzhu/minhash-lsh` ## Run Benchmark ### Set file format 1. One set per line 2. Each set, all items are separated by whitespaces 3. If the parameter firstItemIsID is set to true, the first itme is the unique ID of the set. 4. The rest of the items with the following format: `____` * value is an unique element of the set * frequency is an integer count of the occurance of value * `____` (4 underscores) is the separator ### All Pair Benchmark ``` minhash-lsh-all-pair -input ``` minhash-lsh-1.0/cmd/000077500000000000000000000000001322012214700143225ustar00rootroot00000000000000minhash-lsh-1.0/cmd/minhash-lsh-all-pair/000077500000000000000000000000001322012214700202345ustar00rootroot00000000000000minhash-lsh-1.0/cmd/minhash-lsh-all-pair/main.go000066400000000000000000000110511322012214700215050ustar00rootroot00000000000000package main import ( "bufio" "errors" "flag" "fmt" "os" "regexp" "strconv" "strings" "time" minhashlsh "github.com/ekzhu/minhash-lsh" ) var ( setFilename string minhashSeed int64 minhashSize int threshold float64 outputSelfPair bool hasID bool ) func main() { flag.StringVar(&setFilename, "input", "", "The set file as input") flag.Int64Var(&minhashSeed, "seed", 42, "The Minhash seed") flag.IntVar(&minhashSize, "sigsize", 128, "The Minhash signature size in number of hash functions") flag.Float64Var(&threshold, "threshold", 0.9, "The Jaccard similarity threshold") flag.BoolVar(&outputSelfPair, "selfpair", false, "Allow self-pair in results") flag.BoolVar(&hasID, "hasIDfield", true, "The input set file has ID field in the beginning of each line") flag.Parse() // Create Minhash signatures start := time.Now() sets := readSets(setFilename, hasID) setSigs := make([]setSig, 0) for setSig := range createSigantures(sets) { setSigs = append(setSigs, setSig) } signatureCreationTime := time.Now().Sub(start) fmt.Fprintf(os.Stderr, "Creating Minhash signature time: %.2f seconds\n", signatureCreationTime.Seconds()) // Indexing start = time.Now() lsh := minhashlsh.NewMinhashLSH(minhashSize, threshold) for _, s := range setSigs { lsh.Add(s.ID, s.signature) } lsh.Index() indexingTime := time.Now().Sub(start) fmt.Fprintf(os.Stderr, "Indexing time: %.2f seconds\n", indexingTime.Seconds()) // Querying and output results start = time.Now() pairs := make(chan pair) go func() { defer close(pairs) for _, s := range setSigs { for _, candidateID := range lsh.Query(s.signature) { if !outputSelfPair && candidateID == s.ID { continue } pairs <- pair{s.ID, candidateID.(string)} } } }() w := bufio.NewWriter(os.Stdout) for pair := range pairs { w.WriteString(pair.String() + "\n") } if err := w.Flush(); err != nil { panic(err) } searchTime := time.Now().Sub(start) fmt.Fprintf(os.Stderr, "All pair search time: %.2f seconds\n", searchTime.Seconds()) } func pointquery() { panic("Not implemented") } type valueCountPair struct { value string count int } var valueCountRegex = regexp.MustCompile(`^(?P.*)____(?P[0-9]+)$`) func (p *valueCountPair) Parse(str string) error { indexes := valueCountRegex.FindStringSubmatchIndex(str) if indexes == nil || len(indexes) != 6 { return errors.New("Incorrect value count pair detected: " + str) } p.value = str[indexes[2]:indexes[3]] var err error p.count, err = strconv.Atoi(str[indexes[4]:indexes[5]]) if err != nil { panic(str + "\n" + err.Error()) } return nil } type set struct { ID string values []string } // readSets takes a set file having the following format: // 1. One set per line // 2. Each set, all items are separated by whitespaces // 3. If the parameter firstItemIsID is set to true, // the first itme is the unique ID of the set. // 4. The rest of the items with the following format: // ____ // * value is an unique element of the set // * frequency is an integer count of the occurance of value // * ____ (4 underscores) is the separator func readSets(setFilename string, firstItemIsID bool) <-chan set { sets := make(chan set) go func() { defer close(sets) file, err := os.Open(setFilename) if err != nil { panic(err) } defer file.Close() scanner := bufio.NewScanner(file) scanner.Buffer(nil, 4096*1024*1024*8) var count int for scanner.Scan() { items := strings.Split(scanner.Text(), " ") var ID string if firstItemIsID { ID = items[0] items = items[1:] } else { ID = strconv.Itoa(count) } values := make([]string, len(items)) for i, item := range items { var pair valueCountPair if err := pair.Parse(item); err != nil { fmt.Println(items) panic(err) } values[i] = pair.value } sets <- set{ID, values} count++ } if err := scanner.Err(); err != nil { panic(err) } }() return sets } type setSig struct { ID string size int signature []uint64 } func createSigantures(sets <-chan set) <-chan setSig { out := make(chan setSig) go func() { defer close(out) for set := range sets { mh := minhashlsh.NewMinhash(minhashSeed, minhashSize) for _, v := range set.values { mh.Push([]byte(v)) } out <- setSig{set.ID, len(set.values), mh.Signature()} } }() return out } type pair struct { ID1 string ID2 string } func (p *pair) String() string { if p.ID1 <= p.ID2 { return fmt.Sprintf("%s, %s", p.ID1, p.ID2) } return fmt.Sprintf("%s, %s", p.ID2, p.ID1) } minhash-lsh-1.0/lsh.go000066400000000000000000000152261322012214700147020ustar00rootroot00000000000000package minhashlsh import ( "encoding/binary" "math" "sort" "sync" ) const ( integrationPrecision = 0.01 ) type hashKeyFunc func([]uint64) string func hashKeyFuncGen(hashValueSize int) hashKeyFunc { return func(sig []uint64) string { s := make([]byte, hashValueSize*len(sig)) buf := make([]byte, 8) for i, v := range sig { binary.LittleEndian.PutUint64(buf, v) copy(s[i*hashValueSize:(i+1)*hashValueSize], buf[:hashValueSize]) } return string(s) } } // Compute the integral of function f, lower limit a, upper limit l, and // precision defined as the quantize step func integral(f func(float64) float64, a, b, precision float64) float64 { var area float64 for x := a; x < b; x += precision { area += f(x+0.5*precision) * precision } return area } // Probability density function for false positive func falsePositive(l, k int) func(float64) float64 { return func(j float64) float64 { return 1.0 - math.Pow(1.0-math.Pow(j, float64(k)), float64(l)) } } // Probability density function for false negative func falseNegative(l, k int) func(float64) float64 { return func(j float64) float64 { return 1.0 - (1.0 - math.Pow(1.0-math.Pow(j, float64(k)), float64(l))) } } // Compute the cummulative probability of false negative given threshold t func probFalseNegative(l, k int, t, precision float64) float64 { return integral(falseNegative(l, k), t, 1.0, precision) } // Compute the cummulative probability of false positive given threshold t func probFalsePositive(l, k int, t, precision float64) float64 { return integral(falsePositive(l, k), 0, t, precision) } // optimalKL returns the optimal K and L for Jaccard similarity search, // and the false positive and negative probabilities. // t is the Jaccard similarity threshold. func optimalKL(numHash int, t float64) (optK, optL int, fp, fn float64) { minError := math.MaxFloat64 for l := 1; l <= numHash; l++ { for k := 1; k <= numHash; k++ { if l*k > numHash { continue } currFp := probFalsePositive(l, k, t, integrationPrecision) currFn := probFalseNegative(l, k, t, integrationPrecision) currErr := currFn + currFp if minError > currErr { minError = currErr optK = k optL = l fp = currFp fn = currFn } } } return } // NewMinhashLSH is the default constructor uses 32 bit hash value var NewMinhashLSH = NewMinhashLSH32 type keys []interface{} // For initial bootstrapping type initHashTable map[string]keys type bucket struct { hashKey string keys keys } type hashTable []bucket func (h hashTable) Len() int { return len(h) } func (h hashTable) Swap(i, j int) { h[i], h[j] = h[j], h[i] } func (h hashTable) Less(i, j int) bool { return h[i].hashKey < h[j].hashKey } // MinhashLSH represents a MinHash LSH implemented using LSH Forest // (http://ilpubs.stanford.edu:8090/678/1/2005-14.pdf). // It supports query-time setting of the MinHash LSH parameters // L (number of bands) and // K (number of hash functions per band). type MinhashLSH struct { k int l int initHashTables []initHashTable hashTables []hashTable hashKeyFunc hashKeyFunc hashValueSize int } func newMinhashLSH(threshold float64, numHash, hashValueSize int) *MinhashLSH { k, l, _, _ := optimalKL(numHash, threshold) hashTables := make([]hashTable, l) for i := range hashTables { hashTables[i] = make(hashTable, 0) } initHashTables := make([]initHashTable, l) for i := range initHashTables { initHashTables[i] = make(initHashTable) } return &MinhashLSH{ k: k, l: l, hashValueSize: hashValueSize, initHashTables: initHashTables, hashTables: hashTables, hashKeyFunc: hashKeyFuncGen(hashValueSize), } } // NewMinhashLSH64 uses 64-bit hash values. func NewMinhashLSH64(numHash int, threshold float64) *MinhashLSH { return newMinhashLSH(threshold, numHash, 8) } // NewMinhashLSH32 uses 32-bit hash values. // MinHash signatures with 64 bit hash values will have // their hash values trimed. func NewMinhashLSH32(numHash int, threshold float64) *MinhashLSH { return newMinhashLSH(threshold, numHash, 4) } // NewMinhashLSH16 uses 16-bit hash values. // MinHash signatures with 64 or 32 bit hash values will have // their hash values trimed. func NewMinhashLSH16(numHash int, threshold float64) *MinhashLSH { return newMinhashLSH(threshold, numHash, 2) } // Params returns the LSH parameters k and l func (f *MinhashLSH) Params() (k, l int) { return f.k, f.l } // Add a key with MinHash signature into the index. // The key won't be searchable until Index() is called. func (f *MinhashLSH) Add(key interface{}, sig []uint64) { // Generate hash keys Hs := make([]string, f.l) for i := 0; i < f.l; i++ { Hs[i] = f.hashKeyFunc(sig[i*f.k : (i+1)*f.k]) } // Insert keys into the bootstrapping tables for i := range f.initHashTables { func(ht initHashTable, hk string, key interface{}) { if _, exist := ht[hk]; exist { ht[hk] = append(ht[hk], key) } else { ht[hk] = make(keys, 1) ht[hk][0] = key } }(f.initHashTables[i], Hs[i], key) } } // Index makes all the keys added searchable. func (f *MinhashLSH) Index() { var wg sync.WaitGroup wg.Add(len(f.hashTables)) for i := range f.hashTables { func(htPtr *hashTable, initHtPtr *initHashTable) { // Build sorted hash table using buckets from init hash tables initHt := *initHtPtr ht := *htPtr for hashKey := range initHt { ks, _ := initHt[hashKey] ht = append(ht, bucket{ hashKey: hashKey, keys: ks, }) } sort.Sort(ht) *htPtr = ht // Reset the init hash tables *initHtPtr = make(initHashTable) wg.Done() }(&(f.hashTables[i]), &(f.initHashTables[i])) } wg.Wait() } // Query returns candidate keys given the query signature. func (f *MinhashLSH) Query(sig []uint64) []interface{} { set := f.query(sig, f.k) results := make([]interface{}, 0, len(set)) for key := range set { results = append(results, key) } return results } func (f *MinhashLSH) query(sig []uint64, minK int) map[interface{}]bool { results := make(map[interface{}]bool) for K := f.k; K >= minK; K-- { prefixSize := f.hashValueSize * K // Generate hash keys Hs := make([]string, f.l) for i := 0; i < f.l; i++ { Hs[i] = f.hashKeyFunc(sig[i*f.k : i*f.k+K]) } // Query hash tables for i := 0; i < f.l; i++ { ht := f.hashTables[i] hk := Hs[i] k := sort.Search(len(ht), func(x int) bool { return ht[x].hashKey[:prefixSize] >= hk }) if k < len(ht) && ht[k].hashKey[:prefixSize] == hk { for j := k; j < len(ht) && ht[j].hashKey[:prefixSize] == hk; j++ { for _, key := range ht[j].keys { if _, exist := results[key]; !exist { results[key] = true } } } } } } return results } minhash-lsh-1.0/lsh_benchmark_test.go000066400000000000000000000004761322012214700177540ustar00rootroot00000000000000package minhashlsh import ( "strconv" "testing" ) func Benchmark_Insert10000(b *testing.B) { sigs := make([][]uint64, 10000) for i := range sigs { sigs[i] = randomSignature(64, int64(i)) } b.ResetTimer() f := NewMinhashLSH16(64, 0.5) for i := range sigs { f.Add(strconv.Itoa(i), sigs[i]) } f.Index() } minhash-lsh-1.0/lsh_test.go000066400000000000000000000040111322012214700157270ustar00rootroot00000000000000package minhashlsh import ( "math/rand" "testing" ) func randomSignature(size int, seed int64) []uint64 { r := rand.New(rand.NewSource(seed)) sig := make([]uint64, size) for i := range sig { sig[i] = uint64(r.Int63()) } return sig } func Test_HashKeyFunc16(t *testing.T) { sig := randomSignature(2, 1) f := hashKeyFuncGen(2) hashKey := f(sig) if len(hashKey) != 2*2 { t.Fatal(len(hashKey)) } } func Test_HashKeyFunc64(t *testing.T) { sig := randomSignature(2, 1) f := hashKeyFuncGen(8) hashKey := f(sig) if len(hashKey) != 8*2 { t.Fatal(len(hashKey)) } } func Test_MinhashLSH(t *testing.T) { f := NewMinhashLSH16(256, 0.6) // sig1 is different from sig2 and sig3 // sig2 and sig3 are identical sig1 := randomSignature(256, 1) sig2 := randomSignature(256, 2) sig3 := randomSignature(256, 2) f.Add("sig1", sig1) f.Add("sig2", sig2) f.Add("sig3", sig3) f.Index() // sig1 should be in its own bucket // sig2 and sig3 are in another bucket for i := range f.hashTables { if len(f.hashTables[i]) != 2 { t.Fatal(f.hashTables[i]) } } found := 0 for _, key := range f.Query(sig3) { if key.(string) == "sig3" || key.(string) == "sig2" { found++ } } if found != 2 { t.Fatal("unable to retrieve inserted keys") } } func Test_MinhashLSH2(t *testing.T) { minhashLsh := NewMinhashLSH16(256, 0.5) seed := int64(1) numHash := 256 mh := NewMinhash(seed, numHash) words := []string{"hello", "world", "minhash", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"} for _, word := range words { mh.Push([]byte(word)) } sig1 := mh.Signature() minhashLsh.Add("s1", sig1) minhashLsh.Index() k, l := minhashLsh.Params() t.Logf("Minhash LSH params: k = %d, l = %d", k, l) mh = NewMinhash(seed, numHash) words = []string{"one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"} for _, word := range words { mh.Push([]byte(word)) } sig2 := mh.Signature() results := minhashLsh.Query(sig2) t.Log(results) if len(results) < 1 { t.Fail() } } minhash-lsh-1.0/minhash.go000066400000000000000000000027471322012214700155470ustar00rootroot00000000000000package minhashlsh import ( "encoding/binary" "hash/fnv" "math/rand" minwise "github.com/dgryski/go-minhash" ) // The number of byte in a hash value for Minhash const hashValueSize = 8 // Minhash represents a MinHash object type Minhash struct { mw *minwise.MinWise seed int64 } // NewMinhash initialize a MinHash object with a seed and the number of // hash functions. func NewMinhash(seed int64, numHash int) *Minhash { r := rand.New(rand.NewSource(seed)) b := binary.BigEndian b1 := make([]byte, hashValueSize) b2 := make([]byte, hashValueSize) b.PutUint64(b1, uint64(r.Int63())) b.PutUint64(b2, uint64(r.Int63())) fnv1 := fnv.New64a() fnv2 := fnv.New64a() h1 := func(b []byte) uint64 { fnv1.Reset() fnv1.Write(b1) fnv1.Write(b) return fnv1.Sum64() } h2 := func(b []byte) uint64 { fnv2.Reset() fnv2.Write(b2) fnv2.Write(b) return fnv2.Sum64() } return &Minhash{ mw: minwise.NewMinWise(h1, h2, numHash), seed: int64(seed), } } // Push a new value to the MinHash object. // The value should be serialized to byte slice. func (m *Minhash) Push(b []byte) { m.mw.Push(b) } // Signature exports the MinHash as a list of hash values. func (m *Minhash) Signature() []uint64 { return m.mw.Signature() } // Merge combines the signature of the other Minhash // with this one, making this one carry the signature of // the union. func (m *Minhash) Merge(o *Minhash) { if m.seed != o.seed { panic("Cannot merge Minhash with different seed") } m.mw.Merge(o.mw) } minhash-lsh-1.0/minhash_test.go000066400000000000000000000026651322012214700166050ustar00rootroot00000000000000package minhashlsh import ( "fmt" "math" "testing" ) func TestMinhash(t *testing.T) { m := NewMinhash(1, 256) m.Push([]byte("Test some input")) } func data(size int) [][]byte { d := make([][]byte, size) for i := range d { d[i] = []byte(fmt.Sprintf("salt%d %d", i, size)) } return d } func hashing(mh *Minhash, start, end int, data [][]byte) { for i := start; i < end; i++ { mh.Push(data[i]) } } func benchmark(minhashSize, dataSize int, t *testing.B) { if dataSize < 10 { fmt.Printf("\n") return } // Data is a set of unique values d := data(dataSize) // a and b are two subsets of data with some overlaps a_start, a_end := 0, int(float64(dataSize)*0.65) b_start, b_end := int(float64(dataSize)*0.35), dataSize m1 := NewMinhash(1, minhashSize) m2 := NewMinhash(1, minhashSize) t.ResetTimer() hashing(m1, a_start, a_end, d) hashing(m2, b_start, b_end, d) est := m1.mw.Similarity(m2.mw) act := float64(a_end-b_start) / float64(b_end-a_start) err := math.Abs(act - est) fmt.Printf("Data size: %8d, ", dataSize) fmt.Printf("Real resemblance: %.8f, ", act) fmt.Printf("Estimated resemblance: %.8f, ", est) fmt.Printf("Absolute Error: %.8f\n", err) } func BenchmarkMinWise64(b *testing.B) { benchmark(64, b.N, b) } func BenchmarkMinWise128(b *testing.B) { benchmark(128, b.N, b) } func BenchmarkMinWise256(b *testing.B) { benchmark(256, b.N, b) } func BenchmarkMinWise512(b *testing.B) { benchmark(512, b.N, b) }