pax_global_header00006660000000000000000000000064141253355040014514gustar00rootroot0000000000000052 comment=bb11d046376d90ddd793213d0e1fb76e7d54e945 bloom-3.1.0/000077500000000000000000000000001412533550400126255ustar00rootroot00000000000000bloom-3.1.0/.github/000077500000000000000000000000001412533550400141655ustar00rootroot00000000000000bloom-3.1.0/.github/FUNDING.yml000066400000000000000000000004441412533550400160040ustar00rootroot00000000000000# You can add one username per supported platform and one custom link patreon: # Replace with your Patreon username open_collective: # Replace with your Open Collective username ko_fi: # Replace with your Ko-fi username custom: https://donate.mcc.org/ # Replace with your custom donation URL bloom-3.1.0/.github/workflows/000077500000000000000000000000001412533550400162225ustar00rootroot00000000000000bloom-3.1.0/.github/workflows/test.yml000066400000000000000000000014661412533550400177330ustar00rootroot00000000000000on: pull_request name: Test jobs: test: strategy: matrix: go-version: [1.14.x, 1.15.x, 1.16.x] os: [ubuntu-latest, macos-latest, windows-latest] runs-on: ${{ matrix.os }} steps: - name: Install Go uses: actions/setup-go@v2 with: go-version: ${{ matrix.go-version }} - name: Checkout code uses: actions/checkout@v2 - name: Vet run: go vet ./... - name: Test run: go test ./... single-ver: runs-on: ubuntu-latest steps: - name: Set up Go uses: actions/setup-go@v2 with: go-version: 1.16.x - name: Checkout code uses: actions/checkout@v2 - name: fmt run: diff <(gofmt -s -d .) 
<(printf "") - name: Test 386 run: GOOS=linux GOARCH=386 go test ./...bloom-3.1.0/.gitignore000066400000000000000000000004221412533550400146130ustar00rootroot00000000000000# Compiled Object files, Static and Dynamic libs (Shared Objects) *.o *.a *.so # Folders _obj _test # Architecture specific extensions/prefixes *.[568vq] [568vq].out *.cgo1.go *.cgo2.c _cgo_defun.c _cgo_gotypes.go _cgo_export.* _testmain.go *.exe *.test *.prof target bloom-3.1.0/.travis.yml000066400000000000000000000012351412533550400147370ustar00rootroot00000000000000language: go sudo: false branches: except: - release branches: only: - master - develop - travis go: - 1.8 - tip matrix: allow_failures: - go: tip before_install: - if [ -n "$GH_USER" ]; then git config --global github.user ${GH_USER}; fi; - if [ -n "$GH_TOKEN" ]; then git config --global github.token ${GH_TOKEN}; fi; - go get github.com/mattn/goveralls before_script: - make deps script: - make qa after_failure: - cat ./target/test/report.xml after_success: - if [ "$TRAVIS_GO_VERSION" = "1.8" ]; then $HOME/gopath/bin/goveralls -covermode=count -coverprofile=target/report/coverage.out -service=travis-ci; fi; bloom-3.1.0/LICENSE000066400000000000000000000024201412533550400136300ustar00rootroot00000000000000Copyright (c) 2014 Will Fitzgerald. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bloom-3.1.0/Makefile000066400000000000000000000146231412533550400142730ustar00rootroot00000000000000# MAKEFILE # # @author Nicola Asuni # @link https://github.com/willf/bloom # ------------------------------------------------------------------------------ # List special make targets that are not associated with files .PHONY: help all test format fmtcheck vet lint coverage cyclo ineffassign misspell structcheck varcheck errcheck gosimple astscan qa deps clean nuke # Use bash as shell (Note: Ubuntu now uses dash which doesn't support PIPESTATUS). SHELL=/bin/bash # CVS path (path to the parent dir containing the project) CVSPATH=github.com/willf # Project owner OWNER=willf # Project vendor VENDOR=willf # Project name PROJECT=bloom # Project version VERSION=$(shell cat VERSION) # Name of RPM or DEB package PKGNAME=${VENDOR}-${PROJECT} # Current directory CURRENTDIR=$(shell pwd) # GO lang path ifneq ($(GOPATH),) ifeq ($(findstring $(GOPATH),$(CURRENTDIR)),) # the defined GOPATH is not valid GOPATH= endif endif ifeq ($(GOPATH),) # extract the GOPATH GOPATH=$(firstword $(subst /src/, ,$(CURRENTDIR))) endif # --- MAKE TARGETS --- # Display general help about this command help: @echo "" @echo "$(PROJECT) Makefile." 
@echo "GOPATH=$(GOPATH)" @echo "The following commands are available:" @echo "" @echo " make qa : Run all the tests" @echo " make test : Run the unit tests" @echo "" @echo " make format : Format the source code" @echo " make fmtcheck : Check if the source code has been formatted" @echo " make vet : Check for suspicious constructs" @echo " make lint : Check for style errors" @echo " make coverage : Generate the coverage report" @echo " make cyclo : Generate the cyclomatic complexity report" @echo " make ineffassign : Detect ineffectual assignments" @echo " make misspell : Detect commonly misspelled words in source files" @echo " make structcheck : Find unused struct fields" @echo " make varcheck : Find unused global variables and constants" @echo " make errcheck : Check that error return values are used" @echo " make gosimple : Suggest code simplifications" @echo " make astscan : GO AST scanner" @echo "" @echo " make docs : Generate source code documentation" @echo "" @echo " make deps : Get the dependencies" @echo " make clean : Remove any build artifact" @echo " make nuke : Deletes any intermediate file" @echo "" # Alias for help target all: help # Run the unit tests test: @mkdir -p target/test @mkdir -p target/report GOPATH=$(GOPATH) \ go test \ -covermode=atomic \ -bench=. \ -race \ -cpuprofile=target/report/cpu.out \ -memprofile=target/report/mem.out \ -mutexprofile=target/report/mutex.out \ -coverprofile=target/report/coverage.out \ -v ./... | \ tee >(PATH=$(GOPATH)/bin:$(PATH) go-junit-report > target/test/report.xml); \ test $${PIPESTATUS[0]} -eq 0 # Format the source code format: @find . -type f -name "*.go" -exec gofmt -s -w {} \; # Check if the source code has been formatted fmtcheck: @mkdir -p target @find . -type f -name "*.go" -exec gofmt -s -d {} \; | tee target/format.diff @test ! 
-s target/format.diff || { echo "ERROR: the source code has not been formatted - please use 'make format' or 'gofmt'"; exit 1; } # Check for syntax errors vet: GOPATH=$(GOPATH) go vet . # Check for style errors lint: GOPATH=$(GOPATH) PATH=$(GOPATH)/bin:$(PATH) golint . # Generate the coverage report coverage: @mkdir -p target/report GOPATH=$(GOPATH) \ go tool cover -html=target/report/coverage.out -o target/report/coverage.html # Report cyclomatic complexity cyclo: @mkdir -p target/report GOPATH=$(GOPATH) gocyclo -avg ./ | tee target/report/cyclo.txt ; test $${PIPESTATUS[0]} -eq 0 # Detect ineffectual assignments ineffassign: @mkdir -p target/report GOPATH=$(GOPATH) ineffassign ./ | tee target/report/ineffassign.txt ; test $${PIPESTATUS[0]} -eq 0 # Detect commonly misspelled words in source files misspell: @mkdir -p target/report GOPATH=$(GOPATH) misspell -error ./ | tee target/report/misspell.txt ; test $${PIPESTATUS[0]} -eq 0 # Find unused struct fields structcheck: @mkdir -p target/report GOPATH=$(GOPATH) structcheck -a ./ | tee target/report/structcheck.txt # Find unused global variables and constants varcheck: @mkdir -p target/report GOPATH=$(GOPATH) varcheck -e ./ | tee target/report/varcheck.txt # Check that error return values are used errcheck: @mkdir -p target/report GOPATH=$(GOPATH) errcheck ./ | tee target/report/errcheck.txt # Suggest code simplifications gosimple: @mkdir -p target/report GOPATH=$(GOPATH) gosimple ./ | tee target/report/gosimple.txt # AST scanner astscan: @mkdir -p target/report GOPATH=$(GOPATH) gas .//*.go | tee target/report/astscan.txt ; test $${PIPESTATUS[0]} -eq 0 # Generate source docs docs: @mkdir -p target/docs nohup sh -c 'GOPATH=$(GOPATH) godoc -http=127.0.0.1:6060' > target/godoc_server.log 2>&1 & wget --directory-prefix=target/docs/ --execute robots=off --retry-connrefused --recursive --no-parent --adjust-extension --page-requisites --convert-links http://127.0.0.1:6060/pkg/github.com/${VENDOR}/${PROJECT}/ ; kill -9 `lsof 
-ti :6060` @echo ''${PKGNAME}' Documentation ...' > target/docs/index.html # Alias to run all quality-assurance checks qa: fmtcheck test vet lint coverage cyclo ineffassign misspell structcheck varcheck errcheck gosimple astscan # --- INSTALL --- # Get the dependencies deps: GOPATH=$(GOPATH) go get ./... GOPATH=$(GOPATH) go get github.com/golang/lint/golint GOPATH=$(GOPATH) go get github.com/jstemmer/go-junit-report GOPATH=$(GOPATH) go get github.com/axw/gocov/gocov GOPATH=$(GOPATH) go get github.com/fzipp/gocyclo GOPATH=$(GOPATH) go get github.com/gordonklaus/ineffassign GOPATH=$(GOPATH) go get github.com/client9/misspell/cmd/misspell GOPATH=$(GOPATH) go get github.com/opennota/check/cmd/structcheck GOPATH=$(GOPATH) go get github.com/opennota/check/cmd/varcheck GOPATH=$(GOPATH) go get github.com/kisielk/errcheck GOPATH=$(GOPATH) go get honnef.co/go/tools/cmd/gosimple GOPATH=$(GOPATH) go get github.com/GoASTScanner/gas # Remove any build artifact clean: GOPATH=$(GOPATH) go clean ./... # Deletes any intermediate file nuke: rm -rf ./target GOPATH=$(GOPATH) go clean -i ./... bloom-3.1.0/README.md000066400000000000000000000077241412533550400141160ustar00rootroot00000000000000Bloom filters ------------- [![Test](https://github.com/bits-and-blooms/bloom/actions/workflows/test.yml/badge.svg)](https://github.com/bits-and-blooms/bloom/actions/workflows/test.yml) [![Go Report Card](https://goreportcard.com/badge/github.com/bits-and-blooms/bloom)](https://goreportcard.com/report/github.com/bits-and-blooms/bloom) [![Go Reference](https://pkg.go.dev/badge/github.com/bits-and-blooms/bloom.svg)](https://pkg.go.dev/github.com/bits-and-blooms/bloom) A Bloom filter is a concise/compressed representation of a set, where the main requirement is to make membership queries; _i.e._, whether an item is a member of a set. A Bloom filter will always correctly report the presence of an element in the set when the element is indeed present. 
A Bloom filter can use much less storage than the original set, but it allows for some 'false positives': it may sometimes report that an element is in the set whereas it is not. When you construct a filter, you need to know how many elements you have (the desired capacity), and what is the desired false positive rate you are willing to tolerate. A common false-positive rate is 1%. The lower the false-positive rate, the more memory you are going to require. Similarly, the higher the capacity, the more memory you will use. You may construct the Bloom filter capable of receiving 1 million elements with a false-positive rate of 1% in the following manner. ```Go filter := bloom.NewWithEstimates(1000000, 0.01) ``` You should call `NewWithEstimates` conservatively: if you specify a number of elements that is too small, the false-positive bound might be exceeded. A Bloom filter is not a dynamic data structure: you must know ahead of time what your desired capacity is. Our implementation accepts keys for setting and testing as `[]byte`. Thus, to add a string item, `"Love"`: ```Go filter.Add([]byte("Love")) ``` Similarly, to test if `"Love"` is in bloom: ```Go if filter.Test([]byte("Love")) ``` For numerical data, we recommend that you look into the encoding/binary library. But, for example, to add a `uint32` to the filter: ```Go i := uint32(100) n1 := make([]byte, 4) binary.BigEndian.PutUint32(n1, i) filter.Add(n1) ``` Discussion here: [Bloom filter](https://groups.google.com/d/topic/golang-nuts/6MktecKi1bE/discussion) Godoc documentation: https://pkg.go.dev/github.com/bits-and-blooms/bloom ## Installation ```bash go get -u github.com/bits-and-blooms/bloom/v3 ``` ## Contributing If you wish to contribute to this project, please branch and issue a pull request against master ("[GitHub Flow](https://guides.github.com/introduction/flow/)") This project includes a Makefile that allows you to test and build the project with simple commands. 
To see all available options: ```bash make help ``` ## Running all tests Before committing the code, please check if it passes all tests using (note: this will install some dependencies): ```bash make deps make qa ``` ## Design A Bloom filter has two parameters: _m_, the number of bits used in storage, and _k_, the number of hashing functions on elements of the set. (The actual hashing functions are important, too, but this is not a parameter for this implementation). A Bloom filter is backed by a [BitSet](https://github.com/bits-and-blooms/bitset); a key is represented in the filter by setting the bits at each value of the hashing functions (modulo _m_). Set membership is done by _testing_ whether the bits at each value of the hashing functions (again, modulo _m_) are set. If so, the item is in the set. If the item is actually in the set, a Bloom filter will never fail (the true positive rate is 1.0); but it is susceptible to false positives. The art is to choose _k_ and _m_ correctly. In this implementation, the hashing function used is [murmurhash](https://github.com/spaolacci/murmur3), a non-cryptographic hashing function. Given the particular hashing scheme, it's best to be empirical about this. Note that estimating the FP rate will clear the Bloom filter. bloom-3.1.0/VERSION000066400000000000000000000000061412533550400136710ustar00rootroot000000000000002.0.3 bloom-3.1.0/bloom.go000066400000000000000000000256421412533550400142750ustar00rootroot00000000000000/* Package bloom provides data structures and methods for creating Bloom filters. A Bloom filter is a representation of a set of _n_ items, where the main requirement is to make membership queries; _i.e._, whether an item is a member of a set. A Bloom filter has two parameters: _m_, a maximum size (typically a reasonably large multiple of the cardinality of the set to represent) and _k_, the number of hashing functions on elements of the set. 
(The actual hashing functions are important, too, but this is not a parameter for this implementation). A Bloom filter is backed by a BitSet; a key is represented in the filter by setting the bits at each value of the hashing functions (modulo _m_). Set membership is done by _testing_ whether the bits at each value of the hashing functions (again, modulo _m_) are set. If so, the item is in the set. If the item is actually in the set, a Bloom filter will never fail (the true positive rate is 1.0); but it is susceptible to false positives. The art is to choose _k_ and _m_ correctly. In this implementation, the hashing function used is murmurhash, a non-cryptographic hashing function. This implementation accepts keys for setting and testing as []byte. Thus, to add a string item, "Love": n := uint(1000) filter := bloom.New(20*n, 5) // load of 20, 5 hash functions filter.Add([]byte("Love")) Similarly, to test if "Love" is in bloom: if filter.Test([]byte("Love")) For numeric data, I recommend that you look into the encoding/binary package. But, for example, to add a uint32 to the filter: i := uint32(100) n1 := make([]byte,4) binary.BigEndian.PutUint32(n1,i) filter.Add(n1) Finally, there is a method to estimate the false positive rate of a particular Bloom filter for a set of size _n_: if filter.EstimateFalsePositiveRate(1000) > 0.001 Given the particular hashing scheme, it's best to be empirical about this. Note that estimating the FP rate will clear the Bloom filter. */ package bloom import ( "bytes" "encoding/binary" "encoding/json" "fmt" "io" "math" "github.com/bits-and-blooms/bitset" ) // A BloomFilter is a representation of a set of _n_ items, where the main // requirement is to make membership queries; _i.e._, whether an item is a // member of a set. 
type BloomFilter struct { m uint k uint b *bitset.BitSet } func max(x, y uint) uint { if x > y { return x } return y } // New creates a new Bloom filter with _m_ bits and _k_ hashing functions // We force _m_ and _k_ to be at least one to avoid panics. func New(m uint, k uint) *BloomFilter { return &BloomFilter{max(1, m), max(1, k), bitset.New(m)} } // From creates a new Bloom filter with len(_data_) * 64 bits and _k_ hashing // functions. The data slice is not going to be reset. func From(data []uint64, k uint) *BloomFilter { m := uint(len(data) * 64) return &BloomFilter{m, k, bitset.From(data)} } // baseHashes returns the four hash values of data that are used to create k // hashes func baseHashes(data []byte) [4]uint64 { var d digest128 // murmur hashing hash1, hash2, hash3, hash4 := d.sum256(data) return [4]uint64{ hash1, hash2, hash3, hash4, } } // location returns the ith hashed location using the four base hash values func location(h [4]uint64, i uint) uint64 { ii := uint64(i) return h[ii%2] + ii*h[2+(((ii+(ii%2))%4)/2)] } // location returns the ith hashed location using the four base hash values func (f *BloomFilter) location(h [4]uint64, i uint) uint { return uint(location(h, i) % uint64(f.m)) } // EstimateParameters estimates requirements for m and k. // Based on https://bitbucket.org/ww/bloom/src/829aa19d01d9/bloom.go // used with permission. 
func EstimateParameters(n uint, p float64) (m uint, k uint) { m = uint(math.Ceil(-1 * float64(n) * math.Log(p) / math.Pow(math.Log(2), 2))) k = uint(math.Ceil(math.Log(2) * float64(m) / float64(n))) return } // NewWithEstimates creates a new Bloom filter for about n items with fp // false positive rate func NewWithEstimates(n uint, fp float64) *BloomFilter { m, k := EstimateParameters(n, fp) return New(m, k) } // Cap returns the capacity, _m_, of a Bloom filter func (f *BloomFilter) Cap() uint { return f.m } // K returns the number of hash functions used in the BloomFilter func (f *BloomFilter) K() uint { return f.k } // BitSet returns the underlying bitset for this filter. func (f *BloomFilter) BitSet() *bitset.BitSet { return f.b } // Add data to the Bloom Filter. Returns the filter (allows chaining) func (f *BloomFilter) Add(data []byte) *BloomFilter { h := baseHashes(data) for i := uint(0); i < f.k; i++ { f.b.Set(f.location(h, i)) } return f } // Merge the data from two Bloom Filters. func (f *BloomFilter) Merge(g *BloomFilter) error { // Make sure the m's and k's are the same, otherwise merging has no real use. if f.m != g.m { return fmt.Errorf("m's don't match: %d != %d", f.m, g.m) } if f.k != g.k { return fmt.Errorf("k's don't match: %d != %d", f.m, g.m) } f.b.InPlaceUnion(g.b) return nil } // Copy creates a copy of a Bloom filter. func (f *BloomFilter) Copy() *BloomFilter { fc := New(f.m, f.k) fc.Merge(f) // #nosec return fc } // AddString to the Bloom Filter. Returns the filter (allows chaining) func (f *BloomFilter) AddString(data string) *BloomFilter { return f.Add([]byte(data)) } // Test returns true if the data is in the BloomFilter, false otherwise. // If true, the result might be a false positive. If false, the data // is definitely not in the set. 
func (f *BloomFilter) Test(data []byte) bool { h := baseHashes(data) for i := uint(0); i < f.k; i++ { if !f.b.Test(f.location(h, i)) { return false } } return true } // TestString returns true if the string is in the BloomFilter, false otherwise. // If true, the result might be a false positive. If false, the data // is definitely not in the set. func (f *BloomFilter) TestString(data string) bool { return f.Test([]byte(data)) } // TestLocations returns true if all locations are set in the BloomFilter, false // otherwise. func (f *BloomFilter) TestLocations(locs []uint64) bool { for i := 0; i < len(locs); i++ { if !f.b.Test(uint(locs[i] % uint64(f.m))) { return false } } return true } // TestAndAdd is the equivalent to calling Test(data) then Add(data). // Returns the result of Test. func (f *BloomFilter) TestAndAdd(data []byte) bool { present := true h := baseHashes(data) for i := uint(0); i < f.k; i++ { l := f.location(h, i) if !f.b.Test(l) { present = false } f.b.Set(l) } return present } // TestAndAddString is the equivalent to calling Test(string) then Add(string). // Returns the result of Test. func (f *BloomFilter) TestAndAddString(data string) bool { return f.TestAndAdd([]byte(data)) } // TestOrAdd is the equivalent to calling Test(data) then if not present Add(data). // Returns the result of Test. func (f *BloomFilter) TestOrAdd(data []byte) bool { present := true h := baseHashes(data) for i := uint(0); i < f.k; i++ { l := f.location(h, i) if !f.b.Test(l) { present = false f.b.Set(l) } } return present } // TestOrAddString is the equivalent to calling Test(string) then if not present Add(string). // Returns the result of Test. 
func (f *BloomFilter) TestOrAddString(data string) bool { return f.TestOrAdd([]byte(data)) } // ClearAll clears all the data in a Bloom filter, removing all keys func (f *BloomFilter) ClearAll() *BloomFilter { f.b.ClearAll() return f } // EstimateFalsePositiveRate returns, for a BloomFilter with a estimate of m bits // and k hash functions, what the false positive rate will be // while storing n entries; runs 100,000 tests. This is an empirical // test using integers as keys. As a side-effect, it clears the BloomFilter. func (f *BloomFilter) EstimateFalsePositiveRate(n uint) (fpRate float64) { rounds := uint32(100000) f.ClearAll() n1 := make([]byte, 4) for i := uint32(0); i < uint32(n); i++ { binary.BigEndian.PutUint32(n1, i) f.Add(n1) } fp := 0 // test for number of rounds for i := uint32(0); i < rounds; i++ { binary.BigEndian.PutUint32(n1, i+uint32(n)+1) if f.Test(n1) { //fmt.Printf("%v failed.\n", i+uint32(n)+1) fp++ } } fpRate = float64(fp) / (float64(rounds)) f.ClearAll() return } // Approximating the number of items // https://en.wikipedia.org/wiki/Bloom_filter#Approximating_the_number_of_items_in_a_Bloom_filter func (f *BloomFilter) ApproximatedSize() uint32 { x := float64(f.b.Count()) m := float64(f.Cap()) k := float64(f.K()) size := -1 * m / k * math.Log(1-x/m) / math.Log(math.E) return uint32(math.Floor(size + 0.5)) // round } // bloomFilterJSON is an unexported type for marshaling/unmarshaling BloomFilter struct. type bloomFilterJSON struct { M uint `json:"m"` K uint `json:"k"` B *bitset.BitSet `json:"b"` } // MarshalJSON implements json.Marshaler interface. func (f *BloomFilter) MarshalJSON() ([]byte, error) { return json.Marshal(bloomFilterJSON{f.m, f.k, f.b}) } // UnmarshalJSON implements json.Unmarshaler interface. 
func (f *BloomFilter) UnmarshalJSON(data []byte) error { var j bloomFilterJSON err := json.Unmarshal(data, &j) if err != nil { return err } f.m = j.M f.k = j.K f.b = j.B return nil } // WriteTo writes a binary representation of the BloomFilter to an i/o stream. // It returns the number of bytes written. func (f *BloomFilter) WriteTo(stream io.Writer) (int64, error) { err := binary.Write(stream, binary.BigEndian, uint64(f.m)) if err != nil { return 0, err } err = binary.Write(stream, binary.BigEndian, uint64(f.k)) if err != nil { return 0, err } numBytes, err := f.b.WriteTo(stream) return numBytes + int64(2*binary.Size(uint64(0))), err } // ReadFrom reads a binary representation of the BloomFilter (such as might // have been written by WriteTo()) from an i/o stream. It returns the number // of bytes read. func (f *BloomFilter) ReadFrom(stream io.Reader) (int64, error) { var m, k uint64 err := binary.Read(stream, binary.BigEndian, &m) if err != nil { return 0, err } err = binary.Read(stream, binary.BigEndian, &k) if err != nil { return 0, err } b := &bitset.BitSet{} numBytes, err := b.ReadFrom(stream) if err != nil { return 0, err } f.m = uint(m) f.k = uint(k) f.b = b return numBytes + int64(2*binary.Size(uint64(0))), nil } // GobEncode implements gob.GobEncoder interface. func (f *BloomFilter) GobEncode() ([]byte, error) { var buf bytes.Buffer _, err := f.WriteTo(&buf) if err != nil { return nil, err } return buf.Bytes(), nil } // GobDecode implements gob.GobDecoder interface. func (f *BloomFilter) GobDecode(data []byte) error { buf := bytes.NewBuffer(data) _, err := f.ReadFrom(buf) return err } // Equal tests for the equality of two Bloom filters func (f *BloomFilter) Equal(g *BloomFilter) bool { return f.m == g.m && f.k == g.k && f.b.Equal(g.b) } // Locations returns a list of hash locations representing a data item. 
func Locations(data []byte, k uint) []uint64 { locs := make([]uint64, k) // calculate locations h := baseHashes(data) for i := uint(0); i < k; i++ { locs[i] = location(h, i) } return locs } bloom-3.1.0/bloom_test.go000066400000000000000000000331161412533550400153270ustar00rootroot00000000000000package bloom import ( "bytes" "encoding/binary" "encoding/gob" "encoding/json" "math" "testing" ) // This implementation of Bloom filters is _not_ // safe for concurrent use. Uncomment the following // method and run go test -race // // func TestConcurrent(t *testing.T) { // gmp := runtime.GOMAXPROCS(2) // defer runtime.GOMAXPROCS(gmp) // // f := New(1000, 4) // n1 := []byte("Bess") // n2 := []byte("Jane") // f.Add(n1) // f.Add(n2) // // var wg sync.WaitGroup // const try = 1000 // var err1, err2 error // // wg.Add(1) // go func() { // for i := 0; i < try; i++ { // n1b := f.Test(n1) // if !n1b { // err1 = fmt.Errorf("%v should be in", n1) // break // } // } // wg.Done() // }() // // wg.Add(1) // go func() { // for i := 0; i < try; i++ { // n2b := f.Test(n2) // if !n2b { // err2 = fmt.Errorf("%v should be in", n2) // break // } // } // wg.Done() // }() // // wg.Wait() // // if err1 != nil { // t.Fatal(err1) // } // if err2 != nil { // t.Fatal(err2) // } // } func TestBasic(t *testing.T) { f := New(1000, 4) n1 := []byte("Bess") n2 := []byte("Jane") n3 := []byte("Emma") f.Add(n1) n3a := f.TestAndAdd(n3) n1b := f.Test(n1) n2b := f.Test(n2) n3b := f.Test(n3) if !n1b { t.Errorf("%v should be in.", n1) } if n2b { t.Errorf("%v should not be in.", n2) } if n3a { t.Errorf("%v should not be in the first time we look.", n3) } if !n3b { t.Errorf("%v should be in the second time we look.", n3) } } func TestBasicUint32(t *testing.T) { f := New(1000, 4) n1 := make([]byte, 4) n2 := make([]byte, 4) n3 := make([]byte, 4) n4 := make([]byte, 4) n5 := make([]byte, 4) binary.BigEndian.PutUint32(n1, 100) binary.BigEndian.PutUint32(n2, 101) binary.BigEndian.PutUint32(n3, 102) 
binary.BigEndian.PutUint32(n4, 103) binary.BigEndian.PutUint32(n5, 104) f.Add(n1) n3a := f.TestAndAdd(n3) n1b := f.Test(n1) n2b := f.Test(n2) n3b := f.Test(n3) n5a := f.TestOrAdd(n5) n5b := f.Test(n5) f.Test(n4) if !n1b { t.Errorf("%v should be in.", n1) } if n2b { t.Errorf("%v should not be in.", n2) } if n3a { t.Errorf("%v should not be in the first time we look.", n3) } if !n3b { t.Errorf("%v should be in the second time we look.", n3) } if n5a { t.Errorf("%v should not be in the first time we look.", n5) } if !n5b { t.Errorf("%v should be in the second time we look.", n5) } } func TestNewWithLowNumbers(t *testing.T) { f := New(0, 0) if f.k != 1 { t.Errorf("%v should be 1", f.k) } if f.m != 1 { t.Errorf("%v should be 1", f.m) } } func TestString(t *testing.T) { f := NewWithEstimates(1000, 0.001) n1 := "Love" n2 := "is" n3 := "in" n4 := "bloom" n5 := "blooms" f.AddString(n1) n3a := f.TestAndAddString(n3) n1b := f.TestString(n1) n2b := f.TestString(n2) n3b := f.TestString(n3) n5a := f.TestOrAddString(n5) n5b := f.TestString(n5) f.TestString(n4) if !n1b { t.Errorf("%v should be in.", n1) } if n2b { t.Errorf("%v should not be in.", n2) } if n3a { t.Errorf("%v should not be in the first time we look.", n3) } if !n3b { t.Errorf("%v should be in the second time we look.", n3) } if n5a { t.Errorf("%v should not be in the first time we look.", n5) } if !n5b { t.Errorf("%v should be in the second time we look.", n5) } } func testEstimated(n uint, maxFp float64, t *testing.T) { m, k := EstimateParameters(n, maxFp) f := NewWithEstimates(n, maxFp) fpRate := f.EstimateFalsePositiveRate(n) if fpRate > 1.5*maxFp { t.Errorf("False positive rate too high: n: %v; m: %v; k: %v; maxFp: %f; fpRate: %f, fpRate/maxFp: %f", n, m, k, maxFp, fpRate, fpRate/maxFp) } } func TestEstimated1000_0001(t *testing.T) { testEstimated(1000, 0.000100, t) } func TestEstimated10000_0001(t *testing.T) { testEstimated(10000, 0.000100, t) } func TestEstimated100000_0001(t *testing.T) { 
testEstimated(100000, 0.000100, t) } func TestEstimated1000_001(t *testing.T) { testEstimated(1000, 0.001000, t) } func TestEstimated10000_001(t *testing.T) { testEstimated(10000, 0.001000, t) } func TestEstimated100000_001(t *testing.T) { testEstimated(100000, 0.001000, t) } func TestEstimated1000_01(t *testing.T) { testEstimated(1000, 0.010000, t) } func TestEstimated10000_01(t *testing.T) { testEstimated(10000, 0.010000, t) } func TestEstimated100000_01(t *testing.T) { testEstimated(100000, 0.010000, t) } func min(a, b uint) uint { if a < b { return a } return b } // The following function courtesy of Nick @turgon // This helper function ranges over the input data, applying the hashing // which returns the bit locations to set in the filter. // For each location, increment a counter for that bit address. // // If the Bloom Filter's location() method distributes locations uniformly // at random, a property it should inherit from its hash function, then // each bit location in the filter should end up with roughly the same // number of hits. Importantly, the value of k should not matter. // // Once the results are collected, we can run a chi squared goodness of fit // test, comparing the result histogram with the uniform distribition. // This yields a test statistic with degrees-of-freedom of m-1. func chiTestBloom(m, k, rounds uint, elements [][]byte) (succeeds bool) { f := New(m, k) results := make([]uint, m) chi := make([]float64, m) for _, data := range elements { h := baseHashes(data) for i := uint(0); i < f.k; i++ { results[f.location(h, i)]++ } } // Each element of results should contain the same value: k * rounds / m. // Let's run a chi-square goodness of fit and see how it fares. 
var chiStatistic float64 e := float64(k*rounds) / float64(m) for i := uint(0); i < m; i++ { chi[i] = math.Pow(float64(results[i])-e, 2.0) / e chiStatistic += chi[i] } // this tests at significant level 0.005 up to 20 degrees of freedom table := [20]float64{ 7.879, 10.597, 12.838, 14.86, 16.75, 18.548, 20.278, 21.955, 23.589, 25.188, 26.757, 28.3, 29.819, 31.319, 32.801, 34.267, 35.718, 37.156, 38.582, 39.997} df := min(m-1, 20) succeeds = table[df-1] > chiStatistic return } func TestLocation(t *testing.T) { var m, k, rounds uint m = 8 k = 3 rounds = 100000 // 15000000 elements := make([][]byte, rounds) for x := uint(0); x < rounds; x++ { ctrlist := make([]uint8, 4) ctrlist[0] = uint8(x) ctrlist[1] = uint8(x >> 8) ctrlist[2] = uint8(x >> 16) ctrlist[3] = uint8(x >> 24) data := []byte(ctrlist) elements[x] = data } succeeds := chiTestBloom(m, k, rounds, elements) if !succeeds { t.Error("random assignment is too unrandom") } } func TestCap(t *testing.T) { f := New(1000, 4) if f.Cap() != f.m { t.Error("not accessing Cap() correctly") } } func TestK(t *testing.T) { f := New(1000, 4) if f.K() != f.k { t.Error("not accessing K() correctly") } } func TestMarshalUnmarshalJSON(t *testing.T) { f := New(1000, 4) data, err := json.Marshal(f) if err != nil { t.Fatal(err.Error()) } var g BloomFilter err = json.Unmarshal(data, &g) if err != nil { t.Fatal(err.Error()) } if g.m != f.m { t.Error("invalid m value") } if g.k != f.k { t.Error("invalid k value") } if g.b == nil { t.Fatal("bitset is nil") } if !g.b.Equal(f.b) { t.Error("bitsets are not equal") } } func TestUnmarshalInvalidJSON(t *testing.T) { data := []byte("{invalid}") var g BloomFilter err := g.UnmarshalJSON(data) if err == nil { t.Error("expected error while unmarshalling invalid data") } } func TestWriteToReadFrom(t *testing.T) { var b bytes.Buffer f := New(1000, 4) _, err := f.WriteTo(&b) if err != nil { t.Fatal(err) } g := New(1000, 1) _, err = g.ReadFrom(&b) if err != nil { t.Fatal(err) } if g.m != f.m { 
t.Error("invalid m value") } if g.k != f.k { t.Error("invalid k value") } if g.b == nil { t.Fatal("bitset is nil") } if !g.b.Equal(f.b) { t.Error("bitsets are not equal") } g.Test([]byte("")) } func TestReadWriteBinary(t *testing.T) { f := New(1000, 4) var buf bytes.Buffer bytesWritten, err := f.WriteTo(&buf) if err != nil { t.Fatal(err.Error()) } if bytesWritten != int64(buf.Len()) { t.Errorf("incorrect write length %d != %d", bytesWritten, buf.Len()) } var g BloomFilter bytesRead, err := g.ReadFrom(&buf) if err != nil { t.Fatal(err.Error()) } if bytesRead != bytesWritten { t.Errorf("read unexpected number of bytes %d != %d", bytesRead, bytesWritten) } if g.m != f.m { t.Error("invalid m value") } if g.k != f.k { t.Error("invalid k value") } if g.b == nil { t.Fatal("bitset is nil") } if !g.b.Equal(f.b) { t.Error("bitsets are not equal") } } func TestEncodeDecodeGob(t *testing.T) { f := New(1000, 4) f.Add([]byte("one")) f.Add([]byte("two")) f.Add([]byte("three")) var buf bytes.Buffer err := gob.NewEncoder(&buf).Encode(f) if err != nil { t.Fatal(err.Error()) } var g BloomFilter err = gob.NewDecoder(&buf).Decode(&g) if err != nil { t.Fatal(err.Error()) } if g.m != f.m { t.Error("invalid m value") } if g.k != f.k { t.Error("invalid k value") } if g.b == nil { t.Fatal("bitset is nil") } if !g.b.Equal(f.b) { t.Error("bitsets are not equal") } if !g.Test([]byte("three")) { t.Errorf("missing value 'three'") } if !g.Test([]byte("two")) { t.Errorf("missing value 'two'") } if !g.Test([]byte("one")) { t.Errorf("missing value 'one'") } } func TestEqual(t *testing.T) { f := New(1000, 4) f1 := New(1000, 4) g := New(1000, 20) h := New(10, 20) n1 := []byte("Bess") f1.Add(n1) if !f.Equal(f) { t.Errorf("%v should be equal to itself", f) } if f.Equal(f1) { t.Errorf("%v should not be equal to %v", f, f1) } if f.Equal(g) { t.Errorf("%v should not be equal to %v", f, g) } if f.Equal(h) { t.Errorf("%v should not be equal to %v", f, h) } } func BenchmarkEstimated(b *testing.B) { for n := 
uint(100000); n <= 100000; n *= 10 { for fp := 0.1; fp >= 0.0001; fp /= 10.0 { f := NewWithEstimates(n, fp) f.EstimateFalsePositiveRate(n) } } } func BenchmarkSeparateTestAndAdd(b *testing.B) { f := NewWithEstimates(uint(b.N), 0.0001) key := make([]byte, 100) b.ResetTimer() for i := 0; i < b.N; i++ { binary.BigEndian.PutUint32(key, uint32(i)) f.Test(key) f.Add(key) } } func BenchmarkCombinedTestAndAdd(b *testing.B) { f := NewWithEstimates(uint(b.N), 0.0001) key := make([]byte, 100) b.ResetTimer() for i := 0; i < b.N; i++ { binary.BigEndian.PutUint32(key, uint32(i)) f.TestAndAdd(key) } } func TestMerge(t *testing.T) { f := New(1000, 4) n1 := []byte("f") f.Add(n1) g := New(1000, 4) n2 := []byte("g") g.Add(n2) h := New(999, 4) n3 := []byte("h") h.Add(n3) j := New(1000, 5) n4 := []byte("j") j.Add(n4) err := f.Merge(g) if err != nil { t.Errorf("There should be no error when merging two similar filters") } err = f.Merge(h) if err == nil { t.Errorf("There should be an error when merging filters with mismatched m") } err = f.Merge(j) if err == nil { t.Errorf("There should be an error when merging filters with mismatched k") } n2b := f.Test(n2) if !n2b { t.Errorf("The value doesn't exist after a valid merge") } n3b := f.Test(n3) if n3b { t.Errorf("The value exists after an invalid merge") } n4b := f.Test(n4) if n4b { t.Errorf("The value exists after an invalid merge") } } func TestCopy(t *testing.T) { f := New(1000, 4) n1 := []byte("f") f.Add(n1) // copy here instead of New g := f.Copy() n2 := []byte("g") g.Add(n2) n1fb := f.Test(n1) if !n1fb { t.Errorf("The value doesn't exist in original after making a copy") } n1gb := g.Test(n1) if !n1gb { t.Errorf("The value doesn't exist in the copy") } n2fb := f.Test(n2) if n2fb { t.Errorf("The value exists in the original, it should only exist in copy") } n2gb := g.Test(n2) if !n2gb { t.Errorf("The value doesn't exist in copy after Add()") } } func TestFrom(t *testing.T) { var ( k = uint(5) data = make([]uint64, 10) test = 
[]byte("test") ) bf := From(data, k) if bf.K() != k { t.Errorf("Constant k does not match the expected value") } if bf.Cap() != uint(len(data)*64) { t.Errorf("Capacity does not match the expected value") } if bf.Test(test) { t.Errorf("Bloom filter should not contain the value") } bf.Add(test) if !bf.Test(test) { t.Errorf("Bloom filter should contain the value") } // create a new Bloom filter from an existing (populated) data slice. bf = From(data, k) if !bf.Test(test) { t.Errorf("Bloom filter should contain the value") } } func TestTestLocations(t *testing.T) { f := NewWithEstimates(1000, 0.001) n1 := []byte("Love") n2 := []byte("is") n3 := []byte("in") n4 := []byte("bloom") f.Add(n1) n3a := f.TestLocations(Locations(n3, f.K())) f.Add(n3) n1b := f.TestLocations(Locations(n1, f.K())) n2b := f.TestLocations(Locations(n2, f.K())) n3b := f.TestLocations(Locations(n3, f.K())) n4b := f.TestLocations(Locations(n4, f.K())) if !n1b { t.Errorf("%v should be in.", n1) } if n2b { t.Errorf("%v should not be in.", n2) } if n3a { t.Errorf("%v should not be in the first time we look.", n3) } if !n3b { t.Errorf("%v should be in the second time we look.", n3) } if n4b { t.Errorf("%v should be in.", n4) } } func TestApproximatedSize(t *testing.T) { f := NewWithEstimates(1000, 0.001) f.Add([]byte("Love")) f.Add([]byte("is")) f.Add([]byte("in")) f.Add([]byte("bloom")) size := f.ApproximatedSize() if size != 4 { t.Errorf("%d should equal 4.", size) } } func TestFPP(t *testing.T) { f := NewWithEstimates(1000, 0.001) for i := uint32(0); i < 1000; i++ { n := make([]byte, 4) binary.BigEndian.PutUint32(n, i) f.Add(n) } count := 0 for i := uint32(0); i < 1000; i++ { n := make([]byte, 4) binary.BigEndian.PutUint32(n, i+1000) if f.Test(n) { count += 1 } } if float64(count)/1000.0 > 0.001 { t.Errorf("Excessive fpp") } } bloom-3.1.0/go.mod000066400000000000000000000002201412533550400137250ustar00rootroot00000000000000module github.com/bits-and-blooms/bloom/v3 go 1.14 require ( 
github.com/bits-and-blooms/bitset v1.2.0 github.com/spaolacci/murmur3 v1.1.0 ) bloom-3.1.0/go.sum000066400000000000000000000005501412533550400137600ustar00rootroot00000000000000github.com/bits-and-blooms/bitset v1.2.0 h1:Kn4yilvwNtMACtf1eYDlG8H77R07mZSPbMjLyS07ChA= github.com/bits-and-blooms/bitset v1.2.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA= github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= bloom-3.1.0/murmur.go000066400000000000000000000163311412533550400145070ustar00rootroot00000000000000/* The bloom library relied on the excellent murmur library by Sébastien Paolacci. Unfortunately, it involved some heap allocation. We want to avoid any heap allocation whatsoever in the hashing process. To preserve backward compatibility, we roll our own hashing functions. They are designed to be strictly equivalent to Paolacci's implementation. License on original code: Copyright 2013, Sébastien Paolacci. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the library nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package bloom import ( "math/bits" "unsafe" ) const ( c1_128 = 0x87c37b91114253d5 c2_128 = 0x4cf5ad432745937f block_size = 16 ) // digest128 represents a partial evaluation of a 128 bites hash. type digest128 struct { h1 uint64 // Unfinalized running hash part 1. h2 uint64 // Unfinalized running hash part 2. } //bmix will hash blocks (16 bytes) func (d *digest128) bmix(p []byte) { nblocks := len(p) / block_size for i := 0; i < nblocks; i++ { t := (*[2]uint64)(unsafe.Pointer(&p[i*block_size])) k1, k2 := t[0], t[1] d.bmix_words(k1, k2) } } //bmix_words will hash two 64-bit words (16 bytes) func (d *digest128) bmix_words(k1, k2 uint64) { h1, h2 := d.h1, d.h2 k1 *= c1_128 k1 = bits.RotateLeft64(k1, 31) k1 *= c2_128 h1 ^= k1 h1 = bits.RotateLeft64(h1, 27) h1 += h2 h1 = h1*5 + 0x52dce729 k2 *= c2_128 k2 = bits.RotateLeft64(k2, 33) k2 *= c1_128 h2 ^= k2 h2 = bits.RotateLeft64(h2, 31) h2 += h1 h2 = h2*5 + 0x38495ab5 d.h1, d.h2 = h1, h2 } // sum128 computers two 64-bit hash value. It is assumed that // bmix was first called on the data to process complete blocks // of 16 bytes. The 'tail' is a slice representing the 'tail' (leftover // elements, fewer than 16). If pad_tail is true, we make it seem like // there is an extra element with value 1 appended to the tail. // The length parameter represents the full length of the data (including // the blocks of 16 bytes, and, if pad_tail is true, an extra byte). 
func (d *digest128) sum128(pad_tail bool, length uint, tail []byte) (h1, h2 uint64) { h1, h2 = d.h1, d.h2 var k1, k2 uint64 if pad_tail { switch (len(tail) + 1) & 15 { case 15: k2 ^= uint64(1) << 48 break case 14: k2 ^= uint64(1) << 40 break case 13: k2 ^= uint64(1) << 32 break case 12: k2 ^= uint64(1) << 24 break case 11: k2 ^= uint64(1) << 16 break case 10: k2 ^= uint64(1) << 8 break case 9: k2 ^= uint64(1) << 0 k2 *= c2_128 k2 = bits.RotateLeft64(k2, 33) k2 *= c1_128 h2 ^= k2 break case 8: k1 ^= uint64(1) << 56 break case 7: k1 ^= uint64(1) << 48 break case 6: k1 ^= uint64(1) << 40 break case 5: k1 ^= uint64(1) << 32 break case 4: k1 ^= uint64(1) << 24 break case 3: k1 ^= uint64(1) << 16 break case 2: k1 ^= uint64(1) << 8 break case 1: k1 ^= uint64(1) << 0 k1 *= c1_128 k1 = bits.RotateLeft64(k1, 31) k1 *= c2_128 h1 ^= k1 } } switch len(tail) & 15 { case 15: k2 ^= uint64(tail[14]) << 48 fallthrough case 14: k2 ^= uint64(tail[13]) << 40 fallthrough case 13: k2 ^= uint64(tail[12]) << 32 fallthrough case 12: k2 ^= uint64(tail[11]) << 24 fallthrough case 11: k2 ^= uint64(tail[10]) << 16 fallthrough case 10: k2 ^= uint64(tail[9]) << 8 fallthrough case 9: k2 ^= uint64(tail[8]) << 0 k2 *= c2_128 k2 = bits.RotateLeft64(k2, 33) k2 *= c1_128 h2 ^= k2 fallthrough case 8: k1 ^= uint64(tail[7]) << 56 fallthrough case 7: k1 ^= uint64(tail[6]) << 48 fallthrough case 6: k1 ^= uint64(tail[5]) << 40 fallthrough case 5: k1 ^= uint64(tail[4]) << 32 fallthrough case 4: k1 ^= uint64(tail[3]) << 24 fallthrough case 3: k1 ^= uint64(tail[2]) << 16 fallthrough case 2: k1 ^= uint64(tail[1]) << 8 fallthrough case 1: k1 ^= uint64(tail[0]) << 0 k1 *= c1_128 k1 = bits.RotateLeft64(k1, 31) k1 *= c2_128 h1 ^= k1 } h1 ^= uint64(length) h2 ^= uint64(length) h1 += h2 h2 += h1 h1 = fmix64(h1) h2 = fmix64(h2) h1 += h2 h2 += h1 return h1, h2 } func fmix64(k uint64) uint64 { k ^= k >> 33 k *= 0xff51afd7ed558ccd k ^= k >> 33 k *= 0xc4ceb9fe1a85ec53 k ^= k >> 33 return k } // sum256 will compute 4 64-bit 
hash values from the input. // It is designed to never allocate memory on the heap. So it // works without any byte buffer whatsoever. // It is designed to be strictly equivalent to // a1 := []byte{1} // hasher := murmur3.New128() // hasher.Write(data) // #nosec // v1, v2 := hasher.Sum128() // hasher.Write(a1) // #nosec // v3, v4 := hasher.Sum128() // See TestHashRandom. func (d *digest128) sum256(data []byte) (hash1, hash2, hash3, hash4 uint64) { // We always start from zero. d.h1, d.h2 = 0, 0 // Process as many bytes as possible. d.bmix(data) // We have enough to compute the first two 64-bit numbers length := uint(len(data)) tail_length := length % block_size tail := data[length-tail_length:] hash1, hash2 = d.sum128(false, length, tail) // Next we want to 'virtually' append 1 to the input, but, // we do not want to append to an actual array!!! if tail_length+1 == block_size { // We are left with no tail!!! // Note that murmur3 is sensitive to endianess and so are we. // We assume a little endian system. Go effectively never run // on big endian systems so this is fine. word1 := *(*uint64)(unsafe.Pointer(&tail[0])) word2 := uint64(*(*uint32)(unsafe.Pointer(&tail[8]))) word2 = word2 | (uint64(tail[12]) << 32) | (uint64(tail[13]) << 40) | (uint64(tail[14]) << 48) // We append 1. word2 = word2 | (uint64(1) << 56) // We process the resulting 2 words. d.bmix_words(word1, word2) tail := data[length:] // empty slice, deliberate. hash3, hash4 = d.sum128(false, length+1, tail) } else { // We still have a tail (fewer than 15 bytes) but we // need to append '1' to it. 
hash3, hash4 = d.sum128(true, length+1, tail) } return hash1, hash2, hash3, hash4 } bloom-3.1.0/murmur_test.go000066400000000000000000000030451412533550400155440ustar00rootroot00000000000000package bloom import ( "math/rand" "testing" "github.com/spaolacci/murmur3" ) // We want to preserve backward compatibility func TestHashBasic(t *testing.T) { max_length := 1000 bigdata := make([]byte, max_length) for i := 0; i < max_length; i++ { bigdata[i] = byte(i) } for length := 0; length <= 1000; length++ { data := bigdata[:length] var d digest128 h1, h2, h3, h4 := d.sum256(data) // a1 := []byte{1} // to grab another bit of data hasher := murmur3.New128() hasher.Write(data) // #nosec v1, v2 := hasher.Sum128() hasher.Write(a1) // #nosec v3, v4 := hasher.Sum128() if v1 != h1 || v2 != h2 || v3 != h3 || v4 != h4 { t.Errorf("Backward compatibillity break.") } } } func TestDocumentation(t *testing.T) { filter := NewWithEstimates(10000, 0.01) got := filter.EstimateFalsePositiveRate(10000) if got > 0.011 || got < 0.009 { t.Errorf("Bad false positive rate %v", got) } } // We want to preserve backward compatibility func TestHashRandom(t *testing.T) { max_length := 1000 bigdata := make([]byte, max_length) for length := 0; length <= 1000; length++ { data := bigdata[:length] for trial := 1; trial < 10; trial++ { rand.Read(data) var d digest128 h1, h2, h3, h4 := d.sum256(data) // a1 := []byte{1} // to grab another bit of data hasher := murmur3.New128() hasher.Write(data) // #nosec v1, v2 := hasher.Sum128() hasher.Write(a1) // #nosec v3, v4 := hasher.Sum128() if v1 != h1 || v2 != h2 || v3 != h3 || v4 != h4 { t.Errorf("Backward compatibillity break.") } } } }