pax_global_header00006660000000000000000000000064131443274170014520gustar00rootroot0000000000000052 comment=a54fd363f33e39175a23bf6e9ac3762ac38a95cc bloom-0.2.0/000077500000000000000000000000001314432741700126275ustar00rootroot00000000000000bloom-0.2.0/.gitignore000066400000000000000000000000471314432741700146200ustar00rootroot00000000000000*.bloom.gz *.bloom build/* *.bin *.exe bloom-0.2.0/.travis.yml000066400000000000000000000016251314432741700147440ustar00rootroot00000000000000language: go sudo: false before_script: - go vet ./... script: make release deploy: provider: releases api_key: secure: t/3lERsIUmsJsSXG53yL4SAPlskwzFuVunOPILMZ6gVhiGbvW+G99a9ihGxXpyf32BSX3Jo+Jt9/JHTxJ6Kkx7rmdMzaSIIF2JRiuJ7+KzkhsB7Ngzij3iNPIZp6xLm066TiOQEYy+R0A0+fi4R522cQoIoA3+DRRHuElUs8xYCfgqoluYQ+4M5VTyzrybUzKCUpNFEpY4ESl83sdsviOW0QmApZQH1gQCoKbHdJInY9/h75419sKaT4WfplZ8X5NW0klxv1qY0WtmOrlaOzqlqWWjeHNiqJDsu2K/baReqTJUs32SkOG7BpYpA1DiFtXRz+KFkcfAOTxocRx7IMZspGdl4SWayFND+luM9+hmMJwDgE1y1iMrRCkTCy06MRc3e4Ws0KlKzeLUiMkhCxC+RPRkZxJrWmvho6w4FqyymjrE9sKG23gKoKqvecCcVjscwn7GrTCerulir43Rct3tZj8fLFSdOyOPA6ZA3aicCj+KR6r3wJtxVRk5amY9PDs7vQTxCfQbVhldCTHNhbdf5jJb6S6KcAD/5mThYghzpHPXT83pmgoFzlPIu5wMBiHjItM7dUqvPhL8HF9XM1qX+CylztdwRQJ5h49L1bX8JRLyhue019udf4F0qx58RL70FdcQ/tS4xSkleL2mdCVoD0bW/q+PuMiqaaxWhVgD0= file: - bloom_linux_amd64.bin - bloom_windows_amd64.exe on: repo: DCSO/bloom tags: true bloom-0.2.0/CHANGELOG.md000066400000000000000000000001031314432741700144320ustar00rootroot00000000000000# Changelog ## v0.1.0 (2017-06-27) - Initial open source release bloom-0.2.0/LICENSE000066400000000000000000000030261314432741700136350ustar00rootroot00000000000000Copyright (c) 2017, DCSO Deutsche Cyber-Sicherheitsorganisation GmbH All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the DCSO Deutsche Cyber-Sicherheitsorganisation GmbH nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bloom-0.2.0/Makefile000066400000000000000000000007661314432741700143000ustar00rootroot00000000000000## simple makefile to log workflow .PHONY: all test clean build install GOFLAGS ?= $(GOFLAGS:) all: install test build: @go build $(GOFLAGS) ./... install: @go get $(GOFLAGS) ./... test: install @go test -cover $(GOFLAGS) ./... bench: install @go test -run=NONE -bench=. $(GOFLAGS) ./... clean: @go clean $(GOFLAGS) -i ./... release: @go get $(GOFLAGS) ./... 
@go build -v -o bloom_linux_amd64.bin bloom/* GOOS=windows GOARCH=amd64 go build -v -o bloom_windows_amd64.exe bloom/* ## EOF bloom-0.2.0/README.md000066400000000000000000000115661314432741700141170ustar00rootroot00000000000000# Bloom ### A highly efficient bloom filter implementation for Go [![GoDoc](https://godoc.org/github.com/DCSO/bloom?status.svg)](http://godoc.org/github.com/DCSO/bloom) [![Build Status](https://travis-ci.org/DCSO/bloom.svg?branch=master)](https://travis-ci.org/DCSO/bloom) Bloom is a simple tool that provides a very efficient implementation of Bloom filters for the go language. It provides a command line tool that can be used to easily create Bloom filters with desired capacity and false positive probability. Values can be added to filters through standard input, which makes it easy to use the tool in a pipeline workflow. # Usage NAME: Bloom Filter - Utility to work with bloom filters USAGE: bloom [global options] command [command options] [arguments...] VERSION: 0.1.1 COMMANDS: create, c Create a new Bloom filter and store it in the given filename. insert, i Inserts new values into an existing Bloom filter. join, j, merge, m Joins two Bloom filters into one. check, c Checks values against an existing Bloom filter. set-data, c Sets the data associated with the Bloom filter. get-data, c Prints the data associated with the Bloom filter. show, s Shows various details about a given Bloom filter. 
help, h Shows a list of commands or help for one command GLOBAL OPTIONS: --gzip, --gz compress bloom file with gzip --interactive, -i interactively add values to the filter --split, -s split the input string --each, -e print each match of a splitted string individually --delimiter value, -d value delimiter to use for splitting (default: ",") --fields value, -f value fields of splitted output to use in filter (a single number or a comma-separated list of numbers, zero-indexed) --print-fields value, --pf value fields of splitted output to print for a successful match (a single number or a comma-separated list of numbers, zero-indexed). --help, -h show help --version, -v print the version # Examples To create a new bloom filter with a desired capacity and false positive probability, you can use the `create` command: #will create a gzipped Bloom filter with 100.000 capacity and a 0.1 % false positive probability bloom --gzip create -p 0.001 -n 100000 test.bloom.gz To insert values, you can use the `insert` command and pipe some input to it (each line will be treated as one value): cat values | bloom --gzip insert test.bloom.gz You can also interactively add values to the filter by specifying the `--interactive` command line option: bloom --gzip --interactive insert test.bloom.gz To check if a given value or a list of values is in the filter, you can use the `check` command: cat values | bloom --gzip check test.bloom.gz This will print every input value that the filter reports as (probably) present; values that are definitely not in the filter are omitted. # Advanced Usage Sometimes it is useful to attach additional information to a string that we want to check against the Bloom filter, such as a timestamp or the original line content. To make passing along this additional information easier within a shell context, the Bloom tool provides an option for splitting the input string by a given delimiter and checking the filter against the resulting field values. 
Example: # will check the Bloom filter for the values foo, bar and baz cat "foo,bar,baz" | bloom -s filter.bloom # uses a different delimiter (--magic-delimiter--) cat "foo--ac5ba--bar--ac5ba--baz" | bloom -d "--ac5ba--" -s filter.bloom # will check the Bloom filter against the second field value only cat "foo,bar,baz" | bloom -f 1 -s filter.bloom # will check the Bloom filter against the second and third field values only cat "foo,bar,baz" | bloom -f 1,2 -s filter.bloom # will print one line for each field value that matched against the filter cat "foo,bar,baz" | bloom -e -s filter.bloom # will print the last field value for each line whose fields matched against the filter cat "foo,bar,baz" | bloom -e -s --pf -1 filter.bloom This functionality is especially handy when using CSV data, as it allows you to filter CSV rows by checking individual columns against the filter without having to use external tools to split and reassemble the lines. # Installation To install the command line tool: make install To run the tests: make test To run the benchmarks: make bench # Cross-Compiling To compile a binary, simply specify the target architecture and go: #Windows, 64 bit env GOOS=windows GOARCH=amd64 go build -v -o bloom.exe github.com/DCSO/bloom #Windows, 32 bit env GOOS=windows GOARCH=i386 go build -v -o /tmp/bloom github.com/DCSO/bloombloom-0.2.0/bloom.go000066400000000000000000000156601314432741700142760ustar00rootroot00000000000000// DCSO go bloom filter // Copyright (c) 2017, DCSO GmbH //Implements a simple and highly efficient variant of the Bloom filter that uses only two hash functions. package bloom import ( "encoding/binary" "errors" "fmt" "hash/fnv" "io" "io/ioutil" "math" ) const magicSeed = "this-is-magical" // SetError represents an error with a given message related to set operations. 
type SetError struct { msg string } // BloomFilter represents a Bloom filter, a data structure for quickly checking // for set membership, with a specific desired capacity and false positive // probability. type BloomFilter struct { //bit array v []uint64 //desired maximum number of elements n uint32 //desired false positive probability p float64 //number of hash functions k uint32 //number of bits m uint32 //number of elements in the filter N uint32 //number of 64-bit integers (generated automatically) M uint32 //arbitrary data that we can attach to the filter Data []byte } // Read loads a filter from a reader object. func (s *BloomFilter) Read(input io.Reader) error { bs4 := make([]byte, 4) bs8 := make([]byte, 8) if _, err := io.ReadFull(input, bs4); err != nil { return err } s.n = binary.LittleEndian.Uint32(bs4) if _, err := io.ReadFull(input, bs8); err != nil { return err } s.p = math.Float64frombits(binary.LittleEndian.Uint64(bs8)) if _, err := io.ReadFull(input, bs4); err != nil { return err } s.k = binary.LittleEndian.Uint32(bs4) if _, err := io.ReadFull(input, bs4); err != nil { return err } s.m = binary.LittleEndian.Uint32(bs4) if _, err := io.ReadFull(input, bs4); err != nil { return err } s.N = binary.LittleEndian.Uint32(bs4) s.M = uint32(math.Ceil(float64(s.m) / 64.0)) s.v = make([]uint64, s.M) for i := uint32(0); i < s.M; i++ { n, err := io.ReadFull(input, bs8) if err != nil { return err } if n != 8 { return fmt.Errorf("Cannot read from file: %d, position: %d, %d", n, i*8, len(bs8)) } s.v[i] = binary.LittleEndian.Uint64(bs8) } b, err := ioutil.ReadAll(input) if err != nil { return err } s.Data = b return nil } // NumHashFuncs returns the number of hash functions used in the Bloom filter. func (s *BloomFilter) NumHashFuncs() uint32 { return s.k } // MaxNumElements returns the maximal supported number of elements in the Bloom // filter (capacity). 
func (s *BloomFilter) MaxNumElements() uint32 { return s.n } // NumBits returns the number of bits used in the Bloom filter. func (s *BloomFilter) NumBits() uint32 { return s.m } // FalsePositiveProb returns the chosen false positive probability for the // Bloom filter. func (s *BloomFilter) FalsePositiveProb() float64 { return s.p } // Write writes the binary representation of a Bloom filter to an io.Writer. func (s *BloomFilter) Write(output io.Writer) error { bs4 := make([]byte, 4) bs8 := make([]byte, 8) binary.LittleEndian.PutUint32(bs4, s.n) output.Write(bs4) binary.LittleEndian.PutUint64(bs8, math.Float64bits(s.p)) output.Write(bs8) binary.LittleEndian.PutUint32(bs4, s.k) output.Write(bs4) binary.LittleEndian.PutUint32(bs4, s.m) output.Write(bs4) binary.LittleEndian.PutUint32(bs4, s.N) output.Write(bs4) for i := uint32(0); i < s.M; i++ { binary.LittleEndian.PutUint64(bs8, s.v[i]) n, err := output.Write(bs8) if n != 8 { return errors.New("Cannot write to file!") } if err != nil { return err } } if s.Data != nil { output.Write(s.Data) } return nil } // Reset clears the Bloom filter of all elements. func (s *BloomFilter) Reset() { for i := uint32(0); i < s.M; i++ { s.v[i] = 0 } s.N = 0 } // Fingerprint returns the fingerprint of a given value, as an array of index // values. func (s *BloomFilter) Fingerprint(value []byte, fingerprint []uint32) { hashValue1 := fnv.New64() hashValue2 := fnv.New64() hashValue1.Write(value) hashValue2.Write(value) hashValue2.Write([]byte(magicSeed)) h1 := hashValue1.Sum64() h2 := hashValue2.Sum64() for i := uint32(0); i < s.k; i++ { fingerprint[i] = uint32((h1 + (uint64(i)+1)*h2) % uint64(s.m)) } } // Add adds a byte array element to the Bloom filter. 
func (s *BloomFilter) Add(value []byte) { var k, l uint32 newValue := false fingerprint := make([]uint32, s.k) s.Fingerprint(value, fingerprint) for i := uint32(0); i < s.k; i++ { k = uint32(fingerprint[i] / 64) l = uint32(fingerprint[i] % 64) v := uint64(1 << l) if (s.v[k]&v) == 0 { newValue = true } s.v[k] |= v } if newValue { s.N++ } } // Join adds the items of another Bloom filter with identical dimensions to // the receiver. That is, all elements that are described in the // second filter will also described by the receiver, and the number of elements // of the receiver will grow by the number of elements in the added filter. // Note that it is implicitly assumed that both filters are disjoint! Otherwise // the number of elements in the joined filter must _only_ be considered an // upper bound and not an exact value! // Joining two differently dimensioned filters may yield unexpected results and // hence is not allowed. An error will be returned in this case, and the // receiver will be left unaltered. func (s *BloomFilter) Join(s2 *BloomFilter) error { var i uint32 if s.n != s2.n { return fmt.Errorf("filters have different dimensions (n = %d vs. %d))", s.n, s2.n) } if s.p != s2.p { return fmt.Errorf("filters have different dimensions (p = %f vs. %f))", s.p, s2.p) } if s.k != s2.k { return fmt.Errorf("filters have different dimensions (k = %d vs. %d))", s.k, s2.k) } if s.m != s2.m { return fmt.Errorf("filters have different dimensions (m = %d vs. %d))", s.m, s2.m) } if s.M != s2.M { return fmt.Errorf("filters have different dimensions (M = %d vs. %d))", s.M, s2.M) } for i = 0; i < s.M; i++ { s.v[i] |= s2.v[i] } if s.N+s2.N < s.N { return fmt.Errorf("addition of member counts would overflow") } else { s.N += s2.N } return nil } // Check returns true if the given value may be in the Bloom filter, false if it // is definitely not in it. 
func (s *BloomFilter) Check(value []byte) bool { fingerprint := make([]uint32, s.k) s.Fingerprint(value, fingerprint) return s.CheckFingerprint(fingerprint) } // CheckFingerprint returns true if the given fingerprint occurs in the Bloom // filter, false if it does not. func (s *BloomFilter) CheckFingerprint(fingerprint []uint32) bool { var k, l uint32 for i := uint32(0); i < s.k; i++ { k = uint32(fingerprint[i] / 64) l = uint32(fingerprint[i] % 64) if (s.v[k] & (1 << l)) == 0 { return false } } return true } // Initialize returns a new, empty Bloom filter with the given capacity (n) // and FP probability (p). func Initialize(n uint32, p float64) BloomFilter { var bf BloomFilter bf.n = n bf.p = p bf.m = uint32(math.Abs(math.Ceil(float64(n) * math.Log(float64(p)) / (math.Pow(math.Log(2.0), 2.0))))) bf.M = uint32(math.Ceil(float64(bf.m) / 64.0)) bf.k = uint32(math.Ceil(math.Log(2) * float64(bf.m) / float64(n))) bf.v = make([]uint64, bf.M) return bf } bloom-0.2.0/bloom/000077500000000000000000000000001314432741700137375ustar00rootroot00000000000000bloom-0.2.0/bloom/manage.go000066400000000000000000000302021314432741700155130ustar00rootroot00000000000000// DCSO go bloom filter // Copyright (c) 2017, DCSO GmbH package main import ( "bytes" "bufio" "fmt" "github.com/DCSO/bloom" "gopkg.in/urfave/cli.v1" "os" "path/filepath" "strconv" "strings" ) // BloomParams represents the parameters of the 'bloom' command line tool. 
type BloomParams struct { gzip bool interactive bool split bool printEachMatch bool delimiter string fields []int printFields []int } func exitWithError(message string) { fmt.Fprintf(os.Stderr, "Error: %s \n", message) os.Exit(-1) } func readValuesIntoFilter(filter *bloom.BloomFilter, bloomParams BloomParams) { //we determine if the program is run interactively or within a pipe stat, _ := os.Stdin.Stat() var isTerminal = (stat.Mode() & os.ModeCharDevice) != 0 //if we are not in an interactive session and this is a terminal, we quit if !bloomParams.interactive && isTerminal { return } if bloomParams.interactive { fmt.Println("Interactive mode: Enter a blank line [by pressing ENTER] to exit (values will not be stored otherwise).") } scanner := bufio.NewScanner(os.Stdin) for scanner.Scan() { line := scanner.Text() if line == "" && bloomParams.interactive { break } if bloomParams.split { values := strings.Split(line, bloomParams.delimiter) for i, value := range values { j := i - len(values) if len(bloomParams.fields) > 0 { if !contains(bloomParams.fields, i) && !contains(bloomParams.fields, j) { continue } } filter.Add([]byte(value)) } } else { filter.Add([]byte(line)) } } } func readInputIntoData(filter *bloom.BloomFilter, bloomParams BloomParams) { //we determine if the program is run interactively or within a pipe stat, _ := os.Stdin.Stat() var isTerminal = (stat.Mode() & os.ModeCharDevice) != 0 //if we are not in an interactive session and this is a terminal, we quit if !bloomParams.interactive && isTerminal { return } if bloomParams.interactive { fmt.Println("Interactive mode: Enter a blank line [by pressing ENTER] to exit (values will not be stored otherwise).") } scanner := bufio.NewScanner(os.Stdin) dataBuffer := bytes.NewBuffer([]byte("")) for scanner.Scan() { line := scanner.Bytes() if len(line) == 0 && bloomParams.interactive { break } dataBuffer.Write(line) dataBuffer.Write([]byte("\n")) } filter.Data = dataBuffer.Bytes() } func insertIntoFilter(path 
string, bloomParams BloomParams) { filter, err := bloom.LoadFilter(path, bloomParams.gzip) if err != nil { exitWithError(err.Error()) } readValuesIntoFilter(filter, bloomParams) err = bloom.WriteFilter(filter, path, bloomParams.gzip) if err != nil { exitWithError(err.Error()) } } func updateFilterData(path string, bloomParams BloomParams) { filter, err := bloom.LoadFilter(path, bloomParams.gzip) if err != nil { exitWithError(err.Error()) } readInputIntoData(filter, bloomParams) err = bloom.WriteFilter(filter, path, bloomParams.gzip) if err != nil { exitWithError(err.Error()) } } func getFilterData(path string, bloomParams BloomParams) { filter, err := bloom.LoadFilter(path, bloomParams.gzip) if err != nil { exitWithError(err.Error()) } fmt.Print(string(filter.Data)) } func contains(s []int, e int) bool { for _, a := range s { if a == e { return true } } return false } func checkAgainstFilter(path string, bloomParams BloomParams) { filter, err := bloom.LoadFilter(path, bloomParams.gzip) if err != nil { exitWithError(err.Error()) } scanner := bufio.NewScanner(os.Stdin) if bloomParams.interactive { fmt.Println("Interactive mode: Enter a blank line [by pressing ENTER] to exit.") } for scanner.Scan() { line := scanner.Text() if line == "" && bloomParams.interactive { break } var valuesToCheck []string if bloomParams.split { valuesToCheck = strings.Split(line, bloomParams.delimiter) } else { valuesToCheck = make([]string, 1) valuesToCheck[0] = line } printed := false prefix := "" if bloomParams.interactive { prefix = ">" } for i, value := range valuesToCheck { j := i - len(valuesToCheck) //we only check fields that are in the "fields" parameters (if defined) if len(bloomParams.fields) > 0 { if !contains(bloomParams.fields, i) && !contains(bloomParams.fields, j) { continue } } if filter.Check([]byte(value)) { if bloomParams.printEachMatch { fmt.Printf("%s%s\n", prefix, value) } else { if !printed { if len(bloomParams.printFields) > 0 { values := make([]string, 0, 
len(bloomParams.printFields)) for _, i := range bloomParams.printFields { j := i if j < 0 { j = j + len(valuesToCheck) } if j >= len(valuesToCheck) || j < 0 { continue } values = append(values, valuesToCheck[j]) } fmt.Printf("%s%s\n", prefix, strings.Join(values, bloomParams.delimiter)) } else { fmt.Printf("%s%s\n", prefix, line) } } printed = true } } } } } func printStats(path string, bloomParams BloomParams) { filter, err := bloom.LoadFilter(path, bloomParams.gzip) if err != nil { exitWithError(err.Error()) } fmt.Printf("File:\t\t\t%s\n", path) fmt.Printf("Capacity:\t\t%d\n", filter.MaxNumElements()) fmt.Printf("Elements present:\t%d\n", filter.N) fmt.Printf("FP probability:\t\t%f\n", filter.FalsePositiveProb()) fmt.Printf("Bits:\t\t\t%d\n", filter.NumBits()) fmt.Printf("Hash functions:\t\t%d\n", filter.NumHashFuncs()) } func createFilter(path string, n uint32, p float64, bloomParams BloomParams) { filter := bloom.Initialize(n, p) readValuesIntoFilter(&filter, bloomParams) err := bloom.WriteFilter(&filter, path, bloomParams.gzip) if err != nil { exitWithError(err.Error()) } } func joinFilters(path string, pathToAdd string, bloomParams BloomParams) { filter, err := bloom.LoadFilter(path, bloomParams.gzip) if err != nil { exitWithError(err.Error()) } filter2, err := bloom.LoadFilter(pathToAdd, bloomParams.gzip) if err != nil { exitWithError(err.Error()) } err = filter.Join(filter2) if err != nil { exitWithError(err.Error()) } err = bloom.WriteFilter(filter, path, bloomParams.gzip) if err != nil { exitWithError(err.Error()) } } func parseFieldIndexes(s string) ([]int, error) { fields := strings.Split(s, ",") fieldNumbers := make([]int, len(fields)) for i, field := range fields { num, err := strconv.Atoi(field) if err != nil { return nil, err } fieldNumbers[i] = num } return fieldNumbers, nil } func parseBloomParams(c *cli.Context) BloomParams { var bloomParams BloomParams var err error bloomParams.gzip = c.GlobalBool("gzip") bloomParams.interactive = 
c.GlobalBool("interactive") bloomParams.split = c.GlobalBool("split") bloomParams.delimiter = c.GlobalString("delimiter") bloomParams.printEachMatch = c.GlobalBool("each") if c.GlobalString("fields") != "" { bloomParams.fields, err = parseFieldIndexes(c.GlobalString("fields")) if err != nil { exitWithError(err.Error()) } } if c.GlobalString("print-fields") != "" { bloomParams.printFields, err = parseFieldIndexes(c.GlobalString("print-fields")) if err != nil { exitWithError(err.Error()) } //if printFields is set we also set printEachMatch if len(bloomParams.printFields) > 0 { bloomParams.printEachMatch = false } } return bloomParams } func main() { app := cli.NewApp() app.Name = "Bloom Filter" app.Usage = "Utility to work with bloom filters" app.Flags = []cli.Flag{ cli.BoolFlag{ Name: "gzip, gz", Usage: "compress bloom file with gzip", }, cli.BoolFlag{ Name: "interactive, i", Usage: "interactively add values to the filter", }, cli.BoolFlag{ Name: "split, s", Usage: "split the input string", }, cli.BoolFlag{ Name: "each, e", Usage: "print each match of a splitted string individually", }, cli.StringFlag{ Name: "delimiter, d", Value: ",", Usage: "delimiter to use for splitting", }, cli.StringFlag{ Name: "fields, f", Value: "", Usage: "fields of splitted output to use in filter (a single number or a comma-separated list of numbers, zero-indexed)", }, cli.StringFlag{ Name: "print-fields, pf", Value: "", Usage: "fields of splitted output to print for a successful match (a single number or a comma-separated list of numbers, zero-indexed).", }, } app.Commands = []cli.Command{ { Name: "create", Aliases: []string{"c"}, Flags: []cli.Flag{ cli.Float64Flag{Name: "p", Value: 0.01, Usage: "The desired false positive probability."}, cli.IntFlag{Name: "n", Value: 10000, Usage: "The desired capacity."}, }, Usage: "Create a new Bloom filter and store it in the given filename.", Action: func(c *cli.Context) error { path := c.Args().First() bloomParams := parseBloomParams(c) if path == 
"" { exitWithError("No filename given.") } path, err := filepath.Abs(path) if err != nil { return err } n := c.Int("n") p := c.Float64("p") if n < 0 { exitWithError("n cannot be negative.") } if p < 0 || p > 1 { exitWithError("p must be between 0 and 1.") } createFilter(path, uint32(n), p, bloomParams) return nil }, }, { Name: "insert", Aliases: []string{"i"}, Flags: []cli.Flag{}, Usage: "Inserts new values into an existing Bloom filter.", Action: func(c *cli.Context) error { path := c.Args().First() bloomParams := parseBloomParams(c) if path == "" { exitWithError("No filename given.") } path, err := filepath.Abs(path) if err != nil { return err } insertIntoFilter(path, bloomParams) return nil }, }, { Name: "join", Aliases: []string{"j", "merge", "m"}, Flags: []cli.Flag{}, Usage: "Joins two Bloom filters into one.", Action: func(c *cli.Context) error { if len(c.Args()) != 2 { exitWithError("Two filenames are required.") } bloomParams := parseBloomParams(c) path := c.Args().First() if path == "" { exitWithError("No first filename given.") } path, err := filepath.Abs(path) if err != nil { return err } pathToAdd := c.Args().Get(1) if pathToAdd == "" { exitWithError("No second filename given.") } pathToAdd, err = filepath.Abs(pathToAdd) if err != nil { return err } joinFilters(path, pathToAdd, bloomParams) return nil }, }, { Name: "check", Aliases: []string{"c"}, Flags: []cli.Flag{}, Usage: "Checks values against an existing Bloom filter.", Action: func(c *cli.Context) error { path := c.Args().First() bloomParams := parseBloomParams(c) if path == "" { exitWithError("No filename given.") } path, err := filepath.Abs(path) if err != nil { return err } checkAgainstFilter(path, bloomParams) return nil }, }, { Name: "set-data", Aliases: []string{"c"}, Flags: []cli.Flag{}, Usage: "Sets the data associated with the Bloom filter.", Action: func(c *cli.Context) error { path := c.Args().First() bloomParams := parseBloomParams(c) if path == "" { exitWithError("No filename given.") 
} path, err := filepath.Abs(path) if err != nil { return err } updateFilterData(path, bloomParams) return nil }, }, { Name: "get-data", Aliases: []string{"c"}, Flags: []cli.Flag{}, Usage: "Prints the data associated with the Bloom filter.", Action: func(c *cli.Context) error { path := c.Args().First() bloomParams := parseBloomParams(c) if path == "" { exitWithError("No filename given.") } path, err := filepath.Abs(path) if err != nil { return err } getFilterData(path, bloomParams) return nil }, }, { Name: "show", Aliases: []string{"s"}, Flags: []cli.Flag{}, Usage: "Shows various details about a given Bloom filter.", Action: func(c *cli.Context) error { path := c.Args().First() bloomParams := parseBloomParams(c) if path == "" { exitWithError("No filename given.") } path, err := filepath.Abs(path) if err != nil { return err } printStats(path, bloomParams) return nil }, }, } app.Version = "0.1.1" app.Run(os.Args) } bloom-0.2.0/bloom_test.go000066400000000000000000000246001314432741700153270ustar00rootroot00000000000000// DCSO go bloom filter // Copyright (c) 2017, DCSO GmbH package bloom import ( "bytes" "io/ioutil" "log" "math" "math/rand" "os" "path/filepath" "strings" "testing" ) func TestInitialization(t *testing.T) { filter := Initialize(10000, 0.001) if filter.k != 10 { t.Error("k does not match expectation!") } if filter.m != 143775 { t.Error("m does not match expectation: ", filter.m) } if filter.M != uint32(math.Ceil(float64(filter.m)/64)) { t.Error("M does not match expectation: ", filter.M) } for i := uint32(0); i < filter.M; i++ { if filter.v[i] != 0 { t.Error("Filter value is not initialized to zero!") } } } func checkFilters(a BloomFilter, b BloomFilter, t *testing.T) bool { if b.n != a.n || b.p != a.p || b.k != a.k || b.m != a.m || b.M != a.M || bytes.Compare(b.Data, a.Data) != 0 { return false } for i := uint32(0); i < a.M; i++ { if a.v[i] != b.v[i] { return false } } return true } func serializeToBuffer(filter BloomFilter) (*BloomFilter, error) { var 
buf bytes.Buffer filter.Write(&buf) var newFilter BloomFilter newFilter.Read(&buf) return &newFilter, nil } func serializeToDisk(filter BloomFilter) (*BloomFilter, error) { tempFile, err := ioutil.TempFile("", "filter") if err != nil { return nil, err } defer os.Remove(tempFile.Name()) filter.Write(tempFile) tempFile.Sync() tempFile.Seek(0, 0) var newFilter BloomFilter err = newFilter.Read(tempFile) if err != nil { return nil, err } return &newFilter, nil } func TestSerialization(t *testing.T) { capacity := uint32(100000) p := float64(0.01) samples := uint32(1000) filter, _ := GenerateExampleFilter(capacity, p, samples) newFilter, err := serializeToBuffer(filter) if err != nil { t.Error("Cannot serialize filter to buffer!") return } if !checkFilters(filter, *newFilter, t) { t.Error("Filters do not match!") } newFilter, err = serializeToDisk(filter) if err != nil { t.Error("Cannot serialize filter to file!") return } if !checkFilters(filter, *newFilter, t) { t.Error("Filters do not match!") } filter.Add(GenerateTestValue(100)) newFilter.Add(GenerateTestValue(100)) newFilter, err = serializeToDisk(filter) if err != nil { t.Error("Cannot serialize filter to disk!") return } if !checkFilters(filter, *newFilter, t) { t.Error("Filters do not match!") } filter.Add(GenerateTestValue(100)) newFilter.Add(GenerateTestValue(100)) newFilter, err = serializeToDisk(filter) if err != nil { t.Error("Cannot serialize filter to disk!") return } if !checkFilters(filter, *newFilter, t) { t.Error("Filters do not match!") } checkFilters(filter, *newFilter, t) } func TestSerializationToDisk(t *testing.T) { capacity := uint32(100000) p := float64(0.001) samples := uint32(1000) filter, _ := GenerateExampleFilter(capacity, p, samples) var buf bytes.Buffer filter.Write(&buf) var newFilter BloomFilter newFilter.Read(&buf) checkFilters(filter, newFilter, t) } func TestSerializationWriteFail(t *testing.T) { capacity := uint32(100000) p := float64(0.001) samples := uint32(1000) filter, _ := 
GenerateExampleFilter(capacity, p, samples) dir, err := ioutil.TempDir("", "bloomtest") if err != nil { log.Fatal(err) } defer os.RemoveAll(dir) tmpfn := filepath.Join(dir, "tmpfile") tmpfile, err := os.OpenFile(tmpfn, os.O_CREATE|os.O_RDONLY, 0000) if err != nil { t.Fatal(err) } defer tmpfile.Close() err = filter.Write(tmpfile) if err == nil { t.Error("writing to read-only file should fail") } } func TestSerializationReadFail(t *testing.T) { var newFilter BloomFilter dir, err := ioutil.TempDir("", "bloomtest") if err != nil { log.Fatal(err) } defer os.RemoveAll(dir) tmpfn := filepath.Join(dir, "tmpfile") tmpfile, err := os.OpenFile(tmpfn, os.O_CREATE, 0777) if err != nil { t.Fatal(err) } defer tmpfile.Close() err = newFilter.Read(tmpfile) if err == nil { t.Error("reading from empty file should fail") } } func GenerateTestValue(length uint32) []byte { value := make([]byte, length) for i := uint32(0); i < length; i++ { value[i] = byte(rand.Int() % 256) } return value } func GenerateExampleFilter(capacity uint32, p float64, samples uint32) (BloomFilter, [][]byte) { filter := Initialize(capacity, p) filter.Data = []byte("foobar") testValues := make([][]byte, 0, samples) for i := uint32(0); i < samples; i++ { testValue := GenerateTestValue(100) testValues = append(testValues, testValue) filter.Add(testValue) } return filter, testValues } func GenerateDisjointExampleFilter(capacity uint32, p float64, samples uint32, other BloomFilter) (BloomFilter, [][]byte) { filter := Initialize(capacity, p) testValues := make([][]byte, 0, samples) for i := uint32(0); i < samples; { testValue := GenerateTestValue(100) if !other.Check(testValue) { testValues = append(testValues, testValue) filter.Add(testValue) i++ } } return filter, testValues } //This tests the checking of values against a given filter func TestChecking(t *testing.T) { capacity := uint32(100000) p := float64(0.001) samples := uint32(100000) filter, testValues := GenerateExampleFilter(capacity, p, samples) fingerprint 
:= make([]uint32, filter.k) for _, value := range testValues { filter.Fingerprint(value, fingerprint) if !filter.CheckFingerprint(fingerprint) { t.Error("Did not find test value in filter!") } } } //This tests the checking of values against a given filter after resetting it func TestReset(t *testing.T) { capacity := uint32(100000) p := float64(0.001) samples := uint32(100000) filter, testValues := GenerateExampleFilter(capacity, p, samples) filter.Reset() fingerprint := make([]uint32, filter.k) for _, value := range testValues { filter.Fingerprint(value, fingerprint) if filter.CheckFingerprint(fingerprint) { t.Error("Did not find test value in filter!") } } } //This tests the checking of values against a given filter //see https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives func TestFalsePositives(t *testing.T) { capacity := uint32(10000) p := float64(0.001) fillingFactor := 0.9 N := uint32(float64(capacity) * fillingFactor) filter, _ := GenerateExampleFilter(capacity, p, N) pAcceptable := math.Pow(1-math.Exp(-float64(filter.k)*float64(N)/float64(filter.m)), float64(filter.k)) fingerprint := make([]uint32, filter.k) cnt := 0.0 matches := 0.0 for { cnt++ value := GenerateTestValue(100) filter.Fingerprint(value, fingerprint) if filter.CheckFingerprint(fingerprint) { matches++ } if cnt > float64(capacity)*10 { break } } //this might still fail sometimes... 
//we allow for a probability that is two times higher than the normally acceptable probability if matches/cnt > pAcceptable*2 { t.Error("False positive probability is too high at ", matches/cnt*100, "% vs ", pAcceptable*100, "%") } } func TestJoiningRegularMisdimensioned(t *testing.T) { a := Initialize(100000, 0.0001) b := Initialize(10000, 0.0001) err := a.Join(&b) if err == nil { t.Error("joining filters with different capacity should fail") } if !strings.Contains(err.Error(), "different dimensions") { t.Error("wrong error message returned") } a = Initialize(100000, 0.0001) b = Initialize(100000, 0.001) err = a.Join(&b) if err == nil { t.Error("joining filters with different FP prob should fail") } if !strings.Contains(err.Error(), "different dimensions") { t.Error("wrong error message returned") } a = Initialize(100000, 0.0001) b = Initialize(100000, 0.0001) b.k = 1 err = a.Join(&b) if err == nil { t.Error("joining filters with different number of hash funcs should fail") } if !strings.Contains(err.Error(), "different dimensions") { t.Error("wrong error message returned") } a = Initialize(100000, 0.0001) b = Initialize(100000, 0.0001) b.m = 1 err = a.Join(&b) if err == nil { t.Error("joining filters with different number of bits should fail") } if !strings.Contains(err.Error(), "different dimensions") { t.Error("wrong error message returned") } a = Initialize(100000, 0.0001) b = Initialize(100000, 0.0001) b.M = 1 err = a.Join(&b) if err == nil { t.Error("joining filters with different int array size should fail") } if !strings.Contains(err.Error(), "different dimensions") { t.Error("wrong error message returned") } } func TestAccessors(t *testing.T) { a, _ := GenerateExampleFilter(100000, 0.0001, 10000) if a.MaxNumElements() != 100000 { t.Error("unexpected capacity in filter") } if a.NumBits() != 1917011 { t.Error("unexpected number of bits in filter") } if a.NumHashFuncs() != 14 { t.Error("unexpected number of hash funcs in filter") } if a.FalsePositiveProb() 
!= 0.0001 { t.Error("unexpected FP prob in filter") } } func TestJoiningRegular(t *testing.T) { a, aval := GenerateExampleFilter(100000, 0.0001, 10000) b, bval := GenerateDisjointExampleFilter(100000, 0.0001, 20000, a) for _, v := range bval { if a.Check(v) { t.Errorf("value not missing in joined filter: %s", string(v)) } } if a.N != 10000 { t.Error("unexpected number of elements in filter") } if b.N != 20000 { t.Error("unexpected number of elements in filter") } err := a.Join(&b) if a.N != 30000 { t.Errorf("unexpected number of elements in filter") } if err != nil { t.Fatal(err) } for _, v := range aval { if !a.Check(v) { t.Errorf("value not found in joined filter: %s", string(v)) } } for _, v := range bval { if !a.Check(v) { t.Errorf("value not found in joined filter: %s", string(v)) } } } //This benchmarks the checking of values against a given filter func BenchmarkChecking(b *testing.B) { capacity := uint32(1e9) p := float64(0.001) samples := uint32(100000) filter, testValues := GenerateExampleFilter(capacity, p, samples) fingerprint := make([]uint32, filter.k) b.ResetTimer() for i := 0; i < b.N; i++ { value := testValues[rand.Int()%len(testValues)] filter.Fingerprint(value, fingerprint) if !filter.CheckFingerprint(fingerprint) { b.Error("Did not find test value in filter!") } } } //This benchmarks the checking without using a fixed fingerprint variable (instead a temporary variable is created each time) func BenchmarkSimpleChecking(b *testing.B) { capacity := uint32(1e9) p := float64(0.001) samples := uint32(100000) filter, testValues := GenerateExampleFilter(capacity, p, samples) b.ResetTimer() for i := 0; i < b.N; i++ { value := testValues[rand.Int()%len(testValues)] if !filter.Check(value) { b.Error("Did not find test value in filter!") } } } bloom-0.2.0/io.go000066400000000000000000000043151314432741700135700ustar00rootroot00000000000000// DCSO go bloom filter // Copyright (c) 2017, DCSO GmbH package bloom import ( "bufio" "bytes" gz "compress/gzip" "io" 
"os" ) // LoadFromBytes reads a binary Bloom filter representation from a byte array // and returns a BloomFilter struct pointer based on it. // If 'gzip' is true, then compressed input will be expected. func LoadFromBytes(input []byte, gzip bool) (*BloomFilter, error) { return LoadFromReader(bytes.NewReader(input), gzip) } // LoadFilter reads a binary Bloom filter representation from a file // and returns a BloomFilter struct pointer based on it. // If 'gzip' is true, then compressed input will be expected. func LoadFilter(path string, gzip bool) (*BloomFilter, error) { file, err := os.Open(path) if err != nil { return nil, err } defer file.Close() return LoadFromReader(file, gzip) } // LoadFromReader reads a binary Bloom filter representation from an io.Reader // and returns a BloomFilter struct pointer based on it. // If 'gzip' is true, then compressed input will be expected. func LoadFromReader(inReader io.Reader, gzip bool) (*BloomFilter, error) { var err error var reader io.Reader var gzipReader *gz.Reader var ioReader *bufio.Reader if gzip { gzipReader, err = gz.NewReader(inReader) if err != nil { return nil, err } defer gzipReader.Close() reader = gzipReader } else { ioReader = bufio.NewReader(inReader) reader = ioReader } var filter BloomFilter if err = filter.Read(reader); err != nil { return nil, err } return &filter, nil } // WriteFilter writes a binary Bloom filter representation for a given struct // to a file. If 'gzip' is true, then a compressed file will be written. 
func WriteFilter(filter *BloomFilter, path string, gzip bool) error { file, err := os.Create(path) if err != nil { return err } defer file.Close() file.Seek(0, 0) var writer io.Writer var gzipWriter *gz.Writer var ioWriter *bufio.Writer if gzip { gzipWriter = gz.NewWriter(file) defer gzipWriter.Close() writer = gzipWriter } else { ioWriter = bufio.NewWriter(file) writer = ioWriter } err = filter.Write(writer) if err != nil { return err } if gzip { gzipWriter.Flush() } else { ioWriter.Flush() } file.Sync() return nil } bloom-0.2.0/io_test.go000066400000000000000000000036561314432741700146360ustar00rootroot00000000000000// DCSO go bloom filter // Copyright (c) 2017, DCSO GmbH package bloom import ( "io/ioutil" "os" "testing" ) func checkResults(t *testing.T, bf *BloomFilter) { for _, v := range []string{"foo", "bar", "baz"} { if !bf.Check([]byte(v)) { t.Fatalf("value %s expected in filter but wasn't found", v) } } if bf.Check([]byte("")) { t.Fatal("empty value not expected in filter but was found") } if bf.Check([]byte("12345")) { t.Fatal("missing value not expected in filter but was found") } } func TestFromReaderFile(t *testing.T) { f, err := os.Open("testdata/test.bloom") if err != nil { t.Fatal(err) } defer f.Close() bf, err := LoadFromReader(f, false) if err != nil { t.Fatal(err) } checkResults(t, bf) } func testFromSerialized(t *testing.T, gzip bool) { bf := Initialize(100, 0.0001) for _, v := range []string{"foo", "bar", "baz"} { bf.Add([]byte(v)) } tmpfile, err := ioutil.TempFile("", "test") if err != nil { t.Fatal(err) } defer os.Remove(tmpfile.Name()) err = WriteFilter(&bf, tmpfile.Name(), gzip) if err != nil { t.Fatal(err) } loadedBf, err := LoadFilter(tmpfile.Name(), gzip) if err != nil { t.Fatal(err) } checkResults(t, loadedBf) } func TestFromSerialized(t *testing.T) { testFromSerialized(t, false) } func TestFromSerializedZip(t *testing.T) { testFromSerialized(t, true) } func TestFromReaderFileZip(t *testing.T) { f, err := 
os.Open("testdata/test.bloom.gz") if err != nil { t.Fatal(err) } defer f.Close() bf, err := LoadFromReader(f, true) if err != nil { t.Fatal(err) } checkResults(t, bf) } func TestFromBytes(t *testing.T) { testBytes, err := ioutil.ReadFile("testdata/test.bloom") if err != nil { t.Fatal(err) } bf, err := LoadFromBytes(testBytes, false) if err != nil { t.Fatal(err) } checkResults(t, bf) } func TestFromFile(t *testing.T) { bf, err := LoadFilter("testdata/test.bloom", false) if err != nil { t.Fatal(err) } checkResults(t, bf) } bloom-0.2.0/testdata/000077500000000000000000000000001314432741700144405ustar00rootroot00000000000000bloom-0.2.0/testdata/test-input.txt000066400000000000000000000000141314432741700173100ustar00rootroot00000000000000foo bar baz bloom-0.2.0/testdata/test.bloom000066400000000000000000000273501314432741700164600ustar00rootroot00000000000000'{Gz?jv@@@@@@@bloom-0.2.0/testdata/test.bloom.gz000066400000000000000000000002461314432741700170720ustar00rootroot00000000000000cYر @ aCBI2FܱK MNlֻ WUg򹹛nC;̞]aW@*uMusP WNkClA/aRUKCÿ@"\`;>ހǖ]F ouՆVZ~NC.