pax_global_header00006660000000000000000000000064120671014370014513gustar00rootroot0000000000000052 comment=abfccef7929f28a0dec4e865bb68a1181330c56c python-bloomfilter-2.0/000077500000000000000000000000001206710143700152115ustar00rootroot00000000000000python-bloomfilter-2.0/.gitignore000066400000000000000000000000421206710143700171750ustar00rootroot00000000000000*.egg *.pyc *.egg-info build dist python-bloomfilter-2.0/CHANGES.txt000066400000000000000000000006501206710143700170230ustar00rootroot00000000000000Changes in 2.0 ============== Made major corrections to the algorithms for both BloomFilter and ScalableBloomFilter. Not numerically compatible with serialized representations of filters from previous versions. Specifically, BloomFilter was more accurate than requested and ScalableBloomFilter was much less accurate than requested. Changes in 1.1 ============== Added copy, intersection and union functions to BloomFilter python-bloomfilter-2.0/LICENSE.txt000066400000000000000000000020611206710143700170330ustar00rootroot00000000000000Copyright (c) <2011> Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.python-bloomfilter-2.0/MANIFEST.in000066400000000000000000000000241206710143700167430ustar00rootroot00000000000000include ez_setup.py python-bloomfilter-2.0/README.txt000066400000000000000000000032061206710143700167100ustar00rootroot00000000000000pybloom is a module that includes a Bloom Filter data structure along with an implmentation of Scalable Bloom Filters as discussed in: P. Almeida, C.Baquero, N. Preguiça, D. Hutchison, Scalable Bloom Filters, (GLOBECOM 2007), IEEE, 2007. Bloom filters are great if you understand what amount of bits you need to set aside early to store your entire set. Scalable Bloom Filters allow your bloom filter bits to grow as a function of false positive probability and size. A filter is "full" when at capacity: M * ((ln 2 ^ 2) / abs(ln p)), where M is the number of bits and p is the false positive probability. When capacity is reached a new filter is then created exponentially larger than the last with a tighter probability of false positives and a larger number of hash functions. >>> from pybloom import BloomFilter >>> f = BloomFilter(capacity=1000, error_rate=0.001) >>> [f.add(x) for x in range(10)] [False, False, False, False, False, False, False, False, False, False] >>> all([(x in f) for x in range(10)]) True >>> 10 in f False >>> 5 in f True >>> f = BloomFilter(capacity=1000, error_rate=0.001) >>> for i in xrange(0, f.capacity): ... _ = f.add(i) >>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18 True >>> from pybloom import ScalableBloomFilter >>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH) >>> count = 10000 >>> for i in xrange(0, count): ... _ = sbf.add(i) ... >>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18 True # len(sbf) may not equal the entire input length. 0.01% error is well # below the default 0.1% error threshold. As the capacity goes up, the # error will approach 0.1%. python-bloomfilter-2.0/ez_setup.py000066400000000000000000000227551206710143700174340ustar00rootroot00000000000000#!python """Bootstrap setuptools installation If you want to use setuptools in your package's setup.py, just include this file in the same directory with it, and add this to the top of your setup.py:: from ez_setup import use_setuptools use_setuptools() If you want to require a specific version of setuptools, set a download mirror, or use an alternate download directory, you can do so by supplying the appropriate options to ``use_setuptools()``. This file can also be run as a script to install or upgrade setuptools. """ import sys DEFAULT_VERSION = "0.6c9" DEFAULT_URL = "http://pypi.python.org/packages/%s/s/setuptools/" % sys.version[:3] md5_data = { 'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca', 'setuptools-0.6b1-py2.4.egg': 'b79a8a403e4502fbb85ee3f1941735cb', 'setuptools-0.6b2-py2.3.egg': '5657759d8a6d8fc44070a9d07272d99b', 'setuptools-0.6b2-py2.4.egg': '4996a8d169d2be661fa32a6e52e4f82a', 'setuptools-0.6b3-py2.3.egg': 'bb31c0fc7399a63579975cad9f5a0618', 'setuptools-0.6b3-py2.4.egg': '38a8c6b3d6ecd22247f179f7da669fac', 'setuptools-0.6b4-py2.3.egg': '62045a24ed4e1ebc77fe039aa4e6f7e5', 'setuptools-0.6b4-py2.4.egg': '4cb2a185d228dacffb2d17f103b3b1c4', 'setuptools-0.6c1-py2.3.egg': 'b3f2b5539d65cb7f74ad79127f1a908c', 'setuptools-0.6c1-py2.4.egg': 'b45adeda0667d2d2ffe14009364f2a4b', 'setuptools-0.6c2-py2.3.egg': 'f0064bf6aa2b7d0f3ba0b43f20817c27', 'setuptools-0.6c2-py2.4.egg': '616192eec35f47e8ea16cd6a122b7277', 'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa', 'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e', 'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e', 'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f', 'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2', 'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc', 'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167', 'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64', 'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d', 'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20', 'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab', 'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53', 'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2', 'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e', 'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372', 'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902', 'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de', 'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b', 'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03', 'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a', 'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6', 'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a', } import sys, os try: from hashlib import md5 except ImportError: from md5 import md5 def _validate_md5(egg_name, data): if egg_name in md5_data: digest = md5(data).hexdigest() if digest != md5_data[egg_name]: print >>sys.stderr, ( "md5 validation of %s failed! (Possible download problem?)" % egg_name ) sys.exit(2) return data def use_setuptools( version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, download_delay=15 ): """Automatically find/download setuptools and make it available on sys.path `version` should be a valid setuptools version number that is available as an egg for download under the `download_base` URL (which should end with a '/'). `to_dir` is the directory where setuptools will be downloaded, if it is not already available. If `download_delay` is specified, it should be the number of seconds that will be paused before initiating a download, should one be required. If an older version of setuptools is installed, this routine will print a message to ``sys.stderr`` and raise SystemExit in an attempt to abort the calling script. """ was_imported = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules def do_download(): egg = download_setuptools(version, download_base, to_dir, download_delay) sys.path.insert(0, egg) import setuptools; setuptools.bootstrap_install_from = egg try: import pkg_resources except ImportError: return do_download() try: pkg_resources.require("setuptools>="+version); return except pkg_resources.VersionConflict, e: if was_imported: print >>sys.stderr, ( "The required version of setuptools (>=%s) is not available, and\n" "can't be installed while this script is running. Please install\n" " a more recent version first, using 'easy_install -U setuptools'." "\n\n(Currently using %r)" ) % (version, e.args[0]) sys.exit(2) else: del pkg_resources, sys.modules['pkg_resources'] # reload ok return do_download() except pkg_resources.DistributionNotFound: return do_download() def download_setuptools( version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, delay = 15 ): """Download setuptools from a specified location and return its filename `version` should be a valid setuptools version number that is available as an egg for download under the `download_base` URL (which should end with a '/'). `to_dir` is the directory where the egg will be downloaded. `delay` is the number of seconds to pause before an actual download attempt. """ import urllib2, shutil egg_name = "setuptools-%s-py%s.egg" % (version,sys.version[:3]) url = download_base + egg_name saveto = os.path.join(to_dir, egg_name) src = dst = None if not os.path.exists(saveto): # Avoid repeated downloads try: from distutils import log if delay: log.warn(""" --------------------------------------------------------------------------- This script requires setuptools version %s to run (even to display help). I will attempt to download it for you (from %s), but you may need to enable firewall access for this script first. I will start the download in %d seconds. (Note: if this machine does not have network access, please obtain the file %s and place it in this directory before rerunning this script.) ---------------------------------------------------------------------------""", version, download_base, delay, url ); from time import sleep; sleep(delay) log.warn("Downloading %s", url) src = urllib2.urlopen(url) # Read/write all in one block, so we don't create a corrupt file # if the download is interrupted. data = _validate_md5(egg_name, src.read()) dst = open(saveto,"wb"); dst.write(data) finally: if src: src.close() if dst: dst.close() return os.path.realpath(saveto) def main(argv, version=DEFAULT_VERSION): """Install or upgrade setuptools and EasyInstall""" try: import setuptools except ImportError: egg = None try: egg = download_setuptools(version, delay=0) sys.path.insert(0,egg) from setuptools.command.easy_install import main return main(list(argv)+[egg]) # we're done here finally: if egg and os.path.exists(egg): os.unlink(egg) else: if setuptools.__version__ == '0.0.1': print >>sys.stderr, ( "You have an obsolete version of setuptools installed. Please\n" "remove it from your system entirely before rerunning this script." ) sys.exit(2) req = "setuptools>="+version import pkg_resources try: pkg_resources.require(req) except pkg_resources.VersionConflict: try: from setuptools.command.easy_install import main except ImportError: from easy_install import main main(list(argv)+[download_setuptools(delay=0)]) sys.exit(0) # try to force an exit else: if argv: from setuptools.command.easy_install import main main(argv) else: print "Setuptools version",version,"or greater has been installed." print '(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)' def update_md5(filenames): """Update our built-in md5 registry""" import re for name in filenames: base = os.path.basename(name) f = open(name,'rb') md5_data[base] = md5(f.read()).hexdigest() f.close() data = [" %r: %r,\n" % it for it in md5_data.items()] data.sort() repl = "".join(data) import inspect srcfile = inspect.getsourcefile(sys.modules[__name__]) f = open(srcfile, 'rb'); src = f.read(); f.close() match = re.search("\nmd5_data = {\n([^}]+)}", src) if not match: print >>sys.stderr, "Internal error!" sys.exit(2) src = src[:match.start(1)] + repl + src[match.end(1):] f = open(srcfile,'w') f.write(src) f.close() if __name__=='__main__': if len(sys.argv)>2 and sys.argv[1]=='--md5update': update_md5(sys.argv[2:]) else: main(sys.argv[1:])python-bloomfilter-2.0/pybloom/000077500000000000000000000000001206710143700166725ustar00rootroot00000000000000python-bloomfilter-2.0/pybloom/__init__.py000066400000000000000000000001361206710143700210030ustar00rootroot00000000000000"""pybloom """ from pybloom import BloomFilter, ScalableBloomFilter, __version__, __author__ python-bloomfilter-2.0/pybloom/benchmarks.py000077500000000000000000000034431206710143700213700ustar00rootroot00000000000000#!/usr/bin/env python # """Test performance of BloomFilter at a set capacity and error rate.""" import sys from pybloom import BloomFilter import bitarray, math, time def main(capacity=100000, request_error_rate=0.1): f = BloomFilter(capacity=capacity, error_rate=request_error_rate) assert (capacity == f.capacity) start = time.time() for i in xrange(0, f.capacity): f.add(i, skip_check=True) end = time.time() print "{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format( end - start, f.capacity / (end - start)) oneBits = f.bitarray.count(True) zeroBits = f.bitarray.count(False) #print "Number of 1 bits:", oneBits #print "Number of 0 bits:", zeroBits print "Number of Filter Bits:", f.num_bits print "Number of slices:", f.num_slices print "Bits per slice:", f.bits_per_slice print "------" print "Fraction of 1 bits at capacity: {:5.3f}".format( oneBits / float(f.num_bits)) # Look for false positives and measure the actual fp rate trials = f.capacity fp = 0 start = time.time() for i in xrange(f.capacity, f.capacity + trials + 1): if i in f: fp += 1 end = time.time() print ("{:5.3f} seconds to check false positives, " "{:10.2f} checks/second".format(end - start, trials / (end - start))) print "Requested FP rate: {:2.4f}".format(request_error_rate) print "Experimental false positive rate: {:2.4f}".format(fp / float(trials)) # Compute theoretical fp max (Goel/Gupta) k = f.num_slices m = f.num_bits n = f.capacity fp_theory = math.pow((1 - math.exp(-k * (n + 0.5) / (m - 1))), k) print "Projected FP rate (Goel/Gupta): {:2.6f}".format(fp_theory) if __name__ == '__main__' : status = main() sys.exit(status) python-bloomfilter-2.0/pybloom/pybloom.py000066400000000000000000000334411206710143700207320ustar00rootroot00000000000000# -*- encoding: utf-8 -*- """This module implements a bloom filter probabilistic data structure and an a Scalable Bloom Filter that grows in size as your add more items to it without increasing the false positive error_rate. Requires the bitarray library: http://pypi.python.org/pypi/bitarray/ >>> from pybloom import BloomFilter >>> f = BloomFilter(capacity=10000, error_rate=0.001) >>> for i in xrange(0, f.capacity): ... _ = f.add(i) ... >>> 0 in f True >>> f.capacity in f False >>> len(f) <= f.capacity True >>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18 True >>> from pybloom import ScalableBloomFilter >>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH) >>> count = 10000 >>> for i in xrange(0, count): ... _ = sbf.add(i) ... >>> sbf.capacity > count True >>> len(sbf) <= count True >>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18 True """ import math import hashlib from struct import unpack, pack, calcsize try: import bitarray except ImportError: raise ImportError('pybloom requires bitarray >= 0.3.4') __version__ = '2.0' __author__ = "Jay Baird , Bob Ippolito ,\ Marius Eriksen ,\ Alex Brasetvik " def make_hashfuncs(num_slices, num_bits): if num_bits >= (1 << 31): fmt_code, chunk_size = 'Q', 8 elif num_bits >= (1 << 15): fmt_code, chunk_size = 'I', 4 else: fmt_code, chunk_size = 'H', 2 total_hash_bits = 8 * num_slices * chunk_size if total_hash_bits > 384: hashfn = hashlib.sha512 elif total_hash_bits > 256: hashfn = hashlib.sha384 elif total_hash_bits > 160: hashfn = hashlib.sha256 elif total_hash_bits > 128: hashfn = hashlib.sha1 else: hashfn = hashlib.md5 fmt = fmt_code * (hashfn().digest_size // chunk_size) num_salts, extra = divmod(num_slices, len(fmt)) if extra: num_salts += 1 salts = [hashfn(hashfn(pack('I', i)).digest()) for i in xrange(num_salts)] def _make_hashfuncs(key): if isinstance(key, unicode): key = key.encode('utf-8') else: key = str(key) rval = [] for salt in salts: h = salt.copy() h.update(key) rval.extend(uint % num_bits for uint in unpack(fmt, h.digest())) del rval[num_slices:] return rval return _make_hashfuncs class BloomFilter(object): FILE_FMT = '>> b = BloomFilter(capacity=100000, error_rate=0.001) >>> b.add("test") False >>> "test" in b True """ if not (0 < error_rate < 1): raise ValueError("Error_Rate must be between 0 and 1.") if not capacity > 0: raise ValueError("Capacity must be > 0") # given M = num_bits, k = num_slices, P = error_rate, n = capacity # k = log2(1/P) # solving for m = bits_per_slice # n ~= M * ((ln(2) ** 2) / abs(ln(P))) # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P))) # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2)) num_slices = int(math.ceil(math.log(1.0 / error_rate, 2))) bits_per_slice = int(math.ceil( (capacity * abs(math.log(error_rate))) / (num_slices * (math.log(2) ** 2)))) self._setup(error_rate, num_slices, bits_per_slice, capacity, 0) self.bitarray = bitarray.bitarray(self.num_bits, endian='little') self.bitarray.setall(False) def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count): self.error_rate = error_rate self.num_slices = num_slices self.bits_per_slice = bits_per_slice self.capacity = capacity self.num_bits = num_slices * bits_per_slice self.count = count self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice) def __contains__(self, key): """Tests a key's membership in this bloom filter. >>> b = BloomFilter(capacity=100) >>> b.add("hello") False >>> "hello" in b True """ bits_per_slice = self.bits_per_slice bitarray = self.bitarray if not isinstance(key, list): hashes = self.make_hashes(key) else: hashes = key offset = 0 for k in hashes: if not bitarray[offset + k]: return False offset += bits_per_slice return True def __len__(self): """Return the number of keys stored by this bloom filter.""" return self.count def add(self, key, skip_check=False): """ Adds a key to this bloom filter. If the key already exists in this filter it will return True. Otherwise False. >>> b = BloomFilter(capacity=100) >>> b.add("hello") False >>> b.add("hello") True """ bitarray = self.bitarray bits_per_slice = self.bits_per_slice hashes = self.make_hashes(key) if not skip_check and hashes in self: return True if self.count > self.capacity: raise IndexError("BloomFilter is at capacity") offset = 0 for k in hashes: self.bitarray[offset + k] = True offset += bits_per_slice self.count += 1 return False def copy(self): """Return a copy of this bloom filter. """ new_filter = BloomFilter(self.capacity, self.error_rate) new_filter.bitarray = self.bitarray.copy() return new_filter def union(self, other): """ Calculates the union of the two underlying bitarrays and returns a new bloom filter object.""" if self.capacity != other.capacity or \ self.error_rate != other.error_rate: raise ValueError("Unioning filters requires both filters to have \ both the same capacity and error rate") new_bloom = self.copy() new_bloom.bitarray = new_bloom.bitarray | other.bitarray return new_bloom def __or__(self, other): return self.union(other) def intersection(self, other): """ Calculates the union of the two underlying bitarrays and returns a new bloom filter object.""" if self.capacity != other.capacity or \ self.error_rate != other.error_rate: raise ValueError("Intersecting filters requires both filters to \ have equal capacity and error rate") new_bloom = self.copy() new_bloom.bitarray = new_bloom.bitarray & other.bitarray return new_bloom def __and__(self, other): return self.intersection(other) def tofile(self, f): """Write the bloom filter to file object `f'. Underlying bits are written as machine values. This is much more space efficient than pickling the object.""" f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices, self.bits_per_slice, self.capacity, self.count)) self.bitarray.tofile(f) @classmethod def fromfile(cls, f, n=-1): """Read a bloom filter from file-object `f' serialized with ``BloomFilter.tofile''. If `n' > 0 read only so many bytes.""" headerlen = calcsize(cls.FILE_FMT) if 0 < n < headerlen: raise ValueError, 'n too small!' filter = cls(1) # Bogus instantiation, we will `_setup'. filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen))) filter.bitarray = bitarray.bitarray(endian='little') if n > 0: filter.bitarray.fromfile(f, n - headerlen) else: filter.bitarray.fromfile(f) if filter.num_bits != filter.bitarray.length() and \ (filter.num_bits + (8 - filter.num_bits % 8) != filter.bitarray.length()): raise ValueError, 'Bit length mismatch!' return filter def __getstate__(self): d = self.__dict__.copy() del d['make_hashes'] return d def __setstate__(self, d): self.__dict__.update(d) self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice) class ScalableBloomFilter(object): SMALL_SET_GROWTH = 2 # slower, but takes up less memory LARGE_SET_GROWTH = 4 # faster, but takes up more memory faster FILE_FMT = '>> b = ScalableBloomFilter(initial_capacity=512, error_rate=0.001, \ mode=ScalableBloomFilter.SMALL_SET_GROWTH) >>> b.add("test") False >>> "test" in b True >>> unicode_string = u'¡' >>> b.add(unicode_string) False >>> unicode_string in b True """ if not error_rate or error_rate < 0: raise ValueError("Error_Rate must be a decimal less than 0.") self._setup(mode, 0.9, initial_capacity, error_rate) self.filters = [] def _setup(self, mode, ratio, initial_capacity, error_rate): self.scale = mode self.ratio = ratio self.initial_capacity = initial_capacity self.error_rate = error_rate def __contains__(self, key): """Tests a key's membership in this bloom filter. >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \ mode=ScalableBloomFilter.SMALL_SET_GROWTH) >>> b.add("hello") False >>> "hello" in b True """ for f in reversed(self.filters): if key in f: return True return False def add(self, key): """Adds a key to this bloom filter. If the key already exists in this filter it will return True. Otherwise False. >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \ mode=ScalableBloomFilter.SMALL_SET_GROWTH) >>> b.add("hello") False >>> b.add("hello") True """ if key in self: return True if not self.filters: filter = BloomFilter( capacity=self.initial_capacity, error_rate=self.error_rate * (1.0 - self.ratio)) self.filters.append(filter) else: filter = self.filters[-1] if filter.count >= filter.capacity: filter = BloomFilter( capacity=filter.capacity * self.scale, error_rate=filter.error_rate * self.ratio) self.filters.append(filter) filter.add(key, skip_check=True) return False @property def capacity(self): """Returns the total capacity for all filters in this SBF""" return sum([f.capacity for f in self.filters]) @property def count(self): return len(self) def tofile(self, f): """Serialize this ScalableBloomFilter into the file-object `f'.""" f.write(pack(self.FILE_FMT, self.scale, self.ratio, self.initial_capacity, self.error_rate)) # Write #-of-filters f.write(pack(' 0: # Then each filter directly, with a header describing # their lengths. headerpos = f.tell() headerfmt = '<' + 'Q'*(len(self.filters)) f.write('.' * calcsize(headerfmt)) filter_sizes = [] for filter in self.filters: begin = f.tell() filter.tofile(f) filter_sizes.append(f.tell() - begin) f.seek(headerpos) f.write(pack(headerfmt, *filter_sizes)) @classmethod def fromfile(cls, f): """Deserialize the ScalableBloomFilter in file object `f'.""" filter = cls() filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT)))) nfilters, = unpack(' 0: header_fmt = '<' + 'Q'*nfilters bytes = f.read(calcsize(header_fmt)) filter_lengths = unpack(header_fmt, bytes) for fl in filter_lengths: filter.filters.append(BloomFilter.fromfile(f, fl)) else: filter.filters = [] return filter def __len__(self): """Returns the total number of elements stored in this SBF""" return sum([f.count for f in self.filters]) if __name__ == "__main__": import doctest doctest.testmod() python-bloomfilter-2.0/pybloom/tests.py000066400000000000000000000063551206710143700204170ustar00rootroot00000000000000import os import doctest import unittest import random import tempfile from pybloom import BloomFilter, ScalableBloomFilter from unittest import TestSuite def additional_tests(): proj_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) readme_fn = os.path.join(proj_dir, 'README.txt') suite = TestSuite([doctest.DocTestSuite('pybloom.pybloom')]) if os.path.exists(readme_fn): suite.addTest(doctest.DocFileSuite(readme_fn, module_relative=False)) return suite class TestUnionIntersection(unittest.TestCase): def test_union(self): bloom_one = BloomFilter(100, 0.001) bloom_two = BloomFilter(100, 0.001) chars = [chr(i) for i in range(97, 123)] for char in chars[len(chars)/2:]: bloom_one.add(char) for char in chars[:len(chars)/2]: bloom_two.add(char) new_bloom = bloom_one.union(bloom_two) for char in chars: self.assert_(char in new_bloom) def test_intersection(self): bloom_one = BloomFilter(100, 0.001) bloom_two = BloomFilter(100, 0.001) chars = [chr(i) for i in range(97, 123)] for char in chars: bloom_one.add(char) for char in chars[:len(chars)/2]: bloom_two.add(char) new_bloom = bloom_one.intersection(bloom_two) for char in chars[:len(chars)/2]: self.assert_(char in new_bloom) for char in chars[len(chars)/2:]: self.assert_(char not in new_bloom) def test_intersection_capacity_fail(self): bloom_one = BloomFilter(1000, 0.001) bloom_two = BloomFilter(100, 0.001) def _run(): new_bloom = bloom_one.intersection(bloom_two) self.assertRaises(ValueError, _run) def test_union_capacity_fail(self): bloom_one = BloomFilter(1000, 0.001) bloom_two = BloomFilter(100, 0.001) def _run(): new_bloom = bloom_one.union(bloom_two) self.assertRaises(ValueError, _run) def test_intersection_k_fail(self): bloom_one = BloomFilter(100, 0.001) bloom_two = BloomFilter(100, 0.01) def _run(): new_bloom = bloom_one.intersection(bloom_two) self.assertRaises(ValueError, _run) def test_union_k_fail(self): bloom_one = BloomFilter(100, 0.01) bloom_two = BloomFilter(100, 0.001) def _run(): new_bloom = bloom_one.union(bloom_two) self.assertRaises(ValueError, _run) class Serialization(unittest.TestCase): SIZE = 12345 EXPECTED = set([random.randint(0, 10000100) for _ in xrange(SIZE)]) def test_serialization(self): for klass, args in [(BloomFilter, (self.SIZE,)), (ScalableBloomFilter, ())]: filter = klass(*args) for item in self.EXPECTED: filter.add(item) # It seems bitarray is finicky about the object being an # actual file, so we can't just use StringIO. Grr. f = tempfile.TemporaryFile() filter.tofile(f) del filter f.seek(0) filter = klass.fromfile(f) for item in self.EXPECTED: self.assert_(item in filter) if __name__ == '__main__': unittest.main() python-bloomfilter-2.0/requirements.txt000066400000000000000000000000171206710143700204730ustar00rootroot00000000000000bitarray>=0.3.4python-bloomfilter-2.0/setup.py000066400000000000000000000025031206710143700167230ustar00rootroot00000000000000#!/usr/bin/env python from ez_setup import use_setuptools use_setuptools() import os from setuptools import setup, find_packages, Extension VERSION = '2.0.0' DESCRIPTION = "PyBloom: A Probabilistic data structure" LONG_DESCRIPTION = """ pybloom is a Python implementation of the bloom filter probabilistic data structure. The module also provides a Scalable Bloom Filter that allows a bloom filter to grow without knowing the original set size. """ CLASSIFIERS = filter(None, map(str.strip, """ Intended Audience :: Developers License :: OSI Approved :: MIT License Programming Language :: Python Operating System :: OS Independent Topic :: Utilities Topic :: Database :: Database Engines/Servers Topic :: Software Development :: Libraries :: Python Modules """.splitlines())) setup( name="pybloom", version=VERSION, description=DESCRIPTION, long_description=LONG_DESCRIPTION, classifiers=CLASSIFIERS, keywords=('data structures', 'bloom filter', 'bloom', 'filter', 'probabilistic', 'set'), author="Jay Baird", author_email="jay.baird@me.com", url="http://github.com/jaybaird/python-bloomfilter/", license="MIT License", packages=find_packages(exclude=['ez_setup']), platforms=['any'], test_suite="pybloom.tests", zip_safe=True, install_requires=['bitarray>=0.3.4'] )