pybloom-1.1/000755 000765 000024 00000000000 11537217027 012767 5ustar00jaystaff000000 000000 pybloom-1.1/ez_setup.py000644 000765 000024 00000022755 11341015704 015201 0ustar00jaystaff000000 000000 #!python """Bootstrap setuptools installation If you want to use setuptools in your package's setup.py, just include this file in the same directory with it, and add this to the top of your setup.py:: from ez_setup import use_setuptools use_setuptools() If you want to require a specific version of setuptools, set a download mirror, or use an alternate download directory, you can do so by supplying the appropriate options to ``use_setuptools()``. This file can also be run as a script to install or upgrade setuptools. """ import sys DEFAULT_VERSION = "0.6c9" DEFAULT_URL = "http://pypi.python.org/packages/%s/s/setuptools/" % sys.version[:3] md5_data = { 'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca', 'setuptools-0.6b1-py2.4.egg': 'b79a8a403e4502fbb85ee3f1941735cb', 'setuptools-0.6b2-py2.3.egg': '5657759d8a6d8fc44070a9d07272d99b', 'setuptools-0.6b2-py2.4.egg': '4996a8d169d2be661fa32a6e52e4f82a', 'setuptools-0.6b3-py2.3.egg': 'bb31c0fc7399a63579975cad9f5a0618', 'setuptools-0.6b3-py2.4.egg': '38a8c6b3d6ecd22247f179f7da669fac', 'setuptools-0.6b4-py2.3.egg': '62045a24ed4e1ebc77fe039aa4e6f7e5', 'setuptools-0.6b4-py2.4.egg': '4cb2a185d228dacffb2d17f103b3b1c4', 'setuptools-0.6c1-py2.3.egg': 'b3f2b5539d65cb7f74ad79127f1a908c', 'setuptools-0.6c1-py2.4.egg': 'b45adeda0667d2d2ffe14009364f2a4b', 'setuptools-0.6c2-py2.3.egg': 'f0064bf6aa2b7d0f3ba0b43f20817c27', 'setuptools-0.6c2-py2.4.egg': '616192eec35f47e8ea16cd6a122b7277', 'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa', 'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e', 'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e', 'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f', 'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2', 'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc', 'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167', 'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64', 'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d', 'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20', 'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab', 'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53', 'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2', 'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e', 'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372', 'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902', 'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de', 'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b', 'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03', 'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a', 'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6', 'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a', } import sys, os try: from hashlib import md5 except ImportError: from md5 import md5 def _validate_md5(egg_name, data): if egg_name in md5_data: digest = md5(data).hexdigest() if digest != md5_data[egg_name]: print >>sys.stderr, ( "md5 validation of %s failed! 
(Possible download problem?)" % egg_name ) sys.exit(2) return data def use_setuptools( version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, download_delay=15 ): """Automatically find/download setuptools and make it available on sys.path `version` should be a valid setuptools version number that is available as an egg for download under the `download_base` URL (which should end with a '/'). `to_dir` is the directory where setuptools will be downloaded, if it is not already available. If `download_delay` is specified, it should be the number of seconds that will be paused before initiating a download, should one be required. If an older version of setuptools is installed, this routine will print a message to ``sys.stderr`` and raise SystemExit in an attempt to abort the calling script. """ was_imported = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules def do_download(): egg = download_setuptools(version, download_base, to_dir, download_delay) sys.path.insert(0, egg) import setuptools; setuptools.bootstrap_install_from = egg try: import pkg_resources except ImportError: return do_download() try: pkg_resources.require("setuptools>="+version); return except pkg_resources.VersionConflict, e: if was_imported: print >>sys.stderr, ( "The required version of setuptools (>=%s) is not available, and\n" "can't be installed while this script is running. Please install\n" " a more recent version first, using 'easy_install -U setuptools'." "\n\n(Currently using %r)" ) % (version, e.args[0]) sys.exit(2) else: del pkg_resources, sys.modules['pkg_resources'] # reload ok return do_download() except pkg_resources.DistributionNotFound: return do_download() def download_setuptools( version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, delay = 15 ): """Download setuptools from a specified location and return its filename `version` should be a valid setuptools version number that is available as an egg for download under the `download_base` URL (which should end with a '/'). `to_dir` is the directory where the egg will be downloaded. `delay` is the number of seconds to pause before an actual download attempt. """ import urllib2, shutil egg_name = "setuptools-%s-py%s.egg" % (version,sys.version[:3]) url = download_base + egg_name saveto = os.path.join(to_dir, egg_name) src = dst = None if not os.path.exists(saveto): # Avoid repeated downloads try: from distutils import log if delay: log.warn(""" --------------------------------------------------------------------------- This script requires setuptools version %s to run (even to display help). I will attempt to download it for you (from %s), but you may need to enable firewall access for this script first. I will start the download in %d seconds. (Note: if this machine does not have network access, please obtain the file %s and place it in this directory before rerunning this script.) ---------------------------------------------------------------------------""", version, download_base, delay, url ); from time import sleep; sleep(delay) log.warn("Downloading %s", url) src = urllib2.urlopen(url) # Read/write all in one block, so we don't create a corrupt file # if the download is interrupted. 
data = _validate_md5(egg_name, src.read()) dst = open(saveto,"wb"); dst.write(data) finally: if src: src.close() if dst: dst.close() return os.path.realpath(saveto) def main(argv, version=DEFAULT_VERSION): """Install or upgrade setuptools and EasyInstall""" try: import setuptools except ImportError: egg = None try: egg = download_setuptools(version, delay=0) sys.path.insert(0,egg) from setuptools.command.easy_install import main return main(list(argv)+[egg]) # we're done here finally: if egg and os.path.exists(egg): os.unlink(egg) else: if setuptools.__version__ == '0.0.1': print >>sys.stderr, ( "You have an obsolete version of setuptools installed. Please\n" "remove it from your system entirely before rerunning this script." ) sys.exit(2) req = "setuptools>="+version import pkg_resources try: pkg_resources.require(req) except pkg_resources.VersionConflict: try: from setuptools.command.easy_install import main except ImportError: from easy_install import main main(list(argv)+[download_setuptools(delay=0)]) sys.exit(0) # try to force an exit else: if argv: from setuptools.command.easy_install import main main(argv) else: print "Setuptools version",version,"or greater has been installed." print '(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)' def update_md5(filenames): """Update our built-in md5 registry""" import re for name in filenames: base = os.path.basename(name) f = open(name,'rb') md5_data[base] = md5(f.read()).hexdigest() f.close() data = [" %r: %r,\n" % it for it in md5_data.items()] data.sort() repl = "".join(data) import inspect srcfile = inspect.getsourcefile(sys.modules[__name__]) f = open(srcfile, 'rb'); src = f.read(); f.close() match = re.search("\nmd5_data = {\n([^}]+)}", src) if not match: print >>sys.stderr, "Internal error!" sys.exit(2) src = src[:match.start(1)] + repl + src[match.end(1):] f = open(srcfile,'w') f.write(src) f.close() if __name__=='__main__': if len(sys.argv)>2 and sys.argv[1]=='--md5update': update_md5(sys.argv[2:]) else: main(sys.argv[1:])pybloom-1.1/MANIFEST.in000644 000765 000024 00000000024 11341015704 014510 0ustar00jaystaff000000 000000 include ez_setup.py pybloom-1.1/PKG-INFO000644 000765 000024 00000001620 11537217027 014063 0ustar00jaystaff000000 000000 Metadata-Version: 1.0 Name: pybloom Version: 1.1 Summary: PyBloom: A Probabilistic data structure Home-page: http://github.com/jaybaird/python-bloomfilter/ Author: Jay Baird Author-email: jay.baird@me.com License: MIT License Description: pybloom is a Python implementation of the bloom filter probabilistic data structure. The module also provides a Scalable Bloom Filter that allows a bloom filter to grow without knowing the original set size.
Keywords: data structures,bloom filter,bloom,filter,probabilistic,set Platform: any Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: MIT License Classifier: Programming Language :: Python Classifier: Operating System :: OS Independent Classifier: Topic :: Utilities Classifier: Topic :: Database :: Database Engines/Servers Classifier: Topic :: Software Development :: Libraries :: Python Modules pybloom-1.1/pybloom/000755 000765 000024 00000000000 11537217027 014450 5ustar00jaystaff000000 000000 pybloom-1.1/pybloom.egg-info/000755 000765 000024 00000000000 11537217027 016142 5ustar00jaystaff000000 000000 pybloom-1.1/README.txt000644 000765 000024 00000003103 11341015704 014453 0ustar00jaystaff000000 000000 pybloom is a module that includes a Bloom Filter data structure along with an implementation of Scalable Bloom Filters as discussed in: P. Almeida, C. Baquero, N. Preguiça, D. Hutchison, Scalable Bloom Filters, (GLOBECOM 2007), IEEE, 2007. Bloom filters are great if you know in advance how many bits you need to set aside to store your entire set. Scalable Bloom Filters let the underlying bit count grow as a function of the false positive probability and the set size. A filter is "full" when it reaches its capacity: M * ((ln 2)^2 / abs(ln p)), where M is the number of bits and p is the false positive probability. When capacity is reached, a new filter is created that is exponentially larger than the last, with a tighter false positive probability and a larger number of hash functions. >>> from pybloom import BloomFilter >>> f = BloomFilter(capacity=1000, error_rate=0.001) >>> [f.add(x) for x in range(10)] [False, False, False, False, False, False, False, False, False, False] >>> all([(x in f) for x in range(10)]) True >>> 10 in f False >>> 5 in f True >>> f = BloomFilter(capacity=1000, error_rate=0.001) >>> for i in xrange(0, f.capacity): ... _ = f.add(i) >>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate True >>> from pybloom import ScalableBloomFilter >>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH) >>> count = 10000 >>> for i in xrange(0, count): ... _ = sbf.add(i) ... >>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate True # len(sbf) may not equal the entire input length. 0.006% error is well # below the default 0.1% error threshold pybloom-1.1/setup.cfg000644 000765 000024 00000000073 11537217027 014610 0ustar00jaystaff000000 000000 [egg_info] tag_build = tag_date = 0 tag_svn_revision = 0 pybloom-1.1/setup.py000644 000765 000024 00000002523 11537216401 014476 0ustar00jaystaff000000 000000 #!/usr/bin/env python from ez_setup import use_setuptools use_setuptools() import os from setuptools import setup, find_packages, Extension VERSION = '1.1' DESCRIPTION = "PyBloom: A Probabilistic data structure" LONG_DESCRIPTION = """ pybloom is a Python implementation of the bloom filter probabilistic data structure. The module also provides a Scalable Bloom Filter that allows a bloom filter to grow without knowing the original set size.
""" CLASSIFIERS = filter(None, map(str.strip, """ Intended Audience :: Developers License :: OSI Approved :: MIT License Programming Language :: Python Operating System :: OS Independent Topic :: Utilities Topic :: Database :: Database Engines/Servers Topic :: Software Development :: Libraries :: Python Modules """.splitlines())) setup( name="pybloom", version=VERSION, description=DESCRIPTION, long_description=LONG_DESCRIPTION, classifiers=CLASSIFIERS, keywords=('data structures', 'bloom filter', 'bloom', 'filter', 'probabilistic', 'set'), author="Jay Baird", author_email="jay.baird@me.com", url="http://github.com/jaybaird/python-bloomfilter/", license="MIT License", packages=find_packages(exclude=['ez_setup']), platforms=['any'], test_suite="pybloom.tests", zip_safe=True, install_requires=['bitarray>=0.3.4'] ) pybloom-1.1/pybloom.egg-info/dependency_links.txt000644 000765 000024 00000000001 11537217027 022210 0ustar00jaystaff000000 000000 pybloom-1.1/pybloom.egg-info/PKG-INFO000644 000765 000024 00000001620 11537217027 017236 0ustar00jaystaff000000 000000 Metadata-Version: 1.0 Name: pybloom Version: 1.1 Summary: PyBloom: A Probabilistic data structure Home-page: http://github.com/jaybaird/python-bloomfilter/ Author: Jay Baird Author-email: jay.baird@me.com License: MIT License Description: pybloom is a Python implementation of the bloom filter probabilistic data structure. The module also provides a Scalable Bloom Filter that allows a bloom filter to grow without knowing the original set size. Keywords: data structures,bloom filter,bloom,filter,probabilistic,set Platform: any Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: MIT License Classifier: Programming Language :: Python Classifier: Operating System :: OS Independent Classifier: Topic :: Utilities Classifier: Topic :: Database :: Database Engines/Servers Classifier: Topic :: Software Development :: Libraries :: Python Modules pybloom-1.1/pybloom.egg-info/requires.txt000644 000765 000024 00000000017 11537217027 020540 0ustar00jaystaff000000 000000 bitarray>=0.3.4pybloom-1.1/pybloom.egg-info/SOURCES.txt000644 000765 000024 00000000427 11537217027 020031 0ustar00jaystaff000000 000000 MANIFEST.in README.txt ez_setup.py setup.py pybloom/__init__.py pybloom/pybloom.py pybloom/tests.py pybloom.egg-info/PKG-INFO pybloom.egg-info/SOURCES.txt pybloom.egg-info/dependency_links.txt pybloom.egg-info/requires.txt pybloom.egg-info/top_level.txt pybloom.egg-info/zip-safepybloom-1.1/pybloom.egg-info/top_level.txt000644 000765 000024 00000000010 11537217027 020663 0ustar00jaystaff000000 000000 pybloom pybloom-1.1/pybloom.egg-info/zip-safe000644 000765 000024 00000000001 11341016254 017562 0ustar00jaystaff000000 000000 pybloom-1.1/pybloom/__init__.py000644 000765 000024 00000000143 11341015704 016546 0ustar00jaystaff000000 000000 """pybloom """ from pybloom import BloomFilter, ScalableBloomFilter, __version__, __author__ pybloom-1.1/pybloom/pybloom.py000644 000765 000024 00000033364 11537216223 016511 0ustar00jaystaff000000 000000 # -*- encoding: utf-8 -*- """This module implements a bloom filter probabilistic data structure and a Scalable Bloom Filter that grows in size as you add more items to it without increasing the false positive error_rate.
Requires the bitarray library: http://pypi.python.org/pypi/bitarray/ >>> from pybloom import BloomFilter >>> f = BloomFilter(capacity=10000, error_rate=0.001) >>> for i in xrange(0, f.capacity): ... _ = f.add(i) ... >>> 0 in f True >>> f.capacity in f False >>> len(f) <= f.capacity True >>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate True >>> from pybloom import ScalableBloomFilter >>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH) >>> count = 10000 >>> for i in xrange(0, count): ... _ = sbf.add(i) ... >>> sbf.capacity > count True >>> len(sbf) <= count True >>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate True """ import math import hashlib from struct import unpack, pack, calcsize try: import bitarray except ImportError: raise ImportError('pybloom requires bitarray >= 0.3.4') __version__ = '1.1' __author__ = "Jay Baird , Bob Ippolito ,\ Marius Eriksen ,\ Alex Brasetvik " def make_hashfuncs(num_slices, num_bits): if num_bits >= (1 << 31): fmt_code, chunk_size = 'Q', 8 elif num_bits >= (1 << 15): fmt_code, chunk_size = 'I', 4 else: fmt_code, chunk_size = 'H', 2 total_hash_bits = 8 * num_slices * chunk_size if total_hash_bits > 384: hashfn = hashlib.sha512 elif total_hash_bits > 256: hashfn = hashlib.sha384 elif total_hash_bits > 160: hashfn = hashlib.sha256 elif total_hash_bits > 128: hashfn = hashlib.sha1 else: hashfn = hashlib.md5 fmt = fmt_code * (hashfn().digest_size // chunk_size) num_salts, extra = divmod(num_slices, len(fmt)) if extra: num_salts += 1 salts = [hashfn(hashfn(pack('I', i)).digest()) for i in xrange(num_salts)] def _make_hashfuncs(key): if isinstance(key, unicode): key = key.encode('utf-8') else: key = str(key) rval = [] for salt in salts: h = salt.copy() h.update(key) rval.extend(uint % num_bits for uint in unpack(fmt, h.digest())) del rval[num_slices:] return rval return _make_hashfuncs class BloomFilter(object): FILE_FMT = '<dQQQQ' def __init__(self, capacity, error_rate=0.001): """Implements a space-efficient probabilistic data structure capacity this BloomFilter must be able to store at least *capacity* elements while maintaining no more than *error_rate* chance of false positives error_rate the error_rate of the filter returning false positives. This determines the filter's capacity. Inserting more than capacity elements greatly increases the chance of false positives. >>> b = BloomFilter(capacity=100000, error_rate=0.001) >>> b.add("test") False >>> "test" in b True """ if not (0 < error_rate < 1): raise ValueError("Error_Rate must be between 0 and 1.") if not capacity > 0: raise ValueError("Capacity must be > 0") # given M = num_bits, k = num_slices, p = error_rate, n = capacity # solving for m = bits_per_slice # n ~= M * ((ln(2) ** 2) / abs(ln(P))) # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P))) # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2)) num_slices = int(math.ceil(math.log(1 / error_rate, 2))) # the error_rate constraint assumes a fill rate of 1/2 # so we double the capacity to simplify the API bits_per_slice = int(math.ceil( (2 * capacity * abs(math.log(error_rate))) / (num_slices * (math.log(2) ** 2)))) self._setup(error_rate, num_slices, bits_per_slice, capacity, 0) self.bitarray = bitarray.bitarray(self.num_bits, endian='little') self.bitarray.setall(False) def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count): self.error_rate = error_rate self.num_slices = num_slices self.bits_per_slice = bits_per_slice self.capacity = capacity self.num_bits = num_slices * bits_per_slice self.count = count self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice) def __contains__(self, key): """Tests a key's membership in this bloom filter.
>>> b = BloomFilter(capacity=100) >>> b.add("hello") False >>> "hello" in b True """ bits_per_slice = self.bits_per_slice bitarray = self.bitarray if not isinstance(key, list): hashes = self.make_hashes(key) else: hashes = key offset = 0 for k in hashes: if not bitarray[offset + k]: return False offset += bits_per_slice return True def __len__(self): """Return the number of keys stored by this bloom filter.""" return self.count def add(self, key, skip_check=False): """ Adds a key to this bloom filter. If the key already exists in this filter it will return True. Otherwise False. >>> b = BloomFilter(capacity=100) >>> b.add("hello") False >>> b.add("hello") True """ bitarray = self.bitarray bits_per_slice = self.bits_per_slice hashes = self.make_hashes(key) if not skip_check and hashes in self: return True if self.count > self.capacity: raise IndexError("BloomFilter is at capacity") offset = 0 for k in hashes: self.bitarray[offset + k] = True offset += bits_per_slice self.count += 1 return False def copy(self): """Return a copy of this bloom filter. """ new_filter = BloomFilter(self.capacity, self.error_rate) new_filter.bitarray = self.bitarray.copy() return new_filter def union(self, other): """ Calculates the union of the two underlying bitarrays and returns a new bloom filter object.""" if self.capacity != other.capacity or \ self.error_rate != other.error_rate: raise ValueError("Unioning filters requires both filters to have \ the same capacity and error rate") new_bloom = self.copy() new_bloom.bitarray = new_bloom.bitarray | other.bitarray return new_bloom def __or__(self, other): return self.union(other) def intersection(self, other): """ Calculates the intersection of the two underlying bitarrays and returns a new bloom filter object.""" if self.capacity != other.capacity or \ self.error_rate != other.error_rate: raise ValueError("Intersecting filters requires both filters to \ have equal capacity and error rate") new_bloom = self.copy() new_bloom.bitarray = new_bloom.bitarray & other.bitarray return new_bloom def __and__(self, other): return self.intersection(other) def tofile(self, f): """Write the bloom filter to file object `f'. Underlying bits are written as machine values. This is much more space efficient than pickling the object.""" f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices, self.bits_per_slice, self.capacity, self.count)) self.bitarray.tofile(f) @classmethod def fromfile(cls, f, n=-1): """Read a bloom filter from file-object `f' serialized with ``BloomFilter.tofile''. If `n' > 0 read only so many bytes.""" headerlen = calcsize(cls.FILE_FMT) if 0 < n < headerlen: raise ValueError, 'n too small!' filter = cls(1) # Bogus instantiation, we will `_setup'. filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen))) filter.bitarray = bitarray.bitarray(endian='little') if n > 0: filter.bitarray.fromfile(f, n - headerlen) else: filter.bitarray.fromfile(f) if filter.num_bits != filter.bitarray.length() and \ (filter.num_bits + (8 - filter.num_bits % 8) != filter.bitarray.length()): raise ValueError, 'Bit length mismatch!'
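# Note on the length check above: bitarray.tofile() pads the last byte with # zero bits, so a filter whose num_bits is not a multiple of 8 is read back # with its bit length rounded up to the next byte boundary. The test # therefore accepts either the exact bit length or the byte-padded length, # and treats anything else as a corrupt or truncated stream.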
return filter def __getstate__(self): d = self.__dict__.copy() del d['make_hashes'] return d def __setstate__(self, d): self.__dict__.update(d) self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice) class ScalableBloomFilter(object): SMALL_SET_GROWTH = 2 # slower, but takes up less memory LARGE_SET_GROWTH = 4 # faster, but takes up more memory FILE_FMT = '<idQd' def __init__(self, initial_capacity=100, error_rate=0.001, mode=SMALL_SET_GROWTH): """Implements a space-efficient probabilistic data structure that grows as more items are added while maintaining a steady false positive rate initial_capacity the initial capacity of the filter error_rate the error_rate of the filter returning false positives. This determines the filter's capacity. Going over capacity will cause the filter to scale mode can be either ScalableBloomFilter.SMALL_SET_GROWTH or ScalableBloomFilter.LARGE_SET_GROWTH >>> b = ScalableBloomFilter(initial_capacity=512, error_rate=0.001, \ mode=ScalableBloomFilter.SMALL_SET_GROWTH) >>> b.add("test") False >>> "test" in b True >>> unicode_string = u'¡' >>> b.add(unicode_string) False >>> unicode_string in b True """ if not (0 < error_rate < 1): raise ValueError("Error_Rate must be a decimal between 0 and 1.") self._setup(mode, 0.9, initial_capacity, error_rate) self.filters = [] def _setup(self, mode, ratio, initial_capacity, error_rate): self.scale = mode self.ratio = ratio self.initial_capacity = initial_capacity self.error_rate = error_rate def __contains__(self, key): """Tests a key's membership in this bloom filter. >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \ mode=ScalableBloomFilter.SMALL_SET_GROWTH) >>> b.add("hello") False >>> "hello" in b True """ for f in reversed(self.filters): if key in f: return True return False def add(self, key): """Adds a key to this bloom filter. If the key already exists in this filter it will return True. Otherwise False. >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \ mode=ScalableBloomFilter.SMALL_SET_GROWTH) >>> b.add("hello") False >>> b.add("hello") True """ if key in self: return True filter = self.filters[-1] if self.filters else None if filter is None or filter.count >= filter.capacity: num_filters = len(self.filters) filter = BloomFilter( capacity=self.initial_capacity * (self.scale ** num_filters), error_rate=self.error_rate * (self.ratio ** num_filters)) self.filters.append(filter) filter.add(key, skip_check=True) return False @property def capacity(self): """Returns the total capacity for all filters in this SBF""" return sum([f.capacity for f in self.filters]) @property def count(self): return len(self) def tofile(self, f): """Serialize this ScalableBloomFilter into the file-object `f'.""" f.write(pack(self.FILE_FMT, self.scale, self.ratio, self.initial_capacity, self.error_rate)) # Write #-of-filters f.write(pack('<l', len(self.filters))) if len(self.filters) > 0: # Then each filter directly, with a header describing # their lengths. headerpos = f.tell() headerfmt = '<' + 'Q'*(len(self.filters)) f.write('.'
* calcsize(headerfmt)) filter_sizes = [] for filter in self.filters: begin = f.tell() filter.tofile(f) filter_sizes.append(f.tell() - begin) f.seek(headerpos) f.write(pack(headerfmt, *filter_sizes)) @classmethod def fromfile(cls, f): """Deserialize the ScalableBloomFilter in file object `f'.""" filter = cls() filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT)))) nfilters, = unpack('<l', f.read(calcsize('<l'))) if nfilters > 0: header_fmt = '<' + 'Q'*nfilters bytes = f.read(calcsize(header_fmt)) filter_lengths = unpack(header_fmt, bytes) for fl in filter_lengths: filter.filters.append(BloomFilter.fromfile(f, fl)) else: filter.filters = [] return filter def __len__(self): """Returns the total number of elements stored in this SBF""" return sum([f.count for f in self.filters]) if __name__ == "__main__": import doctest doctest.testmod() pybloom-1.1/pybloom/tests.py000644 000765 000024 00000006355 11537216223 016172 0ustar00jaystaff000000 000000 import os import doctest import unittest import random import tempfile from pybloom import BloomFilter, ScalableBloomFilter from unittest import TestSuite def additional_tests(): proj_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) readme_fn = os.path.join(proj_dir, 'README.txt') suite = TestSuite([doctest.DocTestSuite('pybloom.pybloom')]) if os.path.exists(readme_fn): suite.addTest(doctest.DocFileSuite(readme_fn, module_relative=False)) return suite class TestUnionIntersection(unittest.TestCase): def test_union(self): bloom_one = BloomFilter(100, 0.001) bloom_two = BloomFilter(100, 0.001) chars = [chr(i) for i in range(97, 123)] for char in chars[len(chars)/2:]: bloom_one.add(char) for char in chars[:len(chars)/2]: bloom_two.add(char) new_bloom = bloom_one.union(bloom_two) for char in chars: self.assert_(char in new_bloom) def test_intersection(self): bloom_one = BloomFilter(100, 0.001) bloom_two = BloomFilter(100, 0.001) chars = [chr(i) for i in range(97, 123)] for char in chars: bloom_one.add(char) for char in chars[:len(chars)/2]: bloom_two.add(char) new_bloom = bloom_one.intersection(bloom_two) for char in chars[:len(chars)/2]: self.assert_(char in new_bloom) for char in chars[len(chars)/2:]: self.assert_(char not in new_bloom) def test_intersection_capacity_fail(self): bloom_one = BloomFilter(1000, 0.001) bloom_two = BloomFilter(100, 0.001) def _run(): new_bloom = bloom_one.intersection(bloom_two) self.assertRaises(ValueError, _run) def test_union_capacity_fail(self): bloom_one = BloomFilter(1000, 0.001) bloom_two = BloomFilter(100, 0.001) def _run(): new_bloom = bloom_one.union(bloom_two) self.assertRaises(ValueError, _run) def test_intersection_k_fail(self): bloom_one = BloomFilter(100, 0.001) bloom_two = BloomFilter(100, 0.01) def _run(): new_bloom = bloom_one.intersection(bloom_two) self.assertRaises(ValueError, _run) def test_union_k_fail(self): bloom_one = BloomFilter(100, 0.01) bloom_two = BloomFilter(100, 0.001) def _run(): new_bloom = bloom_one.union(bloom_two) self.assertRaises(ValueError, _run) class Serialization(unittest.TestCase): SIZE = 12345 EXPECTED = set([random.randint(0, 10000100) for _ in xrange(SIZE)]) def test_serialization(self): for klass, args in [(BloomFilter, (self.SIZE,)), (ScalableBloomFilter, ())]: filter = klass(*args) for item in self.EXPECTED: filter.add(item) # It seems bitarray is finicky about the object being an # actual file, so we can't just use StringIO. Grr.
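# tempfile.TemporaryFile() gives a real OS-level file object (deleted # automatically on close), which is what bitarray's tofile()/fromfile() need.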
f = tempfile.TemporaryFile() filter.tofile(f) del filter f.seek(0) filter = klass.fromfile(f) for item in self.EXPECTED: self.assert_(item in filter) if __name__ == '__main__': unittest.main()
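The filters also support set-style combination (union/intersection, aliased to | and &) and compact serialization (tofile/fromfile). The following is a minimal sketch of those two features, added here for illustration and not part of the original README; it uses only methods defined in pybloom/pybloom.py. Note that tofile/fromfile need a real file object (as in the serialization test above), and the final lookup is probabilistic, so a false positive is possible, though unlikely at this error rate.

>>> from pybloom import BloomFilter
>>> import tempfile
>>> a = BloomFilter(capacity=100, error_rate=0.001)
>>> b = BloomFilter(capacity=100, error_rate=0.001)
>>> a.add("alpha")
False
>>> b.add("beta")
False
>>> c = a | b                       # same as a.union(b); both filters must share capacity and error_rate
>>> "alpha" in c and "beta" in c
True
>>> f = tempfile.TemporaryFile()
>>> c.tofile(f)
>>> f.seek(0)
>>> d = BloomFilter.fromfile(f)
>>> "alpha" in d and "beta" in d
True
>>> "gamma" in d
False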