bloomfilter-2.0.1.2/0000755000000000000000000000000007346545000012361 5ustar0000000000000000bloomfilter-2.0.1.2/CHANGELOG.md0000644000000000000000000000051407346545000014172 0ustar0000000000000000#### 2.0.1.2 * Fix Data.BloomFilter.Easy on 32 bit to not incorrectly fail with "capacity too large to represent". #### 2.0.1.1 * Fix build with GHC 9.2, thanks to Simon Jakobi. * Add CI with GitHub Actions, thanks to Simon Jakobi. * New maintainer for the package, under the https://github.com/haskell-pkg-janitors umbrella. bloomfilter-2.0.1.2/Data/0000755000000000000000000000000007346545000013232 5ustar0000000000000000bloomfilter-2.0.1.2/Data/BloomFilter.hs0000644000000000000000000002613207346545000016010 0ustar0000000000000000{-# LANGUAGE BangPatterns, Rank2Types, ScopedTypeVariables, TypeOperators #-} -- | -- Module: Data.BloomFilter -- Copyright: Bryan O'Sullivan -- License: BSD3 -- -- Maintainer: Bryan O'Sullivan -- Stability: unstable -- Portability: portable -- -- A fast, space efficient Bloom filter implementation. A Bloom -- filter is a set-like data structure that provides a probabilistic -- membership test. -- -- * Queries do not give false negatives. When an element is added to -- a filter, a subsequent membership test will definitely return -- 'True'. -- -- * False positives /are/ possible. If an element has not been added -- to a filter, a membership test /may/ nevertheless indicate that -- the element is present. -- -- This module provides low-level control. For an easier to use -- interface, see the "Data.BloomFilter.Easy" module. module Data.BloomFilter ( -- * Overview -- $overview -- ** Ease of use -- $ease -- ** Performance -- $performance -- * Types Hash , Bloom , MBloom -- * Immutable Bloom filters -- ** Conversion , freeze , thaw , unsafeFreeze -- ** Creation , unfold , fromList , empty , singleton -- ** Accessors , length , elem , notElem -- ** Modification , insert , insertList -- * The underlying representation -- | If you serialize the raw bit arrays below to disk, do not -- expect them to be portable to systems with different -- conventions for endianness or word size. -- | The raw bit array used by the immutable 'Bloom' type. , bitArray ) where import Control.Monad (liftM, forM_) import Control.Monad.ST (ST, runST) import Control.DeepSeq (NFData(..)) import Data.Array.Base (unsafeAt) import qualified Data.Array.Base as ST import Data.Array.Unboxed (UArray) import Data.Bits ((.&.), unsafeShiftL, unsafeShiftR) import Data.BloomFilter.Util ((:*)(..)) import qualified Data.BloomFilter.Mutable as MB import qualified Data.BloomFilter.Mutable.Internal as MB import Data.BloomFilter.Mutable.Internal (Hash, MBloom) import Data.Word (Word32) import Prelude hiding (elem, length, notElem, (/), (*), div, divMod, mod, rem) -- | An immutable Bloom filter, suitable for querying from pure code. data Bloom a = B { hashes :: !(a -> [Hash]) , shift :: {-# UNPACK #-} !Int , mask :: {-# UNPACK #-} !Int , bitArray :: {-# UNPACK #-} !(UArray Int Hash) } instance Show (Bloom a) where show ub = "Bloom { " ++ show ((1::Int) `unsafeShiftL` shift ub) ++ " bits } " instance NFData (Bloom a) where rnf !_ = () logBitsInHash :: Int logBitsInHash = 5 -- Data.BloomFilter.Mutable.logPower2 bitsInHash -- | Create an immutable Bloom filter, using the given setup function -- which executes in the 'ST' monad. -- -- Example: -- -- @ --import "Data.BloomFilter.Hash" (cheapHashes) -- --filter = create (cheapHashes 3) 1024 $ \mf -> do -- insertMB mf \"foo\" -- insertMB mf \"bar\" -- @ -- -- Note that the result of the setup function is not used. create :: (a -> [Hash]) -- ^ family of hash functions to use -> Int -- ^ number of bits in filter -> (forall s. (MBloom s a -> ST s ())) -- ^ setup function -> Bloom a {-# INLINE create #-} create hash numBits body = runST $ do mb <- MB.new hash numBits body mb unsafeFreeze mb -- | Create an immutable Bloom filter from a mutable one. The mutable -- filter may be modified afterwards. freeze :: MBloom s a -> ST s (Bloom a) freeze mb = B (MB.hashes mb) (MB.shift mb) (MB.mask mb) `liftM` ST.freeze (MB.bitArray mb) -- | Create an immutable Bloom filter from a mutable one. The mutable -- filter /must not/ be modified afterwards, or a runtime crash may -- occur. For a safer creation interface, use 'freeze' or 'create'. unsafeFreeze :: MBloom s a -> ST s (Bloom a) unsafeFreeze mb = B (MB.hashes mb) (MB.shift mb) (MB.mask mb) `liftM` ST.unsafeFreeze (MB.bitArray mb) -- | Copy an immutable Bloom filter to create a mutable one. There is -- no non-copying equivalent. thaw :: Bloom a -> ST s (MBloom s a) thaw ub = MB.MB (hashes ub) (shift ub) (mask ub) `liftM` ST.thaw (bitArray ub) -- | Create an empty Bloom filter. -- -- This function is subject to fusion with 'insert' -- and 'insertList'. empty :: (a -> [Hash]) -- ^ family of hash functions to use -> Int -- ^ number of bits in filter -> Bloom a {-# INLINE [1] empty #-} empty hash numBits = create hash numBits (\_ -> return ()) -- | Create a Bloom filter with a single element. -- -- This function is subject to fusion with 'insert' -- and 'insertList'. singleton :: (a -> [Hash]) -- ^ family of hash functions to use -> Int -- ^ number of bits in filter -> a -- ^ element to insert -> Bloom a {-# INLINE [1] singleton #-} singleton hash numBits elt = create hash numBits (\mb -> MB.insert mb elt) -- | Given a filter's mask and a hash value, compute an offset into -- a word array and a bit offset within that word. hashIdx :: Int -> Word32 -> (Int :* Int) hashIdx msk x = (y `unsafeShiftR` logBitsInHash) :* (y .&. hashMask) where hashMask = 31 -- bitsInHash - 1 y = fromIntegral x .&. msk -- | Hash the given value, returning a list of (word offset, bit -- offset) pairs, one per hash value. hashesU :: Bloom a -> a -> [Int :* Int] hashesU ub elt = hashIdx (mask ub) `map` hashes ub elt -- | Query an immutable Bloom filter for membership. If the value is -- present, return @True@. If the value is not present, there is -- /still/ some possibility that @True@ will be returned. elem :: a -> Bloom a -> Bool elem elt ub = all test (hashesU ub elt) where test (off :* bit) = (bitArray ub `unsafeAt` off) .&. (1 `unsafeShiftL` bit) /= 0 modify :: (forall s. (MBloom s a -> ST s z)) -- ^ mutation function (result is discarded) -> Bloom a -> Bloom a {-# INLINE modify #-} modify body ub = runST $ do mb <- thaw ub _ <- body mb unsafeFreeze mb -- | Create a new Bloom filter from an existing one, with the given -- member added. -- -- This function may be expensive, as it is likely to cause the -- underlying bit array to be copied. -- -- Repeated applications of this function with itself are subject to -- fusion. insert :: a -> Bloom a -> Bloom a {-# NOINLINE insert #-} insert elt = modify (flip MB.insert elt) -- | Create a new Bloom filter from an existing one, with the given -- members added. -- -- This function may be expensive, as it is likely to cause the -- underlying bit array to be copied. -- -- Repeated applications of this function with itself are subject to -- fusion. insertList :: [a] -> Bloom a -> Bloom a {-# NOINLINE insertList #-} insertList elts = modify $ \mb -> mapM_ (MB.insert mb) elts {-# RULES "Bloom insert . insert" forall a b u. insert b (insert a u) = insertList [a,b] u #-} {-# RULES "Bloom insertList . insert" forall x xs u. insertList xs (insert x u) = insertList (x:xs) u #-} {-# RULES "Bloom insert . insertList" forall x xs u. insert x (insertList xs u) = insertList (x:xs) u #-} {-# RULES "Bloom insertList . insertList" forall xs ys u. insertList xs (insertList ys u) = insertList (xs++ys) u #-} {-# RULES "Bloom insertList . empty" forall h n xs. insertList xs (empty h n) = fromList h n xs #-} {-# RULES "Bloom insertList . singleton" forall h n x xs. insertList xs (singleton h n x) = fromList h n (x:xs) #-} -- | Query an immutable Bloom filter for non-membership. If the value -- /is/ present, return @False@. If the value is not present, there -- is /still/ some possibility that @False@ will be returned. notElem :: a -> Bloom a -> Bool notElem elt ub = any test (hashesU ub elt) where test (off :* bit) = (bitArray ub `unsafeAt` off) .&. (1 `unsafeShiftL` bit) == 0 -- | Return the size of an immutable Bloom filter, in bits. length :: Bloom a -> Int length = unsafeShiftL 1 . shift -- | Build an immutable Bloom filter from a seed value. The seeding -- function populates the filter as follows. -- -- * If it returns 'Nothing', it is finished producing values to -- insert into the filter. -- -- * If it returns @'Just' (a,b)@, @a@ is added to the filter and -- @b@ is used as a new seed. unfold :: forall a b. (a -> [Hash]) -- ^ family of hash functions to use -> Int -- ^ number of bits in filter -> (b -> Maybe (a, b)) -- ^ seeding function -> b -- ^ initial seed -> Bloom a {-# INLINE unfold #-} unfold hs numBits f k = create hs numBits (loop k) where loop :: forall s. b -> MBloom s a -> ST s () loop j mb = case f j of Just (a, j') -> MB.insert mb a >> loop j' mb _ -> return () -- | Create an immutable Bloom filter, populating it from a list of -- values. -- -- Here is an example that uses the @cheapHashes@ function from the -- "Data.BloomFilter.Hash" module to create a hash function that -- returns three hashes. -- -- @ --import "Data.BloomFilter.Hash" (cheapHashes) -- --filt = fromList (cheapHashes 3) 1024 [\"foo\", \"bar\", \"quux\"] -- @ fromList :: (a -> [Hash]) -- ^ family of hash functions to use -> Int -- ^ number of bits in filter -> [a] -- ^ values to populate with -> Bloom a {-# INLINE [1] fromList #-} fromList hs numBits list = create hs numBits $ forM_ list . MB.insert {-# RULES "Bloom insertList . fromList" forall h n xs ys. insertList xs (fromList h n ys) = fromList h n (xs ++ ys) #-} {- -- This is a simpler definition, but GHC doesn't inline the unfold -- sensibly. fromList hashes numBits = unfold hashes numBits convert where convert (x:xs) = Just (x, xs) convert _ = Nothing -} -- $overview -- -- Each of the functions for creating Bloom filters accepts two parameters: -- -- * The number of bits that should be used for the filter. Note that -- a filter is fixed in size; it cannot be resized after creation. -- -- * A function that accepts a value, and should return a fixed-size -- list of hashes of that value. To keep the false positive rate -- low, the hashes computes should, as far as possible, be -- independent. -- -- By choosing these parameters with care, it is possible to tune for -- a particular false positive rate. The @suggestSizing@ function in -- the "Data.BloomFilter.Easy" module calculates useful estimates for -- these parameters. -- $ease -- -- This module provides immutable interfaces for working with a -- query-only Bloom filter, and for converting to and from mutable -- Bloom filters. -- -- For a higher-level interface that is easy to use, see the -- 'Data.BloomFilter.Easy' module. -- $performance -- -- The implementation has been carefully tuned for high performance -- and low space consumption. -- -- For efficiency, the number of bits requested when creating a Bloom -- filter is rounded up to the nearest power of two. This lets the -- implementation use bitwise operations internally, instead of much -- more expensive multiplication, division, and modulus operations. bloomfilter-2.0.1.2/Data/BloomFilter/0000755000000000000000000000000007346545000015450 5ustar0000000000000000bloomfilter-2.0.1.2/Data/BloomFilter/Array.hs0000644000000000000000000000160507346545000017064 0ustar0000000000000000{-# LANGUAGE CPP, FlexibleContexts, ForeignFunctionInterface, MagicHash, Rank2Types, UnliftedFFITypes #-} module Data.BloomFilter.Array (newArray) where import Control.Monad.ST (ST) import Control.Monad.ST.Unsafe (unsafeIOToST) import Data.Array.Base (MArray, STUArray(..), unsafeNewArray_) #if __GLASGOW_HASKELL__ >= 704 import Foreign.C.Types (CInt(..), CSize(..)) #else import Foreign.C.Types (CInt, CSize) #endif import Foreign.Ptr (Ptr) import GHC.Base (MutableByteArray#) newArray :: forall e s. (MArray (STUArray s) e (ST s)) => Int -> Int -> ST s (STUArray s Int e) {-# INLINE newArray #-} newArray numElems numBytes = do ary@(STUArray _ _ _ marr#) <- unsafeNewArray_ (0, numElems - 1) _ <- unsafeIOToST (memset marr# 0 (fromIntegral numBytes)) return ary foreign import ccall unsafe "memset" memset :: MutableByteArray# s -> CInt -> CSize -> IO (Ptr a) bloomfilter-2.0.1.2/Data/BloomFilter/Easy.hs0000644000000000000000000000770207346545000016713 0ustar0000000000000000{-# LANGUAGE ScopedTypeVariables #-} -- | -- Module: Data.BloomFilter.Easy -- Copyright: Bryan O'Sullivan -- License: BSD3 -- -- Maintainer: Bryan O'Sullivan -- Stability: unstable -- Portability: portable -- -- An easy-to-use Bloom filter interface. module Data.BloomFilter.Easy ( -- * Easy creation and querying Bloom , easyList , B.elem , B.notElem , B.length -- ** Example: a spell checker -- $example -- * Useful defaults for creation , safeSuggestSizing , suggestSizing ) where import Data.BloomFilter (Bloom) import Data.BloomFilter.Hash (Hashable, cheapHashes) import Data.BloomFilter.Util (nextPowerOfTwo) import qualified Data.ByteString as SB import qualified Data.ByteString.Lazy as LB import qualified Data.BloomFilter as B -- | Create a Bloom filter with the given false positive rate and -- members. The hash functions used are computed by the @cheapHashes@ -- function from the 'Data.BloomFilter.Hash' module. easyList :: (Hashable a) => Double -- ^ desired false positive rate (0 < /e/ < 1) -> [a] -- ^ values to populate with -> Bloom a {-# SPECIALIZE easyList :: Double -> [String] -> Bloom String #-} {-# SPECIALIZE easyList :: Double -> [LB.ByteString] -> Bloom LB.ByteString #-} {-# SPECIALIZE easyList :: Double -> [SB.ByteString] -> Bloom SB.ByteString #-} easyList errRate xs = B.fromList (cheapHashes numHashes) numBits xs where capacity = length xs (numBits, numHashes) | capacity > 0 = suggestSizing capacity errRate | otherwise = (1, 1) -- | Suggest a good combination of filter size and number of hash -- functions for a Bloom filter, based on its expected maximum -- capacity and a desired false positive rate. -- -- The false positive rate is the rate at which queries against the -- filter should return @True@ when an element is not actually -- present. It should be a fraction between 0 and 1, so a 1% false -- positive rate is represented by 0.01. safeSuggestSizing :: Int -- ^ expected maximum capacity -> Double -- ^ desired false positive rate (0 < /e/ < 1) -> Either String (Int, Int) safeSuggestSizing capacity errRate | capacity <= 0 = Left "invalid capacity" | errRate <= 0 || errRate >= 1 = Left "invalid error rate" | otherwise = let cap = fromIntegral capacity (bits :: Double, hashes :: Double) = minimum [((-k) * cap / log (1 - (errRate ** (1 / k))), k) | k <- [1..100]] roundedBits = nextPowerOfTwo (ceiling bits) in if roundedBits <= 0 || maxbitstoolarge roundedBits then Left "capacity too large to represent" else Right (roundedBits, truncate hashes) where maxbits = 0xffffffff -- On 32 bit, maxbits is larger than maxBound :: Int, so wraps around -- to a negative number; avoid using it in that case. maxbitstoolarge n = if maxbits > 0 then n > maxbits else True -- | Behaves as 'safeSuggestSizing', but calls 'error' if given -- invalid or out-of-range inputs. suggestSizing :: Int -- ^ expected maximum capacity -> Double -- ^ desired false positive rate (0 < /e/ < 1) -> (Int, Int) suggestSizing cap errs = either fatal id (safeSuggestSizing cap errs) where fatal = error . ("Data.BloomFilter.Util.suggestSizing: " ++) -- $example -- -- This example reads a dictionary file containing one word per line, -- constructs a Bloom filter with a 1% false positive rate, and -- spellchecks its standard input. Like the Unix @spell@ command, it -- prints each word that it does not recognize. -- -- @ --import Data.BloomFilter.Easy (easyList, elemB) -- --main = do -- filt <- ('easyList' 0.01 . words) \`fmap\` readFile \"/usr/share/dict/words\" -- let check word | 'elemB' word filt = \"\" -- | otherwise = word ++ \"\\n\" -- interact (concat . map check . lines) -- @ bloomfilter-2.0.1.2/Data/BloomFilter/Hash.hs0000644000000000000000000003215507346545000016675 0ustar0000000000000000{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, TypeOperators #-} -- | -- Module: Data.BloomFilter.Hash -- Copyright: Bryan O'Sullivan -- License: BSD3 -- -- Maintainer: Bryan O'Sullivan -- Stability: unstable -- Portability: portable -- -- Fast hashing of Haskell values. The hash functions used are Bob -- Jenkins's public domain functions, which combine high performance -- with excellent mixing properties. For more details, see -- . -- -- In addition to the usual "one input, one output" hash functions, -- this module provides multi-output hash functions, suitable for use -- in applications that need multiple hashes, such as Bloom filtering. module Data.BloomFilter.Hash ( -- * Basic hash functionality Hashable(..) , hash32 , hash64 , hashSalt32 , hashSalt64 -- * Compute a family of hash values , hashes , cheapHashes -- * Hash functions for 'Storable' instances , hashOne32 , hashOne64 , hashList32 , hashList64 ) where import Control.Monad (foldM) import Data.Bits ((.&.), (.|.), unsafeShiftL, unsafeShiftR, xor) import Data.List (unfoldr) import Data.Int (Int8, Int16, Int32, Int64) import Data.Word (Word8, Word16, Word32, Word64) import Foreign.C.String (CString) #if __GLASGOW_HASKELL__ >= 704 import Foreign.C.Types (CInt(..), CSize(..)) #else import Foreign.C.Types (CInt, CSize) #endif import Foreign.ForeignPtr (withForeignPtr) import Foreign.Marshal.Alloc (alloca) import Foreign.Marshal.Array (allocaArray, withArrayLen) import Foreign.Ptr (Ptr, castPtr, nullPtr, plusPtr) import Foreign.Storable (Storable, peek, poke, sizeOf) import System.IO.Unsafe (unsafePerformIO) import Data.ByteString.Internal (ByteString(..)) import qualified Data.ByteString as SB import qualified Data.ByteString.Lazy.Internal as LB import qualified Data.ByteString.Lazy as LB #include "HsBaseConfig.h" -- Make sure we're not performing any expensive arithmetic operations. -- import Prelude hiding ((/), (*), div, divMod, mod, rem) foreign import ccall unsafe "lookup3.h _jenkins_hashword" hashWord :: Ptr Word32 -> CSize -> Word32 -> IO Word32 foreign import ccall unsafe "lookup3.h _jenkins_hashword2" hashWord2 :: Ptr Word32 -> CSize -> Ptr Word32 -> Ptr Word32 -> IO () foreign import ccall unsafe "lookup3.h _jenkins_hashlittle" hashLittle :: Ptr a -> CSize -> Word32 -> IO Word32 foreign import ccall unsafe "lookup3.h _jenkins_hashlittle2" hashLittle2 :: Ptr a -> CSize -> Ptr Word32 -> Ptr Word32 -> IO () class Hashable a where -- | Compute a 32-bit hash of a value. The salt value perturbs -- the result. hashIO32 :: a -- ^ value to hash -> Word32 -- ^ salt -> IO Word32 -- | Compute a 64-bit hash of a value. The first salt value -- perturbs the first element of the result, and the second salt -- perturbs the second. hashIO64 :: a -- ^ value to hash -> Word64 -- ^ salt -> IO Word64 hashIO64 v salt = do let s1 = fromIntegral (salt `unsafeShiftR` 32) .&. maxBound s2 = fromIntegral salt h1 <- hashIO32 v s1 h2 <- hashIO32 v s2 return $ (fromIntegral h1 `unsafeShiftL` 32) .|. fromIntegral h2 -- | Compute a 32-bit hash. hash32 :: Hashable a => a -> Word32 hash32 = hashSalt32 0x16fc397c hash64 :: Hashable a => a -> Word64 hash64 = hashSalt64 0x16fc397cf62f64d3 -- | Compute a salted 32-bit hash. hashSalt32 :: Hashable a => Word32 -- ^ salt -> a -- ^ value to hash -> Word32 {-# INLINE hashSalt32 #-} hashSalt32 salt k = unsafePerformIO $ hashIO32 k salt -- | Compute a salted 64-bit hash. hashSalt64 :: Hashable a => Word64 -- ^ salt -> a -- ^ value to hash -> Word64 {-# INLINE hashSalt64 #-} hashSalt64 salt k = unsafePerformIO $ hashIO64 k salt -- | Compute a list of 32-bit hashes. The value to hash may be -- inspected as many times as there are hashes requested. hashes :: Hashable a => Int -- ^ number of hashes to compute -> a -- ^ value to hash -> [Word32] hashes n v = unfoldr go (n,0x3f56da2d) where go (k,s) | k <= 0 = Nothing | otherwise = let s' = hashSalt32 s v in Just (s', (k-1,s')) -- | Compute a list of 32-bit hashes relatively cheaply. The value to -- hash is inspected at most twice, regardless of the number of hashes -- requested. -- -- We use a variant of Kirsch and Mitzenmacher's technique from \"Less -- Hashing, Same Performance: Building a Better Bloom Filter\", -- . -- -- Where Kirsch and Mitzenmacher multiply the second hash by a -- coefficient, we shift right by the coefficient. This offers better -- performance (as a shift is much cheaper than a multiply), and the -- low order bits of the final hash stay well mixed. cheapHashes :: Hashable a => Int -- ^ number of hashes to compute -> a -- ^ value to hash -> [Word32] {-# SPECIALIZE cheapHashes :: Int -> SB.ByteString -> [Word32] #-} {-# SPECIALIZE cheapHashes :: Int -> LB.ByteString -> [Word32] #-} {-# SPECIALIZE cheapHashes :: Int -> String -> [Word32] #-} cheapHashes k v = go 0 where go i | i == j = [] | otherwise = hash : go (i + 1) where !hash = h1 + (h2 `unsafeShiftR` i) h1 = fromIntegral (h `unsafeShiftR` 32) h2 = fromIntegral h h = hashSalt64 0x9150a946c4a8966e v j = fromIntegral k instance Hashable () where hashIO32 _ salt = return salt instance Hashable Integer where hashIO32 k salt | k < 0 = hashIO32 (unfoldr go (-k)) (salt `xor` 0x3ece731e) | otherwise = hashIO32 (unfoldr go k) salt where go 0 = Nothing go i = Just (fromIntegral i :: Word32, i `unsafeShiftR` 32) instance Hashable Bool where hashIO32 = hashOne32 hashIO64 = hashOne64 instance Hashable Ordering where hashIO32 = hashIO32 . fromEnum hashIO64 = hashIO64 . fromEnum instance Hashable Char where hashIO32 = hashOne32 hashIO64 = hashOne64 instance Hashable Int where hashIO32 = hashOne32 hashIO64 = hashOne64 instance Hashable Float where hashIO32 = hashOne32 hashIO64 = hashOne64 instance Hashable Double where hashIO32 = hashOne32 hashIO64 = hashOne64 instance Hashable Int8 where hashIO32 = hashOne32 hashIO64 = hashOne64 instance Hashable Int16 where hashIO32 = hashOne32 hashIO64 = hashOne64 instance Hashable Int32 where hashIO32 = hashOne32 hashIO64 = hashOne64 instance Hashable Int64 where hashIO32 = hashOne32 hashIO64 = hashOne64 instance Hashable Word8 where hashIO32 = hashOne32 hashIO64 = hashOne64 instance Hashable Word16 where hashIO32 = hashOne32 hashIO64 = hashOne64 instance Hashable Word32 where hashIO32 = hashOne32 hashIO64 = hashOne64 instance Hashable Word64 where hashIO32 = hashOne32 hashIO64 = hashOne64 -- | A fast unchecked shift. Nasty, but otherwise GHC 6.8.2 does a -- test and branch on every shift. div4 :: CSize -> CSize div4 k = fromIntegral ((fromIntegral k :: HTYPE_SIZE_T) `unsafeShiftR` 2) alignedHash :: Ptr a -> CSize -> Word32 -> IO Word32 alignedHash ptr bytes salt | bytes .&. 3 == 0 = hashWord (castPtr ptr) (div4 bytes) salt' | otherwise = hashLittle ptr bytes salt' where salt' = fromIntegral salt -- Inlined from Foreign.Marshal.Utils, for performance reasons. with :: Storable a => a -> (Ptr a -> IO b) -> IO b with val f = alloca $ \ptr -> do poke ptr val f ptr alignedHash2 :: Ptr a -> CSize -> Word64 -> IO Word64 alignedHash2 ptr bytes salt = with (fromIntegral salt) $ \sp -> do let p1 = castPtr sp p2 = castPtr sp `plusPtr` 4 doubleHash ptr bytes p1 p2 peek sp doubleHash :: Ptr a -> CSize -> Ptr Word32 -> Ptr Word32 -> IO () doubleHash ptr bytes p1 p2 | bytes .&. 3 == 0 = hashWord2 (castPtr ptr) (div4 bytes) p1 p2 | otherwise = hashLittle2 ptr bytes p1 p2 instance Hashable SB.ByteString where hashIO32 bs salt = unsafeUseAsCStringLen bs $ \ptr len -> alignedHash ptr (fromIntegral len) salt {-# INLINE hashIO64 #-} hashIO64 bs salt = unsafeUseAsCStringLen bs $ \ptr len -> alignedHash2 ptr (fromIntegral len) salt rechunk :: LB.ByteString -> [SB.ByteString] rechunk s | LB.null s = [] | otherwise = let (pre,suf) = LB.splitAt chunkSize s in repack pre : rechunk suf where repack = SB.concat . LB.toChunks chunkSize = fromIntegral LB.defaultChunkSize instance Hashable LB.ByteString where hashIO32 bs salt = foldM (flip hashIO32) salt (rechunk bs) {-# INLINE hashIO64 #-} hashIO64 = hashChunks instance Hashable a => Hashable (Maybe a) where hashIO32 Nothing salt = return salt hashIO32 (Just k) salt = hashIO32 k salt hashIO64 Nothing salt = return salt hashIO64 (Just k) salt = hashIO64 k salt instance (Hashable a, Hashable b) => Hashable (Either a b) where hashIO32 (Left a) salt = hashIO32 a salt hashIO32 (Right b) salt = hashIO32 b (salt + 1) hashIO64 (Left a) salt = hashIO64 a salt hashIO64 (Right b) salt = hashIO64 b (salt + 1) instance (Hashable a, Hashable b) => Hashable (a, b) where hashIO32 (a,b) salt = hashIO32 a salt >>= hashIO32 b hashIO64 (a,b) salt = hashIO64 a salt >>= hashIO64 b instance (Hashable a, Hashable b, Hashable c) => Hashable (a, b, c) where hashIO32 (a,b,c) salt = hashIO32 a salt >>= hashIO32 b >>= hashIO32 c instance (Hashable a, Hashable b, Hashable c, Hashable d) => Hashable (a, b, c, d) where hashIO32 (a,b,c,d) salt = hashIO32 a salt >>= hashIO32 b >>= hashIO32 c >>= hashIO32 d instance (Hashable a, Hashable b, Hashable c, Hashable d, Hashable e) => Hashable (a, b, c, d, e) where hashIO32 (a,b,c,d,e) salt = hashIO32 a salt >>= hashIO32 b >>= hashIO32 c >>= hashIO32 d >>= hashIO32 e instance Storable a => Hashable [a] where hashIO32 = hashList32 {-# INLINE hashIO64 #-} hashIO64 = hashList64 -- | Compute a 32-bit hash of a 'Storable' instance. hashOne32 :: Storable a => a -> Word32 -> IO Word32 hashOne32 k salt = with k $ \ptr -> alignedHash ptr (fromIntegral (sizeOf k)) salt -- | Compute a 64-bit hash of a 'Storable' instance. hashOne64 :: Storable a => a -> Word64 -> IO Word64 hashOne64 k salt = with k $ \ptr -> alignedHash2 ptr (fromIntegral (sizeOf k)) salt -- | Compute a 32-bit hash of a list of 'Storable' instances. hashList32 :: Storable a => [a] -> Word32 -> IO Word32 hashList32 xs salt = withArrayLen xs $ \len ptr -> alignedHash ptr (fromIntegral (len * sizeOf (head xs))) salt -- | Compute a 64-bit hash of a list of 'Storable' instances. hashList64 :: Storable a => [a] -> Word64 -> IO Word64 hashList64 xs salt = withArrayLen xs $ \len ptr -> alignedHash2 ptr (fromIntegral (len * sizeOf (head xs))) salt unsafeUseAsCStringLen :: SB.ByteString -> (CString -> Int -> IO a) -> IO a unsafeUseAsCStringLen (PS fp o l) action = withForeignPtr fp $ \p -> action (p `plusPtr` o) l type HashState = Ptr Word32 foreign import ccall unsafe "lookup3.h _jenkins_little2_begin" c_begin :: Ptr Word32 -> Ptr Word32 -> HashState -> IO () foreign import ccall unsafe "lookup3.h _jenkins_little2_frag" c_frag :: Ptr a -> CSize -> HashState -> CSize -> IO CSize foreign import ccall unsafe "lookup3.h _jenkins_little2_step" c_step :: Ptr a -> CSize -> HashState -> IO CSize foreign import ccall unsafe "lookup3.h _jenkins_little2_end" c_end :: CInt -> Ptr Word32 -> Ptr Word32 -> HashState -> IO () unsafeAdjustCStringLen :: SB.ByteString -> Int -> (CString -> Int -> IO a) -> IO a unsafeAdjustCStringLen (PS fp o l) d action | d > l = action nullPtr 0 | otherwise = withForeignPtr fp $ \p -> action (p `plusPtr` (o + d)) (l - d) hashChunks :: LB.ByteString -> Word64 -> IO Word64 hashChunks s salt = do with (fromIntegral salt) $ \sp -> do let p1 = castPtr sp p2 = castPtr sp `plusPtr` 4 allocaArray 3 $ \st -> do let step :: LB.ByteString -> Int -> IO Int step (LB.Chunk x xs) off = do unread <- unsafeAdjustCStringLen x off $ \ptr len -> c_step ptr (fromIntegral len) st if unread > 0 then frag xs unread else step xs 0 step _ _ = return 0 frag :: LB.ByteString -> CSize -> IO Int frag c@(LB.Chunk x xs) stoff = do nstoff <- unsafeUseAsCStringLen x $ \ptr len -> do c_frag ptr (fromIntegral len) st stoff if nstoff == 12 then step c (fromIntegral (nstoff - stoff)) else frag xs nstoff frag LB.Empty stoff = return (fromIntegral (12 - stoff)) c_begin p1 p2 st unread <- step s 0 c_end (fromIntegral unread) p1 p2 st peek sp bloomfilter-2.0.1.2/Data/BloomFilter/Mutable.hs0000644000000000000000000001416507346545000017404 0ustar0000000000000000{-# LANGUAGE BangPatterns, CPP, Rank2Types, TypeOperators,FlexibleContexts #-} -- | -- Module: Data.BloomFilter.Mutable -- Copyright: Bryan O'Sullivan -- License: BSD3 -- -- Maintainer: Bryan O'Sullivan -- Stability: unstable -- Portability: portable -- -- A fast, space efficient Bloom filter implementation. A Bloom -- filter is a set-like data structure that provides a probabilistic -- membership test. -- -- * Queries do not give false negatives. When an element is added to -- a filter, a subsequent membership test will definitely return -- 'True'. -- -- * False positives /are/ possible. If an element has not been added -- to a filter, a membership test /may/ nevertheless indicate that -- the element is present. -- -- This module provides low-level control. For an easier to use -- interface, see the "Data.BloomFilter.Easy" module. module Data.BloomFilter.Mutable ( -- * Overview -- $overview -- ** Ease of use -- $ease -- ** Performance -- $performance -- * Types Hash , MBloom -- * Mutable Bloom filters -- ** Creation , new -- ** Accessors , length , elem -- ** Mutation , insert -- * The underlying representation -- | If you serialize the raw bit arrays below to disk, do not -- expect them to be portable to systems with different -- conventions for endianness or word size. -- | The raw bit array used by the mutable 'MBloom' type. , bitArray ) where #include "MachDeps.h" import Control.Monad (liftM, forM_) import Control.Monad.ST (ST) import Data.Array.Base (unsafeRead, unsafeWrite) import Data.Bits ((.&.), (.|.), unsafeShiftL, unsafeShiftR) import Data.BloomFilter.Array (newArray) import Data.BloomFilter.Util ((:*)(..), nextPowerOfTwo) import Data.Word (Word32) import Data.BloomFilter.Mutable.Internal import Prelude hiding (elem, length, notElem, (/), (*), div, divMod, mod, rem) -- | Create a new mutable Bloom filter. For efficiency, the number of -- bits used may be larger than the number requested. It is always -- rounded up to the nearest higher power of two, but will be clamped -- at a maximum of 4 gigabits, since hashes are 32 bits in size. new :: (a -> [Hash]) -- ^ family of hash functions to use -> Int -- ^ number of bits in filter -> ST s (MBloom s a) new hash numBits = MB hash shft msk `liftM` newArray numElems numBytes where twoBits | numBits < 1 = 1 | numBits > maxHash = maxHash | isPowerOfTwo numBits = numBits | otherwise = nextPowerOfTwo numBits numElems = max 2 (twoBits `unsafeShiftR` logBitsInHash) numBytes = numElems `unsafeShiftL` logBytesInHash trueBits = numElems `unsafeShiftL` logBitsInHash shft = logPower2 trueBits msk = trueBits - 1 isPowerOfTwo n = n .&. (n - 1) == 0 maxHash :: Int #if WORD_SIZE_IN_BITS == 64 maxHash = 4294967296 #else maxHash = 1073741824 #endif logBitsInHash :: Int logBitsInHash = 5 -- logPower2 bitsInHash logBytesInHash :: Int logBytesInHash = 2 -- logPower2 (sizeOf (undefined :: Hash)) -- | Given a filter's mask and a hash value, compute an offset into -- a word array and a bit offset within that word. hashIdx :: Int -> Word32 -> (Int :* Int) hashIdx msk x = (y `unsafeShiftR` logBitsInHash) :* (y .&. hashMask) where hashMask = 31 -- bitsInHash - 1 y = fromIntegral x .&. msk -- | Hash the given value, returning a list of (word offset, bit -- offset) pairs, one per hash value. hashesM :: MBloom s a -> a -> [Int :* Int] hashesM mb elt = hashIdx (mask mb) `map` hashes mb elt -- | Insert a value into a mutable Bloom filter. Afterwards, a -- membership query for the same value is guaranteed to return @True@. insert :: MBloom s a -> a -> ST s () insert mb elt = do let mu = bitArray mb forM_ (hashesM mb elt) $ \(word :* bit) -> do old <- unsafeRead mu word unsafeWrite mu word (old .|. (1 `unsafeShiftL` bit)) -- | Query a mutable Bloom filter for membership. If the value is -- present, return @True@. If the value is not present, there is -- /still/ some possibility that @True@ will be returned. elem :: a -> MBloom s a -> ST s Bool elem elt mb = loop (hashesM mb elt) where mu = bitArray mb loop ((word :* bit):wbs) = do i <- unsafeRead mu word if i .&. (1 `unsafeShiftL` bit) == 0 then return False else loop wbs loop _ = return True -- bitsInHash :: Int -- bitsInHash = sizeOf (undefined :: Hash) `shiftL` 3 -- | Return the size of a mutable Bloom filter, in bits. length :: MBloom s a -> Int length = unsafeShiftL 1 . shift -- | Slow, crummy way of computing the integer log of an integer known -- to be a power of two. logPower2 :: Int -> Int logPower2 k = go 0 k where go j 1 = j go j n = go (j+1) (n `unsafeShiftR` 1) -- $overview -- -- Each of the functions for creating Bloom filters accepts two parameters: -- -- * The number of bits that should be used for the filter. Note that -- a filter is fixed in size; it cannot be resized after creation. -- -- * A function that accepts a value, and should return a fixed-size -- list of hashes of that value. To keep the false positive rate -- low, the hashes computes should, as far as possible, be -- independent. -- -- By choosing these parameters with care, it is possible to tune for -- a particular false positive rate. The @suggestSizing@ function in -- the "Data.BloomFilter.Easy" module calculates useful estimates for -- these parameters. -- $ease -- -- This module provides both mutable interfaces for creating and -- querying a Bloom filter. It is most useful as a low-level way to -- manage a Bloom filter with a custom set of characteristics. -- $performance -- -- The implementation has been carefully tuned for high performance -- and low space consumption. -- -- For efficiency, the number of bits requested when creating a Bloom -- filter is rounded up to the nearest power of two. This lets the -- implementation use bitwise operations internally, instead of much -- more expensive multiplication, division, and modulus operations. bloomfilter-2.0.1.2/Data/BloomFilter/Mutable/0000755000000000000000000000000007346545000017041 5ustar0000000000000000bloomfilter-2.0.1.2/Data/BloomFilter/Mutable/Internal.hs0000644000000000000000000000176407346545000021161 0ustar0000000000000000-- | -- Module: Data.BloomFilter.Mutable.Internal -- Copyright: Bryan O'Sullivan -- License: BSD3 -- -- Maintainer: Bryan O'Sullivan -- Stability: unstable -- Portability: portable module Data.BloomFilter.Mutable.Internal ( -- * Types Hash , MBloom(..) ) where import Data.Array.Base (STUArray) import Data.Bits (shiftL) import Data.Word (Word32) import Prelude hiding (elem, length, notElem, (/), (*), div, divMod, mod, rem) -- | A hash value is 32 bits wide. This limits the maximum size of a -- filter to about four billion elements, or 512 megabytes of memory. type Hash = Word32 -- | A mutable Bloom filter, for use within the 'ST' monad. data MBloom s a = MB { hashes :: !(a -> [Hash]) , shift :: {-# UNPACK #-} !Int , mask :: {-# UNPACK #-} !Int , bitArray :: {-# UNPACK #-} !(STUArray s Int Hash) } instance Show (MBloom s a) where show mb = "MBloom { " ++ show ((1::Int) `shiftL` shift mb) ++ " bits } " bloomfilter-2.0.1.2/Data/BloomFilter/Util.hs0000644000000000000000000000135207346545000016722 0ustar0000000000000000{-# LANGUAGE BangPatterns, MagicHash, TypeOperators #-} module Data.BloomFilter.Util ( nextPowerOfTwo , (:*)(..) ) where import Data.Bits ((.|.), unsafeShiftR) -- | A strict pair type. data a :* b = !a :* !b deriving (Eq, Ord, Show) -- | Compute the nearest power of two greater to or equal than the -- given number. nextPowerOfTwo :: Int -> Int {-# INLINE nextPowerOfTwo #-} nextPowerOfTwo n = let a = n - 1 b = a .|. (a `unsafeShiftR` 1) c = b .|. (b `unsafeShiftR` 2) d = c .|. (c `unsafeShiftR` 4) e = d .|. (d `unsafeShiftR` 8) f = e .|. (e `unsafeShiftR` 16) g = f .|. (f `unsafeShiftR` 32) -- in case we're on a 64-bit host !h = g + 1 in h bloomfilter-2.0.1.2/LICENSE0000644000000000000000000000271107346545000013367 0ustar0000000000000000Copyright 2008 Bryan O'Sullivan . All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the author nor the names of his contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bloomfilter-2.0.1.2/README.markdown0000644000000000000000000000136507346545000015067 0ustar0000000000000000# A fast, space efficient Bloom filter implementation Copyright 2008, 2009, 2010, 2011 Bryan O'Sullivan . This package provides both mutable and immutable Bloom filter data types, along with a family of hash function and an easy-to-use interface. To build: cabal install bloomfilter For examples of usage, see the Haddock documentation and the files in the examples directory. # Get involved! Please report bugs via the [github issue tracker](https://github.com/haskell-pkg-janitors/bloomfilter). Master [git repository](https://github.com/haskell-pkg-janitors/bloomfilter): * `git clone git://github.com/haskell-pkg-janitors/bloomfilter.git` # Authors This library is written by Bryan O'Sullivan, . bloomfilter-2.0.1.2/Setup.lhs0000644000000000000000000000011407346545000014165 0ustar0000000000000000#!/usr/bin/env runhaskell > import Distribution.Simple > main = defaultMain bloomfilter-2.0.1.2/bloomfilter.cabal0000644000000000000000000000365507346545000015674 0ustar0000000000000000cabal-version: 3.0 name: bloomfilter version: 2.0.1.2 license: BSD-3-Clause license-file: LICENSE author: Bryan O'Sullivan maintainer: Joey Hess homepage: https://github.com/haskell-pkg-janitors/bloomfilter bug-reports: https://github.com/haskell-pkg-janitors/bloomfilter/issues description: Pure and impure Bloom Filter implementations. synopsis: Pure and impure Bloom Filter implementations. category: Data stability: provisional build-type: Simple extra-source-files: README.markdown CHANGELOG.md cbits/lookup3.c cbits/lookup3.h examples/Makefile examples/SpellChecker.hs examples/Words.hs tested-with: GHC == 9.2.1 GHC == 9.0.1 GHC == 8.10.7 GHC == 8.8.4 GHC == 8.6.5 GHC == 8.4.4 GHC == 8.2.2 GHC == 8.0.2 GHC == 7.10.3 GHC == 7.8.4 GHC == 7.6.3 GHC == 7.4.2 library default-language: Haskell2010 build-depends: array, base >= 4.5 && < 5, bytestring >= 0.9, deepseq exposed-modules: Data.BloomFilter Data.BloomFilter.Easy Data.BloomFilter.Mutable Data.BloomFilter.Hash other-modules: Data.BloomFilter.Array Data.BloomFilter.Mutable.Internal Data.BloomFilter.Util c-sources: cbits/lookup3.c ghc-options: -O2 -Wall include-dirs: cbits includes: lookup3.h install-includes: lookup3.h test-suite tests default-language: Haskell2010 type: exitcode-stdio-1.0 hs-source-dirs: tests main-is: QC.hs other-modules: QCSupport build-depends: QuickCheck >= 2.5, base >= 4.4 && < 5, bloomfilter, bytestring, random, test-framework, test-framework-quickcheck2 source-repository head type: git location: git://github.com/haskell-pkg-janitors/bloomfilter.git bloomfilter-2.0.1.2/cbits/0000755000000000000000000000000007346545000013465 5ustar0000000000000000bloomfilter-2.0.1.2/cbits/lookup3.c0000644000000000000000000007301707346545000015235 0ustar0000000000000000/* ------------------------------------------------------------------------------- lookup3.c, by Bob Jenkins, May 2006, Public Domain. These are functions for producing 32-bit hashes for hash table lookup. hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() are externally useful functions. Routines to test the hash are included if SELF_TEST is defined. You can use this free for any purpose. It's in the public domain. It has no warranty. You probably want to use hashlittle(). hashlittle() and hashbig() hash byte arrays. hashlittle() is is faster than hashbig() on little-endian machines. Intel and AMD are little-endian machines. On second thought, you probably want hashlittle2(), which is identical to hashlittle() except it returns two 32-bit hashes for the price of one. You could implement hashbig2() if you wanted but I haven't bothered here. If you want to find a hash of, say, exactly 7 integers, do a = i1; b = i2; c = i3; mix(a,b,c); a += i4; b += i5; c += i6; mix(a,b,c); a += i7; final(a,b,c); then use c as the hash value. If you have a variable length array of 4-byte integers to hash, use hashword(). If you have a byte array (like a character string), use hashlittle(). If you have several byte arrays, or a mix of things, see the comments above hashlittle(). Why is this so big? I read 12 bytes at a time into 3 4-byte integers, then mix those integers. This is fast (you can do a lot more thorough mixing with 12*3 instructions on 3 integers than you can with 3 instructions on 1 byte), but shoehorning those bytes into integers efficiently is messy. ------------------------------------------------------------------------------- */ /* #define SELF_TEST 1 */ #define hashword _jenkins_hashword #define hashword2 _jenkins_hashword2 #define hashlittle _jenkins_hashlittle #define hashlittle2 _jenkins_hashlittle2 #define hashbig _jenkins_hashbig #include /* defines printf for tests */ #include /* defines time_t for timings in the test */ #include /* defines uint32_t etc */ #include /* attempt to define endianness */ #ifdef linux # include /* attempt to define endianness */ #endif #include "lookup3.h" #define hashsize(n) ((uint32_t)1<<(n)) #define hashmask(n) (hashsize(n)-1) /* -------------------------------------------------------------------- This works on all machines. To be useful, it requires -- that the key be an array of uint32_t's, and -- that the length be the number of uint32_t's in the key The function hashword() is identical to hashlittle() on little-endian machines, and identical to hashbig() on big-endian machines, except that the length has to be measured in uint32_ts rather than in bytes. hashlittle() is more complicated than hashword() only because hashlittle() has to dance around fitting the key bytes into registers. -------------------------------------------------------------------- */ uint32_t hashword( const uint32_t *k, /* the key, an array of uint32_t values */ size_t length, /* the length of the key, in uint32_ts */ uint32_t initval) /* the previous hash, or an arbitrary value */ { uint32_t a,b,c; /* Set up the internal state */ a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval; /*------------------------------------------------- handle most of the key */ while (length > 3) { a += k[0]; b += k[1]; c += k[2]; mix(a,b,c); length -= 3; k += 3; } /*------------------------------------------- handle the last 3 uint32_t's */ switch(length) /* all the case statements fall through */ { case 3 : c+=k[2]; case 2 : b+=k[1]; case 1 : a+=k[0]; final(a,b,c); case 0: /* case 0: nothing left to add */ break; } /*------------------------------------------------------ report the result */ return c; } /* -------------------------------------------------------------------- hashword2() -- same as hashword(), but take two seeds and return two 32-bit values. pc and pb must both be nonnull, and *pc and *pb must both be initialized with seeds. If you pass in (*pb)==0, the output (*pc) will be the same as the return value from hashword(). -------------------------------------------------------------------- */ void hashword2 ( const uint32_t *k, /* the key, an array of uint32_t values */ size_t length, /* the length of the key, in uint32_ts */ uint32_t *pc, /* IN: seed OUT: primary hash value */ uint32_t *pb) /* IN: more seed OUT: secondary hash value */ { uint32_t a,b,c; /* Set up the internal state */ a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc; c += *pb; /*------------------------------------------------- handle most of the key */ while (length > 3) { a += k[0]; b += k[1]; c += k[2]; mix(a,b,c); length -= 3; k += 3; } /*------------------------------------------- handle the last 3 uint32_t's */ switch(length) /* all the case statements fall through */ { case 3 : c+=k[2]; case 2 : b+=k[1]; case 1 : a+=k[0]; final(a,b,c); case 0: /* case 0: nothing left to add */ break; } /*------------------------------------------------------ report the result */ *pc=c; *pb=b; } /* ------------------------------------------------------------------------------- hashlittle() -- hash a variable-length key into a 32-bit value k : the key (the unaligned variable-length array of bytes) length : the length of the key, counting by bytes initval : can be any 4-byte value Returns a 32-bit value. Every bit of the key affects every bit of the return value. Two keys differing by one or two bits will have totally different hash values. The best hash table sizes are powers of 2. There is no need to do mod a prime (mod is sooo slow!). If you need less than 32 bits, use a bitmask. For example, if you need only 10 bits, do h = (h & hashmask(10)); In which case, the hash table should have hashsize(10) elements. If you are hashing n strings (uint8_t **)k, do it like this: for (i=0, h=0; i 12) { a += k[0]; b += k[1]; c += k[2]; mix(a,b,c); length -= 12; k += 3; } /*----------------------------- handle the last (probably partial) block */ /* * "k[2]&0xffffff" actually reads beyond the end of the string, but * then masks off the part it's not allowed to read. Because the * string is aligned, the masked-off tail is in the same word as the * rest of the string. Every machine with memory protection I've seen * does it on word boundaries, so is OK with this. But VALGRIND will * still catch it and complain. The masking trick does make the hash * noticably faster for short strings (like English words). */ #ifndef VALGRIND switch(length) { case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break; case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break; case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break; case 8 : b+=k[1]; a+=k[0]; break; case 7 : b+=k[1]&0xffffff; a+=k[0]; break; case 6 : b+=k[1]&0xffff; a+=k[0]; break; case 5 : b+=k[1]&0xff; a+=k[0]; break; case 4 : a+=k[0]; break; case 3 : a+=k[0]&0xffffff; break; case 2 : a+=k[0]&0xffff; break; case 1 : a+=k[0]&0xff; break; case 0 : return c; /* zero length strings require no mixing */ } #else /* make valgrind happy */ k8 = (const uint8_t *)k; switch(length) { case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ case 10: c+=((uint32_t)k8[9])<<8; /* fall through */ case 9 : c+=k8[8]; /* fall through */ case 8 : b+=k[1]; a+=k[0]; break; case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ case 6 : b+=((uint32_t)k8[5])<<8; /* fall through */ case 5 : b+=k8[4]; /* fall through */ case 4 : a+=k[0]; break; case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ case 2 : a+=((uint32_t)k8[1])<<8; /* fall through */ case 1 : a+=k8[0]; break; case 0 : return c; } #endif /* !valgrind */ } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) { const uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */ const uint8_t *k8; /*--------------- all but last block: aligned reads and different mixing */ while (length > 12) { a += k[0] + (((uint32_t)k[1])<<16); b += k[2] + (((uint32_t)k[3])<<16); c += k[4] + (((uint32_t)k[5])<<16); mix(a,b,c); length -= 12; k += 6; } /*----------------------------- handle the last (probably partial) block */ k8 = (const uint8_t *)k; switch(length) { case 12: c+=k[4]+(((uint32_t)k[5])<<16); b+=k[2]+(((uint32_t)k[3])<<16); a+=k[0]+(((uint32_t)k[1])<<16); break; case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ case 10: c+=k[4]; b+=k[2]+(((uint32_t)k[3])<<16); a+=k[0]+(((uint32_t)k[1])<<16); break; case 9 : c+=k8[8]; /* fall through */ case 8 : b+=k[2]+(((uint32_t)k[3])<<16); a+=k[0]+(((uint32_t)k[1])<<16); break; case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ case 6 : b+=k[2]; a+=k[0]+(((uint32_t)k[1])<<16); break; case 5 : b+=k8[4]; /* fall through */ case 4 : a+=k[0]+(((uint32_t)k[1])<<16); break; case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ case 2 : a+=k[0]; break; case 1 : a+=k8[0]; break; case 0 : return c; /* zero length requires no mixing */ } } else { /* need to read the key one byte at a time */ const uint8_t *k = (const uint8_t *)key; /*--------------- all but the last block: affect some 32 bits of (a,b,c) */ while (length > 12) { a += k[0]; a += ((uint32_t)k[1])<<8; a += ((uint32_t)k[2])<<16; a += ((uint32_t)k[3])<<24; b += k[4]; b += ((uint32_t)k[5])<<8; b += ((uint32_t)k[6])<<16; b += ((uint32_t)k[7])<<24; c += k[8]; c += ((uint32_t)k[9])<<8; c += ((uint32_t)k[10])<<16; c += ((uint32_t)k[11])<<24; mix(a,b,c); length -= 12; k += 12; } /*-------------------------------- last block: affect all 32 bits of (c) */ switch(length) /* all the case statements fall through */ { case 12: c+=((uint32_t)k[11])<<24; case 11: c+=((uint32_t)k[10])<<16; case 10: c+=((uint32_t)k[9])<<8; case 9 : c+=k[8]; case 8 : b+=((uint32_t)k[7])<<24; case 7 : b+=((uint32_t)k[6])<<16; case 6 : b+=((uint32_t)k[5])<<8; case 5 : b+=k[4]; case 4 : a+=((uint32_t)k[3])<<24; case 3 : a+=((uint32_t)k[2])<<16; case 2 : a+=((uint32_t)k[1])<<8; case 1 : a+=k[0]; break; case 0 : return c; } } final(a,b,c); return c; } /* * hashlittle2: return 2 32-bit hash values * * This is identical to hashlittle(), except it returns two 32-bit hash * values instead of just one. This is good enough for hash table * lookup with 2^^64 buckets, or if you want a second hash if you're not * happy with the first, or if you want a probably-unique 64-bit ID for * the key. *pc is better mixed than *pb, so use *pc first. If you want * a 64-bit value do something like "*pc + (((uint64_t)*pb)<<32)". */ void hashlittle2( const void *key, /* the key to hash */ size_t length, /* length of the key */ uint32_t *pc, /* IN: primary initval, OUT: primary hash */ uint32_t *pb) /* IN: secondary initval, OUT: secondary hash */ { uint32_t a,b,c; /* internal state */ union { const void *ptr; size_t i; } u; /* needed for Mac Powerbook G4 */ /* Set up the internal state */ a = b = c = 0xdeadbeef + ((uint32_t)length) + *pc; c += *pb; u.ptr = key; if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) { const uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */ /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ while (length > 12) { a += k[0]; b += k[1]; c += k[2]; mix(a,b,c); length -= 12; k += 3; } /*----------------------------- handle the last (probably partial) block */ /* * "k[2]&0xffffff" actually reads beyond the end of the string, but * then masks off the part it's not allowed to read. Because the * string is aligned, the masked-off tail is in the same word as the * rest of the string. Every machine with memory protection I've seen * does it on word boundaries, so is OK with this. But VALGRIND will * still catch it and complain. The masking trick does make the hash * noticably faster for short strings (like English words). */ #ifndef VALGRIND switch(length) { case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break; case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break; case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break; case 8 : b+=k[1]; a+=k[0]; break; case 7 : b+=k[1]&0xffffff; a+=k[0]; break; case 6 : b+=k[1]&0xffff; a+=k[0]; break; case 5 : b+=k[1]&0xff; a+=k[0]; break; case 4 : a+=k[0]; break; case 3 : a+=k[0]&0xffffff; break; case 2 : a+=k[0]&0xffff; break; case 1 : a+=k[0]&0xff; break; case 0 : *pc=c; *pb=b; return; /* zero length strings require no mixing */ } #else /* make valgrind happy */ k8 = (const uint8_t *)k; switch(length) { case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ case 10: c+=((uint32_t)k8[9])<<8; /* fall through */ case 9 : c+=k8[8]; /* fall through */ case 8 : b+=k[1]; a+=k[0]; break; case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ case 6 : b+=((uint32_t)k8[5])<<8; /* fall through */ case 5 : b+=k8[4]; /* fall through */ case 4 : a+=k[0]; break; case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ case 2 : a+=((uint32_t)k8[1])<<8; /* fall through */ case 1 : a+=k8[0]; break; case 0 : *pc=c; *pb=b; return; /* zero length strings require no mixing */ } #endif /* !valgrind */ } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) { const uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */ const uint8_t *k8; /*--------------- all but last block: aligned reads and different mixing */ while (length > 12) { a += k[0] + (((uint32_t)k[1])<<16); b += k[2] + (((uint32_t)k[3])<<16); c += k[4] + (((uint32_t)k[5])<<16); mix(a,b,c); length -= 12; k += 6; } /*----------------------------- handle the last (probably partial) block */ k8 = (const uint8_t *)k; switch(length) { case 12: c+=k[4]+(((uint32_t)k[5])<<16); b+=k[2]+(((uint32_t)k[3])<<16); a+=k[0]+(((uint32_t)k[1])<<16); break; case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ case 10: c+=k[4]; b+=k[2]+(((uint32_t)k[3])<<16); a+=k[0]+(((uint32_t)k[1])<<16); break; case 9 : c+=k8[8]; /* fall through */ case 8 : b+=k[2]+(((uint32_t)k[3])<<16); a+=k[0]+(((uint32_t)k[1])<<16); break; case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ case 6 : b+=k[2]; a+=k[0]+(((uint32_t)k[1])<<16); break; case 5 : b+=k8[4]; /* fall through */ case 4 : a+=k[0]+(((uint32_t)k[1])<<16); break; case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ case 2 : a+=k[0]; break; case 1 : a+=k8[0]; break; case 0 : *pc=c; *pb=b; return; /* zero length strings require no mixing */ } } else { /* need to read the key one byte at a time */ const uint8_t *k = (const uint8_t *)key; /*--------------- all but the last block: affect some 32 bits of (a,b,c) */ while (length > 12) { a += k[0]; a += ((uint32_t)k[1])<<8; a += ((uint32_t)k[2])<<16; a += ((uint32_t)k[3])<<24; b += k[4]; b += ((uint32_t)k[5])<<8; b += ((uint32_t)k[6])<<16; b += ((uint32_t)k[7])<<24; c += k[8]; c += ((uint32_t)k[9])<<8; c += ((uint32_t)k[10])<<16; c += ((uint32_t)k[11])<<24; mix(a,b,c); length -= 12; k += 12; } /*-------------------------------- last block: affect all 32 bits of (c) */ switch(length) /* all the case statements fall through */ { case 12: c+=((uint32_t)k[11])<<24; case 11: c+=((uint32_t)k[10])<<16; case 10: c+=((uint32_t)k[9])<<8; case 9 : c+=k[8]; case 8 : b+=((uint32_t)k[7])<<24; case 7 : b+=((uint32_t)k[6])<<16; case 6 : b+=((uint32_t)k[5])<<8; case 5 : b+=k[4]; case 4 : a+=((uint32_t)k[3])<<24; case 3 : a+=((uint32_t)k[2])<<16; case 2 : a+=((uint32_t)k[1])<<8; case 1 : a+=k[0]; break; case 0 : *pc=c; *pb=b; return; /* zero length strings require no mixing */ } } final(a,b,c); *pc=c; *pb=b; } /* * hashbig(): * This is the same as hashword() on big-endian machines. It is different * from hashlittle() on all machines. hashbig() takes advantage of * big-endian byte ordering. */ uint32_t hashbig( const void *key, size_t length, uint32_t initval) { uint32_t a,b,c; union { const void *ptr; size_t i; } u; /* to cast key to (size_t) happily */ /* Set up the internal state */ a = b = c = 0xdeadbeef + ((uint32_t)length) + initval; u.ptr = key; if (HASH_BIG_ENDIAN && ((u.i & 0x3) == 0)) { const uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */ /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ while (length > 12) { a += k[0]; b += k[1]; c += k[2]; mix(a,b,c); length -= 12; k += 3; } /*----------------------------- handle the last (probably partial) block */ /* * "k[2]<<8" actually reads beyond the end of the string, but * then shifts out the part it's not allowed to read. Because the * string is aligned, the illegal read is in the same word as the * rest of the string. Every machine with memory protection I've seen * does it on word boundaries, so is OK with this. But VALGRIND will * still catch it and complain. The masking trick does make the hash * noticably faster for short strings (like English words). */ #ifndef VALGRIND switch(length) { case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; case 11: c+=k[2]&0xffffff00; b+=k[1]; a+=k[0]; break; case 10: c+=k[2]&0xffff0000; b+=k[1]; a+=k[0]; break; case 9 : c+=k[2]&0xff000000; b+=k[1]; a+=k[0]; break; case 8 : b+=k[1]; a+=k[0]; break; case 7 : b+=k[1]&0xffffff00; a+=k[0]; break; case 6 : b+=k[1]&0xffff0000; a+=k[0]; break; case 5 : b+=k[1]&0xff000000; a+=k[0]; break; case 4 : a+=k[0]; break; case 3 : a+=k[0]&0xffffff00; break; case 2 : a+=k[0]&0xffff0000; break; case 1 : a+=k[0]&0xff000000; break; case 0 : return c; /* zero length strings require no mixing */ } #else /* make valgrind happy */ k8 = (const uint8_t *)k; switch(length) /* all the case statements fall through */ { case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; case 11: c+=((uint32_t)k8[10])<<8; /* fall through */ case 10: c+=((uint32_t)k8[9])<<16; /* fall through */ case 9 : c+=((uint32_t)k8[8])<<24; /* fall through */ case 8 : b+=k[1]; a+=k[0]; break; case 7 : b+=((uint32_t)k8[6])<<8; /* fall through */ case 6 : b+=((uint32_t)k8[5])<<16; /* fall through */ case 5 : b+=((uint32_t)k8[4])<<24; /* fall through */ case 4 : a+=k[0]; break; case 3 : a+=((uint32_t)k8[2])<<8; /* fall through */ case 2 : a+=((uint32_t)k8[1])<<16; /* fall through */ case 1 : a+=((uint32_t)k8[0])<<24; break; case 0 : return c; } #endif /* !VALGRIND */ } else { /* need to read the key one byte at a time */ const uint8_t *k = (const uint8_t *)key; /*--------------- all but the last block: affect some 32 bits of (a,b,c) */ while (length > 12) { a += ((uint32_t)k[0])<<24; a += ((uint32_t)k[1])<<16; a += ((uint32_t)k[2])<<8; a += ((uint32_t)k[3]); b += ((uint32_t)k[4])<<24; b += ((uint32_t)k[5])<<16; b += ((uint32_t)k[6])<<8; b += ((uint32_t)k[7]); c += ((uint32_t)k[8])<<24; c += ((uint32_t)k[9])<<16; c += ((uint32_t)k[10])<<8; c += ((uint32_t)k[11]); mix(a,b,c); length -= 12; k += 12; } /*-------------------------------- last block: affect all 32 bits of (c) */ switch(length) /* all the case statements fall through */ { case 12: c+=k[11]; case 11: c+=((uint32_t)k[10])<<8; case 10: c+=((uint32_t)k[9])<<16; case 9 : c+=((uint32_t)k[8])<<24; case 8 : b+=k[7]; case 7 : b+=((uint32_t)k[6])<<8; case 6 : b+=((uint32_t)k[5])<<16; case 5 : b+=((uint32_t)k[4])<<24; case 4 : a+=k[3]; case 3 : a+=((uint32_t)k[2])<<8; case 2 : a+=((uint32_t)k[1])<<16; case 1 : a+=((uint32_t)k[0])<<24; break; case 0 : return c; } } final(a,b,c); return c; } #ifdef SELF_TEST /* used for timings */ void driver1() { uint8_t buf[256]; uint32_t i; uint32_t h=0; time_t a,z; time(&a); for (i=0; i<256; ++i) buf[i] = 'x'; for (i=0; i<1; ++i) { h = hashlittle(&buf[0],1,h); } time(&z); if (z-a > 0) printf("time %ld %.8x\n", (long) z-a, h); } /* check that every input bit changes every output bit half the time */ #define HASHSTATE 1 #define HASHLEN 1 #define MAXPAIR 60 #define MAXLEN 70 void driver2() { uint8_t qa[MAXLEN+1], qb[MAXLEN+2], *a = &qa[0], *b = &qb[1]; uint32_t c[HASHSTATE], d[HASHSTATE], i=0, j=0, k, l, m=0, z; uint32_t e[HASHSTATE],f[HASHSTATE],g[HASHSTATE],h[HASHSTATE]; uint32_t x[HASHSTATE],y[HASHSTATE]; uint32_t hlen; printf("No more than %d trials should ever be needed \n",MAXPAIR/2); for (hlen=0; hlen < MAXLEN; ++hlen) { z=0; for (i=0; i>(8-j)); c[0] = hashlittle(a, hlen, m); b[i] ^= ((k+1)<>(8-j)); d[0] = hashlittle(b, hlen, m); /* check every bit is 1, 0, set, and not set at least once */ for (l=0; lz) z=k; if (k==MAXPAIR) { printf("Some bit didn't change: "); printf("%.8x %.8x %.8x %.8x %.8x %.8x ", e[0],f[0],g[0],h[0],x[0],y[0]); printf("i %d j %d m %d len %d\n", i, j, m, hlen); } if (z==MAXPAIR) goto done; } } } done: if (z < MAXPAIR) { printf("Mix success %2d bytes %2d initvals ",i,m); printf("required %d trials\n", z/2); } } printf("\n"); } /* Check for reading beyond the end of the buffer and alignment problems */ void driver3() { uint8_t buf[MAXLEN+20], *b; uint32_t len; uint8_t q[] = "This is the time for all good men to come to the aid of their country..."; uint32_t h; uint8_t qq[] = "xThis is the time for all good men to come to the aid of their country..."; uint32_t i; uint8_t qqq[] = "xxThis is the time for all good men to come to the aid of their country..."; uint32_t j; uint8_t qqqq[] = "xxxThis is the time for all good men to come to the aid of their country..."; uint32_t ref,x,y; uint8_t *p; printf("Endianness. These lines should all be the same (for values filled in):\n"); printf("%.8x %.8x %.8x\n", hashword((const uint32_t *)q, (sizeof(q)-1)/4, 13), hashword((const uint32_t *)q, (sizeof(q)-5)/4, 13), hashword((const uint32_t *)q, (sizeof(q)-9)/4, 13)); p = q; printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n", hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13), hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13), hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13), hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13), hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13), hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13)); p = &qq[1]; printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n", hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13), hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13), hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13), hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13), hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13), hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13)); p = &qqq[2]; printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n", hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13), hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13), hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13), hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13), hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13), hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13)); p = &qqqq[3]; printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n", hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13), hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13), hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13), hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13), hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13), hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13)); printf("\n"); /* check that hashlittle2 and hashlittle produce the same results */ i=47; j=0; hashlittle2(q, sizeof(q), &i, &j); if (hashlittle(q, sizeof(q), 47) != i) printf("hashlittle2 and hashlittle mismatch\n"); /* check that hashword2 and hashword produce the same results */ len = 0xdeadbeef; i=47, j=0; hashword2(&len, 1, &i, &j); if (hashword(&len, 1, 47) != i) printf("hashword2 and hashword mismatch %x %x\n", i, hashword(&len, 1, 47)); /* check hashlittle doesn't read before or after the ends of the string */ for (h=0, b=buf+1; h<8; ++h, ++b) { for (i=0; i #include /* * My best guess at if you are big-endian or little-endian. This may * need adjustment. */ #if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && \ __BYTE_ORDER == __LITTLE_ENDIAN) || \ (defined(i386) || defined(__i386__) || defined(__i486__) || \ defined(__i586__) || defined(__i686__) || defined(vax) || defined(MIPSEL)) # define HASH_LITTLE_ENDIAN 1 # define HASH_BIG_ENDIAN 0 #elif (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && \ __BYTE_ORDER == __BIG_ENDIAN) || \ (defined(sparc) || defined(POWERPC) || defined(mc68000) || defined(sel)) # define HASH_LITTLE_ENDIAN 0 # define HASH_BIG_ENDIAN 1 #else # define HASH_LITTLE_ENDIAN 0 # define HASH_BIG_ENDIAN 0 #endif #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k)))) /* * */ #if (defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ >= 610) # define MAYBESTATIC #else # define MAYBESTATIC static inline #endif /* ------------------------------------------------------------------------------- mix -- mix 3 32-bit values reversibly. This is reversible, so any information in (a,b,c) before mix() is still in (a,b,c) after mix(). If four pairs of (a,b,c) inputs are run through mix(), or through mix() in reverse, there are at least 32 bits of the output that are sometimes the same for one pair and different for another pair. This was tested for: * pairs that differed by one bit, by two bits, in any combination of top bits of (a,b,c), or in any combination of bottom bits of (a,b,c). * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed the output delta to a Gray code (a^(a>>1)) so a string of 1's (as is commonly produced by subtraction) look like a single 1-bit difference. * the base values were pseudorandom, all zero but one bit set, or all zero plus a counter that starts at zero. Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that satisfy this are 4 6 8 16 19 4 9 15 3 18 27 15 14 9 3 7 17 3 Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing for "differ" defined as + with a one-bit base and a two-bit delta. I used http://burtleburtle.net/bob/hash/avalanche.html to choose the operations, constants, and arrangements of the variables. This does not achieve avalanche. There are input bits of (a,b,c) that fail to affect some output bits of (a,b,c), especially of a. The most thoroughly mixed value is c, but it doesn't really even achieve avalanche in c. This allows some parallelism. Read-after-writes are good at doubling the number of bits affected, so the goal of mixing pulls in the opposite direction as the goal of parallelism. I did what I could. Rotates seem to cost as much as shifts on every machine I could lay my hands on, and rotates are much kinder to the top and bottom bits, so I used rotates. ------------------------------------------------------------------------------- */ #define mix(a,b,c) \ { \ a -= c; a ^= rot(c, 4); c += b; \ b -= a; b ^= rot(a, 6); a += c; \ c -= b; c ^= rot(b, 8); b += a; \ a -= c; a ^= rot(c,16); c += b; \ b -= a; b ^= rot(a,19); a += c; \ c -= b; c ^= rot(b, 4); b += a; \ } /* ------------------------------------------------------------------------------- final -- final mixing of 3 32-bit values (a,b,c) into c Pairs of (a,b,c) values differing in only a few bits will usually produce values of c that look totally different. This was tested for * pairs that differed by one bit, by two bits, in any combination of top bits of (a,b,c), or in any combination of bottom bits of (a,b,c). * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed the output delta to a Gray code (a^(a>>1)) so a string of 1's (as is commonly produced by subtraction) look like a single 1-bit difference. * the base values were pseudorandom, all zero but one bit set, or all zero plus a counter that starts at zero. These constants passed: 14 11 25 16 4 14 24 12 14 25 16 4 14 24 and these came close: 4 8 15 26 3 22 24 10 8 15 26 3 22 24 11 8 15 26 3 22 24 ------------------------------------------------------------------------------- */ #define final(a,b,c) \ { \ c ^= b; c -= rot(b,14); \ a ^= c; a -= rot(c,11); \ b ^= a; b -= rot(a,25); \ c ^= b; c -= rot(b,16); \ a ^= c; a -= rot(c,4); \ b ^= a; b -= rot(a,14); \ c ^= b; c -= rot(b,24); \ } uint32_t _jenkins_hashword(const uint32_t *k, size_t length, uint32_t initval); uint32_t _jenkins_hashlittle(const void *key, size_t length, uint32_t initval); void _jenkins_hashword2(const uint32_t *key, size_t length, uint32_t *pc, uint32_t *pb); void _jenkins_hashlittle2(const void *key, size_t length, uint32_t *pc, uint32_t *pb); MAYBESTATIC void _jenkins_little2_begin(const uint32_t *pc, const uint32_t *pb, uint32_t st[3]) { uint32_t a,b,c; /* Set up the internal state */ a = b = c = 0xdeadbeef + *pc; c += *pb; st[0] = a; st[1] = b; st[2] = c; } MAYBESTATIC size_t _jenkins_little2_frag(const void *key, size_t length, uint32_t st[4], size_t offset) { const uint8_t *k = key; size_t i; for (i = 0; i < length && offset < 12; i++, offset++) { st[offset >> 2] += k[i] << (8 * (offset & 3)); } if (offset == 12) { uint32_t a = st[0], b = st[1], c = st[2]; mix(a,b,c); st[0] = a; st[1] = b; st[2] = c; } return offset; } MAYBESTATIC size_t _jenkins_little2_step(const void *key, size_t length, uint32_t st[3]) { uint32_t a = st[0], b = st[1], c = st[2]; /* internal state */ union { const void *ptr; size_t i; } u; /* needed for Mac Powerbook G4 */ u.ptr = key; if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) { const uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */ /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ while (length >= 12) { a += k[0]; b += k[1]; c += k[2]; mix(a,b,c); length -= 12; k += 3; } /*----------------------------- handle the last (probably partial) block */ /* * "k[2]&0xffffff" actually reads beyond the end of the string, but * then masks off the part it's not allowed to read. Because the * string is aligned, the masked-off tail is in the same word as the * rest of the string. Every machine with memory protection I've seen * does it on word boundaries, so is OK with this. But VALGRIND will * still catch it and complain. The masking trick does make the hash * noticably faster for short strings (like English words). */ #ifndef VALGRIND switch(length) { case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break; case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break; case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break; case 8 : b+=k[1]; a+=k[0]; break; case 7 : b+=k[1]&0xffffff; a+=k[0]; break; case 6 : b+=k[1]&0xffff; a+=k[0]; break; case 5 : b+=k[1]&0xff; a+=k[0]; break; case 4 : a+=k[0]; break; case 3 : a+=k[0]&0xffffff; break; case 2 : a+=k[0]&0xffff; break; case 1 : a+=k[0]&0xff; break; } #else /* make valgrind happy */ k8 = (const uint8_t *)k; switch(length) { case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ case 10: c+=((uint32_t)k8[9])<<8; /* fall through */ case 9 : c+=k8[8]; /* fall through */ case 8 : b+=k[1]; a+=k[0]; break; case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ case 6 : b+=((uint32_t)k8[5])<<8; /* fall through */ case 5 : b+=k8[4]; /* fall through */ case 4 : a+=k[0]; break; case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ case 2 : a+=((uint32_t)k8[1])<<8; /* fall through */ case 1 : a+=k8[0]; break; } #endif /* !valgrind */ } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) { const uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */ const uint8_t *k8; /*--------------- all but last block: aligned reads and different mixing */ while (length >= 12) { a += k[0] + (((uint32_t)k[1])<<16); b += k[2] + (((uint32_t)k[3])<<16); c += k[4] + (((uint32_t)k[5])<<16); mix(a,b,c); length -= 12; k += 6; } /*----------------------------- handle the last (probably partial) block */ k8 = (const uint8_t *)k; switch(length) { case 12: c+=k[4]+(((uint32_t)k[5])<<16); b+=k[2]+(((uint32_t)k[3])<<16); a+=k[0]+(((uint32_t)k[1])<<16); break; case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ case 10: c+=k[4]; b+=k[2]+(((uint32_t)k[3])<<16); a+=k[0]+(((uint32_t)k[1])<<16); break; case 9 : c+=k8[8]; /* fall through */ case 8 : b+=k[2]+(((uint32_t)k[3])<<16); a+=k[0]+(((uint32_t)k[1])<<16); break; case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ case 6 : b+=k[2]; a+=k[0]+(((uint32_t)k[1])<<16); break; case 5 : b+=k8[4]; /* fall through */ case 4 : a+=k[0]+(((uint32_t)k[1])<<16); break; case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ case 2 : a+=k[0]; break; case 1 : a+=k8[0]; break; } } else { /* need to read the key one byte at a time */ const uint8_t *k = (const uint8_t *)key; /*--------------- all but the last block: affect some 32 bits of (a,b,c) */ while (length >= 12) { a += k[0]; a += ((uint32_t)k[1])<<8; a += ((uint32_t)k[2])<<16; a += ((uint32_t)k[3])<<24; b += k[4]; b += ((uint32_t)k[5])<<8; b += ((uint32_t)k[6])<<16; b += ((uint32_t)k[7])<<24; c += k[8]; c += ((uint32_t)k[9])<<8; c += ((uint32_t)k[10])<<16; c += ((uint32_t)k[11])<<24; mix(a,b,c); length -= 12; k += 12; } /*-------------------------------- last block: affect all 32 bits of (c) */ switch(length) /* all the case statements fall through */ { case 12: c+=((uint32_t)k[11])<<24; case 11: c+=((uint32_t)k[10])<<16; case 10: c+=((uint32_t)k[9])<<8; case 9 : c+=k[8]; case 8 : b+=((uint32_t)k[7])<<24; case 7 : b+=((uint32_t)k[6])<<16; case 6 : b+=((uint32_t)k[5])<<8; case 5 : b+=k[4]; case 4 : a+=((uint32_t)k[3])<<24; case 3 : a+=((uint32_t)k[2])<<16; case 2 : a+=((uint32_t)k[1])<<8; case 1 : a+=k[0]; break; } } st[0] = a; st[1] = b; st[2] = c; return length; } MAYBESTATIC void _jenkins_little2_end(int nonempty, uint32_t *pc, uint32_t *pb, const uint32_t st[3]) { uint32_t a = st[0], b = st[1], c = st[2]; if (nonempty) final(a,b,c); *pc=c; *pb=b; } #endif /* _lookup3_h */ bloomfilter-2.0.1.2/examples/0000755000000000000000000000000007346545000014177 5ustar0000000000000000bloomfilter-2.0.1.2/examples/Makefile0000644000000000000000000000035607346545000015643 0ustar0000000000000000hc := ghc hcflags := --make -O2 -fvia-C -optc-O2 examples := words spellchecker all: $(examples) words: Words.hs $(hc) $(hcflags) -o $@ $^ spellchecker: SpellChecker.hs $(hc) $(hcflags) -o $@ $^ clean: -rm -f *.hi *.o $(examples) bloomfilter-2.0.1.2/examples/SpellChecker.hs0000644000000000000000000000030207346545000017072 0ustar0000000000000000import qualified Data.BloomFilter.Easy as BFE main = do filt <- (BFE.easyList 0.01 . words) `fmap` readFile "/usr/share/dict/words" interact (unlines . filter (`BFE.notElem` filt) . lines) bloomfilter-2.0.1.2/examples/Words.hs0000644000000000000000000000272207346545000015634 0ustar0000000000000000-- This program is intended for performance analysis. It simply -- builds a Bloom filter from a list of words, one per line, and -- queries it exhaustively. module Main (main) where import Control.Monad (forM_, mapM_) import qualified Data.BloomFilter as BF import Data.BloomFilter.Hash (cheapHashes) import Data.BloomFilter.Easy (easyList, suggestSizing) import qualified Data.ByteString.Lazy.Char8 as B import Data.Time.Clock (diffUTCTime, getCurrentTime) import System.Environment (getArgs) conservative, aggressive :: Double -> [B.ByteString] -> BF.Bloom B.ByteString conservative = easyList aggressive fpr xs = let (size, numHashes) = suggestSizing (length xs) fpr k = 3 in BF.fromList (cheapHashes (numHashes - k)) (size * k) xs testFunction = conservative main = do args <- getArgs let files | null args = ["/usr/share/dict/words"] | otherwise = args forM_ files $ \file -> do a <- getCurrentTime words <- B.lines `fmap` B.readFile file putStrLn $ {-# SCC "words/length" #-} show (length words) ++ " words" b <- getCurrentTime putStrLn $ show (diffUTCTime b a) ++ "s to count words" let filt = {-# SCC "construct" #-} testFunction 0.01 words print filt c <- getCurrentTime putStrLn $ show (diffUTCTime c b) ++ "s to construct filter" {-# SCC "query" #-} mapM_ print $ filter (not . (`BF.elem` filt)) words d <- getCurrentTime putStrLn $ show (diffUTCTime d c) ++ "s to query every element" bloomfilter-2.0.1.2/tests/0000755000000000000000000000000007346545000013523 5ustar0000000000000000bloomfilter-2.0.1.2/tests/QC.hs0000644000000000000000000000472707346545000014374 0ustar0000000000000000module Main where import Control.Monad (forM_) import qualified Data.BloomFilter.Easy as B import Data.BloomFilter.Hash (Hashable(..), hash64) import qualified Data.ByteString.Char8 as SB import qualified Data.ByteString.Lazy.Char8 as LB import Data.Int (Int8, Int16, Int32, Int64) import Data.Word (Word8, Word16, Word32, Word64) import Test.Framework.Providers.QuickCheck2 (testProperty) import System.IO (BufferMode(..), hSetBuffering, stdout) import Test.Framework (Test, defaultMain) import Test.QuickCheck (Property, Testable, (==>), choose, forAll) import QCSupport (P(..)) prop_pai :: (Hashable a) => a -> a -> P -> Bool prop_pai _ xs (P q) = let bf = B.easyList q [xs] in xs `B.elem` bf tests :: [Test] tests = [ testProperty "()" $ prop_pai () , testProperty "Bool" $ prop_pai (undefined :: Bool) , testProperty "Ordering" $ prop_pai (undefined :: Ordering) , testProperty "Char" $ prop_pai (undefined :: Char) , testProperty "Int" $ prop_pai (undefined :: Int) , testProperty "Float" $ prop_pai (undefined :: Float) , testProperty "Double" $ prop_pai (undefined :: Double) , testProperty "Int8" $ prop_pai (undefined :: Int8) , testProperty "Int16" $ prop_pai (undefined :: Int16) , testProperty "Int32" $ prop_pai (undefined :: Int32) , testProperty "Int64" $ prop_pai (undefined :: Int64) , testProperty "Word8" $ prop_pai (undefined :: Word8) , testProperty "Word16" $ prop_pai (undefined :: Word16) , testProperty "Word32" $ prop_pai (undefined :: Word32) , testProperty "Word64" $ prop_pai (undefined :: Word64) , testProperty "String" $ prop_pai (undefined :: String) , testProperty "LB.ByteString" $ prop_pai (undefined :: LB.ByteString) , testProperty "prop_rechunked_eq" prop_rechunked_eq ] rechunk :: Int64 -> LB.ByteString -> LB.ByteString rechunk k xs | k <= 0 = xs | otherwise = LB.fromChunks (go xs) where go s | LB.null s = [] | otherwise = let (pre,suf) = LB.splitAt k s in repack pre : go suf repack = SB.concat . LB.toChunks -- Ensure that a property over a lazy ByteString holds if we change -- the chunk boundaries. prop_rechunked :: Eq a => (LB.ByteString -> a) -> LB.ByteString -> Property prop_rechunked f s = let l = LB.length s in l > 0 ==> forAll (choose (1,l-1)) $ \k -> let n = k `mod` l in n > 0 ==> f s == f (rechunk n s) prop_rechunked_eq :: LB.ByteString -> Property prop_rechunked_eq = prop_rechunked hash64 main :: IO () main = defaultMain tests bloomfilter-2.0.1.2/tests/QCSupport.hs0000644000000000000000000000153107346545000015757 0ustar0000000000000000{-# LANGUAGE CPP, GeneralizedNewtypeDeriving #-} {-# OPTIONS_GHC -fno-warn-orphans #-} module QCSupport ( P(..) ) where import Data.Int (Int8, Int16, Int32, Int64) import Data.Word (Word8, Word16, Word32, Word64) import qualified Data.ByteString.Char8 as SB import qualified Data.ByteString.Lazy.Char8 as LB import System.Random (Random(..), RandomGen) import Test.QuickCheck newtype P = P { unP :: Double } deriving (Eq, Ord, Show, Fractional, Num, Random) instance Arbitrary P where arbitrary = choose (epsilon, 1 - epsilon) where epsilon = 1e-6 :: P instance Arbitrary LB.ByteString where arbitrary = sized $ \n -> resize (round (sqrt (toEnum n :: Double))) ((LB.fromChunks . filter (not . SB.null)) `fmap` arbitrary) instance Arbitrary SB.ByteString where arbitrary = SB.pack `fmap` arbitrary