-- | A fast, space efficient Bloom filter implementation.  A Bloom filter is a
-- set-like data structure that provides a probabilistic membership test.
--
-- * Queries do not give false negatives. When an element is added to a filter,
--   a subsequent membership test will definitely return 'True'.
--
-- * False positives /are/ possible. If an element has not been added to a
--   filter, a membership test /may/ nevertheless indicate that the element is
--   present.
--
module Data.BloomFilter.Blocked (
    -- * Overview
    -- $overview

    -- * Types
    Hash,
    Salt,
    Hashable,

    -- * Immutable Bloom filters
    Bloom,

    -- ** Creation
    create,
    unfold,
    fromList,

    -- ** (De)Serialisation
    formatVersion,
    serialise,
    deserialise,

    -- ** Sizes
    NumEntries,
    BloomSize (..),
    FPR,
    sizeForFPR,
    BitsPerEntry,
    sizeForBits,
    sizeForPolicy,
    BloomPolicy (..),
    policyFPR,
    policyForFPR,
    policyForBits,

    -- ** Accessors
    size,
    elem,
    notElem,
    (?),

    -- * Mutable Bloom filters
    MBloom,
    new,
    maxSizeBits,
    insert,
    insertMany,
    read,

    -- ** Conversion
    freeze,
    thaw,
    unsafeFreeze,

    -- * Low level variants
    Hashes,
    hashesWithSalt,
    insertHashes,
    elemHashes,
    readHashes,
    -- ** Prefetching
    prefetchInsert,
    prefetchElem,
) where

import           Control.Monad.Primitive (PrimMonad, PrimState, RealWorld,
                     stToPrim)
import           Control.Monad.ST (ST, runST)
import           Data.Bits ((.&.))
import           Data.Primitive.ByteArray (MutableByteArray)
import qualified Data.Primitive.PrimArray as P

import           Data.BloomFilter.Blocked.Calc (BitsPerEntry, BloomPolicy (..),
                     BloomSize (..), FPR, NumEntries, policyFPR, policyForBits,
                     policyForFPR, sizeForBits, sizeForFPR, sizeForPolicy)
import           Data.BloomFilter.Blocked.Internal hiding (deserialise)
import qualified Data.BloomFilter.Blocked.Internal as Internal
import           Data.BloomFilter.Hash

import           Prelude hiding (elem, notElem, read)

-- $setup
--
-- >>> import Text.Printf

-- $overview
--
-- Each of the functions for creating Bloom filters accepts a 'BloomSize'. The
-- size determines the number of bits that should be used for the filter. Note
-- that a filter is fixed in size; it cannot be resized after creation.
--
-- The size can be specified by asking for a target false positive rate (FPR)
-- or a number of bits per element, and the number of elements in the filter.
-- For example:
--
-- * @'sizeForFPR' 1e-3 10_000@ for a Bloom filter sized for 10,000 elements
--   with a false positive rate of 1 in 1000
--
-- * @'sizeForBits' 10 10_000@ for a Bloom filter sized for 10,000 elements
--   with 10 bits per element
--
-- Depending on the application it may be more important to target a fixed
-- amount of memory to use, or target a specific FPR.
--
-- As a very rough guide for filter sizes, here are a range of FPRs and bits
-- per element:
--
-- * FPR of 1e-1 requires approximately 4.8 bits per element
-- * FPR of 1e-2 requires approximately 9.8 bits per element
-- * FPR of 1e-3 requires approximately 15.8 bits per element
-- * FPR of 1e-4 requires approximately 22.6 bits per element
-- * FPR of 1e-5 requires approximately 30.2 bits per element
--
-- >>> fmap (printf "%0.1f" . policyBits . policyForFPR) [1e-1, 1e-2, 1e-3, 1e-4, 1e-5] :: [String]
-- ["4.8","9.8","15.8","22.6","30.2"]

-- | Create an immutable Bloom filter, using the given setup function
-- which executes in the 'ST' monad.
--
-- Example:
--
-- >>> :{
-- filter = create (sizeForBits 16 2) 4 $ \mf -> do
--  insert mf "foo"
--  insert mf "bar"
-- :}
--
-- Note that the result of the setup function is not used.
create :: BloomSize
       -> Salt
       -> (forall s. (MBloom s a -> ST s ()))  -- ^ setup function
       -> Bloom a
{-# INLINE create #-}
create :: forall a.
BloomSize -> Salt -> (forall s. MBloom s a -> ST s ()) -> Bloom a
create BloomSize
bloomsize Salt
bloomsalt forall s. MBloom s a -> ST s ()
body =
    (forall s. ST s (Bloom a)) -> Bloom a
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s (Bloom a)) -> Bloom a)
-> (forall s. ST s (Bloom a)) -> Bloom a
forall a b. (a -> b) -> a -> b
$ do
      MBloom s a
mb <- BloomSize -> Salt -> ST s (MBloom s a)
forall s a. BloomSize -> Salt -> ST s (MBloom s a)
new BloomSize
bloomsize Salt
bloomsalt
      MBloom s a -> ST s ()
forall s. MBloom s a -> ST s ()
body MBloom s a
mb
      MBloom s a -> ST s (Bloom a)
forall s a. MBloom s a -> ST s (Bloom a)
unsafeFreeze MBloom s a
mb

{-# INLINEABLE insert #-}
-- | Insert a value into a mutable Bloom filter.  Afterwards, a
-- membership query for the same value is guaranteed to return @True@.
insert :: Hashable a => MBloom s a -> a -> ST s ()
insert :: forall a s. Hashable a => MBloom s a -> a -> ST s ()
insert = \ !MBloom s a
mb !a
x -> MBloom s a -> Hashes a -> ST s ()
forall s a. MBloom s a -> Hashes a -> ST s ()
insertHashes MBloom s a
mb (Salt -> a -> Hashes a
forall a. Hashable a => Salt -> a -> Hashes a
hashesWithSalt (MBloom s a -> Salt
forall s a. MBloom s a -> Salt
mbHashSalt MBloom s a
mb) a
x)

{-# INLINE elem #-}
-- | Query an immutable Bloom filter for membership.  If the value is
-- present, return @True@.  If the value is not present, there is
-- /still/ some possibility that @True@ will be returned.
elem :: Hashable a => a -> Bloom a -> Bool
elem :: forall a. Hashable a => a -> Bloom a -> Bool
elem = \ !a
x !Bloom a
b -> Bloom a -> Hashes a -> Bool
forall a. Bloom a -> Hashes a -> Bool
elemHashes Bloom a
b (Salt -> a -> Hashes a
forall a. Hashable a => Salt -> a -> Hashes a
hashesWithSalt (Bloom a -> Salt
forall a. Bloom a -> Salt
hashSalt Bloom a
b) a
x)

-- | Same as 'elem' but with the opposite argument order:
--
-- > x `elem` bfilter
--
-- versus
--
-- > bfilter ? x
--
(?) :: Hashable a => Bloom a -> a -> Bool
? :: forall a. Hashable a => Bloom a -> a -> Bool
(?) = (a -> Bloom a -> Bool) -> Bloom a -> a -> Bool
forall a b c. (a -> b -> c) -> b -> a -> c
flip a -> Bloom a -> Bool
forall a. Hashable a => a -> Bloom a -> Bool
elem

{-# INLINE notElem #-}
-- | Query an immutable Bloom filter for non-membership.  If the value
-- /is/ present, return @False@.  If the value is not present, there
-- is /still/ some possibility that @False@ will be returned.
notElem :: Hashable a => a -> Bloom a -> Bool
notElem :: forall a. Hashable a => a -> Bloom a -> Bool
notElem = \a
x Bloom a
b -> Bool -> Bool
not (a
x a -> Bloom a -> Bool
forall a. Hashable a => a -> Bloom a -> Bool
`elem` Bloom a
b)

-- | Query a mutable Bloom filter for membership.  If the value is
-- present, return @True@.  If the value is not present, there is
-- /still/ some possibility that @True@ will be returned.
read :: Hashable a => MBloom s a -> a -> ST s Bool
read :: forall a s. Hashable a => MBloom s a -> a -> ST s Bool
read !MBloom s a
mb !a
x = MBloom s a -> Hashes a -> ST s Bool
forall s a. MBloom s a -> Hashes a -> ST s Bool
readHashes MBloom s a
mb (Salt -> a -> Hashes a
forall a. Hashable a => Salt -> a -> Hashes a
hashesWithSalt (MBloom s a -> Salt
forall s a. MBloom s a -> Salt
mbHashSalt MBloom s a
mb) a
x)

-- | Build an immutable Bloom filter from a seed value.  The seeding
-- function populates the filter as follows.
--
--   * If it returns 'Nothing', it is finished producing values to
--     insert into the filter.
--
--   * If it returns @'Just' (a,b)@, @a@ is added to the filter and
--     @b@ is used as a new seed.
unfold :: forall a b.
          Hashable a
       => BloomSize
       -> Salt
       -> (b -> Maybe (a, b))       -- ^ seeding function
       -> b                         -- ^ initial seed
       -> Bloom a
{-# INLINE unfold #-}
unfold :: forall a b.
Hashable a =>
BloomSize -> Salt -> (b -> Maybe (a, b)) -> b -> Bloom a
unfold BloomSize
bloomsize Salt
bloomsalt b -> Maybe (a, b)
f b
k =
    BloomSize -> Salt -> (forall s. MBloom s a -> ST s ()) -> Bloom a
forall a.
BloomSize -> Salt -> (forall s. MBloom s a -> ST s ()) -> Bloom a
create BloomSize
bloomsize Salt
bloomsalt MBloom s a -> ST s ()
forall s. MBloom s a -> ST s ()
body
  where
    body :: forall s. MBloom s a -> ST s ()
    body :: forall s. MBloom s a -> ST s ()
body MBloom s a
mb = b -> ST s ()
loop b
k
      where
        loop :: b -> ST s ()
        loop :: b -> ST s ()
loop !b
j = case b -> Maybe (a, b)
f b
j of
                    Maybe (a, b)
Nothing      -> () -> ST s ()
forall a. a -> ST s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
                    Just (a
a, b
j') -> MBloom s a -> a -> ST s ()
forall a s. Hashable a => MBloom s a -> a -> ST s ()
insert MBloom s a
mb a
a ST s () -> ST s () -> ST s ()
forall a b. ST s a -> ST s b -> ST s b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> b -> ST s ()
loop b
j'

{-# INLINEABLE fromList #-}
-- | Create a Bloom filter, populating it from a sequence of values.
--
-- For example
--
-- @
-- filter = fromList (policyForBits 10) 4 [\"foo\", \"bar\", \"quux\"]
-- @
fromList :: (Foldable t, Hashable a)
         => BloomPolicy
         -> Salt
         -> t a -- ^ values to populate with
         -> Bloom a
fromList :: forall (t :: * -> *) a.
(Foldable t, Hashable a) =>
BloomPolicy -> Salt -> t a -> Bloom a
fromList BloomPolicy
policy Salt
bloomsalt t a
xs =
    BloomSize -> Salt -> (forall s. MBloom s a -> ST s ()) -> Bloom a
forall a.
BloomSize -> Salt -> (forall s. MBloom s a -> ST s ()) -> Bloom a
create BloomSize
bsize Salt
bloomsalt (\MBloom s a
b -> (a -> ST s ()) -> t a -> ST s ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ (MBloom s a -> a -> ST s ()
forall a s. Hashable a => MBloom s a -> a -> ST s ()
insert MBloom s a
b) t a
xs)
  where
    bsize :: BloomSize
bsize = BloomPolicy -> NumEntries -> BloomSize
sizeForPolicy BloomPolicy
policy (t a -> NumEntries
forall a. t a -> NumEntries
forall (t :: * -> *) a. Foldable t => t a -> NumEntries
length t a
xs)

{-# SPECIALISE deserialise ::
     BloomSize
  -> Salt
  -> (MutableByteArray RealWorld -> Int -> Int -> IO ())
  -> IO (Bloom a) #-}
deserialise :: PrimMonad m
            => BloomSize
            -> Salt
            -> (MutableByteArray (PrimState m) -> Int -> Int -> m ())
            -> m (Bloom a)
deserialise :: forall (m :: * -> *) a.
PrimMonad m =>
BloomSize
-> Salt
-> (MutableByteArray (PrimState m)
    -> NumEntries -> NumEntries -> m ())
-> m (Bloom a)
deserialise BloomSize
bloomsize Salt
bloomsalt MutableByteArray (PrimState m) -> NumEntries -> NumEntries -> m ()
fill = do
    MBloom (PrimState m) a
mbloom <- ST (PrimState m) (MBloom (PrimState m) a)
-> m (MBloom (PrimState m) a)
forall (m :: * -> *) a. PrimMonad m => ST (PrimState m) a -> m a
stToPrim (ST (PrimState m) (MBloom (PrimState m) a)
 -> m (MBloom (PrimState m) a))
-> ST (PrimState m) (MBloom (PrimState m) a)
-> m (MBloom (PrimState m) a)
forall a b. (a -> b) -> a -> b
$ BloomSize -> Salt -> ST (PrimState m) (MBloom (PrimState m) a)
forall s a. BloomSize -> Salt -> ST s (MBloom s a)
new BloomSize
bloomsize Salt
bloomsalt
    MBloom (PrimState m) a
-> (MutableByteArray (PrimState m)
    -> NumEntries -> NumEntries -> m ())
-> m ()
forall (m :: * -> *) a.
PrimMonad m =>
MBloom (PrimState m) a
-> (MutableByteArray (PrimState m)
    -> NumEntries -> NumEntries -> m ())
-> m ()
Internal.deserialise MBloom (PrimState m) a
mbloom MutableByteArray (PrimState m) -> NumEntries -> NumEntries -> m ()
fill
    ST (PrimState m) (Bloom a) -> m (Bloom a)
forall (m :: * -> *) a. PrimMonad m => ST (PrimState m) a -> m a
stToPrim (ST (PrimState m) (Bloom a) -> m (Bloom a))
-> ST (PrimState m) (Bloom a) -> m (Bloom a)
forall a b. (a -> b) -> a -> b
$ MBloom (PrimState m) a -> ST (PrimState m) (Bloom a)
forall s a. MBloom s a -> ST s (Bloom a)
unsafeFreeze MBloom (PrimState m) a
mbloom


-----------------------------------------------------------
-- Bulk insert
--

{-# INLINABLE insertMany #-}
-- | A bulk insert of many elements.
--
-- This is somewhat faster than repeated insertion using 'insert'. It uses
-- memory prefetching to improve the utilisation of memory bandwidth. This has
-- greatest benefit for large filters (that do not fit in L3 cache) and for
-- inserting many elements, e.g. > 10.
--
-- To get best performance, you probably want to specialise this function to
-- the 'Hashable' instance and to the lookup action. It is marked @INLINABLE@
-- to help with this.
--
insertMany ::
     forall a s.
     Hashable a
  => MBloom s a
  -> (Int -> ST s a) -- ^ Action to lookup elements, indexed @0..n-1@
  -> Int             -- ^ @n@, number of elements to insert
  -> ST s ()
insertMany :: forall a s.
Hashable a =>
MBloom s a -> (NumEntries -> ST s a) -> NumEntries -> ST s ()
insertMany MBloom s a
bloom NumEntries -> ST s a
key NumEntries
n =
    NumEntries -> ST s (MutablePrimArray (PrimState (ST s)) (Hashes a))
forall (m :: * -> *) a.
(PrimMonad m, Prim a) =>
NumEntries -> m (MutablePrimArray (PrimState m) a)
P.newPrimArray NumEntries
0x10 ST s (MutablePrimArray s (Hashes a))
-> (MutablePrimArray s (Hashes a) -> ST s ()) -> ST s ()
forall a b. ST s a -> (a -> ST s b) -> ST s b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= MutablePrimArray s (Hashes a) -> ST s ()
body
  where
    -- The general strategy is to use a rolling buffer @buf@ (of size 16). At
    -- the write end of the buffer, we prepare the probe locations and prefetch
    -- the corresponding cache line. At the read end, we do the hash insert.
    -- By having a prefetch distance of 15 between the write and read ends, we
    -- can have up to 15 memory reads in flight at once, thus improving
    -- utilisation of the memory bandwidth.
    body :: P.MutablePrimArray s (Hashes a) -> ST s ()
    body :: MutablePrimArray s (Hashes a) -> ST s ()
body !MutablePrimArray s (Hashes a)
buf = NumEntries -> NumEntries -> ST s ()
prepareProbes NumEntries
0 NumEntries
0
      where
        -- Start by filling the buffer as far as we can, either to the end of
        -- the buffer or until we run out of elements.
        prepareProbes :: Int -> Int -> ST s ()
        prepareProbes :: NumEntries -> NumEntries -> ST s ()
prepareProbes !NumEntries
i !NumEntries
i_w
          | NumEntries
i_w NumEntries -> NumEntries -> Bool
forall a. Ord a => a -> a -> Bool
< NumEntries
0x0f Bool -> Bool -> Bool
&& NumEntries
i NumEntries -> NumEntries -> Bool
forall a. Ord a => a -> a -> Bool
< NumEntries
n = do
              a
k <- NumEntries -> ST s a
key NumEntries
i
              let !kh :: Hashes a
kh = Salt -> a -> Hashes a
forall a. Hashable a => Salt -> a -> Hashes a
hashesWithSalt (MBloom s a -> Salt
forall s a. MBloom s a -> Salt
mbHashSalt MBloom s a
bloom) a
k
              MBloom s a -> Hashes a -> ST s ()
forall s a. MBloom s a -> Hashes a -> ST s ()
prefetchInsert MBloom s a
bloom Hashes a
kh
              MutablePrimArray (PrimState (ST s)) (Hashes a)
-> NumEntries -> Hashes a -> ST s ()
forall a (m :: * -> *).
(Prim a, PrimMonad m) =>
MutablePrimArray (PrimState m) a -> NumEntries -> a -> m ()
P.writePrimArray MutablePrimArray s (Hashes a)
MutablePrimArray (PrimState (ST s)) (Hashes a)
buf NumEntries
i_w Hashes a
kh
              NumEntries -> NumEntries -> ST s ()
prepareProbes (NumEntries
iNumEntries -> NumEntries -> NumEntries
forall a. Num a => a -> a -> a
+NumEntries
1) (NumEntries
i_wNumEntries -> NumEntries -> NumEntries
forall a. Num a => a -> a -> a
+NumEntries
1)

          | NumEntries
n NumEntries -> NumEntries -> Bool
forall a. Ord a => a -> a -> Bool
> NumEntries
0     = NumEntries -> NumEntries -> NumEntries -> ST s ()
insertProbe NumEntries
0 NumEntries
0 NumEntries
i_w
          | Bool
otherwise = () -> ST s ()
forall a. a -> ST s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ()

        -- Read from the read end of the buffer and do the inserts.
        insertProbe :: Int -> Int -> Int -> ST s ()
        insertProbe :: NumEntries -> NumEntries -> NumEntries -> ST s ()
insertProbe !NumEntries
i !NumEntries
i_r !NumEntries
i_w = do
            Hashes a
kh <- MutablePrimArray (PrimState (ST s)) (Hashes a)
-> NumEntries -> ST s (Hashes a)
forall a (m :: * -> *).
(Prim a, PrimMonad m) =>
MutablePrimArray (PrimState m) a -> NumEntries -> m a
P.readPrimArray MutablePrimArray s (Hashes a)
MutablePrimArray (PrimState (ST s)) (Hashes a)
buf NumEntries
i_r
            MBloom s a -> Hashes a -> ST s ()
forall s a. MBloom s a -> Hashes a -> ST s ()
insertHashes MBloom s a
bloom Hashes a
kh
            NumEntries -> NumEntries -> NumEntries -> ST s ()
nextProbe NumEntries
i NumEntries
i_r NumEntries
i_w

        -- Move on to the next entry.
        nextProbe :: Int -> Int -> Int -> ST s ()
        nextProbe :: NumEntries -> NumEntries -> NumEntries -> ST s ()
nextProbe !NumEntries
i !NumEntries
i_r !NumEntries
i_w
          -- If there are elements left, we prepare them and add them at the
          -- write end of the buffer, before inserting the next element
          -- (from the read end of the buffer).
          | NumEntries
i NumEntries -> NumEntries -> Bool
forall a. Ord a => a -> a -> Bool
< NumEntries
n = do
              a
k <- NumEntries -> ST s a
key NumEntries
i
              let !kh :: Hashes a
kh = Salt -> a -> Hashes a
forall a. Hashable a => Salt -> a -> Hashes a
hashesWithSalt (MBloom s a -> Salt
forall s a. MBloom s a -> Salt
mbHashSalt MBloom s a
bloom) a
k
              MBloom s a -> Hashes a -> ST s ()
forall s a. MBloom s a -> Hashes a -> ST s ()
prefetchInsert MBloom s a
bloom Hashes a
kh
              MutablePrimArray (PrimState (ST s)) (Hashes a)
-> NumEntries -> Hashes a -> ST s ()
forall a (m :: * -> *).
(Prim a, PrimMonad m) =>
MutablePrimArray (PrimState m) a -> NumEntries -> a -> m ()
P.writePrimArray MutablePrimArray s (Hashes a)
MutablePrimArray (PrimState (ST s)) (Hashes a)
buf NumEntries
i_w Hashes a
kh
              NumEntries -> NumEntries -> NumEntries -> ST s ()
insertProbe
                (NumEntries
iNumEntries -> NumEntries -> NumEntries
forall a. Num a => a -> a -> a
+NumEntries
1)
                ((NumEntries
i_r NumEntries -> NumEntries -> NumEntries
forall a. Num a => a -> a -> a
+ NumEntries
1) NumEntries -> NumEntries -> NumEntries
forall a. Bits a => a -> a -> a
.&. NumEntries
0x0f)
                ((NumEntries
i_w NumEntries -> NumEntries -> NumEntries
forall a. Num a => a -> a -> a
+ NumEntries
1) NumEntries -> NumEntries -> NumEntries
forall a. Bits a => a -> a -> a
.&. NumEntries
0x0f)

          -- Or if there's no more elements to add to the buffer, but the
          -- buffer is still non-empty, we just loop draining the buffer.
          | ((NumEntries
i_r NumEntries -> NumEntries -> NumEntries
forall a. Num a => a -> a -> a
+ NumEntries
1) NumEntries -> NumEntries -> NumEntries
forall a. Bits a => a -> a -> a
.&. NumEntries
0x0f) NumEntries -> NumEntries -> Bool
forall a. Eq a => a -> a -> Bool
/= NumEntries
i_w =
              NumEntries -> NumEntries -> NumEntries -> ST s ()
insertProbe
                NumEntries
i
                ((NumEntries
i_r NumEntries -> NumEntries -> NumEntries
forall a. Num a => a -> a -> a
+ NumEntries
1) NumEntries -> NumEntries -> NumEntries
forall a. Bits a => a -> a -> a
.&. NumEntries
0x0f)
                NumEntries
i_w

          -- When the buffer is empty, we're done.
          | Bool
otherwise = () -> ST s ()
forall a. a -> ST s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ()