-- |
--
-- A fast, space efficient Bloom filter implementation.  A Bloom
-- filter is a set-like data structure that provides a probabilistic
-- membership test.
--
-- * Queries do not give false negatives.  When an element is added to
--   a filter, a subsequent membership test will definitely return
--   'True'.
--
-- * False positives /are/ possible.  If an element has not been added
--   to a filter, a membership test /may/ nevertheless indicate that
--   the element is present.
--

module Data.BloomFilter.Blocked (
    -- * Types
    Hash,
    Hashable,

    -- * Immutable Bloom filters
    Bloom,

    -- ** Creation
    create,
    unfold,
    fromList,

    -- ** (De)Serialisation
    formatVersion,
    serialise,
    deserialise,

    -- ** Sizes
    NumEntries,
    BloomSize (..),
    FPR,
    sizeForFPR,
    BitsPerEntry,
    sizeForBits,
    sizeForPolicy,
    BloomPolicy (..),
    policyFPR,
    policyForFPR,
    policyForBits,

    -- ** Accessors
    size,
    elem,
    notElem,
    (?),

    -- * Mutable Bloom filters
    MBloom,
    new,
    maxSizeBits,
    insert,
    insertMany,

    -- ** Conversion
    freeze,
    thaw,
    unsafeFreeze,

    -- * Low level variants
    Hashes,
    hashes,
    insertHashes,
    elemHashes,
    -- ** Prefetching
    prefetchInsert,
    prefetchElem,
) where

import           Control.Monad.Primitive (PrimMonad, PrimState, RealWorld,
                     stToPrim)
import           Control.Monad.ST (ST, runST)
import           Data.Bits ((.&.))
import           Data.Primitive.ByteArray (MutableByteArray)
import qualified Data.Primitive.PrimArray as P

import           Data.BloomFilter.Blocked.Calc
import           Data.BloomFilter.Blocked.Internal hiding (deserialise)
import qualified Data.BloomFilter.Blocked.Internal as Internal
import           Data.BloomFilter.Hash

import           Prelude hiding (elem, notElem)

-- | Create an immutable Bloom filter, using the given setup function
-- which executes in the 'ST' monad.
--
-- Example:
--
-- @
--filter = create (sizeForBits 16 2) $ \mf -> do
--           insert mf \"foo\"
--           insert mf \"bar\"
-- @
--
-- Note that the result of the setup function is not used.
create :: BloomSize
       -> (forall s. (MBloom s a -> ST s ()))  -- ^ setup function
       -> Bloom a
{-# INLINE create #-}
create :: forall a. BloomSize -> (forall s. MBloom s a -> ST s ()) -> Bloom a
create BloomSize
bloomsize forall s. MBloom s a -> ST s ()
body =
    (forall s. ST s (Bloom a)) -> Bloom a
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s (Bloom a)) -> Bloom a)
-> (forall s. ST s (Bloom a)) -> Bloom a
forall a b. (a -> b) -> a -> b
$ do
      MBloom s a
mb <- BloomSize -> ST s (MBloom s a)
forall s a. BloomSize -> ST s (MBloom s a)
new BloomSize
bloomsize
      MBloom s a -> ST s ()
forall s. MBloom s a -> ST s ()
body MBloom s a
mb
      MBloom s a -> ST s (Bloom a)
forall s a. MBloom s a -> ST s (Bloom a)
unsafeFreeze MBloom s a
mb

{-# INLINEABLE insert #-}
-- | Insert a value into a mutable Bloom filter.  Afterwards, a
-- membership query for the same value is guaranteed to return @True@.
insert :: Hashable a => MBloom s a -> a -> ST s ()
insert :: forall a s. Hashable a => MBloom s a -> a -> ST s ()
insert = \ !MBloom s a
mb !a
x -> MBloom s a -> Hashes a -> ST s ()
forall s a. MBloom s a -> Hashes a -> ST s ()
insertHashes MBloom s a
mb (a -> Hashes a
forall a. Hashable a => a -> Hashes a
hashes a
x)

{-# INLINE elem #-}
-- | Query an immutable Bloom filter for membership.  If the value is
-- present, return @True@.  If the value is not present, there is
-- /still/ some possibility that @True@ will be returned.
elem :: Hashable a => a -> Bloom a -> Bool
elem :: forall a. Hashable a => a -> Bloom a -> Bool
elem = \ !a
x !Bloom a
b -> Bloom a -> Hashes a -> Bool
forall a. Bloom a -> Hashes a -> Bool
elemHashes Bloom a
b (a -> Hashes a
forall a. Hashable a => a -> Hashes a
hashes a
x)

-- | Same as 'elem' but with the opposite argument order:
--
-- > x `elem` bfilter
--
-- versus
--
-- > bfilter ? x
--
(?) :: Hashable a => Bloom a -> a -> Bool
? :: forall a. Hashable a => Bloom a -> a -> Bool
(?) = (a -> Bloom a -> Bool) -> Bloom a -> a -> Bool
forall a b c. (a -> b -> c) -> b -> a -> c
flip a -> Bloom a -> Bool
forall a. Hashable a => a -> Bloom a -> Bool
elem

{-# INLINE notElem #-}
-- | Query an immutable Bloom filter for non-membership.  If the value
-- /is/ present, return @False@.  If the value is not present, there
-- is /still/ some possibility that @False@ will be returned.
notElem :: Hashable a => a -> Bloom a -> Bool
notElem :: forall a. Hashable a => a -> Bloom a -> Bool
notElem = \a
x Bloom a
b -> Bool -> Bool
not (a
x a -> Bloom a -> Bool
forall a. Hashable a => a -> Bloom a -> Bool
`elem` Bloom a
b)

-- | Build an immutable Bloom filter from a seed value.  The seeding
-- function populates the filter as follows.
--
--   * If it returns 'Nothing', it is finished producing values to
--     insert into the filter.
--
--   * If it returns @'Just' (a,b)@, @a@ is added to the filter and
--     @b@ is used as a new seed.
unfold :: forall a b.
          Hashable a
       => BloomSize
       -> (b -> Maybe (a, b))       -- ^ seeding function
       -> b                         -- ^ initial seed
       -> Bloom a
{-# INLINE unfold #-}
unfold :: forall a b.
Hashable a =>
BloomSize -> (b -> Maybe (a, b)) -> b -> Bloom a
unfold BloomSize
bloomsize b -> Maybe (a, b)
f b
k =
    BloomSize -> (forall s. MBloom s a -> ST s ()) -> Bloom a
forall a. BloomSize -> (forall s. MBloom s a -> ST s ()) -> Bloom a
create BloomSize
bloomsize MBloom s a -> ST s ()
forall s. MBloom s a -> ST s ()
body
  where
    body :: forall s. MBloom s a -> ST s ()
    body :: forall s. MBloom s a -> ST s ()
body MBloom s a
mb = b -> ST s ()
loop b
k
      where
        loop :: b -> ST s ()
        loop :: b -> ST s ()
loop !b
j = case b -> Maybe (a, b)
f b
j of
                    Maybe (a, b)
Nothing      -> () -> ST s ()
forall a. a -> ST s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
                    Just (a
a, b
j') -> MBloom s a -> a -> ST s ()
forall a s. Hashable a => MBloom s a -> a -> ST s ()
insert MBloom s a
mb a
a ST s () -> ST s () -> ST s ()
forall a b. ST s a -> ST s b -> ST s b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> b -> ST s ()
loop b
j'

-- | Create a Bloom filter, populating it from a sequence of values.
--
-- For example
--
-- @
-- filt = fromList (policyForBits 10) [\"foo\", \"bar\", \"quux\"]
-- @
fromList :: (Foldable t, Hashable a)
         => BloomPolicy
         -> t a -- ^ values to populate with
         -> Bloom a
fromList :: forall (t :: * -> *) a.
(Foldable t, Hashable a) =>
BloomPolicy -> t a -> Bloom a
fromList BloomPolicy
policy t a
xs =
    BloomSize -> (forall s. MBloom s a -> ST s ()) -> Bloom a
forall a. BloomSize -> (forall s. MBloom s a -> ST s ()) -> Bloom a
create BloomSize
bsize (\MBloom s a
b -> (a -> ST s ()) -> t a -> ST s ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ (MBloom s a -> a -> ST s ()
forall a s. Hashable a => MBloom s a -> a -> ST s ()
insert MBloom s a
b) t a
xs)
  where
    bsize :: BloomSize
bsize = BloomPolicy -> NumEntries -> BloomSize
sizeForPolicy BloomPolicy
policy (t a -> NumEntries
forall a. t a -> NumEntries
forall (t :: * -> *) a. Foldable t => t a -> NumEntries
length t a
xs)

{-# SPECIALISE deserialise :: BloomSize
                           -> (MutableByteArray RealWorld -> Int -> Int -> IO ())
                           -> IO (Bloom a) #-}
deserialise :: PrimMonad m
            => BloomSize
            -> (MutableByteArray (PrimState m) -> Int -> Int -> m ())
            -> m (Bloom a)
deserialise :: forall (m :: * -> *) a.
PrimMonad m =>
BloomSize
-> (MutableByteArray (PrimState m)
    -> NumEntries -> NumEntries -> m ())
-> m (Bloom a)
deserialise BloomSize
bloomsize MutableByteArray (PrimState m) -> NumEntries -> NumEntries -> m ()
fill = do
    MBloom (PrimState m) a
mbloom <- ST (PrimState m) (MBloom (PrimState m) a)
-> m (MBloom (PrimState m) a)
forall (m :: * -> *) a. PrimMonad m => ST (PrimState m) a -> m a
stToPrim (ST (PrimState m) (MBloom (PrimState m) a)
 -> m (MBloom (PrimState m) a))
-> ST (PrimState m) (MBloom (PrimState m) a)
-> m (MBloom (PrimState m) a)
forall a b. (a -> b) -> a -> b
$ BloomSize -> ST (PrimState m) (MBloom (PrimState m) a)
forall s a. BloomSize -> ST s (MBloom s a)
new BloomSize
bloomsize
    MBloom (PrimState m) a
-> (MutableByteArray (PrimState m)
    -> NumEntries -> NumEntries -> m ())
-> m ()
forall (m :: * -> *) a.
PrimMonad m =>
MBloom (PrimState m) a
-> (MutableByteArray (PrimState m)
    -> NumEntries -> NumEntries -> m ())
-> m ()
Internal.deserialise MBloom (PrimState m) a
mbloom MutableByteArray (PrimState m) -> NumEntries -> NumEntries -> m ()
fill
    ST (PrimState m) (Bloom a) -> m (Bloom a)
forall (m :: * -> *) a. PrimMonad m => ST (PrimState m) a -> m a
stToPrim (ST (PrimState m) (Bloom a) -> m (Bloom a))
-> ST (PrimState m) (Bloom a) -> m (Bloom a)
forall a b. (a -> b) -> a -> b
$ MBloom (PrimState m) a -> ST (PrimState m) (Bloom a)
forall s a. MBloom s a -> ST s (Bloom a)
unsafeFreeze MBloom (PrimState m) a
mbloom


-----------------------------------------------------------
-- Bulk insert
--

{-# INLINABLE insertMany #-}
-- | A bulk insert of many elements.
--
-- This is somewhat faster than repeated insertion using 'insert'. It uses
-- memory prefetching to improve the utilisation of memory bandwidth. This has
-- greatest benefit for large filters (that do not fit in L3 cache) and for
-- inserting many elements, e.g. > 10.
--
-- To get best performance, you probably want to specialise this function to
-- the 'Hashable' instance and to the lookup action. It is marked @INLINABLE@
-- to help with this.
--
insertMany ::
     forall a s.
     Hashable a
  => MBloom s a
  -> (Int -> ST s a) -- ^ Action to lookup elements, indexed @0..n-1@
  -> Int             -- ^ @n@, number of elements to insert
  -> ST s ()
insertMany :: forall a s.
Hashable a =>
MBloom s a -> (NumEntries -> ST s a) -> NumEntries -> ST s ()
insertMany MBloom s a
bloom NumEntries -> ST s a
key NumEntries
n =
    NumEntries -> ST s (MutablePrimArray (PrimState (ST s)) (Hashes a))
forall (m :: * -> *) a.
(PrimMonad m, Prim a) =>
NumEntries -> m (MutablePrimArray (PrimState m) a)
P.newPrimArray NumEntries
0x10 ST s (MutablePrimArray s (Hashes a))
-> (MutablePrimArray s (Hashes a) -> ST s ()) -> ST s ()
forall a b. ST s a -> (a -> ST s b) -> ST s b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= MutablePrimArray s (Hashes a) -> ST s ()
body
  where
    -- The general strategy is to use a rolling buffer @buf@ (of size 16). At
    -- the write end of the buffer, we prepare the probe locations and prefetch
    -- the corresponding cache line. At the read end, we do the hash insert.
    -- By having a prefetch distance of 15 between the write and read ends, we
    -- can have up to 15 memory reads in flight at once, thus improving
    -- utilisation of the memory bandwidth.
    body :: P.MutablePrimArray s (Hashes a) -> ST s ()
    body :: MutablePrimArray s (Hashes a) -> ST s ()
body !MutablePrimArray s (Hashes a)
buf = NumEntries -> NumEntries -> ST s ()
prepareProbes NumEntries
0 NumEntries
0
      where
        -- Start by filling the buffer as far as we can, either to the end of
        -- the buffer or until we run out of elements.
        prepareProbes :: Int -> Int -> ST s ()
        prepareProbes :: NumEntries -> NumEntries -> ST s ()
prepareProbes !NumEntries
i !NumEntries
i_w
          | NumEntries
i_w NumEntries -> NumEntries -> Bool
forall a. Ord a => a -> a -> Bool
< NumEntries
0x0f Bool -> Bool -> Bool
&& NumEntries
i NumEntries -> NumEntries -> Bool
forall a. Ord a => a -> a -> Bool
< NumEntries
n = do
              a
k <- NumEntries -> ST s a
key NumEntries
i
              let !kh :: Hashes a
kh = a -> Hashes a
forall a. Hashable a => a -> Hashes a
hashes a
k
              MBloom s a -> Hashes a -> ST s ()
forall s a. MBloom s a -> Hashes a -> ST s ()
prefetchInsert MBloom s a
bloom Hashes a
kh
              MutablePrimArray (PrimState (ST s)) (Hashes a)
-> NumEntries -> Hashes a -> ST s ()
forall a (m :: * -> *).
(Prim a, PrimMonad m) =>
MutablePrimArray (PrimState m) a -> NumEntries -> a -> m ()
P.writePrimArray MutablePrimArray s (Hashes a)
MutablePrimArray (PrimState (ST s)) (Hashes a)
buf NumEntries
i_w Hashes a
kh
              NumEntries -> NumEntries -> ST s ()
prepareProbes (NumEntries
iNumEntries -> NumEntries -> NumEntries
forall a. Num a => a -> a -> a
+NumEntries
1) (NumEntries
i_wNumEntries -> NumEntries -> NumEntries
forall a. Num a => a -> a -> a
+NumEntries
1)

          | NumEntries
n NumEntries -> NumEntries -> Bool
forall a. Ord a => a -> a -> Bool
> NumEntries
0     = NumEntries -> NumEntries -> NumEntries -> ST s ()
insertProbe NumEntries
0 NumEntries
0 NumEntries
i_w
          | Bool
otherwise = () -> ST s ()
forall a. a -> ST s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ()

        -- Read from the read end of the buffer and do the inserts.
        insertProbe :: Int -> Int -> Int -> ST s ()
        insertProbe :: NumEntries -> NumEntries -> NumEntries -> ST s ()
insertProbe !NumEntries
i !NumEntries
i_r !NumEntries
i_w = do
            Hashes a
kh <- MutablePrimArray (PrimState (ST s)) (Hashes a)
-> NumEntries -> ST s (Hashes a)
forall a (m :: * -> *).
(Prim a, PrimMonad m) =>
MutablePrimArray (PrimState m) a -> NumEntries -> m a
P.readPrimArray MutablePrimArray s (Hashes a)
MutablePrimArray (PrimState (ST s)) (Hashes a)
buf NumEntries
i_r
            MBloom s a -> Hashes a -> ST s ()
forall s a. MBloom s a -> Hashes a -> ST s ()
insertHashes MBloom s a
bloom Hashes a
kh
            NumEntries -> NumEntries -> NumEntries -> ST s ()
nextProbe NumEntries
i NumEntries
i_r NumEntries
i_w

        -- Move on to the next entry.
        nextProbe :: Int -> Int -> Int -> ST s ()
        nextProbe :: NumEntries -> NumEntries -> NumEntries -> ST s ()
nextProbe !NumEntries
i !NumEntries
i_r !NumEntries
i_w
          -- If there are elements left, we prepare them and add them at the
          -- write end of the buffer, before inserting the next element
          -- (from the read end of the buffer).
          | NumEntries
i NumEntries -> NumEntries -> Bool
forall a. Ord a => a -> a -> Bool
< NumEntries
n = do
              a
k <- NumEntries -> ST s a
key NumEntries
i
              let !kh :: Hashes a
kh = a -> Hashes a
forall a. Hashable a => a -> Hashes a
hashes a
k
              MBloom s a -> Hashes a -> ST s ()
forall s a. MBloom s a -> Hashes a -> ST s ()
prefetchInsert MBloom s a
bloom Hashes a
kh
              MutablePrimArray (PrimState (ST s)) (Hashes a)
-> NumEntries -> Hashes a -> ST s ()
forall a (m :: * -> *).
(Prim a, PrimMonad m) =>
MutablePrimArray (PrimState m) a -> NumEntries -> a -> m ()
P.writePrimArray MutablePrimArray s (Hashes a)
MutablePrimArray (PrimState (ST s)) (Hashes a)
buf NumEntries
i_w Hashes a
kh
              NumEntries -> NumEntries -> NumEntries -> ST s ()
insertProbe
                (NumEntries
iNumEntries -> NumEntries -> NumEntries
forall a. Num a => a -> a -> a
+NumEntries
1)
                ((NumEntries
i_r NumEntries -> NumEntries -> NumEntries
forall a. Num a => a -> a -> a
+ NumEntries
1) NumEntries -> NumEntries -> NumEntries
forall a. Bits a => a -> a -> a
.&. NumEntries
0x0f)
                ((NumEntries
i_w NumEntries -> NumEntries -> NumEntries
forall a. Num a => a -> a -> a
+ NumEntries
1) NumEntries -> NumEntries -> NumEntries
forall a. Bits a => a -> a -> a
.&. NumEntries
0x0f)

          -- Or if there's no more elements to add to the buffer, but the
          -- buffer is still non-empty, we just loop draining the buffer.
          | ((NumEntries
i_r NumEntries -> NumEntries -> NumEntries
forall a. Num a => a -> a -> a
+ NumEntries
1) NumEntries -> NumEntries -> NumEntries
forall a. Bits a => a -> a -> a
.&. NumEntries
0x0f) NumEntries -> NumEntries -> Bool
forall a. Eq a => a -> a -> Bool
/= NumEntries
i_w =
              NumEntries -> NumEntries -> NumEntries -> ST s ()
insertProbe
                NumEntries
i
                ((NumEntries
i_r NumEntries -> NumEntries -> NumEntries
forall a. Num a => a -> a -> a
+ NumEntries
1) NumEntries -> NumEntries -> NumEntries
forall a. Bits a => a -> a -> a
.&. NumEntries
0x0f)
                NumEntries
i_w

          -- When the buffer is empty, we're done.
          | Bool
otherwise = () -> ST s ()
forall a. a -> ST s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ()