-- | A fast, space efficient Bloom filter implementation.  A Bloom
-- filter is a set-like data structure that provides a probabilistic
-- membership test.
--
-- * Queries do not give false negatives.  When an element is added to
--   a filter, a subsequent membership test will definitely return
--   'True'.
--
-- * False positives /are/ possible.  If an element has not been added
--   to a filter, a membership test /may/ nevertheless indicate that
--   the element is present.
--
module Data.BloomFilter.Classic (
    -- * Overview
    -- $overview

    -- ** Example: a spell checker
    -- $example

    -- ** Differences from bloomfilter package
    -- $differences

    -- * Types
    Hash,
    Salt,
    Hashable,

    -- * Immutable Bloom filters
    Bloom,

    -- ** Creation
    create,
    unfold,
    fromList,

    -- ** (De)Serialisation
    formatVersion,
    serialise,
    deserialise,

    -- ** Sizes
    NumEntries,
    BloomSize (..),
    FPR,
    sizeForFPR,
    BitsPerEntry,
    sizeForBits,
    sizeForPolicy,
    BloomPolicy (..),
    policyFPR,
    policyForFPR,
    policyForBits,

    -- ** Accessors
    size,
    elem,
    notElem,
    (?),

    -- * Mutable Bloom filters
    MBloom,
    new,
    maxSizeBits,
    insert,
    read,

    -- ** Conversion
    freeze,
    thaw,
    unsafeFreeze,

    -- * Low level variants
    Hashes,
    hashesWithSalt,
    insertHashes,
    elemHashes,
    readHashes,
) where

import           Control.Monad.Primitive (PrimMonad, PrimState, RealWorld,
                     stToPrim)
import           Control.Monad.ST (ST, runST)
import           Data.Primitive.ByteArray (MutableByteArray)

import           Data.BloomFilter.Classic.Calc
import           Data.BloomFilter.Classic.Internal hiding (deserialise)
import qualified Data.BloomFilter.Classic.Internal as Internal
import           Data.BloomFilter.Hash

import           Prelude hiding (elem, notElem, read)

-- | Create an immutable Bloom filter, using the given setup function
-- which executes in the 'ST' monad.
--
-- Example:
--
-- @
--filter = create (sizeForBits 16 2) 4 $ \mf -> do
--           insert mf \"foo\"
--           insert mf \"bar\"
-- @
--
-- Note that the result of the setup function is not used.
create :: BloomSize
       -> Salt
       -> (forall s. (MBloom s a -> ST s ()))  -- ^ setup function
       -> Bloom a
{-# INLINE create #-}
create :: forall a.
BloomSize -> Salt -> (forall s. MBloom s a -> ST s ()) -> Bloom a
create BloomSize
bloomsize Salt
bloomsalt forall s. MBloom s a -> ST s ()
body =
    (forall s. ST s (Bloom a)) -> Bloom a
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s (Bloom a)) -> Bloom a)
-> (forall s. ST s (Bloom a)) -> Bloom a
forall a b. (a -> b) -> a -> b
$ do
      MBloom s a
mb <- BloomSize -> Salt -> ST s (MBloom s a)
forall s a. BloomSize -> Salt -> ST s (MBloom s a)
new BloomSize
bloomsize Salt
bloomsalt
      MBloom s a -> ST s ()
forall s. MBloom s a -> ST s ()
body MBloom s a
mb
      MBloom s a -> ST s (Bloom a)
forall s a. MBloom s a -> ST s (Bloom a)
unsafeFreeze MBloom s a
mb

-- | Insert a value into a mutable Bloom filter.  Afterwards, a
-- membership query for the same value is guaranteed to return @True@.
insert :: Hashable a => MBloom s a -> a -> ST s ()
insert :: forall a s. Hashable a => MBloom s a -> a -> ST s ()
insert !MBloom s a
mb !a
x = MBloom s a -> Hashes a -> ST s ()
forall s a. MBloom s a -> Hashes a -> ST s ()
insertHashes MBloom s a
mb (Salt -> a -> Hashes a
forall a. Hashable a => Salt -> a -> Hashes a
hashesWithSalt (MBloom s a -> Salt
forall s a. MBloom s a -> Salt
mbHashSalt MBloom s a
mb) a
x)

-- | Query an immutable Bloom filter for membership.  If the value is
-- present, return @True@.  If the value is not present, there is
-- /still/ some possibility that @True@ will be returned.
elem :: Hashable a => a -> Bloom a -> Bool
elem :: forall a. Hashable a => a -> Bloom a -> Bool
elem = \ !a
x !Bloom a
b -> Bloom a -> Hashes a -> Bool
forall a. Bloom a -> Hashes a -> Bool
elemHashes Bloom a
b (Salt -> a -> Hashes a
forall a. Hashable a => Salt -> a -> Hashes a
hashesWithSalt (Bloom a -> Salt
forall a. Bloom a -> Salt
hashSalt Bloom a
b) a
x)

-- | Same as 'elem' but with the opposite argument order:
--
-- > x `elem` bfilter
--
-- versus
--
-- > bfilter ? x
--
(?) :: Hashable a => Bloom a -> a -> Bool
? :: forall a. Hashable a => Bloom a -> a -> Bool
(?) = (a -> Bloom a -> Bool) -> Bloom a -> a -> Bool
forall a b c. (a -> b -> c) -> b -> a -> c
flip a -> Bloom a -> Bool
forall a. Hashable a => a -> Bloom a -> Bool
elem

-- | Query an immutable Bloom filter for non-membership.  If the value
-- /is/ present, return @False@.  If the value is not present, there
-- is /still/ some possibility that @False@ will be returned.
notElem :: Hashable a => a -> Bloom a -> Bool
notElem :: forall a. Hashable a => a -> Bloom a -> Bool
notElem = \ a
x Bloom a
b -> Bool -> Bool
not (a
x a -> Bloom a -> Bool
forall a. Hashable a => a -> Bloom a -> Bool
`elem` Bloom a
b)

-- | Query a mutable Bloom filter for membership.  If the value is
-- present, return @True@.  If the value is not present, there is
-- /still/ some possibility that @True@ will be returned.
read :: Hashable a => MBloom s a -> a -> ST s Bool
read :: forall a s. Hashable a => MBloom s a -> a -> ST s Bool
read !MBloom s a
mb !a
x = MBloom s a -> Hashes a -> ST s Bool
forall s a. MBloom s a -> Hashes a -> ST s Bool
readHashes MBloom s a
mb (Salt -> a -> Hashes a
forall a. Hashable a => Salt -> a -> Hashes a
hashesWithSalt (MBloom s a -> Salt
forall s a. MBloom s a -> Salt
mbHashSalt MBloom s a
mb) a
x)

-- | Build an immutable Bloom filter from a seed value.  The seeding
-- function populates the filter as follows.
--
--   * If it returns 'Nothing', it is finished producing values to
--     insert into the filter.
--
--   * If it returns @'Just' (a,b)@, @a@ is added to the filter and
--     @b@ is used as a new seed.
unfold :: forall a b.
          Hashable a
       => BloomSize
       -> Salt
       -> (b -> Maybe (a, b))       -- ^ seeding function
       -> b                         -- ^ initial seed
       -> Bloom a
{-# INLINE unfold #-}
unfold :: forall a b.
Hashable a =>
BloomSize -> Salt -> (b -> Maybe (a, b)) -> b -> Bloom a
unfold BloomSize
bloomsize Salt
bloomsalt b -> Maybe (a, b)
f b
k =
    BloomSize -> Salt -> (forall s. MBloom s a -> ST s ()) -> Bloom a
forall a.
BloomSize -> Salt -> (forall s. MBloom s a -> ST s ()) -> Bloom a
create BloomSize
bloomsize Salt
bloomsalt MBloom s a -> ST s ()
forall s. MBloom s a -> ST s ()
body
  where
    body :: forall s. MBloom s a -> ST s ()
    body :: forall s. MBloom s a -> ST s ()
body MBloom s a
mb = b -> ST s ()
loop b
k
      where
        loop :: b -> ST s ()
        loop :: b -> ST s ()
loop !b
j = case b -> Maybe (a, b)
f b
j of
                    Maybe (a, b)
Nothing      -> () -> ST s ()
forall a. a -> ST s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
                    Just (a
a, b
j') -> MBloom s a -> a -> ST s ()
forall a s. Hashable a => MBloom s a -> a -> ST s ()
insert MBloom s a
mb a
a ST s () -> ST s () -> ST s ()
forall a b. ST s a -> ST s b -> ST s b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> b -> ST s ()
loop b
j'


{-# INLINEABLE fromList #-}
-- | Create a Bloom filter, populating it from a sequence of values.
--
-- For example
--
-- @
-- filt = fromList (policyForBits 10) 4 [\"foo\", \"bar\", \"quux\"]
-- @
fromList :: (Foldable t, Hashable a)
         => BloomPolicy
         -> Salt
         -> t a -- ^ values to populate with
         -> Bloom a
fromList :: forall (t :: * -> *) a.
(Foldable t, Hashable a) =>
BloomPolicy -> Salt -> t a -> Bloom a
fromList BloomPolicy
policy Salt
bsalt t a
xs =
    BloomSize -> Salt -> (forall s. MBloom s a -> ST s ()) -> Bloom a
forall a.
BloomSize -> Salt -> (forall s. MBloom s a -> ST s ()) -> Bloom a
create BloomSize
bsize Salt
bsalt (\MBloom s a
b -> (a -> ST s ()) -> t a -> ST s ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ (MBloom s a -> a -> ST s ()
forall a s. Hashable a => MBloom s a -> a -> ST s ()
insert MBloom s a
b) t a
xs)
  where
    bsize :: BloomSize
bsize = BloomPolicy -> NumEntries -> BloomSize
sizeForPolicy BloomPolicy
policy (t a -> NumEntries
forall a. t a -> NumEntries
forall (t :: * -> *) a. Foldable t => t a -> NumEntries
length t a
xs)

{-# SPECIALISE deserialise :: BloomSize
                           -> Salt
                           -> (MutableByteArray RealWorld -> Int -> Int -> IO ())
                           -> IO (Bloom a) #-}
deserialise :: PrimMonad m
            => BloomSize
            -> Salt
            -> (MutableByteArray (PrimState m) -> Int -> Int -> m ())
            -> m (Bloom a)
deserialise :: forall (m :: * -> *) a.
PrimMonad m =>
BloomSize
-> Salt
-> (MutableByteArray (PrimState m)
    -> NumEntries -> NumEntries -> m ())
-> m (Bloom a)
deserialise BloomSize
bloomsalt Salt
bloomsize MutableByteArray (PrimState m) -> NumEntries -> NumEntries -> m ()
fill = do
    MBloom (PrimState m) a
mbloom <- ST (PrimState m) (MBloom (PrimState m) a)
-> m (MBloom (PrimState m) a)
forall (m :: * -> *) a. PrimMonad m => ST (PrimState m) a -> m a
stToPrim (ST (PrimState m) (MBloom (PrimState m) a)
 -> m (MBloom (PrimState m) a))
-> ST (PrimState m) (MBloom (PrimState m) a)
-> m (MBloom (PrimState m) a)
forall a b. (a -> b) -> a -> b
$ BloomSize -> Salt -> ST (PrimState m) (MBloom (PrimState m) a)
forall s a. BloomSize -> Salt -> ST s (MBloom s a)
new BloomSize
bloomsalt Salt
bloomsize
    MBloom (PrimState m) a
-> (MutableByteArray (PrimState m)
    -> NumEntries -> NumEntries -> m ())
-> m ()
forall (m :: * -> *) a.
PrimMonad m =>
MBloom (PrimState m) a
-> (MutableByteArray (PrimState m)
    -> NumEntries -> NumEntries -> m ())
-> m ()
Internal.deserialise MBloom (PrimState m) a
mbloom MutableByteArray (PrimState m) -> NumEntries -> NumEntries -> m ()
fill
    ST (PrimState m) (Bloom a) -> m (Bloom a)
forall (m :: * -> *) a. PrimMonad m => ST (PrimState m) a -> m a
stToPrim (ST (PrimState m) (Bloom a) -> m (Bloom a))
-> ST (PrimState m) (Bloom a) -> m (Bloom a)
forall a b. (a -> b) -> a -> b
$ MBloom (PrimState m) a -> ST (PrimState m) (Bloom a)
forall s a. MBloom s a -> ST s (Bloom a)
unsafeFreeze MBloom (PrimState m) a
mbloom

-- $overview
--
-- Each of the functions for creating Bloom filters accepts a 'BloomSize'. The
-- size determines the number of bits that should be used for the filter. Note
-- that a filter is fixed in size; it cannot be resized after creation.
--
-- The size can be specified by asking for a target false positive rate (FPR)
-- or a number of bits per element, and the number of elements in the filter.
-- For example:
--
-- * @'sizeForFPR' 1e-3 10_000@ for a Bloom filter sized for 10,000 elements
--   with a false positive rate of 1 in 1000
--
-- * @'sizeForBits' 10 10_000@ for a Bloom filter sized for 10,000 elements
--   with 10 bits per element
--
-- Depending on the application it may be more important to target a fixed
-- amount of memory to use, or target a specific FPR.
--
-- As a very rough guide for filter sizes, here are a range of FPRs and bits
-- per element:
--
-- * FPR of 1e-1 requires approximately 4.8 bits per element
-- * FPR of 1e-2 requires approximately 9.6 bits per element
-- * FPR of 1e-3 requires approximately 14.4 bits per element
-- * FPR of 1e-4 requires approximately 19.2 bits per element
-- * FPR of 1e-5 requires approximately 24.0 bits per element
--

-- $example
--
-- This example reads a dictionary file containing one word per line,
-- constructs a Bloom filter with a 1% false positive rate, and
-- spellchecks its standard input.  Like the Unix @spell@ command, it
-- prints each word that it does not recognize.
--
-- @
-- import Data.Maybe (mapMaybe)
-- import qualified Data.BloomFilter as B
--
-- main = do
--   filt \<- B.fromList (B.policyForFPR 0.01) . words \<$> readFile "\/usr\/share\/dict\/words"
--   let check word | B.elem word filt  = Nothing
--                  | otherwise         = Just word
--   interact (unlines . mapMaybe check . lines)
-- @

-- $differences
--
-- This package is an entirely rewritten fork of
-- [bloomfilter](https://hackage.haskell.org/package/bloomfilter) package.
--
-- The main differences are
--
-- * This packages support bloomfilters of arbitrary sizes
--   (not limited to powers of two). Also sizes over 2^32 are supported.
--
-- * The 'Bloom' and 'MBloom' types are parametrised over a 'Hashable' type
--   class, instead of having a @a -> ['Hash']@ typed field.
--   This separation allows clean de\/serialization of Bloom filters in this
--   package, as the hashing scheme is static.
--
-- * [@XXH3@ hash](https://xxhash.com/) is used instead of Jenkins'
--   @lookup3@.
--
-- * Support for both classic and \"blocked\" Bloom filters. Blocked-structured
--   Bloom filters arrange all the bits for each insert or lookup into a single
--   cache line, which greatly reduces the number of slow uncached memory reads.
--   The trade-off for this performance optimisation is a slightly worse
--   trade-off between bits per element and the FPR. In practice for typical
--   FPRs of 1-e3 -- 1e-4, this requires a couple extra bits per element.