-- |
--
-- A fast, space efficient Bloom filter implementation.  A Bloom
-- filter is a set-like data structure that provides a probabilistic
-- membership test.
--
-- * Queries do not give false negatives.  When an element is added to
--   a filter, a subsequent membership test will definitely return
--   'True'.
--
-- * False positives /are/ possible.  If an element has not been added
--   to a filter, a membership test /may/ nevertheless indicate that
--   the element is present.
--
-- This module provides low-level control.  For an easier to use
-- interface, see the "Data.BloomFilter.Easy" module.

module Data.BloomFilter (
    -- * Overview
    -- $overview

    -- ** Ease of use
    -- $ease

    -- ** Performance
    -- $performance

    -- ** Differences from bloomfilter package
    -- $differences

    -- * Types
    Hash,
    Bloom,
    MBloom,
    Bloom',
    MBloom',
    CheapHashes,
    RealHashes,

    -- * Immutable Bloom filters

    -- ** Conversion
    freeze,
    thaw,
    unsafeFreeze,

    -- ** Creation
    unfold,

    fromList,
    empty,
    singleton,

    -- ** Accessors
    length,
    elem,
    elemHashes,
    notElem,
) where

import           Control.Exception (assert)
import           Control.Monad (forM_, liftM)
import           Control.Monad.ST (ST, runST)
import           Data.BloomFilter.Hash (CheapHashes, Hash, Hashable,
                     Hashes (..), RealHashes)
import           Data.BloomFilter.Internal (Bloom' (..), bloomInvariant)
import           Data.BloomFilter.Mutable (MBloom, MBloom', insert, new)
import qualified Data.BloomFilter.Mutable.Internal as MB
import           Data.Word (Word64)

import           Prelude hiding (elem, length, notElem)

import qualified Data.BloomFilter.BitVec64 as V

-- | Bloom filter using 'CheapHashes' hashing scheme.
type Bloom = Bloom' CheapHashes

-- | Create an immutable Bloom filter, using the given setup function
-- which executes in the 'ST' monad.
--
-- Example:
--
-- @
-- TODO
--import "Data.BloomFilter.Hash" (cheapHashes)
--
--filter = create (cheapHashes 3) 1024 $ \mf -> do
--           insertMB mf \"foo\"
--           insertMB mf \"bar\"
-- @
--
-- Note that the result of the setup function is not used.
create :: Int        -- ^ number of hash functions to use
        -> Word64                 -- ^ number of bits in filter
        -> (forall s. (MBloom' s h a -> ST s ()))  -- ^ setup function
        -> Bloom' h a
{-# INLINE create #-}
create :: forall (h :: * -> *) a.
Int -> Word64 -> (forall s. MBloom' s h a -> ST s ()) -> Bloom' h a
create Int
hash Word64
numBits forall s. MBloom' s h a -> ST s ()
body = (forall s. ST s (Bloom' h a)) -> Bloom' h a
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s (Bloom' h a)) -> Bloom' h a)
-> (forall s. ST s (Bloom' h a)) -> Bloom' h a
forall a b. (a -> b) -> a -> b
$ do
    MBloom' s h a
mb <- Int -> Word64 -> ST s (MBloom' s h a)
forall s (h :: * -> *) a. Int -> Word64 -> ST s (MBloom' s h a)
new Int
hash Word64
numBits
    MBloom' s h a -> ST s ()
forall s. MBloom' s h a -> ST s ()
body MBloom' s h a
mb
    MBloom' s h a -> ST s (Bloom' h a)
forall s (h :: * -> *) a. MBloom' s h a -> ST s (Bloom' h a)
unsafeFreeze MBloom' s h a
mb

-- | Create an immutable Bloom filter from a mutable one.  The mutable
-- filter may be modified afterwards.
freeze :: MBloom' s h a -> ST s (Bloom' h a)
freeze :: forall s (h :: * -> *) a. MBloom' s h a -> ST s (Bloom' h a)
freeze MBloom' s h a
mb = do
    BitVec64
ba <- MBitVec64 s -> ST s BitVec64
forall s. MBitVec64 s -> ST s BitVec64
V.freeze (MBloom' s h a -> MBitVec64 s
forall s (h :: * -> *) a. MBloom' s h a -> MBitVec64 s
MB.bitArray MBloom' s h a
mb)
    let !bf :: Bloom' h a
bf = Int -> Word64 -> BitVec64 -> Bloom' h a
forall (h :: * -> *) a. Int -> Word64 -> BitVec64 -> Bloom' h a
Bloom (MBloom' s h a -> Int
forall s (h :: * -> *) a. MBloom' s h a -> Int
MB.hashesN MBloom' s h a
mb) (MBloom' s h a -> Word64
forall s (h :: * -> *) a. MBloom' s h a -> Word64
MB.size MBloom' s h a
mb) BitVec64
ba
    Bool -> ST s (Bloom' h a) -> ST s (Bloom' h a)
forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Bloom' h a -> Bool
forall (h :: * -> *) a. Bloom' h a -> Bool
bloomInvariant Bloom' h a
bf) (ST s (Bloom' h a) -> ST s (Bloom' h a))
-> ST s (Bloom' h a) -> ST s (Bloom' h a)
forall a b. (a -> b) -> a -> b
$ Bloom' h a -> ST s (Bloom' h a)
forall a. a -> ST s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure Bloom' h a
bf

-- | Create an immutable Bloom filter from a mutable one.  The mutable
-- filter /must not/ be modified afterwards, or a runtime crash may
-- occur.  For a safer creation interface, use 'freeze' or 'create'.
unsafeFreeze :: MBloom' s h a -> ST s (Bloom' h a)
unsafeFreeze :: forall s (h :: * -> *) a. MBloom' s h a -> ST s (Bloom' h a)
unsafeFreeze MBloom' s h a
mb = do
    BitVec64
ba <- MBitVec64 s -> ST s BitVec64
forall s. MBitVec64 s -> ST s BitVec64
V.unsafeFreeze (MBloom' s h a -> MBitVec64 s
forall s (h :: * -> *) a. MBloom' s h a -> MBitVec64 s
MB.bitArray MBloom' s h a
mb)
    let !bf :: Bloom' h a
bf = Int -> Word64 -> BitVec64 -> Bloom' h a
forall (h :: * -> *) a. Int -> Word64 -> BitVec64 -> Bloom' h a
Bloom (MBloom' s h a -> Int
forall s (h :: * -> *) a. MBloom' s h a -> Int
MB.hashesN MBloom' s h a
mb) (MBloom' s h a -> Word64
forall s (h :: * -> *) a. MBloom' s h a -> Word64
MB.size MBloom' s h a
mb) BitVec64
ba
    Bool -> ST s (Bloom' h a) -> ST s (Bloom' h a)
forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Bloom' h a -> Bool
forall (h :: * -> *) a. Bloom' h a -> Bool
bloomInvariant Bloom' h a
bf) (ST s (Bloom' h a) -> ST s (Bloom' h a))
-> ST s (Bloom' h a) -> ST s (Bloom' h a)
forall a b. (a -> b) -> a -> b
$ Bloom' h a -> ST s (Bloom' h a)
forall a. a -> ST s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure Bloom' h a
bf

-- | Copy an immutable Bloom filter to create a mutable one.  There is
-- no non-copying equivalent.
thaw :: Bloom' h a -> ST s (MBloom' s h a)
thaw :: forall (h :: * -> *) a s. Bloom' h a -> ST s (MBloom' s h a)
thaw Bloom' h a
ub = Int -> Word64 -> MBitVec64 s -> MBloom' s h a
forall s (h :: * -> *) a.
Int -> Word64 -> MBitVec64 s -> MBloom' s h a
MB.MBloom (Bloom' h a -> Int
forall (h :: * -> *) a. Bloom' h a -> Int
hashesN Bloom' h a
ub) (Bloom' h a -> Word64
forall (h :: * -> *) a. Bloom' h a -> Word64
size Bloom' h a
ub) (MBitVec64 s -> MBloom' s h a)
-> ST s (MBitVec64 s) -> ST s (MBloom' s h a)
forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
`liftM` BitVec64 -> ST s (MBitVec64 s)
forall s. BitVec64 -> ST s (MBitVec64 s)
V.thaw (Bloom' h a -> BitVec64
forall (h :: * -> *) a. Bloom' h a -> BitVec64
bitArray Bloom' h a
ub)

-- | Create an empty Bloom filter.
empty :: Int                    -- ^ number of hash functions to use
      -> Word64                 -- ^ number of bits in filter
      -> Bloom' h a
{-# INLINE [1] empty #-}
empty :: forall (h :: * -> *) a. Int -> Word64 -> Bloom' h a
empty Int
hash Word64
numBits = Int -> Word64 -> (forall s. MBloom' s h a -> ST s ()) -> Bloom' h a
forall (h :: * -> *) a.
Int -> Word64 -> (forall s. MBloom' s h a -> ST s ()) -> Bloom' h a
create Int
hash Word64
numBits (\MBloom' s h a
_ -> () -> ST s ()
forall a. a -> ST s a
forall (m :: * -> *) a. Monad m => a -> m a
return ())

-- | Create a Bloom filter with a single element.
singleton :: (Hashes h, Hashable a)
          => Int               -- ^ number of hash functions to use
          -> Word64            -- ^ number of bits in filter
          -> a                 -- ^ element to insert
          -> Bloom' h a
singleton :: forall (h :: * -> *) a.
(Hashes h, Hashable a) =>
Int -> Word64 -> a -> Bloom' h a
singleton Int
hash Word64
numBits a
elt = Int -> Word64 -> (forall s. MBloom' s h a -> ST s ()) -> Bloom' h a
forall (h :: * -> *) a.
Int -> Word64 -> (forall s. MBloom' s h a -> ST s ()) -> Bloom' h a
create Int
hash Word64
numBits (\MBloom' s h a
mb -> MBloom' s h a -> a -> ST s ()
forall (h :: * -> *) a s.
(Hashes h, Hashable a) =>
MBloom' s h a -> a -> ST s ()
insert MBloom' s h a
mb a
elt)

-- | Query an immutable Bloom filter for membership.  If the value is
-- present, return @True@.  If the value is not present, there is
-- /still/ some possibility that @True@ will be returned.
elem :: (Hashes h, Hashable a) => a -> Bloom' h a -> Bool
elem :: forall (h :: * -> *) a.
(Hashes h, Hashable a) =>
a -> Bloom' h a -> Bool
elem a
elt Bloom' h a
ub = h a -> Bloom' h a -> Bool
forall (h :: * -> *) a. Hashes h => h a -> Bloom' h a -> Bool
elemHashes (a -> h a
forall a. Hashable a => a -> h a
forall (h :: * -> *) a. (Hashes h, Hashable a) => a -> h a
makeHashes a
elt) Bloom' h a
ub
{-# SPECIALIZE elem :: Hashable a => a -> Bloom a -> Bool #-}

-- | Query an immutable Bloom filter for membership using already constructed 'Hashes' value.
elemHashes :: Hashes h => h a -> Bloom' h a -> Bool
elemHashes :: forall (h :: * -> *) a. Hashes h => h a -> Bloom' h a -> Bool
elemHashes !h a
ch !Bloom' h a
ub = Int -> Bool
go Int
0 where
    go :: Int -> Bool
    go :: Int -> Bool
go !Int
i | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Bloom' h a -> Int
forall (h :: * -> *) a. Bloom' h a -> Int
hashesN Bloom' h a
ub
          = Bool
True
    go !Int
i = let idx' :: Word64
                !idx' :: Word64
idx' = h a -> Int -> Word64
forall a. h a -> Int -> Word64
forall (h :: * -> *) a. Hashes h => h a -> Int -> Word64
evalHashes h a
ch Int
i in
            let idx :: Int
                !idx :: Int
idx = Word64 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word64
idx' Word64 -> Word64 -> Word64
`V.unsafeRemWord64` Bloom' h a -> Word64
forall (h :: * -> *) a. Bloom' h a -> Word64
size Bloom' h a
ub) in
            -- While the idx' can cover the full Word64 range,
            -- after taking the remainder, it now must fit in
            -- and Int because it's less than the filter size.
            if BitVec64 -> Int -> Bool
V.unsafeIndex (Bloom' h a -> BitVec64
forall (h :: * -> *) a. Bloom' h a -> BitVec64
bitArray Bloom' h a
ub) Int
idx
              then Int -> Bool
go (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
              else Bool
False
{-# SPECIALIZE elemHashes :: CheapHashes a -> Bloom a -> Bool #-}

-- | Query an immutable Bloom filter for non-membership.  If the value
-- /is/ present, return @False@.  If the value is not present, there
-- is /still/ some possibility that @False@ will be returned.
notElem :: (Hashes h, Hashable a) => a -> Bloom' h a -> Bool
notElem :: forall (h :: * -> *) a.
(Hashes h, Hashable a) =>
a -> Bloom' h a -> Bool
notElem a
elt Bloom' h a
ub = h a -> Bloom' h a -> Bool
forall (h :: * -> *) a. Hashes h => h a -> Bloom' h a -> Bool
notElemHashes (a -> h a
forall a. Hashable a => a -> h a
forall (h :: * -> *) a. (Hashes h, Hashable a) => a -> h a
makeHashes a
elt) Bloom' h a
ub

-- | Query an immutable Bloom filter for non-membership using already constructed 'Hashes' value.
notElemHashes :: Hashes h => h a -> Bloom' h a -> Bool
notElemHashes :: forall (h :: * -> *) a. Hashes h => h a -> Bloom' h a -> Bool
notElemHashes !h a
ch !Bloom' h a
ub = Bool -> Bool
not (h a -> Bloom' h a -> Bool
forall (h :: * -> *) a. Hashes h => h a -> Bloom' h a -> Bool
elemHashes h a
ch Bloom' h a
ub)

-- | Return the size of an immutable Bloom filter, in bits.
length :: Bloom' h a -> Word64
length :: forall (h :: * -> *) a. Bloom' h a -> Word64
length = Bloom' h a -> Word64
forall (h :: * -> *) a. Bloom' h a -> Word64
size

-- | Build an immutable Bloom filter from a seed value.  The seeding
-- function populates the filter as follows.
--
--   * If it returns 'Nothing', it is finished producing values to
--     insert into the filter.
--
--   * If it returns @'Just' (a,b)@, @a@ is added to the filter and
--     @b@ is used as a new seed.
unfold :: forall a b h. (Hashes h, Hashable a)
       => Int                       -- ^ number of hash functions to use
       -> Word64                    -- ^ number of bits in filter
       -> (b -> Maybe (a, b))       -- ^ seeding function
       -> b                         -- ^ initial seed
       -> Bloom' h a
{-# INLINE unfold #-}
unfold :: forall a b (h :: * -> *).
(Hashes h, Hashable a) =>
Int -> Word64 -> (b -> Maybe (a, b)) -> b -> Bloom' h a
unfold Int
hs Word64
numBits b -> Maybe (a, b)
f b
k = Int -> Word64 -> (forall s. MBloom' s h a -> ST s ()) -> Bloom' h a
forall (h :: * -> *) a.
Int -> Word64 -> (forall s. MBloom' s h a -> ST s ()) -> Bloom' h a
create Int
hs Word64
numBits (b -> MBloom' s h a -> ST s ()
forall s. b -> MBloom' s h a -> ST s ()
loop b
k)
  where loop :: forall s. b -> MBloom' s h a -> ST s ()
        loop :: forall s. b -> MBloom' s h a -> ST s ()
loop b
j MBloom' s h a
mb = case b -> Maybe (a, b)
f b
j of
                      Just (a
a, b
j') -> MBloom' s h a -> a -> ST s ()
forall (h :: * -> *) a s.
(Hashes h, Hashable a) =>
MBloom' s h a -> a -> ST s ()
insert MBloom' s h a
mb a
a ST s () -> ST s () -> ST s ()
forall a b. ST s a -> ST s b -> ST s b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> b -> MBloom' s h a -> ST s ()
forall s. b -> MBloom' s h a -> ST s ()
loop b
j' MBloom' s h a
mb
                      Maybe (a, b)
_            -> () -> ST s ()
forall a. a -> ST s a
forall (m :: * -> *) a. Monad m => a -> m a
return ()

-- | Create an immutable Bloom filter, populating it from a list of
-- values.
--
-- Here is an example that uses the @cheapHashes@ function from the
-- "Data.BloomFilter.Hash" module to create a hash function that
-- returns three hashes.
--
-- @
-- filt = fromList 3 1024 [\"foo\", \"bar\", \"quux\"]
-- @
fromList :: (Hashes h, Hashable a)
         => Int                -- ^ number of hash functions to use
         -> Word64             -- ^ number of bits in filter
         -> [a]                -- ^ values to populate with
         -> Bloom' h a
fromList :: forall (h :: * -> *) a.
(Hashes h, Hashable a) =>
Int -> Word64 -> [a] -> Bloom' h a
fromList Int
hs Word64
numBits [a]
list = Int
-> Word64 -> (forall {s}. MBloom' s h a -> ST s ()) -> Bloom' h a
forall (h :: * -> *) a.
Int -> Word64 -> (forall s. MBloom' s h a -> ST s ()) -> Bloom' h a
create Int
hs Word64
numBits ((forall {s}. MBloom' s h a -> ST s ()) -> Bloom' h a)
-> (forall {s}. MBloom' s h a -> ST s ()) -> Bloom' h a
forall a b. (a -> b) -> a -> b
$ [a] -> (a -> ST s ()) -> ST s ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ [a]
list ((a -> ST s ()) -> ST s ())
-> (MBloom' s h a -> a -> ST s ()) -> MBloom' s h a -> ST s ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. MBloom' s h a -> a -> ST s ()
forall (h :: * -> *) a s.
(Hashes h, Hashable a) =>
MBloom' s h a -> a -> ST s ()
insert

-- $overview
--
-- Each of the functions for creating Bloom filters accepts two parameters:
--
-- * The number of bits that should be used for the filter.  Note that
--   a filter is fixed in size; it cannot be resized after creation.
--
-- * A number of hash functions, /k/, to be used for the filter.
--
-- By choosing these parameters with care, it is possible to tune for
-- a particular false positive rate.
-- The 'Data.BloomFilter.Easy.suggestSizing' function in
-- the "Data.BloomFilter.Easy" module calculates useful estimates for
-- these parameters.

-- $ease
--
-- This module provides immutable interfaces for working with a
-- query-only Bloom filter, and for converting to and from mutable
-- Bloom filters.
--
-- For a higher-level interface that is easy to use, see the
-- "Data.BloomFilter.Easy" module.

-- $performance
--
-- The implementation has been carefully tuned for high performance
-- and low space consumption.

-- $differences
--
-- This package is (almost entirely rewritten) fork of
-- [bloomfilter](https://hackage.haskell.org/package/bloomfilter) package.
--
-- The main differences are
--
-- * This packages support bloomfilters of arbitrary sizes
--   (not limited to powers of two). Also sizes over 2^32 are supported.
--
-- * The 'Bloom' and 'MBloom' types are parametrised over 'Hashes' variable,
--   instead of having a @a -> ['Hash']@ typed field.
--   This separation allows clean de/serialization of Bloom filters in this
--   package, as the hashing scheme is a static.
--
-- * [XXH3 hash](https://xxhash.com/) is used instead of Jenkins'
--   lookup3.