-- | Various formulas for working with bloomfilters.
module Data.BloomFilter.Classic.Calc (
    NumEntries,
    BloomSize (..),
    FPR,
    sizeForFPR,
    BitsPerEntry,
    sizeForBits,
    sizeForPolicy,
    BloomPolicy (..),
    policyFPR,
    policyForFPR,
    policyForBits,
) where

import           Numeric

type FPR          = Double
type BitsPerEntry = Double
type NumEntries   = Int

-- | A policy on intended bloom filter size -- independent of the number of
-- elements.
--
-- We can decide a policy based on:
--
-- 1. a target false positive rate (FPR) using 'policyForFPR'
-- 2. a number of bits per entry using 'policyForBits'
--
-- A policy can be turned into a 'BloomSize' given a target 'NumEntries' using
-- 'sizeForPolicy'.
--
-- Either way we define the policy, we can inspect the result to see:
--
-- 1. The bits per entry 'policyBits'. This will determine the
--    size of the bloom filter in bits. In general the bits per entry can be
--    fractional. The final bloom filter size in will be rounded to a whole
--    number of bits.
-- 2. The number of hashes 'policyHashes'.
-- 3. The expected FPR for the policy using 'policyFPR'.
--
data BloomPolicy = BloomPolicy {
       BloomPolicy -> Double
policyBits   :: !Double,
       BloomPolicy -> Int
policyHashes :: !Int
     }
  deriving stock Int -> BloomPolicy -> ShowS
[BloomPolicy] -> ShowS
BloomPolicy -> String
(Int -> BloomPolicy -> ShowS)
-> (BloomPolicy -> String)
-> ([BloomPolicy] -> ShowS)
-> Show BloomPolicy
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> BloomPolicy -> ShowS
showsPrec :: Int -> BloomPolicy -> ShowS
$cshow :: BloomPolicy -> String
show :: BloomPolicy -> String
$cshowList :: [BloomPolicy] -> ShowS
showList :: [BloomPolicy] -> ShowS
Show

policyForFPR :: FPR -> BloomPolicy
policyForFPR :: Double -> BloomPolicy
policyForFPR Double
fpr | Double
fpr Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
<= Double
0 Bool -> Bool -> Bool
|| Double
fpr Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
>= Double
1 =
    String -> BloomPolicy
forall a. HasCallStack => String -> a
error String
"bloomPolicyForFPR: fpr out of range (0,1)"

policyForFPR Double
fpr =
    BloomPolicy {
      policyBits :: Double
policyBits   = Double
c,
      policyHashes :: Int
policyHashes = Int
k
    }
  where
    -- There's a simper fomula to compute the number of bits, but it assumes
    -- that k is a real. We must however round k to the nearest natural, and
    -- so we have to use a more precise approximation, using the actual value
    -- of k.
    k       :: Int; k' :: Double
    k :: Int
k       = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
1 (Double -> Int
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
round ((-Double
recip_log2) Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
log_fpr))
    k' :: Double
k'      = Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
k
    c :: Double
c       = Double -> Double
forall a. Num a => a -> a
negate Double
k' Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double -> Double
forall a. Floating a => a -> a
log1mexp (Double
log_fpr Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
k')
    log_fpr :: Double
log_fpr = Double -> Double
forall a. Floating a => a -> a
log Double
fpr
    -- For the source of this formula, see
    -- https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
    --
    -- We start with the FPR ε approximation that assumes independence for the
    -- probabilities of each bit being set.
    --
    --                         ε   = (1 - e^(-kn/m))^k
    --
    -- And noting that bits per entry @c = m/n@, hence @-kn/m = -k/c@, hence
    --
    --                         ε   = (1-e^(-k/c))^k
    --
    -- And then we rearrange to get c, the number of bits per entry:
    --
    --                            ε   =  (1-e^(-k/c))^k
    --                            ε   =  (1-exp (-k/c))^k
    --                            ε   =  exp (log (1 - exp (-k/c)) * k)
    --                        log ε   =  log (1 - exp (-k/c)) * k
    --                    log ε / k   =  log (1 - exp (-k/c))
    --               exp (log ε / k)  =  1 - exp (-k/c)
    --           1 - exp (log ε / k)  =  exp (-k/c)
    --      log (1 - exp (log ε / k)) =  -k/c
    -- -k / log (1 - exp (log ε / k)) =  c
    --     -k / log1mexp (log ε / k)  =  c

policyForBits :: BitsPerEntry -> BloomPolicy
policyForBits :: Double -> BloomPolicy
policyForBits Double
c | Double
c Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
<= Double
0 =
    String -> BloomPolicy
forall a. HasCallStack => String -> a
error String
"policyForBits: bits per entry must be > 0"

policyForBits Double
c =
    BloomPolicy {
      policyBits :: Double
policyBits   = Double
c,
      policyHashes :: Int
policyHashes = Int
k
    }
  where
    k :: Int
k = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
1 (Double -> Int
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
round (Double
c Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
log2))
    -- For the source of this formula, see
    -- https://en.wikipedia.org/wiki/Bloom_filter#Optimal_number_of_hash_functions

policyFPR :: BloomPolicy -> FPR
policyFPR :: BloomPolicy -> Double
policyFPR BloomPolicy {
            policyBits :: BloomPolicy -> Double
policyBits   = Double
c,
            policyHashes :: BloomPolicy -> Int
policyHashes = Int
k
          } =
    Double -> Double
forall a. Num a => a -> a
negate (Double -> Double
forall a. Floating a => a -> a
expm1 (Double -> Double
forall a. Num a => a -> a
negate (Double
k' Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
c))) Double -> Double -> Double
forall a. Floating a => a -> a -> a
** Double
k'
  where
    k' :: Double
k' = Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
k
    -- For the source of this formula, see
    -- https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
    --
    -- We use the FPR ε approximation that assumes independence for the
    -- probabilities of each bit being set.
    --
    --                         ε   = (1 - e^(-kn/m))^k
    --
    -- And noting that bits per entry @c = m/n@, hence @-kn/m = -k/c@, hence
    --
    --                         ε   = (1-e^(-k/c))^k
    --

-- | Parameters for constructing a Bloom filter.
--
data BloomSize = BloomSize {
                   -- | The requested number of bits in filter.
                   -- The actual size will be rounded up to the nearest 512.
                   BloomSize -> Int
sizeBits   :: !Int,

                   -- | The number of hash functions to use.
                   BloomSize -> Int
sizeHashes :: !Int
                 }
  deriving stock Int -> BloomSize -> ShowS
[BloomSize] -> ShowS
BloomSize -> String
(Int -> BloomSize -> ShowS)
-> (BloomSize -> String)
-> ([BloomSize] -> ShowS)
-> Show BloomSize
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> BloomSize -> ShowS
showsPrec :: Int -> BloomSize -> ShowS
$cshow :: BloomSize -> String
show :: BloomSize -> String
$cshowList :: [BloomSize] -> ShowS
showList :: [BloomSize] -> ShowS
Show

sizeForFPR :: FPR -> NumEntries -> BloomSize
sizeForFPR :: Double -> Int -> BloomSize
sizeForFPR = BloomPolicy -> Int -> BloomSize
sizeForPolicy (BloomPolicy -> Int -> BloomSize)
-> (Double -> BloomPolicy) -> Double -> Int -> BloomSize
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Double -> BloomPolicy
policyForFPR

sizeForBits :: BitsPerEntry -> NumEntries -> BloomSize
sizeForBits :: Double -> Int -> BloomSize
sizeForBits = BloomPolicy -> Int -> BloomSize
sizeForPolicy (BloomPolicy -> Int -> BloomSize)
-> (Double -> BloomPolicy) -> Double -> Int -> BloomSize
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Double -> BloomPolicy
policyForBits

sizeForPolicy :: BloomPolicy -> NumEntries -> BloomSize
sizeForPolicy :: BloomPolicy -> Int -> BloomSize
sizeForPolicy BloomPolicy {
                policyBits :: BloomPolicy -> Double
policyBits   = Double
c,
                policyHashes :: BloomPolicy -> Int
policyHashes = Int
k
              } Int
n =
    BloomSize {
      sizeBits :: Int
sizeBits   = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
1 (Double -> Int
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
ceiling (Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
c)),
      sizeHashes :: Int
sizeHashes = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
1 Int
k
    }

log2, recip_log2 :: Double
log2 :: Double
log2       = Double -> Double
forall a. Floating a => a -> a
log Double
2
recip_log2 :: Double
recip_log2 = Double -> Double
forall a. Fractional a => a -> a
recip Double
log2