-- | Various formulas for working with bloomfilters.
module Data.BloomFilter.Blocked.Calc (
    NumEntries,
    BloomSize (..),
    FPR,
    sizeForFPR,
    BitsPerEntry,
    sizeForBits,
    sizeForPolicy,
    BloomPolicy (..),
    policyFPR,
    policyForFPR,
    policyForBits,
) where

import           Data.BloomFilter.Classic.Calc (BitsPerEntry, FPR, NumEntries)

{-
Calculating the relationship between bits and FPR for the blocked
implementation:

While in principle there's a principled approach to this, it's complex to
calculate numerically. So instead we compute a regression from samples of bits
& FPR. The fpr-calc.hs program in this package does this for a range of bits,
and outputs out both graph data (to feed into gnuplot) and it also a regression
fit. The exact fit one gets depends on the PRNG seed used.

We calculate the regression two ways, one for FPR -> bits, and bits -> FPR.
We use a quadratic regression, with the FPR in log space.

The following is the sample of the regression fit output that we end up using
in the functions 'policyForFPR' and 'policyForBits'.

Blocked bloom filter quadratic regressions:
bits independent, FPR dependent:
Fit {
  fitParams = V3 (-5.03623760876204e-3) 0.5251544487138062 (-0.10110451821280719),
  fitErrors = V3 3.344945010267228e-5 8.905631581753235e-4 5.102181306816477e-3,
  fitNDF = 996, fitWSSR = 1.5016403117905384
}

FPR independent, bits dependent:
Fit {
  fitParams = V3 8.079418894776325e-2 1.6462569292513933 0.5550062950289885,
  fitErrors = V3 7.713375250014809e-4 8.542261871094414e-3 2.0678969159415226e-2,
  fitNDF = 996, fitWSSR = 19.00125036371992
}

-}

-- | A policy on intended bloom filter size -- independent of the number of
-- elements.
--
-- We can decide a policy based on:
--
-- 1. a target false positive rate (FPR) using 'policyForFPR'
-- 2. a number of bits per entry using 'policyForBits'
--
-- A policy can be turned into a 'BloomSize' given a target 'NumEntries' using
-- 'sizeForPolicy'.
--
-- Either way we define the policy, we can inspect the result to see:
--
-- 1. The bits per entry 'policyBits'. This will determine the
--    size of the bloom filter in bits. In general the bits per entry can be
--    fractional. The final bloom filter size in will be rounded to a whole
--    number of bits.
-- 2. The number of hashes 'policyHashes'.
-- 3. The expected FPR for the policy using 'policyFPR'.
--
data BloomPolicy = BloomPolicy {
       BloomPolicy -> Double
policyBits   :: !Double,
       BloomPolicy -> Int
policyHashes :: !Int
     }
  deriving stock Int -> BloomPolicy -> ShowS
[BloomPolicy] -> ShowS
BloomPolicy -> String
(Int -> BloomPolicy -> ShowS)
-> (BloomPolicy -> String)
-> ([BloomPolicy] -> ShowS)
-> Show BloomPolicy
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> BloomPolicy -> ShowS
showsPrec :: Int -> BloomPolicy -> ShowS
$cshow :: BloomPolicy -> String
show :: BloomPolicy -> String
$cshowList :: [BloomPolicy] -> ShowS
showList :: [BloomPolicy] -> ShowS
Show

policyForFPR :: FPR -> BloomPolicy
policyForFPR :: Double -> BloomPolicy
policyForFPR Double
fpr | Double
fpr Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
<= Double
0 Bool -> Bool -> Bool
|| Double
fpr Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
>= Double
1 =
    String -> BloomPolicy
forall a. HasCallStack => String -> a
error String
"bloomPolicyForFPR: fpr out of range (0,1)"

policyForFPR Double
fpr =
    BloomPolicy {
      policyBits :: Double
policyBits   = Double
c,
      policyHashes :: Int
policyHashes = Int
k
    }
  where
    k       :: Int
    k :: Int
k       = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
1 (Double -> Int
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
round (Double
recip_log2 Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
log_fpr))
    c :: Double
c       = Double
log_fpr Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
log_fpr Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
f2
            Double -> Double -> Double
forall a. Num a => a -> a -> a
+           Double
log_fpr Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
f1
            Double -> Double -> Double
forall a. Num a => a -> a -> a
+                     Double
f0
    log_fpr :: Double
log_fpr = Double -> Double
forall a. Num a => a -> a
negate (Double -> Double
forall a. Floating a => a -> a
log Double
fpr)

    -- These parameters are from a (quadratic) linear regression in log space
    -- of samples of the actual FPR between 1 and 20 bits. This is with log FPR
    -- as the independent variable and bits as the dependent variable.
    f2,f1,f0 :: Double
    f2 :: Double
f2 = Double
8.079418894776325e-2
    f1 :: Double
f1 = Double
1.6462569292513933
    f0 :: Double
f0 = Double
0.5550062950289885

policyForBits :: BitsPerEntry -> BloomPolicy
policyForBits :: Double -> BloomPolicy
policyForBits Double
c | Double
c Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
< Double
0 =
    String -> BloomPolicy
forall a. HasCallStack => String -> a
error String
"policyForBits: bits per entry must be > 0"

policyForBits Double
c =
    BloomPolicy {
      policyBits :: Double
policyBits   = Double
c,
      policyHashes :: Int
policyHashes = Int
k
    }
  where
    k :: Int
k = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
1 (Double -> Int
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
round (Double
c Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
log2))

policyFPR :: BloomPolicy -> FPR
policyFPR :: BloomPolicy -> Double
policyFPR BloomPolicy {
            policyBits :: BloomPolicy -> Double
policyBits = Double
c
          } =
    Double -> Double
forall a. Floating a => a -> a
exp (Double
0 Double -> Double -> Double
forall a. Ord a => a -> a -> a
`min` Double -> Double
forall a. Num a => a -> a
negate (Double
cDouble -> Double -> Double
forall a. Num a => a -> a -> a
*Double
cDouble -> Double -> Double
forall a. Num a => a -> a -> a
*Double
f2 Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
cDouble -> Double -> Double
forall a. Num a => a -> a -> a
*Double
f1 Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
f0))
  where
    -- These parameters are from a (quadratic) linear regression in log space
    -- of samples of the actual FPR between 2 and 24 bits. This is with bits as
    -- the independent variable and log FPR as the dependent variable. We have to
    -- clamp the result to keep the FPR within sanity bounds, otherwise extreme
    -- bits per element (<0.1 or >104) give FPRs > 1. This is because it's
    -- just a regression, not a principled approach.
    f2,f1,f0 :: Double
    f2 :: Double
f2 = -Double
5.03623760876204e-3
    f1 :: Double
f1 =  Double
0.5251544487138062
    f0 :: Double
f0 = -Double
0.10110451821280719

-- | Parameters for constructing a Bloom filter.
--
data BloomSize = BloomSize {
                   -- | The requested number of bits in the filter.
                   --
                   -- The actual size will be rounded up to the nearest 512.
                   BloomSize -> Int
sizeBits   :: !Int,

                   -- | The number of hash functions to use.
                   BloomSize -> Int
sizeHashes :: !Int
                 }
  deriving stock Int -> BloomSize -> ShowS
[BloomSize] -> ShowS
BloomSize -> String
(Int -> BloomSize -> ShowS)
-> (BloomSize -> String)
-> ([BloomSize] -> ShowS)
-> Show BloomSize
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> BloomSize -> ShowS
showsPrec :: Int -> BloomSize -> ShowS
$cshow :: BloomSize -> String
show :: BloomSize -> String
$cshowList :: [BloomSize] -> ShowS
showList :: [BloomSize] -> ShowS
Show

sizeForFPR :: FPR -> NumEntries -> BloomSize
sizeForFPR :: Double -> Int -> BloomSize
sizeForFPR = BloomPolicy -> Int -> BloomSize
sizeForPolicy (BloomPolicy -> Int -> BloomSize)
-> (Double -> BloomPolicy) -> Double -> Int -> BloomSize
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Double -> BloomPolicy
policyForFPR

sizeForBits :: BitsPerEntry -> NumEntries -> BloomSize
sizeForBits :: Double -> Int -> BloomSize
sizeForBits = BloomPolicy -> Int -> BloomSize
sizeForPolicy (BloomPolicy -> Int -> BloomSize)
-> (Double -> BloomPolicy) -> Double -> Int -> BloomSize
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Double -> BloomPolicy
policyForBits

sizeForPolicy :: BloomPolicy -> NumEntries -> BloomSize
sizeForPolicy :: BloomPolicy -> Int -> BloomSize
sizeForPolicy BloomPolicy {
                policyBits :: BloomPolicy -> Double
policyBits   = Double
c,
                policyHashes :: BloomPolicy -> Int
policyHashes = Int
k
              } Int
n =
    BloomSize {
      sizeBits :: Int
sizeBits   = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
1 (Double -> Int
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
ceiling (Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
c)),
      sizeHashes :: Int
sizeHashes = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
1 Int
k
    }

log2, recip_log2 :: Double
log2 :: Double
log2       = Double -> Double
forall a. Floating a => a -> a
log Double
2
recip_log2 :: Double
recip_log2 = Double -> Double
forall a. Fractional a => a -> a
recip Double
log2