-- | Various formulas for working with bloomfilters.
module Data.BloomFilter.Blocked.Calc (
    NumEntries,
    BloomSize (..),
    FPR,
    sizeForFPR,
    BitsPerEntry,
    sizeForBits,
    sizeForPolicy,
    BloomPolicy (..),
    policyFPR,
    policyForFPR,
    policyForBits,
) where

import           Data.BloomFilter.Classic.Calc (BitsPerEntry, BloomPolicy (..),
                     BloomSize (..), FPR, NumEntries)

{-
Calculating the relationship between bits and FPR for the blocked
implementation:

While in principle there's a principled approach to this, it's complex to
calculate numerically. So instead we compute a regression from samples of bits
& FPR. The fpr-calc.hs program in this package does this for a range of bits,
and outputs out both graph data (to feed into gnuplot) and it also a regression
fit. The exact fit one gets depends on the PRNG seed used.

We calculate the regression two ways, one for FPR -> bits, and bits -> FPR.
We use a quadratic regression, with the FPR in log space.

The following is the sample of the regression fit output that we end up using
in the functions 'policyForFPR' and 'policyForBits'.

Blocked bloom filter quadratic regressions:
bits independent, FPR dependent:
Fit {
  fitParams = V3 (-5.03623760876204e-3) 0.5251544487138062 (-0.10110451821280719),
  fitErrors = V3 3.344945010267228e-5 8.905631581753235e-4 5.102181306816477e-3,
  fitNDF = 996, fitWSSR = 1.5016403117905384
}

FPR independent, bits dependent:
Fit {
  fitParams = V3 8.079418894776325e-2 1.6462569292513933 0.5550062950289885,
  fitErrors = V3 7.713375250014809e-4 8.542261871094414e-3 2.0678969159415226e-2,
  fitNDF = 996, fitWSSR = 19.00125036371992
}

-}

policyForFPR :: FPR -> BloomPolicy
policyForFPR :: Double -> BloomPolicy
policyForFPR Double
fpr | Double
fpr Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
<= Double
0 Bool -> Bool -> Bool
|| Double
fpr Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
>= Double
1 =
    [Char] -> BloomPolicy
forall a. HasCallStack => [Char] -> a
error [Char]
"bloomPolicyForFPR: fpr out of range (0,1)"

policyForFPR Double
fpr =
    BloomPolicy {
      policyBits :: Double
policyBits   = Double
c,
      policyHashes :: Int
policyHashes = Int
k
    }
  where
    k       :: Int
    k :: Int
k       = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
1 (Double -> Int
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
round (Double
recip_log2 Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
log_fpr))
    c :: Double
c       = Double
log_fpr Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
log_fpr Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
f2
            Double -> Double -> Double
forall a. Num a => a -> a -> a
+           Double
log_fpr Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
f1
            Double -> Double -> Double
forall a. Num a => a -> a -> a
+                     Double
f0
    log_fpr :: Double
log_fpr = Double -> Double
forall a. Num a => a -> a
negate (Double -> Double
forall a. Floating a => a -> a
log Double
fpr)

    -- These parameters are from a (quadratic) linear regression in log space
    -- of samples of the actual FPR between 1 and 20 bits. This is with log FPR
    -- as the independent variable and bits as the dependent variable.
    f2,f1,f0 :: Double
    f2 :: Double
f2 = Double
8.079418894776325e-2
    f1 :: Double
f1 = Double
1.6462569292513933
    f0 :: Double
f0 = Double
0.5550062950289885

policyForBits :: BitsPerEntry -> BloomPolicy
policyForBits :: Double -> BloomPolicy
policyForBits Double
c | Double
c Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
< Double
0 =
    [Char] -> BloomPolicy
forall a. HasCallStack => [Char] -> a
error [Char]
"policyForBits: bits per entry must be > 0"

policyForBits Double
c =
    BloomPolicy {
      policyBits :: Double
policyBits   = Double
c,
      policyHashes :: Int
policyHashes = Int
k
    }
  where
    k :: Int
k = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
1 (Double -> Int
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
round (Double
c Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
log2))

policyFPR :: BloomPolicy -> FPR
policyFPR :: BloomPolicy -> Double
policyFPR BloomPolicy {
            policyBits :: BloomPolicy -> Double
policyBits = Double
c
          } =
    Double -> Double
forall a. Floating a => a -> a
exp (Double
0 Double -> Double -> Double
forall a. Ord a => a -> a -> a
`min` Double -> Double
forall a. Num a => a -> a
negate (Double
cDouble -> Double -> Double
forall a. Num a => a -> a -> a
*Double
cDouble -> Double -> Double
forall a. Num a => a -> a -> a
*Double
f2 Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
cDouble -> Double -> Double
forall a. Num a => a -> a -> a
*Double
f1 Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
f0))
  where
    -- These parameters are from a (quadratic) linear regression in log space
    -- of samples of the actual FPR between 2 and 24 bits. This is with bits as
    -- the independent variable and log FPR as the dependent variable. We have to
    -- clamp the result to keep the FPR within sanity bounds, otherwise extreme
    -- bits per element (<0.1 or >104) give FPRs > 1. This is because it's
    -- just a regression, not a principled approach.
    f2,f1,f0 :: Double
    f2 :: Double
f2 = -Double
5.03623760876204e-3
    f1 :: Double
f1 =  Double
0.5251544487138062
    f0 :: Double
f0 = -Double
0.10110451821280719

sizeForFPR :: FPR -> NumEntries -> BloomSize
sizeForFPR :: Double -> Int -> BloomSize
sizeForFPR = BloomPolicy -> Int -> BloomSize
sizeForPolicy (BloomPolicy -> Int -> BloomSize)
-> (Double -> BloomPolicy) -> Double -> Int -> BloomSize
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Double -> BloomPolicy
policyForFPR

sizeForBits :: BitsPerEntry -> NumEntries -> BloomSize
sizeForBits :: Double -> Int -> BloomSize
sizeForBits = BloomPolicy -> Int -> BloomSize
sizeForPolicy (BloomPolicy -> Int -> BloomSize)
-> (Double -> BloomPolicy) -> Double -> Int -> BloomSize
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Double -> BloomPolicy
policyForBits

sizeForPolicy :: BloomPolicy -> NumEntries -> BloomSize
sizeForPolicy :: BloomPolicy -> Int -> BloomSize
sizeForPolicy BloomPolicy {
                policyBits :: BloomPolicy -> Double
policyBits   = Double
c,
                policyHashes :: BloomPolicy -> Int
policyHashes = Int
k
              } Int
n =
    BloomSize {
      sizeBits :: Int
sizeBits   = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
1 (Double -> Int
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
ceiling (Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
c)),
      sizeHashes :: Int
sizeHashes = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
1 Int
k
    }

log2, recip_log2 :: Double
log2 :: Double
log2       = Double -> Double
forall a. Floating a => a -> a
log Double
2
recip_log2 :: Double
recip_log2 = Double -> Double
forall a. Fractional a => a -> a
recip Double
log2