{-# LANGUAGE MagicHash     #-}
{-# LANGUAGE UnboxedTuples #-}
-- |
--
-- Fast hashing of Haskell values.
-- The hash used is XXH3 64bit.
--
module Data.BloomFilter.Hash (
    -- * Basic hash functionality
    Hash,
    Hashable(..),
    hash64,
    hashByteArray,
    -- * Incremental hashing
    Incremental (..),
    HashState,
    incrementalHash,
    -- * Hashing
    Hashes (..),
    RealHashes (..),
    -- * Compute a family of hash values
    CheapHashes (..),
    evalCheapHashes,
    makeCheapHashes,
) where

import           Control.Monad (forM_)
import           Control.Monad.ST (ST, runST)
import           Data.Array.Byte (ByteArray (..))
import           Data.Bits (unsafeShiftR)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import           Data.Char (ord)
import qualified Data.Primitive.ByteArray as P
import           Data.Primitive.Types (Prim (..))
import           Data.Word (Word32, Word64)
import           GHC.Exts (Int#, uncheckedIShiftL#, (+#))
import qualified XXH3

-- | A hash value is 64 bits wide.
type Hash = Word64

-------------------------------------------------------------------------------
-- One shot hashing
-------------------------------------------------------------------------------

-- | The class of types that can be converted to a hash value.
--
-- The instances are meant to be stable, the hash values can be persisted.
--
class Hashable a where
    -- | Compute a 64-bit hash of a value.
    hashSalt64 ::
           Word64  -- ^ seed
        -> a       -- ^ value to hash
        -> Word64

-- | Compute a 64-bit hash.
hash64 :: Hashable a => a -> Word64
hash64 :: forall a. Hashable a => a -> Word64
hash64 = Word64 -> a -> Word64
forall a. Hashable a => Word64 -> a -> Word64
hashSalt64 Word64
0

instance Hashable () where
    hashSalt64 :: Word64 -> () -> Word64
hashSalt64 Word64
salt ()
_ = Word64
salt

instance Hashable Char where
    -- Char's ordinal value should fit into Word32
    hashSalt64 :: Word64 -> Char -> Word64
hashSalt64 Word64
salt Char
c = Word64 -> Word32 -> Word64
forall a. Hashable a => Word64 -> a -> Word64
hashSalt64 Word64
salt (Int -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Char -> Int
ord Char
c) :: Word32)

instance Hashable BS.ByteString where
    hashSalt64 :: Word64 -> ByteString -> Word64
hashSalt64 Word64
salt ByteString
bs = ByteString -> Word64 -> Word64
XXH3.xxh3_64bit_withSeed_bs ByteString
bs Word64
salt

instance Hashable LBS.ByteString where
    hashSalt64 :: Word64 -> ByteString -> Word64
hashSalt64 Word64
salt ByteString
lbs =
        Word64 -> (forall s. HashState s -> ST s ()) -> Word64
incrementalHash Word64
salt ((forall s. HashState s -> ST s ()) -> Word64)
-> (forall s. HashState s -> ST s ()) -> Word64
forall a b. (a -> b) -> a -> b
$ \HashState s
s ->
        [ByteString] -> (ByteString -> ST s ()) -> ST s ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ (ByteString -> [ByteString]
LBS.toChunks ByteString
lbs) ((ByteString -> ST s ()) -> ST s ())
-> (ByteString -> ST s ()) -> ST s ()
forall a b. (a -> b) -> a -> b
$ \ByteString
bs ->
        HashState s -> ByteString -> ST s ()
forall a s. Incremental a => HashState s -> a -> ST s ()
forall s. HashState s -> ByteString -> ST s ()
update HashState s
s ByteString
bs

instance Hashable ByteArray where
    hashSalt64 :: Word64 -> ByteArray -> Word64
hashSalt64 Word64
salt ByteArray
ba = ByteArray -> Int -> Int -> Word64 -> Word64
XXH3.xxh3_64bit_withSeed_ba ByteArray
ba Int
0 (ByteArray -> Int
P.sizeofByteArray ByteArray
ba) Word64
salt

instance Hashable Word64 where
    hashSalt64 :: Word64 -> Word64 -> Word64
hashSalt64 Word64
salt Word64
w = Word64 -> Word64 -> Word64
XXH3.xxh3_64bit_withSeed_w64 Word64
w Word64
salt

instance Hashable Word32 where
    hashSalt64 :: Word64 -> Word32 -> Word64
hashSalt64 Word64
salt Word32
w = Word32 -> Word64 -> Word64
XXH3.xxh3_64bit_withSeed_w32 Word32
w Word64
salt

{- Note [Tree hashing]

We recursively hash inductive types (instead e.g. just serially hashing
their fields). Why?

So ("", "x") and ("x", "") or [[],[],[""]], [[],[""],[]] and [[""],[],[]]
have different hash values!

Another approach would be to have injective serialisation,
but then 'Incremental BS.ByteString' instance (e.g.) would need to serialise
the length, so we'd need third class for "pieces", keeping 'Incremental'
just adding bytes to the state (without any extras).

-}

instance Hashable a => Hashable [a] where
    hashSalt64 :: Word64 -> [a] -> Word64
hashSalt64 Word64
salt [a]
xs = Word64 -> (forall s. HashState s -> ST s ()) -> Word64
incrementalHash Word64
salt ((forall s. HashState s -> ST s ()) -> Word64)
-> (forall s. HashState s -> ST s ()) -> Word64
forall a b. (a -> b) -> a -> b
$ \HashState s
s -> [a] -> (a -> ST s ()) -> ST s ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ [a]
xs ((a -> ST s ()) -> ST s ()) -> (a -> ST s ()) -> ST s ()
forall a b. (a -> b) -> a -> b
$ \a
x ->
        HashState s -> Word64 -> ST s ()
forall a s. Incremental a => HashState s -> a -> ST s ()
forall s. HashState s -> Word64 -> ST s ()
update HashState s
s (a -> Word64
forall a. Hashable a => a -> Word64
hash64 a
x)

instance (Hashable a, Hashable b) => Hashable (a, b) where
    hashSalt64 :: Word64 -> (a, b) -> Word64
hashSalt64 Word64
salt (a
x, b
y) = Word64 -> (forall s. HashState s -> ST s ()) -> Word64
incrementalHash Word64
salt ((forall s. HashState s -> ST s ()) -> Word64)
-> (forall s. HashState s -> ST s ()) -> Word64
forall a b. (a -> b) -> a -> b
$ \HashState s
s -> do
        HashState s -> Word64 -> ST s ()
forall a s. Incremental a => HashState s -> a -> ST s ()
forall s. HashState s -> Word64 -> ST s ()
update HashState s
s (a -> Word64
forall a. Hashable a => a -> Word64
hash64 a
x)
        HashState s -> Word64 -> ST s ()
forall a s. Incremental a => HashState s -> a -> ST s ()
forall s. HashState s -> Word64 -> ST s ()
update HashState s
s (b -> Word64
forall a. Hashable a => a -> Word64
hash64 b
y)

-- | Hash a (part of) 'ByteArray'.
hashByteArray :: ByteArray -> Int -> Int -> Word64 -> Word64
hashByteArray :: ByteArray -> Int -> Int -> Word64 -> Word64
hashByteArray = ByteArray -> Int -> Int -> Word64 -> Word64
XXH3.xxh3_64bit_withSeed_ba

-------------------------------------------------------------------------------
-- Incremental hashing
-------------------------------------------------------------------------------

-- | Hash state for incremental hashing
newtype HashState s = HashState (XXH3.XXH3_State s)

-- | The class of types that can be incrementally hashed.
class Incremental a where
    update :: HashState s -> a -> ST s ()

instance Incremental BS.ByteString where
    update :: forall s. HashState s -> ByteString -> ST s ()
update (HashState XXH3_State s
s) = XXH3_State s -> ByteString -> ST s ()
forall s. XXH3_State s -> ByteString -> ST s ()
XXH3.xxh3_64bit_update_bs XXH3_State s
s

instance Incremental Word32 where
    update :: forall s. HashState s -> Word32 -> ST s ()
update (HashState XXH3_State s
s) = XXH3_State s -> Word32 -> ST s ()
forall s. XXH3_State s -> Word32 -> ST s ()
XXH3.xxh3_64bit_update_w32 XXH3_State s
s

instance Incremental Word64 where
    update :: forall s. HashState s -> Word64 -> ST s ()
update (HashState XXH3_State s
s) = XXH3_State s -> Word64 -> ST s ()
forall s. XXH3_State s -> Word64 -> ST s ()
XXH3.xxh3_64bit_update_w64 XXH3_State s
s

instance Incremental Char where
    update :: forall s. HashState s -> Char -> ST s ()
update HashState s
s Char
c = HashState s -> Word32 -> ST s ()
forall a s. Incremental a => HashState s -> a -> ST s ()
forall s. HashState s -> Word32 -> ST s ()
update HashState s
s (Int -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Char -> Int
ord Char
c) :: Word32)

-- | Calculate incrementally constructed hash.
incrementalHash :: Word64 -> (forall s. HashState s -> ST s ()) -> Word64
incrementalHash :: Word64 -> (forall s. HashState s -> ST s ()) -> Word64
incrementalHash Word64
seed forall s. HashState s -> ST s ()
f = (forall s. ST s Word64) -> Word64
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s Word64) -> Word64)
-> (forall s. ST s Word64) -> Word64
forall a b. (a -> b) -> a -> b
$ do
    XXH3_State s
s <- ST s (XXH3_State s)
forall s. ST s (XXH3_State s)
XXH3.xxh3_64bit_createState
    XXH3_State s -> Word64 -> ST s ()
forall s. XXH3_State s -> Word64 -> ST s ()
XXH3.xxh3_64bit_reset_withSeed XXH3_State s
s Word64
seed
    HashState s -> ST s ()
forall s. HashState s -> ST s ()
f (XXH3_State s -> HashState s
forall s. XXH3_State s -> HashState s
HashState XXH3_State s
s)
    XXH3_State s -> ST s Word64
forall s. XXH3_State s -> ST s Word64
XXH3.xxh3_64bit_digest XXH3_State s
s

-------------------------------------------------------------------------------
-- Hashes
-------------------------------------------------------------------------------

-- | A type class abstracting over different hashing schemes.b
class Hashes h where
    makeHashes :: Hashable a => a -> h a

    evalHashes :: h a -> Int -> Hash

-- | A closure of real hashing function.
newtype RealHashes a = RealHashes (Word64 -> Hash)

instance Hashes RealHashes where
    makeHashes :: forall a. Hashable a => a -> RealHashes a
makeHashes a
x = (Word64 -> Word64) -> RealHashes a
forall {k} (a :: k). (Word64 -> Word64) -> RealHashes a
RealHashes (\Word64
salt -> Word64 -> a -> Word64
forall a. Hashable a => Word64 -> a -> Word64
hashSalt64 Word64
salt a
x)
    evalHashes :: forall a. RealHashes a -> Int -> Word64
evalHashes (RealHashes Word64 -> Word64
f) Int
i = Word64 -> Word64
f (Int -> Word64
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
i)

-------------------------------------------------------------------------------
-- CheapHashes
-------------------------------------------------------------------------------

-- | A pair of hashes used for a double hashing scheme.
--
-- See 'evalCheapHashes'.
data CheapHashes a = CheapHashes !Hash !Hash
  deriving Int -> CheapHashes a -> ShowS
[CheapHashes a] -> ShowS
CheapHashes a -> String
(Int -> CheapHashes a -> ShowS)
-> (CheapHashes a -> String)
-> ([CheapHashes a] -> ShowS)
-> Show (CheapHashes a)
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
forall k (a :: k). Int -> CheapHashes a -> ShowS
forall k (a :: k). [CheapHashes a] -> ShowS
forall k (a :: k). CheapHashes a -> String
$cshowsPrec :: forall k (a :: k). Int -> CheapHashes a -> ShowS
showsPrec :: Int -> CheapHashes a -> ShowS
$cshow :: forall k (a :: k). CheapHashes a -> String
show :: CheapHashes a -> String
$cshowList :: forall k (a :: k). [CheapHashes a] -> ShowS
showList :: [CheapHashes a] -> ShowS
Show
type role CheapHashes nominal

instance Hashes CheapHashes where
    makeHashes :: forall a. Hashable a => a -> CheapHashes a
makeHashes = a -> CheapHashes a
forall a. Hashable a => a -> CheapHashes a
makeCheapHashes
    {-# INLINE makeHashes #-}
    evalHashes :: forall a. CheapHashes a -> Int -> Word64
evalHashes = CheapHashes a -> Int -> Word64
forall {k} (a :: k). CheapHashes a -> Int -> Word64
evalCheapHashes
    {-# INLINE evalHashes #-}

instance Prim (CheapHashes a) where
    sizeOfType# :: Proxy (CheapHashes a) -> Int#
sizeOfType# Proxy (CheapHashes a)
_ = Int#
16#
    alignmentOfType# :: Proxy (CheapHashes a) -> Int#
alignmentOfType# Proxy (CheapHashes a)
_ = Int#
8#

    indexByteArray# :: ByteArray# -> Int# -> CheapHashes a
indexByteArray# ByteArray#
ba Int#
i = Word64 -> Word64 -> CheapHashes a
forall {k} (a :: k). Word64 -> Word64 -> CheapHashes a
CheapHashes
        (ByteArray# -> Int# -> Word64
forall a. Prim a => ByteArray# -> Int# -> a
indexByteArray# ByteArray#
ba (Int# -> Int#
indexLo Int#
i))
        (ByteArray# -> Int# -> Word64
forall a. Prim a => ByteArray# -> Int# -> a
indexByteArray# ByteArray#
ba (Int# -> Int#
indexHi Int#
i))
    readByteArray# :: forall s.
MutableByteArray# s
-> Int# -> State# s -> (# State# s, CheapHashes a #)
readByteArray# MutableByteArray# s
ba Int#
i State# s
s1 =
        case MutableByteArray# s -> Int# -> State# s -> (# State# s, Word64 #)
forall s.
MutableByteArray# s -> Int# -> State# s -> (# State# s, Word64 #)
forall a s.
Prim a =>
MutableByteArray# s -> Int# -> State# s -> (# State# s, a #)
readByteArray# MutableByteArray# s
ba (Int# -> Int#
indexLo Int#
i) State# s
s1 of { (# State# s
s2, Word64
lo #) ->
        case MutableByteArray# s -> Int# -> State# s -> (# State# s, Word64 #)
forall s.
MutableByteArray# s -> Int# -> State# s -> (# State# s, Word64 #)
forall a s.
Prim a =>
MutableByteArray# s -> Int# -> State# s -> (# State# s, a #)
readByteArray# MutableByteArray# s
ba (Int# -> Int#
indexHi Int#
i) State# s
s2 of { (# State# s
s3, Word64
hi #) ->
        (# State# s
s3, Word64 -> Word64 -> CheapHashes a
forall {k} (a :: k). Word64 -> Word64 -> CheapHashes a
CheapHashes Word64
lo Word64
hi #)
        }}
    writeByteArray# :: forall s.
MutableByteArray# s
-> Int# -> CheapHashes a -> State# s -> State# s
writeByteArray# MutableByteArray# s
ba Int#
i (CheapHashes Word64
lo Word64
hi) State# s
s =
        MutableByteArray# s -> Int# -> Word64 -> State# s -> State# s
forall s.
MutableByteArray# s -> Int# -> Word64 -> State# s -> State# s
forall a s.
Prim a =>
MutableByteArray# s -> Int# -> a -> State# s -> State# s
writeByteArray# MutableByteArray# s
ba (Int# -> Int#
indexHi Int#
i) Word64
hi (MutableByteArray# s -> Int# -> Word64 -> State# s -> State# s
forall s.
MutableByteArray# s -> Int# -> Word64 -> State# s -> State# s
forall a s.
Prim a =>
MutableByteArray# s -> Int# -> a -> State# s -> State# s
writeByteArray# MutableByteArray# s
ba (Int# -> Int#
indexLo Int#
i) Word64
lo State# s
s)

    indexOffAddr# :: Addr# -> Int# -> CheapHashes a
indexOffAddr# Addr#
ba Int#
i = Word64 -> Word64 -> CheapHashes a
forall {k} (a :: k). Word64 -> Word64 -> CheapHashes a
CheapHashes
        (Addr# -> Int# -> Word64
forall a. Prim a => Addr# -> Int# -> a
indexOffAddr# Addr#
ba (Int# -> Int#
indexLo Int#
i))
        (Addr# -> Int# -> Word64
forall a. Prim a => Addr# -> Int# -> a
indexOffAddr# Addr#
ba (Int# -> Int#
indexHi Int#
i))
    readOffAddr# :: forall s.
Addr# -> Int# -> State# s -> (# State# s, CheapHashes a #)
readOffAddr# Addr#
ba Int#
i State# s
s1 =
        case Addr# -> Int# -> State# s -> (# State# s, Word64 #)
forall s. Addr# -> Int# -> State# s -> (# State# s, Word64 #)
forall a s.
Prim a =>
Addr# -> Int# -> State# s -> (# State# s, a #)
readOffAddr# Addr#
ba (Int# -> Int#
indexLo Int#
i) State# s
s1 of { (# State# s
s2, Word64
lo #) ->
        case Addr# -> Int# -> State# s -> (# State# s, Word64 #)
forall s. Addr# -> Int# -> State# s -> (# State# s, Word64 #)
forall a s.
Prim a =>
Addr# -> Int# -> State# s -> (# State# s, a #)
readOffAddr# Addr#
ba (Int# -> Int#
indexHi Int#
i) State# s
s2 of { (# State# s
s3, Word64
hi #) ->
        (# State# s
s3, Word64 -> Word64 -> CheapHashes a
forall {k} (a :: k). Word64 -> Word64 -> CheapHashes a
CheapHashes Word64
lo Word64
hi #)
        }}
    writeOffAddr# :: forall s. Addr# -> Int# -> CheapHashes a -> State# s -> State# s
writeOffAddr# Addr#
ba Int#
i (CheapHashes Word64
lo Word64
hi) State# s
s =
        Addr# -> Int# -> Word64 -> State# s -> State# s
forall s. Addr# -> Int# -> Word64 -> State# s -> State# s
forall a s. Prim a => Addr# -> Int# -> a -> State# s -> State# s
writeOffAddr# Addr#
ba (Int# -> Int#
indexHi Int#
i) Word64
hi (Addr# -> Int# -> Word64 -> State# s -> State# s
forall s. Addr# -> Int# -> Word64 -> State# s -> State# s
forall a s. Prim a => Addr# -> Int# -> a -> State# s -> State# s
writeOffAddr# Addr#
ba (Int# -> Int#
indexLo Int#
i) Word64
lo State# s
s)

indexLo :: Int# -> Int#
indexLo :: Int# -> Int#
indexLo Int#
i = Int# -> Int# -> Int#
uncheckedIShiftL# Int#
i Int#
1#

indexHi :: Int# -> Int#
indexHi :: Int# -> Int#
indexHi Int#
i = Int# -> Int# -> Int#
uncheckedIShiftL# Int#
i Int#
1# Int# -> Int# -> Int#
+# Int#
1#

{- Note [Original CheapHashes]

Compute a list of 32-bit hashes relatively cheaply.  The value to
hash is inspected at most twice, regardless of the number of hashes
requested.

We use a variant of Kirsch and Mitzenmacher's technique from \"Less
Hashing, Same Performance: Building a Better Bloom Filter\",
<http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf>.

Where Kirsch and Mitzenmacher multiply the second hash by a
coefficient, we shift right by the coefficient.  This offers better
performance (as a shift is much cheaper than a multiply), and the
low order bits of the final hash stay well mixed.

-}

{- Note: [CheapHashes]

On the first glance the 'evalCheapHashes' scheme seems dubious.

Firstly, it's original performance motivation is dubious.

> multiply the second hash by a coefficient

While the scheme double hashing scheme is presented in
theoretical analysis as

    g(i) = a + i * b

In practice it's implemented in a loop which looks like

    g[0] = a
    for (i = 1; i < k; i++) {
        a += b;
        g[i] = a;
    }

I.e. with just an addition.

Secondly there is no analysis anywhere about the
'evalCheapHashes' scheme.

Peter Dillinger's thesis (Adaptive Approximate State Storage)
discusses various fast hashing schemes (section 6.5),
mentioning why ordinary "double hashing" is weak scheme.

Issue 1: when second hash value is bad, e.g. not coprime with bloom filters size in bits,
we can get repetitions (worst case 0, or m/2).

Issue 2: in bloom filter scenario, whether we do a + i * b or h0 - i * b' (with b' = -b)
as we probe all indices (as set) doesn't matter, not sequentially (like in hash table).
So we lose one bit entropy.

Issue 3: the scheme is prone to partial overlap.
Two values with the same second hash value could overlap on many indices.

Then Dillinger discusses various schemes which solve this issue.

The CheapHashes scheme seems to avoid these cuprits.
This is probably because it uses most of the bits of the second hash, even in m = 2^n scenarios.
(normal double hashing and enhances double hashing don't use the high bits or original hash then).
TL;DR CheapHashes seems to work well in practice.

For the record: RocksDB uses an own scheme as well,
where first hash is used to pick a cache line, and second one to generate probes inside it.
https://github.com/facebook/rocksdb/blob/096fb9b67d19a9a180e7c906b4a0cdb2b2d0c1f6/util/bloom_impl.h

-}

-- | Evalute 'CheapHashes' family.
--
-- \[
-- g_i = h_0 + \left\lfloor h_1 / 2^i \right\rfloor
-- \]
--
evalCheapHashes :: CheapHashes a -> Int -> Hash
evalCheapHashes :: forall {k} (a :: k). CheapHashes a -> Int -> Word64
evalCheapHashes (CheapHashes Word64
h1 Word64
h2) Int
i = Word64
h1 Word64 -> Word64 -> Word64
forall a. Num a => a -> a -> a
+ (Word64
h2 Word64 -> Int -> Word64
forall a. Bits a => a -> Int -> a
`unsafeShiftR` Int
i)

-- | Create 'CheapHashes' structure.
--
-- It's simply hashes the value twice using seed 0 and 1.
makeCheapHashes :: Hashable a => a -> CheapHashes a
makeCheapHashes :: forall a. Hashable a => a -> CheapHashes a
makeCheapHashes a
v = Word64 -> Word64 -> CheapHashes a
forall {k} (a :: k). Word64 -> Word64 -> CheapHashes a
CheapHashes (Word64 -> a -> Word64
forall a. Hashable a => Word64 -> a -> Word64
hashSalt64 Word64
0 a
v) (Word64 -> a -> Word64
forall a. Hashable a => Word64 -> a -> Word64
hashSalt64 Word64
1 a
v)
{-# SPECIALIZE makeCheapHashes :: BS.ByteString -> CheapHashes BS.ByteString #-}
{-# INLINEABLE makeCheapHashes #-}