bloom.rs - mozsearch

mozilla-central/servo/components/selectors/bloom.rs (file symbol)

Enable keyboard shortcuts

Source code

File a bug in Core :: CSS Parsing and Computation

Revision control

Copy as Markdown

Other Tools

/* This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */

//! Counting and non-counting Bloom filters tuned for use as ancestor filters

//! for selector matching.

use std::fmt::{self, Debug};

// The top 8 bits of the 32-bit hash value are not used by the bloom filter.

// Consumers may rely on this to pack hashes more efficiently.

pub const BLOOM_HASH_MASK: u32 = 0x00ffffff;

const KEY_SIZE: usize = 12;

const ARRAY_SIZE: usize = 1 << KEY_SIZE;

const KEY_MASK: u32 = (1 << KEY_SIZE) - 1;

/// A counting Bloom filter with 8-bit counters.

pub type BloomFilter = CountingBloomFilter<BloomStorageU8>;

/// A counting Bloom filter with parameterized storage to handle

/// counters of different sizes.  For now we assume that having two hash

/// functions is enough, but we may revisit that decision later.

///

/// The filter uses an array with 2**KeySize entries.

///

/// Assuming a well-distributed hash function, a Bloom filter with

/// array size M containing N elements and

/// using k hash function has expected false positive rate exactly

///

/// $  (1 - (1 - 1/M)^{kN})^k  $

///

/// because each array slot has a

///

/// $  (1 - 1/M)^{kN}  $

///

/// chance of being 0, and the expected false positive rate is the

/// probability that all of the k hash functions will hit a nonzero

/// slot.

///

/// For reasonable assumptions (M large, kN large, which should both

/// hold if we're worried about false positives) about M and kN this

/// becomes approximately

///

/// $$  (1 - \exp(-kN/M))^k   $$

///

/// For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,

/// or in other words

///

/// $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$

///

/// where r is the false positive rate.  This can be used to compute

/// the desired KeySize for a given load N and false positive rate r.

///

/// If N/M is assumed small, then the false positive rate can

/// further be approximated as 4*N^2/M^2.  So increasing KeySize by

/// 1, which doubles M, reduces the false positive rate by about a

/// factor of 4, and a false positive rate of 1% corresponds to

/// about M/N == 20.

///

/// What this means in practice is that for a few hundred keys using a

/// KeySize of 12 gives false positive rates on the order of 0.25-4%.

///

/// Similarly, using a KeySize of 10 would lead to a 4% false

/// positive rate for N == 100 and to quite bad false positive

/// rates for larger N.

#[derive(Clone, Default)]

pub struct CountingBloomFilter<S>

where

    S: BloomStorage,

    storage: S,

impl<S> CountingBloomFilter<S>

where

    S: BloomStorage,

    /// Creates a new bloom filter.

    #[inline]

    pub fn new() -> Self {

        Default::default()

    #[inline]

    pub fn clear(&mut self) {

        self.storage = Default::default();

    // Slow linear accessor to make sure the bloom filter is zeroed. This should

    // never be used in release builds.

    #[cfg(debug_assertions)]

    pub fn is_zeroed(&self) -> bool {

        self.storage.is_zeroed()

    #[cfg(not(debug_assertions))]

    pub fn is_zeroed(&self) -> bool {

        unreachable!()

    /// Inserts an item with a particular hash into the bloom filter.

    #[inline]

    pub fn insert_hash(&mut self, hash: u32) {

        self.storage.adjust_first_slot(hash, true);

        self.storage.adjust_second_slot(hash, true);

    /// Removes an item with a particular hash from the bloom filter.

    #[inline]

    pub fn remove_hash(&mut self, hash: u32) {

        self.storage.adjust_first_slot(hash, false);

        self.storage.adjust_second_slot(hash, false);

    /// Check whether the filter might contain an item with the given hash.

    /// This can sometimes return true even if the item is not in the filter,

    /// but will never return false for items that are actually in the filter.

    #[inline]

    pub fn might_contain_hash(&self, hash: u32) -> bool {

        !self.storage.first_slot_is_empty(hash) && !self.storage.second_slot_is_empty(hash)

impl<S> Debug for CountingBloomFilter<S>

where

    S: BloomStorage,

    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {

        let mut slots_used = 0;

        for i in 0..ARRAY_SIZE {

            if !self.storage.slot_is_empty(i) {

                slots_used += 1;

        write!(f, "BloomFilter({}/{})", slots_used, ARRAY_SIZE)

pub trait BloomStorage: Clone + Default {

    fn slot_is_empty(&self, index: usize) -> bool;

    fn adjust_slot(&mut self, index: usize, increment: bool);

    fn is_zeroed(&self) -> bool;

    #[inline]

    fn first_slot_is_empty(&self, hash: u32) -> bool {

        self.slot_is_empty(Self::first_slot_index(hash))

    #[inline]

    fn second_slot_is_empty(&self, hash: u32) -> bool {

        self.slot_is_empty(Self::second_slot_index(hash))

    #[inline]

    fn adjust_first_slot(&mut self, hash: u32, increment: bool) {

        self.adjust_slot(Self::first_slot_index(hash), increment)

    #[inline]

    fn adjust_second_slot(&mut self, hash: u32, increment: bool) {

        self.adjust_slot(Self::second_slot_index(hash), increment)

    #[inline]

    fn first_slot_index(hash: u32) -> usize {

        hash1(hash) as usize

    #[inline]

    fn second_slot_index(hash: u32) -> usize {

        hash2(hash) as usize

/// Storage class for a CountingBloomFilter that has 8-bit counters.

pub struct BloomStorageU8 {

    counters: [u8; ARRAY_SIZE],

impl BloomStorage for BloomStorageU8 {

    #[inline]

    fn adjust_slot(&mut self, index: usize, increment: bool) {

        let slot = &mut self.counters[index];

        if *slot != 0xff {

            // full

            if increment {

                *slot += 1;

            } else {

                *slot -= 1;

    #[inline]

    fn slot_is_empty(&self, index: usize) -> bool {

        self.counters[index] == 0

    #[inline]

    fn is_zeroed(&self) -> bool {

        self.counters.iter().all(|x| *x == 0)

impl Default for BloomStorageU8 {

    fn default() -> Self {

        BloomStorageU8 {

            counters: [0; ARRAY_SIZE],

impl Clone for BloomStorageU8 {

    fn clone(&self) -> Self {

        BloomStorageU8 {

            counters: self.counters,

/// Storage class for a CountingBloomFilter that has 1-bit counters.

pub struct BloomStorageBool {

    counters: [u8; ARRAY_SIZE / 8],

impl BloomStorage for BloomStorageBool {

    #[inline]

    fn adjust_slot(&mut self, index: usize, increment: bool) {

        let bit = 1 << (index % 8);

        let byte = &mut self.counters[index / 8];

        // Since we have only one bit for storage, decrementing it

        // should never do anything.  Assert against an accidental

        // decrementing of a bit that was never set.

        assert!(

            increment || (*byte & bit) != 0,

            "should not decrement if slot is already false"

);

        if increment {

            *byte |= bit;

    #[inline]

    fn slot_is_empty(&self, index: usize) -> bool {

        let bit = 1 << (index % 8);

        (self.counters[index / 8] & bit) == 0

    #[inline]

    fn is_zeroed(&self) -> bool {

        self.counters.iter().all(|x| *x == 0)

impl Default for BloomStorageBool {

    fn default() -> Self {

        BloomStorageBool {

            counters: [0; ARRAY_SIZE / 8],

impl Clone for BloomStorageBool {

    fn clone(&self) -> Self {

        BloomStorageBool {

            counters: self.counters,

#[inline]

fn hash1(hash: u32) -> u32 {

    hash & KEY_MASK

#[inline]

fn hash2(hash: u32) -> u32 {

    (hash >> KEY_SIZE) & KEY_MASK

#[test]

fn create_and_insert_some_stuff() {

    use fxhash::FxHasher;

    use std::hash::{Hash, Hasher};

    use std::mem::transmute;

    fn hash_as_str(i: usize) -> u32 {

        let mut hasher = FxHasher::default();

        let s = i.to_string();

        s.hash(&mut hasher);

        let hash: u64 = hasher.finish();

        (hash >> 32) as u32 ^ (hash as u32)

    let mut bf = BloomFilter::new();

    // Statically assert that ARRAY_SIZE is a multiple of 8, which

    // BloomStorageBool relies on.

    unsafe {

        transmute::<[u8; ARRAY_SIZE % 8], [u8; 0]>([]);

    for i in 0_usize..1000 {

        bf.insert_hash(hash_as_str(i));

    for i in 0_usize..1000 {

        assert!(bf.might_contain_hash(hash_as_str(i)));

    let false_positives = (1001_usize..2000)

        .filter(|i| bf.might_contain_hash(hash_as_str(*i)))

        .count();

    assert!(false_positives < 190, "{} is not < 190", false_positives); // 19%.

    for i in 0_usize..100 {

        bf.remove_hash(hash_as_str(i));

    for i in 100_usize..1000 {

        assert!(bf.might_contain_hash(hash_as_str(i)));

    let false_positives = (0_usize..100)

        .filter(|i| bf.might_contain_hash(hash_as_str(*i)))

        .count();

    assert!(false_positives < 20, "{} is not < 20", false_positives); // 20%.

    bf.clear();

    for i in 0_usize..2000 {

        assert!(!bf.might_contain_hash(hash_as_str(i)));

#[cfg(feature = "bench")]

#[cfg(test)]

mod bench {

    extern crate test;

    use super::BloomFilter;

    #[derive(Default)]

    struct HashGenerator(u32);

    impl HashGenerator {

        fn next(&mut self) -> u32 {

            // Each hash is split into two twelve-bit segments, which are used

            // as an index into an array. We increment each by 64 so that we

            // hit the next cache line, and then another 1 so that our wrapping

            // behavior leads us to different entries each time.

//

            // Trying to simulate cold caches is rather difficult with the cargo

            // benchmarking setup, so it may all be moot depending on the number

            // of iterations that end up being run. But we might as well.

            self.0 += (65) + (65 << super::KEY_SIZE);

            self.0

    #[bench]

    fn create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher) {

        b.iter(|| {

            let mut gen1 = HashGenerator::default();

            let mut gen2 = HashGenerator::default();

            let mut bf = BloomFilter::new();

            for _ in 0_usize..1000 {

                bf.insert_hash(gen1.next());

            for _ in 0_usize..100 {

                bf.remove_hash(gen2.next());

            for _ in 100_usize..200 {

                test::black_box(bf.might_contain_hash(gen2.next()));

});

    #[bench]

    fn might_contain_10(b: &mut test::Bencher) {

        let bf = BloomFilter::new();

        let mut gen = HashGenerator::default();

        b.iter(|| {

            for _ in 0..10 {

                test::black_box(bf.might_contain_hash(gen.next()));

});

    #[bench]

    fn clear(b: &mut test::Bencher) {

        let mut bf = Box::new(BloomFilter::new());

        b.iter(|| test::black_box(&mut bf).clear());

    #[bench]

    fn insert_10(b: &mut test::Bencher) {

        let mut bf = BloomFilter::new();

        let mut gen = HashGenerator::default();

        b.iter(|| {

            for _ in 0..10 {

                test::black_box(bf.insert_hash(gen.next()));

});

    #[bench]

    fn remove_10(b: &mut test::Bencher) {

        let mut bf = BloomFilter::new();

        let mut gen = HashGenerator::default();

        // Note: this will underflow, and that's ok.

        b.iter(|| {

            for _ in 0..10 {

                bf.remove_hash(gen.next())

});