Source code

Revision control

Other Tools

1
/* This Source Code Form is subject to the terms of the Mozilla Public
2
* License, v. 2.0. If a copy of the MPL was not distributed with this
3
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
4
5
//! Counting and non-counting Bloom filters tuned for use as ancestor filters
6
//! for selector matching.
7
8
use std::fmt::{self, Debug};
9
10
// The top 8 bits of the 32-bit hash value are not used by the bloom filter.
11
// Consumers may rely on this to pack hashes more efficiently.
12
pub const BLOOM_HASH_MASK: u32 = 0x00ffffff;
13
const KEY_SIZE: usize = 12;
14
15
const ARRAY_SIZE: usize = 1 << KEY_SIZE;
16
const KEY_MASK: u32 = (1 << KEY_SIZE) - 1;
17
18
/// A counting Bloom filter with 8-bit counters.
19
pub type BloomFilter = CountingBloomFilter<BloomStorageU8>;
20
21
/// A counting Bloom filter with parameterized storage to handle
22
/// counters of different sizes. For now we assume that having two hash
23
/// functions is enough, but we may revisit that decision later.
24
///
25
/// The filter uses an array with 2**KeySize entries.
26
///
27
/// Assuming a well-distributed hash function, a Bloom filter with
28
/// array size M containing N elements and
29
/// using k hash function has expected false positive rate exactly
30
///
31
/// $ (1 - (1 - 1/M)^{kN})^k $
32
///
33
/// because each array slot has a
34
///
35
/// $ (1 - 1/M)^{kN} $
36
///
37
/// chance of being 0, and the expected false positive rate is the
38
/// probability that all of the k hash functions will hit a nonzero
39
/// slot.
40
///
41
/// For reasonable assumptions (M large, kN large, which should both
42
/// hold if we're worried about false positives) about M and kN this
43
/// becomes approximately
44
///
45
/// $$ (1 - \exp(-kN/M))^k $$
46
///
47
/// For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
48
/// or in other words
49
///
50
/// $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$
51
///
52
/// where r is the false positive rate. This can be used to compute
53
/// the desired KeySize for a given load N and false positive rate r.
54
///
55
/// If N/M is assumed small, then the false positive rate can
56
/// further be approximated as 4*N^2/M^2. So increasing KeySize by
57
/// 1, which doubles M, reduces the false positive rate by about a
58
/// factor of 4, and a false positive rate of 1% corresponds to
59
/// about M/N == 20.
60
///
61
/// What this means in practice is that for a few hundred keys using a
62
/// KeySize of 12 gives false positive rates on the order of 0.25-4%.
63
///
64
/// Similarly, using a KeySize of 10 would lead to a 4% false
65
/// positive rate for N == 100 and to quite bad false positive
66
/// rates for larger N.
67
#[derive(Clone, Default)]
68
pub struct CountingBloomFilter<S>
69
where
70
S: BloomStorage,
71
{
72
storage: S,
73
}
74
75
impl<S> CountingBloomFilter<S>
76
where
77
S: BloomStorage,
78
{
79
/// Creates a new bloom filter.
80
#[inline]
81
pub fn new() -> Self {
82
Default::default()
83
}
84
85
#[inline]
86
pub fn clear(&mut self) {
87
self.storage = Default::default();
88
}
89
90
// Slow linear accessor to make sure the bloom filter is zeroed. This should
91
// never be used in release builds.
92
#[cfg(debug_assertions)]
93
pub fn is_zeroed(&self) -> bool {
94
self.storage.is_zeroed()
95
}
96
97
#[cfg(not(debug_assertions))]
98
pub fn is_zeroed(&self) -> bool {
99
unreachable!()
100
}
101
102
/// Inserts an item with a particular hash into the bloom filter.
103
#[inline]
104
pub fn insert_hash(&mut self, hash: u32) {
105
self.storage.adjust_first_slot(hash, true);
106
self.storage.adjust_second_slot(hash, true);
107
}
108
109
/// Removes an item with a particular hash from the bloom filter.
110
#[inline]
111
pub fn remove_hash(&mut self, hash: u32) {
112
self.storage.adjust_first_slot(hash, false);
113
self.storage.adjust_second_slot(hash, false);
114
}
115
116
/// Check whether the filter might contain an item with the given hash.
117
/// This can sometimes return true even if the item is not in the filter,
118
/// but will never return false for items that are actually in the filter.
119
#[inline]
120
pub fn might_contain_hash(&self, hash: u32) -> bool {
121
!self.storage.first_slot_is_empty(hash) && !self.storage.second_slot_is_empty(hash)
122
}
123
}
124
125
impl<S> Debug for CountingBloomFilter<S>
126
where
127
S: BloomStorage,
128
{
129
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
130
let mut slots_used = 0;
131
for i in 0..ARRAY_SIZE {
132
if !self.storage.slot_is_empty(i) {
133
slots_used += 1;
134
}
135
}
136
write!(f, "BloomFilter({}/{})", slots_used, ARRAY_SIZE)
137
}
138
}
139
140
pub trait BloomStorage: Clone + Default {
141
fn slot_is_empty(&self, index: usize) -> bool;
142
fn adjust_slot(&mut self, index: usize, increment: bool);
143
fn is_zeroed(&self) -> bool;
144
145
#[inline]
146
fn first_slot_is_empty(&self, hash: u32) -> bool {
147
self.slot_is_empty(Self::first_slot_index(hash))
148
}
149
150
#[inline]
151
fn second_slot_is_empty(&self, hash: u32) -> bool {
152
self.slot_is_empty(Self::second_slot_index(hash))
153
}
154
155
#[inline]
156
fn adjust_first_slot(&mut self, hash: u32, increment: bool) {
157
self.adjust_slot(Self::first_slot_index(hash), increment)
158
}
159
160
#[inline]
161
fn adjust_second_slot(&mut self, hash: u32, increment: bool) {
162
self.adjust_slot(Self::second_slot_index(hash), increment)
163
}
164
165
#[inline]
166
fn first_slot_index(hash: u32) -> usize {
167
hash1(hash) as usize
168
}
169
170
#[inline]
171
fn second_slot_index(hash: u32) -> usize {
172
hash2(hash) as usize
173
}
174
}
175
176
/// Storage class for a CountingBloomFilter that has 8-bit counters.
177
pub struct BloomStorageU8 {
178
counters: [u8; ARRAY_SIZE],
179
}
180
181
impl BloomStorage for BloomStorageU8 {
182
#[inline]
183
fn adjust_slot(&mut self, index: usize, increment: bool) {
184
let slot = &mut self.counters[index];
185
if *slot != 0xff {
186
// full
187
if increment {
188
*slot += 1;
189
} else {
190
*slot -= 1;
191
}
192
}
193
}
194
195
#[inline]
196
fn slot_is_empty(&self, index: usize) -> bool {
197
self.counters[index] == 0
198
}
199
200
#[inline]
201
fn is_zeroed(&self) -> bool {
202
self.counters.iter().all(|x| *x == 0)
203
}
204
}
205
206
impl Default for BloomStorageU8 {
207
fn default() -> Self {
208
BloomStorageU8 {
209
counters: [0; ARRAY_SIZE],
210
}
211
}
212
}
213
214
impl Clone for BloomStorageU8 {
215
fn clone(&self) -> Self {
216
BloomStorageU8 {
217
counters: self.counters,
218
}
219
}
220
}
221
222
/// Storage class for a CountingBloomFilter that has 1-bit counters.
223
pub struct BloomStorageBool {
224
counters: [u8; ARRAY_SIZE / 8],
225
}
226
227
impl BloomStorage for BloomStorageBool {
228
#[inline]
229
fn adjust_slot(&mut self, index: usize, increment: bool) {
230
let bit = 1 << (index % 8);
231
let byte = &mut self.counters[index / 8];
232
233
// Since we have only one bit for storage, decrementing it
234
// should never do anything. Assert against an accidental
235
// decrementing of a bit that was never set.
236
assert!(
237
increment || (*byte & bit) != 0,
238
"should not decrement if slot is already false"
239
);
240
241
if increment {
242
*byte |= bit;
243
}
244
}
245
246
#[inline]
247
fn slot_is_empty(&self, index: usize) -> bool {
248
let bit = 1 << (index % 8);
249
(self.counters[index / 8] & bit) == 0
250
}
251
252
#[inline]
253
fn is_zeroed(&self) -> bool {
254
self.counters.iter().all(|x| *x == 0)
255
}
256
}
257
258
impl Default for BloomStorageBool {
259
fn default() -> Self {
260
BloomStorageBool {
261
counters: [0; ARRAY_SIZE / 8],
262
}
263
}
264
}
265
266
impl Clone for BloomStorageBool {
267
fn clone(&self) -> Self {
268
BloomStorageBool {
269
counters: self.counters,
270
}
271
}
272
}
273
274
#[inline]
275
fn hash1(hash: u32) -> u32 {
276
hash & KEY_MASK
277
}
278
279
#[inline]
280
fn hash2(hash: u32) -> u32 {
281
(hash >> KEY_SIZE) & KEY_MASK
282
}
283
284
#[test]
285
fn create_and_insert_some_stuff() {
286
use fxhash::FxHasher;
287
use std::hash::{Hash, Hasher};
288
use std::mem::transmute;
289
290
fn hash_as_str(i: usize) -> u32 {
291
let mut hasher = FxHasher::default();
292
let s = i.to_string();
293
s.hash(&mut hasher);
294
let hash: u64 = hasher.finish();
295
(hash >> 32) as u32 ^ (hash as u32)
296
}
297
298
let mut bf = BloomFilter::new();
299
300
// Statically assert that ARRAY_SIZE is a multiple of 8, which
301
// BloomStorageBool relies on.
302
unsafe {
303
transmute::<[u8; ARRAY_SIZE % 8], [u8; 0]>([]);
304
}
305
306
for i in 0_usize..1000 {
307
bf.insert_hash(hash_as_str(i));
308
}
309
310
for i in 0_usize..1000 {
311
assert!(bf.might_contain_hash(hash_as_str(i)));
312
}
313
314
let false_positives = (1001_usize..2000)
315
.filter(|i| bf.might_contain_hash(hash_as_str(*i)))
316
.count();
317
318
assert!(false_positives < 190, "{} is not < 190", false_positives); // 19%.
319
320
for i in 0_usize..100 {
321
bf.remove_hash(hash_as_str(i));
322
}
323
324
for i in 100_usize..1000 {
325
assert!(bf.might_contain_hash(hash_as_str(i)));
326
}
327
328
let false_positives = (0_usize..100)
329
.filter(|i| bf.might_contain_hash(hash_as_str(*i)))
330
.count();
331
332
assert!(false_positives < 20, "{} is not < 20", false_positives); // 20%.
333
334
bf.clear();
335
336
for i in 0_usize..2000 {
337
assert!(!bf.might_contain_hash(hash_as_str(i)));
338
}
339
}
340
341
#[cfg(feature = "bench")]
342
#[cfg(test)]
343
mod bench {
344
extern crate test;
345
use super::BloomFilter;
346
347
#[derive(Default)]
348
struct HashGenerator(u32);
349
350
impl HashGenerator {
351
fn next(&mut self) -> u32 {
352
// Each hash is split into two twelve-bit segments, which are used
353
// as an index into an array. We increment each by 64 so that we
354
// hit the next cache line, and then another 1 so that our wrapping
355
// behavior leads us to different entries each time.
356
//
357
// Trying to simulate cold caches is rather difficult with the cargo
358
// benchmarking setup, so it may all be moot depending on the number
359
// of iterations that end up being run. But we might as well.
360
self.0 += (65) + (65 << super::KEY_SIZE);
361
self.0
362
}
363
}
364
365
#[bench]
366
fn create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher) {
367
b.iter(|| {
368
let mut gen1 = HashGenerator::default();
369
let mut gen2 = HashGenerator::default();
370
let mut bf = BloomFilter::new();
371
for _ in 0_usize..1000 {
372
bf.insert_hash(gen1.next());
373
}
374
for _ in 0_usize..100 {
375
bf.remove_hash(gen2.next());
376
}
377
for _ in 100_usize..200 {
378
test::black_box(bf.might_contain_hash(gen2.next()));
379
}
380
});
381
}
382
383
#[bench]
384
fn might_contain_10(b: &mut test::Bencher) {
385
let bf = BloomFilter::new();
386
let mut gen = HashGenerator::default();
387
b.iter(|| {
388
for _ in 0..10 {
389
test::black_box(bf.might_contain_hash(gen.next()));
390
}
391
});
392
}
393
394
#[bench]
395
fn clear(b: &mut test::Bencher) {
396
let mut bf = Box::new(BloomFilter::new());
397
b.iter(|| test::black_box(&mut bf).clear());
398
}
399
400
#[bench]
401
fn insert_10(b: &mut test::Bencher) {
402
let mut bf = BloomFilter::new();
403
let mut gen = HashGenerator::default();
404
b.iter(|| {
405
for _ in 0..10 {
406
test::black_box(bf.insert_hash(gen.next()));
407
}
408
});
409
}
410
411
#[bench]
412
fn remove_10(b: &mut test::Bencher) {
413
let mut bf = BloomFilter::new();
414
let mut gen = HashGenerator::default();
415
// Note: this will underflow, and that's ok.
416
b.iter(|| {
417
for _ in 0..10 {
418
bf.remove_hash(gen.next())
419
}
420
});
421
}
422
}