exponential.rs - mozsearch

Enable keyboard shortcuts

// This Source Code Form is subject to the terms of the Mozilla Public

// License, v. 2.0. If a copy of the MPL was not distributed with this

// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::HashMap;

use once_cell::sync::OnceCell;

use serde::{Deserialize, Serialize};

use super::{Bucketing, Histogram};

use crate::util::floating_point_context::FloatingPointContext;

/// Create the possible ranges in an exponential distribution from `min` to `max` with

/// `bucket_count` buckets.

///

/// This algorithm calculates the bucket sizes using a natural log approach to get `bucket_count` number of buckets,

/// exponentially spaced between `min` and `max`

///

/// Bucket limits are the minimal bucket value.

/// That means values in a bucket `i` are `bucket[i] <= value < bucket[i+1]`.

/// It will always contain an underflow bucket (`< 1`).

fn exponential_range(min: u64, max: u64, bucket_count: usize) -> Vec<u64> {

    // Set the FPU control flag to the required state within this function

    let _fpc = FloatingPointContext::new();

    let log_max = (max as f64).ln();

    let mut ranges = Vec::with_capacity(bucket_count);

    let mut current = min;

    if current == 0 {

        current = 1;

    // undeflow bucket

    ranges.push(0);

    ranges.push(current);

    for i in 2..bucket_count {

        let log_current = (current as f64).ln();

        let log_ratio = (log_max - log_current) / (bucket_count - i) as f64;

        let log_next = log_current + log_ratio;

        let next_value = log_next.exp().round() as u64;

        current = if next_value > current {

            next_value

        } else {

            current + 1

};

        ranges.push(current);

    ranges

/// An exponential bucketing algorithm.

///

/// Buckets are pre-computed at instantiation with an exponential distribution from `min` to `max`

/// and `bucket_count` buckets.

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]

pub struct PrecomputedExponential {

    // Don't serialize the (potentially large) array of ranges, instead compute them on first

    // access.

    #[serde(skip)]

    pub(crate) bucket_ranges: OnceCell<Vec<u64>>,

    pub(crate) min: u64,

    pub(crate) max: u64,

    pub(crate) bucket_count: usize,

impl Bucketing for PrecomputedExponential {

    /// Get the bucket for the sample.

///

    /// This uses a binary search to locate the index `i` of the bucket such that:

    /// bucket[i] <= sample < bucket[i+1]

    fn sample_to_bucket_minimum(&self, sample: u64) -> u64 {

        let limit = match self.ranges().binary_search(&sample) {

            // Found an exact match to fit it in

            Ok(i) => i,

            // Sorted it fits after the bucket's limit, therefore it fits into the previous bucket

            Err(i) => i - 1,

};

        self.ranges()[limit]

    fn ranges(&self) -> &[u64] {

        // Create the exponential range on first access.

        self.bucket_ranges

            .get_or_init(|| exponential_range(self.min, self.max, self.bucket_count))

impl Histogram<PrecomputedExponential> {

    /// Creates a histogram with `count` exponential buckets in the range `min` to `max`.

    pub fn exponential(

        min: u64,

        max: u64,

        bucket_count: usize,

    ) -> Histogram<PrecomputedExponential> {

        Histogram {

            values: HashMap::new(),

            count: 0,

            sum: 0,

            bucketing: PrecomputedExponential {

                bucket_ranges: OnceCell::new(),

                min,

                max,

                bucket_count,

},

#[cfg(test)]

mod test {

    use super::*;

    const DEFAULT_BUCKET_COUNT: usize = 100;

    const DEFAULT_RANGE_MIN: u64 = 0;

    const DEFAULT_RANGE_MAX: u64 = 60_000;

    #[test]

    fn can_count() {

        let mut hist = Histogram::exponential(1, 500, 10);

        assert!(hist.is_empty());

        for i in 1..=10 {

            hist.accumulate(i);

        assert_eq!(10, hist.count());

        assert_eq!(55, hist.sum());

    #[test]

    fn overflow_values_accumulate_in_the_last_bucket() {

        let mut hist =

            Histogram::exponential(DEFAULT_RANGE_MIN, DEFAULT_RANGE_MAX, DEFAULT_BUCKET_COUNT);

        hist.accumulate(DEFAULT_RANGE_MAX + 100);

        assert_eq!(1, hist.values[&DEFAULT_RANGE_MAX]);

    #[test]

    fn short_exponential_buckets_are_correct() {

        let test_buckets = vec![0, 1, 2, 3, 5, 9, 16, 29, 54, 100];

        assert_eq!(test_buckets, exponential_range(1, 100, 10));

        // There's always a zero bucket, so we increase the lower limit.

        assert_eq!(test_buckets, exponential_range(0, 100, 10));

    #[test]

    fn default_exponential_buckets_are_correct() {

        // Hand calculated values using current default range 0 - 60000 and bucket count of 100.

        // NOTE: The final bucket, regardless of width, represents the overflow bucket to hold any

        // values beyond the maximum (in this case the maximum is 60000)

        let test_buckets = vec![

            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 19, 21, 23, 25, 28, 31, 34,

            38, 42, 46, 51, 56, 62, 68, 75, 83, 92, 101, 111, 122, 135, 149, 164, 181, 200, 221,

            244, 269, 297, 328, 362, 399, 440, 485, 535, 590, 651, 718, 792, 874, 964, 1064, 1174,

            1295, 1429, 1577, 1740, 1920, 2118, 2337, 2579, 2846, 3140, 3464, 3822, 4217, 4653,

            5134, 5665, 6250, 6896, 7609, 8395, 9262, 10219, 11275, 12440, 13726, 15144, 16709,

            18436, 20341, 22443, 24762, 27321, 30144, 33259, 36696, 40488, 44672, 49288, 54381,

            60000,

];

        assert_eq!(

            test_buckets,

            exponential_range(DEFAULT_RANGE_MIN, DEFAULT_RANGE_MAX, DEFAULT_BUCKET_COUNT)

);

    #[test]

    fn default_buckets_correctly_accumulate() {

        let mut hist =

            Histogram::exponential(DEFAULT_RANGE_MIN, DEFAULT_RANGE_MAX, DEFAULT_BUCKET_COUNT);

        for i in &[1, 10, 100, 1000, 10000] {

            hist.accumulate(*i);

        assert_eq!(11111, hist.sum());

        assert_eq!(5, hist.count());

        assert_eq!(None, hist.values.get(&0)); // underflow is empty

        assert_eq!(1, hist.values[&1]); // bucket_ranges[1]  = 1

        assert_eq!(1, hist.values[&10]); // bucket_ranges[10] = 10

        assert_eq!(1, hist.values[&92]); // bucket_ranges[33] = 92

        assert_eq!(1, hist.values[&964]); // bucket_ranges[57] = 964

        assert_eq!(1, hist.values[&9262]); // bucket_ranges[80] = 9262

    #[test]

    fn accumulate_large_numbers() {

        let mut hist = Histogram::exponential(1, 500, 10);

        hist.accumulate(u64::MAX);

        hist.accumulate(u64::MAX);

        assert_eq!(2, hist.count());

        // Saturate before overflowing

        assert_eq!(u64::MAX, hist.sum());

        assert_eq!(2, hist.values[&500]);