linear.rs - mozsearch

Enable keyboard shortcuts

// This Source Code Form is subject to the terms of the Mozilla Public

// License, v. 2.0. If a copy of the MPL was not distributed with this

// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::cmp;

use std::collections::HashMap;

use once_cell::sync::OnceCell;

use serde::{Deserialize, Serialize};

use super::{Bucketing, Histogram};

/// Create the possible ranges in a linear distribution from `min` to `max` with

/// `bucket_count` buckets.

///

/// This algorithm calculates `bucket_count` number of buckets of equal sizes between `min` and `max`.

///

/// Bucket limits are the minimal bucket value.

/// That means values in a bucket `i` are `bucket[i] <= value < bucket[i+1]`.

/// It will always contain an underflow bucket (`< 1`).

fn linear_range(min: u64, max: u64, count: usize) -> Vec<u64> {

    let mut ranges = Vec::with_capacity(count);

    ranges.push(0);

    let min = cmp::max(1, min);

    let count = count as u64;

    for i in 1..count {

        let range = (min * (count - 1 - i) + max * (i - 1)) / (count - 2);

        ranges.push(range);

    ranges

/// A linear bucketing algorithm.

///

/// Buckets are pre-computed at instantiation with a linear  distribution from `min` to `max`

/// and `bucket_count` buckets.

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]

pub struct PrecomputedLinear {

    // Don't serialize the (potentially large) array of ranges, instead compute them on first

    // access.

    #[serde(skip)]

    pub(crate) bucket_ranges: OnceCell<Vec<u64>>,

    pub(crate) min: u64,

    pub(crate) max: u64,

    pub(crate) bucket_count: usize,

impl Bucketing for PrecomputedLinear {

    /// Get the bucket for the sample.

///

    /// This uses a binary search to locate the index `i` of the bucket such that:

    /// bucket[i] <= sample < bucket[i+1]

    fn sample_to_bucket_minimum(&self, sample: u64) -> u64 {

        let limit = match self.ranges().binary_search(&sample) {

            // Found an exact match to fit it in

            Ok(i) => i,

            // Sorted it fits after the bucket's limit, therefore it fits into the previous bucket

            Err(i) => i - 1,

};

        self.ranges()[limit]

    fn ranges(&self) -> &[u64] {

        // Create the linear range on first access.

        self.bucket_ranges

            .get_or_init(|| linear_range(self.min, self.max, self.bucket_count))

impl Histogram<PrecomputedLinear> {

    /// Creates a histogram with `bucket_count` linear buckets in the range `min` to `max`.

    pub fn linear(min: u64, max: u64, bucket_count: usize) -> Histogram<PrecomputedLinear> {

        Histogram {

            values: HashMap::new(),

            count: 0,

            sum: 0,

            bucketing: PrecomputedLinear {

                bucket_ranges: OnceCell::new(),

                min,

                max,

                bucket_count,

},

#[cfg(test)]

mod test {

    use super::*;

    const DEFAULT_BUCKET_COUNT: usize = 100;

    const DEFAULT_RANGE_MIN: u64 = 0;

    const DEFAULT_RANGE_MAX: u64 = 100;

    #[test]

    fn can_count() {

        let mut hist = Histogram::linear(1, 500, 10);

        assert!(hist.is_empty());

        for i in 1..=10 {

            hist.accumulate(i);

        assert_eq!(10, hist.count());

        assert_eq!(55, hist.sum());

    #[test]

    fn overflow_values_accumulate_in_the_last_bucket() {

        let mut hist =

            Histogram::linear(DEFAULT_RANGE_MIN, DEFAULT_RANGE_MAX, DEFAULT_BUCKET_COUNT);

        hist.accumulate(DEFAULT_RANGE_MAX + 100);

        assert_eq!(1, hist.values[&DEFAULT_RANGE_MAX]);

    #[test]

    fn short_linear_buckets_are_correct() {

        let test_buckets = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 10];

        assert_eq!(test_buckets, linear_range(1, 10, 10));

        // There's always a zero bucket, so we increase the lower limit.

        assert_eq!(test_buckets, linear_range(0, 10, 10));

    #[test]

    fn long_linear_buckets_are_correct() {

        // Hand calculated values using current default range 0 - 60000 and bucket count of 100.

        // NOTE: The final bucket, regardless of width, represents the overflow bucket to hold any

        // values beyond the maximum (in this case the maximum is 60000)

        let test_buckets = vec![

            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,

            24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,

            46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,

            68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,

            90, 91, 92, 93, 94, 95, 96, 97, 98, 100,

];

        assert_eq!(

            test_buckets,

            linear_range(DEFAULT_RANGE_MIN, DEFAULT_RANGE_MAX, DEFAULT_BUCKET_COUNT)

);

    #[test]

    fn default_buckets_correctly_accumulate() {

        let mut hist =

            Histogram::linear(DEFAULT_RANGE_MIN, DEFAULT_RANGE_MAX, DEFAULT_BUCKET_COUNT);

        for i in &[1, 10, 100, 1000, 10000] {

            hist.accumulate(*i);

        assert_eq!(11111, hist.sum());

        assert_eq!(5, hist.count());

        assert_eq!(None, hist.values.get(&0));

        assert_eq!(1, hist.values[&1]);

        assert_eq!(1, hist.values[&10]);

        assert_eq!(3, hist.values[&100]);

    #[test]

    fn accumulate_large_numbers() {

        let mut hist = Histogram::linear(1, 500, 10);

        hist.accumulate(u64::MAX);

        hist.accumulate(u64::MAX);

        assert_eq!(2, hist.count());

        // Saturate before overflowing

        assert_eq!(u64::MAX, hist.sum());

        assert_eq!(2, hist.values[&500]);