// utf16_throughput.rs

// This file is part of ICU4X. For terms of use, please see the file

// called LICENSE at the top level of the ICU4X source tree

// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};

use criterion::{black_box, Criterion, Throughput};

use smallvec::SmallVec;

//use detone::IterDecomposeVietnamese;

// 2048 `u16` code units occupy 4096 bytes, which fits on one 4 KB memory
// page. This maximizes the length of the run to average over without
// introducing cross-page effects.

const INPUT_SIZE: usize = 2048;

fn generate_bmp_input_nfc(s: &str) -> Vec<u16> {

    ComposingNormalizerBorrowed::new_nfc()

        .normalize_iter(s.chars().cycle())

        .take(INPUT_SIZE)

        .map(|c| {

            if c <= '\u{FFFF}' {

                c as u16

            } else {

                unreachable!("Data should stay on the BMP!")

})

        .collect()

fn generate_bmp_input_nfd(s: &str) -> Vec<u16> {

    DecomposingNormalizerBorrowed::new_nfd()

        .normalize_iter(s.chars().cycle())

        .take(INPUT_SIZE)

        .map(|c| {

            if c <= '\u{FFFF}' {

                c as u16

            } else {

                unreachable!("Data should stay on the BMP!")

})

        .collect()

/// Removes header lines and replaces line breaks with spaces.
///
/// Every line that starts with `#` is treated as a header and dropped. The
/// remaining lines are joined with a single ASCII space, so an empty line in
/// the input yields two consecutive spaces in the output.
///
/// Do not use for languages that don't use spaces!
fn prepare_file_contents(content: &str) -> String {
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        // Joining borrowed slices avoids allocating one `String` per line.
        .collect::<Vec<&str>>()
        .join(" ")
}

/// Returns `s` unchanged after routing it through `black_box`.
///
/// The benchmarks wrap their input in this call so that the input slice is
/// passed through an optimization barrier before it reaches the normalizer.
fn slice_as_slice(s: &[u16]) -> &[u16] {
    black_box(s)
}

fn bench_lang(name: &str, data: &str, c: &mut Criterion) {

    let input_nfc = generate_bmp_input_nfc(data);

    let input_nfd = generate_bmp_input_nfd(data);

    let nfc = ComposingNormalizerBorrowed::new_nfc();

    let nfd = DecomposingNormalizerBorrowed::new_nfd();

    // Appending to this output is infallible (does not return `Err`) and

    // this is sized to be large enough not to actually take the the heap

    // allocation path.

    let mut output: SmallVec<[u16; INPUT_SIZE * 2]> = SmallVec::new();

        let mut group_name = "utf16_throughput_nfc_".to_string();

        group_name.push_str(name);

        let mut group = c.benchmark_group(&group_name);

        group.throughput(Throughput::Elements(input_nfc.len() as u64));

        group.bench_function("read", |b| {

            b.iter(|| {

                let _ = black_box(

                    nfc.split_normalized_utf16(slice_as_slice(&input_nfc))

.0

                        .len(),

);

})

});

        group.bench_function("writing_to_nfc", |b| {

            b.iter(|| {

                output.clear(); // Should be trivial and OK to do from within here.

                let _ = black_box(

                    nfc.normalize_utf16_to(slice_as_slice(&input_nfc), black_box(&mut output)),

);

})

});

        group.bench_function("writing_to_nfd", |b| {

            b.iter(|| {

                output.clear(); // Should be trivial and OK to do from within here.

                let _ = black_box(

                    nfd.normalize_utf16_to(slice_as_slice(&input_nfc), black_box(&mut output)),

);

})

});

        group.finish();

        let mut group_name = "utf16_throughput_nfd_".to_string();

        group_name.push_str(name);

        let mut group = c.benchmark_group(&group_name);

        group.throughput(Throughput::Elements(input_nfd.len() as u64));

        group.bench_function("read", |b| {

            b.iter(|| {

                let _ = black_box(

                    nfd.split_normalized_utf16(slice_as_slice(&input_nfd))

.0

                        .len(),

);

})

});

        group.bench_function("writing_to_nfd", |b| {

            b.iter(|| {

                output.clear(); // Should be trivial and OK to do from within here.

                let _ = black_box(

                    nfd.normalize_utf16_to(slice_as_slice(&input_nfd), black_box(&mut output)),

);

})

});

        group.bench_function("writing_to_nfc", |b| {

            b.iter(|| {

                output.clear(); // Should be trivial and OK to do from within here.

                let _ = black_box(

                    nfc.normalize_utf16_to(slice_as_slice(&input_nfd), black_box(&mut output)),

);

})

});

        group.finish();

/// Sample text for the "el" benchmark, embedded at compile time from the data directory.
static EL: &str = include_str!("./data/TestRandomWordsUDHR_el.txt");

/// Sample text for the "en" benchmark; used as-is, without header stripping.
static EN: &str = "The ICU4X normalizer is an implementation of Unicode Normalization Forms. ";

/// Sample text for the "fr" benchmark, embedded at compile time from the data directory.
static FR: &str = include_str!("./data/TestRandomWordsUDHR_fr.txt");

/// Sample text for the "vi" benchmark, embedded at compile time from the data directory.
static VI: &str = include_str!("./data/wotw.txt");

/// Sample text for the "zh" benchmark; used as-is, without header stripping.
static ZH: &str = "單父人呂公善沛令，辟仇，從之客，因家焉。沛中豪傑吏聞令有重客，皆往賀。";

// zh text from https://www.gutenberg.org/cache/epub/23841/pg23841.txt

// metadata at https://www.gutenberg.org/ebooks/23841

// If you replace this text, be sure not to include ASCII spaces and be sure

// to include punctuation using code points actually used for punctuation in

// Chinese.

// TODO: Add:

// * Japanese with realistic proportion of kana voicing marks

// * Korean, since Hangul is special-cased in the normalizer

// * Kannada or some other non-Korean BMP language that uses

//   backward-combining starters (with realistic proportion of such

//   characters).

// * Chakma or some other living non-BMP language.

// * Vietnamese in the orthographic form (i.e. as produced by

//   the official non-IME keyboard layout that's less common

//   than the NFC-producing IME.)

pub fn criterion_benchmark(c: &mut Criterion) {

    bench_lang("el", prepare_file_contents(EL).as_str(), c);

    bench_lang("en", EN, c);

    bench_lang("fr", prepare_file_contents(FR).as_str(), c);

    bench_lang("vi", prepare_file_contents(VI).as_str(), c);

    bench_lang("zh", ZH, c);