Source code
Revision control
Copy as Markdown
Other Tools
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
use criterion::{black_box, Criterion, Throughput};
use smallvec::SmallVec;
//use detone::IterDecomposeVietnamese;
// Number of UTF-16 code units in each generated benchmark input.
// 2048 `u16` values occupy 4096 bytes, which fits on one 4KB memory page.
// That maximizes the length of the run the measurement is averaged over
// without introducing cross-page effects.
const INPUT_SIZE: usize = 2048;
/// Builds `INPUT_SIZE` UTF-16 code units of NFC-normalized benchmark input.
///
/// The characters of `s` are repeated endlessly, fed through the NFC
/// normalizer, and the first `INPUT_SIZE` normalized characters are kept,
/// each stored as a single `u16`.
///
/// # Panics
///
/// Panics if a normalized character lies above U+FFFF, since such a
/// character cannot be stored in one `u16`.
fn generate_bmp_input_nfc(s: &str) -> Vec<u16> {
    ComposingNormalizerBorrowed::new_nfc()
        .normalize_iter(s.chars().cycle())
        .take(INPUT_SIZE)
        .map(|ch| {
            let scalar = u32::from(ch);
            // Guard: the benchmark data is expected to contain BMP characters only.
            if scalar > 0xFFFF {
                unreachable!("Data should stay on the BMP!");
            }
            scalar as u16
        })
        .collect()
}
/// Builds `INPUT_SIZE` UTF-16 code units of NFD-normalized benchmark input.
///
/// The characters of `s` are repeated endlessly, fed through the NFD
/// normalizer, and the first `INPUT_SIZE` normalized characters are kept,
/// each stored as a single `u16`.
///
/// # Panics
///
/// Panics if a normalized character lies above U+FFFF, since such a
/// character cannot be stored in one `u16`.
fn generate_bmp_input_nfd(s: &str) -> Vec<u16> {
    DecomposingNormalizerBorrowed::new_nfd()
        .normalize_iter(s.chars().cycle())
        .take(INPUT_SIZE)
        .map(|ch| {
            let scalar = u32::from(ch);
            // Guard: the benchmark data is expected to contain BMP characters only.
            if scalar > 0xFFFF {
                unreachable!("Data should stay on the BMP!");
            }
            scalar as u16
        })
        .collect()
}
/// Removes header lines and replaces line breaks with spaces.
///
/// Every line of `content` that starts with `#` is treated as a header and
/// dropped. The remaining lines are concatenated with a single ASCII space
/// between consecutive lines.
///
/// Do not use for languages that don't use spaces!
fn prepare_file_contents(content: &str) -> String {
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        // Join the borrowed lines directly; copying each line into its own
        // `String` first would only add one allocation per line.
        .collect::<Vec<&str>>()
        .join(" ")
}
/// Passes a `u16` slice through `black_box` and hands the same slice back.
///
/// Callers in this file pass a borrowed `Vec<u16>`; the parameter type makes
/// the value that goes through `black_box` a plain slice.
fn slice_as_slice(s: &[u16]) -> &[u16] {
    black_box::<&[u16]>(s)
}
/// Registers the UTF-16 normalization throughput benchmarks for one text sample.
///
/// `name` is appended to the Criterion group names and `data` is the sample
/// text from which the NFC and NFD inputs are generated. Two groups are
/// created, `utf16_throughput_nfc_<name>` and `utf16_throughput_nfd_<name>`.
/// Each group contains a "read" benchmark, which calls
/// `split_normalized_utf16` on the group's input with the matching
/// normalizer, and two benchmarks that normalize that input to NFC and to NFD.
fn bench_lang(name: &str, data: &str, c: &mut Criterion) {
    let input_nfc = generate_bmp_input_nfc(data);
    let input_nfd = generate_bmp_input_nfd(data);

    let nfc = ComposingNormalizerBorrowed::new_nfc();
    let nfd = DecomposingNormalizerBorrowed::new_nfd();

    // Output buffer shared by all writing benchmarks. Appending to it is
    // infallible (does not return `Err`), and its inline capacity is sized to
    // be large enough not to actually take the heap allocation path.
    let mut output: SmallVec<[u16; INPUT_SIZE * 2]> = SmallVec::new();

    // Benchmarks whose input is already in NFC.
    {
        let mut group = c.benchmark_group(format!("utf16_throughput_nfc_{}", name));
        let unit_count = input_nfc.len() as u64;
        group.throughput(Throughput::Elements(unit_count));

        group.bench_function("read", |bencher| {
            bencher.iter(|| {
                let split = nfc.split_normalized_utf16(slice_as_slice(&input_nfc));
                let _ = black_box(split.0.len());
            })
        });

        group.bench_function("writing_to_nfc", |bencher| {
            bencher.iter(|| {
                // Clearing should be trivial and OK to do inside the measured closure.
                output.clear();
                let outcome =
                    nfc.normalize_utf16_to(slice_as_slice(&input_nfc), black_box(&mut output));
                let _ = black_box(outcome);
            })
        });

        group.bench_function("writing_to_nfd", |bencher| {
            bencher.iter(|| {
                // Clearing should be trivial and OK to do inside the measured closure.
                output.clear();
                let outcome =
                    nfd.normalize_utf16_to(slice_as_slice(&input_nfc), black_box(&mut output));
                let _ = black_box(outcome);
            })
        });

        group.finish();
    }

    // Benchmarks whose input is already in NFD.
    {
        let mut group = c.benchmark_group(format!("utf16_throughput_nfd_{}", name));
        let unit_count = input_nfd.len() as u64;
        group.throughput(Throughput::Elements(unit_count));

        group.bench_function("read", |bencher| {
            bencher.iter(|| {
                let split = nfd.split_normalized_utf16(slice_as_slice(&input_nfd));
                let _ = black_box(split.0.len());
            })
        });

        group.bench_function("writing_to_nfd", |bencher| {
            bencher.iter(|| {
                // Clearing should be trivial and OK to do inside the measured closure.
                output.clear();
                let outcome =
                    nfd.normalize_utf16_to(slice_as_slice(&input_nfd), black_box(&mut output));
                let _ = black_box(outcome);
            })
        });

        group.bench_function("writing_to_nfc", |bencher| {
            bencher.iter(|| {
                // Clearing should be trivial and OK to do inside the measured closure.
                output.clear();
                let outcome =
                    nfc.normalize_utf16_to(slice_as_slice(&input_nfd), black_box(&mut output));
                let _ = black_box(outcome);
            })
        });

        group.finish();
    }
}
// Sample texts handed to `bench_lang` by `criterion_benchmark`.
//
// `EL`, `FR` and `VI` are embedded from data files at compile time and are
// run through `prepare_file_contents` before benchmarking; `EN` is an inline
// literal that is benchmarked as-is.
static EL: &str = include_str!("./data/TestRandomWordsUDHR_el.txt");
static EN: &str = "The ICU4X normalizer is an implementation of Unicode Normalization Forms. ";
static FR: &str = include_str!("./data/TestRandomWordsUDHR_fr.txt");
static VI: &str = include_str!("./data/wotw.txt");
// Inline sample benchmarked under the "zh" label; it is passed to
// `bench_lang` as-is, without going through `prepare_file_contents`.
static ZH: &str = "單父人呂公善沛令,辟仇,從之客,因家焉。沛中豪傑吏聞令有重客,皆往賀。";
// If you replace the `ZH` text, be sure not to include ASCII spaces, and be
// sure that its punctuation uses the code points that are actually used for
// punctuation in Chinese.
// TODO: Add:
// * Japanese with realistic proportion of kana voicing marks
// * Korean, since Hangul is special-cased in the normalizer
// * Kannada or some other non-Korean BMP language that uses
// backward-combining starters (with realistic proportion of such
// characters).
// * Chakma or some other living non-BMP language.
// * Vietnamese in the orthographic form (i.e. as produced by
// the official non-IME keyboard layout that's less common
// than the NFC-producing IME.)
/// Runs `bench_lang` once per language sample, in the order el, en, fr, vi, zh.
///
/// The file-backed samples (`EL`, `FR`, `VI`) are first passed through
/// `prepare_file_contents`; the inline samples (`EN`, `ZH`) are used as they
/// are.
pub fn criterion_benchmark(c: &mut Criterion) {
    // The `&String` temporaries coerce to `&str` at the call site.
    bench_lang("el", &prepare_file_contents(EL), c);
    bench_lang("en", EN, c);
    bench_lang("fr", &prepare_file_contents(FR), c);
    bench_lang("vi", &prepare_file_contents(VI), c);
    bench_lang("zh", ZH, c);
}