Source code
Revision control
Copy as Markdown
Other Tools
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use criterion::measurement::WallTime;
use criterion::{
black_box, criterion_group, criterion_main, BatchSize, BenchmarkGroup, BenchmarkId, Criterion,
};
use icu::collator::{options::*, *};
use icu::locale::locale;
use icu_locale_core::Locale;
// Split the baseline and collator benches into distinct `#inline(never)` functions to avoid
// collator layout changes from affecting the baseline!
#[inline(never)]
fn baseline_bench(group: &mut BenchmarkGroup<'_, WallTime>, file_name: &&str, elements: &[&str]) {
// baseline performance, locale-unaware code point sort done by Rust (0 for ordering in the html report)
group.bench_function(
BenchmarkId::new(format!("{}/0_rust_sort", file_name), "default"),
|bencher| {
bencher.iter_batched(
|| black_box(elements.to_vec()),
|mut lines| lines.sort_unstable(),
BatchSize::SmallInput,
)
},
);
}
#[inline(never)]
fn collator_bench(
group: &mut BenchmarkGroup<'_, WallTime>,
file_name: &&str,
elements: &[&str],
index: usize,
strength: Strength,
locale_under_bench: &Locale,
) {
let mut options = CollatorOptions::default();
options.strength = Some(strength);
let collator =
Collator::try_new(CollatorPreferences::from(locale_under_bench), options).unwrap();
// ICU4X collator performance, sort is locale-aware
group.bench_function(
BenchmarkId::new(
format!("{}/1_icu4x_sort", file_name),
format!("{}_{:?}", index, strength),
),
|bencher| {
bencher.iter_batched(
|| black_box(elements.to_vec()),
|mut lines| lines.sort_unstable_by(|left, right| collator.compare(left, right)),
BatchSize::SmallInput,
)
},
);
}
pub fn collator_with_locale(criterion: &mut Criterion) {
// Load file content in reverse order vector.
let content_photos: (&str, Vec<&str>) = (
"TestFilenames_Photos",
include_str!("data/TestFilenames_Photos.txt")
.lines()
.filter(|&s| !s.starts_with('#'))
.rev()
.collect::<Vec<&str>>(),
);
let content_latin: (&str, Vec<&str>) = (
"TestNames_Latin",
include_str!("data/TestNames_Latin.txt")
.lines()
.filter(|&s| !s.starts_with('#'))
.rev()
.collect::<Vec<&str>>(),
);
let content_asian: (&str, Vec<&str>) = (
"TestNames_Asian",
include_str!("data/TestNames_Asian.txt")
.lines()
.filter(|&s| !s.starts_with('#'))
.rev()
.collect(),
);
let content_russian: (&str, Vec<&str>) = (
"TestNames_Russian",
include_str!("data/TestNames_Russian.txt")
.lines()
.filter(|&s| !s.starts_with('#'))
.rev()
.collect(),
);
let content_chinese: (&str, Vec<&str>) = (
"TestNames_Chinese",
include_str!("data/TestNames_Chinese.txt")
.lines()
.filter(|&s| !s.starts_with('#'))
.rev()
.collect(),
);
let content_jp_h: (&str, Vec<&str>) = (
"TestNames_Japanese_h",
include_str!("data/TestNames_Japanese_h.txt")
.lines()
.filter(|&s| !s.starts_with('#'))
.rev()
.collect::<Vec<&str>>(),
);
let content_jp_k: (&str, Vec<&str>) = (
"TestNames_Japanese_k",
include_str!("data/TestNames_Japanese_k.txt")
.lines()
.filter(|&s| !s.starts_with('#'))
.rev()
.collect::<Vec<&str>>(),
);
let content_korean: (&str, Vec<&str>) = (
"TestNames_Korean",
include_str!("data/TestNames_Korean.txt")
.lines()
.filter(|&s| !s.starts_with('#'))
.rev()
.collect::<Vec<&str>>(),
);
let content_thai: (&str, Vec<&str>) = (
"TestNames_Thai",
include_str!("data/TestNames_Thai.txt")
.lines()
.filter(|&s| !s.starts_with('#'))
.rev()
.collect::<Vec<&str>>(),
);
// hsivonen@ : All five strengths are benched.
// The default is tertiary, so it makes sense to bench that.
//
// Also, it's particularly interesting to bench quaternary
// and identical with Japanese due to the performance remarks ICU4C docs make about Japanese.
//
// In particular, to get full Japanese standard behavior, you need the identical strength.
// Furthermore, CLDR used to default to quaternary for Japanese but now defaults to tertiary
// as for every other language for performance reasons.
let all_strength = [
Strength::Primary,
Strength::Secondary,
Strength::Tertiary,
Strength::Quaternary,
Strength::Identical,
];
let performance_parameters = [
(
locale!("en-US"),
vec![&content_latin, &content_photos],
&all_strength,
),
(
locale!("da-DK"),
vec![&content_latin, &content_photos],
&all_strength,
),
(locale!("fr-CA"), vec![&content_latin], &all_strength),
(
locale!("ja-JP"),
vec![&content_latin, &content_jp_h, &content_jp_k, &content_asian],
&all_strength,
),
(
locale!("zh-u-co-pinyin"),
vec![&content_latin, &content_chinese],
&all_strength,
), // zh_CN
(
locale!("zh-u-co-stroke"),
vec![&content_latin, &content_chinese],
&all_strength,
), // zh_TW
(
locale!("ru-RU"),
vec![&content_latin, &content_russian],
&all_strength,
),
(
locale!("th"),
vec![&content_latin, &content_thai],
&all_strength,
),
(
locale!("ko-KR"),
vec![&content_latin, &content_korean],
&all_strength,
),
];
for perf_parameter in performance_parameters {
let (locale_under_bench, files_under_bench, benched_strength) = perf_parameter;
let mut group = criterion.benchmark_group(locale_under_bench.to_string());
for content_under_bench in files_under_bench {
let (file_name, elements) = black_box(content_under_bench);
baseline_bench(&mut group, file_name, elements);
// index to keep order of strength in the html report
for (index, strength) in benched_strength.iter().enumerate() {
collator_bench(
&mut group,
file_name,
elements,
index,
*strength,
&locale_under_bench,
);
}
}
// TODO: Bench content_photos with numeric mode on.
group.finish();
}
}
criterion_group!(
name = benches;
config = Criterion::default();
targets = collator_with_locale
);
criterion_main!(benches);