Source code
Revision control
Copy as Markdown
Other Tools
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
// Various collation-related algorithms and constants in this file are
// adapted from ICU4C and, therefore, are subject to the ICU license as
// described in LICENSE.
//! This module holds the `Collator` struct whose `compare_impl()` contains
//! the comparison of collation element sequences.
use alloc::collections::VecDeque;
use alloc::vec::Vec;
use crate::elements::CharacterAndClassAndTrieValue;
use crate::elements::CollationElement32;
use crate::elements::Tag;
use crate::elements::BACKWARD_COMBINING_MARKER;
use crate::elements::CE_BUFFER_SIZE;
use crate::elements::FALLBACK_CE32;
use crate::elements::HANGUL_N_COUNT;
use crate::elements::HANGUL_S_BASE;
use crate::elements::HANGUL_S_COUNT;
use crate::elements::HANGUL_T_COUNT;
use crate::elements::IDENTICAL_PREFIX_HANGUL_MARKER_CE32;
use crate::elements::NON_ROUND_TRIP_MARKER;
use crate::elements::{
char_from_u32, CollationElement, CollationElements, NonPrimary, FFFD_CE32,
HANGUL_SYLLABLE_MARKER, HIGH_ZEROS_MASK, JAMO_COUNT, LOW_ZEROS_MASK, NO_CE, NO_CE_PRIMARY,
NO_CE_QUATERNARY, NO_CE_SECONDARY, NO_CE_TERTIARY, OPTIMIZED_DIACRITICS_MAX_COUNT,
QUATERNARY_MASK,
};
use crate::options::CollatorOptionsBitField;
use crate::options::{
AlternateHandling, CollatorOptions, MaxVariable, ResolvedCollatorOptions, Strength,
};
use crate::preferences::{CollationCaseFirst, CollationNumericOrdering, CollationType};
use crate::provider::CollationData;
use crate::provider::CollationDiacritics;
use crate::provider::CollationDiacriticsV1;
use crate::provider::CollationJamo;
use crate::provider::CollationJamoV1;
use crate::provider::CollationMetadataV1;
use crate::provider::CollationReordering;
use crate::provider::CollationReorderingV1;
use crate::provider::CollationRootV1;
use crate::provider::CollationSpecialPrimariesV1;
use crate::provider::CollationSpecialPrimariesValidated;
use crate::provider::CollationTailoringV1;
use core::cmp::Ordering;
use core::convert::{Infallible, TryFrom};
use icu_collections::codepointtrie::AbstractCodePointTrie;
use icu_collections::codepointtrie::CharsWithTrieDefaultForAsciiEx;
use icu_collections::codepointtrie::CharsWithTrieEx;
#[cfg(feature = "serde")]
use icu_collections::codepointtrie::CodePointTrie;
#[cfg(not(feature = "serde"))]
use icu_collections::codepointtrie::FastCodePointTrie;
#[cfg(feature = "latin1")]
use icu_collections::codepointtrie::Latin1CharsWithTrieEx;
use icu_collections::codepointtrie::TypedCodePointTrie;
use icu_collections::codepointtrie::WithTrie;
use icu_normalizer::provider::DecompositionData;
use icu_normalizer::provider::DecompositionTables;
use icu_normalizer::provider::NormalizerNfdDataV1;
use icu_normalizer::provider::NormalizerNfdTablesV1;
use icu_provider::marker::ErasedMarker;
use icu_provider::prelude::*;
use smallvec::SmallVec;
use utf16_iter::Utf16CharsWithTrieEx;
use utf8_iter::Utf8CharsWithTrieDefaultForAsciiEx;
use utf8_iter::Utf8CharsWithTrieEx;
use zerovec::ule::AsULE;
#[cfg(feature = "serde")]
type NormTrie<'trie> = CodePointTrie<'trie, u32>;
#[cfg(not(feature = "serde"))]
type NormTrie<'trie> = FastCodePointTrie<'trie, u32>;
// Special sort key bytes for all levels.
const LEVEL_SEPARATOR_BYTE: u8 = 1;
/// Merge-sort-key separator.
///
/// Same as the unique primary and identical-level weights of U+FFFE. Must not
/// be used as primary compression low terminator. Otherwise usable.
const MERGE_SEPARATOR: char = '\u{fffe}';
const MERGE_SEPARATOR_BYTE: u8 = 2;
const MERGE_SEPARATOR_PRIMARY: u32 = 0x02000000;
/// Primary compression low terminator, must be greater than [`MERGE_SEPARATOR_BYTE`].
///
/// Reserved value in primary second byte if the lead byte is compressible.
/// Otherwise usable in all CE weight bytes.
const PRIMARY_COMPRESSION_LOW_BYTE: u8 = 3;
/// Primary compression high terminator.
///
/// Reserved value in primary second byte if the lead byte is compressible.
/// Otherwise usable in all CE weight bytes.
const PRIMARY_COMPRESSION_HIGH_BYTE: u8 = 0xff;
/// Default secondary/tertiary weight lead byte.
const COMMON_BYTE: u8 = 5;
const COMMON_WEIGHT16: u16 = 0x0500;
// Internal flags for sort key generation
const PRIMARY_LEVEL_FLAG: u8 = 0x01;
const SECONDARY_LEVEL_FLAG: u8 = 0x02;
const CASE_LEVEL_FLAG: u8 = 0x04;
const TERTIARY_LEVEL_FLAG: u8 = 0x08;
const QUATERNARY_LEVEL_FLAG: u8 = 0x10;
const LEVEL_MASKS: [u8; Strength::Identical as usize + 1] = [
PRIMARY_LEVEL_FLAG,
PRIMARY_LEVEL_FLAG | SECONDARY_LEVEL_FLAG,
PRIMARY_LEVEL_FLAG | SECONDARY_LEVEL_FLAG | TERTIARY_LEVEL_FLAG,
PRIMARY_LEVEL_FLAG | SECONDARY_LEVEL_FLAG | TERTIARY_LEVEL_FLAG | QUATERNARY_LEVEL_FLAG,
0,
0,
0,
PRIMARY_LEVEL_FLAG | SECONDARY_LEVEL_FLAG | TERTIARY_LEVEL_FLAG | QUATERNARY_LEVEL_FLAG,
];
// Internal constants for indexing into the below compression configurations
const WEIGHT_LOW: usize = 0;
const WEIGHT_MIDDLE: usize = 1;
const WEIGHT_HIGH: usize = 2;
const WEIGHT_MAX_COUNT: usize = 3;
// Secondary level: Compress up to 33 common weights as 05..25 or 25..45.
const SEC_COMMON: [u8; 4] = [COMMON_BYTE, COMMON_BYTE + 0x20, COMMON_BYTE + 0x40, 0x21];
// Case level, lowerFirst: Compress up to 7 common weights as 1..7 or 7..13.
const CASE_LOWER_FIRST_COMMON: [u8; 4] = [1, 7, 13, 7];
// Case level, upperFirst: Compress up to 13 common weights as 3..15.
const CASE_UPPER_FIRST_COMMON: [u8; 4] = [3, 0 /* unused */, 15, 13];
// Tertiary level only (no case): Compress up to 97 common weights as 05..65 or 65..C5.
const TER_ONLY_COMMON: [u8; 4] = [COMMON_BYTE, COMMON_BYTE + 0x60, COMMON_BYTE + 0xc0, 0x61];
// Tertiary with case, lowerFirst: Compress up to 33 common weights as 05..25 or 25..45.
const TER_LOWER_FIRST_COMMON: [u8; 4] = [COMMON_BYTE, COMMON_BYTE + 0x20, COMMON_BYTE + 0x40, 0x21];
// Tertiary with case, upperFirst: Compress up to 33 common weights as 85..A5 or A5..C5.
const TER_UPPER_FIRST_COMMON: [u8; 4] = [
COMMON_BYTE + 0x80,
COMMON_BYTE + 0x80 + 0x20,
COMMON_BYTE + 0x80 + 0x40,
0x21,
];
const QUAT_COMMON: [u8; 4] = [0x1c, 0x1c + 0x70, 0x1c + 0xe0, 0x71];
const QUAT_SHIFTED_LIMIT_BYTE: u8 = QUAT_COMMON[WEIGHT_LOW] - 1; // 0x1b
// Do not use byte values 0, 1, 2 because they are separators in sort keys.
const SLOPE_MIN: i32 = 3;
const SLOPE_MAX: i32 = 0xff;
const SLOPE_MIDDLE: i32 = 0x81;
const SLOPE_TAIL_COUNT: i32 = SLOPE_MAX - SLOPE_MIN + 1;
const SLOPE_SINGLE: i32 = 80;
const SLOPE_LEAD_2: i32 = 42;
const SLOPE_LEAD_3: i32 = 3;
// The difference value range for single-byters.
const SLOPE_REACH_POS_1: i32 = SLOPE_SINGLE;
const SLOPE_REACH_NEG_1: i32 = -SLOPE_SINGLE;
// The difference value range for double-byters.
const SLOPE_REACH_POS_2: i32 = SLOPE_LEAD_2 * SLOPE_TAIL_COUNT + (SLOPE_LEAD_2 - 1);
const SLOPE_REACH_NEG_2: i32 = -SLOPE_REACH_POS_2 - 1;
// The difference value range for 3-byters.
const SLOPE_REACH_POS_3: i32 = SLOPE_LEAD_3 * SLOPE_TAIL_COUNT * SLOPE_TAIL_COUNT
+ (SLOPE_LEAD_3 - 1) * SLOPE_TAIL_COUNT
+ (SLOPE_TAIL_COUNT - 1);
const SLOPE_REACH_NEG_3: i32 = -SLOPE_REACH_POS_3 - 1;
// The lead byte start values.
const SLOPE_START_POS_2: i32 = SLOPE_MIDDLE + SLOPE_SINGLE + 1;
const SLOPE_START_POS_3: i32 = SLOPE_START_POS_2 + SLOPE_LEAD_2;
const SLOPE_START_NEG_2: i32 = SLOPE_MIDDLE + SLOPE_REACH_NEG_1;
const SLOPE_START_NEG_3: i32 = SLOPE_START_NEG_2 - SLOPE_LEAD_2;
struct AnyQuaternaryAccumulator(u32);
impl AnyQuaternaryAccumulator {
#[inline(always)]
pub fn new() -> Self {
AnyQuaternaryAccumulator(0)
}
#[inline(always)]
pub fn accumulate(&mut self, non_primary: NonPrimary) {
self.0 |= non_primary.bits()
}
#[inline(always)]
pub fn has_quaternary(&self) -> bool {
self.0 & u32::from(QUATERNARY_MASK) != 0
}
}
/// `true` iff `i` is greater or equal to `start` and less or equal
/// to `end`.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
i.wrapping_sub(start) <= (end - start)
}
/// Finds the identical prefix of `left` and `right` containing
/// Latin1.
///
/// Returns the identical prefix, the part of `left` after the
/// prefix, and the part of `right` after the prefix.
///
/// ✨ *Enabled with the `latin1` Cargo feature.*
#[cfg(feature = "latin1")]
fn split_prefix_latin1<'a, 'b>(left: &'a [u8], right: &'b [u8]) -> (&'a [u8], &'a [u8], &'b [u8]) {
let i = left
.iter()
.zip(right.iter())
.take_while(|(l, r)| l == r)
.count();
if let Some((head, left_tail)) = left.split_at_checked(i) {
if let Some(right_tail) = right.get(i..) {
return (head, left_tail, right_tail);
}
}
(&[], left, right)
}
/// Finds the identical prefix of `left` containing Latin1
/// and `right` containing potentially ill-formed UTF-16.
///
/// Returns the identical prefix, the part of `left` after the
/// prefix, and the part of `right` after the prefix.
///
/// ✨ *Enabled with the `latin1` Cargo feature.*
#[cfg(feature = "latin1")]
fn split_prefix_latin1_utf16<'a, 'b>(
left: &'a [u8],
right: &'b [u16],
) -> (&'a [u8], &'a [u8], &'b [u16]) {
let i = left
.iter()
.zip(right.iter())
.take_while(|(l, r)| u16::from(**l) == **r)
.count();
if let Some((head, left_tail)) = left.split_at_checked(i) {
if let Some(right_tail) = right.get(i..) {
return (head, left_tail, right_tail);
}
}
(&[], left, right)
}
/// Finds the identical prefix of `left` and `right` containing
/// potentially ill-formed UTF-16, while avoiding splitting a
/// well-formed surrogate pair. In case of ill-formed
/// UTF-16, the prefix is not guaranteed to be maximal.
///
/// Returns the identical prefix, the part of `left` after the
/// prefix, and the part of `right` after the prefix.
fn split_prefix_u16<'a, 'b>(
left: &'a [u16],
right: &'b [u16],
) -> (&'a [u16], &'a [u16], &'b [u16]) {
let mut i = left
.iter()
.zip(right.iter())
.take_while(|(l, r)| l == r)
.count();
if i != 0 {
if let Some(&last) = left.get(i.wrapping_sub(1)) {
if in_inclusive_range16(last, 0xD800, 0xDBFF) {
i -= 1;
}
if let Some((head, left_tail)) = left.split_at_checked(i) {
if let Some(right_tail) = right.get(i..) {
return (head, left_tail, right_tail);
}
}
}
}
(&[], left, right)
}
/// Finds the identical prefix of `left` and `right` containing
/// potentially ill-formed UTF-8, while avoiding splitting a UTF-8
/// byte sequence. In case of ill-formed UTF-8, the prefix is
/// not guaranteed to be maximal.
///
/// Returns the identical prefix, the part of `left` after the
/// prefix, and the part of `right` after the prefix.
fn split_prefix_u8<'a, 'b>(left: &'a [u8], right: &'b [u8]) -> (&'a [u8], &'a [u8], &'b [u8]) {
let mut i = left
.iter()
.zip(right.iter())
.take_while(|(l, r)| l == r)
.count();
if i != 0 {
// Tails must not start with a UTF-8 continuation
// byte unless it's the first byte of the original
// slice.
// First, left and right differ, but since they
// are the same afterwards, one of them needs checking
// only once.
if let Some(right_first) = right.get(i) {
if (right_first & 0b1100_0000) == 0b1000_0000 {
i -= 1;
}
}
while i != 0 {
if let Some(left_first) = left.get(i) {
if (left_first & 0b1100_0000) == 0b1000_0000 {
i -= 1;
continue;
}
}
break;
}
if let Some((head, left_tail)) = left.split_at_checked(i) {
if let Some(right_tail) = right.get(i..) {
return (head, left_tail, right_tail);
}
}
}
(&[], left, right)
}
/// Finds the identical prefix of `left` and `right` containing
/// guaranteed well-format UTF-8.
///
/// Returns the identical prefix, the part of `left` after the
/// prefix, and the part of `right` after the prefix.
fn split_prefix<'a, 'b>(left: &'a str, right: &'b str) -> (&'a str, &'a str, &'b str) {
let left_bytes = left.as_bytes();
let right_bytes = right.as_bytes();
let mut i = left_bytes
.iter()
.zip(right_bytes.iter())
.take_while(|(l, r)| l == r)
.count();
if i != 0 {
// Tails must not start with a UTF-8 continuation
// byte.
// Since the inputs are valid UTF-8, the first byte
// of either input slice cannot be a contination slice,
// so we may rely on finding a lead byte when walking
// backwards.
// Since the inputs are valid UTF-8, if a tail starts
// with a continuation, both tails must start with a
// continuation, since the most recent lead byte must
// be equal, so the difference is within valid UTF-8
// sequences of equal length.
// Therefore, it's sufficient to examine only one of
// the sides.
loop {
if let Some(left_first) = left_bytes.get(i) {
if (left_first & 0b1100_0000) == 0b1000_0000 {
i -= 1;
continue;
}
}
break;
}
// The methods below perform useless UTF-8 boundary checks,
// since we just checked. However, avoiding `unsafe` to
// make this code easier to audit.
if let Some((head, left_tail)) = left.split_at_checked(i) {
if let Some(right_tail) = right.get(i..) {
return (head, left_tail, right_tail);
}
}
}
("", left, right)
}
/// Holder struct for payloads that are locale-dependent. (For code
/// reuse between owned and borrowed cases.)
#[derive(Debug)]
struct LocaleSpecificDataHolder {
tailoring: Option<DataPayload<CollationTailoringV1>>,
diacritics: DataPayload<CollationDiacriticsV1>,
reordering: Option<DataPayload<CollationReorderingV1>>,
merged_options: CollatorOptionsBitField,
lithuanian_dot_above: bool,
}
icu_locale_core::preferences::define_preferences!(
/// The preferences for collation.
///
/// # Preferences
///
/// Examples for using the different preferences below can be found in the [crate-level docs](crate).
///
/// ## Case First
///
/// See the [spec](https://www.unicode.org/reports/tr35/tr35-collation.html#Case_Parameters).
/// This is the BCP47 key `kf`. Three possibilities: [`CollationCaseFirst::False`] (default,
/// except for Danish and Maltese), [`CollationCaseFirst::Lower`], and [`CollationCaseFirst::Upper`]
/// (default for Danish and Maltese).
///
/// ## Numeric
///
/// This is the BCP47 key `kn`. When set to [`CollationNumericOrdering::True`], any sequence of decimal
/// digits (General_Category = Nd) is sorted at the primary level according to the
/// numeric value. The default is [`CollationNumericOrdering::False`].
[Copy]
CollatorPreferences,
{
/// The collation type. This corresponds to the `-u-co` BCP-47 tag.
collation_type: CollationType,
/// Treatment of case. (Large and small kana differences are treated as case differences.)
/// This corresponds to the `-u-kf` BCP-47 tag.
case_first: CollationCaseFirst,
/// When set to `True`, any sequence of decimal digits is sorted at a primary level according
/// to the numeric value.
/// This corresponds to the `-u-kn` BPC-47 tag.
numeric_ordering: CollationNumericOrdering
}
);
impl LocaleSpecificDataHolder {
/// The constructor code reused between owned and borrowed cases.
fn try_new_unstable_internal<D>(
provider: &D,
prefs: CollatorPreferences,
options: CollatorOptions,
) -> Result<Self, DataError>
where
D: DataProvider<CollationTailoringV1>
+ DataProvider<CollationDiacriticsV1>
+ DataProvider<CollationMetadataV1>
+ DataProvider<CollationReorderingV1>
+ ?Sized,
{
let marker_attributes = prefs
.collation_type
.as_ref()
// all collation types are valid marker attributes
.map(|c| DataMarkerAttributes::from_str_or_panic(c.as_str()))
.unwrap_or_default();
let data_locale = CollationTailoringV1::make_locale(prefs.locale_preferences);
let req = DataRequest {
id: DataIdentifierBorrowed::for_marker_attributes_and_locale(
marker_attributes,
&data_locale,
),
metadata: {
let mut metadata = DataRequestMetadata::default();
metadata.silent = true;
metadata
},
};
let fallback_req = DataRequest {
id: DataIdentifierBorrowed::for_marker_attributes_and_locale(
Default::default(),
&data_locale,
),
..Default::default()
};
let metadata_payload: DataPayload<CollationMetadataV1> = provider
.load(req)
.or_else(|_| provider.load(fallback_req))?
.payload;
let metadata = metadata_payload.get();
let tailoring: Option<DataPayload<CollationTailoringV1>> = if metadata.tailored() {
Some(
provider
.load(req)
.or_else(|_| provider.load(fallback_req))?
.payload,
)
} else {
None
};
let reordering: Option<DataPayload<CollationReorderingV1>> = if metadata.reordering() {
Some(
provider
.load(req)
.or_else(|_| provider.load(fallback_req))?
.payload,
)
} else {
None
};
if let Some(reordering) = &reordering {
if reordering.get().reorder_table.len() != 256 {
return Err(DataError::custom("invalid").with_marker(CollationReorderingV1::INFO));
}
}
let tailored_diacritics = metadata.tailored_diacritics();
let diacritics: DataPayload<CollationDiacriticsV1> = provider
.load(if tailored_diacritics {
req
} else {
Default::default()
})?
.payload;
if tailored_diacritics {
// In the tailored case we accept a shorter table in which case the tailoring is
// responsible for supplying the missing values in the trie.
// As of June 2022, none of the collations actually use a shortened table.
// Vietnamese and Ewe load a full-length alternative table and the rest use
// the default one.
if diacritics.get().secondaries.len() > OPTIMIZED_DIACRITICS_MAX_COUNT {
return Err(DataError::custom("invalid").with_marker(CollationDiacriticsV1::INFO));
}
} else if diacritics.get().secondaries.len() != OPTIMIZED_DIACRITICS_MAX_COUNT {
return Err(DataError::custom("invalid").with_marker(CollationDiacriticsV1::INFO));
}
let mut altered_defaults = CollatorOptionsBitField::default();
if metadata.alternate_shifted() {
altered_defaults.set_alternate_handling(Some(AlternateHandling::Shifted));
}
if metadata.backward_second_level() {
altered_defaults.set_backward_second_level(Some(true));
}
altered_defaults.set_case_first(Some(metadata.case_first()));
altered_defaults.set_max_variable(Some(metadata.max_variable()));
let mut merged_options = CollatorOptionsBitField::from(options);
merged_options.set_case_first(prefs.case_first);
merged_options.set_numeric_from_enum(prefs.numeric_ordering);
merged_options.set_defaults(altered_defaults);
Ok(LocaleSpecificDataHolder {
tailoring,
diacritics,
merged_options,
reordering,
lithuanian_dot_above: metadata.lithuanian_dot_above(),
})
}
}
/// Compares strings according to culturally-relevant ordering.
#[derive(Debug)]
pub struct Collator {
special_primaries: DataPayload<ErasedMarker<CollationSpecialPrimariesValidated<'static>>>,
root: DataPayload<CollationRootV1>,
tailoring: Option<DataPayload<CollationTailoringV1>>,
jamo: DataPayload<CollationJamoV1>,
diacritics: DataPayload<CollationDiacriticsV1>,
options: CollatorOptionsBitField,
reordering: Option<DataPayload<CollationReorderingV1>>,
decompositions: DataPayload<NormalizerNfdDataV1>,
tables: DataPayload<NormalizerNfdTablesV1>,
lithuanian_dot_above: bool,
}
impl Collator {
/// Constructs a borrowed version of this type for more efficient querying.
pub fn as_borrowed(&self) -> CollatorBorrowed<'_> {
CollatorBorrowed {
special_primaries: self.special_primaries.get(),
root: self.root.get(),
tailoring: if let Some(t) = self.tailoring.as_ref() {
t.get()
} else {
self.root.get()
},
jamo: self.jamo.get(),
diacritics: self.diacritics.get(),
options: self.options,
reordering: self.reordering.as_ref().map(|s| s.get()),
decompositions: self.decompositions.get(),
tables: self.tables.get(),
lithuanian_dot_above: self.lithuanian_dot_above,
}
}
/// Creates `CollatorBorrowed` for the given locale and options from compiled data.
#[cfg(feature = "compiled_data")]
pub fn try_new(
prefs: CollatorPreferences,
options: CollatorOptions,
) -> Result<CollatorBorrowed<'static>, DataError> {
CollatorBorrowed::try_new(prefs, options)
}
icu_provider::gen_buffer_data_constructors!(
(prefs: CollatorPreferences, options: CollatorOptions) -> error: DataError,
functions: [
try_new: skip,
try_new_with_buffer_provider,
try_new_unstable,
Self
]
);
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::try_new)]
pub fn try_new_unstable<D>(
provider: &D,
prefs: CollatorPreferences,
options: CollatorOptions,
) -> Result<Self, DataError>
where
D: DataProvider<CollationSpecialPrimariesV1>
+ DataProvider<CollationRootV1>
+ DataProvider<CollationTailoringV1>
+ DataProvider<CollationDiacriticsV1>
+ DataProvider<CollationJamoV1>
+ DataProvider<CollationMetadataV1>
+ DataProvider<CollationReorderingV1>
+ DataProvider<NormalizerNfdDataV1>
+ DataProvider<NormalizerNfdTablesV1>
+ ?Sized,
{
Self::try_new_unstable_internal(
provider,
provider.load(Default::default())?.payload,
provider.load(Default::default())?.payload,
provider.load(Default::default())?.payload,
provider.load(Default::default())?.payload,
provider.load(Default::default())?.payload,
prefs,
options,
)
}
#[expect(clippy::too_many_arguments)]
fn try_new_unstable_internal<D>(
provider: &D,
root: DataPayload<CollationRootV1>,
decompositions: DataPayload<NormalizerNfdDataV1>,
tables: DataPayload<NormalizerNfdTablesV1>,
jamo: DataPayload<CollationJamoV1>,
special_primaries: DataPayload<CollationSpecialPrimariesV1>,
prefs: CollatorPreferences,
options: CollatorOptions,
) -> Result<Self, DataError>
where
D: DataProvider<CollationRootV1>
+ DataProvider<CollationTailoringV1>
+ DataProvider<CollationDiacriticsV1>
+ DataProvider<CollationMetadataV1>
+ DataProvider<CollationReorderingV1>
+ ?Sized,
{
let locale_dependent =
LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?;
// TODO: redesign Korean search collation handling
if jamo.get().ce32s.len() != JAMO_COUNT {
return Err(DataError::custom("invalid").with_marker(CollationJamoV1::INFO));
}
// `variant_count` isn't stable yet:
if special_primaries.get().last_primaries.len() <= (MaxVariable::Currency as usize) {
return Err(DataError::custom("invalid").with_marker(CollationSpecialPrimariesV1::INFO));
}
let special_primaries = special_primaries.map_project(|csp, _| {
let compressible_bytes = (csp.last_primaries.len()
== MaxVariable::Currency as usize + 16)
.then(|| {
csp.last_primaries
.as_maybe_borrowed()?
.as_ule_slice()
.get((MaxVariable::Currency as usize)..)?
.try_into()
.ok()
})
.flatten()
.unwrap_or(
CollationSpecialPrimariesValidated::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK,
);
CollationSpecialPrimariesValidated {
last_primaries: csp.last_primaries.truncated(MaxVariable::Currency as usize),
numeric_primary: csp.numeric_primary,
compressible_bytes,
}
});
Ok(Collator {
special_primaries,
root,
tailoring: locale_dependent.tailoring,
jamo,
diacritics: locale_dependent.diacritics,
options: locale_dependent.merged_options,
reordering: locale_dependent.reordering,
decompositions,
tables,
lithuanian_dot_above: locale_dependent.lithuanian_dot_above,
})
}
}
macro_rules! quick_primary_compare {
($left_primary:ident,
$right_primary:ident,
$variable_top:ident,
$self:ident,
) => {
if ($left_primary != $right_primary)
&& ($left_primary != 0)
&& ($right_primary != 0)
&& !($left_primary < $variable_top && $left_primary > MERGE_SEPARATOR_PRIMARY)
&& !($right_primary < $variable_top && $right_primary > MERGE_SEPARATOR_PRIMARY)
{
if let Some(reordering) = &$self.reordering {
$left_primary = reordering.reorder($left_primary);
$right_primary = reordering.reorder($right_primary);
}
if $left_primary < $right_primary {
return Ordering::Less;
}
return Ordering::Greater;
}
};
}
macro_rules! hangul_syllable_compare {
($left_hangul_offset:ident,
$right_hangul_offset:ident,
) => {
let left_l = $left_hangul_offset / HANGUL_N_COUNT;
let right_l = $right_hangul_offset / HANGUL_N_COUNT;
if left_l != right_l {
// We don't really support jamo tailoring, so let's
// compare the jamo directly.
if left_l < right_l {
return Ordering::Less;
}
return Ordering::Greater;
}
let left_v = ($left_hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT;
let right_v = ($right_hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT;
if left_v != right_v {
// We don't really support jamo tailoring, so let's
// compare the jamo directly.
if left_v < right_v {
return Ordering::Less;
}
return Ordering::Greater;
}
let left_t = $left_hangul_offset % HANGUL_T_COUNT;
let right_t = $right_hangul_offset % HANGUL_T_COUNT;
// If either syllable is a two-jamo syllable, we fall through.
if left_t != right_t && left_t != 0 && right_t != 0 {
// We don't really support jamo tailoring, so let's
// compare the jamo directly.
if left_t < right_t {
return Ordering::Less;
}
return Ordering::Greater;
}
};
}
macro_rules! compare {
($(#[$meta:meta])*,
$compare:ident,
$left_slice:ty,
$right_slice:ty,
$split_prefix:ident,
$left_to_iter:ident,
$right_to_iter:ident,
$self:ident,
$left_tail:ident,
$right_tail:ident,
$primary_check:block,
$variable_top:ident,
) => {
$(#[$meta])*
pub fn $compare(&$self, left: &$left_slice, right: &$right_slice) -> Ordering {
let (head, $left_tail, $right_tail) = $split_prefix(left, right);
if $left_tail.is_empty() && $right_tail.is_empty() {
return Ordering::Equal;
}
let $variable_top = $self.variable_top();
if head.is_empty() {
$primary_check
}
let norm_trie = $self.norm_trie();
let ret = $self.compare_impl($left_tail.$left_to_iter(norm_trie), $right_tail.$right_to_iter(norm_trie), head.$left_to_iter(norm_trie), $variable_top);
if $self.options.strength() == Strength::Identical && ret == Ordering::Equal {
// We don't need to remove the leading U+0000, because it compares equal anyway.
return icu_normalizer::new_decomposition($left_tail.$left_to_iter(norm_trie), $self.tables).map(|c| if c != MERGE_SEPARATOR { c as i32 } else { -1i32 }).cmp(
icu_normalizer::new_decomposition($right_tail.$right_to_iter(norm_trie), $self.tables).map(|c| if c != MERGE_SEPARATOR { c as i32 } else { -1i32 }),
);
}
ret
}
}
}
/// Compares strings according to culturally-relevant ordering,
/// borrowed version.
#[derive(Debug)]
pub struct CollatorBorrowed<'a> {
special_primaries: &'a CollationSpecialPrimariesValidated<'a>,
root: &'a CollationData<'a>,
tailoring: &'a CollationData<'a>,
jamo: &'a CollationJamo<'a>,
diacritics: &'a CollationDiacritics<'a>,
options: CollatorOptionsBitField,
reordering: Option<&'a CollationReordering<'a>>,
decompositions: &'a DecompositionData<'a>,
tables: &'a DecompositionTables<'a>,
lithuanian_dot_above: bool,
}
impl CollatorBorrowed<'static> {
/// Creates a collator for the given locale and options from compiled data.
#[cfg(feature = "compiled_data")]
pub fn try_new(
prefs: CollatorPreferences,
options: CollatorOptions,
) -> Result<Self, DataError> {
// These are assigned to locals in order to keep the code after these assignments
// copypaste-compatible with `Collator::try_new_unstable_internal`.
let provider = &crate::provider::Baked;
let decompositions = icu_normalizer::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1;
let tables = icu_normalizer::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1;
let root = crate::provider::Baked::SINGLETON_COLLATION_ROOT_V1;
let jamo = crate::provider::Baked::SINGLETON_COLLATION_JAMO_V1;
let locale_dependent =
LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?;
// TODO: redesign Korean search collation handling
const _: () = assert!(
crate::provider::Baked::SINGLETON_COLLATION_JAMO_V1
.ce32s
.as_slice()
.len()
== JAMO_COUNT
);
// `variant_count` isn't stable yet:
const _: () = assert!(
crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1
.last_primaries
.as_slice()
.len()
> (MaxVariable::Currency as usize)
);
let special_primaries = const {
&CollationSpecialPrimariesValidated {
last_primaries: zerovec::ZeroSlice::from_ule_slice(
crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1
.last_primaries
.as_slice()
.as_ule_slice()
.split_at(MaxVariable::Currency as usize)
.0,
)
.as_zerovec(),
numeric_primary: crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1
.numeric_primary,
compressible_bytes: {
const C: &[<u16 as AsULE>::ULE] =
crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1
.last_primaries
.as_slice()
.as_ule_slice();
if C.len() == MaxVariable::Currency as usize + 16 {
let i = MaxVariable::Currency as usize;
#[allow(clippy::indexing_slicing)] // protected, const
&[
C[i],
C[i + 1],
C[i + 2],
C[i + 3],
C[i + 4],
C[i + 5],
C[i + 6],
C[i + 7],
C[i + 8],
C[i + 9],
C[i + 10],
C[i + 11],
C[i + 12],
C[i + 13],
C[i + 14],
C[i + 15],
]
} else {
CollationSpecialPrimariesValidated::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK
}
},
}
};
// Attribute belongs closer to `unwrap`, but
#[expect(clippy::unwrap_used)]
Ok(CollatorBorrowed {
special_primaries,
root,
// Unwrap is OK, because we know we have the baked provider.
tailoring: if let Some(s) = locale_dependent.tailoring {
s.get_static().unwrap()
} else {
root
},
jamo,
// Unwrap is OK, because we know we have the baked provider.
diacritics: locale_dependent.diacritics.get_static().unwrap(),
options: locale_dependent.merged_options,
// Unwrap is OK, because we know we have the baked provider.
reordering: locale_dependent.reordering.map(|s| s.get_static().unwrap()),
decompositions,
tables,
lithuanian_dot_above: locale_dependent.lithuanian_dot_above,
})
}
/// Cheaply converts a [`CollatorBorrowed<'static>`] into a [`Collator`].
///
/// Note: Due to branching and indirection, using [`Collator`] might inhibit some
/// compile-time optimizations that are possible with [`CollatorBorrowed`].
pub const fn static_to_owned(self) -> Collator {
Collator {
special_primaries: DataPayload::from_static_ref(self.special_primaries),
root: DataPayload::from_static_ref(self.root),
tailoring: Some(DataPayload::from_static_ref(self.tailoring)),
jamo: DataPayload::from_static_ref(self.jamo),
diacritics: DataPayload::from_static_ref(self.diacritics),
options: self.options,
reordering: if let Some(s) = self.reordering {
// `map` not available in const context
Some(DataPayload::from_static_ref(s))
} else {
None
},
decompositions: DataPayload::from_static_ref(self.decompositions),
tables: DataPayload::from_static_ref(self.tables),
lithuanian_dot_above: self.lithuanian_dot_above,
}
}
}
macro_rules! collation_elements {
($self:expr, $chars:expr, $numeric_primary:expr) => {{
let jamo = <&[<u32 as AsULE>::ULE; JAMO_COUNT]>::try_from($self.jamo.ce32s.as_ule_slice());
let jamo = jamo.unwrap();
CollationElements::new(
$chars,
$self.root,
$self.tailoring,
jamo,
&$self.diacritics.secondaries,
$self.tables,
$numeric_primary,
$self.lithuanian_dot_above,
)
}};
}
impl<'data> CollatorBorrowed<'data> {
/// The resolved options showing how the default options, the requested options,
/// and the options from locale data were combined.
pub fn resolved_options(&self) -> ResolvedCollatorOptions {
self.options.into()
}
fn norm_trie(&self) -> &'data NormTrie<'data> {
#[allow(clippy::useless_conversion)]
<&NormTrie<'data>>::try_from(&self.decompositions.trie)
.unwrap_or_else(|_| unreachable!("Incompatible data"))
}
compare!(
/// Compare guaranteed well-formed UTF-8 slices.
,
compare,
str,
str,
split_prefix,
chars_with_trie_default_for_ascii,
chars_with_trie_default_for_ascii,
self,
left_tail,
right_tail,
{
// Not macroized: Copy and paste to the other UTF-8 case below.
let tailoring_trie = &self.tailoring.trie;
if let Some((left_c, left_u32)) = left_tail.chars_with_trie(tailoring_trie).next() {
// Logically, there's a bunch of stuff we could do from the left side alone to determine
// that reading from the right side at all or reading from the right trie is useless.
// Doing that seems to be a pessimization, at least in the absence of PGO.
if let Some((right_c, right_u32)) = right_tail.chars_with_trie(tailoring_trie).next() {
let left_ce32 = CollationElement32::new(left_u32);
let right_ce32 = CollationElement32::new(right_u32);
if let Some(mut left_primary) = left_ce32.to_primary_in_quick_check(self.tailoring) {
if let Some(mut right_primary) = right_ce32.to_primary_in_quick_check(self.tailoring) {
quick_primary_compare!(left_primary, right_primary, variable_top, self,);
}
}
// Try Hangul. (At least in the absence of PGO, it's better _not_ to put
// this into an `else` branch of the `left_primary` `if`.)
let right_hangul_offset = u32::from(right_c).wrapping_sub(HANGUL_S_BASE);
if right_hangul_offset < HANGUL_S_COUNT {
let left_hangul_offset = u32::from(left_c).wrapping_sub(HANGUL_S_BASE);
if left_hangul_offset < HANGUL_S_COUNT {
hangul_syllable_compare!(left_hangul_offset, right_hangul_offset,);
}
}
}
}
// Note: It might look like a good idea to cache the CE32s, but
// doing so actually makes things slower.
},
variable_top,
);
compare!(
/// Compare potentially ill-formed UTF-8 slices. Ill-formed input is compared
/// as if errors had been replaced with REPLACEMENT CHARACTERs according
/// to the WHATWG Encoding Standard.
,
compare_utf8,
[u8],
[u8],
split_prefix_u8,
chars_with_trie_default_for_ascii,
chars_with_trie_default_for_ascii,
self,
left_tail,
right_tail,
{
// Direct copypaste from the `str` case.
let tailoring_trie = &self.tailoring.trie;
if let Some((left_c, left_u32)) = left_tail.chars_with_trie(tailoring_trie).next() {
// Logically, there's a bunch of stuff we could do from the left side alone to determine
// that reading from the right side at all or reading from the right trie is useless.
// Doing that seems to be a pessimization, at least in the absence of PGO.
if let Some((right_c, right_u32)) = right_tail.chars_with_trie(tailoring_trie).next() {
let left_ce32 = CollationElement32::new(left_u32);
let right_ce32 = CollationElement32::new(right_u32);
if let Some(mut left_primary) = left_ce32.to_primary_in_quick_check(self.tailoring) {
if let Some(mut right_primary) = right_ce32.to_primary_in_quick_check(self.tailoring) {
quick_primary_compare!(left_primary, right_primary, variable_top, self,);
}
}
// Try Hangul. (At least in the absence of PGO, it's better _not_ to put
// this into an `else` branch of the `left_primary` `if`.)
let right_hangul_offset = u32::from(right_c).wrapping_sub(HANGUL_S_BASE);
if right_hangul_offset < HANGUL_S_COUNT {
let left_hangul_offset = u32::from(left_c).wrapping_sub(HANGUL_S_BASE);
if left_hangul_offset < HANGUL_S_COUNT {
hangul_syllable_compare!(left_hangul_offset, right_hangul_offset,);
}
}
}
}
// Note: It might look like a good idea to cache the CE32s, but
// doing so actually makes things slower.
},
variable_top,
);
compare!(
/// Compare potentially ill-formed UTF-16 slices. Unpaired surrogates
/// are compared as if each one was a REPLACEMENT CHARACTER.
,
compare_utf16,
[u16],
[u16],
split_prefix_u16,
chars_with_trie,
chars_with_trie,
self,
left_tail,
right_tail,
{
let tailoring_trie = &self.tailoring.trie;
if let Some(left_u) = left_tail.first() {
if let Some(right_u) = right_tail.first() {
let left_u16 = *left_u;
let right_u16 = *right_u;
let left_u32 = tailoring_trie.get16(left_u16);
let right_u32 = tailoring_trie.get16(right_u16);
let left_ce32 = CollationElement32::new(left_u32);
let right_ce32 = CollationElement32::new(right_u32);
if let Some(mut left_primary) = left_ce32.to_primary_in_quick_check(self.tailoring) {
if let Some(mut right_primary) = right_ce32.to_primary_in_quick_check(self.tailoring) {
quick_primary_compare!(left_primary, right_primary, variable_top, self,);
}
}
// Try Hangul. Not putting in an `else` of the above consistent with UTF-8.
let left_hangul_offset = u32::from(left_u16).wrapping_sub(HANGUL_S_BASE);
if left_hangul_offset < HANGUL_S_COUNT {
if let Some(right_u) = right_tail.first() {
let right_u16 = *right_u;
let right_hangul_offset = u32::from(right_u16).wrapping_sub(HANGUL_S_BASE);
if right_hangul_offset < HANGUL_S_COUNT {
hangul_syllable_compare!(left_hangul_offset, right_hangul_offset,);
}
}
}
}
}
// Note: It might look like a good idea to cache the CE32s, but
// doing so actually makes things slower.
},
variable_top,
);
compare!(
/// Compare Latin1 slices.
///
/// ✨ *Enabled with the `latin1` Cargo feature.*
#[cfg(feature = "latin1")]
,
compare_latin1,
[u8],
[u8],
split_prefix_latin1,
latin1_chars_with_trie,
latin1_chars_with_trie,
self,
left_tail,
right_tail,
{
let tailoring_trie = &self.tailoring.trie;
if let Some(left_u) = left_tail.first() {
if let Some(right_u) = right_tail.first() {
let left_u8 = *left_u;
let right_u8 = *right_u;
// The probability of getting a non-simple ce32 from
// the Latin1 part above ASCII is so high that let's
// only consider ASCII on the left for this fast path.
// SAFETY: Checking the invariant of `get7` here.
if left_u8 < 0x80 {
// SAFETY: Invariant of `get7` checked above.
let left_u32 = unsafe { tailoring_trie.get7(left_u8) };
let left_ce32 = CollationElement32::new(left_u32);
// SAFETY: Checking the invariant of `get7` here for right.
if right_u8 < 0x80 {
// SAFETY: Invariant of `get7` checked above.
let right_u32 = unsafe { tailoring_trie.get7(right_u8) };
let right_ce32 = CollationElement32::new(right_u32);
// Should be use script reordering to cater to reordering
// digets or punctuation relative to letters?
if let Some(left_primary) = left_ce32.to_primary_simple() {
if let Some(right_primary) = right_ce32.to_primary_simple() {
if (left_primary != right_primary)
&& (left_primary != 0)
&& (right_primary != 0)
&& !(left_primary < variable_top && left_primary > MERGE_SEPARATOR_PRIMARY)
&& !(right_primary < variable_top && right_primary > MERGE_SEPARATOR_PRIMARY)
{
if left_primary < right_primary {
return Ordering::Less;
}
return Ordering::Greater;
}
}
}
}
}
}
}
// Note: It might look like a good idea to cache the CE32s, but
// doing so actually makes things slower.
},
variable_top,
);
compare!(
/// Compare Latin1 slice with potentially ill-formed UTF-16
/// slice.
///
/// If you need to compare a potentially ill-formed UTF-16
/// slice with a Latin1 slice, swap the arguments and
/// call `reverse()` on the return value.
///
/// ✨ *Enabled with the `latin1` Cargo feature.*
#[cfg(feature = "latin1")]
,
compare_latin1_utf16,
[u8],
[u16],
split_prefix_latin1_utf16,
latin1_chars_with_trie,
chars_with_trie,
self,
left_tail,
right_tail,
{
let tailoring_trie = &self.tailoring.trie;
if let Some(left_u) = left_tail.first() {
if let Some(right_u) = right_tail.first() {
let left_u8 = *left_u;
// The probability of getting a non-simple ce32 from
// the Latin1 part above ASCII is so high that let's
// only consider ASCII on the left for this fast path.
// SAFETY: Checking the invariant of `get7` here.
if left_u8 < 0x80 {
// SAFETY: Invariant of `get7` checked above.
let left_u32 = unsafe { tailoring_trie.get7(left_u8) };
let left_ce32 = CollationElement32::new(left_u32);
let right_u16 = *right_u;
let right_u32 = tailoring_trie.get16(right_u16);
let right_ce32 = CollationElement32::new(right_u32);
if let Some(mut left_primary) = left_ce32.to_primary_simple() {
// Don't use the macro to micro-optimize away the long primary
// case for ASCII.
if let Some(mut right_primary) = right_ce32.to_primary_in_quick_check(self.tailoring) {
if (left_primary != right_primary)
&& (left_primary != 0)
&& (right_primary != 0)
&& !(left_primary < variable_top && left_primary > MERGE_SEPARATOR_PRIMARY)
&& !(right_primary < variable_top && right_primary > MERGE_SEPARATOR_PRIMARY)
{
if let Some(reordering) = &self.reordering {
left_primary = reordering.reorder(left_primary);
right_primary = reordering.reorder(right_primary);
}
if left_primary < right_primary {
return Ordering::Less;
}
return Ordering::Greater;
}
}
}
}
}
}
// Note: It might look like a good idea to cache the CE32s, but
// doing so actually makes things slower.
},
variable_top,
);
#[inline(always)]
fn numeric_primary(&self) -> Option<u8> {
if self.options.numeric() {
Some(self.special_primaries.numeric_primary)
} else {
None
}
}
#[inline(always)]
fn variable_top(&self) -> u32 {
if self.options.alternate_handling() == AlternateHandling::NonIgnorable {
0
} else {
// +1 so that we can use "<" and primary ignorables test out early.
self.special_primaries
.last_primary_for_group(self.options.max_variable())
+ 1
}
}
/// The implementation of the comparison operation.
///
/// `head_chars` is an iterator _backward_ over the identical
/// prefix and `left_chars` and `right_chars` are iterators
/// _forward_ over the parts after the identical prefix.
fn compare_impl<L, R, H, T>(
&'data self,
left_chars: L,
right_chars: R,
mut head_chars: H,
variable_top: u32,
) -> Ordering
where
L: Iterator<Item = (char, u32)> + WithTrie<'data, T, u32> + 'data,
R: Iterator<Item = (char, u32)> + WithTrie<'data, T, u32> + 'data,
H: DoubleEndedIterator<Item = (char, u32)> + 'data,
T: AbstractCodePointTrie<'data, u32> + 'data,
{
// Sadly, it looks like variable CEs and backward second level
// require us to store the full 64-bit CEs instead of storing only
// the NonPrimary part.
//
// TODO(#2008): Consider having two monomorphizations of this method:
// one that can deal with variables shifted to quaternary and
// backward second level and another that doesn't support that
// and only stores `NonPrimary` in `left_ces` and `right_ces`
// with double the number of stack allocated elements.
// Note: These are used only after the identical prefix skipping,
// but initializing these up here improves performance at the time
// of writing. Presumably the source order affects the stack frame
// layout.
let mut left_ces: SmallVec<[CollationElement; CE_BUFFER_SIZE]> = SmallVec::new();
let mut right_ces: SmallVec<[CollationElement; CE_BUFFER_SIZE]> = SmallVec::new();
// The algorithm comes from CollationCompare::compareUpToQuaternary in ICU4C.
let mut any_variable = false;
let numeric_primary = self.numeric_primary();
let mut left = collation_elements!(self, left_chars, numeric_primary);
let mut right = collation_elements!(self, right_chars, numeric_primary);
// Start identical prefix
// The logic here to check whether the boundary found by skipping
// the identical prefix is safe is complicated compared to the ICU4C
// approach of having a set of characters that are unsafe as the character
// immediately following the identical prefix. However, the approach here
// avoids extra data, and working on the main data avoids the bug
// possibility of data structures not being mutually consistent.
// This code intentionally does not keep around the `CollationElement32`s
// that have been read from the collation data tries, because keeping
// them around turned out to be a pessimization: There would be added
// branches on the hot path of the algorithm that maps characters to
// collation elements, and the element size of the upcoming buffer
// would grow.
//
// However, the values read from the normalization trie _are_ kept around,
// since there is already a place where to put them.
// This loop is only broken out of as goto forward.
#[expect(clippy::never_loop)]
'prefix: loop {
if let Some((mut head_last_c, head_last_trie_val)) = head_chars.next_back() {
let mut head_last = CharacterAndClassAndTrieValue::new_with_trie_val(
head_last_c,
head_last_trie_val,
);
let mut head_last_ce32 = CollationElement32::default();
let mut head_last_ok = false;
if let Some(left_different) = left.iter_next_before_init() {
left.prepend_upcoming_before_init(left_different.clone());
if let Some(right_different) = right.iter_next_before_init() {
// Note: left_different and right_different may both be U+FFFD.
right.prepend_upcoming_before_init(right_different.clone());
// The base logic is that a boundary between two starters
// that decompose to selves is safe iff the starter
// before the boundary can't contract a starter, the
// starter after the boundary doesn't have a prefix
// condition, and, with the numeric mode enabled,
// they aren't both numeric.
//
// This base logic is then extended with Hangul
// syllables and characters that decompose to a
// BMP starter followed by a BMP non-starter.
// The logic could be extended further, in
// particular to cover singleton decompositions
// to a BMP starter, but such characters can be
// expected to be rare enough in real-world input
// that it's not worthwhile to make this code more
// branchy.
//
// A Hangul syllable is safe on either side of the
// boundary, because Hangul syllables can't participate
// in contraction or have prefix conditions. They are
// also known not to be numeric.
//
// Hangul jamo is safe to look up from the main trie
// instead of the jamo table, because they aren't
// allowed to participate in contractions or prefix
// conditions, either, and are known not to be numeric.
//
// After a boundary, a decomposition to a BMP starter
// and a BMP non-starter can obviously be analyzed by
// considering the starter as if it was a starter
// that decomposes to self.
//
// Before a boundary the contraction condition considers
// whether the contraction can contract a starter.
// For the case of contracting a non-starter, it's
// fine for the BMP starter of the decomposition to
// contract the non-starter from the same decomposition:
// Since that would happen as part of the prefix that
// is identical, it wouldn't affect anything after.
//
// The case of contracting a non-starter other than
// the one that came from the decomposition itself
// is irrelevant, because we don't allow a non-starter
// right after the boundary regardless of the contraction
// status of what's before the boundary.
//
// Finally, decompositions to starter and non-starter
// are known not to be numeric.
// The checks below are repetitive, but an attempt to factor
// repetitive code into an inlined function regressed,
// performance, so it seems that having the control flow
// right here without an intermediate enum from a
// function return to branch on is important.
// This loop is only broken out of as goto forward. The control flow
// is much more readable this way.
#[expect(clippy::never_loop)]
loop {
// The two highest bits are about NFC, which we don't
// care about here.
let decomposition = head_last.trie_val;
if (decomposition
& !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER))
== 0
{
// Intentionally empty block to keep
// the same structure as in the cases
// where something happens here.
} else if ((decomposition & HIGH_ZEROS_MASK) != 0)
&& ((decomposition & LOW_ZEROS_MASK) != 0)
{
// Decomposition into two BMP characters: starter and non-starter
// Let's take the starter
head_last_c = char_from_u32(decomposition & 0x7FFF);
} else if decomposition == HANGUL_SYLLABLE_MARKER {
head_last_ce32 = IDENTICAL_PREFIX_HANGUL_MARKER_CE32;
} else {
break;
}
head_last_ok = true;
let left_c;
let right_c;
let decomposition = left_different.trie_val;
if (decomposition
& !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER))
== 0
{
left_c = left_different.character();
} else if ((decomposition & HIGH_ZEROS_MASK) != 0)
&& ((decomposition & LOW_ZEROS_MASK) != 0)
{
// Decomposition into two BMP characters: starter and non-starter
// Let's take the starter
left_c = char_from_u32(decomposition & 0x7FFF);
} else if decomposition == HANGUL_SYLLABLE_MARKER {
left_c = left_different.character();
if right_different.trie_val == HANGUL_SYLLABLE_MARKER {
// Hangul syllable to Hangul syllable comparison can give us
// a quick exit.
let left_hangul_offset =
u32::from(left_c).wrapping_sub(HANGUL_S_BASE);
let right_hangul_offset =
u32::from(right_different.character())
.wrapping_sub(HANGUL_S_BASE);
// If these are not in range, we have a memory-safe GIGO case.
debug_assert!(left_hangul_offset < HANGUL_S_COUNT);
debug_assert!(right_hangul_offset < HANGUL_S_COUNT);
hangul_syllable_compare!(
left_hangul_offset,
right_hangul_offset,
);
// We had a two-jamo syllable whose jamo matched
// the first two jamo of the other syllable.
// Still, we're at a good boundary.
break 'prefix;
}
} else {
break;
}
let decomposition = right_different.trie_val;
if (decomposition
& !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER))
== 0
{
right_c = right_different.character();
} else if ((decomposition & HIGH_ZEROS_MASK) != 0)
&& ((decomposition & LOW_ZEROS_MASK) != 0)
{
// Decomposition into two BMP characters: starter and non-starter
// Let's take the starter
right_c = char_from_u32(decomposition & 0x7FFF);
} else if decomposition == HANGUL_SYLLABLE_MARKER {
right_c = right_different.character();
} else {
break;
}
// The last character of the prefix is OK on the normalization
// level. Now let's check its ce32 unless it's a Hangul syllable,
// in which case `head_last_ce32` already is a non-default placeholder.
if head_last_ce32 == CollationElement32::default() {
head_last_ce32 = self.tailoring.ce32_for_char(head_last_c);
if head_last_ce32 == FALLBACK_CE32 {
head_last_ce32 = self.root.ce32_for_char(head_last_c);
}
if head_last_ce32.tag_checked() == Some(Tag::Contraction)
&& head_last_ce32.at_least_one_suffix_contains_starter()
{
break;
}
}
let mut left_data = self.tailoring;
let mut left_ce32 = self.tailoring.ce32_for_char(left_c);
if left_ce32 == FALLBACK_CE32 {
left_ce32 = self.root.ce32_for_char(left_c);
left_data = self.root;
}
let mut right_data = self.tailoring;
let mut right_ce32 = self.tailoring.ce32_for_char(right_c);
if right_ce32 == FALLBACK_CE32 {
right_ce32 = self.root.ce32_for_char(right_c);
right_data = self.root;
}
// We don't actually need to do this.
// if self.options.numeric()
// && head_last_ce32.tag_checked() == Some(Tag::Digit)
// && (left_ce32.tag_checked() == Some(Tag::Digit)
// || right_ce32.tag_checked() == Some(Tag::Digit))
// {
// break;
// }
// We might be at at a good boundary unless either ce32 is a
// prefix ce32. Let's check for the happy path first, though.
// Now check if we ce32s we have are simple enough to
// make a quick decision here.
if let Some(mut left_primary) =
left_ce32.to_primary_in_quick_check(left_data)
{
if let Some(mut right_primary) =
right_ce32.to_primary_in_quick_check(right_data)
{
quick_primary_compare!(
left_primary,
right_primary,
variable_top,
self,
);
}
}
if left_ce32.tag_checked() == Some(Tag::Prefix)
|| right_ce32.tag_checked() == Some(Tag::Prefix)
{
// TODO: For the Japanese tailoring, it might actually be worthwhile
// to handle the prefix ce32s inline above the quick check. We've already
// read the last character from `head` anyway.
break;
}
// We're at a good boundary but could not make a quick primary comparison decision.
// Note: It might look like a good idea to cache the CE32s, but
// doing so actually makes things slower.
break 'prefix;
}
}
}
let mut tail_first_c;
let mut tail_first_ce32;
let mut tail_first_ok;
loop {
// Take a step back.
left.prepend_upcoming_before_init(head_last.clone());
right.prepend_upcoming_before_init(head_last.clone());
tail_first_c = head_last_c;
tail_first_ce32 = head_last_ce32;
tail_first_ok = head_last_ok;
let Some((head_last_c_new, decomposition)) = head_chars.next_back() else {
// We need to step back beyond the start of the prefix.
// Treat as good boundary.
break 'prefix;
};
head_last_c = head_last_c_new;
head_last = CharacterAndClassAndTrieValue::new_with_trie_val(
head_last_c,
decomposition,
);
head_last_ce32 = CollationElement32::default();
head_last_ok = false;
if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
// Intentionally empty block to keep
// the same structure as in the cases
// where something happens here.
} else if ((decomposition & HIGH_ZEROS_MASK) != 0)
&& ((decomposition & LOW_ZEROS_MASK) != 0)
{
// Decomposition into two BMP characters: starter and non-starter
// Let's take the starter
head_last_c = char_from_u32(decomposition & 0x7FFF);
} else if decomposition == HANGUL_SYLLABLE_MARKER {
head_last_ce32 = FFFD_CE32;
} else {
continue;
}
head_last_ok = true;
if !tail_first_ok {
continue;
}
// The last character of the prefix is OK on the normalization
// level. Now let's check its ce32 unless it's a Hangul syllable.
if head_last_ce32 == CollationElement32::default() {
head_last_ce32 = self.tailoring.ce32_for_char(head_last_c);
if head_last_ce32 == FALLBACK_CE32 {
head_last_ce32 = self.root.ce32_for_char(head_last_c);
}
if head_last_ce32.tag_checked() == Some(Tag::Contraction)
&& head_last_ce32.at_least_one_suffix_contains_starter()
{
continue;
}
}
// Check this _after_ `head_last_ce32` to make sure
// `head_last_ce32` is initialized for the next loop round
// trip if applicable.
if tail_first_ce32 == CollationElement32::default() {
tail_first_ce32 = self.tailoring.ce32_for_char(tail_first_c);
if tail_first_ce32 == FALLBACK_CE32 {
tail_first_ce32 = self.root.ce32_for_char(tail_first_c);
}
} // else we already have a trie value from the previous loop iteration or we have Hangul syllable
if tail_first_ce32.tag_checked() == Some(Tag::Prefix) {
continue;
}
if self.options.numeric()
&& head_last_ce32.tag_checked() == Some(Tag::Digit)
&& tail_first_ce32.tag_checked() == Some(Tag::Digit)
{
continue;
}
// We are at a good boundary!
break 'prefix;
}
} else {
// The prefix is empty
break 'prefix;
}
// Unreachable line
}
// End identical prefix
left.init();
right.init();
loop {
let mut left_primary;
'left_primary_loop: loop {
let ce = left.next();
left_primary = ce.primary();
// TODO(#2008): Consider compiling out the variable handling when we know we aren't
// shifting variable CEs.
if !(left_primary < variable_top && left_primary > MERGE_SEPARATOR_PRIMARY) {
left_ces.push(ce);
} else {
// Variable CE, shift it to quaternary level.
// Ignore all following primary ignorables, and shift further variable CEs.
any_variable = true;
// Relative to ICU4C, the next line is hoisted out of the following loop
// in order to keep the variables called `ce` immutable to make it easier
// to reason about each assignment into `ce` resulting in exactly a single
// push into `left_ces`.
left_ces.push(ce.clone_with_non_primary_zeroed());
loop {
// This loop is simpler than in ICU4C; unlike in C++, we get to break by label.
let ce = left.next();
left_primary = ce.primary();
if left_primary != 0
&& !(left_primary < variable_top
&& left_primary > MERGE_SEPARATOR_PRIMARY)
{
// Neither a primary ignorable nor a variable CE.
left_ces.push(ce);
break 'left_primary_loop;
}
// If `left_primary == 0`, the following line ignores a primary-ignorable.
// Otherwise, it shifts a variable CE.
left_ces.push(ce.clone_with_non_primary_zeroed());
}
}
if left_primary != 0 {
break;
}
}
let mut right_primary;
'right_primary_loop: loop {
let ce = right.next();
right_primary = ce.primary();
// TODO(#2008): Consider compiling out the variable handling when we know we aren't
// shifting variable CEs.
if !(right_primary < variable_top && right_primary > MERGE_SEPARATOR_PRIMARY) {
right_ces.push(ce);
} else {
// Variable CE, shift it to quaternary level.
// Ignore all following primary ignorables, and shift further variable CEs.
any_variable = true;
// Relative to ICU4C, the next line is hoisted out of the following loop
// in order to keep the variables called `ce` immutable to make it easier
// to reason about each assignment into `ce` resulting in exactly a single
// push into `right_ces`.
right_ces.push(ce.clone_with_non_primary_zeroed());
loop {
// This loop is simpler than in ICU4C; unlike in C++, we get to break by label.
let ce = right.next();
right_primary = ce.primary();
if right_primary != 0
&& !(right_primary < variable_top
&& right_primary > MERGE_SEPARATOR_PRIMARY)
{
// Neither a primary ignorable nor a variable CE.
right_ces.push(ce);
break 'right_primary_loop;
}
// If `right_primary == 0`, the following line ignores a primary-ignorable.
// Otherwise, it shifts a variable CE.
right_ces.push(ce.clone_with_non_primary_zeroed());
}
}
if right_primary != 0 {
break;
}
}
if left_primary != right_primary {
if let Some(reordering) = &self.reordering {
left_primary = reordering.reorder(left_primary);
right_primary = reordering.reorder(right_primary);
}
if left_primary < right_primary {
return Ordering::Less;
}
return Ordering::Greater;
}
if left_primary == NO_CE_PRIMARY {
break;
}
}
// Sadly, we end up pushing the sentinel value, which means these
// `SmallVec`s allocate more often than if we didn't actually
// store the sentinel.
debug_assert_eq!(left_ces.last(), Some(&NO_CE));
debug_assert_eq!(right_ces.last(), Some(&NO_CE));
// Note: `unwrap_or_default` in the iterations below should never
// actually end up using the "_or_default" part, because the sentinel
// is in the `SmallVec`s. These could be changed to `unwrap()` if we
// preferred panic in case of a bug.
// TODO(#2009): Should we save one slot by not putting the sentinel in
// the `SmallVec`s? So far, the answer seems "no", as it would complicate
// the primary comparison above.
// Compare the buffered secondary & tertiary weights.
// We might skip the secondary level but continue with the case level
// which is turned on separately.
if self.options.strength() >= Strength::Secondary {
if !self.options.backward_second_level() {
let mut left_iter = left_ces.iter();
let mut right_iter = right_ces.iter();
let mut left_secondary;
let mut right_secondary;
loop {
loop {
left_secondary = left_iter.next().unwrap_or_default().secondary();
if left_secondary != 0 {
break;
}
}
loop {
right_secondary = right_iter.next().unwrap_or_default().secondary();
if right_secondary != 0 {
break;
}
}
if left_secondary != right_secondary {
if left_secondary < right_secondary {
return Ordering::Less;
}
return Ordering::Greater;
}
if left_secondary == NO_CE_SECONDARY {
break;
}
}
} else {
let mut left_remaining = &left_ces[..];
let mut right_remaining = &right_ces[..];
loop {
if left_remaining.is_empty() {
debug_assert!(right_remaining.is_empty());
break;
}
let (left_prefix, right_prefix) = {
let mut left_iter = left_remaining.iter();
loop {
let left_primary = left_iter.next().unwrap_or_default().primary();
if left_primary != 0 && left_primary <= MERGE_SEPARATOR_PRIMARY {
break;
}
debug_assert_ne!(left_primary, NO_CE_PRIMARY);
}
let left_new_remaining = left_iter.as_slice();
// Index in range by construction
#[expect(clippy::indexing_slicing)]
let left_prefix =
&left_remaining[..left_remaining.len() - 1 - left_new_remaining.len()];
left_remaining = left_new_remaining;
let mut right_iter = right_remaining.iter();
loop {
let right_primary = right_iter.next().unwrap_or_default().primary();
if right_primary != 0 && right_primary <= MERGE_SEPARATOR_PRIMARY {
break;
}
debug_assert_ne!(right_primary, NO_CE_PRIMARY);
}
let right_new_remaining = right_iter.as_slice();
// Index in range by construction
#[expect(clippy::indexing_slicing)]
let right_prefix = &right_remaining
[..right_remaining.len() - 1 - right_new_remaining.len()];
right_remaining = right_new_remaining;
(left_prefix, right_prefix)
};
let mut left_iter = left_prefix.iter();
let mut right_iter = right_prefix.iter();
let mut left_secondary;
let mut right_secondary;
loop {
loop {
left_secondary = left_iter.next_back().unwrap_or_default().secondary();
if left_secondary != 0 {
break;
}
}
loop {
right_secondary =
right_iter.next_back().unwrap_or_default().secondary();
if right_secondary != 0 {
break;
}
}
if left_secondary != right_secondary {
if left_secondary < right_secondary {
return Ordering::Less;
}
return Ordering::Greater;
}
if left_secondary == NO_CE_SECONDARY {
break;
}
}
}
}
}
if self.options.case_level() {
let mut left_non_primary;
let mut right_non_primary;
let mut left_case;
let mut right_case;
let mut left_iter = left_ces.iter();
let mut right_iter = right_ces.iter();
if self.options.strength() == Strength::Primary {
// Primary+caseLevel: Ignore case level weights of primary ignorables.
// Otherwise we would get a-umlaut > a
// which is not desirable for accent-insensitive sorting.
// Check for (lower 32 bits) == 0 as well because variable CEs are stored
// with only primary weights.
loop {
loop {
let ce = left_iter.next().unwrap_or_default();
left_non_primary = ce.non_primary();
if !ce.either_half_zero() {
break;
}
}
left_case = left_non_primary.case();
loop {
let ce = right_iter.next().unwrap_or_default();
right_non_primary = ce.non_primary();
if !ce.either_half_zero() {
break;
}
}
right_case = right_non_primary.case();
// No need to handle NO_CE and MERGE_SEPARATOR specially:
// There is one case weight for each previous-level weight,
// so level length differences were handled there.
if left_case != right_case {
if !self.options.upper_first() {
if left_case < right_case {
return Ordering::Less;
}
return Ordering::Greater;
}
if left_case < right_case {
return Ordering::Greater;
}
return Ordering::Less;
}
if left_non_primary.secondary() == NO_CE_SECONDARY {
break;
}
}
} else {
// Secondary+caseLevel: By analogy with the above,
// ignore case level weights of secondary ignorables.
//
// Note: A tertiary CE has uppercase case bits (0.0.ut)
// to keep tertiary+caseFirst well-formed.
//
// Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
// Otherwise a tertiary CE's uppercase would be no greater than
// a primary/secondary CE's uppercase.
// (See UCA well-formedness condition 2.)
// We could construct a special case weight higher than uppercase,
// but it's simpler to always ignore case weights of secondary ignorables,
// turning 0.0.ut into 0.0.0.t.
// (See LDML Collation, Case Parameters.)
loop {
loop {
left_non_primary = left_iter.next().unwrap_or_default().non_primary();
if left_non_primary.secondary() != 0 {
break;
}
}
left_case = left_non_primary.case();
loop {
right_non_primary = right_iter.next().unwrap_or_default().non_primary();
if right_non_primary.secondary() != 0 {
break;
}
}
right_case = right_non_primary.case();
// No need to handle NO_CE and MERGE_SEPARATOR specially:
// There is one case weight for each previous-level weight,
// so level length differences were handled there.
if left_case != right_case {
if !self.options.upper_first() {
if left_case < right_case {
return Ordering::Less;
}
return Ordering::Greater;
}
if left_case < right_case {
return Ordering::Greater;
}
return Ordering::Less;
}
if left_non_primary.secondary() == NO_CE_SECONDARY {
break;
}
}
}
}
if let Some(tertiary_mask) = self.options.tertiary_mask() {
let mut any_quaternaries = AnyQuaternaryAccumulator::new();
let mut left_iter = left_ces.iter();
let mut right_iter = right_ces.iter();
loop {
let mut left_non_primary;
let mut left_tertiary;
loop {
left_non_primary = left_iter.next().unwrap_or_default().non_primary();
any_quaternaries.accumulate(left_non_primary);
debug_assert!(
left_non_primary.tertiary() != 0 || left_non_primary.case_quaternary() == 0
);
left_tertiary = left_non_primary.tertiary_case_quarternary(tertiary_mask);
if left_tertiary != 0 {
break;
}
}
let mut right_non_primary;
let mut right_tertiary;
loop {
right_non_primary = right_iter.next().unwrap_or_default().non_primary();
any_quaternaries.accumulate(right_non_primary);
debug_assert!(
right_non_primary.tertiary() != 0
|| right_non_primary.case_quaternary() == 0
);
right_tertiary = right_non_primary.tertiary_case_quarternary(tertiary_mask);
if right_tertiary != 0 {
break;
}
}
if left_tertiary != right_tertiary {
if self.options.upper_first() {
// Pass through NO_CE and keep real tertiary weights larger than that.
// Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
// to keep tertiary CEs well-formed.
// Their case+tertiary weights must be greater than those of
// primary and secondary CEs.
// Magic numbers from ICU4C.
if left_tertiary > NO_CE_TERTIARY {
if left_non_primary.secondary() != 0 {
left_tertiary ^= 0xC000;
} else {
left_tertiary += 0x4000;
}
}
if right_tertiary > NO_CE_TERTIARY {
if right_non_primary.secondary() != 0 {
right_tertiary ^= 0xC000;
} else {
right_tertiary += 0x4000;
}
}
}
if left_tertiary < right_tertiary {
return Ordering::Less;
}
return Ordering::Greater;
}
if left_tertiary == NO_CE_TERTIARY {
break;
}
}
if !any_variable && !any_quaternaries.has_quaternary() {
return Ordering::Equal;
}
} else {
return Ordering::Equal;
}
if self.options.strength() <= Strength::Tertiary {
return Ordering::Equal;
}
let mut left_iter = left_ces.iter();
let mut right_iter = right_ces.iter();
loop {
let mut left_quaternary;
loop {
let ce = left_iter.next().unwrap_or_default();
if ce.tertiary_ignorable() {
left_quaternary = ce.primary();
} else {
left_quaternary = ce.quaternary();
}
if left_quaternary != 0 {
break;
}
}
let mut right_quaternary;
loop {
let ce = right_iter.next().unwrap_or_default();
if ce.tertiary_ignorable() {
right_quaternary = ce.primary();
} else {
right_quaternary = ce.quaternary();
}
if right_quaternary != 0 {
break;
}
}
if left_quaternary != right_quaternary {
if let Some(reordering) = &self.reordering {
left_quaternary = reordering.reorder(left_quaternary);
right_quaternary = reordering.reorder(right_quaternary);
}
if left_quaternary < right_quaternary {
return Ordering::Less;
}
return Ordering::Greater;
}
if left_quaternary == NO_CE_PRIMARY {
break;
}
}
Ordering::Equal
}
fn sort_key_levels(&self) -> u8 {
#[expect(clippy::indexing_slicing)]
let mut levels = LEVEL_MASKS[self.options.strength() as usize];
if self.options.case_level() {
levels |= CASE_LEVEL_FLAG;
}
levels
}
/// Given valid UTF-8, write the sort key bytes up to the collator's strength.
///
/// The bytes are written to an implementor of [`CollationKeySink`], a no-std version of `std::io::Write`.
/// This trait is currently unstable, but it is implemented by `Vec<u8>`, `VecDeque<u8>`,
/// `SmallVec<[u8; N]>`, and `&mut [u8]` (returning an error if the slice is too small).
///
/// If two sort keys generated at the same strength are compared bytewise, the result is
/// the same as a collation comparison of the original strings at that strength.
///
/// For identical strength, the UTF-8 NFD normalization is appended for breaking ties.
///
/// No terminating zero byte is written to the output, so the output is not a valid C
/// string, but the caller may append a zero afterward if a C string is desired.
///
/// ⚠️ Generating a sort key is expensive relative to comparison because to compare, the
/// collator skips identical prefixes before doing more complex comparison. Only use sort
/// keys if you expect to compare them many times so as to amortize the cost of generating
/// them. Measurement of this performance trade-off would be a good idea.
///
/// ⚠️ Sort keys, if stored durably, should be presumed to be invalidated by a CLDR update, a
/// new version of Unicode, or an update to the ICU4X code. Applications using sort keys
/// *must* be prepared to recompute them if required and should take the performance of
/// such an operation into account when deciding to use sort keys.
///
/// ⚠️ If you should store sort keys in a database that is or becomes so large that
/// regenerating sort keys becomes impractical, you should not expect ICU4X to support your
/// using an older, frozen copy of the sort key generation algorithm with a later version
/// of the library.
///
/// # Example
///
/// ```
/// use icu_collator::{
/// options::{CollatorOptions, Strength},
/// Collator,
/// };
/// use icu_locale::locale;
/// let locale = locale!("utf").into();
/// let mut options = CollatorOptions::default();
/// options.strength = Some(Strength::Primary);
/// let collator = Collator::try_new(locale, options).unwrap();
///
/// let mut k1 = Vec::new();
/// let Ok(()) = collator.write_sort_key_to("hello", &mut k1);
/// let mut k2 = Vec::new();
/// let Ok(()) = collator.write_sort_key_to("Héłłö", &mut k2);
/// assert_eq!(k1, k2);
/// ```
pub fn write_sort_key_to<S>(&self, s: &str, sink: &mut S) -> Result<S::Output, S::Error>
where
S: CollationKeySink + ?Sized,
S::State: Default,
{
self.write_sort_key_impl(s.chars_with_trie_default_for_ascii(self.norm_trie()), sink)
}
/// Given potentially invalid UTF-8, write the sort key bytes up to the collator's strength.
///
/// For further details, see [`Self::write_sort_key_to`].
pub fn write_sort_key_utf8_to<S>(&self, s: &[u8], sink: &mut S) -> Result<S::Output, S::Error>
where
S: CollationKeySink + ?Sized,
S::State: Default,
{
self.write_sort_key_impl(s.chars_with_trie_default_for_ascii(self.norm_trie()), sink)
}
/// Given potentially invalid UTF-16, write the sort key bytes up to the collator's strength.
///
/// For further details, see [`Self::write_sort_key_to`].
pub fn write_sort_key_utf16_to<S>(&self, s: &[u16], sink: &mut S) -> Result<S::Output, S::Error>
where
S: CollationKeySink + ?Sized,
S::State: Default,
{
self.write_sort_key_impl(s.chars_with_trie(self.norm_trie()), sink)
}
fn write_sort_key_impl<I, T, S>(
&'data self,
iter: I,
sink: &mut S,
) -> Result<S::Output, S::Error>
where
I: Iterator<Item = (char, u32)> + WithTrie<'data, T, u32> + Clone + 'data,
T: AbstractCodePointTrie<'data, u32> + 'data,
S: CollationKeySink + ?Sized,
S::State: Default,
{
let identical = if self.options.strength() == Strength::Identical {
Some(iter.clone())
} else {
None
};
let mut state = S::State::default();
self.write_sort_key_up_to_quaternary(iter, sink, &mut state)?;
if let Some(iter) = identical {
sink.write_byte(&mut state, LEVEL_SEPARATOR_BYTE)?;
let mut iter = icu_normalizer::new_decomposition(iter, self.tables);
let _ = iter.next(); // Discard the U+0000.
write_identical_level(iter, sink, &mut state)?;
}
sink.finish(state)
}
/// Write the sort key bytes up to the collator's strength.
///
/// Optionally write the case level. Separate levels with the `LEVEL_SEPARATOR_BYTE`, but
/// do not write a terminating zero as with a C string.
fn write_sort_key_up_to_quaternary<I, S, T>(
&'data self,
iter: I,
sink: &mut S,
state: &mut S::State,
) -> Result<(), S::Error>
where
I: Iterator<Item = (char, u32)> + WithTrie<'data, T, u32> + Clone + 'data,
T: AbstractCodePointTrie<'data, u32> + 'data,
S: CollationKeySink + ?Sized,
{
// This algorithm comes from `CollationKeys::writeSortKeyUpToQuaternary` in ICU4C.
let levels = self.sort_key_levels();
let mut iter = collation_elements!(self, iter, self.numeric_primary());
iter.init();
let variable_top = self.variable_top();
let tertiary_mask = self.options.tertiary_mask().unwrap_or_default();
let mut cases = SortKeyLevel::default();
let mut secondaries = SortKeyLevel::default();
let mut tertiaries = SortKeyLevel::default();
let mut quaternaries = SortKeyLevel::default();
let mut prev_reordered_primary = 0;
let mut common_cases = 0usize;
let mut common_secondaries = 0usize;
let mut common_tertiaries = 0usize;
let mut common_quaternaries = 0usize;
let mut prev_secondary = 0;
let mut sec_segment_start = 0;
loop {
let mut ce = iter.next();
let mut p = ce.primary();
if p < variable_top && p > MERGE_SEPARATOR_PRIMARY {
// Variable CE, shift it to quaternary level. Ignore all following primary
// ignorables, and shift further variable CEs.
if common_quaternaries != 0 {
common_quaternaries -= 1;
while common_quaternaries >= QUAT_COMMON[WEIGHT_MAX_COUNT] as _ {
quaternaries.append_byte(QUAT_COMMON[WEIGHT_MIDDLE]);
common_quaternaries -= QUAT_COMMON[WEIGHT_MAX_COUNT] as usize;
}
// Shifted primary weights are lower than the common weight.
quaternaries.append_byte(QUAT_COMMON[WEIGHT_LOW] + common_quaternaries as u8);
common_quaternaries = 0;
}
loop {
if levels & QUATERNARY_LEVEL_FLAG != 0 {
if let Some(reordering) = &self.reordering {
p = reordering.reorder(p);
}
if (p >> 24) as u8 >= QUAT_SHIFTED_LIMIT_BYTE {
// Prevent shifted primary lead bytes from overlapping with the
// common compression range.
quaternaries.append_byte(QUAT_SHIFTED_LIMIT_BYTE);
}
quaternaries.append_weight_32(p);
}
loop {
ce = iter.next();
p = ce.primary();
if p != 0 {
break;
}
}
if !(p < variable_top && p > MERGE_SEPARATOR_PRIMARY) {
break;
}
}
}
// ce could be primary ignorable, or NO_CE, or the merge separator, or a regular
// primary CE, but it is not variable. If ce == NO_CE, then write nothing for the
// primary level but terminate compression on all levels and then exit the loop.
if p > NO_CE_PRIMARY && levels & PRIMARY_LEVEL_FLAG != 0 {
// Test the un-reordered primary for compressibility.
let is_compressible = self.special_primaries.is_compressible((p >> 24) as _);
if let Some(reordering) = &self.reordering {
p = reordering.reorder(p);
}
let p1 = (p >> 24) as u8;
if !is_compressible || p1 != (prev_reordered_primary >> 24) as u8 {
if prev_reordered_primary != 0 {
if p < prev_reordered_primary {
// No primary compression terminator at the end of the level or
// merged segment.
if p1 > MERGE_SEPARATOR_BYTE {
sink.write(state, &[PRIMARY_COMPRESSION_LOW_BYTE])?;
}
} else {
sink.write(state, &[PRIMARY_COMPRESSION_HIGH_BYTE])?;
}
}
sink.write_byte(state, p1)?;
prev_reordered_primary = if is_compressible { p } else { 0 };
}
let p2 = (p >> 16) as u8;
if p2 != 0 {
let (b0, b1, b2) = (p2, (p >> 8) as _, p as _);
sink.write_byte(state, b0)?;
if b1 != 0 {
sink.write_byte(state, b1)?;
if b2 != 0 {
sink.write_byte(state, b2)?;
}
}
}
}
let non_primary = ce.non_primary();
if non_primary.ignorable() {
continue; // completely ignorable, no secondary/case/tertiary/quaternary
}
macro_rules! handle_common {
($key:ident, $w:ident, $common:ident, $weights:ident, $lim:expr) => {
if $common != 0 {
$common -= 1;
while $common >= $weights[WEIGHT_MAX_COUNT] as _ {
$key.append_byte($weights[WEIGHT_MIDDLE]);
$common -= $weights[WEIGHT_MAX_COUNT] as usize;
}
let b = if $w < $lim {
$weights[WEIGHT_LOW] + ($common as u8)
} else {
$weights[WEIGHT_HIGH] - ($common as u8)
};
$key.append_byte(b);
$common = 0;
}
};
($key:ident, $w:ident, $common:ident, $weights:ident) => {
handle_common!($key, $w, $common, $weights, COMMON_WEIGHT16);
};
}
if levels & SECONDARY_LEVEL_FLAG != 0 {
let s = non_primary.secondary();
if s == 0 {
// secondary ignorable
} else if s == COMMON_WEIGHT16
&& (!self.options.backward_second_level() || p != MERGE_SEPARATOR_PRIMARY)
{
// s is a common secondary weight, and backwards-secondary is off or the ce
// is not the merge separator.
common_secondaries += 1;
} else if !self.options.backward_second_level() {
handle_common!(secondaries, s, common_secondaries, SEC_COMMON);
secondaries.append_weight_16(s);
} else {
if common_secondaries != 0 {
common_secondaries -= 1;
// Append reverse weights. The level will be re-reversed later.
let remainder = common_secondaries % SEC_COMMON[WEIGHT_MAX_COUNT] as usize;
let b = if prev_secondary < COMMON_WEIGHT16 {
SEC_COMMON[WEIGHT_LOW] + remainder as u8
} else {
SEC_COMMON[WEIGHT_HIGH] - remainder as u8
};
secondaries.append_byte(b);
common_secondaries -= remainder;
// common_secondaries is now a multiple of SEC_COMMON[WEIGHT_MAX_COUNT]
while common_secondaries > 0 {
// same as >= SEC_COMMON[WEIGHT_MAX_COUNT]
secondaries.append_byte(SEC_COMMON[WEIGHT_MIDDLE]);
common_secondaries -= SEC_COMMON[WEIGHT_MAX_COUNT] as usize;
}
// commonSecondaries == 0
}
if 0 < p && p <= MERGE_SEPARATOR_PRIMARY {
// The backwards secondary level compares secondary weights backwards
// within segments separated by the merge separator (U+FFFE).
let secs = &mut secondaries.buf;
let last = secs.len() - 1;
if sec_segment_start < last {
let mut q = sec_segment_start;
let mut r = last;
// these indices start at valid values and we stop when they cross
#[expect(clippy::indexing_slicing)]
while q < r {
let b = secs[q];
secs[q] = secs[r];
q += 1;
secs[r] = b;
r -= 1;
}
}
let b = if p == NO_CE_PRIMARY {
LEVEL_SEPARATOR_BYTE
} else {
MERGE_SEPARATOR_BYTE
};
secondaries.append_byte(b);
prev_secondary = 0;
sec_segment_start = secondaries.len();
} else {
secondaries.append_reverse_weight_16(s);
prev_secondary = s;
}
}
}
if levels & CASE_LEVEL_FLAG != 0 {
if self.options.strength() == Strength::Primary && p == 0
|| non_primary.bits() <= 0xffff
{
// Primary+caseLevel: Ignore case level weights of primary ignorables.
// Otherwise: Ignore case level weights of secondary ignorables. For
// details see the comments in the CollationCompare class.
} else {
// case bits & tertiary lead byte
let mut c = ((non_primary.bits() >> 8) & 0xff) as u8;
debug_assert_ne!(c & 0xc0, 0xc0);
if c & 0xc0 == 0 && c > LEVEL_SEPARATOR_BYTE {
common_cases += 1;
} else {
if !self.options.upper_first() {
// lower first: Compress common weights to nibbles 1..7..13,
// mixed=14, upper=15. If there are only common (=lowest) weights
// in the whole level, then we need not write anything. Level
// length differences are handled already on the next-higher level.
if common_cases != 0 && (c > LEVEL_SEPARATOR_BYTE || !cases.is_empty())
{
common_cases -= 1;
while common_cases >= CASE_LOWER_FIRST_COMMON[WEIGHT_MAX_COUNT] as _
{
cases.append_byte(CASE_LOWER_FIRST_COMMON[WEIGHT_MIDDLE] << 4);
common_cases -=
CASE_LOWER_FIRST_COMMON[WEIGHT_MAX_COUNT] as usize;
}
let b = if c <= LEVEL_SEPARATOR_BYTE {
CASE_LOWER_FIRST_COMMON[WEIGHT_LOW] + common_cases as u8
} else {
CASE_LOWER_FIRST_COMMON[WEIGHT_HIGH] - common_cases as u8
};
cases.append_byte(b << 4);
common_cases = 0;
}
if c > LEVEL_SEPARATOR_BYTE {
// 14 or 15
c = (CASE_LOWER_FIRST_COMMON[WEIGHT_HIGH] + (c >> 6)) << 4;
}
} else {
// upper first: Compress common weights to nibbles 3..15, mixed=2,
// upper=1. The compressed common case weights only go up from the
// "low" value because with upperFirst the common weight is the
// highest one.
if common_cases != 0 {
common_cases -= 1;
while common_cases >= CASE_UPPER_FIRST_COMMON[WEIGHT_MAX_COUNT] as _
{
cases.append_byte(CASE_UPPER_FIRST_COMMON[WEIGHT_LOW] << 4);
common_cases -=
CASE_UPPER_FIRST_COMMON[WEIGHT_MAX_COUNT] as usize;
}
cases.append_byte(
(CASE_UPPER_FIRST_COMMON[WEIGHT_LOW] + common_cases as u8) << 4,
);
common_cases = 0;
}
if c > LEVEL_SEPARATOR_BYTE {
// 2 or 1
c = (CASE_UPPER_FIRST_COMMON[WEIGHT_LOW] - (c >> 6)) << 4;
}
}
// c is a separator byte 01 or a left-shifted nibble 0x10, 0x20, ...
// 0xf0.
cases.append_byte(c);
}
}
}
if levels & TERTIARY_LEVEL_FLAG != 0 {
let mut t = non_primary.tertiary_case_quarternary(tertiary_mask);
debug_assert_ne!(non_primary.bits() & 0xc000, 0xc000);
if t == COMMON_WEIGHT16 {
common_tertiaries += 1;
} else if tertiary_mask & 0x8000 == 0 {
// Tertiary weights without case bits. Move lead bytes 06..3F to C6..FF
// for a large common-weight range.
handle_common!(tertiaries, t, common_tertiaries, TER_ONLY_COMMON);
if t > COMMON_WEIGHT16 {
t += 0xc000;
}
tertiaries.append_weight_16(t);
} else if !self.options.upper_first() {
// Tertiary weights with caseFirst=lowerFirst. Move lead bytes 06..BF to
// 46..FF for the common-weight range.
handle_common!(tertiaries, t, common_tertiaries, TER_LOWER_FIRST_COMMON);
if t > COMMON_WEIGHT16 {
t += 0x4000;
}
tertiaries.append_weight_16(t);
} else {
// Tertiary weights with caseFirst=upperFirst. Do not change the
// artificial uppercase weight of a tertiary CE (0.0.ut), to keep tertiary
// CEs well-formed. Their case+tertiary weights must be greater than those
// of primary and secondary CEs.
//
// Separator 01 -> 01 (unchanged)
// Lowercase 02..04 -> 82..84 (includes uncased)
// Common weight 05 -> 85..C5 (common-weight compression range)
// Lowercase 06..3F -> C6..FF
// Mixed case 42..7F -> 42..7F
// Uppercase 82..BF -> 02..3F
// Tertiary CE 86..BF -> C6..FF
if t <= NO_CE_TERTIARY {
// Keep separators unchanged.
} else if non_primary.bits() > 0xffff {
// Invert case bits of primary & secondary CEs.
t ^= 0xc000;
if t < (TER_UPPER_FIRST_COMMON[WEIGHT_HIGH] as u16) << 8 {
t -= 0x4000;
}
} else {
// Keep uppercase bits of tertiary CEs.
debug_assert!((0x8600..=0xbfff).contains(&t));
t += 0x4000;
}
handle_common!(
tertiaries,
t,
common_tertiaries,
TER_UPPER_FIRST_COMMON,
(TER_UPPER_FIRST_COMMON[WEIGHT_LOW] as u16) << 8
);
tertiaries.append_weight_16(t);
}
}
if levels & QUATERNARY_LEVEL_FLAG != 0 {
let q = (non_primary.bits() & 0xffff) as u16;
if q & 0xc0 == 0 && q > NO_CE_QUATERNARY {
common_quaternaries += 1;
} else if q == NO_CE_QUATERNARY
&& self.options.alternate_handling() == AlternateHandling::NonIgnorable
&& quaternaries.is_empty()
{
// If alternate=non-ignorable and there are only common quaternary weights,
// then we need not write anything. The only weights greater than the
// merge separator and less than the common weight are shifted primary
// weights, which are not generated for alternate=non-ignorable. There are
// also exactly as many quaternary weights as tertiary weights, so level
// length differences are handled already on tertiary level. Any
// above-common quaternary weight will compare greater regardless.
quaternaries.append_byte(LEVEL_SEPARATOR_BYTE);
} else {
let q = if q == NO_CE_QUATERNARY {
LEVEL_SEPARATOR_BYTE
} else {
(0xfc + ((q >> 6) & 3)) as u8
};
handle_common!(
quaternaries,
q,
common_quaternaries,
QUAT_COMMON,
QUAT_COMMON[WEIGHT_LOW]
);
quaternaries.append_byte(q);
}
}
if (non_primary.bits() >> 24) as u8 == LEVEL_SEPARATOR_BYTE {
break; // ce == NO_CE
}
}
macro_rules! write_level {
($key:ident, $flag:ident) => {
if levels & $flag != 0 {
sink.write(state, &[LEVEL_SEPARATOR_BYTE])?;
sink.write(state, &$key.buf)?;
}
};
}
write_level!(secondaries, SECONDARY_LEVEL_FLAG);
if levels & CASE_LEVEL_FLAG != 0 {
sink.write(state, &[LEVEL_SEPARATOR_BYTE])?;
// Write pairs of nibbles as bytes, except separator bytes as themselves.
let mut b = 0;
if let Some((last, head)) = cases.buf.split_last() {
debug_assert_eq!(*last, 1); // The trailing NO_CE
for c in head {
debug_assert_eq!(*c & 0xf, 0);
debug_assert_ne!(*c, 0);
if b == 0 {
b = *c;
} else {
sink.write_byte(state, b | (*c >> 4))?;
b = 0;
}
}
} else {
debug_assert!(false);
}
if b != 0 {
sink.write_byte(state, b)?;
}
}
write_level!(tertiaries, TERTIARY_LEVEL_FLAG);
write_level!(quaternaries, QUATERNARY_LEVEL_FLAG);
Ok(())
}
}
/// Error indicating that a [`CollationKeySink`] with limited space ran out of space.
#[derive(Debug, PartialEq, Eq)]
pub struct TooSmall {
/// The total length, in bytes, of the sort key.
pub length: usize,
}
impl TooSmall {
pub fn new(length: usize) -> Self {
Self { length }
}
}
/// A [`std::io::Write`]-like trait for writing to a buffer-like object.
///
/// (This crate does not have access to [`std`].)
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. Do not implement or call methods on this trait
/// unless you are prepared for things to occasionally break.
///
/// Graduation tracking issue: [issue #7178](https://github.com/unicode-org/icu4x/issues/7178).
/// </div>
///
/// ✨ *Enabled with the `unstable` Cargo feature.*
pub trait CollationKeySink {
/// The type of error the sink may return.
type Error;
/// An intermediate state object used by the sink, which must implement [`Default`].
type State;
/// A result value indicating the final state of the sink (e.g. a number of bytes written).
type Output;
/// Writes a buffer into the writer.
fn write(&mut self, state: &mut Self::State, buf: &[u8]) -> Result<(), Self::Error>;
/// Write a single byte into the writer.
fn write_byte(&mut self, state: &mut Self::State, b: u8) -> Result<(), Self::Error> {
self.write(state, &[b])
}
/// Finalize any internal sink state (perhaps by flushing a buffer) and return the final
/// output value.
fn finish(&mut self, state: Self::State) -> Result<Self::Output, Self::Error>;
}
impl CollationKeySink for Vec<u8> {
type Error = Infallible;
type State = ();
type Output = ();
fn write(&mut self, _: &mut Self::State, buf: &[u8]) -> Result<(), Self::Error> {
self.extend_from_slice(buf);
Ok(())
}
fn finish(&mut self, _: Self::State) -> Result<Self::Output, Self::Error> {
Ok(())
}
}
impl CollationKeySink for VecDeque<u8> {
type Error = Infallible;
type State = ();
type Output = ();
fn write(&mut self, _: &mut Self::State, buf: &[u8]) -> Result<(), Self::Error> {
self.extend(buf.iter());
Ok(())
}
fn finish(&mut self, _: Self::State) -> Result<Self::Output, Self::Error> {
Ok(())
}
}
impl<const N: usize> CollationKeySink for SmallVec<[u8; N]> {
type Error = Infallible;
type State = ();
type Output = ();
fn write(&mut self, _: &mut Self::State, buf: &[u8]) -> Result<(), Self::Error> {
self.extend_from_slice(buf);
Ok(())
}
fn finish(&mut self, _: Self::State) -> Result<Self::Output, Self::Error> {
Ok(())
}
}
impl CollationKeySink for [u8] {
type Error = TooSmall;
type State = usize;
type Output = usize;
fn write(&mut self, offset: &mut Self::State, buf: &[u8]) -> Result<(), Self::Error> {
if *offset + buf.len() <= self.len() {
// just checked bounds
#[expect(clippy::indexing_slicing)]
self[*offset..*offset + buf.len()].copy_from_slice(buf);
}
*offset += buf.len();
Ok(())
}
fn finish(&mut self, offset: Self::State) -> Result<Self::Output, Self::Error> {
if offset <= self.len() {
Ok(offset)
} else {
Err(TooSmall::new(offset))
}
}
}
#[derive(Default)]
struct SortKeyLevel {
buf: SmallVec<[u8; 40]>,
}
impl SortKeyLevel {
fn len(&self) -> usize {
self.buf.len()
}
fn is_empty(&self) -> bool {
self.buf.is_empty()
}
fn append_byte(&mut self, x: u8) {
self.buf.push(x);
}
fn append_weight_16(&mut self, w: u16) {
debug_assert_ne!(w, 0);
let b0 = (w >> 8) as u8;
let b1 = w as u8;
self.append_byte(b0);
if b1 != 0 {
self.append_byte(b1);
}
}
fn append_reverse_weight_16(&mut self, w: u16) {
debug_assert_ne!(w, 0);
let b0 = (w >> 8) as u8;
let b1 = w as u8;
if b1 != 0 {
self.append_byte(b1);
}
self.append_byte(b0);
}
fn append_weight_32(&mut self, w: u32) {
debug_assert_ne!(w, 0);
let b0 = (w >> 24) as u8;
let b1 = (w >> 16) as u8;
let b2 = (w >> 8) as u8;
let b3 = w as u8;
self.append_byte(b0);
if b1 != 0 {
self.append_byte(b1);
if b2 != 0 {
self.append_byte(b2);
if b3 != 0 {
self.append_byte(b3);
}
}
}
}
}
// The algorithm below (BOCSU or Binary Ordered Compression Scheme for Unicode) is translated
// from the C++ code in ICU4C at icu4c/source/i18n/bocsu.{cpp,h}. The algorithm works by
// converting a sequence of codepoints into a sequence of presumably small differences. See
// the C++ code for a more detailed explanation.
macro_rules! negdivmod {
($n:ident, $d:ident, $m:ident) => {
$m = $n % $d;
$n /= $d;
if $m < 0 {
$n -= 1;
$m += $d;
}
};
}
fn write_diff<S>(mut diff: i32, sink: &mut S, state: &mut S::State) -> Result<(), S::Error>
where
S: CollationKeySink + ?Sized,
{
let mut out = |b| sink.write_byte(state, b);
if diff >= SLOPE_REACH_NEG_1 {
if diff <= SLOPE_REACH_POS_1 {
out((SLOPE_MIDDLE + diff) as _)?;
} else if diff <= SLOPE_REACH_POS_2 {
out((SLOPE_START_POS_2 + (diff / SLOPE_TAIL_COUNT)) as _)?;
out((SLOPE_MIN + diff % SLOPE_TAIL_COUNT) as _)?;
} else if diff <= SLOPE_REACH_POS_3 {
let p2 = SLOPE_MIN + diff % SLOPE_TAIL_COUNT;
diff /= SLOPE_TAIL_COUNT;
let p1 = SLOPE_MIN + diff % SLOPE_TAIL_COUNT;
let p0 = SLOPE_START_POS_3 + (diff / SLOPE_TAIL_COUNT);
out(p0 as _)?;
out(p1 as _)?;
out(p2 as _)?;
} else {
let p3 = SLOPE_MIN + diff % SLOPE_TAIL_COUNT;
diff /= SLOPE_TAIL_COUNT;
let p2 = SLOPE_MIN + diff % SLOPE_TAIL_COUNT;
diff /= SLOPE_TAIL_COUNT;
let p1 = SLOPE_MIN + diff % SLOPE_TAIL_COUNT;
out(SLOPE_MAX as _)?;
out(p1 as _)?;
out(p2 as _)?;
out(p3 as _)?;
}
} else {
let mut m;
if diff >= SLOPE_REACH_NEG_2 {
negdivmod!(diff, SLOPE_TAIL_COUNT, m);
out((SLOPE_START_NEG_2 + diff) as _)?;
out((SLOPE_MIN + m) as _)?;
} else if diff >= SLOPE_REACH_NEG_3 {
negdivmod!(diff, SLOPE_TAIL_COUNT, m);
let p2 = SLOPE_MIN + m;
negdivmod!(diff, SLOPE_TAIL_COUNT, m);
let p1 = SLOPE_MIN + m;
let p0 = SLOPE_START_NEG_3 + diff;
out(p0 as _)?;
out(p1 as _)?;
out(p2 as _)?;
} else {
negdivmod!(diff, SLOPE_TAIL_COUNT, m);
let p3 = SLOPE_MIN + m;
negdivmod!(diff, SLOPE_TAIL_COUNT, m);
let p2 = SLOPE_MIN + m;
negdivmod!(diff, SLOPE_TAIL_COUNT, m);
let p1 = SLOPE_MIN + m;
let _ = diff;
out(SLOPE_MIN as _)?;
out(p1 as _)?;
out(p2 as _)?;
out(p3 as _)?;
}
}
Ok(())
}
fn write_identical_level<I, S>(iter: I, sink: &mut S, state: &mut S::State) -> Result<(), S::Error>
where
I: Iterator<Item = char>,
S: CollationKeySink + ?Sized,
{
let mut prev = 0i32;
for c in iter {
if !(0x4e00..=0xa000).contains(&prev) {
prev = (prev & !0x7f) - SLOPE_REACH_NEG_1;
} else {
// Unihan U+4e00..U+9fa5: double-bytes down from the upper end
prev = 0x9fff - SLOPE_REACH_POS_2;
}
if c == MERGE_SEPARATOR {
sink.write_byte(state, MERGE_SEPARATOR_BYTE)?;
prev = 0;
} else {
let c = c as i32;
write_diff(c - prev, sink, state)?;
prev = c;
}
}
Ok(())
}
#[cfg(test)]
mod test {
use super::*;
use icu_locale::locale;
type Key = Vec<u8>;
fn collator_en(strength: Strength) -> CollatorBorrowed<'static> {
let locale = locale!("en").into();
let mut options = CollatorOptions::default();
options.strength = Some(strength);
Collator::try_new(locale, options).unwrap()
}
fn collator_en_case_level(strength: Strength) -> CollatorBorrowed<'static> {
let locale = locale!("en").into();
let mut options = CollatorOptions::default();
options.strength = Some(strength);
options.case_level = Some(crate::options::CaseLevel::On);
Collator::try_new(locale, options).unwrap()
}
fn keys(strength: Strength) -> (Key, Key, Key) {
let collator = collator_en(strength);
let mut k0 = Vec::new();
let Ok(()) = collator.write_sort_key_to("aabc", &mut k0);
let mut k1 = Vec::new();
let Ok(()) = collator.write_sort_key_to("aAbc", &mut k1);
let mut k2 = Vec::new();
let Ok(()) = collator.write_sort_key_to("áAbc", &mut k2);
(k0, k1, k2)
}
#[test]
fn sort_key_primary() {
let (k0, k1, k2) = keys(Strength::Primary);
assert_eq!(k0, k1);
assert_eq!(k1, k2);
}
#[test]
fn sort_key_secondary() {
let (k0, k1, k2) = keys(Strength::Secondary);
assert_eq!(k0, k1);
assert!(k1 < k2);
}
#[test]
fn sort_key_tertiary() {
let (k0, k1, k2) = keys(Strength::Tertiary);
assert!(k0 < k1);
assert!(k1 < k2);
}
fn collator_ja(strength: Strength) -> CollatorBorrowed<'static> {
let locale = locale!("ja").into();
let mut options = CollatorOptions::default();
options.strength = Some(strength);
Collator::try_new(locale, options).unwrap()
}
fn keys_ja_strs(strength: Strength, s0: &str, s1: &str) -> (Key, Key) {
let collator = collator_ja(strength);
let mut k0 = Vec::new();
let Ok(()) = collator.write_sort_key_to(s0, &mut k0);
let mut k1 = Vec::new();
let Ok(()) = collator.write_sort_key_to(s1, &mut k1);
(k0, k1)
}
fn keys_ja(strength: Strength) -> (Key, Key) {
keys_ja_strs(strength, "あ", "ア")
}
#[test]
fn sort_keys_ja_to_quaternary() {
let (k0, k1) = keys_ja(Strength::Primary);
assert_eq!(k0, k1);
let (k0, k1) = keys_ja(Strength::Secondary);
assert_eq!(k0, k1);
let (k0, k1) = keys_ja(Strength::Tertiary);
assert_eq!(k0, k1);
let (k0, k1) = keys_ja(Strength::Quaternary);
assert!(k0 < k1);
}
#[test]
fn sort_keys_ja_identical() {
let (k0, k1) = keys_ja_strs(Strength::Quaternary, "ア", "ア");
assert_eq!(k0, k1);
let (k0, k1) = keys_ja_strs(Strength::Identical, "ア", "ア");
assert!(k0 < k1);
}
#[test]
fn sort_keys_utf16() {
let collator = collator_en(Strength::Identical);
const STR8: &[u8] = b"hello world!";
let mut k8 = Vec::new();
let Ok(()) = collator.write_sort_key_utf8_to(STR8, &mut k8);
const STR16: &[u16] = &[
0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x21,
];
let mut k16 = Vec::new();
let Ok(()) = collator.write_sort_key_utf16_to(STR16, &mut k16);
assert_eq!(k8, k16);
}
#[test]
fn sort_keys_invalid() {
let collator = collator_en(Strength::Identical);
// some invalid strings
let mut k = Vec::new();
let Ok(()) = collator.write_sort_key_utf8_to(b"\xf0\x90", &mut k);
let mut k = Vec::new();
let Ok(()) = collator.write_sort_key_utf16_to(&[0xdd1e], &mut k);
}
#[test]
fn sort_key_to_vecdeque() {
let collator = collator_en(Strength::Identical);
let mut k0 = Vec::new();
let Ok(()) = collator.write_sort_key_to("áAbc", &mut k0);
let mut k1 = VecDeque::new();
let Ok(()) = collator.write_sort_key_to("áAbc", &mut k1);
assert!(k0.iter().eq(k1.iter()));
}
#[test]
fn sort_key_to_slice() {
let collator = collator_en(Strength::Identical);
let mut k0 = Vec::new();
let Ok(()) = collator.write_sort_key_to("áAbc", &mut k0);
let mut k1 = [0u8; 100];
let len = collator.write_sort_key_to("áAbc", &mut k1[..]).unwrap();
assert_eq!(len, k0.len());
assert!(k0.iter().eq(k1[..len].iter()));
}
#[test]
fn sort_key_to_slice_no_space() {
let collator = collator_en(Strength::Identical);
let mut k = [0u8; 0];
let res = collator.write_sort_key_to("áAbc", &mut k[..]);
assert!(matches!(res, Err(TooSmall { .. })));
}
#[test]
fn sort_key_to_slice_too_long() {
// This runs out of space in write_sort_key_up_to_quaternary.
let collator = collator_en(Strength::Identical);
let mut k = [0u8; 5];
let res = collator.write_sort_key_to("áAbc", &mut k[..]);
assert!(matches!(res, Err(TooSmall { .. })));
}
#[test]
fn sort_key_to_slice_identical_too_long() {
// This runs out of space while appending UTF-8 in the SinkAdapter.
let collator = collator_en(Strength::Identical);
let mut k = [0u8; 22];
let res = collator.write_sort_key_to("áAbc", &mut k[..]);
assert!(matches!(res, Err(TooSmall { .. })));
}
#[test]
fn sort_key_just_right() {
// get the length needed
let collator = collator_en(Strength::Identical);
let mut k = [0u8; 0];
let res = collator.write_sort_key_to("áAbc", &mut k[..]);
let len = res.unwrap_err().length;
// almost enough
let mut k = vec![0u8; len - 1];
let res = collator.write_sort_key_to("áAbc", &mut k[..]);
let len = res.unwrap_err().length;
// just right
let mut k = vec![0u8; len];
let res = collator.write_sort_key_to("áAbc", &mut k[..]);
assert_eq!(res, Ok(len));
}
#[test]
fn sort_key_utf16_slice_too_small() {
let collator = collator_en(Strength::Identical);
const STR16: &[u16] = &[0x68, 0x65, 0x6c, 0x6c, 0x6f];
let mut k = [0u8; 4];
let res = collator.write_sort_key_utf16_to(STR16, &mut k[..]);
assert!(matches!(res, Err(TooSmall { .. })));
}
#[test]
fn sort_key_very_long() {
let collator = collator_en(Strength::Secondary);
let mut k = Vec::new();
let Ok(()) = collator.write_sort_key_to(&"a".repeat(300), &mut k);
}
#[test]
fn sort_key_case_level() {
let collator = collator_en_case_level(Strength::Tertiary);
let mut k = Vec::new();
let Ok(()) = collator.write_sort_key_to("aBc", &mut k);
}
#[test]
fn sort_key_case_level_empty() {
let collator = collator_en_case_level(Strength::Tertiary);
let mut k = Vec::new();
let Ok(()) = collator.write_sort_key_to("", &mut k);
}
fn check_sort_key_less(a: &[u16], b: &[u16]) {
let collator = collator_en(Strength::Identical);
let mut ak = Vec::new();
let Ok(()) = collator.write_sort_key_utf16_to(a, &mut ak);
let mut bk = Vec::new();
let Ok(()) = collator.write_sort_key_utf16_to(b, &mut bk);
assert!(ak < bk, "failed: {a:04x?} - {b:04x?}");
}
#[test]
fn sort_key_fffe_bug_6811() {
check_sort_key_less(
&[0xfffe, 0x0001, 0x0002, 0x0003],
&[0x0001, 0xfffe, 0x0002, 0x0003],
);
check_sort_key_less(
&[0x0001, 0xfffe, 0x0002, 0x0003],
&[0x0001, 0x0002, 0xfffe, 0x0003],
);
check_sort_key_less(
&[0x0001, 0x0002, 0xfffe, 0x0003],
&[0x0001, 0x0002, 0x0003, 0xfffe],
);
check_sort_key_less(&[0xfffe, 0x0000, 0x0000], &[0x0000, 0xfffe, 0x0000]);
check_sort_key_less(&[0x0000, 0xfffe, 0x0000], &[0x0000, 0x0000, 0xfffe]);
}
}