Source code

Revision control

Copy as Markdown

Other Tools

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// Various collation-related algorithms and constants in this file are
// adapted from ICU4C and, therefore, are subject to the ICU license as
// described in LICENSE.
//! This module holds the 64-bit `CollationElement` struct used for
//! the actual comparison, the 32-bit `CollationElement32` struct
//! that's used for storage. (Strictly speaking, the storage is
//! `RawBytesULE<4>`.) And the `CollationElements` iterator adapter
//! that turns an iterator over `char` into an iterator over
//! `CollationElement`. (To match the structure of ICU4C, this isn't
//! a real Rust `Iterator`. Instead of signaling end by returning
//! `None`, it signals end by returning `NO_CE`.)
//!
//! This module also declares various constants that are also used
//! by the `comparison` module.
use core::char::REPLACEMENT_CHARACTER;
use core::marker::PhantomData;
use icu_collections::char16trie::TrieResult;
use icu_collections::codepointtrie::AbstractCodePointTrie;
use icu_collections::codepointtrie::WithTrie;
use icu_normalizer::provider::DecompositionTables;
use icu_properties::props::CanonicalCombiningClass;
use smallvec::SmallVec;
use zerovec::ule::AsULE;
use zerovec::ule::RawBytesULE;
use zerovec::{zeroslice, ZeroSlice};
use crate::provider::CollationData;
/// `true` iff `ce32`, when interpreted as `CollationElement32`,
/// is self-contained.
#[cfg(feature = "datagen")]
pub fn is_self_contained(ce32: u32) -> bool {
CollationElement32::new(ce32)
.to_ce_self_contained()
.is_some()
}
// Start `SmallVec` size constants.
//
// These are the on-stack buffer sizes. If the buffers need
// to grow larger, they are spilled to the heap.
//
// TODO(#2005): Figure out good sizes for these.
/// The number of full 64-bit collation units that get buffered
/// in the primary comparison loop so that they can be examined
/// by the subsequent comparison stregths.
///
/// Note 1: If a primary difference is found, the comparison
/// returns early, so these buffers end up holding all the
/// collation elements only if there is no primary difference.
///
/// Note 2: Unfortunately for now, a sentinel value signaling
/// the end of input gets written into the buffer in addition
/// to the real collation elements.
///
/// This should probably either be halved to 4 on the logic
/// that especially in the presence of the identical prefix
/// optimization, most comparisons return after a couple of
/// primary comparisons or increased to 32 on the logic that
/// such a buffer could better hold a file or human name that
/// differs on secordary or higher level.
pub(crate) const CE_BUFFER_SIZE: usize = 8;
/// The number of extra full 64-bit collation units that have
/// already been computed as part of an operation that yields
/// multiple collation units at a time.
const PENDING_CE_BUFFER_SIZE: usize = 6;
/// Either the identical prefix or the lookahead plus the next
/// upcoming character.
///
/// The longest contraction suffix in CLDR 40 is 7 characters long.
const UPCOMING_CHARACTER_BUFFER_SIZE: usize = 10;
/// The contiguous sequence of combining characters.
const COMBINING_CHARACTER_BUFFER_SIZE: usize = 7;
/// The sequence of digits in the numeric mode.
const DIGIT_BUFFER_SIZE: usize = 8;
/// The number of combining characters that a contraction has
/// matched.
const PENDING_REMOVALS_SIZE: usize = 1;
// End `SmallVec` constants
/// Marker that the decomposition does not round trip via NFC.
///
/// See components/normalizer/trie-value-format.md
pub(crate) const NON_ROUND_TRIP_MARKER: u32 = 1 << 30;
/// Marker that the first character of the decomposition
/// can combine backwards.
///
/// See components/normalizer/trie-value-format.md
pub(crate) const BACKWARD_COMBINING_MARKER: u32 = 1 << 31;
/// Mask for the bits have to be zero for this to be a BMP
/// singleton decomposition, or value baked into the surrogate
/// range.
///
/// See components/normalizer/trie-value-format.md
pub(crate) const HIGH_ZEROS_MASK: u32 = 0x3FFF0000;
/// Mask for the bits have to be zero for this to be a complex
/// decomposition.
///
/// See components/normalizer/trie-value-format.md
pub(crate) const LOW_ZEROS_MASK: u32 = 0xFFE0;
/// Marker value for U+FDFA in NFKD. (Unified with
/// `HANGUL_SYLLABLE_MARKER`, but they differ by
/// `NON_ROUND_TRIP_MARKER`.)
///
/// See components/normalizer/trie-value-format.md
const FDFA_MARKER: u16 = 1;
/// Marker value for Hangul syllables. (Unified with `FDFA_MARKER`,
/// but they differ by `NON_ROUND_TRIP_MARKER`.)
///
/// See components/normalizer/trie-value-format.md
pub(crate) const HANGUL_SYLLABLE_MARKER: u32 = 1;
/// Checks if a trie value carries a (non-zero) canonical
/// combining class.
///
/// See components/normalizer/trie-value-format.md
fn trie_value_has_ccc(trie_value: u32) -> bool {
(trie_value & 0x3FFFFE00) == 0xD800
}
/// Checks if the trie signifies a special non-starter decomposition.
///
/// See components/normalizer/trie-value-format.md
fn trie_value_indicates_special_non_starter_decomposition(trie_value: u32) -> bool {
(trie_value & 0x3FFFFF00) == 0xD900
}
/// Checks if a trie value signifies a character whose decomposition
/// starts with a non-starter.
///
/// See components/normalizer/trie-value-format.md
fn decomposition_starts_with_non_starter(trie_value: u32) -> bool {
trie_value_has_ccc(trie_value)
}
/// Extracts a canonical combining class (possibly zero) from a trie value.
///
/// See components/normalizer/trie-value-format.md
fn ccc_from_trie_value(trie_value: u32) -> CanonicalCombiningClass {
if trie_value_has_ccc(trie_value) {
CanonicalCombiningClass::from_icu4c_value(trie_value as u8)
} else {
CanonicalCombiningClass::NotReordered
}
}
// These constants originate from page 143 of Unicode 14.0
pub(crate) const HANGUL_S_BASE: u32 = 0xAC00;
pub(crate) const HANGUL_L_BASE: u32 = 0x1100;
pub(crate) const HANGUL_V_BASE: u32 = 0x1161;
pub(crate) const HANGUL_T_BASE: u32 = 0x11A7;
pub(crate) const HANGUL_T_COUNT: u32 = 28;
pub(crate) const HANGUL_N_COUNT: u32 = 588;
pub(crate) const HANGUL_S_COUNT: u32 = 11172;
pub(crate) const JAMO_COUNT: usize = 256; // 0x1200 - 0x1100
const COMBINING_DIACRITICS_BASE: usize = 0x0300;
const OPTIMIZED_DIACRITICS_LIMIT: usize = 0x034F;
pub(crate) const OPTIMIZED_DIACRITICS_MAX_COUNT: usize =
OPTIMIZED_DIACRITICS_LIMIT - COMBINING_DIACRITICS_BASE;
pub(crate) const CASE_MASK: u16 = 0xC000;
pub(crate) const TERTIARY_MASK: u16 = 0x3F3F; // ONLY_TERTIARY_MASK in ICU4C
pub(crate) const QUATERNARY_MASK: u16 = 0xC0;
// A CE32 is special if its low byte is this or greater.
// Impossible case bits 11 mark special CE32s.
// This value itself is used to indicate a fallback to the root collation.
const SPECIAL_CE32_LOW_BYTE: u8 = 0xC0;
pub(crate) const FALLBACK_CE32: CollationElement32 =
CollationElement32(SPECIAL_CE32_LOW_BYTE as u32);
const LONG_PRIMARY_CE32_LOW_BYTE: u8 = 0xC1; // SPECIAL_CE32_LOW_BYTE | LONG_PRIMARY_TAG
/// Used only as a placeholder on the indentical prefix path.
/// The requirement is that this CE32 fails the quick mapping to a primary,
/// which is does, because the tag byte is higher than
/// `LONG_PRIMARY_CE32_LOW_BYTE`.
pub(crate) const IDENTICAL_PREFIX_HANGUL_MARKER_CE32: CollationElement32 = CollationElement32(0xC2);
const COMMON_SECONDARY_CE: u64 = 0x05000000;
const COMMON_TERTIARY_CE: u64 = 0x0500;
const COMMON_SEC_AND_TER_CE: u64 = COMMON_SECONDARY_CE | COMMON_TERTIARY_CE;
const UNASSIGNED_IMPLICIT_BYTE: u8 = 0xFE;
// /// Set if there is no match for the single (no-suffix) character itself.
// /// This is only possible if there is a prefix.
// /// In this case, discontiguous contraction matching cannot add combining marks
// /// starting from an empty suffix.
// /// The default CE32 is used anyway if there is no suffix match.
// const CONTRACT_SINGLE_CP_NO_MATCH: u32 = 0x100;
/// Set if the first character of every contraction suffix has lccc!=0.
const CONTRACT_NEXT_CCC: u32 = 0x200;
/// Set if any contraction suffix ends with lccc!=0.
const CONTRACT_TRAILING_CCC: u32 = 0x400;
/// Set if at least one contraction suffix contains a starter
const CONTRACT_HAS_STARTER: u32 = 0x800;
// const NO_CE32: CollationElement32 = CollationElement32::default();
// constants named NO_CE* : End of input. Only used in runtime code, not stored in data.
pub(crate) const NO_CE: CollationElement = CollationElement::default();
pub(crate) const NO_CE_PRIMARY: u32 = 1; // not a left-adjusted weight
// const NO_CE_NON_PRIMARY: NonPrimary = NonPrimary::default();
pub(crate) const NO_CE_SECONDARY: u16 = 0x0100;
pub(crate) const NO_CE_TERTIARY: u16 = 0x0100;
pub(crate) const NO_CE_QUATERNARY: u16 = 0x0100;
const NO_CE_VALUE: u64 =
((NO_CE_PRIMARY as u64) << 32) | ((NO_CE_SECONDARY as u64) << 16) | (NO_CE_TERTIARY as u64); // 0x101000100
pub(crate) const FFFD_PRIMARY: u32 = 0xFFFD0000; // U+FFFD
pub(crate) const FFFD_CE_VALUE: u64 = ((FFFD_PRIMARY as u64) << 32) | COMMON_SEC_AND_TER_CE;
pub(crate) const FFFD_CE: CollationElement = CollationElement(FFFD_CE_VALUE);
pub(crate) const FFFD_CE32_VALUE: u32 = 0xFFFD0505;
pub(crate) const FFFD_CE32: CollationElement32 = CollationElement32(FFFD_CE32_VALUE);
pub(crate) const EMPTY_U16: &ZeroSlice<u16> = zeroslice![];
const SINGLE_REPLACEMENT_CHARACTER_U16: &ZeroSlice<u16> =
zeroslice!(u16; <u16 as AsULE>::ULE::from_unsigned; [REPLACEMENT_CHARACTER as u16]);
pub(crate) const EMPTY_CHAR: &ZeroSlice<char> = zeroslice![];
const SINGLE_REPLACEMENT_CHARACTER_CHAR: &ZeroSlice<char> =
zeroslice!(char; <char as AsULE>::ULE::from_aligned; [REPLACEMENT_CHARACTER]);
/// If `opt` is `Some`, unwrap it. If `None`, panic if debug assertions
/// are enabled and return `default` if debug assertions are not enabled.
///
/// Use this only if the only reason why `opt` could be `None` is bogus
/// data from the provider.
#[inline(always)]
pub(crate) fn unwrap_or_gigo<T>(opt: Option<T>, default: T) -> T {
if let Some(val) = opt {
val
} else {
// GIGO case
debug_assert!(false);
default
}
}
/// Convert a `u32` _obtained from data provider data_ to `char`.
#[inline(always)]
pub(crate) fn char_from_u32(u: u32) -> char {
unwrap_or_gigo(core::char::from_u32(u), REPLACEMENT_CHARACTER)
}
/// Convert a `u16` _obtained from data provider data_ to `char`.
#[inline(always)]
fn char_from_u16(u: u16) -> char {
char_from_u32(u32::from(u))
}
#[inline(always)]
fn in_inclusive_range(c: char, start: char, end: char) -> bool {
u32::from(c).wrapping_sub(u32::from(start)) <= (u32::from(end) - u32::from(start))
}
/// Special-CE32 tags, from bits 3..0 of a special 32-bit CE.
/// Bits 31..8 are available for tag-specific data.
/// Bits 5..4: Reserved. May be used in the future to indicate lccc!=0 and tccc!=0.
#[derive(Eq, PartialEq, Debug)]
#[allow(dead_code)]
#[repr(u8)] // This repr is necessary for transmute safety
pub(crate) enum Tag {
/// Fall back to the base collator.
/// This is the tag value in [`SPECIAL_CE32_LOW_BYTE`] and [`FALLBACK_CE32`].
/// Bits 31..8: Unused, 0.
Fallback = 0,
/// Long-primary CE with [`COMMON_SEC_AND_TER_CE`].
/// Bits 31..8: Three-byte primary.
LongPrimary = 1,
/// Long-secondary CE with zero primary.
/// Bits 31..16: Secondary weight.
/// Bits 15.. 8: Tertiary weight.
LongSecondary = 2,
/// Unused.
/// May be used in the future for single-byte secondary CEs (`SHORT_SECONDARY_TAG`),
/// storing the secondary in bits 31..24, the ccc in bits 23..16,
/// and the tertiary in bits 15..8.
Reserved3 = 3,
/// Latin mini expansions of two simple CEs [pp, 05, tt] [00, ss, 05].
/// Bits 31..24: Single-byte primary weight pp of the first CE.
/// Bits 23..16: Tertiary weight tt of the first CE.
/// Bits 15.. 8: Secondary weight ss of the second CE.
/// Unused by ICU4X, may get repurposed for jamo expansions is Korean search.
LatinExpansion = 4,
/// Points to one or more simple/long-primary/long-secondary 32-bit CE32s.
/// Bits 31..13: Index into `uint32_t` table.
/// Bits 12.. 8: Length=1..31.
Expansion32 = 5,
/// Points to one or more 64-bit CEs.
/// Bits 31..13: Index into CE table.
/// Bits 12.. 8: Length=1..31.
Expansion = 6,
/// Builder data, used only in the `CollationDataBuilder`, not in runtime data.
///
/// If bit 8 is 0: Builder context, points to a list of context-sensitive mappings.
/// Bits 31..13: Index to the builder's list of `ConditionalCE32` for this character.
/// Bits 12.. 9: Unused, 0.
///
/// If bit 8 is 1 (`IS_BUILDER_JAMO_CE32`): Builder-only jamoCE32 value.
/// The builder fetches the Jamo CE32 from the trie.
/// Bits 31..13: Jamo code point.
/// Bits 12.. 9: Unused, 0.
BuilderData = 7,
/// Points to prefix trie.
/// Bits 31..13: Index into prefix/contraction data.
/// Bits 12.. 8: Unused, 0.
Prefix = 8,
/// Points to contraction data.
/// Bits 31..13: Index into prefix/contraction data.
/// Bits 12..11: Unused, 0.
/// Bit 10: `CONTRACT_TRAILING_CCC` flag.
/// Bit 9: `CONTRACT_NEXT_CCC` flag.
/// Bit 8: `CONTRACT_SINGLE_CP_NO_MATCH` flag.
Contraction = 9,
/// Decimal digit.
/// Bits 31..13: Index into `uint32_t` table for non-numeric-collation CE32.
/// Bit 12: Unused, 0.
/// Bits 11.. 8: Digit value 0..9.
Digit = 10,
/// Tag for U+0000, for moving the NUL-termination handling
/// from the regular fastpath into specials-handling code.
/// Bits 31..8: Unused, 0.
/// Not used by ICU4X.
U0000 = 11,
/// Tag for a Hangul syllable.
/// Bits 31..9: Unused, 0.
/// Bit 8: `HANGUL_NO_SPECIAL_JAMO` flag.
/// Not used by ICU4X, may get reused for compressing Hanja expansions.
Hangul = 12,
/// Tag for a lead surrogate code unit.
/// Optional optimization for UTF-16 string processing.
/// Bits 31..10: Unused, 0.
/// 9.. 8: =0: All associated supplementary code points are unassigned-implicit.
/// =1: All associated supplementary code points fall back to the base data.
/// else: (Normally 2) Look up the data for the supplementary code point.
/// Not used by ICU4X.
LeadSurrogate = 13,
/// Tag for CEs with primary weights in code point order.
/// Bits 31..13: Index into CE table, for one data "CE".
/// Bits 12.. 8: Unused, 0.
///
/// This data "CE" has the following bit fields:
/// Bits 63..32: Three-byte primary pppppp00.
/// 31.. 8: Start/base code point of the in-order range.
/// 7: Flag isCompressible primary.
/// 6.. 0: Per-code point primary-weight increment.
Offset = 14,
/// Implicit CE tag. Compute an unassigned-implicit CE.
/// All bits are set (`UNASSIGNED_CE32=0xffffffff`).
Implicit = 15,
}
/// A compressed form of a collation element as stored in the collation
/// data.
///
/// A `CollationElement32` can be "normal" or "special".
/// Bits 7 and 6 are case bits for the "normal" case and setting
/// both is an impossible case bit combination. Hence, "special"
/// `CollationElement32`s are marked by setting both case bits
/// to 1. This is equivalent with the low byte being less than
/// `SPECIAL_CE32_LOW_BYTE` (0xC0, i.e. 0b11000000) in the "normal"
/// case and equal to or greater in the "special" case.
///
/// For the normal case:
/// Bits: 31..16: Primary weight
/// Bits: 15..8: Secondary weight
/// Bits: 7..6: Case bits (cannot both be 1 simultaneously)
/// Bits: 5..0: The high part of the discontiguous tertiary weight
/// (The quaternary weight and the low part of the discontiguous
/// tertiary weight are zero.)
///
/// For the special case:
/// Bits 31..8: tag-specific; see the documentation for `Tag`.
/// Bits 7..6: The specialness marker; both bits set to 1
/// Bits 5..4: Reserved. May be used in the future to indicate lccc!=0 and tccc!=0.
/// Bits 3..0: the tag (bit-compatible with `Tag`)
#[derive(Copy, Clone, PartialEq, Debug)]
pub(crate) struct CollationElement32(u32);
impl CollationElement32 {
#[inline(always)]
pub fn new(bits: u32) -> Self {
CollationElement32(bits)
}
#[inline(always)]
pub fn new_from_ule(ule: RawBytesULE<4>) -> Self {
CollationElement32(u32::from_unaligned(ule))
}
#[inline(always)]
fn low_byte(self) -> u8 {
self.0 as u8
}
#[inline(always)]
pub(crate) fn tag_checked(self) -> Option<Tag> {
let t = self.low_byte();
if t < SPECIAL_CE32_LOW_BYTE {
None
} else {
Some(self.tag())
}
}
/// Returns the tag if this element is special.
/// Non-specialness should first be checked by seeing if either
/// `to_ce_simple_or_long_primary()` or `to_ce_self_contained()`
/// returns non-`None`.
///
/// # Panics
///
/// Panics in debug mode if called on a non-special element.
#[inline(always)]
pub(crate) fn tag(self) -> Tag {
debug_assert!(self.low_byte() >= SPECIAL_CE32_LOW_BYTE);
// Safety: Tag has values 0 to 15, which are filtered for with the 0xF mask.
unsafe { core::mem::transmute(self.low_byte() & 0xF) }
}
/// Simplest possible check for the Latin1 fast path.
#[cfg(feature = "latin1")]
#[inline(always)]
pub fn to_primary_simple(self) -> Option<u32> {
let t = self.low_byte();
if t < SPECIAL_CE32_LOW_BYTE {
// Not special
Some(self.0 & 0xFFFF0000)
} else {
None
}
}
/// Extract only the first primary in the quick check without identical
/// prefix.
#[inline(always)]
pub fn to_primary_in_quick_check(self, data: &CollationData) -> Option<u32> {
let t = self.low_byte();
if t < SPECIAL_CE32_LOW_BYTE {
// Not special
Some(self.0 & 0xFFFF0000)
} else if t == LONG_PRIMARY_CE32_LOW_BYTE {
Some(self.0 - u32::from(t))
} else {
let tag = self.tag();
if tag == Tag::Expansion {
// Hiragana in `ja` tailoring
Some(data.get_primary_from_ces(self.index()))
} else {
None
}
// Note: If we start adding support for more tags,
// we should probably do early exits for contractions
// and potential Hangul syllables before checking
// for expansion.
}
}
/// Expands to 64 bits if the expansion is to a single 64-bit collation
/// element and is not a long-secondary expansion.
#[inline(always)]
pub fn to_ce_simple_or_long_primary(self) -> Option<CollationElement> {
let t = self.low_byte();
if t < SPECIAL_CE32_LOW_BYTE {
// Not special
let as64 = u64::from(self.0);
Some(CollationElement::new(
((as64 & 0xFFFF0000) << 32) | ((as64 & 0xFF00) << 16) | (u64::from(t) << 8),
))
} else if t == LONG_PRIMARY_CE32_LOW_BYTE {
let as64 = u64::from(self.0);
Some(CollationElement::new(
((as64 - u64::from(t)) << 32) | COMMON_SEC_AND_TER_CE,
))
} else {
// Could still be long secondary (or not self-contained at all).
// See `to_ce_self_contained()`.
None
}
}
/// Expands to 64 bits if the expansion is to a single 64-bit collation
/// element.
#[inline(always)]
pub fn to_ce_self_contained(self) -> Option<CollationElement> {
if let Some(ce) = self.to_ce_simple_or_long_primary() {
return Some(ce);
}
if self.tag() == Tag::LongSecondary {
Some(CollationElement::new(u64::from(self.0 & 0xffffff00)))
} else {
None
}
}
/// Expands to 64 bits if the expansion is to a single 64-bit collation
/// element or otherwise returns the collation element for U+FFFD.
#[inline(always)]
pub fn to_ce_self_contained_or_gigo(self) -> CollationElement {
unwrap_or_gigo(self.to_ce_self_contained(), FFFD_CE)
}
/// Gets the length from this element.
///
/// # Panics
///
/// In debug builds if this element doesn't have a length.
#[inline(always)]
pub fn len(self) -> usize {
debug_assert!(self.tag() == Tag::Expansion32 || self.tag() == Tag::Expansion);
((self.0 >> 8) & 31) as usize
}
/// Gets the index from this element.
///
/// # Panics
///
/// In debug builds if this element doesn't have an index.
#[inline(always)]
pub fn index(self) -> usize {
debug_assert!(
self.tag() == Tag::Expansion32
|| self.tag() == Tag::Expansion
|| self.tag() == Tag::Contraction
|| self.tag() == Tag::Digit
|| self.tag() == Tag::Prefix
|| self.tag() == Tag::Offset
);
(self.0 >> 13) as usize
}
#[inline(always)]
pub fn digit(self) -> u8 {
debug_assert!(self.tag() == Tag::Digit);
((self.0 >> 8) & 0xF) as u8
}
#[inline(always)]
pub fn every_suffix_starts_with_combining(self) -> bool {
debug_assert!(self.tag() == Tag::Contraction);
(self.0 & CONTRACT_NEXT_CCC) != 0
}
#[inline(always)]
pub fn at_least_one_suffix_contains_starter(self) -> bool {
debug_assert!(self.tag() == Tag::Contraction);
(self.0 & CONTRACT_HAS_STARTER) != 0
}
#[inline(always)]
pub fn at_least_one_suffix_ends_with_non_starter(self) -> bool {
debug_assert!(self.tag() == Tag::Contraction);
(self.0 & CONTRACT_TRAILING_CCC) != 0
}
}
impl Default for CollationElement32 {
fn default() -> Self {
CollationElement32(1) // NO_CE32
}
}
/// A collation element is a 64-bit value.
///
/// Bits 63..32 are the primary weight.
/// Bits 31..16 are the secondary weight.
/// Bits 15..14 are the case bits.
/// Bits 13..8 and 5..0 are the (bitwise discontiguous) tertiary weight.
/// Bits 7..6 the quaternary weight.
#[derive(Copy, Clone, Debug, PartialEq)]
pub(crate) struct CollationElement(u64);
impl CollationElement {
#[inline(always)]
pub fn new(bits: u64) -> Self {
CollationElement(bits)
}
#[inline(always)]
pub fn new_from_primary(primary: u32) -> Self {
CollationElement((u64::from(primary) << 32) | COMMON_SEC_AND_TER_CE)
}
#[inline(always)]
pub fn new_from_secondary(secondary: u16) -> Self {
CollationElement((u64::from(secondary) << 16) | COMMON_TERTIARY_CE)
}
#[inline(always)]
pub fn new_implicit_from_char(c: char) -> Self {
// Collation::unassignedPrimaryFromCodePoint
// Create a gap before U+0000. Use c-1 for [first unassigned].
let mut c_with_offset = u32::from(c) + 1;
// Fourth byte: 18 values, every 14th byte value (gap of 13).
let mut primary: u32 = 2 + (c_with_offset % 18) * 14;
c_with_offset /= 18;
// Third byte: 254 values
primary |= (2 + (c_with_offset % 254)) << 8;
c_with_offset /= 254;
// Second byte: 251 values 04..FE excluding the primary compression bytes.
primary |= (4 + (c_with_offset % 251)) << 16;
// One lead byte covers all code points (c < 0x1182B4 = 1*251*254*18).
primary |= u32::from(UNASSIGNED_IMPLICIT_BYTE) << 24;
CollationElement::new_from_primary(primary)
}
#[inline(always)]
pub fn clone_with_non_primary_zeroed(self) -> Self {
CollationElement(self.0 & 0xFFFFFFFF00000000)
}
/// Get the primary weight
#[inline(always)]
pub fn primary(self) -> u32 {
(self.0 >> 32) as u32
}
/// Get the non-primary weights
#[inline(always)]
pub fn non_primary(self) -> NonPrimary {
NonPrimary::new(self.0 as u32)
}
/// Get the secondary weight
#[inline(always)]
pub fn secondary(self) -> u16 {
self.non_primary().secondary()
}
#[inline(always)]
pub fn quaternary(self) -> u32 {
self.non_primary().quaternary()
}
#[inline(always)]
pub fn tertiary_ignorable(self) -> bool {
self.non_primary().tertiary_ignorable()
}
#[inline(always)]
pub fn either_half_zero(self) -> bool {
self.primary() == 0 || (self.0 as u32) == 0
}
#[inline(always)]
pub const fn default() -> CollationElement {
CollationElement(NO_CE_VALUE) // NO_CE
}
}
impl Default for CollationElement {
#[inline(always)]
fn default() -> Self {
CollationElement(NO_CE_VALUE) // NO_CE
}
}
impl Default for &CollationElement {
#[inline(always)]
fn default() -> Self {
&CollationElement(NO_CE_VALUE) // NO_CE
}
}
/// The purpose of grouping the non-primary bits
/// into a struct is to allow for a future optimization
/// that specializes code over whether storage for primary
/// weights is needed or not. (I.e. whether to specialize
/// on `CollationElement` or `NonPrimary`.)
#[derive(Copy, Clone, PartialEq, Debug)]
pub(crate) struct NonPrimary(u32);
impl NonPrimary {
/// Constructor
pub fn new(bits: u32) -> Self {
NonPrimary(bits)
}
/// Get the bits
pub fn bits(self) -> u32 {
self.0
}
/// Get the secondary weight
#[inline(always)]
pub fn secondary(self) -> u16 {
(self.0 >> 16) as u16
}
/// Get the case bits as the high two bits of a u16
#[inline(always)]
pub fn case(self) -> u16 {
(self.0 as u16) & CASE_MASK
}
/// Get the tertiary weight as u16 with the high
/// two bits of each half zeroed.
#[inline(always)]
pub fn tertiary(self) -> u16 {
(self.0 as u16) & TERTIARY_MASK
}
#[inline(always)]
pub fn tertiary_ignorable(self) -> bool {
(self.0 as u16) <= NO_CE_TERTIARY
}
/// Get the quaternary weight in the original
/// storage bit positions with the other bits
/// set to one.
#[inline(always)]
pub fn quaternary(self) -> u32 {
self.0 | !(QUATERNARY_MASK as u32)
}
/// Get any combination of tertiary, case, and quaternary
/// by mask.
#[inline(always)]
pub fn tertiary_case_quarternary(self, mask: u16) -> u16 {
debug_assert!((mask & CASE_MASK) == CASE_MASK || (mask & CASE_MASK) == 0);
debug_assert!((mask & TERTIARY_MASK) == TERTIARY_MASK || (mask & TERTIARY_MASK) == 0);
debug_assert!((mask & QUATERNARY_MASK) == QUATERNARY_MASK || (mask & QUATERNARY_MASK) == 0);
(self.0 as u16) & mask
}
#[inline(always)]
pub fn case_quaternary(self) -> u16 {
(self.0 as u16) & (CASE_MASK | QUATERNARY_MASK)
}
#[inline(always)]
pub fn ignorable(self) -> bool {
self.0 == 0
}
}
impl Default for NonPrimary {
#[inline(always)]
fn default() -> Self {
NonPrimary(0x01000100) // Low 32 bits of NO_CE
}
}
/// This struct makes the handling of the `upcoming` buffer
/// easily so that trie lookups are done at most once. However,
/// when `upcoming[0]` is an undecomposed starter, we don't
/// need the ccc yet, and when lookahead has already done the
/// trie lookups, we don't need `trie_value`, as it is implied
/// by ccc.
//
// TODO(#2386): This struct carries redundant information, and
// `upcoming` should be split into a buffer of `CharacterAndClass`
// and an `Option<CharacterAndTrieValue>`, but that refactoring
// isn't 100% necessary, so focusing on data format stability
// for 1.0.
//
// (Deliberately non-`Copy`, because `CharacterAndClass` is
// non-`Copy`.)
#[derive(Debug, Clone)]
pub(crate) struct CharacterAndClassAndTrieValue {
c_and_c: CharacterAndClass,
pub trie_val: u32,
}
impl CharacterAndClassAndTrieValue {
pub fn new_with_non_decomposing_starter(c: char) -> Self {
CharacterAndClassAndTrieValue {
c_and_c: CharacterAndClass::new(c, CanonicalCombiningClass::NotReordered),
trie_val: 0,
}
}
pub fn new_with_non_zero_ccc(c: char, ccc: CanonicalCombiningClass) -> Self {
CharacterAndClassAndTrieValue {
c_and_c: CharacterAndClass::new(c, ccc),
trie_val: 0xD800 | u32::from(ccc.to_icu4c_value()),
}
}
pub fn new_with_non_special_decomposition_trie_val(c: char, trie_val: u32) -> Self {
debug_assert!(!trie_value_indicates_special_non_starter_decomposition(
trie_val
));
CharacterAndClassAndTrieValue {
c_and_c: CharacterAndClass::new_with_trie_value(c, trie_val),
trie_val,
}
}
pub fn new_with_trie_val(c: char, trie_val: u32) -> Self {
if !trie_value_indicates_special_non_starter_decomposition(trie_val) {
CharacterAndClassAndTrieValue {
c_and_c: CharacterAndClass::new_with_trie_value(c, trie_val),
trie_val,
}
} else {
CharacterAndClassAndTrieValue {
c_and_c: CharacterAndClass::new(c, CanonicalCombiningClass::from_icu4c_value(0xFF)),
trie_val,
}
}
}
pub fn decomposition_starts_with_non_starter(&self) -> bool {
decomposition_starts_with_non_starter(self.trie_val)
}
pub fn character(&self) -> char {
self.c_and_c.character()
}
fn ccc(&self) -> CanonicalCombiningClass {
let ret = self.c_and_c.ccc();
debug_assert_ne!(ret, CanonicalCombiningClass::from_icu4c_value(0xFF));
ret
}
}
/// Pack a `char` and a `CanonicalCombiningClass` in
/// 32 bits (the former in the lower 24 bits and the
/// latter in the high 8 bits). The latter can be
/// initialized to 0xFF upon creation, in which case
/// it can be actually set later by calling
/// `set_ccc_from_trie_if_not_already_set`. This is
/// a micro optimization to avoid the Canonical
/// Combining Class trie lookup when there is only
/// one combining character in a sequence. This type
/// is intentionally non-`Copy` to get compiler help
/// in making sure that the class is set on the
/// instance on which it is intended to be set
/// and not on a temporary copy.
///
/// Note that 0xFF is won't be assigned to an actual
/// canonical combining class per definition D104
/// in The Unicode Standard.
//
// NOTE: The Pernosco debugger has special knowledge
// of this struct. Please do not change the bit layout
// or the crate-module-qualified name of this struct
// without coordination.
#[derive(Debug, Clone)]
// Safety invariant: The low 24 bits are a valid char
struct CharacterAndClass(u32);
impl CharacterAndClass {
pub fn new(c: char, ccc: CanonicalCombiningClass) -> Self {
// Safety invariant upheld here: the first half is a valid char
// and the second half does not affect the low 24 bits
CharacterAndClass(u32::from(c) | (u32::from(ccc.to_icu4c_value()) << 24))
}
pub fn new_with_placeholder(c: char) -> Self {
// Safety invariant upheld here: the first half is a valid char
// and the second half does not affect the low 24 bits
CharacterAndClass(u32::from(c) | ((0xFF) << 24))
}
pub fn new_with_trie_value(c: char, trie_value: u32) -> Self {
Self::new(c, ccc_from_trie_value(trie_value))
}
pub fn character(&self) -> char {
// Safety: from the safety invariant, this extracts the low 24 bits
unsafe { char::from_u32_unchecked(self.0 & 0xFF_FFFF) }
}
pub fn ccc(&self) -> CanonicalCombiningClass {
// Safety invariant upheld here: The argument is outside of the low 24 bits,
// and \0 is a valid character
CanonicalCombiningClass::from_icu4c_value((self.0 >> 24) as u8)
}
pub fn character_and_ccc(&self) -> (char, CanonicalCombiningClass) {
(self.character(), self.ccc())
}
pub fn set_ccc_from_trie_if_not_already_set<'data, T: AbstractCodePointTrie<'data, u32>>(
&mut self,
trie: &T,
) {
if self.0 >> 24 != 0xFF {
return;
}
let scalar = self.character();
// Safety invariant upheld here: The first half doesn't affect the lower 24 bits,
// and the second half was taken from the old `self` which had these invariants upheld already.
self.0 = ((ccc_from_trie_value(trie.scalar(scalar)).to_icu4c_value() as u32) << 24)
| u32::from(scalar);
}
}
/// Iterator that transforms an iterator over `char` into an iterator
/// over `CollationElement` with a tailoring.
/// Not a real Rust iterator: Instead of `None` uses `NO_CE` to indicate
/// end of iteration to optimize comparison.
///
/// It is _extremely_ important for performance that `SmallVec`s not be
/// moved. To facilitate move-avoidance, this struct has the following
/// life cycle where `new` returns the struct in a state that is not
/// yet valid for a `next` call until `init` is called:
///
/// 1. `new`.
/// 2. Some number of calls to `iter_next_before_init` and
/// `prepend_upcoming_before_init`.
/// 3. `init`.
/// 4. Some number of calls to `next`.
pub(crate) struct CollationElements<'data, I, T>
where
I: Iterator<Item = (char, u32)> + WithTrie<'data, T, u32>,
T: AbstractCodePointTrie<'data, u32>,
{
/// See components/normalizer/trie-value-format.md for the trie wrapped in `iter`.
iter: I,
/// Already computed but not yet returned `CollationElement`s.
pending: SmallVec<[CollationElement; PENDING_CE_BUFFER_SIZE]>, // TODO(#2005): Figure out good length
/// The index of the next item to be returned from `pending`. The purpose
/// of this index is to avoid moving the rest of the items.
pending_pos: usize,
/// The characters most previously seen (or never-matching placeholders)
/// CLDR, as of 40, has two kinds of prefixes:
/// Prefixes that contain a single starter
/// Prefixes that contain a starter followed by either U+3099 or U+309A
/// Last-pushed is at index 0 and previously-pushed at index 1
prefix: [char; 2],
/// `upcoming` holds the characters that have already been read from
/// `iter` but haven't yet been mapped to `CollationElement`s.
///
/// Typically, `upcoming` holds one character and corresponds semantically
/// to `pending_unnormalized_starter` in `icu::normalizer::Decomposition`.
/// This is why there isn't a move avoidance optimization similar to
/// `pending_pos` above for this buffer. A complex decomposition, a
/// Hangul syllable followed by a non-starter, or lookahead can cause
/// `pending` to hold more than one `char`.
///
/// Invariant: `upcoming` is allowed to become empty only after `iter`
/// has been exhausted.
///
/// Invariant: (Checked by `debug_assert!`) At the start of `next()` call,
/// if `upcoming` isn't empty (with `iter` having been exhausted), the
/// first `char` in `upcoming` must have its decomposition start with a
/// starter.
///
/// TODO: Reverse the order, since now `insert(0, x)` and `remove(0)`
/// are used more often than `push()` and `pop()`.
upcoming: SmallVec<[CharacterAndClassAndTrieValue; UPCOMING_CHARACTER_BUFFER_SIZE]>,
/// The root collation data.
root: &'data CollationData<'data>,
/// Tailoring if applicable.
tailoring: &'data CollationData<'data>,
/// The `CollationElement32` mapping for the Hangul Jamo block.
///
/// Note: in ICU4C the jamo table contains only modern jamo. Here, the jamo table contains the whole Unicode block.
jamo: &'data [<u32 as AsULE>::ULE; JAMO_COUNT],
/// The `CollationElement32` mapping for the Combining Diacritical Marks block.
diacritics: &'data ZeroSlice<u16>,
/// NFD complex decompositions on the BMP
scalars16: &'data ZeroSlice<u16>,
/// NFD complex decompositions on supplementary planes
scalars32: &'data ZeroSlice<char>,
/// If numeric mode is enabled, the 8 high bits of the numeric primary.
/// `None` if disabled.
numeric_primary: Option<u8>,
/// Whether the Lithuanian combining dot above handling is enabled.
lithuanian_dot_above: bool,
/// Whether `upcoming` (except the last item) has been normalized already
upcoming_normalized: bool,
#[cfg(debug_assertions)]
/// Whether `iter` has been exhausted
iter_exhausted: bool,
#[cfg(debug_assertions)]
/// Whether `init` has been called
initialized: bool,
_phantom: PhantomData<T>,
}
impl<'data, I, T> CollationElements<'data, I, T>
where
I: Iterator<Item = (char, u32)> + WithTrie<'data, T, u32> + 'data,
T: AbstractCodePointTrie<'data, u32> + 'data,
{
#[expect(clippy::too_many_arguments)]
pub fn new(
delegate: I,
root: &'data CollationData,
tailoring: &'data CollationData,
jamo: &'data [<u32 as AsULE>::ULE; JAMO_COUNT],
diacritics: &'data ZeroSlice<u16>,
tables: &'data DecompositionTables,
numeric_primary: Option<u8>,
lithuanian_dot_above: bool,
) -> Self {
CollationElements::<I, T> {
iter: delegate,
pending: SmallVec::new(),
pending_pos: 0,
prefix: ['\u{FFFF}'; 2],
upcoming: SmallVec::new(),
root,
tailoring,
jamo,
diacritics,
scalars16: &tables.scalars16,
scalars32: &tables.scalars24,
numeric_primary,
lithuanian_dot_above,
upcoming_normalized: false,
#[cfg(debug_assertions)]
iter_exhausted: false,
#[cfg(debug_assertions)]
initialized: false,
_phantom: PhantomData,
}
}
pub fn iter_next_before_init(&mut self) -> Option<CharacterAndClassAndTrieValue> {
#[cfg(debug_assertions)]
debug_assert!(!self.initialized);
self.iter_next()
}
pub fn prepend_upcoming_before_init(&mut self, c: CharacterAndClassAndTrieValue) {
#[cfg(debug_assertions)]
debug_assert!(!self.initialized);
self.upcoming.insert(0, c);
}
pub fn init(&mut self) {
// TODO: Consider removing the invariant that this method upholds.
#[cfg(debug_assertions)]
{
debug_assert!(!self.initialized);
self.initialized = true;
}
loop {
// Ensure the last item is a starter (unless)
// iter exhausted.
if let Some(last) = self.upcoming.last() {
if last.decomposition_starts_with_non_starter() {
// Not using `while let` to be able to set `iter_exhausted`
loop {
if let Some(ch) = self.iter_next() {
let starter = !ch.decomposition_starts_with_non_starter();
self.upcoming.push(ch);
if starter {
break;
}
} else {
#[cfg(debug_assertions)]
{
self.iter_exhausted = true;
}
break;
}
}
}
if let Some(first) = self.upcoming.first() {
if !first.decomposition_starts_with_non_starter() {
return;
}
}
} else {
// Ensure that `upcoming` starts with a starter in the case where
// we get here with an empty `upcoming` due to the identical prefix
// code exiting right away, because the very first code units differ.
if let Some(ch) = self.iter_next() {
let starter = !ch.decomposition_starts_with_non_starter();
self.upcoming.push(ch);
if starter {
return;
}
// Loop back to uphoad the invariant that `upcoming` ends with
// a character whose decomposition starts with a starter unless
// the iterator has been exhausted.
continue;
} else {
#[cfg(debug_assertions)]
{
self.iter_exhausted = true;
}
return;
}
}
break;
}
// The case where upcoming does not start with a starter.
// Ideally, we'd have something more specialized here that would extract
// the code path that `self.next()` runs after dealing with the U+0000.
self.upcoming.insert(
0,
CharacterAndClassAndTrieValue::new_with_non_decomposing_starter('\u{0}'),
); // Make sure the process always begins with a starter
let _ = self.next(); // Remove the placeholder starter
}
fn iter_next(&mut self) -> Option<CharacterAndClassAndTrieValue> {
let (c, trie_val) = self.iter.next()?;
Some(CharacterAndClassAndTrieValue::new_with_trie_val(
c, trie_val,
))
}
fn next_internal(&mut self) -> Option<CharacterAndClassAndTrieValue> {
if self.upcoming.is_empty() {
return None;
}
// TODO: something more efficient here.
let ret = self.upcoming.remove(0);
if self.upcoming.is_empty() {
if let Some(c) = self.iter_next() {
self.upcoming.push(c);
} else {
#[cfg(debug_assertions)]
{
self.iter_exhausted = true;
}
}
}
Some(ret)
}
fn maybe_gather_combining(&mut self) {
if self.upcoming.len() != 1 {
return;
}
// index has to be in range due to the check above.
// rewriting with `get()` would result in two checks.
#[expect(clippy::indexing_slicing)]
if !self.upcoming[0].decomposition_starts_with_non_starter() {
return;
}
// We now have a single character that decomposes to start with
// a non-starter. Decompose it and assign the real canonical combining class.
let first = self.upcoming.remove(0);
self.push_decomposed_combining(first);
// Not using `while let` to be able to set `iter_exhausted`
loop {
if let Some(ch) = self.iter_next() {
if ch.decomposition_starts_with_non_starter() {
self.push_decomposed_combining(ch);
} else {
// Got a new starter
self.upcoming.push(ch);
break;
}
} else {
#[cfg(debug_assertions)]
{
self.iter_exhausted = true;
}
break;
}
}
}
/// Ensures that `upcoming` is normalized to NFD, except:
/// 1. When the last item is a starter, it isn't necessarily normalized.
/// 2. Hangul syllable are unnormalized.
fn ensure_upcoming_normalized(&mut self) {
if self.upcoming_normalized {
return;
}
self.upcoming_normalized = true;
let without_trailing_starter = if let Some((last, head)) = self.upcoming.split_last() {
if !last.decomposition_starts_with_non_starter() {
if head.is_empty() {
// There is a single starter, which isn't required
// to be normalized.
return;
} else {
head
}
} else {
&self.upcoming[..]
}
} else {
// Make the assertion conditional to make CI happy.
#[cfg(debug_assertions)]
debug_assert!(self.iter_exhausted);
return;
};
// It would be better to attempt to normalize in place, but let's do at
// least this.
if without_trailing_starter.iter().all(|c| {
(c.trie_val
& !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER | HANGUL_SYLLABLE_MARKER))
== 0
}) {
return;
}
let mut unnormalized = core::mem::take(&mut self.upcoming);
let last_index = unnormalized.len() - 1;
// Indexing is for debug assert only.
#[expect(clippy::indexing_slicing)]
{
debug_assert!(!unnormalized[0].decomposition_starts_with_non_starter());
}
let mut start_combining = 0;
for (i, c) in unnormalized.drain(..).enumerate() {
if c.decomposition_starts_with_non_starter() {
self.push_decomposed_combining(c);
} else if i == last_index {
// Indices are in range by construction, so indexing is OK.
#[expect(clippy::indexing_slicing)]
self.upcoming[start_combining..].sort_by_key(|c| c.ccc());
self.upcoming.push(c);
return;
} else {
// Indices are in range by construction, so indexing is OK.
#[expect(clippy::indexing_slicing)]
self.upcoming[start_combining..].sort_by_key(|c| c.ccc());
start_combining = self.push_decomposed_starter(c);
}
}
// Make the assertion conditional to make CI happy.
#[cfg(debug_assertions)]
debug_assert!(self.iter_exhausted);
// Indices are in range by construction, so indexing is OK.
#[expect(clippy::indexing_slicing)]
self.upcoming[start_combining..].sort_by_key(|c| c.ccc());
}
fn push_decomposed_combining(&mut self, c: CharacterAndClassAndTrieValue) {
if !trie_value_indicates_special_non_starter_decomposition(c.trie_val) {
debug_assert!(trie_value_has_ccc(c.trie_val));
self.upcoming.push(c);
return;
}
// The Tibetan special cases are starters that decompose into non-starters.
match c.character() {
'\u{0340}' => {
// COMBINING GRAVE TONE MARK
self.upcoming
.push(CharacterAndClassAndTrieValue::new_with_non_zero_ccc(
'\u{0300}',
CanonicalCombiningClass::Above,
));
}
'\u{0341}' => {
// COMBINING ACUTE TONE MARK
self.upcoming
.push(CharacterAndClassAndTrieValue::new_with_non_zero_ccc(
'\u{0301}',
CanonicalCombiningClass::Above,
));
}
'\u{0343}' => {
// COMBINING GREEK KORONIS
self.upcoming
.push(CharacterAndClassAndTrieValue::new_with_non_zero_ccc(
'\u{0313}',
CanonicalCombiningClass::Above,
));
}
'\u{0344}' => {
// COMBINING GREEK DIALYTIKA TONOS
self.upcoming
.push(CharacterAndClassAndTrieValue::new_with_non_zero_ccc(
'\u{0308}',
CanonicalCombiningClass::Above,
));
self.upcoming
.push(CharacterAndClassAndTrieValue::new_with_non_zero_ccc(
'\u{0301}',
CanonicalCombiningClass::Above,
));
}
'\u{0F73}' => {
// TIBETAN VOWEL SIGN II
self.upcoming
.push(CharacterAndClassAndTrieValue::new_with_non_zero_ccc(
'\u{0F71}',
CanonicalCombiningClass::CCC129,
));
self.upcoming
.push(CharacterAndClassAndTrieValue::new_with_non_zero_ccc(
'\u{0F72}',
CanonicalCombiningClass::CCC130,
));
}
'\u{0F75}' => {
// TIBETAN VOWEL SIGN UU
self.upcoming
.push(CharacterAndClassAndTrieValue::new_with_non_zero_ccc(
'\u{0F71}',
CanonicalCombiningClass::CCC129,
));
self.upcoming
.push(CharacterAndClassAndTrieValue::new_with_non_zero_ccc(
'\u{0F74}',
CanonicalCombiningClass::CCC132,
));
}
'\u{0F81}' => {
// TIBETAN VOWEL SIGN REVERSED II
self.upcoming
.push(CharacterAndClassAndTrieValue::new_with_non_zero_ccc(
'\u{0F71}',
CanonicalCombiningClass::CCC129,
));
self.upcoming
.push(CharacterAndClassAndTrieValue::new_with_non_zero_ccc(
'\u{0F80}',
CanonicalCombiningClass::CCC130,
));
}
_ => {
// GIGO case
debug_assert!(false);
}
}
}
fn push_decomposed_starter(&mut self, c: CharacterAndClassAndTrieValue) -> usize {
let mut search_start_combining = false;
let old_len = self.upcoming.len();
// Not inserting early returns below to keep the same structure
// as in the ce32 mapping code.
// Hangul syllable check omitted, because it's fine not to decompose
// Hangul syllables in lookahead, because Hangul isn't allowed to
// participate in contractions, and the trie default is that a character
// is its own decomposition.
// See components/normalizer/trie-value-format.md
let decomposition = c.trie_val;
if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER))
<= HANGUL_SYLLABLE_MARKER
{
// The character is its own decomposition (or Hangul syllable)
// Set the Canonical Combining Class to zero
self.upcoming.push(
CharacterAndClassAndTrieValue::new_with_non_decomposing_starter(c.character()),
);
} else {
let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0;
let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0;
if !high_zeros && !low_zeros {
// Decomposition into two BMP characters: starter and non-starter
let starter = char_from_u32(decomposition & 0x7FFF);
let low_c = char_from_u32((decomposition >> 15) & 0x7FFF);
self.upcoming
.push(CharacterAndClassAndTrieValue::new_with_non_decomposing_starter(starter));
let trie_value = self.iter.trie().bmp(low_c as u16);
self.upcoming.push(
CharacterAndClassAndTrieValue::new_with_non_special_decomposition_trie_val(
low_c, trie_value,
),
);
} else if high_zeros {
let singleton = decomposition as u16;
debug_assert_ne!(
singleton, FDFA_MARKER,
"How come U+FDFA NFKD marker seen in NFD?"
);
if (singleton & 0xFF00) == 0xD800 {
// We're at the end of the stream, so we aren't dealing with the
// next undecomposed starter but are dealing with an
// already-decomposed non-starter. Just put it back.
self.upcoming.push(c);
// Make the assertion conditional to make CI happy.
#[cfg(debug_assertions)]
debug_assert!(self.iter_exhausted);
} else {
// Decomposition into one BMP character
self.upcoming.push(
CharacterAndClassAndTrieValue::new_with_non_decomposing_starter(
char_from_u16(singleton),
),
);
}
} else {
debug_assert!(low_zeros);
// Only 12 of 14 bits used as of Unicode 16.
let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1;
// Only 3 of 4 bits used as of Unicode 16.
let len_bits = decomposition & 0b1111;
let only_non_starters_in_trail = (decomposition & 0b10000) != 0;
if offset < self.scalars16.len() {
let len = (len_bits + 2) as usize;
for u in unwrap_or_gigo(
self.scalars16.get_subslice(offset..offset + len),
SINGLE_REPLACEMENT_CHARACTER_U16, // single instead of empty for consistency with the other code path
)
.iter()
{
let ch = char_from_u16(u);
let trie_value = self.iter.trie().bmp(u);
self.upcoming
.push(CharacterAndClassAndTrieValue::new_with_non_special_decomposition_trie_val(ch, trie_value));
}
} else {
let len = (len_bits + 1) as usize;
let offset32 = offset - self.scalars16.len();
for ch in unwrap_or_gigo(
self.scalars32.get_subslice(offset32..offset32 + len),
SINGLE_REPLACEMENT_CHARACTER_CHAR, // single instead of empty for consistency with the other code path
)
.iter()
{
let trie_value = self.iter.trie().scalar(ch);
self.upcoming
.push(CharacterAndClassAndTrieValue::new_with_non_special_decomposition_trie_val(ch, trie_value));
}
}
search_start_combining = !only_non_starters_in_trail;
}
}
if search_start_combining {
// The decomposition contains starters. As of Unicode 14,
// There are two possible patterns:
// BMP: starter, starter, non-starter
// Plane 1: starter, starter.
// However, for forward compatibility, support any combination
// and search for the last starter.
let mut i = self.upcoming.len() - 1;
loop {
if let Some(ch) = self.upcoming.get(i) {
if ch.decomposition_starts_with_non_starter() {
i -= 1;
continue;
}
break;
}
// GIGO case
debug_assert!(false);
// This will wrap to zero below
i = usize::MAX;
break;
}
i + 1
} else {
old_len + 1
}
}
// Decomposes `c`, pushes it to `self.upcoming` (unless the character is
// a Hangul syllable; Hangul isn't allowed to participate in contractions),
// gathers the following combining characters from `self.iter` and the following starter.
// Sorts the combining characters and leaves the starter at the end
// unnormalized. The trailing unnormalized starter doesn't get appended if
// `self.iter` is exhausted.
fn push_decomposed_and_gather_combining(&mut self, c: CharacterAndClassAndTrieValue) {
let start_combining = self.push_decomposed_starter(c);
// Not using `while let` to be able to set `iter_exhausted`
loop {
if let Some(ch) = self.iter_next() {
if ch.decomposition_starts_with_non_starter() {
self.push_decomposed_combining(ch);
} else {
// Got a new starter
// Indices are in range by construction, so indexing is OK.
#[expect(clippy::indexing_slicing)]
self.upcoming[start_combining..].sort_by_key(|c| c.ccc());
self.upcoming.push(ch);
return;
}
} else {
#[cfg(debug_assertions)]
{
self.iter_exhausted = true;
}
// Indices are in range by construction, so indexing is OK.
#[expect(clippy::indexing_slicing)]
self.upcoming[start_combining..].sort_by_key(|c| c.ccc());
return;
}
}
}
// Assumption: `pos` starts from zero and increases one by one.
// Indexing is OK, because we check against `len()` and the `pos`
// increases one by one by construction.
#[expect(clippy::indexing_slicing)]
fn look_ahead(&mut self, pos: usize) -> Option<CharacterAndClassAndTrieValue> {
debug_assert!(self.upcoming_normalized);
if pos + 1 == self.upcoming.len() {
let c = self.upcoming.remove(pos);
self.push_decomposed_and_gather_combining(c);
Some(self.upcoming[pos].clone())
} else if pos == self.upcoming.len() {
if let Some(c) = self.iter_next() {
debug_assert!(
false,
"The `upcoming` queue should be empty when iteration `pos` at the end"
);
self.push_decomposed_and_gather_combining(c);
Some(self.upcoming[pos].clone())
} else {
#[cfg(debug_assertions)]
{
self.iter_exhausted = true;
}
None
}
} else {
Some(self.upcoming[pos].clone())
}
}
fn is_next_decomposition_starts_with_starter(&self) -> bool {
if let Some(c_c_tv) = self.upcoming.first() {
!c_c_tv.decomposition_starts_with_non_starter()
} else {
true
}
}
fn prepend_and_sort_non_starter_prefix_of_suffix(&mut self, c: CharacterAndClassAndTrieValue) {
// Add one for the insertion afterwards.
let end = 1 + {
let mut iter = self.upcoming.iter().enumerate();
loop {
if let Some((i, ch)) = iter.next() {
if !ch.decomposition_starts_with_non_starter() {
break i;
}
} else {
#[cfg(debug_assertions)]
{
self.iter_exhausted = true;
}
break self.upcoming.len();
}
}
};
let start = c.decomposition_starts_with_non_starter() as usize;
self.upcoming.insert(0, c);
// Indices in range by construction
#[expect(clippy::indexing_slicing)]
{
let slice: &mut [CharacterAndClassAndTrieValue] = &mut self.upcoming[start..end];
slice.sort_by_key(|cc| cc.ccc());
};
}
fn prefix_push(&mut self, c: char) {
self.prefix[1] = self.prefix[0];
self.prefix[0] = c;
}
/// Micro optimization for doing a simpler write when
/// we know the most recent character was a non-starter
/// that is not a kana voicing mark.
fn mark_prefix_unmatchable(&mut self) {
self.prefix[0] = '\u{FFFF}';
}
pub fn next(&mut self) -> CollationElement {
#[cfg(debug_assertions)]
debug_assert!(self.initialized);
debug_assert!(self.is_next_decomposition_starts_with_starter());
if let Some(&ret) = self.pending.get(self.pending_pos) {
self.pending_pos += 1;
if self.pending_pos == self.pending.len() {
self.pending.clear();
self.pending_pos = 0;
}
return ret;
}
debug_assert_eq!(self.pending_pos, 0);
if let Some(c_c_tv) = self.next_internal() {
let mut c = c_c_tv.character();
let mut ce32;
let mut data: &CollationData = self.tailoring;
// TODO: Should this be a reusable buffer on the struct instead of
// getting re-created on the stack every time?
let mut combining_characters: SmallVec<
[CharacterAndClass; COMBINING_CHARACTER_BUFFER_SIZE],
> = SmallVec::new(); // TODO(#2005): Figure out good length
// Betting that fusing the NFD algorithm into this one at the
// expense of the repetitiveness below, the common cases become
// fast in a way that offsets the lack of the canonical closure.
// The wall of code before the "Slow path" is an attempt to
// optimize based on that bet.
// See components/normalizer/trie-value-format.md
let decomposition = c_c_tv.trie_val;
if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
// The character is its own decomposition
// TODO: This is a bad idea. Make sure the jamo are in the root trie and then
// remove this special case.
let jamo_index = (c as usize).wrapping_sub(HANGUL_L_BASE as usize);
// Attribute belongs on an inner expression, but
#[expect(clippy::indexing_slicing)]
if jamo_index >= self.jamo.len() {
// Note: It might seem like a good idea to reuse the CE32s
// from the identical prefix check here, but the logistics
// actually make everything slower.
ce32 = data.ce32_for_char(c);
if ce32 == FALLBACK_CE32 {
data = self.root;
ce32 = data.ce32_for_char(c);
}
} else {
// The purpose of reading the CE32 from the jamo table instead
// of the trie even in this case is to make it unnecessary
// for all search collation tries to carry a copy of the Hangul
// part of the search root. Instead, all non-Korean tailorings
// can use a shared copy of the non-Korean search jamo table.
//
// TODO(#1941): This isn't actually true with the current jamo
// search expansions!
// TODO(#1941): Instead of having different jamo CE32 table for
// "search" collations, we could instead decompose the archaic
// jamo to the modern approximation sequences here and then map
// those by looking up the modern jamo from the normal root.
// We need to set data to root, because archaic jamo refer to
// the root.
data = self.root;
// Index in range by construction above. Not using `get` with
// `if let` in order to put the likely branch first.
ce32 = CollationElement32::new_from_ule(self.jamo[jamo_index]);
}
if self.is_next_decomposition_starts_with_starter() {
if let Some(ce) = ce32.to_ce_simple_or_long_primary() {
self.prefix_push(c);
return ce;
} else if ce32.tag() == Tag::Contraction
&& ce32.every_suffix_starts_with_combining()
{
// Avoid falling onto the slow path e.g. that letters that
// may contract with a diacritic when we know that it won't
// contract with the next character.
let default = data.get_default(ce32.index());
if let Some(ce) = default.to_ce_simple_or_long_primary() {
self.prefix_push(c);
return ce;
}
}
// TODO(2003): Figure out if it would be an optimization to
// handle `Implicit` and `Offset` tags here.
}
} else {
let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0;
let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0;
if !high_zeros && !low_zeros {
// Decomposition into two BMP characters: starter and non-starter
c = char_from_u32(decomposition & 0x7FFF);
ce32 = data.ce32_for_char(c);
if ce32 == FALLBACK_CE32 {
data = self.root;
ce32 = data.ce32_for_char(c);
}
let combining = char_from_u32((decomposition >> 15) & 0x7FFF);
if self.is_next_decomposition_starts_with_starter() {
let diacritic_index =
(combining as usize).wrapping_sub(COMBINING_DIACRITICS_BASE);
if let Some(secondary) = self.diacritics.get(diacritic_index) {
debug_assert_ne!(combining, '\u{0344}', "Should never have COMBINING GREEK DIALYTIKA TONOS here, since it should have decomposed further.");
if let Some(ce) = ce32.to_ce_simple_or_long_primary() {
let ce_for_combining =
CollationElement::new_from_secondary(secondary);
self.pending.push(ce_for_combining);
self.mark_prefix_unmatchable();
return ce;
}
if ce32.tag() == Tag::Contraction
&& ce32.every_suffix_starts_with_combining()
{
let (default, mut trie) = data.get_default_and_trie(ce32.index());
match trie.next(combining) {
TrieResult::NoMatch | TrieResult::NoValue => {
if let Some(ce) = default.to_ce_simple_or_long_primary() {
let ce_for_combining =
CollationElement::new_from_secondary(secondary);
self.pending.push(ce_for_combining);
self.mark_prefix_unmatchable();
return ce;
}
}
TrieResult::Intermediate(trie_ce32) => {
if !ce32.at_least_one_suffix_contains_starter() {
if let Some(ce) =
CollationElement32::new(trie_ce32 as u32)
.to_ce_simple_or_long_primary()
{
self.mark_prefix_unmatchable();
return ce;
}
}
}
TrieResult::FinalValue(trie_ce32) => {
if let Some(ce) = CollationElement32::new(trie_ce32 as u32)
.to_ce_simple_or_long_primary()
{
self.mark_prefix_unmatchable();
return ce;
}
}
}
}
}
}
combining_characters.push(CharacterAndClass::new_with_placeholder(combining));
} else if high_zeros {
// Do the Hangul check on the character instead of trusting
// the trie value in order not to let GIGO cause unsafety.
let hangul_offset = u32::from(c).wrapping_sub(HANGUL_S_BASE); // SIndex in the spec
if hangul_offset < HANGUL_S_COUNT {
// Hangul syllable
// The math here comes from page 144 of Unicode 14.0
let l = hangul_offset / HANGUL_N_COUNT;
let v = (hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT;
let t = hangul_offset % HANGUL_T_COUNT;
// No prefix matches on Hangul
self.mark_prefix_unmatchable();
// Indexing OK, because indices in range by construction
#[expect(clippy::indexing_slicing)]
if self.is_next_decomposition_starts_with_starter() {
// TODO(#1941): Assuming self-contained CE32s is OK for the root,
// but not currently OK for search collation, which at this time
// do not support tailored Hangul.
self.pending.push(
CollationElement32::new_from_ule(
self.jamo[(HANGUL_V_BASE - HANGUL_L_BASE + v) as usize],
)
.to_ce_self_contained_or_gigo(),
);
if t != 0 {
self.pending.push(
CollationElement32::new_from_ule(
self.jamo[(HANGUL_T_BASE - HANGUL_L_BASE + t) as usize],
)
.to_ce_self_contained_or_gigo(),
);
}
return CollationElement32::new_from_ule(self.jamo[l as usize])
.to_ce_self_contained_or_gigo();
}
// Uphold the invariant that the upcoming character is a starter (or end of stream)
// at the start of the next `next()` call. We uphold this invariant by leaving the
// last jamo unmapped to `CollationElement` in `pending` and instead prepend it to
// `upcoming`.
//
// Indexing OK, because indices in range by construction
#[expect(clippy::indexing_slicing)]
if t != 0 {
self.pending.push(
CollationElement32::new_from_ule(
self.jamo[(HANGUL_V_BASE - HANGUL_L_BASE + v) as usize],
)
.to_ce_self_contained_or_gigo(),
);
self.upcoming.insert(
0,
// Safety: HANGUL_T_BASE is 0x11A7, t is < HANGUL_T_COUNT = 28, so this is definitely
// in range for a char (≤ 0xD800)
CharacterAndClassAndTrieValue::new_with_non_decomposing_starter(
unsafe { core::char::from_u32_unchecked(HANGUL_T_BASE + t) },
),
);
} else {
self.upcoming.insert(
0,
// Safety: HANGUL_V_BASE is 0x1161, v is < HANGUL_N_COUNT = 588, so this is definitely
// in range for a char (≤ 0xD800)
CharacterAndClassAndTrieValue::new_with_non_decomposing_starter(
unsafe { core::char::from_u32_unchecked(HANGUL_V_BASE + v) },
),
);
}
// Indexing OK, because indices in range by construction
#[expect(clippy::indexing_slicing)]
return CollationElement32::new_from_ule(self.jamo[l as usize])
.to_ce_self_contained_or_gigo();
}
let singleton = decomposition as u16;
debug_assert_ne!(
singleton, FDFA_MARKER,
"How come U+FDFA NFKD marker seen in NFD?"
);
// Decomposition into one BMP character
c = char_from_u16(singleton);
ce32 = data.ce32_for_char(c);
if ce32 == FALLBACK_CE32 {
data = self.root;
ce32 = data.ce32_for_char(c);
}
if self.is_next_decomposition_starts_with_starter() {
if let Some(ce) = ce32.to_ce_simple_or_long_primary() {
self.prefix_push(c);
return ce;
}
}
} else {
debug_assert!(low_zeros);
// Only 12 of 14 bits used as of Unicode 16.
let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1;
// Only 3 of 4 bits used as of Unicode 16.
let len_bits = decomposition & 0b1111;
let only_non_starters_in_trail = (decomposition & 0b10000) != 0;
if offset < self.scalars16.len() {
let len = (len_bits + 2) as usize;
let (starter, tail) = self
.scalars16
.get_subslice(offset..offset + len)
.and_then(ZeroSlice::split_first)
.map_or_else(
|| {
// GIGO case
debug_assert!(false);
(REPLACEMENT_CHARACTER, EMPTY_U16)
},
|(first, tail)| (char_from_u16(first), tail),
);
c = starter;
if only_non_starters_in_trail {
for u in tail.iter() {
let char_from_u = char_from_u16(u);
let trie_value = self.iter.trie().bmp(u);
let ccc = ccc_from_trie_value(trie_value);
combining_characters.push(CharacterAndClass::new(char_from_u, ccc));
}
} else {
let mut it = tail.iter();
while let Some(u) = it.next() {
let ch = char_from_u16(u);
let ccc = ccc_from_trie_value(self.iter.trie().bmp(u));
if ccc != CanonicalCombiningClass::NotReordered {
// As of Unicode 14, this branch is never taken.
// It exist for forward compatibility.
combining_characters.push(CharacterAndClass::new(ch, ccc));
continue;
}
// At this point, we might have a single newly-read
// combining character in self.upcoming. In that case, we
// need to buffer up the upcoming combining characters, too,
// in order to make `prepend_and_sort_non_starter_prefix_of_suffix`
// sort the right characters.
self.maybe_gather_combining();
while let Some(u) = it.next_back() {
let tail_char = char_from_u16(u);
let trie_value = self.iter.trie().bmp(u);
self.prepend_and_sort_non_starter_prefix_of_suffix(CharacterAndClassAndTrieValue::new_with_non_special_decomposition_trie_val(tail_char, trie_value));
}
self.prepend_and_sort_non_starter_prefix_of_suffix(
CharacterAndClassAndTrieValue::new_with_non_decomposing_starter(
ch,
),
);
break;
}
}
} else {
let len = (len_bits + 1) as usize;
let offset32 = offset - self.scalars16.len();
let (starter, tail) = self
.scalars32
.get_subslice(offset32..offset32 + len)
.and_then(|slice| slice.split_first())
.unwrap_or_else(|| {
// GIGO case
debug_assert!(false);
(REPLACEMENT_CHARACTER, EMPTY_CHAR)
});
c = starter;
if only_non_starters_in_trail {
for ch in tail.iter() {
let trie_value = self.iter.trie().scalar(ch);
let ccc = ccc_from_trie_value(trie_value);
combining_characters.push(CharacterAndClass::new(ch, ccc));
}
} else {
let mut it = tail.iter();
while let Some(ch) = it.next() {
let ccc = ccc_from_trie_value(self.iter.trie().scalar(ch));
if ccc != CanonicalCombiningClass::NotReordered {
// As of Unicode 14, this branch is never taken.
// It exist for forward compatibility.
combining_characters.push(CharacterAndClass::new(ch, ccc));
continue;
}
// At this point, we might have a single newly-read
// combining character in self.upcoming. In that case, we
// need to buffer up the upcoming combining characters, too,
// in order to make `prepend_and_sort_non_starter_prefix_of_suffix`
// sort the right characters.
self.maybe_gather_combining();
while let Some(tail_char) = it.next_back() {
let trie_value = self.iter.trie().scalar(tail_char);
self.prepend_and_sort_non_starter_prefix_of_suffix(CharacterAndClassAndTrieValue::new_with_non_special_decomposition_trie_val(tail_char, trie_value));
}
self.prepend_and_sort_non_starter_prefix_of_suffix(
CharacterAndClassAndTrieValue::new_with_non_decomposing_starter(
ch,
),
);
break;
}
}
}
ce32 = data.ce32_for_char(c);
if ce32 == FALLBACK_CE32 {
data = self.root;
ce32 = data.ce32_for_char(c);
}
}
}
let mut may_have_contracted_starter = false;
// Slow path
self.collect_combining(&mut combining_characters);
// Now:
// c is the starter character
// ce32 is the CollationElement32 for the starter
// combining_characters contains all the combining characters before
// the next starter sorted by combining class.
let mut looked_ahead = 0;
let mut drain_from_upcoming = 0;
'outer: loop {
'ce32loop: loop {
// TODO(#2002): Ensure that the CE32 flavors in this loop are checked in the optimal
// order given their frequency in real workloads.
if let Some(ce) = ce32.to_ce_self_contained() {
self.pending.push(ce);
break 'ce32loop;
} else {
match ce32.tag() {
Tag::Expansion32 => {
let ce32s = data.get_ce32s(ce32.index(), ce32.len());
for u in ce32s.iter() {
self.pending.push(
CollationElement32::new(u).to_ce_self_contained_or_gigo(),
);
}
break 'ce32loop;
}
Tag::Expansion => {
let ces = data.get_ces(ce32.index(), ce32.len());
for u in ces.iter() {
self.pending.push(CollationElement::new(u));
}
break 'ce32loop;
}
Tag::Prefix => {
let (default, mut trie) = data.get_default_and_trie(ce32.index());
ce32 = default;
for &ch in self.prefix.iter() {
match trie.next(ch) {
TrieResult::NoValue => {}
TrieResult::NoMatch => {
continue 'ce32loop;
}
TrieResult::Intermediate(ce32_i) => {
ce32 = CollationElement32::new(ce32_i as u32);
}
TrieResult::FinalValue(ce32_i) => {
ce32 = CollationElement32::new(ce32_i as u32);
continue 'ce32loop;
}
}
}
continue 'ce32loop;
}
Tag::Contraction => {
let every_suffix_starts_with_combining =
ce32.every_suffix_starts_with_combining();
let at_least_one_suffix_contains_starter =
ce32.at_least_one_suffix_contains_starter();
let at_least_one_suffix_ends_with_non_starter =
ce32.at_least_one_suffix_ends_with_non_starter();
let (default, mut trie) = data.get_default_and_trie(ce32.index());
ce32 = default;
if every_suffix_starts_with_combining
&& combining_characters.is_empty()
{
continue 'ce32loop;
}
let mut longest_matching_state = trie.clone();
let mut longest_matching_index = 0;
let mut attempt = 0;
let mut i = 0;
let mut most_recent_skipped_ccc =
CanonicalCombiningClass::NotReordered;
// TODO(#2001): Pending removals will in practice be small numbers.
// What if we made the item smaller than usize?
let mut pending_removals: SmallVec<[usize; PENDING_REMOVALS_SIZE]> =
SmallVec::new();
while let Some((character, ccc)) =
combining_characters.get(i).map(|c| c.character_and_ccc())
{
match (most_recent_skipped_ccc < ccc, trie.next(character)) {
(true, TrieResult::Intermediate(ce32_i)) => {
let _ = combining_characters.remove(i);
while let Some(idx) = pending_removals.pop() {
combining_characters.remove(idx);
i -= 1; // Adjust for the shortening
}
attempt = 0;
longest_matching_index = i;
longest_matching_state = trie.clone();
ce32 = CollationElement32::new(ce32_i as u32);
}
(true, TrieResult::FinalValue(ce32_i)) => {
let _ = combining_characters.remove(i);
while let Some(idx) = pending_removals.pop() {
combining_characters.remove(idx);
}
ce32 = CollationElement32::new(ce32_i as u32);
continue 'ce32loop;
}
(_, TrieResult::NoValue) => {
pending_removals.push(i);
i += 1;
}
_ => {
pending_removals.clear();
most_recent_skipped_ccc = ccc;
attempt += 1;
i = longest_matching_index + attempt;
trie = longest_matching_state.clone();
}
}
}
if !(at_least_one_suffix_contains_starter
&& combining_characters.is_empty())
{
continue 'ce32loop;
}
// Let's just set this flag here instead of trying to make
// it more granular and, therefore, more error-prone.
// After all, this flag is just about optimizing away one
// `CodePointInversionList` check in the common case.
may_have_contracted_starter = true;
debug_assert!(pending_removals.is_empty());
self.ensure_upcoming_normalized();
loop {
let ahead = self.look_ahead(looked_ahead);
looked_ahead += 1;
if let Some(ch) = ahead {
match trie.next(ch.character()) {
TrieResult::NoValue => {}
TrieResult::NoMatch => {
if !at_least_one_suffix_ends_with_non_starter {
continue 'ce32loop;
}
if !ch.decomposition_starts_with_non_starter() {
continue 'ce32loop;
}
// The last-checked character is non-starter
// and at least one contraction suffix ends
// with a non-starter. Try a discontiguous
// match.
trie = longest_matching_state.clone();
// For clarity, mint a new set of variables that
// behave consistently with the
// `combining_characters` case
let mut longest_matching_index = 0;
let mut attempt = 0;
let mut i = 0;
most_recent_skipped_ccc = ch.ccc();
self.ensure_upcoming_normalized();
loop {
let ahead = self.look_ahead(looked_ahead + i);
if let Some(ch) = ahead {
let ccc = ch.ccc();
if ccc
== CanonicalCombiningClass::NotReordered
{
// If we came here, we had an intervening non-matching
// non-starter, after which we cannot contract another
// starter anymore.
continue 'ce32loop;
}
match (
most_recent_skipped_ccc < ccc,
trie.next(ch.character()),
) {
(
true,
TrieResult::Intermediate(ce32_i),
) => {
let _ = self
.upcoming
.remove(looked_ahead + i);
while let Some(idx) =
pending_removals.pop()
{
self.upcoming
.remove(looked_ahead + idx);
i -= 1; // Adjust for the shortening
}
attempt = 0;
longest_matching_index = i;
longest_matching_state =
trie.clone();
ce32 = CollationElement32::new(
ce32_i as u32,
);
}
(
true,
TrieResult::FinalValue(ce32_i),
) => {
let _ = self
.upcoming
.remove(looked_ahead + i);
while let Some(idx) =
pending_removals.pop()
{
self.upcoming
.remove(looked_ahead + idx);
}
ce32 = CollationElement32::new(
ce32_i as u32,
);
continue 'ce32loop;
}
(_, TrieResult::NoValue) => {
pending_removals.push(i);
i += 1;
}
_ => {
pending_removals.clear();
most_recent_skipped_ccc = ccc;
attempt += 1;
i = longest_matching_index
+ attempt;
trie =
longest_matching_state.clone();
}
}
} else {
continue 'ce32loop;
}
}
}
TrieResult::Intermediate(ce32_i) => {
longest_matching_state = trie.clone();
drain_from_upcoming = looked_ahead;
ce32 = CollationElement32::new(ce32_i as u32);
}
TrieResult::FinalValue(ce32_i) => {
drain_from_upcoming = looked_ahead;
ce32 = CollationElement32::new(ce32_i as u32);
continue 'ce32loop;
}
}
} else {
continue 'ce32loop;
}
}
// Unreachable
}
Tag::Digit => {
if let Some(high_bits) = self.numeric_primary {
let mut digits: SmallVec<[u8; DIGIT_BUFFER_SIZE]> =
SmallVec::new(); // TODO(#2005): Figure out good length
digits.push(ce32.digit());
let numeric_primary = u32::from(high_bits) << 24;
if combining_characters.is_empty() {
// Numeric collation doesn't work with combining
// characters applied to the digits.
// It's unclear if reading from the tailoring first
// is needed for practical purposes, since it doesn't
// make much sense to tailor the numeric value of digits.
// Performing the usual fallback pattern anyway just in
// case.
may_have_contracted_starter = true;
self.ensure_upcoming_normalized();
while let Some(upcoming) = self.look_ahead(looked_ahead) {
looked_ahead += 1;
ce32 =
self.tailoring.ce32_for_char(upcoming.character());
if ce32 == FALLBACK_CE32 {
ce32 =
self.root.ce32_for_char(upcoming.character());
}
if ce32.tag_checked() != Some(Tag::Digit) {
break;
}
drain_from_upcoming = looked_ahead;
digits.push(ce32.digit());
}
}
// Skip leading zeros
let mut zeros = 0;
while let Some(&digit) = digits.get(zeros) {
if digit != 0 {
break;
}
zeros += 1;
}
if zeros == digits.len() {
// All zeros, keep a zero
zeros = digits.len() - 1;
}
// Index in range by construction above
#[expect(clippy::indexing_slicing)]
let mut remaining = &digits[zeros..];
while !remaining.is_empty() {
// Numeric CEs are generated for segments of
// up to 254 digits.
let (head, tail) = remaining
.split_at_checked(254)
.unwrap_or((remaining, b""));
remaining = tail;
// From ICU4C CollationIterator::appendNumericSegmentCEs
if head.len() <= 7 {
let mut digit_iter = head.iter();
// `unwrap` succeeds, because we always have at least one
// digit to even start numeric processing.
#[expect(clippy::unwrap_used)]
let mut value = u32::from(*digit_iter.next().unwrap());
for &digit in digit_iter {
value *= 10;
value += u32::from(digit);
}
// Primary weight second byte values:
// 74 byte values 2.. 75 for small numbers in two-byte primary weights.
// 40 byte values 76..115 for medium numbers in three-byte primary weights.
// 16 byte values 116..131 for large numbers in four-byte primary weights.
// 124 byte values 132..255 for very large numbers with 4..127 digit pairs.
let mut first_byte = 2u32;
let mut num_bytes = 74u32;
if value < num_bytes {
self.pending.push(
CollationElement::new_from_primary(
numeric_primary
| ((first_byte + value) << 16),
),
);
continue;
}
value -= num_bytes;
first_byte += num_bytes;
num_bytes = 40;
if value < num_bytes * 254 {
// Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more.
self.pending.push(
CollationElement::new_from_primary(
numeric_primary
| ((first_byte + value / 254) << 16)
| ((2 + value % 254) << 8),
),
);
continue;
}
value -= num_bytes * 254;
first_byte += num_bytes;
num_bytes = 16;
if value < num_bytes * 254 * 254 {
// Four-byte primary for 10234..1042489=10234+16*254*254-1.
let mut primary =
numeric_primary | (2 + value % 254);
value /= 254;
primary |= (2 + value % 254) << 8;
value /= 254;
primary |= (first_byte + value % 254) << 16;
self.pending.push(
CollationElement::new_from_primary(primary),
);
continue;
}
// original value > 1042489
}
debug_assert!(head.len() >= 7);
// The second primary byte value 132..255 indicates the number of digit pairs (4..127),
// then we generate primary bytes with those pairs.
// Omit trailing 00 pairs.
// Decrement the value for the last pair.
// Set the exponent. 4 pairs->132, 5 pairs->133, ..., 127 pairs->255.
let mut len = head.len();
let num_pairs = (len as u32).div_ceil(2); // as u32 OK, because capped to 254
let mut primary =
numeric_primary | ((132 - 4 + num_pairs) << 16);
// Find the length without trailing 00 pairs.
//
// The indexing below is within bounds due to the following:
//
// * We skipped leading zeros.
// * If `len == 2`: The loop condition is false, because
// `head[len - 2]` isn't a leading zero.
// * If `len == 1`: The loop condition is false, because
// `head[len - 1]` isn't a leading zero, and `&&`
// short-circuits, so the `head[len - 2]` access doesn't
// occur.
#[expect(clippy::indexing_slicing)]
while head[len - 1] == 0 && head[len - 2] == 0 {
len -= 2;
}
// Read the first pair
// Index in bounds by construction above.
#[expect(clippy::indexing_slicing)]
let mut digit_iter = head[..len].iter();
// `unwrap` succeeds by construction
#[expect(clippy::unwrap_used)]
let mut pair = if len & 1 == 1 {
// Only "half a pair" if we have an odd number of digits.
u32::from(*digit_iter.next().unwrap())
} else {
u32::from(*digit_iter.next().unwrap()) * 10
+ u32::from(*digit_iter.next().unwrap())
};
pair = 11 + 2 * pair;
let mut shift = 8u32;
while let (Some(&left), Some(&right)) =
(digit_iter.next(), digit_iter.next())
{
if shift == 0 {
primary |= pair;
self.pending.push(
CollationElement::new_from_primary(primary),
);
primary = numeric_primary;
shift = 16;
} else {
primary |= pair << shift;
shift -= 8;
}
pair =
11 + 2 * (u32::from(left) * 10 + u32::from(right));
}
primary |= (pair - 1) << shift;
self.pending
.push(CollationElement::new_from_primary(primary));
}
break 'ce32loop;
}
ce32 = data.get_ce32(ce32.index());
continue 'ce32loop;
}
Tag::Offset => {
self.pending.push(data.ce_from_offset_ce32(c, ce32));
break 'ce32loop;
}
Tag::Implicit => {
self.pending
.push(CollationElement::new_implicit_from_char(c));
break 'ce32loop;
}
Tag::Fallback
| Tag::Reserved3
| Tag::LongPrimary
| Tag::LongSecondary
| Tag::BuilderData
| Tag::LeadSurrogate
| Tag::LatinExpansion
| Tag::U0000
| Tag::Hangul => {
debug_assert!(false);
// GIGO case
self.pending.push(FFFD_CE);
break 'ce32loop;
}
}
}
}
self.prefix_push(c);
'combining_outer: loop {
debug_assert!(drain_from_upcoming == 0 || combining_characters.is_empty());
let mut i = 0;
'combining: while let Some(ch) =
combining_characters.get(i).map(|c| c.character())
{
c = ch;
let diacritic_index = (c as usize).wrapping_sub(COMBINING_DIACRITICS_BASE);
if let Some(secondary) = self.diacritics.get(diacritic_index) {
// TODO(#2006): unlikely annotation
if c == '\u{0307}' && self.lithuanian_dot_above {
if let Some(next_c) =
combining_characters.get(i + 1).map(|c| c.character())
{
if next_c == '\u{0300}'
|| next_c == '\u{0301}'
|| next_c == '\u{0303}'
{
// Lithuanian contracts COMBINING DOT ABOVE with three other diacritics of the
// same combining class such that the COMBINING DOT ABOVE is ignored for
// collation. Since the combining class is the same, it's valid to simply
// look at the next character in `combining_characters`.
i += 1;
continue 'combining;
}
}
}
self.pending
.push(CollationElement::new_from_secondary(secondary));
self.mark_prefix_unmatchable();
i += 1;
continue 'combining;
}
// `c` is not a table-optimized diacritic.
// Not bothering to micro optimize away the move of the remaining
// part of `combining_characters`.
let _ = combining_characters.drain(..=i);
data = self.tailoring;
ce32 = data.ce32_for_char(c);
if ce32 == FALLBACK_CE32 {
data = self.root;
ce32 = data.ce32_for_char(c);
}
continue 'outer;
}
// Note: The borrow checker didn't like the iterator formulation
// for the loop below, because the `Drain` would have kept `self`
// mutable borrowed when trying to call `prefix_push`. To change
// this, `prefix` and `prefix_push` would need to be refactored
// into a struct.
i = 0;
while i < drain_from_upcoming {
// By construction, `drain_from_upcoming` doesn't exceed `upcoming.len()`
#[expect(clippy::indexing_slicing)]
let ch = self.upcoming[i].character();
self.prefix_push(ch);
i += 1;
}
// TODO(#2004): The above makes prefix out of sync when starter-contracting
// contractions use `pending_removals` instead of `drain_from_upcoming`.
// Do there exist prefixes that overlap contraction suffixes?
// At least as of CLDR 40, the two possible non-starters in prefixes,
// kana voicing marks, shouldn't be participating in Brahmic contractions.
let _ = self.upcoming.drain(..drain_from_upcoming);
if self.upcoming.is_empty() {
// Make the assertion conditional to make CI happy.
#[cfg(debug_assertions)]
debug_assert!(self.iter_exhausted || may_have_contracted_starter);
if let Some(c_c_tv) = self.iter_next() {
self.upcoming.push(c_c_tv);
} else {
#[cfg(debug_assertions)]
{
self.iter_exhausted = true;
}
}
}
if may_have_contracted_starter {
may_have_contracted_starter = false;
if !self.is_next_decomposition_starts_with_starter() {
// We need to loop back and process another round of
// non-starters in order to maintain the invariant of
// `upcoming` on the next call to `next()`.
drain_from_upcoming = 0;
self.collect_combining(&mut combining_characters);
continue 'combining_outer;
}
}
// By construction, we have at least on pending CE by now.
#[expect(clippy::indexing_slicing)]
let ret = self.pending[0];
debug_assert_eq!(self.pending_pos, 0);
if self.pending.len() == 1 {
self.pending.clear();
} else {
self.pending_pos = 1;
}
return ret;
}
}
} else {
NO_CE
}
}
#[inline(always)]
fn collect_combining(
&mut self,
combining_characters: &mut SmallVec<[CharacterAndClass; COMBINING_CHARACTER_BUFFER_SIZE]>,
) {
while !self.is_next_decomposition_starts_with_starter() {
// `unwrap` is OK, because `!self.is_next_decomposition_starts_with_starter()`
// means the `unwrap()` must succeed.
#[expect(clippy::unwrap_used)]
let combining = self.next_internal().unwrap().c_and_c;
let combining_c = combining.character();
if !in_inclusive_range(combining_c, '\u{0340}', '\u{0F81}') {
combining_characters.push(combining);
} else {
// The Tibetan special cases are starters that decompose into non-starters.
match combining_c {
'\u{0340}' => {
// COMBINING GRAVE TONE MARK
combining_characters.push(CharacterAndClass::new(
'\u{0300}',
CanonicalCombiningClass::Above,
));
}
'\u{0341}' => {
// COMBINING ACUTE TONE MARK
combining_characters.push(CharacterAndClass::new(
'\u{0301}',
CanonicalCombiningClass::Above,
));
}
'\u{0343}' => {
// COMBINING GREEK KORONIS
combining_characters.push(CharacterAndClass::new(
'\u{0313}',
CanonicalCombiningClass::Above,
));
}
'\u{0344}' => {
// COMBINING GREEK DIALYTIKA TONOS
combining_characters.push(CharacterAndClass::new(
'\u{0308}',
CanonicalCombiningClass::Above,
));
combining_characters.push(CharacterAndClass::new(
'\u{0301}',
CanonicalCombiningClass::Above,
));
}
'\u{0F73}' => {
// TIBETAN VOWEL SIGN II
combining_characters.push(CharacterAndClass::new(
'\u{0F71}',
CanonicalCombiningClass::CCC129,
));
combining_characters.push(CharacterAndClass::new(
'\u{0F72}',
CanonicalCombiningClass::CCC130,
));
}
'\u{0F75}' => {
// TIBETAN VOWEL SIGN UU
combining_characters.push(CharacterAndClass::new(
'\u{0F71}',
CanonicalCombiningClass::CCC129,
));
combining_characters.push(CharacterAndClass::new(
'\u{0F74}',
CanonicalCombiningClass::CCC132,
));
}
'\u{0F81}' => {
// TIBETAN VOWEL SIGN REVERSED II
combining_characters.push(CharacterAndClass::new(
'\u{0F71}',
CanonicalCombiningClass::CCC129,
));
combining_characters.push(CharacterAndClass::new(
'\u{0F80}',
CanonicalCombiningClass::CCC130,
));
}
_ => {
combining_characters.push(combining);
}
};
}
}
if combining_characters.len() > 1 {
// This optimizes away the class lookup when len() == 1.
// Unclear if this micro optimization is worthwhile.
// In any case, we store the CanonicalCombiningClass in order to
// avoid having to look it up again when deciding whether to proceed
// with a discontiguous match. As a side effect, it also means that
// duplicate lookups aren't needed if the sort below happens to compare
// an item more than once.
combining_characters
.iter_mut()
.for_each(|cc| cc.set_ccc_from_trie_if_not_already_set(self.iter.trie()));
combining_characters.sort_by_key(|cc| cc.ccc());
}
}
}