// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
#![cfg_attr(not(any(test, feature = "std")), no_std)]
#![cfg_attr(
not(test),
deny(
clippy::indexing_slicing,
clippy::unwrap_used,
clippy::expect_used,
clippy::panic,
clippy::exhaustive_structs,
clippy::exhaustive_enums,
missing_debug_implementations,
)
)]
#![warn(missing_docs)]
//! Normalizing text into Unicode Normalization Forms.
//!
//! This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/))
//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
//!
//! # Implementation notes
//!
//! The normalizer operates internally on a lazy iterator over Unicode scalar values (Rust `char`);
//! iterating over guaranteed-valid UTF-8, potentially-invalid UTF-8, and potentially-invalid
//! UTF-16 is a separate step that doesn't leak into the normalizer internals. Ill-formed byte sequences are
//! treated as U+FFFD.
//!
//! The normalizer data layout is not based on the ICU4C design at all. Instead, the normalization
//! data layout is a clean-slate design optimized for the concept of fusing the NFD decomposition
//! into the collator. That is, the decomposing normalizer is a by-product of the collator-motivated
//! data layout.
//!
//! Notably, the decomposition data structure is optimized for a starter decomposing to itself,
//! which is the most common case, and for a starter decomposing to a starter and a non-starter
//! on the Basic Multilingual Plane. In this case, the collator makes use of the
//! knowledge that the second character of such a decomposition is a non-starter. Therefore,
//! decomposition into two starters is handled by a generic fallback path that looks up the
//! decomposition from an array by offset and length instead of baking a BMP starter pair directly
//! into a trie value.
//!
//! The decompositions into non-starters are hard-coded. At present in Unicode, these appear
//! to be special cases falling into three categories:
//!
//! 1. Deprecated combining marks.
//! 2. Particular Tibetan vowel signs.
//! 3. NFKD only: half-width kana voicing marks.
//!
//! Hopefully Unicode never adds more decompositions into non-starters (other than a character
//! decomposing to itself), but if it does, a code update is needed instead of a mere data update.
//!
//! The composing normalizer builds on the decomposing normalizer by performing the canonical
//! composition post-processing per spec. As an optimization, though, the composing normalizer
//! attempts to pass through already-normalized text consisting of starters that never combine
//! backwards and that map to themselves if followed by a character whose decomposition starts
//! with a starter that never combines backwards.
//!
//! As a difference with ICU4C, the composing normalizer has only the simplest possible
//! passthrough (only one inversion list lookup per character in the best case) and the full
//! decompose-then-canonically-compose behavior, whereas ICU4C has other paths between these
//! extremes. The ICU4X collator doesn't make use of the FCD concept at all in order to avoid
//! doing the work of checking whether the FCD condition holds.
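//!
//! # Examples
//!
//! A minimal sketch of decomposing normalization (the constructor requires
//! the `compiled_data` Cargo feature):
//!
//! ```
//! let nfd = icu_normalizer::DecomposingNormalizer::new_nfd();
//! assert_eq!(nfd.normalize("\u{00E4}"), "a\u{0308}"); // ä → a + COMBINING DIAERESIS
//! ```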
extern crate alloc;
mod error;
pub mod properties;
pub mod provider;
pub mod uts46;
pub use crate::error::NormalizerError;
#[doc(no_inline)]
pub use NormalizerError as Error;
use crate::provider::CanonicalDecompositionDataV1Marker;
use crate::provider::CompatibilityDecompositionSupplementV1Marker;
use crate::provider::DecompositionDataV1;
use crate::provider::Uts46DecompositionSupplementV1Marker;
use alloc::string::String;
use alloc::vec::Vec;
use core::char::REPLACEMENT_CHARACTER;
use core::str::from_utf8_unchecked;
use icu_collections::char16trie::Char16Trie;
use icu_collections::char16trie::Char16TrieIterator;
use icu_collections::char16trie::TrieResult;
use icu_collections::codepointtrie::CodePointTrie;
use icu_properties::CanonicalCombiningClass;
use icu_provider::prelude::*;
use provider::CanonicalCompositionsV1Marker;
use provider::CanonicalDecompositionTablesV1Marker;
use provider::CompatibilityDecompositionTablesV1Marker;
use provider::DecompositionSupplementV1;
use provider::DecompositionTablesV1;
use smallvec::SmallVec;
use utf16_iter::Utf16CharsEx;
use utf8_iter::Utf8CharsEx;
use write16::Write16;
use zerofrom::ZeroFrom;
use zerovec::{zeroslice, ZeroSlice};
#[derive(Debug)]
enum SupplementPayloadHolder {
Compatibility(DataPayload<CompatibilityDecompositionSupplementV1Marker>),
Uts46(DataPayload<Uts46DecompositionSupplementV1Marker>),
}
impl SupplementPayloadHolder {
fn get(&self) -> &DecompositionSupplementV1 {
match self {
SupplementPayloadHolder::Compatibility(d) => d.get(),
SupplementPayloadHolder::Uts46(d) => d.get(),
}
}
}
/// Treatment of the ignorable marker (0xFFFFFFFF) in data.
#[derive(Debug, PartialEq, Eq)]
enum IgnorableBehavior {
/// 0xFFFFFFFF in data is not supported.
Unsupported,
/// Ignorables are ignored.
Ignored,
/// Ignorables are treated as singleton decompositions
/// to the REPLACEMENT CHARACTER.
ReplacementCharacter,
}
/// Number of iterations allowed on the fast path before flushing.
/// Since a typical UTF-16 iteration advances over a 2-byte BMP
/// character, this means two memory pages.
/// Intel Core i7-4770 had the best results between 2 and 4 pages
/// when testing powers of two. Apple M1 didn't seem to care
/// about 1, 2, 4, or 8 pages.
///
/// Curiously, the `str` case does not appear to benefit from
/// similar flushing, though the tested monomorphization never
/// passes an error through from `Write`.
const UTF16_FAST_PATH_FLUSH_THRESHOLD: usize = 4096;
/// Marker for UTS 46 ignorables.
const IGNORABLE_MARKER: u32 = 0xFFFFFFFF;
/// Marker for starters that decompose to themselves but may
/// combine backwards under canonical composition.
/// (Main trie only; not used in the supplementary trie.)
const BACKWARD_COMBINING_STARTER_MARKER: u32 = 1;
/// Magic marker trie value for characters whose decomposition
/// starts with a non-starter. The actual decomposition is
/// hard-coded.
const SPECIAL_NON_STARTER_DECOMPOSITION_MARKER: u32 = 2;
/// `u16` version of the previous marker value.
const SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16: u16 = 2;
/// Marker that a complex decomposition isn't round-trippable
/// under re-composition.
const NON_ROUND_TRIP_MARKER: u16 = 1;
/// Checks if a trie value carries a (non-zero) canonical
/// combining class.
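/// (Trie values of the form `0xD8XX` pack the class in the low byte; this
/// range cannot collide with a decomposition, since it consists of
/// surrogate code points. See `attach_supplementary_trie_value`, which
/// builds such a value as `0xD800 | ccc`.)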
fn trie_value_has_ccc(trie_value: u32) -> bool {
(trie_value & 0xFFFFFF00) == 0xD800
}
/// Checks if the trie signifies a special non-starter decomposition.
fn trie_value_indicates_special_non_starter_decomposition(trie_value: u32) -> bool {
trie_value == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER
}
/// Checks if a trie value signifies a character whose decomposition
/// starts with a non-starter.
fn decomposition_starts_with_non_starter(trie_value: u32) -> bool {
trie_value_has_ccc(trie_value)
|| trie_value_indicates_special_non_starter_decomposition(trie_value)
}
/// Extracts a canonical combining class (possibly zero) from a trie value.
///
/// # Panics
///
/// The trie value must not be one that signifies a special non-starter
/// decomposition. (Debug-only)
fn ccc_from_trie_value(trie_value: u32) -> CanonicalCombiningClass {
if trie_value_has_ccc(trie_value) {
CanonicalCombiningClass(trie_value as u8)
} else {
debug_assert_ne!(trie_value, SPECIAL_NON_STARTER_DECOMPOSITION_MARKER);
CanonicalCombiningClass::NotReordered
}
}
/// The tail (everything after the first character) of the NFKD form U+FDFA
/// as 16-bit units.
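/// The leading character of that decomposition, U+0635, is hard-coded at
/// the use site in `decomposing_next`.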
static FDFA_NFKD: [u16; 17] = [
0x644, 0x649, 0x20, 0x627, 0x644, 0x644, 0x647, 0x20, 0x639, 0x644, 0x64A, 0x647, 0x20, 0x648,
0x633, 0x644, 0x645,
];
/// Marker value for U+FDFA in NFKD
const FDFA_MARKER: u16 = 3;
// These constants originate from page 143 of Unicode 14.0
/// Syllable base
const HANGUL_S_BASE: u32 = 0xAC00;
/// Lead jamo base
const HANGUL_L_BASE: u32 = 0x1100;
/// Vowel jamo base
const HANGUL_V_BASE: u32 = 0x1161;
/// Trail jamo base (deliberately off by one to account for the absence of a trail)
const HANGUL_T_BASE: u32 = 0x11A7;
/// Lead jamo count
const HANGUL_L_COUNT: u32 = 19;
/// Vowel jamo count
const HANGUL_V_COUNT: u32 = 21;
/// Trail jamo count (deliberately off by one to account for the absence of a trail)
const HANGUL_T_COUNT: u32 = 28;
/// Vowel jamo count times trail jamo count
const HANGUL_N_COUNT: u32 = 588;
/// Syllable count
const HANGUL_S_COUNT: u32 = 11172;
/// One past the conjoining jamo block
const HANGUL_JAMO_LIMIT: u32 = 0x1200;
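// Worked example of the Hangul arithmetic used below (U+AC01 HANGUL SYLLABLE GAG):
// Decomposition: SIndex = 0xAC01 - HANGUL_S_BASE = 1, so
// L = SIndex / HANGUL_N_COUNT = 0 (U+1100),
// V = (SIndex % HANGUL_N_COUNT) / HANGUL_T_COUNT = 0 (U+1161), and
// T = SIndex % HANGUL_T_COUNT = 1 (HANGUL_T_BASE + 1 = U+11A8).
// Composition: LV = L * HANGUL_N_COUNT + V * HANGUL_T_COUNT = 0,
// LVT = LV + (0x11A8 - HANGUL_T_BASE) = 1, and the composed syllable is
// HANGUL_S_BASE + LVT = 0xAC01.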
/// If `opt` is `Some`, unwrap it. If `None`, panic if debug assertions
/// are enabled and return `default` if debug assertions are not enabled.
///
/// Use this only if the only reason why `opt` could be `None` is bogus
/// data from the provider.
#[inline(always)]
fn unwrap_or_gigo<T>(opt: Option<T>, default: T) -> T {
if let Some(val) = opt {
val
} else {
// GIGO case
debug_assert!(false);
default
}
}
/// Convert a `u32` _obtained from data provider data_ to `char`.
#[inline(always)]
fn char_from_u32(u: u32) -> char {
unwrap_or_gigo(core::char::from_u32(u), REPLACEMENT_CHARACTER)
}
/// Convert a `u16` _obtained from data provider data_ to `char`.
#[inline(always)]
fn char_from_u16(u: u16) -> char {
char_from_u32(u32::from(u))
}
const EMPTY_U16: &ZeroSlice<u16> = zeroslice![];
const EMPTY_CHAR: &ZeroSlice<char> = zeroslice![];
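// The three helpers below perform branchless inclusive-range checks: when
// the value is below `start`, the wrapping subtraction produces a value
// larger than `end - start`, so a single comparison covers both bounds.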
#[inline(always)]
fn in_inclusive_range(c: char, start: char, end: char) -> bool {
u32::from(c).wrapping_sub(u32::from(start)) <= (u32::from(end) - u32::from(start))
}
#[inline(always)]
fn in_inclusive_range32(u: u32, start: u32, end: u32) -> bool {
u.wrapping_sub(start) <= (end - start)
}
#[inline(always)]
fn in_inclusive_range16(u: u16, start: u16, end: u16) -> bool {
u.wrapping_sub(start) <= (end - start)
}
/// Performs canonical composition (including Hangul) on a pair of
/// characters or returns `None` if these characters don't compose.
/// Composition exclusions are taken into account.
#[inline]
fn compose(iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
let v = u32::from(second).wrapping_sub(HANGUL_V_BASE);
if v >= HANGUL_JAMO_LIMIT - HANGUL_V_BASE {
return compose_non_hangul(iter, starter, second);
}
if v < HANGUL_V_COUNT {
let l = u32::from(starter).wrapping_sub(HANGUL_L_BASE);
if l < HANGUL_L_COUNT {
let lv = l * HANGUL_N_COUNT + v * HANGUL_T_COUNT;
// Safe, because the inputs are known to be in range.
return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) });
}
return None;
}
if in_inclusive_range(second, '\u{11A8}', '\u{11C2}') {
let lv = u32::from(starter).wrapping_sub(HANGUL_S_BASE);
if lv < HANGUL_S_COUNT && lv % HANGUL_T_COUNT == 0 {
let lvt = lv + (u32::from(second) - HANGUL_T_BASE);
// Safe, because the inputs are known to be in range.
return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lvt) });
}
}
None
}
/// Performs (non-Hangul) canonical composition on a pair of characters
/// or returns `None` if these characters don't compose. Composition
/// exclusions are taken into account.
fn compose_non_hangul(mut iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
// To make the trie smaller, the pairs are stored second character first.
// Given that this method is used in ways where it's known whether `second`
// is or isn't a starter, we could potentially split the trie into two
// tries depending on whether `second` is a starter.
match iter.next(second) {
TrieResult::NoMatch => None,
TrieResult::NoValue => match iter.next(starter) {
TrieResult::NoMatch => None,
TrieResult::FinalValue(i) => {
if let Some(c) = char::from_u32(i as u32) {
Some(c)
} else {
// GIGO case
debug_assert!(false);
None
}
}
TrieResult::NoValue | TrieResult::Intermediate(_) => {
// GIGO case
debug_assert!(false);
None
}
},
TrieResult::FinalValue(_) | TrieResult::Intermediate(_) => {
// GIGO case
debug_assert!(false);
None
}
}
}
/// Struct for holding together a character and the value
/// looked up for it from the NFD trie in a more explicit
/// way than an anonymous pair.
/// Also holds a flag about the supplementary-trie provenance.
#[derive(Debug, PartialEq, Eq)]
struct CharacterAndTrieValue {
character: char,
trie_val: u32,
from_supplement: bool,
}
impl CharacterAndTrieValue {
#[inline(always)]
pub fn new(c: char, trie_value: u32) -> Self {
CharacterAndTrieValue {
character: c,
trie_val: trie_value,
from_supplement: false,
}
}
#[inline(always)]
pub fn new_from_supplement(c: char, trie_value: u32) -> Self {
CharacterAndTrieValue {
character: c,
trie_val: trie_value,
from_supplement: true,
}
}
#[inline(always)]
pub fn starter_and_decomposes_to_self(&self) -> bool {
if self.trie_val > BACKWARD_COMBINING_STARTER_MARKER {
return false;
}
// Hangul syllables get 0 as their trie value
u32::from(self.character).wrapping_sub(HANGUL_S_BASE) >= HANGUL_S_COUNT
}
#[inline(always)]
pub fn can_combine_backwards(&self) -> bool {
decomposition_starts_with_non_starter(self.trie_val)
|| self.trie_val == BACKWARD_COMBINING_STARTER_MARKER
|| in_inclusive_range32(self.trie_val, 0x1161, 0x11C2)
}
#[inline(always)]
pub fn potential_passthrough(&self) -> bool {
self.potential_passthrough_impl(BACKWARD_COMBINING_STARTER_MARKER)
}
#[inline(always)]
pub fn potential_passthrough_and_cannot_combine_backwards(&self) -> bool {
self.potential_passthrough_impl(0)
}
#[inline(always)]
fn potential_passthrough_impl(&self, bound: u32) -> bool {
// This method looks badly branchy, but most characters
// take the first return.
if self.trie_val <= bound {
return true;
}
if self.from_supplement {
return false;
}
let trail_or_complex = (self.trie_val >> 16) as u16;
if trail_or_complex == 0 {
return false;
}
let lead = self.trie_val as u16;
if lead == 0 {
return true;
}
if lead == NON_ROUND_TRIP_MARKER {
return false;
}
if (trail_or_complex & 0x7F) == 0x3C
&& in_inclusive_range16(trail_or_complex, 0x0900, 0x0BFF)
{
// Nukta
return false;
}
if in_inclusive_range(self.character, '\u{FB1D}', '\u{FB4E}') {
// Hebrew presentation forms
return false;
}
if in_inclusive_range(self.character, '\u{1F71}', '\u{1FFB}') {
// Polytonic Greek with oxia
return false;
}
// To avoid more branchiness, 4 characters that decompose to
// a BMP starter followed by a BMP non-starter are excluded
// from being encoded directly into the trie value and are
// handled as complex decompositions instead. These are:
// U+0F76 TIBETAN VOWEL SIGN VOCALIC R
// U+0F78 TIBETAN VOWEL SIGN VOCALIC L
// U+212B ANGSTROM SIGN
// U+2ADC FORKING
true
}
}
/// Pack a `char` and a `CanonicalCombiningClass` in
/// 32 bits (the former in the lower 24 bits and the
/// latter in the high 8 bits). The latter can be
/// initialized to 0xFF upon creation, in which case
/// it can be actually set later by calling
/// `set_ccc_from_trie_if_not_already_set`. This is
/// a micro optimization to avoid the Canonical
/// Combining Class trie lookup when there is only
/// one combining character in a sequence. This type
/// is intentionally non-`Copy` to get compiler help
/// in making sure that the class is set on the
/// instance on which it is intended to be set
/// and not on a temporary copy.
///
/// Note that 0xFF won't be assigned to an actual
/// canonical combining class per definition D104
/// in The Unicode Standard.
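///
/// For example, packing U+0301 COMBINING ACUTE ACCENT, whose canonical
/// combining class is 230 (0xE6), yields `0xE6000301`.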
//
// NOTE: The Pernosco debugger has special knowledge
// of this struct. Please do not change the bit layout
// or the crate-module-qualified name of this struct
// without coordination.
#[derive(Debug)]
struct CharacterAndClass(u32);
impl CharacterAndClass {
pub fn new(c: char, ccc: CanonicalCombiningClass) -> Self {
CharacterAndClass(u32::from(c) | (u32::from(ccc.0) << 24))
}
pub fn new_with_placeholder(c: char) -> Self {
CharacterAndClass(u32::from(c) | ((0xFF) << 24))
}
pub fn new_with_trie_value(c_tv: CharacterAndTrieValue) -> Self {
Self::new(c_tv.character, ccc_from_trie_value(c_tv.trie_val))
}
pub fn new_starter(c: char) -> Self {
CharacterAndClass(u32::from(c))
}
pub fn character(&self) -> char {
// Safe, because the low 24 bits came from a `char`
// originally.
unsafe { char::from_u32_unchecked(self.0 & 0xFFFFFF) }
}
pub fn ccc(&self) -> CanonicalCombiningClass {
CanonicalCombiningClass((self.0 >> 24) as u8)
}
pub fn character_and_ccc(&self) -> (char, CanonicalCombiningClass) {
(self.character(), self.ccc())
}
pub fn set_ccc_from_trie_if_not_already_set(&mut self, trie: &CodePointTrie<u32>) {
if self.0 >> 24 != 0xFF {
return;
}
let scalar = self.0 & 0xFFFFFF;
self.0 = ((ccc_from_trie_value(trie.get32_u32(scalar)).0 as u32) << 24) | scalar;
}
}
// This function exists as a borrow check helper.
#[inline(always)]
fn sort_slice_by_ccc(slice: &mut [CharacterAndClass], trie: &CodePointTrie<u32>) {
// We don't look up the canonical combining class for starters
// or for single combining characters between starters. When
// there's more than one combining character between starters,
// we look up the canonical combining class for each character
// exactly once.
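// `sort_by_key` is a stable sort, which canonical ordering requires:
// characters with equal combining classes must retain their relative
// order.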
if slice.len() < 2 {
return;
}
slice
.iter_mut()
.for_each(|cc| cc.set_ccc_from_trie_if_not_already_set(trie));
slice.sort_by_key(|cc| cc.ccc());
}
/// An iterator adaptor that turns an `Iterator` over `char` into
/// a lazily-decomposed `char` sequence.
#[derive(Debug)]
pub struct Decomposition<'data, I>
where
I: Iterator<Item = char>,
{
delegate: I,
buffer: SmallVec<[CharacterAndClass; 17]>, // Enough to hold NFKD for U+FDFA
/// The index of the next item to be read from `buffer`.
/// The purpose of this index is to avoid having to move
/// the rest upon every read.
buffer_pos: usize,
// If not `None` at the start of `next()`, this is a pending unnormalized
// starter. When `Decomposition` appears alone, this is never a non-starter.
// However, when `Decomposition` appears inside a `Composition`, this
// may become a non-starter before `decomposing_next()` is called.
pending: Option<CharacterAndTrieValue>, // None at end of stream
trie: &'data CodePointTrie<'data, u32>,
supplementary_trie: Option<&'data CodePointTrie<'data, u32>>,
scalars16: &'data ZeroSlice<u16>,
scalars24: &'data ZeroSlice<char>,
supplementary_scalars16: &'data ZeroSlice<u16>,
supplementary_scalars24: &'data ZeroSlice<char>,
half_width_voicing_marks_become_non_starters: bool,
/// The lowest character for which either of the following does
/// not hold:
/// 1. Decomposes to self.
/// 2. Decomposition starts with a non-starter
decomposition_passthrough_bound: u32, // never above 0xC0
ignorable_behavior: IgnorableBehavior, // Arguably should be a type parameter
}
impl<'data, I> Decomposition<'data, I>
where
I: Iterator<Item = char>,
{
/// Constructs a decomposing iterator adapter from a delegate
/// iterator and references to the necessary data, without
/// supplementary data.
///
/// Use `DecomposingNormalizer::normalize_iter()` instead unless
/// there's a good reason to use this constructor directly.
///
/// Public but hidden in order to be able to use this from the
/// collator.
#[doc(hidden)]
pub fn new(
delegate: I,
decompositions: &'data DecompositionDataV1,
tables: &'data DecompositionTablesV1,
) -> Self {
Self::new_with_supplements(
delegate,
decompositions,
None,
tables,
None,
0xC0,
IgnorableBehavior::Unsupported,
)
}
/// Constructs a decomposing iterator adapter from a delegate
/// iterator and references to the necessary data, including
/// supplementary data.
///
/// Use `DecomposingNormalizer::normalize_iter()` instead unless
/// there's a good reason to use this constructor directly.
fn new_with_supplements(
delegate: I,
decompositions: &'data DecompositionDataV1,
supplementary_decompositions: Option<&'data DecompositionSupplementV1>,
tables: &'data DecompositionTablesV1,
supplementary_tables: Option<&'data DecompositionTablesV1>,
decomposition_passthrough_bound: u8,
ignorable_behavior: IgnorableBehavior,
) -> Self {
let half_width_voicing_marks_become_non_starters =
if let Some(supplementary) = supplementary_decompositions {
supplementary.half_width_voicing_marks_become_non_starters()
} else {
false
};
let mut ret = Decomposition::<I> {
delegate,
buffer: SmallVec::new(), // Normalized
buffer_pos: 0,
// Initialize with a placeholder starter in case
// the real stream starts with a non-starter.
pending: Some(CharacterAndTrieValue::new('\u{FFFF}', 0)),
trie: &decompositions.trie,
supplementary_trie: supplementary_decompositions.map(|s| &s.trie),
scalars16: &tables.scalars16,
scalars24: &tables.scalars24,
supplementary_scalars16: if let Some(supplementary) = supplementary_tables {
&supplementary.scalars16
} else {
EMPTY_U16
},
supplementary_scalars24: if let Some(supplementary) = supplementary_tables {
&supplementary.scalars24
} else {
EMPTY_CHAR
},
half_width_voicing_marks_become_non_starters,
decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound),
ignorable_behavior,
};
let _ = ret.next(); // Remove the U+FFFF placeholder
ret
}
fn push_decomposition16(
&mut self,
low: u16,
offset: usize,
slice16: &ZeroSlice<u16>,
) -> (char, usize) {
let len = usize::from(low >> 13) + 2;
let (starter, tail) = slice16
.get_subslice(offset..offset + len)
.and_then(|slice| slice.split_first())
.map_or_else(
|| {
// GIGO case
debug_assert!(false);
(REPLACEMENT_CHARACTER, EMPTY_U16)
},
|(first, trail)| (char_from_u16(first), trail),
);
if low & 0x1000 != 0 {
// All the rest are combining
self.buffer.extend(
tail.iter()
.map(|u| CharacterAndClass::new_with_placeholder(char_from_u16(u))),
);
(starter, 0)
} else {
let mut i = 0;
let mut combining_start = 0;
for u in tail.iter() {
let ch = char_from_u16(u);
let trie_value = self.trie.get(ch);
self.buffer.push(CharacterAndClass::new_with_trie_value(
CharacterAndTrieValue::new(ch, trie_value),
));
i += 1;
// Half-width kana and iota subscript don't occur in the tails
// of these multicharacter decompositions.
if !decomposition_starts_with_non_starter(trie_value) {
combining_start = i;
}
}
(starter, combining_start)
}
}
fn push_decomposition32(
&mut self,
low: u16,
offset: usize,
slice32: &ZeroSlice<char>,
) -> (char, usize) {
let len = usize::from(low >> 13) + 1;
let (starter, tail) = slice32
.get_subslice(offset..offset + len)
.and_then(|slice| slice.split_first())
.unwrap_or_else(|| {
// GIGO case
debug_assert!(false);
(REPLACEMENT_CHARACTER, EMPTY_CHAR)
});
if low & 0x1000 != 0 {
// All the rest are combining
self.buffer
.extend(tail.iter().map(CharacterAndClass::new_with_placeholder));
(starter, 0)
} else {
let mut i = 0;
let mut combining_start = 0;
for ch in tail.iter() {
let trie_value = self.trie.get(ch);
self.buffer.push(CharacterAndClass::new_with_trie_value(
CharacterAndTrieValue::new(ch, trie_value),
));
i += 1;
// Half-width kana and iota subscript don't occur in the tails
// of these multicharacter decompositions.
if !decomposition_starts_with_non_starter(trie_value) {
combining_start = i;
}
}
(starter, combining_start)
}
}
#[inline(always)]
fn attach_trie_value(&self, c: char) -> CharacterAndTrieValue {
if let Some(supplementary) = self.supplementary_trie {
if let Some(value) = self.attach_supplementary_trie_value(c, supplementary) {
return value;
}
}
CharacterAndTrieValue::new(c, self.trie.get(c))
}
#[inline(never)]
fn attach_supplementary_trie_value(
&self,
c: char,
supplementary: &CodePointTrie<u32>,
) -> Option<CharacterAndTrieValue> {
let voicing_mark = u32::from(c).wrapping_sub(0xFF9E);
if voicing_mark <= 1 && self.half_width_voicing_marks_become_non_starters {
return Some(CharacterAndTrieValue::new(
if voicing_mark == 0 {
'\u{3099}'
} else {
'\u{309A}'
},
0xD800 | u32::from(CanonicalCombiningClass::KanaVoicing.0),
));
}
let trie_value = supplementary.get32(u32::from(c));
if trie_value != 0 {
return Some(CharacterAndTrieValue::new_from_supplement(c, trie_value));
}
None
}
fn delegate_next_no_pending(&mut self) -> Option<CharacterAndTrieValue> {
debug_assert!(self.pending.is_none());
loop {
let c = self.delegate.next()?;
// TODO(#2384): Measure if this check is actually an optimization even in the
// non-supplementary case or if this should go inside the supplementary
// `if` below.
if u32::from(c) < self.decomposition_passthrough_bound {
return Some(CharacterAndTrieValue::new(c, 0));
}
if let Some(supplementary) = self.supplementary_trie {
if let Some(value) = self.attach_supplementary_trie_value(c, supplementary) {
if value.trie_val == IGNORABLE_MARKER {
match self.ignorable_behavior {
IgnorableBehavior::Unsupported => {
debug_assert!(false);
}
IgnorableBehavior::ReplacementCharacter => {
return Some(CharacterAndTrieValue::new(
c,
u32::from(REPLACEMENT_CHARACTER),
));
}
IgnorableBehavior::Ignored => {
// Else ignore this character by reading the next one from the delegate.
continue;
}
}
}
return Some(value);
}
}
let trie_val = self.trie.get(c);
debug_assert_ne!(trie_val, IGNORABLE_MARKER);
return Some(CharacterAndTrieValue::new(c, trie_val));
}
}
fn delegate_next(&mut self) -> Option<CharacterAndTrieValue> {
if let Some(pending) = self.pending.take() {
// Only happens as part of `Composition` and as part of
// the contiguous-buffer methods of `DecomposingNormalizer`.
// I.e. does not happen as part of standalone iterator
// usage of `Decomposition`.
Some(pending)
} else {
self.delegate_next_no_pending()
}
}
fn decomposing_next(&mut self, c_and_trie_val: CharacterAndTrieValue) -> char {
let (starter, combining_start) = {
let c = c_and_trie_val.character;
let hangul_offset = u32::from(c).wrapping_sub(HANGUL_S_BASE); // SIndex in the spec
if hangul_offset >= HANGUL_S_COUNT {
let decomposition = c_and_trie_val.trie_val;
if decomposition <= BACKWARD_COMBINING_STARTER_MARKER {
// The character is its own decomposition
(c, 0)
} else {
let trail_or_complex = (decomposition >> 16) as u16;
let lead = decomposition as u16;
if lead > NON_ROUND_TRIP_MARKER && trail_or_complex != 0 {
// Decomposition into two BMP characters: starter and non-starter
let starter = char_from_u16(lead);
let combining = char_from_u16(trail_or_complex);
self.buffer
.push(CharacterAndClass::new_with_placeholder(combining));
(starter, 0)
} else if lead > NON_ROUND_TRIP_MARKER {
if lead != FDFA_MARKER {
debug_assert_ne!(
lead, SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16,
"Should not reach this point with non-starter marker"
);
// Decomposition into one BMP character
let starter = char_from_u16(lead);
(starter, 0)
} else {
// Special case for the NFKD form of U+FDFA.
self.buffer.extend(FDFA_NFKD.map(|u| {
// Safe, because `FDFA_NFKD` is known not to contain
// surrogates.
CharacterAndClass::new_starter(unsafe {
core::char::from_u32_unchecked(u32::from(u))
})
}));
('\u{0635}', 17)
}
} else {
// Complex decomposition
// Format for 16-bit value:
// 15..13: length minus two for 16-bit case and length minus one for
// the 32-bit case. Length 8 needs to fit in three bits in
// the 16-bit case, and this way the value is future-proofed
// up to 9 in the 16-bit case. Zero is unused and length one
// in the 16-bit case goes directly into the trie.
// 12: 1 if all trailing characters are guaranteed non-starters,
// 0 if no guarantees about non-starterness.
// Note: The bit choice is this way around to allow for
// dynamically falling back to not having this but instead
// having one more bit for length by merely choosing
// different masks.
// 11..0: Start offset in storage. The offset is to the logical
// sequence of scalars16, scalars32, supplementary_scalars16,
// supplementary_scalars32.
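// Illustrative decoding: a 16-bit value of 0x500A (0b010_1_000000001010)
// would denote a decomposition of length 0b010 + 2 = 4 located at
// offset 10 in the 16-bit storage, with all trailing characters
// guaranteed to be non-starters.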
let offset = usize::from(trail_or_complex & 0xFFF);
if offset < self.scalars16.len() {
self.push_decomposition16(trail_or_complex, offset, self.scalars16)
} else if offset < self.scalars16.len() + self.scalars24.len() {
self.push_decomposition32(
trail_or_complex,
offset - self.scalars16.len(),
self.scalars24,
)
} else if offset
< self.scalars16.len()
+ self.scalars24.len()
+ self.supplementary_scalars16.len()
{
self.push_decomposition16(
trail_or_complex,
offset - (self.scalars16.len() + self.scalars24.len()),
self.supplementary_scalars16,
)
} else {
self.push_decomposition32(
trail_or_complex,
offset
- (self.scalars16.len()
+ self.scalars24.len()
+ self.supplementary_scalars16.len()),
self.supplementary_scalars24,
)
}
}
}
} else {
// Hangul syllable
// The math here comes from page 144 of Unicode 14.0
let l = hangul_offset / HANGUL_N_COUNT;
let v = (hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT;
let t = hangul_offset % HANGUL_T_COUNT;
// The unsafe blocks here are OK, because the values stay
// within the Hangul jamo block and, therefore, the scalar
// value range by construction.
self.buffer.push(CharacterAndClass::new_starter(unsafe {
core::char::from_u32_unchecked(HANGUL_V_BASE + v)
}));
let first = unsafe { core::char::from_u32_unchecked(HANGUL_L_BASE + l) };
if t != 0 {
self.buffer.push(CharacterAndClass::new_starter(unsafe {
core::char::from_u32_unchecked(HANGUL_T_BASE + t)
}));
(first, 2)
} else {
(first, 1)
}
}
};
// Either we're inside `Composition` or `self.pending.is_none()`.
self.gather_and_sort_combining(combining_start);
starter
}
fn gather_and_sort_combining(&mut self, combining_start: usize) {
// Not a `for` loop to avoid holding a mutable reference to `self` across
// the loop body.
while let Some(ch_and_trie_val) = self.delegate_next() {
if trie_value_has_ccc(ch_and_trie_val.trie_val) {
self.buffer
.push(CharacterAndClass::new_with_trie_value(ch_and_trie_val));
} else if trie_value_indicates_special_non_starter_decomposition(
ch_and_trie_val.trie_val,
) {
// The Tibetan special cases are starters that decompose into non-starters.
let mapped = match ch_and_trie_val.character {
'\u{0340}' => {
// COMBINING GRAVE TONE MARK
CharacterAndClass::new('\u{0300}', CanonicalCombiningClass::Above)
}
'\u{0341}' => {
// COMBINING ACUTE TONE MARK
CharacterAndClass::new('\u{0301}', CanonicalCombiningClass::Above)
}
'\u{0343}' => {
// COMBINING GREEK KORONIS
CharacterAndClass::new('\u{0313}', CanonicalCombiningClass::Above)
}
'\u{0344}' => {
// COMBINING GREEK DIALYTIKA TONOS
self.buffer.push(CharacterAndClass::new(
'\u{0308}',
CanonicalCombiningClass::Above,
));
CharacterAndClass::new('\u{0301}', CanonicalCombiningClass::Above)
}
'\u{0F73}' => {
// TIBETAN VOWEL SIGN II
self.buffer.push(CharacterAndClass::new(
'\u{0F71}',
CanonicalCombiningClass::CCC129,
));
CharacterAndClass::new('\u{0F72}', CanonicalCombiningClass::CCC130)
}
'\u{0F75}' => {
// TIBETAN VOWEL SIGN UU
self.buffer.push(CharacterAndClass::new(
'\u{0F71}',
CanonicalCombiningClass::CCC129,
));
CharacterAndClass::new('\u{0F74}', CanonicalCombiningClass::CCC132)
}
'\u{0F81}' => {
// TIBETAN VOWEL SIGN REVERSED II
self.buffer.push(CharacterAndClass::new(
'\u{0F71}',
CanonicalCombiningClass::CCC129,
));
CharacterAndClass::new('\u{0F80}', CanonicalCombiningClass::CCC130)
}
_ => {
// GIGO case
debug_assert!(false);
CharacterAndClass::new_with_placeholder(REPLACEMENT_CHARACTER)
}
};
self.buffer.push(mapped);
} else {
self.pending = Some(ch_and_trie_val);
break;
}
}
// Slicing succeeds by construction; we've always ensured that `combining_start`
// is in permissible range.
#[allow(clippy::indexing_slicing)]
sort_slice_by_ccc(&mut self.buffer[combining_start..], self.trie);
}
}
impl<'data, I> Iterator for Decomposition<'data, I>
where
I: Iterator<Item = char>,
{
type Item = char;
fn next(&mut self) -> Option<char> {
if let Some(ret) = self.buffer.get(self.buffer_pos).map(|c| c.character()) {
self.buffer_pos += 1;
if self.buffer_pos == self.buffer.len() {
self.buffer.clear();
self.buffer_pos = 0;
}
return Some(ret);
}
debug_assert_eq!(self.buffer_pos, 0);
let c_and_trie_val = self.pending.take()?;
Some(self.decomposing_next(c_and_trie_val))
}
}
/// An iterator adaptor that turns an `Iterator` over `char` into
/// a lazily-decomposed and then canonically composed `char` sequence.
#[derive(Debug)]
pub struct Composition<'data, I>
where
I: Iterator<Item = char>,
{
/// The decomposing part of the normalizer that operates before
/// the canonical composition is performed on its output.
decomposition: Decomposition<'data, I>,
/// Non-Hangul canonical composition data.
canonical_compositions: Char16Trie<'data>,
/// To make `next()` yield in cases where there's a non-composing
/// starter in the decomposition buffer, we put it here to let it
/// wait for the next `next()` call (or a jump forward within the
/// `next()` call).
unprocessed_starter: Option<char>,
/// The lowest character for which any one of the following does
/// not hold:
/// 1. Roundtrips via decomposition and recomposition.
/// 2. Decomposition starts with a non-starter
/// 3. Is not a backward-combining starter
composition_passthrough_bound: u32,
}
impl<'data, I> Composition<'data, I>
where
I: Iterator<Item = char>,
{
fn new(
decomposition: Decomposition<'data, I>,
canonical_compositions: Char16Trie<'data>,
composition_passthrough_bound: u16,
) -> Self {
Self {
decomposition,
canonical_compositions,
unprocessed_starter: None,
composition_passthrough_bound: u32::from(composition_passthrough_bound),
}
}
/// Performs canonical composition (including Hangul) on a pair of
/// characters or returns `None` if these characters don't compose.
/// Composition exclusions are taken into account.
#[inline(always)]
pub fn compose(&self, starter: char, second: char) -> Option<char> {
compose(self.canonical_compositions.iter(), starter, second)
}
/// Performs (non-Hangul) canonical composition on a pair of characters
/// or returns `None` if these characters don't compose. Composition
/// exclusions are taken into account.
#[inline(always)]
fn compose_non_hangul(&self, starter: char, second: char) -> Option<char> {
compose_non_hangul(self.canonical_compositions.iter(), starter, second)
}
}
impl<'data, I> Iterator for Composition<'data, I>
where
I: Iterator<Item = char>,
{
type Item = char;
#[inline]
fn next(&mut self) -> Option<char> {
let mut undecomposed_starter = CharacterAndTrieValue::new('\u{0}', 0); // The compiler can't figure out that this gets overwritten before use.
if self.unprocessed_starter.is_none() {
// The loop is only broken out of as a forward goto.
#[allow(clippy::never_loop)]
loop {
if let Some((character, ccc)) = self
.decomposition
.buffer
.get(self.decomposition.buffer_pos)
.map(|c| c.character_and_ccc())
{
self.decomposition.buffer_pos += 1;
if self.decomposition.buffer_pos == self.decomposition.buffer.len() {
self.decomposition.buffer.clear();
self.decomposition.buffer_pos = 0;
}
if ccc == CanonicalCombiningClass::NotReordered {
// Previous decomposition contains a starter. This must
// now become the `unprocessed_starter` for it to have
// a chance to compose with the upcoming characters.
//
// E.g. parenthesized Hangul in NFKC comes through here,
// but suitable composition exclusion could exercise this
// in NFC.
self.unprocessed_starter = Some(character);
break; // We already have a starter, so skip taking one from `pending`.
}
return Some(character);
}
debug_assert_eq!(self.decomposition.buffer_pos, 0);
undecomposed_starter = self.decomposition.pending.take()?;
if u32::from(undecomposed_starter.character) < self.composition_passthrough_bound
|| undecomposed_starter.potential_passthrough()
{
// TODO(#2385): In the NFC case (moot for NFKC and UTS46), if the upcoming
// character is not below `decomposition_passthrough_bound` but is
// below `composition_passthrough_bound`, we read from the trie
// unnecessarily.
if let Some(upcoming) = self.decomposition.delegate_next_no_pending() {
let cannot_combine_backwards = u32::from(upcoming.character)
< self.composition_passthrough_bound
|| !upcoming.can_combine_backwards();
self.decomposition.pending = Some(upcoming);
if cannot_combine_backwards {
// Fast-track succeeded!
return Some(undecomposed_starter.character);
}
} else {
// End of stream
return Some(undecomposed_starter.character);
}
}
break; // Not actually looping
}
}
let mut starter = '\u{0}'; // The compiler can't figure out this gets overwritten before use.
// The point of having this boolean is to have only one call site to
// `self.decomposition.decomposing_next`, which is hopefully beneficial for
// code size under inlining.
let mut attempt_composition = false;
loop {
if let Some(unprocessed) = self.unprocessed_starter.take() {
debug_assert_eq!(undecomposed_starter, CharacterAndTrieValue::new('\u{0}', 0));
debug_assert_eq!(starter, '\u{0}');
starter = unprocessed;
} else {
debug_assert_eq!(self.decomposition.buffer_pos, 0);
let next_starter = self.decomposition.decomposing_next(undecomposed_starter);
if !attempt_composition {
starter = next_starter;
} else if let Some(composed) = self.compose(starter, next_starter) {
starter = composed;
} else {
// This is our yield point. We'll pick this up above in the
// next call to `next()`.
self.unprocessed_starter = Some(next_starter);
return Some(starter);
}
}
// We first loop by index to avoid moving the contents of `buffer`, but
// if there's a discontiguous match, we'll start modifying `buffer` instead.
loop {
let (character, ccc) = if let Some((character, ccc)) = self
.decomposition
.buffer
.get(self.decomposition.buffer_pos)
.map(|c| c.character_and_ccc())
{
(character, ccc)
} else {
self.decomposition.buffer.clear();
self.decomposition.buffer_pos = 0;
break;
};
if let Some(composed) = self.compose(starter, character) {
starter = composed;
self.decomposition.buffer_pos += 1;
continue;
}
let mut most_recent_skipped_ccc = ccc;
{
let _ = self
.decomposition
.buffer
.drain(0..self.decomposition.buffer_pos);
}
self.decomposition.buffer_pos = 0;
if most_recent_skipped_ccc == CanonicalCombiningClass::NotReordered {
// We failed to compose a starter. Discontiguous match not allowed.
// We leave the starter in `buffer` for `next()` to find.
return Some(starter);
}
let mut i = 1; // We have skipped one non-starter.
while let Some((character, ccc)) = self
.decomposition
.buffer
.get(i)
.map(|c| c.character_and_ccc())
{
if ccc == CanonicalCombiningClass::NotReordered {
// Discontiguous match not allowed.
return Some(starter);
}
debug_assert!(ccc >= most_recent_skipped_ccc);
if ccc != most_recent_skipped_ccc {
// Using the non-Hangul version as a micro-optimization, since
// we already rejected the case where `second` is a starter
// above, and conjoining jamo are starters.
if let Some(composed) = self.compose_non_hangul(starter, character) {
self.decomposition.buffer.remove(i);
starter = composed;
continue;
}
}
most_recent_skipped_ccc = ccc;
i += 1;
}
break;
}
debug_assert_eq!(self.decomposition.buffer_pos, 0);
if !self.decomposition.buffer.is_empty() {
return Some(starter);
}
// Now we need to check if composition with an upcoming starter is possible.
#[allow(clippy::unwrap_used)]
if self.decomposition.pending.is_some() {
// We know that `pending_starter` decomposes to start with a starter.
// Otherwise, it would have been moved to `self.decomposition.buffer`
// by `self.decomposing_next()`. We do this set lookup here in order
// to get an opportunity to go back to the fast track.
// Note that this check has to happen _after_ checking that `pending`
// holds a character, because this flag isn't defined to be meaningful
// when `pending` isn't holding a character.
let pending = self.decomposition.pending.as_ref().unwrap();
if u32::from(pending.character) < self.composition_passthrough_bound
|| !pending.can_combine_backwards()
{
// Won't combine backwards anyway.
return Some(starter);
}
// Consume what we peeked. `unwrap` OK, because we checked `is_some()`
// above.
undecomposed_starter = self.decomposition.pending.take().unwrap();
// The following line is OK, because we're about to loop back
// to `self.decomposition.decomposing_next(c);`, which will
// restore the between-`next()`-calls invariant of `pending`
// before this function returns.
attempt_composition = true;
continue;
}
// End of input
return Some(starter);
}
}
}
macro_rules! composing_normalize_to {
($(#[$meta:meta])*,
$normalize_to:ident,
$write:path,
$slice:ty,
$prolog:block,
$always_valid_utf:literal,
$as_slice:ident,
$fast:block,
$text:ident,
$sink:ident,
$composition:ident,
$composition_passthrough_bound:ident,
$undecomposed_starter:ident,
$pending_slice:ident,
$len_utf:ident,
) => {
$(#[$meta])*
pub fn $normalize_to<W: $write + ?Sized>(
&self,
$text: $slice,
$sink: &mut W,
) -> core::fmt::Result {
$prolog
let mut $composition = self.normalize_iter($text.chars());
debug_assert_eq!($composition.decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
for cc in $composition.decomposition.buffer.drain(..) {
$sink.write_char(cc.character())?;
}
// Try to get the compiler to hoist the bound to a register.
let $composition_passthrough_bound = $composition.composition_passthrough_bound;
'outer: loop {
debug_assert_eq!($composition.decomposition.buffer_pos, 0);
let mut $undecomposed_starter =
if let Some(pending) = $composition.decomposition.pending.take() {
pending
} else {
return Ok(());
};
// Allowing indexed slicing, because a failure would be a code bug and
// not a data issue.
#[allow(clippy::indexing_slicing)]
if u32::from($undecomposed_starter.character) < $composition_passthrough_bound ||
$undecomposed_starter.potential_passthrough()
{
// We don't know if a `REPLACEMENT_CHARACTER` occurred in the slice or
// was returned in response to an error by the iterator. Assume the
// latter for correctness even though it pessimizes the former.
if $always_valid_utf || $undecomposed_starter.character != REPLACEMENT_CHARACTER {
let $pending_slice = &$text[$text.len() - $composition.decomposition.delegate.$as_slice().len() - $undecomposed_starter.character.$len_utf()..];
// The `$fast` block must either:
// 1. Return due to reaching EOF
// 2. Leave a starter with its trie value in `$undecomposed_starter`
// and, if there is still more input, leave the next character
// and its trie value in `$composition.decomposition.pending`.
$fast
}
}
// Fast track above, full algorithm below
let mut starter = $composition
.decomposition
.decomposing_next($undecomposed_starter);
'bufferloop: loop {
// We first loop by index to avoid moving the contents of `buffer`, but
// if there's a discontiguous match, we'll start modifying `buffer` instead.
loop {
let (character, ccc) = if let Some((character, ccc)) = $composition
.decomposition
.buffer
.get($composition.decomposition.buffer_pos)
.map(|c| c.character_and_ccc())
{
(character, ccc)
} else {
$composition.decomposition.buffer.clear();
$composition.decomposition.buffer_pos = 0;
break;
};
if let Some(composed) = $composition.compose(starter, character) {
starter = composed;
$composition.decomposition.buffer_pos += 1;
continue;
}
let mut most_recent_skipped_ccc = ccc;
if most_recent_skipped_ccc == CanonicalCombiningClass::NotReordered {
// We failed to compose a starter. Discontiguous match not allowed.
// Write the current `starter` we've been composing, make the unmatched
// starter in the buffer the new `starter` (we know it's been decomposed)
// and process the rest of the buffer with that as the starter.
$sink.write_char(starter)?;
starter = character;
$composition.decomposition.buffer_pos += 1;
continue 'bufferloop;
} else {
{
let _ = $composition
.decomposition
.buffer
.drain(0..$composition.decomposition.buffer_pos);
}
$composition.decomposition.buffer_pos = 0;
}
let mut i = 1; // We have skipped one non-starter.
while let Some((character, ccc)) = $composition
.decomposition
.buffer
.get(i)
.map(|c| c.character_and_ccc())
{
if ccc == CanonicalCombiningClass::NotReordered {
// Discontiguous match not allowed.
$sink.write_char(starter)?;
for cc in $composition.decomposition.buffer.drain(..i) {
$sink.write_char(cc.character())?;
}
starter = character;
{
let removed = $composition.decomposition.buffer.remove(0);
debug_assert_eq!(starter, removed.character());
}
debug_assert_eq!($composition.decomposition.buffer_pos, 0);
continue 'bufferloop;
}
debug_assert!(ccc >= most_recent_skipped_ccc);
if ccc != most_recent_skipped_ccc {
// Using the non-Hangul version as a micro-optimization, since
// we already rejected the case where `second` is a starter
// above, and conjoining jamo are starters.
if let Some(composed) =
$composition.compose_non_hangul(starter, character)
{
$composition.decomposition.buffer.remove(i);
starter = composed;
continue;
}
}
most_recent_skipped_ccc = ccc;
i += 1;
}
break;
}
debug_assert_eq!($composition.decomposition.buffer_pos, 0);
if !$composition.decomposition.buffer.is_empty() {
$sink.write_char(starter)?;
for cc in $composition.decomposition.buffer.drain(..) {
$sink.write_char(cc.character())?;
}
// We had non-empty buffer, so can't compose with upcoming.
continue 'outer;
}
// Now we need to check if composition with an upcoming starter is possible.
if $composition.decomposition.pending.is_some() {
// We know that `pending_starter` decomposes to start with a starter.
// Otherwise, it would have been moved to `composition.decomposition.buffer`
// by `composition.decomposing_next()`. We do this set lookup here in order
// to get an opportunity to go back to the fast track.
// Note that this check has to happen _after_ checking that `pending`
// holds a character, because this flag isn't defined to be meaningful
// when `pending` isn't holding a character.
let pending = $composition.decomposition.pending.as_ref().unwrap();
if u32::from(pending.character) < $composition.composition_passthrough_bound
|| !pending.can_combine_backwards()
{
// Won't combine backwards anyway.
$sink.write_char(starter)?;
continue 'outer;
}
let pending_starter = $composition.decomposition.pending.take().unwrap();
let decomposed = $composition.decomposition.decomposing_next(pending_starter);
if let Some(composed) = $composition.compose(starter, decomposed) {
starter = composed;
} else {
$sink.write_char(starter)?;
starter = decomposed;
}
continue 'bufferloop;
}
// End of input
$sink.write_char(starter)?;
return Ok(());
} // 'bufferloop
}
}
};
}
macro_rules! decomposing_normalize_to {
($(#[$meta:meta])*,
$normalize_to:ident,
$write:path,
$slice:ty,
$prolog:block,
$as_slice:ident,
$fast:block,
$text:ident,
$sink:ident,
$decomposition:ident,
$decomposition_passthrough_bound:ident,
$undecomposed_starter:ident,
$pending_slice:ident,
$outer:lifetime, // loop labels use lifetime tokens
) => {
$(#[$meta])*
pub fn $normalize_to<W: $write + ?Sized>(
&self,
$text: $slice,
$sink: &mut W,
) -> core::fmt::Result {
$prolog
let mut $decomposition = self.normalize_iter($text.chars());
debug_assert_eq!($decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
// Try to get the compiler to hoist the bound to a register.
let $decomposition_passthrough_bound = $decomposition.decomposition_passthrough_bound;
$outer: loop {
for cc in $decomposition.buffer.drain(..) {
$sink.write_char(cc.character())?;
}
debug_assert_eq!($decomposition.buffer_pos, 0);
let mut $undecomposed_starter = if let Some(pending) = $decomposition.pending.take() {
pending
} else {
return Ok(());
};
// Allowing indexed slicing, because a failure would be a code bug and
// not a data issue.
#[allow(clippy::indexing_slicing)]
if $undecomposed_starter.starter_and_decomposes_to_self() {
// Don't bother including `undecomposed_starter` in a contiguous buffer
// write: Just write it right away:
$sink.write_char($undecomposed_starter.character)?;
let $pending_slice = $decomposition.delegate.$as_slice();
$fast
}
let starter = $decomposition.decomposing_next($undecomposed_starter);
$sink.write_char(starter)?;
}
}
};
}
macro_rules! normalizer_methods {
() => {
/// Normalize a string slice into a `String`.
pub fn normalize(&self, text: &str) -> String {
let mut ret = String::new();
ret.reserve(text.len());
let _ = self.normalize_to(text, &mut ret);
ret
}
/// Check whether a string slice is normalized.
pub fn is_normalized(&self, text: &str) -> bool {
let mut sink = IsNormalizedSinkStr::new(text);
if self.normalize_to(text, &mut sink).is_err() {
return false;
}
sink.finished()
}
/// Normalize a slice of potentially-invalid UTF-16 into a `Vec`.
///
/// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
/// before normalizing.
pub fn normalize_utf16(&self, text: &[u16]) -> Vec<u16> {
let mut ret = Vec::new();
let _ = self.normalize_utf16_to(text, &mut ret);
ret
}
/// Checks whether a slice of potentially-invalid UTF-16 is normalized.
///
/// Unpaired surrogates are treated as the REPLACEMENT CHARACTER.
pub fn is_normalized_utf16(&self, text: &[u16]) -> bool {
let mut sink = IsNormalizedSinkUtf16::new(text);
if self.normalize_utf16_to(text, &mut sink).is_err() {
return false;
}
sink.finished()
}
/// Normalize a slice of potentially-invalid UTF-8 into a `String`.
///
/// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
/// according to the WHATWG Encoding Standard.
pub fn normalize_utf8(&self, text: &[u8]) -> String {
let mut ret = String::new();
ret.reserve(text.len());
let _ = self.normalize_utf8_to(text, &mut ret);
ret
}
/// Check if a slice of potentially-invalid UTF-8 is normalized.
///
/// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
/// according to the WHATWG Encoding Standard before checking.
pub fn is_normalized_utf8(&self, text: &[u8]) -> bool {
let mut sink = IsNormalizedSinkUtf8::new(text);
if self.normalize_utf8_to(text, &mut sink).is_err() {
return false;
}
sink.finished()
}
};
}
/// A normalizer for performing decomposing normalization.
#[derive(Debug)]
pub struct DecomposingNormalizer {
decompositions: DataPayload<CanonicalDecompositionDataV1Marker>,
supplementary_decompositions: Option<SupplementPayloadHolder>,
tables: DataPayload<CanonicalDecompositionTablesV1Marker>,
supplementary_tables: Option<DataPayload<CompatibilityDecompositionTablesV1Marker>>,
decomposition_passthrough_bound: u8, // never above 0xC0
composition_passthrough_bound: u16, // never above 0x0300
}
impl DecomposingNormalizer {
/// NFD constructor using compiled data.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
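///
/// A minimal usage sketch:
///
/// ```
/// let nfd = icu_normalizer::DecomposingNormalizer::new_nfd();
/// assert_eq!(nfd.normalize("\u{00E9}"), "e\u{0301}");
/// ```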
#[cfg(feature = "compiled_data")]
pub const fn new_nfd() -> Self {
const _: () = assert!(
crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
.scalars16
.const_len()
+ crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
.scalars24
.const_len()
<= 0xFFF,
"NormalizerError::FutureExtension"
);
DecomposingNormalizer {
decompositions: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
),
supplementary_decompositions: None,
tables: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1,
),
supplementary_tables: None,
decomposition_passthrough_bound: 0xC0,
composition_passthrough_bound: 0x0300,
}
}
icu_provider::gen_any_buffer_data_constructors!(
locale: skip,
options: skip,
error: NormalizerError,
#[cfg(skip)]
functions: [
new_nfd,
try_new_nfd_with_any_provider,
try_new_nfd_with_buffer_provider,
try_new_nfd_unstable,
Self,
]
);
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_nfd)]
pub fn try_new_nfd_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
where
D: DataProvider<CanonicalDecompositionDataV1Marker>
+ DataProvider<CanonicalDecompositionTablesV1Marker>
+ ?Sized,
{
let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> =
provider.load(Default::default())?.take_payload()?;
let tables: DataPayload<CanonicalDecompositionTablesV1Marker> =
provider.load(Default::default())?.take_payload()?;
if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF {
// The data is from a future where there exists a normalization flavor whose
// complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
// of space. If a good use case from such a decomposition flavor arises, we can
// dynamically change the bit masks so that the length mask becomes 0x1FFF instead
// of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
// since for now the masks are hard-coded, error out.
return Err(NormalizerError::FutureExtension);
}
Ok(DecomposingNormalizer {
decompositions,
supplementary_decompositions: None,
tables,
supplementary_tables: None,
decomposition_passthrough_bound: 0xC0,
composition_passthrough_bound: 0x0300,
})
}
/// NFKD constructor using compiled data.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
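///
/// A minimal usage sketch:
///
/// ```
/// let nfkd = icu_normalizer::DecomposingNormalizer::new_nfkd();
/// assert_eq!(nfkd.normalize("\u{FB01}"), "fi");
/// ```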
#[cfg(feature = "compiled_data")]
pub const fn new_nfkd() -> Self {
const _: () = assert!(
crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
.scalars16
.const_len()
+ crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
.scalars24
.const_len()
+ crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1
.scalars16
.const_len()
+ crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1
.scalars24
.const_len()
<= 0xFFF,
"NormalizerError::FutureExtension"
);
const _: () = assert!(
crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap <= 0x0300,
"NormalizerError::ValidationError"
);
let decomposition_capped =
if crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap < 0xC0 {
crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap
} else {
0xC0
};
let composition_capped =
if crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap < 0x0300 {
crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1.passthrough_cap
} else {
0x0300
};
DecomposingNormalizer {
decompositions: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
),
supplementary_decompositions: Some(SupplementPayloadHolder::Compatibility(
DataPayload::from_static_ref(crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_V1),
)),
tables: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1,
),
supplementary_tables: Some(DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1,
)),
decomposition_passthrough_bound: decomposition_capped as u8,
composition_passthrough_bound: composition_capped,
}
}
icu_provider::gen_any_buffer_data_constructors!(
locale: skip,
options: skip,
error: NormalizerError,
#[cfg(skip)]
functions: [
new_nfkd,
try_new_nfkd_with_any_provider,
try_new_nfkd_with_buffer_provider,
try_new_nfkd_unstable,
Self,
]
);
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_nfkd)]
pub fn try_new_nfkd_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
where
D: DataProvider<CanonicalDecompositionDataV1Marker>
+ DataProvider<CompatibilityDecompositionSupplementV1Marker>
+ DataProvider<CanonicalDecompositionTablesV1Marker>
+ DataProvider<CompatibilityDecompositionTablesV1Marker>
+ ?Sized,
{
let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> =
provider.load(Default::default())?.take_payload()?;
let supplementary_decompositions: DataPayload<
CompatibilityDecompositionSupplementV1Marker,
> = provider.load(Default::default())?.take_payload()?;
let tables: DataPayload<CanonicalDecompositionTablesV1Marker> =
provider.load(Default::default())?.take_payload()?;
let supplementary_tables: DataPayload<CompatibilityDecompositionTablesV1Marker> =
provider.load(Default::default())?.take_payload()?;
if tables.get().scalars16.len()
+ tables.get().scalars24.len()
+ supplementary_tables.get().scalars16.len()
+ supplementary_tables.get().scalars24.len()
> 0xFFF
{
// The data is from a future where there exists a normalization flavor whose
// complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
// of space. If a good use case from such a decomposition flavor arises, we can
// dynamically change the bit masks so that the length mask becomes 0x1FFF instead
// of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
// since for now the masks are hard-coded, error out.
return Err(NormalizerError::FutureExtension);
}
let cap = supplementary_decompositions.get().passthrough_cap;
if cap > 0x0300 {
return Err(NormalizerError::ValidationError);
}
let decomposition_capped = cap.min(0xC0);
let composition_capped = cap.min(0x0300);
Ok(DecomposingNormalizer {
decompositions,
supplementary_decompositions: Some(SupplementPayloadHolder::Compatibility(
supplementary_decompositions,
)),
tables,
supplementary_tables: Some(supplementary_tables),
decomposition_passthrough_bound: decomposition_capped as u8,
composition_passthrough_bound: composition_capped,
})
}
#[doc(hidden)]
#[cfg(feature = "compiled_data")]
pub(crate) const fn new_uts46_decomposed() -> Self {
const _: () = assert!(
crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
.scalars16
.const_len()
+ crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
.scalars24
.const_len()
+ crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1
.scalars16
.const_len()
+ crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1
.scalars24
.const_len()
<= 0xFFF,
"NormalizerError::FutureExtension"
);
const _: () = assert!(
crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap <= 0x0300,
"NormalizerError::ValidationError"
);
let decomposition_capped =
if crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap < 0xC0 {
crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap
} else {
0xC0
};
let composition_capped =
if crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap < 0x0300 {
crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1.passthrough_cap
} else {
0x0300
};
DecomposingNormalizer {
decompositions: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
),
supplementary_decompositions: Some(SupplementPayloadHolder::Uts46(
DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_UTS46D_V1,
),
)),
tables: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1,
),
supplementary_tables: Some(DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_NFKDEX_V1,
)),
decomposition_passthrough_bound: decomposition_capped as u8,
composition_passthrough_bound: composition_capped,
}
}
/// UTS 46 decomposed constructor (testing only)
///
/// This is a special building block normalization for IDNA. It is the decomposed counterpart of
/// ICU4C's UTS 46 normalization, with two exceptions: characters that UTS 46 disallows and that
/// ICU4C maps to U+FFFD, and characters that UTS 46 maps to the empty string, normalize as in
/// NFD in this normalization. In both cases, the UTS 46 processing that precedes normalization
/// is expected to deal with these characters. Making the disallowed characters behave like this
/// is beneficial for data size, and this normalizer implementation cannot deal with a character
/// normalizing to the empty string, which doesn't happen in NFD or NFKD as of Unicode 14.
///
/// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
/// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
/// U+0345 from a reordered character into a non-reordered character before reordering happens.
/// Therefore, the output of this normalization may differ for different inputs that are
/// canonically equivalent with each other if they differ by how U+0345 is ordered relative
/// to other reorderable characters.
///
/// Public for testing only.
#[doc(hidden)]
pub(crate) fn try_new_uts46_decomposed_unstable<D>(
provider: &D,
) -> Result<Self, NormalizerError>
where
D: DataProvider<CanonicalDecompositionDataV1Marker>
+ DataProvider<Uts46DecompositionSupplementV1Marker>
+ DataProvider<CanonicalDecompositionTablesV1Marker>
+ DataProvider<CompatibilityDecompositionTablesV1Marker>
// UTS 46 tables merged into CompatibilityDecompositionTablesV1Marker
+ ?Sized,
{
let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> =
provider.load(Default::default())?.take_payload()?;
let supplementary_decompositions: DataPayload<Uts46DecompositionSupplementV1Marker> =
provider.load(Default::default())?.take_payload()?;
let tables: DataPayload<CanonicalDecompositionTablesV1Marker> =
provider.load(Default::default())?.take_payload()?;
let supplementary_tables: DataPayload<CompatibilityDecompositionTablesV1Marker> =
provider.load(Default::default())?.take_payload()?;
if tables.get().scalars16.len()
+ tables.get().scalars24.len()
+ supplementary_tables.get().scalars16.len()
+ supplementary_tables.get().scalars24.len()
> 0xFFF
{
// The data is from a future where there exists a normalization flavor whose
// complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
// of space. If a good use case for such a decomposition flavor arises, we can
// dynamically change the bit masks so that the length mask becomes 0x1FFF instead
// of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
// since for now the masks are hard-coded, error out.
return Err(NormalizerError::FutureExtension);
}
let cap = supplementary_decompositions.get().passthrough_cap;
if cap > 0x0300 {
return Err(NormalizerError::ValidationError);
}
let decomposition_capped = cap.min(0xC0);
let composition_capped = cap.min(0x0300);
Ok(DecomposingNormalizer {
decompositions,
supplementary_decompositions: Some(SupplementPayloadHolder::Uts46(
supplementary_decompositions,
)),
tables,
supplementary_tables: Some(supplementary_tables),
decomposition_passthrough_bound: decomposition_capped as u8,
composition_passthrough_bound: composition_capped,
})
}
/// Wraps a delegate iterator into a decomposing iterator
/// adapter by using the data already held by this normalizer.
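///
/// # Example
///
/// A minimal sketch (the `new_nfd` constructor assumes the
/// `compiled_data` Cargo feature):
///
/// ```
/// let normalizer = icu_normalizer::DecomposingNormalizer::new_nfd();
/// let nfd: String = normalizer.normalize_iter("ä".chars()).collect();
/// assert_eq!(nfd, "a\u{0308}");
/// ```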
pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Decomposition<I> {
Decomposition::new_with_supplements(
iter,
self.decompositions.get(),
self.supplementary_decompositions.as_ref().map(|s| s.get()),
self.tables.get(),
self.supplementary_tables.as_ref().map(|s| s.get()),
self.decomposition_passthrough_bound,
IgnorableBehavior::Unsupported,
)
}
normalizer_methods!();
decomposing_normalize_to!(
/// Normalize a string slice into a `Write` sink.
,
normalize_to,
core::fmt::Write,
&str,
{
},
as_str,
{
let decomposition_passthrough_byte_bound = if decomposition_passthrough_bound == 0xC0 {
0xC3u8
} else {
decomposition_passthrough_bound.min(0x80) as u8
};
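// Rationale for 0xC3: every byte in the UTF-8 encodings of U+0000..=U+00BF
// (ASCII, the 0xC2 lead byte, and continuation bytes) is below 0xC3, so a
// per-byte comparison passes through exactly the characters below the
// 0xC0 bound.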
// The attribute belongs on an inner statement, but Rust doesn't allow it there.
#[allow(clippy::unwrap_used)]
'fast: loop {
let mut code_unit_iter = decomposition.delegate.as_str().as_bytes().iter();
'fastest: loop {
if let Some(&upcoming_byte) = code_unit_iter.next() {
if upcoming_byte < decomposition_passthrough_byte_bound {
// Fast-track succeeded!
continue 'fastest;
}
decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();
break 'fastest;
}
// End of stream
sink.write_str(pending_slice)?;
return Ok(());
}
// `unwrap()` OK, because the slice is valid UTF-8 and we know there
// is an upcoming byte.
let upcoming = decomposition.delegate.next().unwrap();
let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
if upcoming_with_trie_value.starter_and_decomposes_to_self() {
continue 'fast;
}
let consumed_so_far_slice = &pending_slice[..pending_slice.len()
- decomposition.delegate.as_str().len()
- upcoming.len_utf8()];
sink.write_str(consumed_so_far_slice)?;
// Now let's figure out if we got a starter or a non-starter.
if decomposition_starts_with_non_starter(
upcoming_with_trie_value.trie_val,
) {
// Let this trie value be reprocessed in case it is
// one of the rare decomposing ones.
decomposition.pending = Some(upcoming_with_trie_value);
decomposition.gather_and_sort_combining(0);
continue 'outer;
}
undecomposed_starter = upcoming_with_trie_value;
debug_assert!(decomposition.pending.is_none());
break 'fast;
}
},
text,
sink,
decomposition,
decomposition_passthrough_bound,
undecomposed_starter,
pending_slice,
'outer,
);
decomposing_normalize_to!(
/// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
///
/// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
/// according to the WHATWG Encoding Standard.
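///
/// A minimal sketch of the error mapping (0xFF is not valid UTF-8):
///
/// ```
/// let normalizer = icu_normalizer::DecomposingNormalizer::new_nfd();
/// let mut out = String::new();
/// let _ = normalizer.normalize_utf8_to(b"a\xFF", &mut out);
/// assert_eq!(out, "a\u{FFFD}");
/// ```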
,
normalize_utf8_to,
core::fmt::Write,
&[u8],
{
},
as_slice,
{
let decomposition_passthrough_byte_bound = decomposition_passthrough_bound.min(0x80) as u8;
// The attribute belongs on an inner statement, but Rust doesn't allow it there.
#[allow(clippy::unwrap_used)]
'fast: loop {
let mut code_unit_iter = decomposition.delegate.as_slice().iter();
'fastest: loop {
if let Some(&upcoming_byte) = code_unit_iter.next() {
if upcoming_byte < decomposition_passthrough_byte_bound {
// Fast-track succeeded!
continue 'fastest;
}
break 'fastest;
}
// End of stream
sink.write_str(unsafe { from_utf8_unchecked(pending_slice) })?;
return Ok(());
}
decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();
// `unwrap()` OK, because the slice is valid UTF-8 and we know there
// is an upcoming byte.
let upcoming = decomposition.delegate.next().unwrap();
let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
if upcoming_with_trie_value.starter_and_decomposes_to_self() {
if upcoming != REPLACEMENT_CHARACTER {
continue 'fast;
}
// We might have an error, so fall out of the fast path.
// Since the U+FFFD might signify an error, we can't
// assume `upcoming.len_utf8()` for the backoff length.
let mut consumed_so_far = pending_slice[..pending_slice.len() - decomposition.delegate.as_slice().len()].chars();
let back = consumed_so_far.next_back();
debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
let consumed_so_far_slice = consumed_so_far.as_slice();
sink.write_str(unsafe{from_utf8_unchecked(consumed_so_far_slice)})?;
// We could call `gather_and_sort_combining` here and
// `continue 'outer`, but this should be better for code
// size.
undecomposed_starter = upcoming_with_trie_value;
debug_assert!(decomposition.pending.is_none());
break 'fast;
}
let consumed_so_far_slice = &pending_slice[..pending_slice.len()
- decomposition.delegate.as_slice().len()
- upcoming.len_utf8()];
sink.write_str(unsafe{from_utf8_unchecked(consumed_so_far_slice)})?;
// Now let's figure out if we got a starter or a non-starter.
if decomposition_starts_with_non_starter(
upcoming_with_trie_value.trie_val,
) {
// Let this trie value be reprocessed in case it is
// one of the rare decomposing ones.
decomposition.pending = Some(upcoming_with_trie_value);
decomposition.gather_and_sort_combining(0);
continue 'outer;
}
undecomposed_starter = upcoming_with_trie_value;
debug_assert!(decomposition.pending.is_none());
break 'fast;
}
},
text,
sink,
decomposition,
decomposition_passthrough_bound,
undecomposed_starter,
pending_slice,
'outer,
);
decomposing_normalize_to!(
/// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
///
/// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
/// before normalizing.
,
normalize_utf16_to,
write16::Write16,
&[u16],
{
sink.size_hint(text.len())?;
},
as_slice,
{
let mut code_unit_iter = decomposition.delegate.as_slice().iter();
// The purpose of the counter is to flush once in a while. If we flush
// too much, there is too much flushing overhead. If we flush too rarely,
// the flush starts reading from too far behind compared to the hot
// recently-read memory.
let mut counter = UTF16_FAST_PATH_FLUSH_THRESHOLD;
'fast: loop {
counter -= 1;
if let Some(&upcoming_code_unit) = code_unit_iter.next() {
let mut upcoming32 = u32::from(upcoming_code_unit);
if upcoming32 < decomposition_passthrough_bound && counter != 0 {
continue 'fast;
}
// The loop is only broken out of as a forward goto
#[allow(clippy::never_loop)]
'surrogateloop: loop {
let surrogate_base = upcoming32.wrapping_sub(0xD800);
if surrogate_base > (0xDFFF - 0xD800) {
// Not surrogate
break 'surrogateloop;
}
if surrogate_base <= (0xDBFF - 0xD800) {
let iter_backup = code_unit_iter.clone();
if let Some(&low) = code_unit_iter.next() {
if in_inclusive_range16(low, 0xDC00, 0xDFFF) {
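// Combine the high and low surrogates into a scalar value:
// 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00),
// algebraically folded into a single constant subtrahend.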
upcoming32 = (upcoming32 << 10) + u32::from(low)
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
break 'surrogateloop;
} else {
code_unit_iter = iter_backup;
}
}
}
// unpaired surrogate
let slice_to_write = &pending_slice
[..pending_slice.len() - code_unit_iter.as_slice().len() - 1];
sink.write_slice(slice_to_write)?;
undecomposed_starter =
CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
debug_assert!(decomposition.pending.is_none());
// We could instead call `gather_and_sort_combining` and `continue 'outer`, but
// this is assumed to be better for code size.
break 'fast;
}
// Not unpaired surrogate
let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
let upcoming_with_trie_value =
decomposition.attach_trie_value(upcoming);
if upcoming_with_trie_value.starter_and_decomposes_to_self() && counter != 0 {
continue 'fast;
}
let consumed_so_far_slice = &pending_slice[..pending_slice.len()
- code_unit_iter.as_slice().len()
- upcoming.len_utf16()];
sink.write_slice(consumed_so_far_slice)?;
// Now let's figure out if we got a starter or a non-starter.
if decomposition_starts_with_non_starter(
upcoming_with_trie_value.trie_val,
) {
// Sync with main iterator
decomposition.delegate = code_unit_iter.as_slice().chars();
// Let this trie value be reprocessed in case it is
// one of the rare decomposing ones.
decomposition.pending = Some(upcoming_with_trie_value);
decomposition.gather_and_sort_combining(0);
continue 'outer;
}
undecomposed_starter = upcoming_with_trie_value;
debug_assert!(decomposition.pending.is_none());
break 'fast;
}
// End of stream
sink.write_slice(pending_slice)?;
return Ok(());
}
// Sync the main iterator
decomposition.delegate = code_unit_iter.as_slice().chars();
},
text,
sink,
decomposition,
decomposition_passthrough_bound,
undecomposed_starter,
pending_slice,
'outer,
);
}
/// A normalizer for performing composing normalization.
#[derive(Debug)]
pub struct ComposingNormalizer {
decomposing_normalizer: DecomposingNormalizer,
canonical_compositions: DataPayload<CanonicalCompositionsV1Marker>,
}
impl ComposingNormalizer {
/// NFC constructor using compiled data.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
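///
/// # Example
///
/// A minimal sketch:
///
/// ```
/// let normalizer = icu_normalizer::ComposingNormalizer::new_nfc();
/// assert_eq!(normalizer.normalize("a\u{0308}"), "ä");
/// ```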
#[cfg(feature = "compiled_data")]
pub const fn new_nfc() -> Self {
ComposingNormalizer {
decomposing_normalizer: DecomposingNormalizer::new_nfd(),
canonical_compositions: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
),
}
}
icu_provider::gen_any_buffer_data_constructors!(
locale: skip,
options: skip,
error: NormalizerError,
#[cfg(skip)]
functions: [
new_nfc,
try_new_nfc_with_any_provider,
try_new_nfc_with_buffer_provider,
try_new_nfc_unstable,
Self,
]
);
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_nfc)]
pub fn try_new_nfc_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
where
D: DataProvider<CanonicalDecompositionDataV1Marker>
+ DataProvider<CanonicalDecompositionTablesV1Marker>
+ DataProvider<CanonicalCompositionsV1Marker>
+ ?Sized,
{
let decomposing_normalizer = DecomposingNormalizer::try_new_nfd_unstable(provider)?;
let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> =
provider.load(Default::default())?.take_payload()?;
Ok(ComposingNormalizer {
decomposing_normalizer,
canonical_compositions,
})
}
/// NFKC constructor using compiled data.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
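///
/// # Example
///
/// A minimal sketch (U+2460 CIRCLED DIGIT ONE has a compatibility
/// decomposition to "1"):
///
/// ```
/// let normalizer = icu_normalizer::ComposingNormalizer::new_nfkc();
/// assert_eq!(normalizer.normalize("\u{2460}"), "1");
/// ```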
#[cfg(feature = "compiled_data")]
pub const fn new_nfkc() -> Self {
ComposingNormalizer {
decomposing_normalizer: DecomposingNormalizer::new_nfkd(),
canonical_compositions: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
),
}
}
icu_provider::gen_any_buffer_data_constructors!(
locale: skip,
options: skip,
error: NormalizerError,
#[cfg(skip)]
functions: [
new_nfkc,
try_new_nfkc_with_any_provider,
try_new_nfkc_with_buffer_provider,
try_new_nfkc_unstable,
Self,
]
);
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_nfkc)]
pub fn try_new_nfkc_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
where
D: DataProvider<CanonicalDecompositionDataV1Marker>
+ DataProvider<CompatibilityDecompositionSupplementV1Marker>
+ DataProvider<CanonicalDecompositionTablesV1Marker>
+ DataProvider<CompatibilityDecompositionTablesV1Marker>
+ DataProvider<CanonicalCompositionsV1Marker>
+ ?Sized,
{
let decomposing_normalizer = DecomposingNormalizer::try_new_nfkd_unstable(provider)?;
let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> =
provider.load(Default::default())?.take_payload()?;
Ok(ComposingNormalizer {
decomposing_normalizer,
canonical_compositions,
})
}
/// This is a special building block normalization for IDNA that implements parts of the Map
/// step and the following Normalize step.
///
/// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
/// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
/// U+0345 from a reordered character into a non-reordered character before reordering happens.
/// Therefore, the output of this normalization may differ for different inputs that are
/// canonically equivalent with each other if they differ by how U+0345 is ordered relative
/// to other reorderable characters.
#[cfg(feature = "compiled_data")]
pub(crate) const fn new_uts46() -> Self {
ComposingNormalizer {
decomposing_normalizer: DecomposingNormalizer::new_uts46_decomposed(),
canonical_compositions: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
),
}
}
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_uts46)]
pub(crate) fn try_new_uts46_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
where
D: DataProvider<CanonicalDecompositionDataV1Marker>
+ DataProvider<Uts46DecompositionSupplementV1Marker>
+ DataProvider<CanonicalDecompositionTablesV1Marker>
+ DataProvider<CompatibilityDecompositionTablesV1Marker>
// UTS 46 tables merged into CompatibilityDecompositionTablesV1Marker
+ DataProvider<CanonicalCompositionsV1Marker>
+ ?Sized,
{
let decomposing_normalizer =
DecomposingNormalizer::try_new_uts46_decomposed_unstable(provider)?;
let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> =
provider.load(Default::default())?.take_payload()?;
Ok(ComposingNormalizer {
decomposing_normalizer,
canonical_compositions,
})
}
/// Wraps a delegate iterator into a composing iterator
/// adapter by using the data already held by this normalizer.
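///
/// # Example
///
/// A minimal sketch:
///
/// ```
/// let normalizer = icu_normalizer::ComposingNormalizer::new_nfc();
/// let nfc: String = normalizer.normalize_iter("a\u{0308}".chars()).collect();
/// assert_eq!(nfc, "ä");
/// ```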
pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Composition<I> {
self.normalize_iter_private(iter, IgnorableBehavior::Unsupported)
}
fn normalize_iter_private<I: Iterator<Item = char>>(
&self,
iter: I,
ignorable_behavior: IgnorableBehavior,
) -> Composition<I> {
Composition::new(
Decomposition::new_with_supplements(
iter,
self.decomposing_normalizer.decompositions.get(),
self.decomposing_normalizer
.supplementary_decompositions
.as_ref()
.map(|s| s.get()),
self.decomposing_normalizer.tables.get(),
self.decomposing_normalizer
.supplementary_tables
.as_ref()
.map(|s| s.get()),
self.decomposing_normalizer.decomposition_passthrough_bound,
ignorable_behavior,
),
ZeroFrom::zero_from(&self.canonical_compositions.get().canonical_compositions),
self.decomposing_normalizer.composition_passthrough_bound,
)
}
normalizer_methods!();
composing_normalize_to!(
/// Normalize a string slice into a `Write` sink.
,
normalize_to,
core::fmt::Write,
&str,
{},
true,
as_str,
{
// Let's hope LICM hoists this outside `'outer`.
let composition_passthrough_byte_bound = if composition_passthrough_bound == 0x300 {
0xCCu8
} else {
// We can make this fancier if a normalization other than NFC, for which looking
// at non-ASCII lead bytes is worthwhile, is ever introduced.
composition_passthrough_bound.min(0x80) as u8
};
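// Rationale for 0xCC: every byte in the UTF-8 encodings of U+0000..=U+02FF
// is below 0xCC (U+0300 is the first character with the 0xCC lead byte), so
// a per-byte comparison passes through exactly the characters below the
// 0x0300 bound.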
// This is basically an `Option` discriminant for `undecomposed_starter`,
// but making it a boolean so that writes in the tightest loop are as
// simple as possible (and potentially as peel-hoistable as possible).
// Furthermore, this reduces the need for `unwrap()` later.
let mut undecomposed_starter_valid = true;
// The annotation really belongs on inner statements, but Rust doesn't
// allow it there.
#[allow(clippy::unwrap_used)]
'fast: loop {
let mut code_unit_iter = composition.decomposition.delegate.as_str().as_bytes().iter();
'fastest: loop {
if let Some(&upcoming_byte) = code_unit_iter.next() {
if upcoming_byte < composition_passthrough_byte_bound {
// Fast-track succeeded!
undecomposed_starter_valid = false;
continue 'fastest;
}
composition.decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();
break 'fastest;
}
// End of stream
sink.write_str(pending_slice)?;
return Ok(());
}
// `unwrap()` OK, because the slice is valid UTF-8 and we know there
// is an upcoming byte.
let upcoming = composition.decomposition.delegate.next().unwrap();
let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
// Can't combine backwards, hence a plain (non-backwards-combining)
// starter albeit past `composition_passthrough_bound`
// Fast-track succeeded!
undecomposed_starter = upcoming_with_trie_value;
undecomposed_starter_valid = true;
continue 'fast;
}
// We need to fall off the fast path.
composition.decomposition.pending = Some(upcoming_with_trie_value);
let consumed_so_far_slice = if undecomposed_starter_valid {
&pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_str().len() - upcoming.len_utf8() - undecomposed_starter.character.len_utf8()]
} else {
// Slicing and unwrap OK, because we have evidently read enough previously.
let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_str().len() - upcoming.len_utf8()].chars();
// `unwrap` OK, because we previously managed to read the previous character
undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
undecomposed_starter_valid = true;
consumed_so_far.as_str()
};
sink.write_str(consumed_so_far_slice)?;
break 'fast;
}
debug_assert!(undecomposed_starter_valid);
},
text,
sink,
composition,
composition_passthrough_bound,
undecomposed_starter,
pending_slice,
len_utf8,
);
composing_normalize_to!(
/// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
///
/// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
/// according to the WHATWG Encoding Standard.
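///
/// A minimal sketch of the error mapping (0xFF is not valid UTF-8):
///
/// ```
/// let normalizer = icu_normalizer::ComposingNormalizer::new_nfc();
/// let mut out = String::new();
/// let _ = normalizer.normalize_utf8_to(b"a\xFF", &mut out);
/// assert_eq!(out, "a\u{FFFD}");
/// ```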
,
normalize_utf8_to,
core::fmt::Write,
&[u8],
{},
false,
as_slice,
{
// This is basically an `Option` discriminant for `undecomposed_starter`,
// but making it a boolean so that writes in the tightest loop are as
// simple as possible (and potentially as peel-hoistable as possible).
// Furthermore, this reduces the need for `unwrap()` later.
let mut undecomposed_starter_valid = true;
'fast: loop {
if let Some(upcoming) = composition.decomposition.delegate.next() {
if u32::from(upcoming) < composition_passthrough_bound {
// Fast-track succeeded!
undecomposed_starter_valid = false;
continue 'fast;
}
// TODO(#2006): Annotate as unlikely
if upcoming == REPLACEMENT_CHARACTER {
// Can't tell if this is an error or a literal U+FFFD in
// the input. Assuming the former to be sure.
// Since the U+FFFD might signify an error, we can't
// assume `upcoming.len_utf8()` for the backoff length.
let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len()].chars();
let back = consumed_so_far.next_back();
debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
let consumed_so_far_slice = consumed_so_far.as_slice();
sink.write_str(unsafe{ from_utf8_unchecked(consumed_so_far_slice)})?;
undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
undecomposed_starter_valid = true;
composition.decomposition.pending = None;
break 'fast;
}
let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
// Can't combine backwards, hence a plain (non-backwards-combining)
// starter albeit past `composition_passthrough_bound`
// Fast-track succeeded!
undecomposed_starter = upcoming_with_trie_value;
undecomposed_starter_valid = true;
continue 'fast;
}
// We need to fall off the fast path.
composition.decomposition.pending = Some(upcoming_with_trie_value);
// The annotation really belongs on an inner statement, but Rust doesn't
// allow it there.
#[allow(clippy::unwrap_used)]
let consumed_so_far_slice = if undecomposed_starter_valid {
&pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len() - upcoming.len_utf8() - undecomposed_starter.character.len_utf8()]
} else {
// Slicing and unwrap OK, because we have evidently read enough previously.
let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len() - upcoming.len_utf8()].chars();
// `unwrap` OK, because we previously managed to read the previous character
undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
undecomposed_starter_valid = true;
consumed_so_far.as_slice()
};
sink.write_str(unsafe { from_utf8_unchecked(consumed_so_far_slice)})?;
break 'fast;
}
// End of stream
sink.write_str(unsafe {from_utf8_unchecked(pending_slice) })?;
return Ok(());
}
debug_assert!(undecomposed_starter_valid);
},
text,
sink,
composition,
composition_passthrough_bound,
undecomposed_starter,
pending_slice,
len_utf8,
);
composing_normalize_to!(
/// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
///
/// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
/// before normalizing.
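///
/// A minimal sketch (this assumes the `write16` crate's `alloc` feature,
/// which provides a `Write16` impl for `Vec<u16>`):
///
/// ```
/// let normalizer = icu_normalizer::ComposingNormalizer::new_nfc();
/// let mut out: Vec<u16> = Vec::new();
/// // 0xD800 is an unpaired high surrogate.
/// let _ = normalizer.normalize_utf16_to(&[0x0061, 0xD800], &mut out);
/// assert_eq!(out, [0x0061, 0xFFFD]);
/// ```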
,
normalize_utf16_to,
write16::Write16,
&[u16],
{
sink.size_hint(text.len())?;
},
false,
as_slice,
{
let mut code_unit_iter = composition.decomposition.delegate.as_slice().iter();
let mut upcoming32;
// This is basically an `Option` discriminant for `undecomposed_starter`,
// but making it a boolean so that writes to it are as
// simple as possible.
// Furthermore, this removes the need for `unwrap()` later.
let mut undecomposed_starter_valid;
// The purpose of the counter is to flush once in a while. If we flush
// too much, there is too much flushing overhead. If we flush too rarely,
// the flush starts reading from too far behind compared to the hot
// recently-read memory.
let mut counter = UTF16_FAST_PATH_FLUSH_THRESHOLD;
// The purpose of this trickiness is to avoid writing to
// `undecomposed_starter_valid` from the tightest loop. Writing to it
// from there destroys performance.
let mut counter_reference = counter - 1;
'fast: loop {
counter -= 1;
if let Some(&upcoming_code_unit) = code_unit_iter.next() {
upcoming32 = u32::from(upcoming_code_unit); // may be surrogate
if upcoming32 < composition_passthrough_bound && counter != 0 {
// No need for surrogate or U+FFFD check, because
// `composition_passthrough_bound` cannot be higher than
// U+0300.
// Fast-track succeeded!
continue 'fast;
}
// If `counter` equals `counter_reference`, the `continue 'fast`
// line above has not executed and `undecomposed_starter` is still
// valid.
undecomposed_starter_valid = counter == counter_reference;
// The loop is only broken out of as a forward goto
#[allow(clippy::never_loop)]
'surrogateloop: loop {
let surrogate_base = upcoming32.wrapping_sub(0xD800);
if surrogate_base > (0xDFFF - 0xD800) {
// Not surrogate
break 'surrogateloop;
}
if surrogate_base <= (0xDBFF - 0xD800) {
let iter_backup = code_unit_iter.clone();
if let Some(&low) = code_unit_iter.next() {
if in_inclusive_range16(low, 0xDC00, 0xDFFF) {
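// Combine the high and low surrogates into a scalar value:
// 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00),
// algebraically folded into a single constant subtrahend.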
upcoming32 = (upcoming32 << 10) + u32::from(low)
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
break 'surrogateloop;
} else {
code_unit_iter = iter_backup;
}
}
}
// unpaired surrogate
let slice_to_write = &pending_slice[..pending_slice.len() - code_unit_iter.as_slice().len() - 1];
sink.write_slice(slice_to_write)?;
undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
undecomposed_starter_valid = true;
composition.decomposition.pending = None;
break 'fast;
}
// Not unpaired surrogate
let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() && counter != 0 {
// Can't combine backwards, hence a plain (non-backwards-combining)
// starter albeit past `composition_passthrough_bound`
// Fast-track succeeded!
undecomposed_starter = upcoming_with_trie_value;
// Cause `undecomposed_starter_valid` to be set to true.
// This regresses English performance on Haswell by 11%
// compared to commenting out this assignment to
// `counter_reference`.
counter_reference = counter - 1;
continue 'fast;
}
// We need to fall off the fast path.
composition.decomposition.pending = Some(upcoming_with_trie_value);
// The annotation really belongs on an inner statement, but Rust doesn't
// allow it there.
#[allow(clippy::unwrap_used)]
let consumed_so_far_slice = if undecomposed_starter_valid {
&pending_slice[..pending_slice.len() - code_unit_iter.as_slice().len() - upcoming.len_utf16() - undecomposed_starter.character.len_utf16()]
} else {
// Slicing and unwrap OK, because we have evidently read enough previously.
let mut consumed_so_far = pending_slice[..pending_slice.len() - code_unit_iter.as_slice().len() - upcoming.len_utf16()].chars();
// `unwrap` OK, because we previously managed to read the previous character
undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
undecomposed_starter_valid = true;
consumed_so_far.as_slice()
};
sink.write_slice(consumed_so_far_slice)?;
break 'fast;
}
// End of stream
sink.write_slice(pending_slice)?;
return Ok(());
}
debug_assert!(undecomposed_starter_valid);
// Sync the main iterator
composition.decomposition.delegate = code_unit_iter.as_slice().chars();
},
text,
sink,
composition,
composition_passthrough_bound,
undecomposed_starter,
pending_slice,
len_utf16,
);
}
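// Sink that verifies, without allocating, that the writes it receives
// exactly reproduce the expected (purportedly already-normalized) input,
// comparing slice addresses for pass-through runs.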
struct IsNormalizedSinkUtf16<'a> {
expect: &'a [u16],
}
impl<'a> IsNormalizedSinkUtf16<'a> {
pub fn new(slice: &'a [u16]) -> Self {
IsNormalizedSinkUtf16 { expect: slice }
}
pub fn finished(&self) -> bool {
self.expect.is_empty()
}
}
impl<'a> Write16 for IsNormalizedSinkUtf16<'a> {
fn write_slice(&mut self, s: &[u16]) -> core::fmt::Result {
// We know that if we get a slice, it's a pass-through,
// so we can compare addresses. Indexing is OK, because
// an indexing failure would be a code bug rather than
// an input or data issue.
#[allow(clippy::indexing_slicing)]
if s.as_ptr() == self.expect.as_ptr() {
self.expect = &self.expect[s.len()..];
Ok(())
} else {
Err(core::fmt::Error {})
}
}
fn write_char(&mut self, c: char) -> core::fmt::Result {
let mut iter = self.expect.chars();
if iter.next() == Some(c) {
self.expect = iter.as_slice();
Ok(())
} else {
Err(core::fmt::Error {})
}
}
}
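// UTF-8 counterpart of `IsNormalizedSinkUtf16`; see the comment above.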
struct IsNormalizedSinkUtf8<'a> {
expect: &'a [u8],
}
impl<'a> IsNormalizedSinkUtf8<'a> {
pub fn new(slice: &'a [u8]) -> Self {
IsNormalizedSinkUtf8 { expect: slice }
}
pub fn finished(&self) -> bool {
self.expect.is_empty()
}
}
impl<'a> core::fmt::Write for IsNormalizedSinkUtf8<'a> {
fn write_str(&mut self, s: &str) -> core::fmt::Result {
// We know that if we get a slice, it's a pass-through,
// so we can compare addresses. Indexing is OK, because
// an indexing failure would be a code bug rather than
// an input or data issue.
#[allow(clippy::indexing_slicing)]
if s.as_ptr() == self.expect.as_ptr() {
self.expect = &self.expect[s.len()..];
Ok(())
} else {
Err(core::fmt::Error {})
}
}
fn write_char(&mut self, c: char) -> core::fmt::Result {
let mut iter = self.expect.chars();
if iter.next() == Some(c) {
self.expect = iter.as_slice();
Ok(())
} else {
Err(core::fmt::Error {})
}
}
}
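// `&str` counterpart of `IsNormalizedSinkUtf16`; see the comment above.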
struct IsNormalizedSinkStr<'a> {
expect: &'a str,
}
impl<'a> IsNormalizedSinkStr<'a> {
pub fn new(slice: &'a str) -> Self {
IsNormalizedSinkStr { expect: slice }
}
pub fn finished(&self) -> bool {
self.expect.is_empty()
}
}
impl<'a> core::fmt::Write for IsNormalizedSinkStr<'a> {
fn write_str(&mut self, s: &str) -> core::fmt::Result {
// We know that if we get a slice, it's a pass-through,
// so we can compare addresses. Indexing is OK, because
// an indexing failure would be a code bug rather than
// an input or data issue.
#[allow(clippy::indexing_slicing)]
if s.as_ptr() == self.expect.as_ptr() {
self.expect = &self.expect[s.len()..];
Ok(())
} else {
Err(core::fmt::Error {})
}
}
fn write_char(&mut self, c: char) -> core::fmt::Result {
let mut iter = self.expect.chars();
if iter.next() == Some(c) {
self.expect = iter.as_str();
Ok(())
} else {
Err(core::fmt::Error {})
}
}
}