// Source listing: firefox-main/third_party/rust/icu_normalizer/src/latin1.rs

// This file is part of ICU4X. For terms of use, please see the file

// called LICENSE at the top level of the ICU4X source tree

// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! Methods for normalizing Latin1 input into a UTF-16 sink.

//!

//! NFC is not available, since Latin1 input is already known to be

//! in NFC.

use write16::Write16;

/// Per-character decomposition data for U+00A0..=U+00FF. Entries start
/// from U+00A0 NO-BREAK SPACE, so entry `i` describes U+00A0 + `i`.
///
/// Each entry uses one of three encodings:
///
/// * `0`: the character is always its own normalization.
/// * High 8 bits zero, value non-zero: the character has a compatibility
///   decomposition. The value is the start index into
///   `COMPATIBILITY_DECOMPOSITIONS` shifted left by two, with the length
///   of the subslice of `COMPATIBILITY_DECOMPOSITIONS` in the low 2 bits.
/// * High 8 bits non-zero: the character has a canonical decomposition
///   into two characters. The high 8 bits are the first character and
///   the low 8 bits are the offset that needs to be added to U+0300 to
///   get the second character.
///
/// In the data below, the first 0x20 entries (U+00A0..=U+00BF) are either
/// `0` or compatibility entries, and the remaining 0x40 entries
/// (U+00C0..=U+00FF) are either `0` or canonical entries.
static TABLE: [u16; 96] = [
    0x01,   // nbsp
    0,      // ¡
    0,      // ¢
    0,      // £
    0,      // ¤
    0,      // ¥
    0,      // ¦
    0,      // §
    0x02,   // ¨
    0,      // ©
    0x09,   // ª
    0,      // «
    0,      // ¬
    0,      // shy
    0,      // ®
    0x0E,   // ¯
    0,      // °
    0,      // ±
    0x41,   // ²
    0x45,   // ³
    0x16,   // ´
    0x1D,   // µ
    0,      // ¶
    0,      // ·
    0x22,   // ¸
    0x2D,   // ¹
    0x29,   // º
    0,      // »
    0x2F,   // ¼
    0x3B,   // ½
    0x47,   // ¾
    0,      // ¿
    0x4100, // À
    0x4101, // Á
    0x4102, // Â
    0x4103, // Ã
    0x4108, // Ä
    0x410A, // Å
    0,      // Æ
    0x4327, // Ç
    0x4500, // È
    0x4501, // É
    0x4502, // Ê
    0x4508, // Ë
    0x4900, // Ì
    0x4901, // Í
    0x4902, // Î
    0x4908, // Ï
    0,      // Ð
    0x4E03, // Ñ
    0x4F00, // Ò
    0x4F01, // Ó
    0x4F02, // Ô
    0x4F03, // Õ
    0x4F08, // Ö
    0,      // ×
    0,      // Ø
    0x5500, // Ù
    0x5501, // Ú
    0x5502, // Û
    0x5508, // Ü
    0x5901, // Ý
    0,      // Þ
    0,      // ß
    0x6100, // à
    0x6101, // á
    0x6102, // â
    0x6103, // ã
    0x6108, // ä
    0x610A, // å
    0,      // æ
    0x6327, // ç
    0x6500, // è
    0x6501, // é
    0x6502, // ê
    0x6508, // ë
    0x6900, // ì
    0x6901, // í
    0x6902, // î
    0x6908, // ï
    0,      // ð
    0x6E03, // ñ
    0x6F00, // ò
    0x6F01, // ó
    0x6F02, // ô
    0x6F03, // õ
    0x6F08, // ö
    0,      // ÷
    0,      // ø
    0x7500, // ù
    0x7501, // ú
    0x7502, // û
    0x7508, // ü
    0x7901, // ý
    0,      // þ
    0x7908, // ÿ
];

/// Backing storage for the compatibility decompositions.
///
/// Compatibility entries of `TABLE` select a subslice of this array by
/// start index and length. Subslices may overlap: a one-unit
/// decomposition can reuse a unit of a longer one, as noted per row.
#[rustfmt::skip]
static COMPATIBILITY_DECOMPOSITIONS: [u16; 20] = [
    0x0020, 0x0308,         // ¨ (nbsp reuses the leading 0x0020)
    0x0061,                 // ª
    0x0020, 0x0304,         // ¯
    0x0020, 0x0301,         // ´
    0x03BC,                 // µ
    0x0020, 0x0327,         // ¸
    0x006F,                 // º
    0x0031, 0x2044, 0x0034, // ¼ (¹ reuses the leading 0x0031)
    0x0031, 0x2044, 0x0032, // ½ (² reuses the trailing 0x0032)
    0x0033, 0x2044, 0x0034, // ¾ (³ reuses the leading 0x0033)
];

const NFKC_BITS: u32 = const {

    let mut accu = 0;

    let mut i = 0;

    while i < 0x20 {

        if TABLE[i] != 0 {

            accu |= 1 << (i as u32);

        i += 1;

    accu

};

const NFD_BITS: u64 = const {

    let mut accu = 0;

    let mut i = 0x20;

    while i < TABLE.len() {

        if TABLE[i] != 0 {

            accu |= 1 << ((i - 0x20) as u32);

        i += 1;

    accu

};

const NFKD_BITS: u128 = const {

    let mut accu = 0;

    let mut i = 0;

    while i < TABLE.len() {

        if TABLE[i] != 0 {

            accu |= 1 << ((i + 0x20) as u32);

        i += 1;

    accu

};

/// Writes the compatibility decomposition of `c` to `sink`.

#[inline]

fn compatibility_decomposition(val: u16) -> &'static [u16] {

    debug_assert!(val <= 0xFF);

    let len = val & 0b11;

    let index = val >> 2;

    COMPATIBILITY_DECOMPOSITIONS

        .get(index as usize..index as usize + len as usize)

        .unwrap_or_else(|| {

            // Internal bug, not even GIGO, never supposed to happen

            debug_assert!(false);

&[]

})

/// Normalize Latin1 `text` to NFD UTF-16 written to `sink`.

#[inline]

pub fn normalize_nfd_to<W: Write16 + ?Sized>(text: &[u16], sink: &mut W) -> core::fmt::Result {

    // Indexing is OK, because the index is statically in range.

    #[expect(clippy::indexing_slicing)]

    let table = &TABLE[0x20..];

    let mut text_left = text;

    let mut iter = text_left.iter();

    while let Some(u) = iter.next() {

        let c = *u;

        if c < 0xC0 {

            continue;

        if let Some(val) = table.get(c.wrapping_sub(0xC0) as usize) {

            let v = *val;

            if v != 0 {

                let remaining = iter.as_slice();

                // Indexing is OK by construction.

                #[expect(clippy::indexing_slicing)]

                sink.write_slice(&text_left[..text_left.len() - remaining.len() - 1])?;

                text_left = remaining;

                sink.write_slice(&[v >> 8, (v & 0xFF) + 0x0300])?;

    sink.write_slice(text_left)?;

    Ok(())

/// Normalize Latin1 `text` to NFKD UTF-16 written to `sink`.

#[inline]

pub fn normalize_nfkd_to<W: Write16 + ?Sized>(text: &[u16], sink: &mut W) -> core::fmt::Result {

    let mut text_left = text;

    let mut iter = text_left.iter();

    while let Some(u) = iter.next() {

        let c = *u;

        if c < 0xA0 {

            continue;

        if let Some(val) = TABLE.get(c.wrapping_sub(0xA0) as usize) {

            let v = *val;

            if v != 0 {

                let remaining = iter.as_slice();

                // Indexing is OK by construction.

                #[expect(clippy::indexing_slicing)]

                sink.write_slice(&text_left[..text_left.len() - remaining.len() - 1])?;

                text_left = remaining;

                let hi = v >> 8;

                if hi != 0 {

                    sink.write_slice(&[hi, (v & 0xFF) + 0x0300])?;

                } else {

                    sink.write_slice(compatibility_decomposition(v))?;

    sink.write_slice(text_left)?;

    Ok(())

/// Normalize Latin1 `text` to NFKC UTF-16 written to `sink`.

#[inline]

pub fn normalize_nfkc_to<W: Write16 + ?Sized>(text: &[u16], sink: &mut W) -> core::fmt::Result {

    // Indexing is OK, because the index is statically in range.

    #[expect(clippy::indexing_slicing)]

    let table = &TABLE[..0x20];

    let mut text_left = text;

    let mut iter = text_left.iter();

    while let Some(u) = iter.next() {

        let c = *u;

        if c < 0xA0 {

            continue;

        if let Some(val) = table.get(c.wrapping_sub(0xA0) as usize) {

            let v = *val;

            if v != 0 {

                let remaining = iter.as_slice();

                // Indexing is OK by construction.

                #[expect(clippy::indexing_slicing)]

                sink.write_slice(&text_left[..text_left.len() - remaining.len() - 1])?;

                text_left = remaining;

                sink.write_slice(compatibility_decomposition(v))?;

    sink.write_slice(text_left)?;

    Ok(())

/// Split Latin1 `text` into `(head, tail)` such that the first

/// byte of `tail` is the first byte of input that is not in NFD.

/// If `text` is fully in NFD, `tail` is empty.

#[inline]

pub fn split_normalized_nfd(text: &[u8]) -> (&[u8], &[u8]) {

    let mut iter = text.iter();

    while let Some(c) = iter.next() {

        let b = *c;

        if let Some(shifted) = 1u64.checked_shl(u32::from(b.wrapping_sub(0xC0))) {

            if (NFD_BITS & shifted) != 0 {

                let tail = iter.as_slice();

                return text

                    .split_at_checked(text.len() - tail.len() - 1)

                    .unwrap_or_else(|| {

                        // Internal bug, not even GIGO, never supposed to happen

                        debug_assert!(false);

                        (&[], text)

});

    (text, &[])

/// Split Latin1 `text` into `(head, tail)` such that the first

/// byte of `tail` is the first byte of input that is not in NFKD.

/// If `text` is fully in NFKD, `tail` is empty.

#[inline]

pub fn split_normalized_nfkd(text: &[u8]) -> (&[u8], &[u8]) {

    let mut iter = text.iter();

    while let Some(c) = iter.next() {

        let b = *c;

        if let Some(shifted) = 1u128.checked_shl(u32::from(b.wrapping_sub(0x80))) {

            if (NFKD_BITS & shifted) != 0 {

                let tail = iter.as_slice();

                return text

                    .split_at_checked(text.len() - tail.len() - 1)

                    .unwrap_or_else(|| {

                        // Internal bug, not even GIGO, never supposed to happen

                        debug_assert!(false);

                        (&[], text)

});

    (text, &[])

/// Split Latin1 `text` into `(head, tail)` such that the first

/// byte of `tail` is the first byte of input that is not in NFKC.

/// If `text` is fully in NFKC, `tail` is empty.

#[inline]

pub fn split_normalized_nfkc(text: &[u8]) -> (&[u8], &[u8]) {

    let mut iter = text.iter();

    while let Some(c) = iter.next() {

        let b = *c;

        // Make ASCII go one instruction faster.

        if b < 0xA0 {

            continue;

        if let Some(shifted) = 1u32.checked_shl(u32::from(b.wrapping_sub(0xA0))) {

            if (NFKC_BITS & shifted) != 0 {

                let tail = iter.as_slice();

                return text

                    .split_at_checked(text.len() - tail.len() - 1)

                    .unwrap_or_else(|| {

                        // Internal bug, not even GIGO, never supposed to happen

                        debug_assert!(false);

                        (&[], text)

});

    (text, &[])