lib.rs - mozsearch

// Copyright Mozilla Foundation

//

// Licensed under the Apache License (Version 2.0), or the MIT license,

// (the "Licenses") at your option. You may not use this file except in

// compliance with one of the Licenses. You may obtain copies of the

// Licenses at:

//

//    https://www.apache.org/licenses/LICENSE-2.0

//    https://opensource.org/licenses/MIT

//

// Unless required by applicable law or agreed to in writing, software

// distributed under the Licenses is distributed on an "AS IS" BASIS,

// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

// See the Licenses for the specific language governing permissions and

// limitations under the Licenses.

#![no_std]

//! Provides iteration by `char` over `&[u8]` containing potentially-invalid

//! UTF-8 such that errors are handled according to the [WHATWG Encoding

//! Standard](https://encoding.spec.whatwg.org/#utf-8-decoder) (i.e. the same

//! way as in `String::from_utf8_lossy`).

//!

//! The trait `Utf8CharsEx` provides the convenience method `chars()` on

//! byte slices themselves instead of having to use the more verbose

//! `Utf8Chars::new(slice)`.

//!

//! ```rust

//! use utf8_iter::Utf8CharsEx;

//! let data = b"\xFF\xC2\xE2\xE2\x98\xF0\xF0\x9F\xF0\x9F\x92\xE2\x98\x83";

//! let from_iter: String = data.chars().collect();

//! let from_std = String::from_utf8_lossy(data);

//! assert_eq!(from_iter, from_std);

//! ```

mod indices;

mod report;

pub use crate::indices::Utf8CharIndices;

pub use crate::report::ErrorReportingUtf8Chars;

pub use crate::report::Utf8CharsError;

use core::iter::FusedIterator;

/// Wrapper around the UTF-8 validation lookup table.
///
/// The wrapper exists so that the alignment attribute can be applied to the
/// table: the data is aligned to 64 bytes so that it starts on a cache-line
/// boundary.
#[repr(align(64))] // Align to cache lines
struct Utf8Data {
    /// Lookup table of 384 byte-sized bit sets.
    pub table: [u8; 384],
}

// This is generated code copied and pasted from utf_8.rs of encoding_rs.

// Please don't edit by hand but instead regenerate as instructed in that

// file.
//
// How the table is consumed by `Utf8Chars::next()`:
//
// * `table[second]` (indices 0..=255) is looked up with the second byte of
//   a sequence.
// * `table[first + 0x80]` is looked up with the lead byte; since ASCII lead
//   bytes are handled before the lookup, this lands in indices 256..=383.
//
// The two entries are ANDed together and the sequence is only accepted when
// the result is zero, i.e. each bit marks a lead/second-byte combination
// that must be rejected. Reading the values below: leads that cannot start
// a three- or four-byte sequence carry bit 4, which every second-byte entry
// also carries; ordinary leads carry bit 8, which is set only for second
// bytes outside 0x80..=0xBF; the leads 0xE0, 0xED, 0xF0 and 0xF4 carry the
// dedicated bits 16, 32, 64 and 128 that exclude the part of the
// continuation range which would yield an overlong form, a surrogate, or a
// value above U+10FFFF.

static UTF8_DATA: Utf8Data = Utf8Data {

    table: [

        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,

        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,

        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,

        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,

        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,

        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,

        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,

        252, 252, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 148, 148, 148,

        148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 164, 164, 164, 164, 164,

        164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164,

        164, 164, 164, 164, 164, 164, 164, 164, 164, 252, 252, 252, 252, 252, 252, 252, 252, 252,

        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,

        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,

        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,

        252, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,

        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,

        4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,

        8, 8, 8, 8, 8, 8, 8, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 32, 8, 8, 64, 8, 8, 8, 128, 4,

        4, 4, 4, 4, 4, 4, 4, 4, 4, 4,

],

};

// End manually copypasted generated code.

/// Returns `true` if `start <= i && i <= end`.
///
/// The test is done with a single comparison: subtracting `start` with
/// wrap-around maps values below `start` to large numbers, so one unsigned
/// `<=` against the width of the range covers both bounds.
///
/// Callers must pass `start <= end`; otherwise `end - start` underflows.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    debug_assert!(start <= end);
    i.wrapping_sub(start) <= (end - start)
}

/// Iterator by `char` over `&[u8]` that contains
/// potentially-invalid UTF-8. See the crate documentation.
#[derive(Debug, Clone)]
pub struct Utf8Chars<'a> {
    /// The bytes that have not been consumed yet. Forward iteration trims
    /// the front of this slice and backward iteration trims its end.
    remaining: &'a [u8],
}

impl<'a> Utf8Chars<'a> {

    #[inline(always)]

    /// Creates the iterator from a byte slice.

    pub fn new(bytes: &'a [u8]) -> Self {

        Utf8Chars::<'a> { remaining: bytes }

    /// Views the current remaining data in the iterator as a subslice

    /// of the original slice.

    #[inline(always)]

    pub fn as_slice(&self) -> &'a [u8] {

        self.remaining

    #[inline(never)]

    fn next_fallback(&mut self) -> Option<char> {

        if self.remaining.is_empty() {

            return None;

        let first = self.remaining[0];

        if first < 0x80 {

            self.remaining = &self.remaining[1..];

            return Some(char::from(first));

        if !in_inclusive_range8(first, 0xC2, 0xF4) || self.remaining.len() == 1 {

            self.remaining = &self.remaining[1..];

            return Some('\u{FFFD}');

        let second = self.remaining[1];

        let (lower_bound, upper_bound) = match first {

            0xE0 => (0xA0, 0xBF),

            0xED => (0x80, 0x9F),

            0xF0 => (0x90, 0xBF),

            0xF4 => (0x80, 0x8F),

            _ => (0x80, 0xBF),

};

        if !in_inclusive_range8(second, lower_bound, upper_bound) {

            self.remaining = &self.remaining[1..];

            return Some('\u{FFFD}');

        if first < 0xE0 {

            self.remaining = &self.remaining[2..];

            let point = ((u32::from(first) & 0x1F) << 6) | (u32::from(second) & 0x3F);

            return Some(unsafe { char::from_u32_unchecked(point) });

        if self.remaining.len() == 2 {

            self.remaining = &self.remaining[2..];

            return Some('\u{FFFD}');

        let third = self.remaining[2];

        if !in_inclusive_range8(third, 0x80, 0xBF) {

            self.remaining = &self.remaining[2..];

            return Some('\u{FFFD}');

        if first < 0xF0 {

            self.remaining = &self.remaining[3..];

            let point = ((u32::from(first) & 0xF) << 12)

                | ((u32::from(second) & 0x3F) << 6)

                | (u32::from(third) & 0x3F);

            return Some(unsafe { char::from_u32_unchecked(point) });

        // At this point, we have a valid 3-byte prefix of a

        // four-byte sequence that has to be incomplete, because

        // otherwise `next()` would have succeeded.

        self.remaining = &self.remaining[3..];

        Some('\u{FFFD}')

impl<'a> Iterator for Utf8Chars<'a> {

    type Item = char;

    #[inline]

    fn next(&mut self) -> Option<char> {

        // Not delegating directly to `ErrorReportingUtf8Chars` to avoid

        // an extra branch in the common case based on a cursory inspection

        // of generated code in a similar case. Be sure to inspect the

        // generated code as inlined into an actual usage site carefully

        // if attempting to consolidate the source code here.

        // This loop is only broken out of as goto forward

        #[allow(clippy::never_loop)]

        loop {

            if self.remaining.len() < 4 {

                break;

            let first = self.remaining[0];

            if first < 0x80 {

                self.remaining = &self.remaining[1..];

                return Some(char::from(first));

            let second = self.remaining[1];

            if in_inclusive_range8(first, 0xC2, 0xDF) {

                if !in_inclusive_range8(second, 0x80, 0xBF) {

                    break;

                let point = ((u32::from(first) & 0x1F) << 6) | (u32::from(second) & 0x3F);

                self.remaining = &self.remaining[2..];

                return Some(unsafe { char::from_u32_unchecked(point) });

            // This table-based formulation was benchmark-based in encoding_rs,

            // but it hasn't been re-benchmarked in this iterator context.

            let third = self.remaining[2];

            if first < 0xF0 {

                if ((UTF8_DATA.table[usize::from(second)]

                    & UTF8_DATA.table[usize::from(first) + 0x80])

                    | (third >> 6))

                    != 2

                    break;

                let point = ((u32::from(first) & 0xF) << 12)

                    | ((u32::from(second) & 0x3F) << 6)

                    | (u32::from(third) & 0x3F);

                self.remaining = &self.remaining[3..];

                return Some(unsafe { char::from_u32_unchecked(point) });

            let fourth = self.remaining[3];

            if (u16::from(

                UTF8_DATA.table[usize::from(second)] & UTF8_DATA.table[usize::from(first) + 0x80],

            ) | u16::from(third >> 6)

                | (u16::from(fourth & 0xC0) << 2))

                != 0x202

                break;

            let point = ((u32::from(first) & 0x7) << 18)

                | ((u32::from(second) & 0x3F) << 12)

                | ((u32::from(third) & 0x3F) << 6)

                | (u32::from(fourth) & 0x3F);

            self.remaining = &self.remaining[4..];

            return Some(unsafe { char::from_u32_unchecked(point) });

        self.next_fallback()

impl<'a> DoubleEndedIterator for Utf8Chars<'a> {

    #[inline]

    fn next_back(&mut self) -> Option<char> {

        if self.remaining.is_empty() {

            return None;

        let mut attempt = 1;

        for b in self.remaining.iter().rev() {

            if b & 0xC0 != 0x80 {

                let (head, tail) = self.remaining.split_at(self.remaining.len() - attempt);

                let mut inner = Utf8Chars::new(tail);

                let candidate = inner.next();

                if inner.as_slice().is_empty() {

                    self.remaining = head;

                    return candidate;

                break;

            if attempt == 4 {

                break;

            attempt += 1;

        self.remaining = &self.remaining[..self.remaining.len() - 1];

        Some('\u{FFFD}')

// Marker impl: once the remaining slice is empty, `next()` falls through to
// `next_fallback()`, which returns `None` without modifying the slice, so
// every later call returns `None` as well.
impl FusedIterator for Utf8Chars<'_> {}

/// Convenience trait that adds `chars()` and `char_indices()` methods

/// similar to the ones on string slices to byte slices.

pub trait Utf8CharsEx {

    fn chars(&self) -> Utf8Chars<'_>;

    fn char_indices(&self) -> Utf8CharIndices<'_>;

impl Utf8CharsEx for [u8] {

    /// Convenience method for creating an UTF-8 iterator

    /// for the slice.

    #[inline]

    fn chars(&self) -> Utf8Chars<'_> {

        Utf8Chars::new(self)

    /// Convenience method for creating a byte index and

    /// UTF-8 iterator for the slice.

    #[inline]

    fn char_indices(&self) -> Utf8CharIndices<'_> {

        Utf8CharIndices::new(self)

// No manually-written tests for forward iteration, because the code passed multiple
// days of fuzzing comparing with known-good behavior.

Revision control

Copy as Markdown

Other Tools