unicode.rs - mozsearch

Enable keyboard shortcuts

//! A set of helper functions for unescaping Fluent unicode escape sequences.

//!

//! # Unicode

//!

//! Fluent supports UTF-8 in all FTL resources, but it also allows

//! unicode sequences to be escaped in [`String

//! Literals`](super::ast::InlineExpression::StringLiteral).

//!

//! Four-digit hexadecimal sequences are encoded with `\u` and six-digit

//! sequences using `\U`.

//! ## Example

//!

//! ```

//! use fluent_syntax::unicode::unescape_unicode_to_string;

//!

//! assert_eq!(

//!     unescape_unicode_to_string("Foo \\u5bd2 Bar"),

//!     "Foo 寒 Bar"

//! );

//!

//! assert_eq!(

//!     unescape_unicode_to_string("Foo \\U01F68A Bar"),

//!     "Foo 🚊 Bar"

//! );

//! ```

//!

//! # Other unescapes

//!

//! This also allows for a char `"` to be present inside an FTL string literal,

//! and for `\` itself to be escaped.

//!

//! ## Example

//!

//! ```

//! use fluent_syntax::unicode::unescape_unicode_to_string;

//!

//! assert_eq!(

//!     unescape_unicode_to_string("Foo \\\" Bar"),

//!     "Foo \" Bar"

//! );

//! assert_eq!(

//!     unescape_unicode_to_string("Foo \\\\ Bar"),

//!     "Foo \\ Bar"

//! );

//! ```

use std::borrow::Cow;

use std::char;

use std::fmt;

/// Character emitted in place of any escape sequence that cannot be decoded.
const UNKNOWN_CHAR: char = '�';

/// Decodes the hexadecimal payload of a unicode escape into a `char`.
///
/// `s` is the run of digits that followed `\u` / `\U`, or `None` when the
/// caller could not obtain that run. [`UNKNOWN_CHAR`] is returned when `s` is
/// `None`, when it contains anything other than ASCII hexadecimal digits, or
/// when the parsed value is not a valid Unicode scalar value (for example a
/// surrogate).
fn encode_unicode(s: Option<&str>) -> char {
    // `u32::from_str_radix` tolerates a leading `+`, which is not a valid part
    // of an escape sequence, so the digits are validated before parsing.
    s.filter(|digits| digits.bytes().all(|b| b.is_ascii_hexdigit()))
        .and_then(|digits| u32::from_str_radix(digits, 16).ok())
        .and_then(char::from_u32)
        .unwrap_or(UNKNOWN_CHAR)
}

/// Unescapes to a writer without allocating.

///

/// ## Example

///

/// ```

/// use fluent_syntax::unicode::unescape_unicode;

///

/// let mut s = String::new();

/// unescape_unicode(&mut s, "Foo \\U01F60A Bar");

/// assert_eq!(s, "Foo 😊 Bar");

/// ```

pub fn unescape_unicode<W>(w: &mut W, input: &str) -> fmt::Result

where

    W: fmt::Write,

    let bytes = input.as_bytes();

    let mut start = 0;

    let mut ptr = 0;

    while let Some(b) = bytes.get(ptr) {

        if b != &b'\\' {

            ptr += 1;

            continue;

        if start != ptr {

            w.write_str(&input[start..ptr])?;

        ptr += 1;

        let new_char = match bytes.get(ptr) {

            Some(b'\\') => '\\',

            Some(b'"') => '"',

            Some(u @ b'u') | Some(u @ b'U') => {

                let seq_start = ptr + 1;

                let len = if u == &b'u' { 4 } else { 6 };

                ptr += len;

                encode_unicode(input.get(seq_start..seq_start + len))

            _ => UNKNOWN_CHAR,

};

        ptr += 1;

        w.write_char(new_char)?;

        start = ptr;

    if start != ptr {

        w.write_str(&input[start..ptr])?;

    Ok(())

/// Unescapes to a `Cow<str>` optionally allocating.

///

/// ## Example

///

/// ```

/// use fluent_syntax::unicode::unescape_unicode_to_string;

///

/// assert_eq!(

///     unescape_unicode_to_string("Foo \\U01F60A Bar"),

///     "Foo 😊 Bar"

/// );

/// ```

pub fn unescape_unicode_to_string(input: &str) -> Cow<str> {

    let bytes = input.as_bytes();

    let mut result = Cow::from(input);

    let mut ptr = 0;

    while let Some(b) = bytes.get(ptr) {

        if b != &b'\\' {

            if let Cow::Owned(ref mut s) = result {

                s.push(*b as char);

            ptr += 1;

            continue;

        if let Cow::Borrowed(_) = result {

            result = Cow::from(&input[0..ptr]);

        ptr += 1;

        let new_char = match bytes.get(ptr) {

            Some(b'\\') => '\\',

            Some(b'"') => '"',

            Some(u @ b'u') | Some(u @ b'U') => {

                let start = ptr + 1;

                let len = if u == &b'u' { 4 } else { 6 };

                ptr += len;

                input

                    .get(start..(start + len))

                    .map_or(UNKNOWN_CHAR, |slice| encode_unicode(Some(slice)))

            _ => UNKNOWN_CHAR,

};

        result.to_mut().push(new_char);

        ptr += 1;

    result