lexer.rs - mozsearch

firefox-main/third_party/rust/wast/src/lexer.rs (file symbol)

Enable keyboard shortcuts

Source code

File a bug in Firefox Build System :: General

Revision control

Copy as Markdown

Other Tools

//! Definition of a lexer for the WebAssembly text format.

//!

//! This module provides a [`Lexer`][] type which is an iterate over the raw

//! tokens of a WebAssembly text file. A [`Lexer`][] accounts for every single

//! byte in a WebAssembly text field, returning tokens even for comments and

//! whitespace. Typically you'll ignore comments and whitespace, however.

//!

//! If you'd like to iterate over the tokens in a file you can do so via:

//!

//! ```

//! # fn foo() -> Result<(), wast::Error> {

//! use wast::lexer::Lexer;

//!

//! let wat = "(module (func $foo))";

//! for token in Lexer::new(wat).iter(0) {

//!     println!("{:?}", token?);

//! }

//! # Ok(())

//! # }

//! ```

//!

//! Note that you'll typically not use this module but will rather use

//! [`ParseBuffer`](crate::parser::ParseBuffer) instead.

//!

//! [`Lexer`]: crate::lexer::Lexer

use crate::token::Span;

use crate::Error;

use std::borrow::Cow;

use std::char;

use std::fmt;

use std::slice;

use std::str;

use std::str::Utf8Error;

/// A structure used to lex the s-expression syntax of WAT files.

///

/// This structure is used to generate [`Token`] items, which should account for

/// every single byte of the input as we iterate over it. A [`LexError`] is

/// returned for any non-lexable text.

#[derive(Clone)]

pub struct Lexer<'a> {

    input: &'a str,

    allow_confusing_unicode: bool,

/// A single token parsed from a `Lexer`.

#[derive(Copy, Clone, Debug, PartialEq)]

pub struct Token {

    /// The kind of token this represents, such as whether it's whitespace, a

    /// keyword, etc.

    pub kind: TokenKind,

    /// The byte offset within the original source for where this token came

    /// from.

    pub offset: usize,

    /// The byte length of this token as it resides in the original source.

//

    // NB: this is `u32` to enable packing `Token` into two pointers of size.

    // This does limit a single token to being at most 4G large, but that seems

    // probably ok.

    pub len: u32,

#[test]

fn token_is_not_too_big() {

    assert!(std::mem::size_of::<Token>() <= std::mem::size_of::<u64>() * 2);

/// Classification of what was parsed from the input stream.

///

/// This enumeration contains all kinds of fragments, including comments and

/// whitespace.

#[derive(Copy, Clone, Debug, PartialEq)]

pub enum TokenKind {

    /// A line comment, preceded with `;;`

    LineComment,

    /// A block comment, surrounded by `(;` and `;)`. Note that these can be

    /// nested.

    BlockComment,

    /// A fragment of source that represents whitespace.

    Whitespace,

    /// A left-parenthesis, including the source text for where it comes from.

    LParen,

    /// A right-parenthesis, including the source text for where it comes from.

    RParen,

    /// A string literal, which is actually a list of bytes.

    String,

    /// An identifier (like `$foo`).

///

    /// All identifiers start with `$` and the payload here is the original

    /// source text.

Id,

    /// A keyword, or something that starts with an alphabetic character.

///

    /// The payload here is the original source text.

    Keyword,

    /// An annotation (like `@foo`).

///

    /// All annotations start with `@` and the payload will be the name of the

    /// annotation.

    Annotation,

    /// A reserved series of `idchar` symbols. Unknown what this is meant to be

    /// used for, you'll probably generate an error about an unexpected token.

    Reserved,

    /// An integer.

    Integer(IntegerKind),

    /// A float.

    Float(FloatKind),

/// Description of the parsed integer from the source.

#[derive(Copy, Clone, Debug, PartialEq)]

pub struct IntegerKind {

    sign: Option<SignToken>,

    has_underscores: bool,

    hex: bool,

/// Description of a parsed float from the source.

#[allow(missing_docs)]

#[derive(Copy, Clone, Debug, PartialEq)]

pub enum FloatKind {

    #[doc(hidden)]

    Inf { negative: bool },

    #[doc(hidden)]

    Nan { negative: bool },

    #[doc(hidden)]

    NanVal {

        negative: bool,

        has_underscores: bool,

},

    #[doc(hidden)]

    Normal { has_underscores: bool, hex: bool },

enum ReservedKind {

    /// "..."

    String,

    /// anything that's just a sequence of `idchars!()`

    Idchars,

    /// $"..."

    IdString,

    /// @"..."

    AnnotationString,

    /// everything else (a conglomeration of strings, idchars, etc)

    Reserved,

/// Errors that can be generated while lexing.

///

/// All lexing errors have line/colum/position information as well as a

/// `LexError` indicating what kind of error happened while lexing.

#[derive(Debug, Clone, PartialEq, Eq)]

#[non_exhaustive]

pub enum LexError {

    /// A dangling block comment was found with an unbalanced `(;` which was

    /// never terminated in the file.

    DanglingBlockComment,

    /// An unexpected character was encountered when generally parsing and

    /// looking for something else.

    Unexpected(char),

    /// An invalid `char` in a string literal was found.

    InvalidStringElement(char),

    /// An invalid string escape letter was found (the thing after the `\` in

    /// string literals)

    InvalidStringEscape(char),

    /// An invalid hexadecimal digit was found.

    InvalidHexDigit(char),

    /// An invalid base-10 digit was found.

    InvalidDigit(char),

    /// Parsing expected `wanted` but ended up finding `found` instead where the

    /// two characters aren't the same.

    Expected {

        /// The character that was expected to be found

        wanted: char,

        /// The character that was actually found

        found: char,

},

    /// We needed to parse more but EOF (or end of the string) was encountered.

    UnexpectedEof,

    /// A number failed to parse because it was too big to fit within the target

    /// type.

    NumberTooBig,

    /// An invalid unicode value was found in a `\u{...}` escape in a string,

    /// only valid unicode scalars can be escaped that way.

    InvalidUnicodeValue(u32),

    /// A lone underscore was found when parsing a number, since underscores

    /// should always be preceded and succeeded with a digit of some form.

    LoneUnderscore,

    /// A "confusing" unicode character is present in a comment or a string

    /// literal, such as a character that changes the direction text is

    /// typically displayed in editors. This could cause the human-read

    /// version to behave differently than the compiler-visible version, so

    /// these are simply rejected for now.

    ConfusingUnicode(char),

    /// An invalid utf-8 sequence was found in a quoted identifier, such as

    /// `$"\ff"`.

    InvalidUtf8Id(Utf8Error),

    /// An empty identifier was found, or a lone `$`.

    EmptyId,

    /// An empty identifier was found, or a lone `@`.

    EmptyAnnotation,

/// A sign token for an integer.

#[derive(Clone, Copy, Debug, PartialEq, Eq)]

pub enum SignToken {

    /// Plus sign: "+",

    Plus,

    /// Minus sign: "-",

    Minus,

/// A fully parsed integer from a source string with a payload ready to parse

/// into an integral type.

#[derive(Debug, PartialEq)]

pub struct Integer<'a> {

    sign: Option<SignToken>,

    val: Cow<'a, str>,

    hex: bool,

/// Possible parsed float values

#[derive(Debug, PartialEq, Eq)]

pub enum Float<'a> {

    /// A float `NaN` representation

    Nan {

        /// The specific bits to encode for this float, optionally

        val: Option<Cow<'a, str>>,

        /// Whether or not this is a negative `NaN` or not.

        negative: bool,

},

    /// An float infinite representation,

    Inf {

        #[allow(missing_docs)]

        negative: bool,

},

    /// A parsed and separated floating point value

    Val {

        /// Whether or not the `integral` and `fractional` are specified in hex

        hex: bool,

        /// The float parts before the `.`

        integral: Cow<'a, str>,

        /// The float parts after the `.`

        fractional: Option<Cow<'a, str>>,

        /// The exponent to multiple this `integral.fractional` portion of the

        /// float by. If `hex` is true this is `2^exponent` and otherwise it's

        /// `10^exponent`

        exponent: Option<Cow<'a, str>>,

},

// https://webassembly.github.io/spec/core/text/values.html#text-idchar

macro_rules! idchars {

    () => {

        b'0'..=b'9'

        | b'A'..=b'Z'

        | b'a'..=b'z'

        | b'!'

        | b'#'

        | b'$'

        | b'%'

        | b'&'

        | b'\''

        | b'*'

        | b'+'

        | b'-'

        | b'.'

        | b'/'

        | b':'

        | b'<'

        | b'='

        | b'>'

        | b'?'

        | b'@'

        | b'\\'

        | b'^'

        | b'_'

        | b'`'

        | b'|'

        | b'~'

impl<'a> Lexer<'a> {

    /// Creates a new lexer which will lex the `input` source string.

    pub fn new(input: &str) -> Lexer<'_> {

        Lexer {

            input,

            allow_confusing_unicode: false,

    /// Returns the original source input that we're lexing.

    pub fn input(&self) -> &'a str {

        self.input

    /// Configures whether "confusing" unicode characters are allowed while

    /// lexing.

///

    /// If allowed then no error will happen if these characters are found, but

    /// otherwise if disallowed a lex error will be produced when these

    /// characters are found. Confusing characters are denied by default.

///

    /// For now "confusing characters" are primarily related to the "trojan

    /// source" problem where it refers to characters which cause humans to read

    /// text differently than this lexer, such as characters that alter the

    /// left-to-right display of the source code.

    pub fn allow_confusing_unicode(&mut self, allow: bool) -> &mut Self {

        self.allow_confusing_unicode = allow;

        self

    /// Lexes the next at the byte position `pos` in the input.

///

    /// Returns `Some` if a token is found or `None` if we're at EOF.

///

    /// The `pos` argument will be updated to point to the next token on a

    /// successful parse.

///

    /// # Errors

///

    /// Returns an error if the input is malformed.

    pub fn parse(&self, pos: &mut usize) -> Result<Option<Token>, Error> {

        let offset = *pos;

        Ok(match self.parse_kind(pos)? {

            Some(kind) => Some(Token {

                kind,

                offset,

                len: (*pos - offset).try_into().unwrap(),

}),

            None => None,

})

    fn parse_kind(&self, pos: &mut usize) -> Result<Option<TokenKind>, Error> {

        let start = *pos;

        // This `match` generally parses the grammar specified at

//

        // https://webassembly.github.io/spec/core/text/lexical.html#text-token

        let remaining = &self.input.as_bytes()[start..];

        let byte = match remaining.first() {

            Some(b) => b,

            None => return Ok(None),

};

        match byte {

            // Open-parens check the next character to see if this is the start

            // of a block comment, otherwise it's just a bland left-paren

            // token.

            b'(' => match remaining.get(1) {

                Some(b';') => {

                    let mut level = 1;

                    // Note that we're doing a byte-level search here for the

                    // close-delimiter of `;)`. The actual source text is utf-8

                    // encode in `remaining` but due to how utf-8 works we

                    // can safely search for an ASCII byte since it'll never

                    // otherwise appear in the middle of a codepoint and if we

                    // find it then it's guaranteed to be the right byte.

//

                    // Mainly we're avoiding the overhead of decoding utf-8

                    // characters into a Rust `char` since it's otherwise

                    // unnecessary work.

                    let mut iter = remaining[2..].iter();

                    while let Some(ch) = iter.next() {

                        match ch {

                            b'(' => {

                                if let Some(b';') = iter.as_slice().first() {

                                    level += 1;

                                    iter.next();

                            b';' => {

                                if let Some(b')') = iter.as_slice().first() {

                                    level -= 1;

                                    iter.next();

                                    if level == 0 {

                                        let len = remaining.len() - iter.as_slice().len();

                                        let comment = &self.input[start..][..len];

                                        *pos += len;

                                        self.check_confusing_comment(*pos, comment)?;

                                        return Ok(Some(TokenKind::BlockComment));

                            _ => {}

                    Err(self.error(start, LexError::DanglingBlockComment))

                _ => {

                    *pos += 1;

                    Ok(Some(TokenKind::LParen))

},

            b')' => {

                *pos += 1;

                Ok(Some(TokenKind::RParen))

            // https://webassembly.github.io/spec/core/text/lexical.html#white-space

            b' ' | b'\n' | b'\r' | b'\t' => {

                self.skip_ws(pos);

                Ok(Some(TokenKind::Whitespace))

            c @ (idchars!() | b'"') => {

                let (kind, src) = self.parse_reserved(pos)?;

                match kind {

                    // If the reserved token was simply a single string then

                    // that is converted to a standalone string token

                    ReservedKind::String => return Ok(Some(TokenKind::String)),

                    // If only idchars were consumed then this could be a

                    // specific kind of standalone token we're interested in.

                    ReservedKind::Idchars => {

                        // https://webassembly.github.io/spec/core/text/values.html#integers

                        if let Some(ret) = self.classify_number(src) {

                            return Ok(Some(ret));

                        // https://webassembly.github.io/spec/core/text/values.html#text-id

                        } else if *c == b'$' {

                            return Ok(Some(TokenKind::Id));

                        // part of the WebAssembly/annotations proposal

                        // (no online url yet)

                        } else if *c == b'@' {

                            return Ok(Some(TokenKind::Annotation));

                        // https://webassembly.github.io/spec/core/text/lexical.html#text-keyword

                        } else if b'a' <= *c && *c <= b'z' {

                            return Ok(Some(TokenKind::Keyword));

                    ReservedKind::IdString => return Ok(Some(TokenKind::Id)),

                    ReservedKind::AnnotationString => return Ok(Some(TokenKind::Annotation)),

                    // ... otherwise this was a conglomeration of idchars,

                    // strings, or just idchars that don't match a prior rule,

                    // meaning this falls through to the fallback `Reserved`

                    // token.

                    ReservedKind::Reserved => {}

                Ok(Some(TokenKind::Reserved))

            // This could be a line comment, otherwise `;` is a reserved token.

            // The second byte is checked to see if it's a `;;` line comment

//

            // Note that this character being considered as part of a

            // `reserved` token is part of the annotations proposal.

            b';' => match remaining.get(1) {

                Some(b';') => {

                    let remaining = &self.input[*pos..];

                    let byte_pos = memchr::memchr2(b'\n', b'\r', remaining.as_bytes())

                        .unwrap_or(remaining.len());

                    *pos += byte_pos;

                    let comment = &remaining[..byte_pos];

                    self.check_confusing_comment(*pos, comment)?;

                    Ok(Some(TokenKind::LineComment))

                _ => {

                    *pos += 1;

                    Ok(Some(TokenKind::Reserved))

},

            // Other known reserved tokens other than `;`

//

            // Note that these characters being considered as part of a

            // `reserved` token is part of the annotations proposal.

            b',' | b'[' | b']' | b'{' | b'}' => {

                *pos += 1;

                Ok(Some(TokenKind::Reserved))

            _ => {

                let ch = self.input[start..].chars().next().unwrap();

                Err(self.error(*pos, LexError::Unexpected(ch)))

    fn skip_ws(&self, pos: &mut usize) {

        // This table is a byte lookup table to determine whether a byte is a

        // whitespace byte. There are only 4 whitespace bytes for the `*.wat`

        // format right now which are ' ', '\t', '\r', and '\n'. These 4 bytes

        // have a '1' in the table below.

//

        // Due to how utf-8 works (our input is guaranteed to be utf-8) it is

        // known that if these bytes are found they're guaranteed to be the

        // whitespace byte, so they can be safely skipped and we don't have to

        // do full utf-8 decoding. This means that the goal of this function is

        // to find the first non-whitespace byte in `remaining`.

//

        // For now this lookup table seems to be the fastest, but projects like

        // https://github.com/lemire/despacer show other simd algorithms which

        // can possibly accelerate this even more. Note that `*.wat` files often

        // have a lot of whitespace so this function is typically quite hot when

        // parsing inputs.

        #[rustfmt::skip]

        const WS: [u8; 256] = [

            //                                   \t \n       \r

            /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,

            /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

            //        ' '

            /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

            /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

            /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

            /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

            /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

            /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

            /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

            /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

            /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

            /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

            /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

            /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

            /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

            /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

];

        let remaining = &self.input[*pos..];

        let non_ws_pos = remaining

            .as_bytes()

            .iter()

            .position(|b| WS[*b as usize] != 1)

            .unwrap_or(remaining.len());

        *pos += non_ws_pos;

    /// Splits off a "reserved" token which is then further processed later on

    /// to figure out which kind of token it is `depending on `ReservedKind`.

///

    /// For more information on this method see the clarification at

    /// <https://github.com/WebAssembly/spec/pull/1499> but the general gist is

    /// that this is parsing the grammar:

///

    /// ```text

    /// reserved := (idchar | string)+

    /// ```

///

    /// which means that it is eating any number of adjacent string/idchar

    /// tokens (e.g. `a"b"c`) and returning the classification of what was

    /// eaten. The classification assists in determining what the actual token

    /// here eaten looks like.

    fn parse_reserved(&self, pos: &mut usize) -> Result<(ReservedKind, &'a str), Error> {

        let mut idchars = 0u32;

        let mut strings = 0u32;

        let start = *pos;

        while let Some(byte) = self.input.as_bytes().get(*pos) {

            match byte {

                // Normal `idchars` production which appends to the reserved

                // token that's being produced.

                idchars!() => {

                    idchars += 1;

                    *pos += 1;

                // https://webassembly.github.io/spec/core/text/values.html#text-string

                b'"' => {

                    strings += 1;

                    *pos += 1;

                    let mut it = self.input[*pos..].chars();

                    let result = Lexer::parse_str(&mut it, self.allow_confusing_unicode);

                    *pos = self.input.len() - it.as_str().len();

                    match result {

                        Ok(_) => {}

                        Err(e) => {

                            let err_pos = match &e {

                                LexError::UnexpectedEof => self.input.len(),

                                _ => self.input[..*pos].char_indices().next_back().unwrap().0,

};

                            return Err(self.error(err_pos, e));

                // Nothing else is considered part of a reserved token

                _ => break,

        let ret = &self.input[start..*pos];

        Ok(match (idchars, strings) {

            (0, 0) => unreachable!(),

            (0, 1) => (ReservedKind::String, ret),

            (_, 0) => (ReservedKind::Idchars, ret),

            // Pattern match `@"..."` and `$"..."` for string-based

            // identifiers and annotations.

            (1, 1) if ret.starts_with("$") => (ReservedKind::IdString, ret),

            (1, 1) if ret.starts_with("@") => (ReservedKind::AnnotationString, ret),

            _ => (ReservedKind::Reserved, ret),

})

    fn classify_number(&self, src: &str) -> Option<TokenKind> {

        let (sign, num) = if let Some(stripped) = src.strip_prefix('+') {

            (Some(SignToken::Plus), stripped)

        } else if let Some(stripped) = src.strip_prefix('-') {

            (Some(SignToken::Minus), stripped)

        } else {

            (None, src)

};

        let negative = sign == Some(SignToken::Minus);

        // Handle `inf` and `nan` which are special numbers here

        if num == "inf" {

            return Some(TokenKind::Float(FloatKind::Inf { negative }));

        } else if num == "nan" {

            return Some(TokenKind::Float(FloatKind::Nan { negative }));

        } else if let Some(stripped) = num.strip_prefix("nan:0x") {

            let mut it = stripped.as_bytes().iter();

            let has_underscores = skip_underscores(&mut it, |x| char::from(x).is_ascii_hexdigit())?;

            if it.next().is_some() {

                return None;

            return Some(TokenKind::Float(FloatKind::NanVal {

                negative,

                has_underscores,

            }));

        // Figure out if we're a hex number or not

        let test_valid: fn(u8) -> bool;

        let (mut it, hex) = if let Some(stripped) = num.strip_prefix("0x") {

            test_valid = |x: u8| char::from(x).is_ascii_hexdigit();

            (stripped.as_bytes().iter(), true)

        } else {

            test_valid = |x: u8| char::from(x).is_ascii_digit();

            (num.as_bytes().iter(), false)

};

        // Evaluate the first part, moving out all underscores

        let mut has_underscores = skip_underscores(&mut it, test_valid)?;

        match it.clone().next() {

            // If we're followed by something this may be a float so keep going.

            Some(_) => {}

            // Otherwise this is a valid integer literal!

            None => {

                return Some(TokenKind::Integer(IntegerKind {

                    has_underscores,

                    sign,

                    hex,

}))

        // A number can optionally be after the dot so only actually try to

        // parse one if it's there.

        if it.clone().next() == Some(&b'.') {

            it.next();

            match it.clone().next() {

                Some(c) if test_valid(*c) => {

                    if skip_underscores(&mut it, test_valid)? {

                        has_underscores = true;

                Some(_) | None => {}

};

        // Figure out if there's an exponential part here to make a float, and

        // if so parse it but defer its actual calculation until later.

        match (hex, it.next()) {

            (true, Some(b'p')) | (true, Some(b'P')) | (false, Some(b'e')) | (false, Some(b'E')) => {

                match it.clone().next() {

                    Some(b'-') => {

                        it.next();

                    Some(b'+') => {

                        it.next();

                    _ => {}

                if skip_underscores(&mut it, |x| char::from(x).is_ascii_digit())? {

                    has_underscores = true;

            (_, None) => {}

            _ => return None,

        // We should have eaten everything by now, if not then this is surely

        // not a float or integer literal.

        if it.next().is_some() {

            return None;

        return Some(TokenKind::Float(FloatKind::Normal {

            has_underscores,

            hex,

        }));

        fn skip_underscores<'a>(

            it: &mut slice::Iter<'_, u8>,

            good: fn(u8) -> bool,

        ) -> Option<bool> {

            let mut last_underscore = false;

            let mut has_underscores = false;

            let first = *it.next()?;

            if !good(first) {

                return None;

            while let Some(c) = it.clone().next() {

                if *c == b'_' && !last_underscore {

                    has_underscores = true;

                    it.next();

                    last_underscore = true;

                    continue;

                if !good(*c) {

                    break;

                last_underscore = false;

                it.next();

            if last_underscore {

                return None;

            Some(has_underscores)

    /// Verifies that `comment`, which is about to be returned, has a "confusing

    /// unicode character" in it and should instead be transformed into an

    /// error.

    fn check_confusing_comment(&self, end: usize, comment: &str) -> Result<(), Error> {

        if self.allow_confusing_unicode {

            return Ok(());

        // In an effort to avoid utf-8 decoding the entire `comment` the search

        // here is a bit more optimized. This checks for the `0xe2` byte because

        // in the utf-8 encoding that's the leading encoding byte for all

        // "confusing characters". Each instance of 0xe2 is checked to see if it

        // starts a confusing character, and if so that's returned.

//

        // Also note that 0xe2 will never be found in the middle of a codepoint,

        // it's always the start of a codepoint. This means that if our special

        // characters show up they're guaranteed to start with 0xe2 bytes.

        let bytes = comment.as_bytes();

        for pos in memchr::Memchr::new(0xe2, bytes) {

            if let Some(c) = comment[pos..].chars().next() {

                if is_confusing_unicode(c) {

                    // Note that `self.cur()` accounts for already having

                    // parsed `comment`, so we move backwards to where

                    // `comment` started and then add the index within

                    // `comment`.

                    let pos = end - comment.len() + pos;

                    return Err(self.error(pos, LexError::ConfusingUnicode(c)));

        Ok(())

    fn parse_str(

        it: &mut str::Chars<'a>,

        allow_confusing_unicode: bool,

    ) -> Result<Cow<'a, [u8]>, LexError> {

        enum State {

            Start,

            String(Vec<u8>),

        let orig = it.as_str();

        let mut state = State::Start;

        loop {

            match it.next().ok_or(LexError::UnexpectedEof)? {

                '"' => break,

                '\\' => {

                    match state {

                        State::String(_) => {}

                        State::Start => {

                            let pos = orig.len() - it.as_str().len() - 1;

                            state = State::String(orig[..pos].as_bytes().to_vec());

                    let buf = match &mut state {

                        State::String(b) => b,

                        State::Start => unreachable!(),

};

                    match it.next().ok_or(LexError::UnexpectedEof)? {

                        '"' => buf.push(b'"'),

                        '\'' => buf.push(b'\''),

                        't' => buf.push(b'\t'),

                        'n' => buf.push(b'\n'),

                        'r' => buf.push(b'\r'),

                        '\\' => buf.push(b'\\'),

                        'u' => {

                            Lexer::must_eat_char(it, '{')?;

                            let n = Lexer::hexnum(it)?;

                            let c = char::from_u32(n).ok_or(LexError::InvalidUnicodeValue(n))?;

                            buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes());

                            Lexer::must_eat_char(it, '}')?;

                        c1 if c1.is_ascii_hexdigit() => {

                            let c2 = Lexer::hexdigit(it)?;

                            buf.push(to_hex(c1) * 16 + c2);

                        c => return Err(LexError::InvalidStringEscape(c)),

                c if (c as u32) < 0x20 || c as u32 == 0x7f => {

                    return Err(LexError::InvalidStringElement(c))

                c if !allow_confusing_unicode && is_confusing_unicode(c) => {

                    return Err(LexError::ConfusingUnicode(c))

                c => match &mut state {

                    State::Start => {}

                    State::String(v) => {

                        v.extend(c.encode_utf8(&mut [0; 4]).as_bytes());

},

        match state {

            State::Start => Ok(orig[..orig.len() - it.as_str().len() - 1].as_bytes().into()),

            State::String(s) => Ok(s.into()),

    /// Parses an id-or-string-based name from `it`.

///

    /// Note that `it` should already have been lexed and this is just

    /// extracting the value. If the token lexed was `@a` then this should point

    /// to `a`.

///

    /// This will automatically detect quoted syntax such as `@"..."` and the

    /// byte string will be parsed and validated as utf-8.

///

    /// # Errors

///

    /// Returns an error if a quoted byte string is found and contains invalid

    /// utf-8.

    fn parse_name(it: &mut str::Chars<'a>) -> Result<Cow<'a, str>, LexError> {

        if it.clone().next() == Some('"') {

            it.next();

            match Lexer::parse_str(it, true)? {

                Cow::Borrowed(bytes) => match std::str::from_utf8(bytes) {

                    Ok(s) => Ok(Cow::Borrowed(s)),

                    Err(e) => Err(LexError::InvalidUtf8Id(e)),

},

                Cow::Owned(bytes) => match String::from_utf8(bytes) {

                    Ok(s) => Ok(Cow::Owned(s)),

                    Err(e) => Err(LexError::InvalidUtf8Id(e.utf8_error())),

},

        } else {

            Ok(Cow::Borrowed(it.as_str()))

    fn hexnum(it: &mut str::Chars<'_>) -> Result<u32, LexError> {

        let n = Lexer::hexdigit(it)?;

        let mut last_underscore = false;

        let mut n = n as u32;

        while let Some(c) = it.clone().next() {

            if c == '_' {

                it.next();

                last_underscore = true;

                continue;

            if !c.is_ascii_hexdigit() {

                break;

            last_underscore = false;

            it.next();

            n = n

                .checked_mul(16)

                .and_then(|n| n.checked_add(to_hex(c) as u32))

                .ok_or(LexError::NumberTooBig)?;

        if last_underscore {

            return Err(LexError::LoneUnderscore);

        Ok(n)

    /// Reads a hexidecimal digit from the input stream, returning where it's

    /// defined and the hex value. Returns an error on EOF or an invalid hex

    /// digit.

    fn hexdigit(it: &mut str::Chars<'_>) -> Result<u8, LexError> {

        let ch = Lexer::must_char(it)?;

        if ch.is_ascii_hexdigit() {

            Ok(to_hex(ch))

        } else {

            Err(LexError::InvalidHexDigit(ch))

    /// Reads the next character from the input string and where it's located,

    /// returning an error if the input stream is empty.

    fn must_char(it: &mut str::Chars<'_>) -> Result<char, LexError> {

        it.next().ok_or(LexError::UnexpectedEof)

    /// Expects that a specific character must be read next

    fn must_eat_char(it: &mut str::Chars<'_>, wanted: char) -> Result<(), LexError> {

        let found = Lexer::must_char(it)?;

        if wanted == found {

            Ok(())

        } else {

            Err(LexError::Expected { wanted, found })

    /// Creates an error at `pos` with the specified `kind`

    fn error(&self, pos: usize, kind: LexError) -> Error {

        Error::lex(Span { offset: pos }, self.input, kind)

    /// Returns an iterator over all tokens in the original source string

    /// starting at the `pos` specified.

    pub fn iter(&self, mut pos: usize) -> impl Iterator<Item = Result<Token, Error>> + '_ {

        std::iter::from_fn(move || self.parse(&mut pos).transpose())

    /// Returns whether an annotation is present at `pos`. If it is present then

    /// `Ok(Some(token))` is returned corresponding to the token, otherwise

    /// `Ok(None)` is returned. If the next token cannot be parsed then an error

    /// is returned.

    pub fn annotation(&self, mut pos: usize) -> Result<Option<Token>, Error> {

        let bytes = self.input.as_bytes();

        // Quickly reject anything that for sure isn't an annotation since this

        // method is used every time an lparen is parsed.

        if bytes.get(pos) != Some(&b'@') {

            return Ok(None);

        match self.parse(&mut pos)? {

            Some(token) => match token.kind {

                TokenKind::Annotation => Ok(Some(token)),

                _ => Ok(None),

},

            None => Ok(None),

impl Token {

    /// Returns the original source text for this token.

    pub fn src<'a>(&self, s: &'a str) -> &'a str {

        &s[self.offset..][..self.len.try_into().unwrap()]

    /// Returns the identifier, without the leading `$` symbol, that this token

    /// represents.

///

    /// Note that this method returns the contents of the identifier. With a

    /// string-based identifier this means that escapes have been resolved to

    /// their string-based equivalent.

///

    /// Should only be used with `TokenKind::Id`.

///

    /// # Errors

///

    /// Returns an error if this is a string-based identifier (e.g. `$"..."`)

    /// which is invalid utf-8.

    pub fn id<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {

        let mut ch = self.src(s).chars();

        let dollar = ch.next();

        debug_assert_eq!(dollar, Some('$'));

        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;

        if id.is_empty() {

            return Err(self.error(s, LexError::EmptyId));

        Ok(id)

    /// Returns the annotation, without the leading `@` symbol, that this token

    /// represents.

///

    /// Note that this method returns the contents of the identifier. With a

    /// string-based identifier this means that escapes have been resolved to

    /// their string-based equivalent.

///

    /// Should only be used with `TokenKind::Annotation`.

///

    /// # Errors

///

    /// Returns an error if this is a string-based identifier (e.g. `$"..."`)

    /// which is invalid utf-8.

    pub fn annotation<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {

        let mut ch = self.src(s).chars();

        let at = ch.next();

        debug_assert_eq!(at, Some('@'));

        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;

        if id.is_empty() {

            return Err(self.error(s, LexError::EmptyAnnotation));

        Ok(id)

    /// Returns the keyword this token represents.

///

    /// Should only be used with [`TokenKind::Keyword`].

    pub fn keyword<'a>(&self, s: &'a str) -> &'a str {

        self.src(s)

    /// Returns the reserved string this token represents.

///

    /// Should only be used with [`TokenKind::Reserved`].

    pub fn reserved<'a>(&self, s: &'a str) -> &'a str {

        self.src(s)

    /// Returns the parsed string that this token represents.

///

    /// This returns either a raw byte slice into the source if that's possible

    /// or an owned representation to handle escaped characters and such.

///

    /// Should only be used with [`TokenKind::String`].

    pub fn string<'a>(&self, s: &'a str) -> Cow<'a, [u8]> {

        let mut ch = self.src(s).chars();

        ch.next().unwrap();

        Lexer::parse_str(&mut ch, true).unwrap()

    /// Returns the decomposed float token that this represents.

///

    /// This will slice up the float token into its component parts and return a

    /// description of the float token in the source.

///

    /// Should only be used with [`TokenKind::Float`].

    pub fn float<'a>(&self, s: &'a str, kind: FloatKind) -> Float<'a> {

        match kind {

            FloatKind::Inf { negative } => Float::Inf { negative },

            FloatKind::Nan { negative } => Float::Nan {

                val: None,

                negative,

},

            FloatKind::NanVal {

                negative,

                has_underscores,

            } => {

                let src = self.src(s);

                let src = if src.starts_with("n") { src } else { &src[1..] };

                let mut val = Cow::Borrowed(src.strip_prefix("nan:0x").unwrap());

                if has_underscores {

                    *val.to_mut() = val.replace("_", "");

                Float::Nan {

                    val: Some(val),

                    negative,

            FloatKind::Normal {

                has_underscores,

                hex,

            } => {

                let src = self.src(s);

                let (integral, fractional, exponent) = match src.find('.') {

                    Some(i) => {

                        let integral = &src[..i];

                        let rest = &src[i + 1..];

                        let exponent = if hex {

                            rest.find('p').or_else(|| rest.find('P'))

                        } else {

                            rest.find('e').or_else(|| rest.find('E'))

};

                        match exponent {

                            Some(i) => (integral, Some(&rest[..i]), Some(&rest[i + 1..])),

                            None => (integral, Some(rest), None),

                    None => {

                        let exponent = if hex {

                            src.find('p').or_else(|| src.find('P'))

                        } else {

                            src.find('e').or_else(|| src.find('E'))

};

                        match exponent {

                            Some(i) => (&src[..i], None, Some(&src[i + 1..])),

                            None => (src, None, None),

};

                let mut integral = Cow::Borrowed(integral.strip_prefix('+').unwrap_or(integral));

                let mut fractional = fractional.and_then(|s| {

                    if s.is_empty() {

                        None

                    } else {

                        Some(Cow::Borrowed(s))

});

                let mut exponent =

                    exponent.map(|s| Cow::Borrowed(s.strip_prefix('+').unwrap_or(s)));

                if has_underscores {

                    *integral.to_mut() = integral.replace("_", "");

                    if let Some(fractional) = &mut fractional {

                        *fractional.to_mut() = fractional.replace("_", "");

                    if let Some(exponent) = &mut exponent {

                        *exponent.to_mut() = exponent.replace("_", "");

                if hex {

                    *integral.to_mut() = integral.replace("0x", "");

                Float::Val {

                    hex,

                    integral,

                    fractional,

                    exponent,

    /// Returns the decomposed integer token that this represents.

///

    /// This will slice up the integer token into its component parts and

    /// return a description of the integer token in the source.

///

    /// Should only be used with [`TokenKind::Integer`].

    pub fn integer<'a>(&self, s: &'a str, kind: IntegerKind) -> Integer<'a> {

        let src = self.src(s);

        let val = match kind.sign {

            Some(SignToken::Plus) => src.strip_prefix('+').unwrap(),

            Some(SignToken::Minus) => src,

            None => src,

};

        let mut val = Cow::Borrowed(val);

        if kind.has_underscores {

            *val.to_mut() = val.replace("_", "");

        if kind.hex {

            *val.to_mut() = val.replace("0x", "");

        Integer {

            sign: kind.sign,

            hex: kind.hex,

            val,

    fn error(&self, src: &str, err: LexError) -> Error {

        Error::lex(

            Span {

                offset: self.offset,

},

            src,

            err,

impl<'a> Integer<'a> {

    /// Returns the sign token for this integer.

    pub fn sign(&self) -> Option<SignToken> {

        self.sign

    /// Returns the value string that can be parsed for this integer, as well

    /// as the base that it should be parsed in

    pub fn val(&self) -> (&str, u32) {

        (&self.val, if self.hex { 16 } else { 10 })

fn to_hex(c: char) -> u8 {

    match c {

        'a'..='f' => c as u8 - b'a' + 10,

        'A'..='F' => c as u8 - b'A' + 10,

        _ => c as u8 - b'0',

impl fmt::Display for LexError {

    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {

        use LexError::*;

        match self {

            DanglingBlockComment => f.write_str("unterminated block comment")?,

            Unexpected(c) => write!(f, "unexpected character '{}'", escape_char(*c))?,

            InvalidStringElement(c) => {

                write!(f, "invalid character in string '{}'", escape_char(*c))?

            InvalidStringEscape(c) => write!(f, "invalid string escape '{}'", escape_char(*c))?,

            InvalidHexDigit(c) => write!(f, "invalid hex digit '{}'", escape_char(*c))?,

            InvalidDigit(c) => write!(f, "invalid decimal digit '{}'", escape_char(*c))?,

            Expected { wanted, found } => write!(

f,

                "expected '{}' but found '{}'",

                escape_char(*wanted),

                escape_char(*found)

)?,

            UnexpectedEof => write!(f, "unexpected end-of-file")?,

            NumberTooBig => f.write_str("number is too big to parse")?,

            InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x{:x}", c)?,

            LoneUnderscore => write!(f, "bare underscore in numeric literal")?,

            ConfusingUnicode(c) => write!(f, "likely-confusing unicode character found {:?}", c)?,

            InvalidUtf8Id(_) => write!(f, "malformed UTF-8 encoding of string-based id")?,

            EmptyId => write!(f, "empty identifier")?,

            EmptyAnnotation => write!(f, "empty annotation id")?,

        Ok(())

fn escape_char(c: char) -> String {

    match c {

        '\t' => String::from("\\t"),

        '\r' => String::from("\\r"),

        '\n' => String::from("\\n"),

        '\\' => String::from("\\\\"),

        '\'' => String::from("\\\'"),

        '\"' => String::from("\""),

        '\x20'..='\x7e' => String::from(c),

        _ => c.escape_unicode().to_string(),

/// This is an attempt to protect agains the "trojan source" [1] problem where

/// unicode characters can cause editors to render source code differently

/// for humans than the compiler itself sees.

///

/// To mitigate this issue, and because it's relatively rare in practice,

/// this simply rejects characters of that form.

///

/// [1]: https://www.trojansource.codes/

fn is_confusing_unicode(ch: char) -> bool {

    matches!(

ch,

        '\u{202a}'

            | '\u{202b}'

            | '\u{202d}'

            | '\u{202e}'

            | '\u{2066}'

            | '\u{2067}'

            | '\u{2068}'

            | '\u{206c}'

            | '\u{2069}'

#[cfg(test)]

mod tests {

    use super::*;

    #[test]

    fn ws_smoke() {

        fn get_whitespace(input: &str) -> &str {

            let token = get_token(input);

            match token.kind {

                TokenKind::Whitespace => token.src(input),

                other => panic!("unexpected {:?}", other),

        assert_eq!(get_whitespace(" "), " ");

        assert_eq!(get_whitespace("  "), "  ");

        assert_eq!(get_whitespace("  \n "), "  \n ");

        assert_eq!(get_whitespace("  x"), "  ");

        assert_eq!(get_whitespace("  ;"), "  ");

    #[test]

    fn line_comment_smoke() {

        fn get_line_comment(input: &str) -> &str {

            let token = get_token(input);

            match token.kind {

                TokenKind::LineComment => token.src(input),

                other => panic!("unexpected {:?}", other),

        assert_eq!(get_line_comment(";;"), ";;");

        assert_eq!(get_line_comment(";; xyz"), ";; xyz");

        assert_eq!(get_line_comment(";; xyz\nabc"), ";; xyz");

        assert_eq!(get_line_comment(";;\nabc"), ";;");

        assert_eq!(get_line_comment(";;   \nabc"), ";;   ");

        assert_eq!(get_line_comment(";;   \rabc"), ";;   ");

        assert_eq!(get_line_comment(";;   \r\nabc"), ";;   ");

    #[test]

    fn block_comment_smoke() {

        fn get_block_comment(input: &str) -> &str {

            let token = get_token(input);

            match token.kind {

                TokenKind::BlockComment => token.src(input),

                other => panic!("unexpected {:?}", other),

        assert_eq!(get_block_comment("(;;)"), "(;;)");

        assert_eq!(get_block_comment("(; ;)"), "(; ;)");

        assert_eq!(get_block_comment("(; (;;) ;)"), "(; (;;) ;)");

    fn get_token(input: &str) -> Token {

        Lexer::new(input)

            .parse(&mut 0)

            .expect("no first token")

            .expect("no token")

    #[test]

    fn lparen() {

        assert_eq!(get_token("((").kind, TokenKind::LParen);

    #[test]

    fn rparen() {

        assert_eq!(get_token(")(").kind, TokenKind::RParen);

    #[test]

    fn strings() {

        fn get_string(input: &str) -> Vec<u8> {

            let token = get_token(input);

            match token.kind {

                TokenKind::String => token.string(input).to_vec(),

                other => panic!("not keyword {:?}", other),

        assert_eq!(&*get_string("\"\""), b"");

        assert_eq!(&*get_string("\"a\""), b"a");

        assert_eq!(&*get_string("\"a b c d\""), b"a b c d");

        assert_eq!(&*get_string("\"\\\"\""), b"\"");

        assert_eq!(&*get_string("\"\\'\""), b"'");

        assert_eq!(&*get_string("\"\\n\""), b"\n");

        assert_eq!(&*get_string("\"\\t\""), b"\t");

        assert_eq!(&*get_string("\"\\r\""), b"\r");

        assert_eq!(&*get_string("\"\\\\\""), b"\\");

        assert_eq!(&*get_string("\"\\01\""), &[1]);

        assert_eq!(&*get_string("\"\\u{1}\""), &[1]);

        assert_eq!(

            &*get_string("\"\\u{0f3}\""),

            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()

);

        assert_eq!(

            &*get_string("\"\\u{0_f_3}\""),

            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()

);

        for i in 0..=255i32 {

            let s = format!("\"\\{:02x}\"", i);

            assert_eq!(&*get_string(&s), &[i as u8]);

    #[test]

    fn id() {

        fn get_id(input: &str) -> String {

            let token = get_token(input);

            match token.kind {

                TokenKind::Id => token.id(input).unwrap().to_string(),

                other => panic!("not id {:?}", other),

        assert_eq!(get_id("$x"), "x");

        assert_eq!(get_id("$xyz"), "xyz");

        assert_eq!(get_id("$x_z"), "x_z");

        assert_eq!(get_id("$0^"), "0^");

        assert_eq!(get_id("$0^;;"), "0^");

        assert_eq!(get_id("$0^ ;;"), "0^");

        assert_eq!(get_id("$\"x\" ;;"), "x");

    #[test]

    fn annotation() {

        fn get_annotation(input: &str) -> String {

            let token = get_token(input);

            match token.kind {

                TokenKind::Annotation => token.annotation(input).unwrap().to_string(),

                other => panic!("not annotation {:?}", other),

        assert_eq!(get_annotation("@foo"), "foo");

        assert_eq!(get_annotation("@foo "), "foo");

        assert_eq!(get_annotation("@f "), "f");

        assert_eq!(get_annotation("@\"x\" "), "x");

        assert_eq!(get_annotation("@0 "), "0");

    #[test]

    fn keyword() {

        fn get_keyword(input: &str) -> &str {

            let token = get_token(input);

            match token.kind {

                TokenKind::Keyword => token.keyword(input),

                other => panic!("not keyword {:?}", other),

        assert_eq!(get_keyword("x"), "x");

        assert_eq!(get_keyword("xyz"), "xyz");

        assert_eq!(get_keyword("x_z"), "x_z");

        assert_eq!(get_keyword("x_z "), "x_z");

        assert_eq!(get_keyword("x_z "), "x_z");

    #[test]

    fn reserved() {

        fn get_reserved(input: &str) -> &str {

            let token = get_token(input);

            match token.kind {

                TokenKind::Reserved => token.reserved(input),

                other => panic!("not reserved {:?}", other),

        assert_eq!(get_reserved("^_x "), "^_x");

    #[test]

    fn integer() {

        fn get_integer(input: &str) -> String {

            let token = get_token(input);

            match token.kind {

                TokenKind::Integer(i) => token.integer(input, i).val.to_string(),

                other => panic!("not integer {:?}", other),

        assert_eq!(get_integer("1"), "1");

        assert_eq!(get_integer("0"), "0");

        assert_eq!(get_integer("-1"), "-1");

        assert_eq!(get_integer("+1"), "1");

        assert_eq!(get_integer("+1_000"), "1000");

        assert_eq!(get_integer("+1_0_0_0"), "1000");

        assert_eq!(get_integer("+0x10"), "10");

        assert_eq!(get_integer("-0x10"), "-10");

        assert_eq!(get_integer("0x10"), "10");

    #[test]

    fn float() {

        fn get_float(input: &str) -> Float<'_> {

            let token = get_token(input);

            match token.kind {

                TokenKind::Float(f) => token.float(input, f),

                other => panic!("not float {:?}", other),

        assert_eq!(

            get_float("nan"),

            Float::Nan {

                val: None,

                negative: false

},

);

        assert_eq!(

            get_float("-nan"),

            Float::Nan {

                val: None,

                negative: true,

},

);

        assert_eq!(

            get_float("+nan"),

            Float::Nan {

                val: None,

                negative: false,

},

);

        assert_eq!(

            get_float("+nan:0x1"),

            Float::Nan {

                val: Some("1".into()),

                negative: false,

},

);

        assert_eq!(

            get_float("nan:0x7f_ffff"),

            Float::Nan {

                val: Some("7fffff".into()),

                negative: false,

},

);

        assert_eq!(get_float("inf"), Float::Inf { negative: false });

        assert_eq!(get_float("-inf"), Float::Inf { negative: true });

        assert_eq!(get_float("+inf"), Float::Inf { negative: false });

        assert_eq!(

            get_float("1.2"),

            Float::Val {

                integral: "1".into(),

                fractional: Some("2".into()),

                exponent: None,

                hex: false,

},

);

        assert_eq!(

            get_float("1.2e3"),

            Float::Val {

                integral: "1".into(),

                fractional: Some("2".into()),

                exponent: Some("3".into()),

                hex: false,

},

);

        assert_eq!(

            get_float("-1_2.1_1E+0_1"),

            Float::Val {

                integral: "-12".into(),

                fractional: Some("11".into()),

                exponent: Some("01".into()),

                hex: false,

},

);

        assert_eq!(

            get_float("+1_2.1_1E-0_1"),

            Float::Val {

                integral: "12".into(),

                fractional: Some("11".into()),

                exponent: Some("-01".into()),

                hex: false,

},

);

        assert_eq!(

            get_float("0x1_2.3_4p5_6"),

            Float::Val {

                integral: "12".into(),

                fractional: Some("34".into()),

                exponent: Some("56".into()),

                hex: true,

},

);

        assert_eq!(

            get_float("+0x1_2.3_4P-5_6"),

            Float::Val {

                integral: "12".into(),

                fractional: Some("34".into()),

                exponent: Some("-56".into()),

                hex: true,

},

);

        assert_eq!(

            get_float("1."),

            Float::Val {

                integral: "1".into(),

                fractional: None,

                exponent: None,

                hex: false,

},

);

        assert_eq!(

            get_float("0x1p-24"),

            Float::Val {

                integral: "1".into(),

                fractional: None,

                exponent: Some("-24".into()),

                hex: true,

},

);