//! Definition of a lexer for the WebAssembly text format.
//!
//! This module provides a [`Lexer`][] type which is an iterator over the raw
//! tokens of a WebAssembly text file. A [`Lexer`][] accounts for every single
//! byte in a WebAssembly text file, returning tokens even for comments and
//! whitespace. Typically you'll ignore comments and whitespace, however.
//!
//! If you'd like to iterate over the tokens in a file you can do so via:
//!
//! ```
//! # fn foo() -> Result<(), wast::Error> {
//! use wast::lexer::Lexer;
//!
//! let wat = "(module (func $foo))";
//! for token in Lexer::new(wat).iter(0) {
//!     println!("{:?}", token?);
//! }
//! # Ok(())
//! # }
//! ```
//!
//! Note that you'll typically not use this module but will rather use
//! [`ParseBuffer`](crate::parser::ParseBuffer) instead.
//!
//! [`Lexer`]: crate::lexer::Lexer
use crate::token::Span;
use crate::Error;
use std::borrow::Cow;
use std::char;
use std::fmt;
use std::slice;
use std::str;
use std::str::Utf8Error;
/// A structure used to lex the s-expression syntax of WAT files.
///
/// This structure is used to generate [`Token`] items, which should account for
/// every single byte of the input as we iterate over it. A [`LexError`] is
/// returned for any non-lexable text.
#[derive(Clone)]
pub struct Lexer<'a> {
input: &'a str,
allow_confusing_unicode: bool,
}
/// A single token parsed from a `Lexer`.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct Token {
/// The kind of token this represents, such as whether it's whitespace, a
/// keyword, etc.
pub kind: TokenKind,
/// The byte offset within the original source for where this token came
/// from.
pub offset: usize,
/// The byte length of this token as it resides in the original source.
//
// NB: this is `u32` to enable packing `Token` into the size of two
// pointers. This does limit a single token to being at most 4G in size,
// but that seems acceptable.
pub len: u32,
}
#[test]
fn token_is_not_too_big() {
assert!(std::mem::size_of::<Token>() <= std::mem::size_of::<u64>() * 2);
}
/// Classification of what was parsed from the input stream.
///
/// This enumeration contains all kinds of fragments, including comments and
/// whitespace.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum TokenKind {
/// A line comment, preceded with `;;`
LineComment,
/// A block comment, surrounded by `(;` and `;)`. Note that these can be
/// nested.
BlockComment,
/// A fragment of source that represents whitespace.
Whitespace,
/// A left-parenthesis, including the source text for where it comes from.
LParen,
/// A right-parenthesis, including the source text for where it comes from.
RParen,
/// A string literal, which is actually a list of bytes.
String,
/// An identifier (like `$foo`).
///
/// All identifiers start with `$` and the payload here is the original
/// source text.
Id,
/// A keyword, or something that starts with an alphabetic character.
///
/// The payload here is the original source text.
Keyword,
/// An annotation (like `@foo`).
///
/// All annotations start with `@` and the payload will be the name of the
/// annotation.
Annotation,
/// A reserved series of `idchar` symbols. It's unknown what this is meant
/// to be used for; you'll probably generate an error about an unexpected
/// token.
Reserved,
/// An integer.
Integer(IntegerKind),
/// A float.
Float(FloatKind),
}
/// Description of the parsed integer from the source.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct IntegerKind {
sign: Option<SignToken>,
has_underscores: bool,
hex: bool,
}
/// Description of a parsed float from the source.
#[allow(missing_docs)]
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum FloatKind {
#[doc(hidden)]
Inf { negative: bool },
#[doc(hidden)]
Nan { negative: bool },
#[doc(hidden)]
NanVal {
negative: bool,
has_underscores: bool,
},
#[doc(hidden)]
Normal { has_underscores: bool, hex: bool },
}
enum ReservedKind {
/// "..."
String,
/// anything that's just a sequence of `idchars!()`
Idchars,
/// $"..."
IdString,
/// @"..."
AnnotationString,
/// everything else (a conglomeration of strings, idchars, etc)
Reserved,
}
/// Errors that can be generated while lexing.
///
/// All lexing errors have line/column/position information as well as a
/// `LexError` indicating what kind of error happened while lexing.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LexError {
/// A dangling block comment was found with an unbalanced `(;` which was
/// never terminated in the file.
DanglingBlockComment,
/// An unexpected character was encountered when generally parsing and
/// looking for something else.
Unexpected(char),
/// An invalid `char` in a string literal was found.
InvalidStringElement(char),
/// An invalid string escape letter was found (the thing after the `\` in
/// string literals)
InvalidStringEscape(char),
/// An invalid hexadecimal digit was found.
InvalidHexDigit(char),
/// An invalid base-10 digit was found.
InvalidDigit(char),
/// Parsing expected `wanted` but ended up finding `found` instead where the
/// two characters aren't the same.
Expected {
/// The character that was expected to be found
wanted: char,
/// The character that was actually found
found: char,
},
/// We needed to parse more but EOF (or end of the string) was encountered.
UnexpectedEof,
/// A number failed to parse because it was too big to fit within the target
/// type.
NumberTooBig,
/// An invalid unicode value was found in a `\u{...}` escape in a string,
/// only valid unicode scalars can be escaped that way.
InvalidUnicodeValue(u32),
/// A lone underscore was found when parsing a number, since underscores
/// should always be preceded and succeeded with a digit of some form.
LoneUnderscore,
/// A "confusing" unicode character is present in a comment or a string
/// literal, such as a character that changes the direction text is
/// typically displayed in editors. This could cause the human-read
/// version to behave differently than the compiler-visible version, so
/// these are simply rejected for now.
ConfusingUnicode(char),
/// An invalid utf-8 sequence was found in a quoted identifier, such as
/// `$"\ff"`.
InvalidUtf8Id(Utf8Error),
/// An empty identifier was found, or a lone `$`.
EmptyId,
/// An empty annotation was found, or a lone `@`.
EmptyAnnotation,
}
/// A sign token for an integer.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SignToken {
/// Plus sign: "+"
Plus,
/// Minus sign: "-"
Minus,
}
/// A fully parsed integer from a source string with a payload ready to parse
/// into an integral type.
#[derive(Debug, PartialEq)]
pub struct Integer<'a> {
sign: Option<SignToken>,
val: Cow<'a, str>,
hex: bool,
}
/// Possible parsed float values
#[derive(Debug, PartialEq, Eq)]
pub enum Float<'a> {
/// A float `NaN` representation
Nan {
/// The specific bits to encode for this float, optionally
val: Option<Cow<'a, str>>,
/// Whether or not this is a negative `NaN`.
negative: bool,
},
/// A float infinity representation,
Inf {
#[allow(missing_docs)]
negative: bool,
},
/// A parsed and separated floating point value
Val {
/// Whether or not the `integral` and `decimal` are specified in hex
hex: bool,
/// The float parts before the `.`
integral: Cow<'a, str>,
/// The float parts after the `.`
decimal: Option<Cow<'a, str>>,
/// The exponent to multiply this `integral.decimal` portion of the
/// float by. If `hex` is true this is `2^exponent` and otherwise it's
/// `10^exponent`
exponent: Option<Cow<'a, str>>,
},
}
macro_rules! idchars {
() => {
b'0'..=b'9'
| b'A'..=b'Z'
| b'a'..=b'z'
| b'!'
| b'#'
| b'$'
| b'%'
| b'&'
| b'\''
| b'*'
| b'+'
| b'-'
| b'.'
| b'/'
| b':'
| b'<'
| b'='
| b'>'
| b'?'
| b'@'
| b'\\'
| b'^'
| b'_'
| b'`'
| b'|'
| b'~'
}
}
impl<'a> Lexer<'a> {
/// Creates a new lexer which will lex the `input` source string.
pub fn new(input: &str) -> Lexer<'_> {
Lexer {
input,
allow_confusing_unicode: false,
}
}
/// Returns the original source input that we're lexing.
pub fn input(&self) -> &'a str {
self.input
}
/// Configures whether "confusing" unicode characters are allowed while
/// lexing.
///
/// If allowed then no error will happen if these characters are found, but
/// otherwise if disallowed a lex error will be produced when these
/// characters are found. Confusing characters are denied by default.
///
/// For now "confusing characters" are primarily related to the "trojan
/// source" problem where it refers to characters which cause humans to read
/// text differently than this lexer, such as characters that alter the
/// left-to-right display of the source code.
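///
/// For example, a minimal sketch of opting into confusing unicode:
///
/// ```
/// use wast::lexer::Lexer;
///
/// let mut lexer = Lexer::new("(module)");
/// lexer.allow_confusing_unicode(true);
/// ```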
pub fn allow_confusing_unicode(&mut self, allow: bool) -> &mut Self {
self.allow_confusing_unicode = allow;
self
}
/// Lexes the next token at the byte position `pos` in the input.
///
/// Returns `Some` if a token is found or `None` if we're at EOF.
///
/// The `pos` argument will be updated to point to the next token on a
/// successful parse.
///
/// # Errors
///
/// Returns an error if the input is malformed.
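///
/// # Example
///
/// A minimal sketch of driving the lexer by hand:
///
/// ```
/// # fn foo() -> Result<(), wast::Error> {
/// use wast::lexer::Lexer;
///
/// let lexer = Lexer::new("(module)");
/// let mut pos = 0;
/// while let Some(token) = lexer.parse(&mut pos)? {
///     println!("{:?} at offset {}", token.kind, token.offset);
/// }
/// # Ok(())
/// # }
/// ```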
pub fn parse(&self, pos: &mut usize) -> Result<Option<Token>, Error> {
let offset = *pos;
Ok(match self.parse_kind(pos)? {
Some(kind) => Some(Token {
kind,
offset,
len: (*pos - offset).try_into().unwrap(),
}),
None => None,
})
}
fn parse_kind(&self, pos: &mut usize) -> Result<Option<TokenKind>, Error> {
let start = *pos;
// This `match` generally parses the lexical token grammar of the
// WebAssembly text format specification.
let remaining = &self.input.as_bytes()[start..];
let byte = match remaining.first() {
Some(b) => b,
None => return Ok(None),
};
match byte {
// Open-parens check the next character to see if this is the start
// of a block comment, otherwise it's just a bland left-paren
// token.
b'(' => match remaining.get(1) {
Some(b';') => {
let mut level = 1;
// Note that we're doing a byte-level search here for the
// close-delimiter of `;)`. The actual source text is utf-8
// encoded in `remaining`, but due to how utf-8 works we
// can safely search for an ASCII byte since it'll never
// otherwise appear in the middle of a codepoint and if we
// find it then it's guaranteed to be the right byte.
//
// Mainly we're avoiding the overhead of decoding utf-8
// characters into a Rust `char` since it's otherwise
// unnecessary work.
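//
// For example `(; a (; b ;) c ;)` lexes as a single block comment
// token; `level` tracks how many `(;` delimiters are currently open.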
let mut iter = remaining[2..].iter();
while let Some(ch) = iter.next() {
match ch {
b'(' => {
if let Some(b';') = iter.as_slice().first() {
level += 1;
iter.next();
}
}
b';' => {
if let Some(b')') = iter.as_slice().first() {
level -= 1;
iter.next();
if level == 0 {
let len = remaining.len() - iter.as_slice().len();
let comment = &self.input[start..][..len];
*pos += len;
self.check_confusing_comment(*pos, comment)?;
return Ok(Some(TokenKind::BlockComment));
}
}
}
_ => {}
}
}
Err(self.error(start, LexError::DanglingBlockComment))
}
_ => {
*pos += 1;
Ok(Some(TokenKind::LParen))
}
},
b')' => {
*pos += 1;
Ok(Some(TokenKind::RParen))
}
b' ' | b'\n' | b'\r' | b'\t' => {
self.skip_ws(pos);
Ok(Some(TokenKind::Whitespace))
}
c @ (idchars!() | b'"') => {
let (kind, src) = self.parse_reserved(pos)?;
match kind {
// If the reserved token was simply a single string then
// that is converted to a standalone string token
ReservedKind::String => return Ok(Some(TokenKind::String)),
// If only idchars were consumed then this could be a
// specific kind of standalone token we're interested in.
ReservedKind::Idchars => {
if let Some(ret) = self.classify_number(src) {
return Ok(Some(ret));
} else if *c == b'$' {
return Ok(Some(TokenKind::Id));
// part of the WebAssembly/annotations proposal
// (no online url yet)
} else if *c == b'@' {
return Ok(Some(TokenKind::Annotation));
} else if b'a' <= *c && *c <= b'z' {
return Ok(Some(TokenKind::Keyword));
}
}
ReservedKind::IdString => return Ok(Some(TokenKind::Id)),
ReservedKind::AnnotationString => return Ok(Some(TokenKind::Annotation)),
// ... otherwise this was a conglomeration of idchars,
// strings, or just idchars that don't match a prior rule,
// meaning this falls through to the fallback `Reserved`
// token.
ReservedKind::Reserved => {}
}
Ok(Some(TokenKind::Reserved))
}
// This could be a line comment, otherwise `;` is a reserved token.
// The second byte is checked to see if it's a `;;` line comment
//
// Note that this character being considered as part of a
// `reserved` token is part of the annotations proposal.
b';' => match remaining.get(1) {
Some(b';') => {
let remaining = &self.input[*pos..];
let byte_pos = memchr::memchr2(b'\n', b'\r', remaining.as_bytes())
.unwrap_or(remaining.len());
*pos += byte_pos;
let comment = &remaining[..byte_pos];
self.check_confusing_comment(*pos, comment)?;
Ok(Some(TokenKind::LineComment))
}
_ => {
*pos += 1;
Ok(Some(TokenKind::Reserved))
}
},
// Other known reserved tokens other than `;`
//
// Note that these characters being considered as part of a
// `reserved` token is part of the annotations proposal.
b',' | b'[' | b']' | b'{' | b'}' => {
*pos += 1;
Ok(Some(TokenKind::Reserved))
}
_ => {
let ch = self.input[start..].chars().next().unwrap();
Err(self.error(*pos, LexError::Unexpected(ch)))
}
}
}
fn skip_ws(&self, pos: &mut usize) {
// This table is a byte lookup table to determine whether a byte is a
// whitespace byte. There are only 4 whitespace bytes for the `*.wat`
// format right now which are ' ', '\t', '\r', and '\n'. These 4 bytes
// have a '1' in the table below.
//
// Due to how utf-8 works (our input is guaranteed to be utf-8) it is
// known that if these bytes are found they're guaranteed to be the
// whitespace byte, so they can be safely skipped and we don't have to
// do full utf-8 decoding. This means that the goal of this function is
// to find the first non-whitespace byte in `remaining`.
//
// For now this lookup table seems to be the fastest approach, though
// other techniques may accelerate this even more. Note that `*.wat`
// files often have a lot of whitespace so this function is typically
// quite hot when parsing inputs.
#[rustfmt::skip]
const WS: [u8; 256] = [
// \t \n \r
/* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
/* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// ' '
/* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];
let remaining = &self.input[*pos..];
let non_ws_pos = remaining
.as_bytes()
.iter()
.position(|b| WS[*b as usize] != 1)
.unwrap_or(remaining.len());
*pos += non_ws_pos;
}
/// Splits off a "reserved" token which is then further processed later on
/// to figure out which kind of token it is, depending on the
/// `ReservedKind` returned.
///
/// The general gist is that this method parses the grammar:
///
/// ```text
/// reserved := (idchar | string)+
/// ```
///
/// which means that it is eating any number of adjacent string/idchar
/// tokens (e.g. `a"b"c`) and returning the classification of what was
/// eaten. The classification assists in determining what the token eaten
/// here actually represents.
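///
/// For example (an illustrative, not exhaustive, classification):
///
/// ```text
/// "abc"   => ReservedKind::String
/// $foo    => ReservedKind::Idchars (further classified by the caller)
/// $"foo"  => ReservedKind::IdString
/// @"doc"  => ReservedKind::AnnotationString
/// a"b"c   => ReservedKind::Reserved
/// ```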
fn parse_reserved(&self, pos: &mut usize) -> Result<(ReservedKind, &'a str), Error> {
let mut idchars = 0u32;
let mut strings = 0u32;
let start = *pos;
while let Some(byte) = self.input.as_bytes().get(*pos) {
match byte {
// Normal `idchars` production which appends to the reserved
// token that's being produced.
idchars!() => {
idchars += 1;
*pos += 1;
}
b'"' => {
strings += 1;
*pos += 1;
let mut it = self.input[*pos..].chars();
let result = Lexer::parse_str(&mut it, self.allow_confusing_unicode);
*pos = self.input.len() - it.as_str().len();
match result {
Ok(_) => {}
Err(e) => {
let err_pos = match &e {
LexError::UnexpectedEof => self.input.len(),
_ => self.input[..*pos].char_indices().next_back().unwrap().0,
};
return Err(self.error(err_pos, e));
}
}
}
// Nothing else is considered part of a reserved token
_ => break,
}
}
let ret = &self.input[start..*pos];
Ok(match (idchars, strings) {
(0, 0) => unreachable!(),
(0, 1) => (ReservedKind::String, ret),
(_, 0) => (ReservedKind::Idchars, ret),
// Pattern match `@"..."` and `$"..."` for string-based
// identifiers and annotations.
(1, 1) if ret.starts_with("$") => (ReservedKind::IdString, ret),
(1, 1) if ret.starts_with("@") => (ReservedKind::AnnotationString, ret),
_ => (ReservedKind::Reserved, ret),
})
}
fn classify_number(&self, src: &str) -> Option<TokenKind> {
let (sign, num) = if let Some(stripped) = src.strip_prefix('+') {
(Some(SignToken::Plus), stripped)
} else if let Some(stripped) = src.strip_prefix('-') {
(Some(SignToken::Minus), stripped)
} else {
(None, src)
};
let negative = sign == Some(SignToken::Minus);
// Handle `inf` and `nan` which are special numbers here
if num == "inf" {
return Some(TokenKind::Float(FloatKind::Inf { negative }));
} else if num == "nan" {
return Some(TokenKind::Float(FloatKind::Nan { negative }));
} else if let Some(stripped) = num.strip_prefix("nan:0x") {
let mut it = stripped.as_bytes().iter();
let has_underscores = skip_underscores(&mut it, |x| char::from(x).is_ascii_hexdigit())?;
if it.next().is_some() {
return None;
}
return Some(TokenKind::Float(FloatKind::NanVal {
negative,
has_underscores,
}));
}
// Figure out if we're a hex number or not
let test_valid: fn(u8) -> bool;
let (mut it, hex) = if let Some(stripped) = num.strip_prefix("0x") {
test_valid = |x: u8| char::from(x).is_ascii_hexdigit();
(stripped.as_bytes().iter(), true)
} else {
test_valid = |x: u8| char::from(x).is_ascii_digit();
(num.as_bytes().iter(), false)
};
// Evaluate the first part, moving out all underscores
let mut has_underscores = skip_underscores(&mut it, test_valid)?;
match it.clone().next() {
// If we're followed by something this may be a float so keep going.
Some(_) => {}
// Otherwise this is a valid integer literal!
None => {
return Some(TokenKind::Integer(IntegerKind {
has_underscores,
sign,
hex,
}))
}
}
// A number can optionally be after the decimal so only actually try to
// parse one if it's there.
if it.clone().next() == Some(&b'.') {
it.next();
match it.clone().next() {
Some(c) if test_valid(*c) => {
if skip_underscores(&mut it, test_valid)? {
has_underscores = true;
}
}
Some(_) | None => {}
}
};
// Figure out if there's an exponential part here to make a float, and
// if so parse it but defer its actual calculation until later.
match (hex, it.next()) {
(true, Some(b'p')) | (true, Some(b'P')) | (false, Some(b'e')) | (false, Some(b'E')) => {
match it.clone().next() {
Some(b'-') => {
it.next();
}
Some(b'+') => {
it.next();
}
_ => {}
}
if skip_underscores(&mut it, |x| char::from(x).is_ascii_digit())? {
has_underscores = true;
}
}
(_, None) => {}
_ => return None,
}
// We should have eaten everything by now, if not then this is surely
// not a float or integer literal.
if it.next().is_some() {
return None;
}
return Some(TokenKind::Float(FloatKind::Normal {
has_underscores,
hex,
}));
fn skip_underscores<'a>(
it: &mut slice::Iter<'_, u8>,
good: fn(u8) -> bool,
) -> Option<bool> {
let mut last_underscore = false;
let mut has_underscores = false;
let first = *it.next()?;
if !good(first) {
return None;
}
while let Some(c) = it.clone().next() {
if *c == b'_' && !last_underscore {
has_underscores = true;
it.next();
last_underscore = true;
continue;
}
if !good(*c) {
break;
}
last_underscore = false;
it.next();
}
if last_underscore {
return None;
}
Some(has_underscores)
}
}
/// Checks whether `comment`, which is about to be returned, contains a
/// "confusing unicode character"; if it does, the comment is instead
/// transformed into an error.
fn check_confusing_comment(&self, end: usize, comment: &str) -> Result<(), Error> {
if self.allow_confusing_unicode {
return Ok(());
}
// In an effort to avoid utf-8 decoding the entire `comment` the search
// here is a bit more optimized. This checks for the `0xe2` byte because
// in the utf-8 encoding that's the leading encoding byte for all
// "confusing characters". Each instance of 0xe2 is checked to see if it
// starts a confusing character, and if so that's returned.
//
// Also note that 0xe2 will never be found in the middle of a codepoint,
// it's always the start of a codepoint. This means that if our special
// characters show up they're guaranteed to start with 0xe2 bytes.
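//
// For example U+202E (RIGHT-TO-LEFT OVERRIDE) is encoded in utf-8 as
// the bytes 0xe2 0x80 0xae, so a scan for 0xe2 is guaranteed to find
// it if it's present.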
let bytes = comment.as_bytes();
for pos in memchr::Memchr::new(0xe2, bytes) {
if let Some(c) = comment[pos..].chars().next() {
if is_confusing_unicode(c) {
// Note that `self.cur()` accounts for already having
// parsed `comment`, so we move backwards to where
// `comment` started and then add the index within
// `comment`.
let pos = end - comment.len() + pos;
return Err(self.error(pos, LexError::ConfusingUnicode(c)));
}
}
}
Ok(())
}
fn parse_str(
it: &mut str::Chars<'a>,
allow_confusing_unicode: bool,
) -> Result<Cow<'a, [u8]>, LexError> {
enum State {
Start,
String(Vec<u8>),
}
let orig = it.as_str();
let mut state = State::Start;
loop {
match it.next().ok_or(LexError::UnexpectedEof)? {
'"' => break,
'\\' => {
match state {
State::String(_) => {}
State::Start => {
let pos = orig.len() - it.as_str().len() - 1;
state = State::String(orig[..pos].as_bytes().to_vec());
}
}
let buf = match &mut state {
State::String(b) => b,
State::Start => unreachable!(),
};
match it.next().ok_or(LexError::UnexpectedEof)? {
'"' => buf.push(b'"'),
'\'' => buf.push(b'\''),
't' => buf.push(b'\t'),
'n' => buf.push(b'\n'),
'r' => buf.push(b'\r'),
'\\' => buf.push(b'\\'),
'u' => {
Lexer::must_eat_char(it, '{')?;
let n = Lexer::hexnum(it)?;
let c = char::from_u32(n).ok_or(LexError::InvalidUnicodeValue(n))?;
buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
Lexer::must_eat_char(it, '}')?;
}
c1 if c1.is_ascii_hexdigit() => {
let c2 = Lexer::hexdigit(it)?;
buf.push(to_hex(c1) * 16 + c2);
}
c => return Err(LexError::InvalidStringEscape(c)),
}
}
c if (c as u32) < 0x20 || c as u32 == 0x7f => {
return Err(LexError::InvalidStringElement(c))
}
c if !allow_confusing_unicode && is_confusing_unicode(c) => {
return Err(LexError::ConfusingUnicode(c))
}
c => match &mut state {
State::Start => {}
State::String(v) => {
v.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
}
},
}
}
match state {
State::Start => Ok(orig[..orig.len() - it.as_str().len() - 1].as_bytes().into()),
State::String(s) => Ok(s.into()),
}
}
/// Parses an id-or-string-based name from `it`.
///
/// Note that `it` should already have been lexed and this is just
/// extracting the value. If the token lexed was `@a` then this should point
/// to `a`.
///
/// This will automatically detect quoted syntax such as `@"..."` and the
/// byte string will be parsed and validated as utf-8.
///
/// # Errors
///
/// Returns an error if a quoted byte string is found and contains invalid
/// utf-8.
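///
/// For example (illustrative, after the leading `$` or `@` has been
/// consumed):
///
/// ```text
/// foo      => borrowed name `foo`
/// "f\6fo"  => owned name `foo` (the `\6f` escape resolves to `o`)
/// ```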
fn parse_name(it: &mut str::Chars<'a>) -> Result<Cow<'a, str>, LexError> {
if it.clone().next() == Some('"') {
it.next();
match Lexer::parse_str(it, true)? {
Cow::Borrowed(bytes) => match std::str::from_utf8(bytes) {
Ok(s) => Ok(Cow::Borrowed(s)),
Err(e) => Err(LexError::InvalidUtf8Id(e)),
},
Cow::Owned(bytes) => match String::from_utf8(bytes) {
Ok(s) => Ok(Cow::Owned(s)),
Err(e) => Err(LexError::InvalidUtf8Id(e.utf8_error())),
},
}
} else {
Ok(Cow::Borrowed(it.as_str()))
}
}
fn hexnum(it: &mut str::Chars<'_>) -> Result<u32, LexError> {
let n = Lexer::hexdigit(it)?;
let mut last_underscore = false;
let mut n = n as u32;
while let Some(c) = it.clone().next() {
if c == '_' {
it.next();
last_underscore = true;
continue;
}
if !c.is_ascii_hexdigit() {
break;
}
last_underscore = false;
it.next();
n = n
.checked_mul(16)
.and_then(|n| n.checked_add(to_hex(c) as u32))
.ok_or(LexError::NumberTooBig)?;
}
if last_underscore {
return Err(LexError::LoneUnderscore);
}
Ok(n)
}
/// Reads a hexadecimal digit from the input stream, returning its value.
/// Returns an error on EOF or an invalid hex digit.
fn hexdigit(it: &mut str::Chars<'_>) -> Result<u8, LexError> {
let ch = Lexer::must_char(it)?;
if ch.is_ascii_hexdigit() {
Ok(to_hex(ch))
} else {
Err(LexError::InvalidHexDigit(ch))
}
}
/// Reads the next character from the input string, returning an error if
/// the input stream is empty.
fn must_char(it: &mut str::Chars<'_>) -> Result<char, LexError> {
it.next().ok_or(LexError::UnexpectedEof)
}
/// Expects that a specific character must be read next
fn must_eat_char(it: &mut str::Chars<'_>, wanted: char) -> Result<(), LexError> {
let found = Lexer::must_char(it)?;
if wanted == found {
Ok(())
} else {
Err(LexError::Expected { wanted, found })
}
}
/// Creates an error at `pos` with the specified `kind`
fn error(&self, pos: usize, kind: LexError) -> Error {
Error::lex(Span { offset: pos }, self.input, kind)
}
/// Returns an iterator over all tokens in the original source string
/// starting at the `pos` specified.
pub fn iter(&self, mut pos: usize) -> impl Iterator<Item = Result<Token, Error>> + '_ {
std::iter::from_fn(move || self.parse(&mut pos).transpose())
}
/// Returns whether an annotation is present at `pos`. If it is present then
/// `Ok(Some(token))` is returned corresponding to the token, otherwise
/// `Ok(None)` is returned. If the next token cannot be parsed then an error
/// is returned.
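///
/// For example, a small sketch:
///
/// ```
/// # fn foo() -> Result<(), wast::Error> {
/// use wast::lexer::Lexer;
///
/// let lexer = Lexer::new("@name foo");
/// assert!(lexer.annotation(0)?.is_some());
/// assert!(lexer.annotation(1)?.is_none());
/// # Ok(())
/// # }
/// ```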
pub fn annotation(&self, mut pos: usize) -> Result<Option<Token>, Error> {
let bytes = self.input.as_bytes();
// Quickly reject anything that for sure isn't an annotation, since this
// method is used every time a left paren is parsed.
if bytes.get(pos) != Some(&b'@') {
return Ok(None);
}
match self.parse(&mut pos)? {
Some(token) => match token.kind {
TokenKind::Annotation => Ok(Some(token)),
_ => Ok(None),
},
None => Ok(None),
}
}
}
impl Token {
/// Returns the original source text for this token.
pub fn src<'a>(&self, s: &'a str) -> &'a str {
&s[self.offset..][..self.len.try_into().unwrap()]
}
/// Returns the identifier, without the leading `$` symbol, that this token
/// represents.
///
/// Note that this method returns the contents of the identifier. With a
/// string-based identifier this means that escapes have been resolved to
/// their string-based equivalent.
///
/// Should only be used with `TokenKind::Id`.
///
/// # Errors
///
/// Returns an error if this is a string-based identifier (e.g. `$"..."`)
/// which is invalid utf-8.
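///
/// # Example
///
/// A small sketch of extracting an id:
///
/// ```
/// # fn foo() -> Result<(), wast::Error> {
/// use wast::lexer::Lexer;
///
/// let src = "$foo";
/// let token = Lexer::new(src).parse(&mut 0)?.unwrap();
/// assert_eq!(token.id(src)?, "foo");
/// # Ok(())
/// # }
/// ```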
pub fn id<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
let mut ch = self.src(s).chars();
let dollar = ch.next();
debug_assert_eq!(dollar, Some('$'));
let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
if id.is_empty() {
return Err(self.error(s, LexError::EmptyId));
}
Ok(id)
}
/// Returns the annotation, without the leading `@` symbol, that this token
/// represents.
///
/// Note that this method returns the contents of the annotation. With a
/// string-based annotation this means that escapes have been resolved to
/// their string-based equivalent.
///
/// Should only be used with `TokenKind::Annotation`.
///
/// # Errors
///
/// Returns an error if this is a string-based annotation (e.g. `@"..."`)
/// which is invalid utf-8.
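///
/// # Example
///
/// A small sketch of extracting an annotation name:
///
/// ```
/// # fn foo() -> Result<(), wast::Error> {
/// use wast::lexer::Lexer;
///
/// let src = "@custom";
/// let token = Lexer::new(src).parse(&mut 0)?.unwrap();
/// assert_eq!(token.annotation(src)?, "custom");
/// # Ok(())
/// # }
/// ```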
pub fn annotation<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
let mut ch = self.src(s).chars();
let at = ch.next();
debug_assert_eq!(at, Some('@'));
let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
if id.is_empty() {
return Err(self.error(s, LexError::EmptyAnnotation));
}
Ok(id)
}
/// Returns the keyword this token represents.
///
/// Should only be used with [`TokenKind::Keyword`].
pub fn keyword<'a>(&self, s: &'a str) -> &'a str {
self.src(s)
}
/// Returns the reserved string this token represents.
///
/// Should only be used with [`TokenKind::Reserved`].
pub fn reserved<'a>(&self, s: &'a str) -> &'a str {
self.src(s)
}
/// Returns the parsed string that this token represents.
///
/// This returns either a raw byte slice into the source if that's possible
/// or an owned representation to handle escaped characters and such.
///
/// Should only be used with [`TokenKind::String`].
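///
/// # Example
///
/// A small sketch of decoding a string literal:
///
/// ```
/// # fn foo() -> Result<(), wast::Error> {
/// use wast::lexer::Lexer;
///
/// let src = "\"hello\"";
/// let token = Lexer::new(src).parse(&mut 0)?.unwrap();
/// assert_eq!(&*token.string(src), b"hello");
/// # Ok(())
/// # }
/// ```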
pub fn string<'a>(&self, s: &'a str) -> Cow<'a, [u8]> {
let mut ch = self.src(s).chars();
ch.next().unwrap();
Lexer::parse_str(&mut ch, true).unwrap()
}
/// Returns the decomposed float token that this represents.
///
/// This will slice up the float token into its component parts and return a
/// description of the float token in the source.
///
/// Should only be used with [`TokenKind::Float`].
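///
/// # Example
///
/// A small sketch of decomposing a float token:
///
/// ```
/// # fn foo() -> Result<(), wast::Error> {
/// use wast::lexer::{Float, Lexer, TokenKind};
///
/// let src = "1.5e3";
/// let token = Lexer::new(src).parse(&mut 0)?.unwrap();
/// if let TokenKind::Float(kind) = token.kind {
///     assert_eq!(
///         token.float(src, kind),
///         Float::Val {
///             hex: false,
///             integral: "1".into(),
///             decimal: Some("5".into()),
///             exponent: Some("3".into()),
///         },
///     );
/// }
/// # Ok(())
/// # }
/// ```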
pub fn float<'a>(&self, s: &'a str, kind: FloatKind) -> Float<'a> {
match kind {
FloatKind::Inf { negative } => Float::Inf { negative },
FloatKind::Nan { negative } => Float::Nan {
val: None,
negative,
},
FloatKind::NanVal {
negative,
has_underscores,
} => {
let src = self.src(s);
let src = if src.starts_with("n") { src } else { &src[1..] };
let mut val = Cow::Borrowed(src.strip_prefix("nan:0x").unwrap());
if has_underscores {
*val.to_mut() = val.replace("_", "");
}
Float::Nan {
val: Some(val),
negative,
}
}
FloatKind::Normal {
has_underscores,
hex,
} => {
let src = self.src(s);
let (integral, decimal, exponent) = match src.find('.') {
Some(i) => {
let integral = &src[..i];
let rest = &src[i + 1..];
let exponent = if hex {
rest.find('p').or_else(|| rest.find('P'))
} else {
rest.find('e').or_else(|| rest.find('E'))
};
match exponent {
Some(i) => (integral, Some(&rest[..i]), Some(&rest[i + 1..])),
None => (integral, Some(rest), None),
}
}
None => {
let exponent = if hex {
src.find('p').or_else(|| src.find('P'))
} else {
src.find('e').or_else(|| src.find('E'))
};
match exponent {
Some(i) => (&src[..i], None, Some(&src[i + 1..])),
None => (src, None, None),
}
}
};
let mut integral = Cow::Borrowed(integral.strip_prefix('+').unwrap_or(integral));
let mut decimal = decimal.and_then(|s| {
if s.is_empty() {
None
} else {
Some(Cow::Borrowed(s))
}
});
let mut exponent =
exponent.map(|s| Cow::Borrowed(s.strip_prefix('+').unwrap_or(s)));
if has_underscores {
*integral.to_mut() = integral.replace("_", "");
if let Some(decimal) = &mut decimal {
*decimal.to_mut() = decimal.replace("_", "");
}
if let Some(exponent) = &mut exponent {
*exponent.to_mut() = exponent.replace("_", "");
}
}
if hex {
*integral.to_mut() = integral.replace("0x", "");
}
Float::Val {
hex,
integral,
decimal,
exponent,
}
}
}
}
/// Returns the decomposed integer token that this represents.
///
/// This will slice up the integer token into its component parts and
/// return a description of the integer token in the source.
///
/// Should only be used with [`TokenKind::Integer`].
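///
/// # Example
///
/// A small sketch of decomposing an integer token:
///
/// ```
/// # fn foo() -> Result<(), wast::Error> {
/// use wast::lexer::{Lexer, TokenKind};
///
/// let src = "0x1_f";
/// let token = Lexer::new(src).parse(&mut 0)?.unwrap();
/// if let TokenKind::Integer(kind) = token.kind {
///     let integer = token.integer(src, kind);
///     assert_eq!(integer.val(), ("1f", 16));
/// }
/// # Ok(())
/// # }
/// ```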
pub fn integer<'a>(&self, s: &'a str, kind: IntegerKind) -> Integer<'a> {
let src = self.src(s);
let val = match kind.sign {
Some(SignToken::Plus) => src.strip_prefix('+').unwrap(),
Some(SignToken::Minus) => src,
None => src,
};
let mut val = Cow::Borrowed(val);
if kind.has_underscores {
*val.to_mut() = val.replace("_", "");
}
if kind.hex {
*val.to_mut() = val.replace("0x", "");
}
Integer {
sign: kind.sign,
hex: kind.hex,
val,
}
}
fn error(&self, src: &str, err: LexError) -> Error {
Error::lex(
Span {
offset: self.offset,
},
src,
err,
)
}
}
impl<'a> Integer<'a> {
/// Returns the sign token for this integer.
pub fn sign(&self) -> Option<SignToken> {
self.sign
}
/// Returns the value string that can be parsed for this integer, as well
/// as the base that it should be parsed in
pub fn val(&self) -> (&str, u32) {
(&self.val, if self.hex { 16 } else { 10 })
}
}
fn to_hex(c: char) -> u8 {
match c {
'a'..='f' => c as u8 - b'a' + 10,
'A'..='F' => c as u8 - b'A' + 10,
_ => c as u8 - b'0',
}
}
impl fmt::Display for LexError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use LexError::*;
match self {
DanglingBlockComment => f.write_str("unterminated block comment")?,
Unexpected(c) => write!(f, "unexpected character '{}'", escape_char(*c))?,
InvalidStringElement(c) => {
write!(f, "invalid character in string '{}'", escape_char(*c))?
}
InvalidStringEscape(c) => write!(f, "invalid string escape '{}'", escape_char(*c))?,
InvalidHexDigit(c) => write!(f, "invalid hex digit '{}'", escape_char(*c))?,
InvalidDigit(c) => write!(f, "invalid decimal digit '{}'", escape_char(*c))?,
Expected { wanted, found } => write!(
f,
"expected '{}' but found '{}'",
escape_char(*wanted),
escape_char(*found)
)?,
UnexpectedEof => write!(f, "unexpected end-of-file")?,
NumberTooBig => f.write_str("number is too big to parse")?,
InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x{:x}", c)?,
LoneUnderscore => write!(f, "bare underscore in numeric literal")?,
ConfusingUnicode(c) => write!(f, "likely-confusing unicode character found {:?}", c)?,
InvalidUtf8Id(_) => write!(f, "malformed UTF-8 encoding of string-based id")?,
EmptyId => write!(f, "empty identifier")?,
EmptyAnnotation => write!(f, "empty annotation id")?,
}
Ok(())
}
}
fn escape_char(c: char) -> String {
match c {
'\t' => String::from("\\t"),
'\r' => String::from("\\r"),
'\n' => String::from("\\n"),
'\\' => String::from("\\\\"),
'\'' => String::from("\\\'"),
'\"' => String::from("\""),
'\x20'..='\x7e' => String::from(c),
_ => c.escape_unicode().to_string(),
}
}
/// This is an attempt to protect against the "trojan source" problem where
/// unicode characters can cause editors to render source code differently
/// for humans than the compiler itself sees.
///
/// To mitigate this issue, and because it's relatively rare in practice,
/// this simply rejects characters of that form.
fn is_confusing_unicode(ch: char) -> bool {
matches!(
ch,
'\u{202a}'
| '\u{202b}'
| '\u{202d}'
| '\u{202e}'
| '\u{2066}'
| '\u{2067}'
| '\u{2068}'
| '\u{206c}'
| '\u{2069}'
)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn ws_smoke() {
fn get_whitespace(input: &str) -> &str {
let token = get_token(input);
match token.kind {
TokenKind::Whitespace => token.src(input),
other => panic!("unexpected {:?}", other),
}
}
assert_eq!(get_whitespace(" "), " ");
assert_eq!(get_whitespace(" "), " ");
assert_eq!(get_whitespace(" \n "), " \n ");
assert_eq!(get_whitespace(" x"), " ");
assert_eq!(get_whitespace(" ;"), " ");
}
#[test]
fn line_comment_smoke() {
fn get_line_comment(input: &str) -> &str {
let token = get_token(input);
match token.kind {
TokenKind::LineComment => token.src(input),
other => panic!("unexpected {:?}", other),
}
}
assert_eq!(get_line_comment(";;"), ";;");
assert_eq!(get_line_comment(";; xyz"), ";; xyz");
assert_eq!(get_line_comment(";; xyz\nabc"), ";; xyz");
assert_eq!(get_line_comment(";;\nabc"), ";;");
assert_eq!(get_line_comment(";; \nabc"), ";; ");
assert_eq!(get_line_comment(";; \rabc"), ";; ");
assert_eq!(get_line_comment(";; \r\nabc"), ";; ");
}
#[test]
fn block_comment_smoke() {
fn get_block_comment(input: &str) -> &str {
let token = get_token(input);
match token.kind {
TokenKind::BlockComment => token.src(input),
other => panic!("unexpected {:?}", other),
}
}
assert_eq!(get_block_comment("(;;)"), "(;;)");
assert_eq!(get_block_comment("(; ;)"), "(; ;)");
assert_eq!(get_block_comment("(; (;;) ;)"), "(; (;;) ;)");
}
fn get_token(input: &str) -> Token {
Lexer::new(input)
.parse(&mut 0)
.expect("no first token")
.expect("no token")
}
#[test]
fn lparen() {
assert_eq!(get_token("((").kind, TokenKind::LParen);
}
#[test]
fn rparen() {
assert_eq!(get_token(")(").kind, TokenKind::RParen);
}
#[test]
fn strings() {
fn get_string(input: &str) -> Vec<u8> {
let token = get_token(input);
match token.kind {
TokenKind::String => token.string(input).to_vec(),
other => panic!("not keyword {:?}", other),
}
}
assert_eq!(&*get_string("\"\""), b"");
assert_eq!(&*get_string("\"a\""), b"a");
assert_eq!(&*get_string("\"a b c d\""), b"a b c d");
assert_eq!(&*get_string("\"\\\"\""), b"\"");
assert_eq!(&*get_string("\"\\'\""), b"'");
assert_eq!(&*get_string("\"\\n\""), b"\n");
assert_eq!(&*get_string("\"\\t\""), b"\t");
assert_eq!(&*get_string("\"\\r\""), b"\r");
assert_eq!(&*get_string("\"\\\\\""), b"\\");
assert_eq!(&*get_string("\"\\01\""), &[1]);
assert_eq!(&*get_string("\"\\u{1}\""), &[1]);
assert_eq!(
&*get_string("\"\\u{0f3}\""),
'\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
);
assert_eq!(
&*get_string("\"\\u{0_f_3}\""),
'\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
);
for i in 0..=255i32 {
let s = format!("\"\\{:02x}\"", i);
assert_eq!(&*get_string(&s), &[i as u8]);
}
}
#[test]
fn id() {
fn get_id(input: &str) -> String {
let token = get_token(input);
match token.kind {
TokenKind::Id => token.id(input).unwrap().to_string(),
other => panic!("not id {:?}", other),
}
}
assert_eq!(get_id("$x"), "x");
assert_eq!(get_id("$xyz"), "xyz");
assert_eq!(get_id("$x_z"), "x_z");
assert_eq!(get_id("$0^"), "0^");
assert_eq!(get_id("$0^;;"), "0^");
assert_eq!(get_id("$0^ ;;"), "0^");
assert_eq!(get_id("$\"x\" ;;"), "x");
}
#[test]
fn annotation() {
fn get_annotation(input: &str) -> String {
let token = get_token(input);
match token.kind {
TokenKind::Annotation => token.annotation(input).unwrap().to_string(),
other => panic!("not annotation {:?}", other),
}
}
assert_eq!(get_annotation("@foo"), "foo");
assert_eq!(get_annotation("@foo "), "foo");
assert_eq!(get_annotation("@f "), "f");
assert_eq!(get_annotation("@\"x\" "), "x");
assert_eq!(get_annotation("@0 "), "0");
}
#[test]
fn keyword() {
fn get_keyword(input: &str) -> &str {
let token = get_token(input);
match token.kind {
TokenKind::Keyword => token.keyword(input),
other => panic!("not keyword {:?}", other),
}
}
assert_eq!(get_keyword("x"), "x");
assert_eq!(get_keyword("xyz"), "xyz");
assert_eq!(get_keyword("x_z"), "x_z");
assert_eq!(get_keyword("x_z "), "x_z");
assert_eq!(get_keyword("x_z "), "x_z");
}
#[test]
fn reserved() {
fn get_reserved(input: &str) -> &str {
let token = get_token(input);
match token.kind {
TokenKind::Reserved => token.reserved(input),
other => panic!("not reserved {:?}", other),
}
}
assert_eq!(get_reserved("^_x "), "^_x");
}
#[test]
fn integer() {
fn get_integer(input: &str) -> String {
let token = get_token(input);
match token.kind {
TokenKind::Integer(i) => token.integer(input, i).val.to_string(),
other => panic!("not integer {:?}", other),
}
}
assert_eq!(get_integer("1"), "1");
assert_eq!(get_integer("0"), "0");
assert_eq!(get_integer("-1"), "-1");
assert_eq!(get_integer("+1"), "1");
assert_eq!(get_integer("+1_000"), "1000");
assert_eq!(get_integer("+1_0_0_0"), "1000");
assert_eq!(get_integer("+0x10"), "10");
assert_eq!(get_integer("-0x10"), "-10");
assert_eq!(get_integer("0x10"), "10");
}
#[test]
fn float() {
fn get_float(input: &str) -> Float<'_> {
let token = get_token(input);
match token.kind {
TokenKind::Float(f) => token.float(input, f),
other => panic!("not float {:?}", other),
}
}
assert_eq!(
get_float("nan"),
Float::Nan {
val: None,
negative: false
},
);
assert_eq!(
get_float("-nan"),
Float::Nan {
val: None,
negative: true,
},
);
assert_eq!(
get_float("+nan"),
Float::Nan {
val: None,
negative: false,
},
);
assert_eq!(
get_float("+nan:0x1"),
Float::Nan {
val: Some("1".into()),
negative: false,
},
);
assert_eq!(
get_float("nan:0x7f_ffff"),
Float::Nan {
val: Some("7fffff".into()),
negative: false,
},
);
assert_eq!(get_float("inf"), Float::Inf { negative: false });
assert_eq!(get_float("-inf"), Float::Inf { negative: true });
assert_eq!(get_float("+inf"), Float::Inf { negative: false });
assert_eq!(
get_float("1.2"),
Float::Val {
integral: "1".into(),
decimal: Some("2".into()),
exponent: None,
hex: false,
},
);
assert_eq!(
get_float("1.2e3"),
Float::Val {
integral: "1".into(),
decimal: Some("2".into()),
exponent: Some("3".into()),
hex: false,
},
);
assert_eq!(
get_float("-1_2.1_1E+0_1"),
Float::Val {
integral: "-12".into(),
decimal: Some("11".into()),
exponent: Some("01".into()),
hex: false,
},
);
assert_eq!(
get_float("+1_2.1_1E-0_1"),
Float::Val {
integral: "12".into(),
decimal: Some("11".into()),
exponent: Some("-01".into()),
hex: false,
},
);
assert_eq!(
get_float("0x1_2.3_4p5_6"),
Float::Val {
integral: "12".into(),
decimal: Some("34".into()),
exponent: Some("56".into()),
hex: true,
},
);
assert_eq!(
get_float("+0x1_2.3_4P-5_6"),
Float::Val {
integral: "12".into(),
decimal: Some("34".into()),
exponent: Some("-56".into()),
hex: true,
},
);
assert_eq!(
get_float("1."),
Float::Val {
integral: "1".into(),
decimal: None,
exponent: None,
hex: false,
},
);
assert_eq!(
get_float("0x1p-24"),
Float::Val {
integral: "1".into(),
decimal: None,
exponent: Some("-24".into()),
hex: true,
},
);
}
}