Revision control
Copy as Markdown
Other Tools
//! Lexer for parsing format descriptions.
use core::iter;
use super::{unused, Error, Location, Spanned, SpannedValue};
/// An iterator over the lexed tokens.
pub(super) struct Lexed<I: Iterator> {
/// The internal iterator.
iter: iter::Peekable<I>,
}
impl<I: Iterator> Iterator for Lexed<I> {
type Item = I::Item;
fn next(&mut self) -> Option<Self::Item> {
self.iter.next()
}
}
impl<'iter, 'token: 'iter, I: Iterator<Item = Result<Token<'token>, Error>> + 'iter> Lexed<I> {
/// Peek at the next item in the iterator.
pub(super) fn peek(&mut self) -> Option<&I::Item> {
self.iter.peek()
}
/// Consume the next token if it is whitespace.
pub(super) fn next_if_whitespace(&mut self) -> Option<Spanned<&'token [u8]>> {
if let Some(&Ok(Token::ComponentPart {
kind: ComponentKind::Whitespace,
value,
})) = self.peek()
{
self.next(); // consume
Some(value)
} else {
None
}
}
/// Consume the next token if it is a component item that is not whitespace.
pub(super) fn next_if_not_whitespace(&mut self) -> Option<Spanned<&'token [u8]>> {
if let Some(&Ok(Token::ComponentPart {
kind: ComponentKind::NotWhitespace,
value,
})) = self.peek()
{
self.next(); // consume
Some(value)
} else {
None
}
}
/// Consume the next token if it is an opening bracket.
pub(super) fn next_if_opening_bracket(&mut self) -> Option<Location> {
if let Some(&Ok(Token::Bracket {
kind: BracketKind::Opening,
location,
})) = self.peek()
{
self.next(); // consume
Some(location)
} else {
None
}
}
/// Peek at the next token if it is a closing bracket.
pub(super) fn peek_closing_bracket(&'iter mut self) -> Option<&'iter Location> {
if let Some(Ok(Token::Bracket {
kind: BracketKind::Closing,
location,
})) = self.peek()
{
Some(location)
} else {
None
}
}
/// Consume the next token if it is a closing bracket.
pub(super) fn next_if_closing_bracket(&mut self) -> Option<Location> {
if let Some(&Ok(Token::Bracket {
kind: BracketKind::Closing,
location,
})) = self.peek()
{
self.next(); // consume
Some(location)
} else {
None
}
}
}
/// A token emitted by the lexer. There is no semantic meaning at this stage.
pub(super) enum Token<'a> {
/// A literal string, formatted and parsed as-is.
Literal(Spanned<&'a [u8]>),
/// An opening or closing bracket. May or may not be the start or end of a component.
Bracket {
/// Whether the bracket is opening or closing.
kind: BracketKind,
/// Where the bracket was in the format string.
location: Location,
},
/// One part of a component. This could be its name, a modifier, or whitespace.
ComponentPart {
/// Whether the part is whitespace or not.
kind: ComponentKind,
/// The part itself.
value: Spanned<&'a [u8]>,
},
}
/// What type of bracket is present.
pub(super) enum BracketKind {
/// An opening bracket: `[`
Opening,
/// A closing bracket: `]`
Closing,
}
/// Indicates whether the component is whitespace or not.
pub(super) enum ComponentKind {
#[allow(clippy::missing_docs_in_private_items)]
Whitespace,
#[allow(clippy::missing_docs_in_private_items)]
NotWhitespace,
}
/// Attach [`Location`] information to each byte in the iterator.
fn attach_location<'item>(
iter: impl Iterator<Item = &'item u8>,
) -> impl Iterator<Item = (&'item u8, Location)> {
let mut byte_pos = 0;
iter.map(move |byte| {
let location = Location { byte: byte_pos };
byte_pos += 1;
(byte, location)
})
}
/// Parse the string into a series of [`Token`]s.
///
/// `VERSION` controls the version of the format description that is being parsed. Currently, this
/// must be 1 or 2.
///
/// - When `VERSION` is 1, `[[` is the only escape sequence, resulting in a literal `[`.
/// - When `VERSION` is 2, all escape sequences begin with `\`. The only characters that may
/// currently follow are `\`, `[`, and `]`, all of which result in the literal character. All
/// other characters result in a lex error.
pub(super) fn lex<const VERSION: usize>(
mut input: &[u8],
) -> Lexed<impl Iterator<Item = Result<Token<'_>, Error>>> {
validate_version!(VERSION);
let mut depth: u8 = 0;
let mut iter = attach_location(input.iter()).peekable();
let mut second_bracket_location = None;
let iter = iter::from_fn(move || {
// The flag is only set when version is zero.
if version!(..=1) {
// There is a flag set to emit the second half of an escaped bracket pair.
if let Some(location) = second_bracket_location.take() {
return Some(Ok(Token::Bracket {
kind: BracketKind::Opening,
location,
}));
}
}
Some(Ok(match iter.next()? {
// possible escape sequence
(b'\\', backslash_loc) if version!(2..) => {
match iter.next() {
Some((b'\\' | b'[' | b']', char_loc)) => {
// The escaped character is emitted as-is.
let char = &input[1..2];
input = &input[2..];
if depth == 0 {
Token::Literal(char.spanned(backslash_loc.to(char_loc)))
} else {
Token::ComponentPart {
kind: ComponentKind::NotWhitespace,
value: char.spanned(backslash_loc.to(char_loc)),
}
}
}
Some((_, loc)) => {
return Some(Err(Error {
_inner: unused(loc.error("invalid escape sequence")),
public: crate::error::InvalidFormatDescription::Expected {
what: "valid escape sequence",
index: loc.byte as _,
},
}));
}
None => {
return Some(Err(Error {
_inner: unused(backslash_loc.error("unexpected end of input")),
public: crate::error::InvalidFormatDescription::Expected {
what: "valid escape sequence",
index: backslash_loc.byte as _,
},
}));
}
}
}
// potentially escaped opening bracket
(b'[', location) if version!(..=1) => {
if let Some((_, second_location)) = iter.next_if(|&(&byte, _)| byte == b'[') {
// Escaped bracket. Store the location of the second so we can emit it later.
second_bracket_location = Some(second_location);
input = &input[2..];
} else {
// opening bracket
depth += 1;
input = &input[1..];
}
Token::Bracket {
kind: BracketKind::Opening,
location,
}
}
// opening bracket
(b'[', location) => {
depth += 1;
input = &input[1..];
Token::Bracket {
kind: BracketKind::Opening,
location,
}
}
// closing bracket
(b']', location) if depth > 0 => {
depth -= 1;
input = &input[1..];
Token::Bracket {
kind: BracketKind::Closing,
location,
}
}
// literal
(_, start_location) if depth == 0 => {
let mut bytes = 1;
let mut end_location = start_location;
while let Some((_, location)) =
iter.next_if(|&(&byte, _)| !((version!(2..) && byte == b'\\') || byte == b'['))
{
end_location = location;
bytes += 1;
}
let value = &input[..bytes];
input = &input[bytes..];
Token::Literal(value.spanned(start_location.to(end_location)))
}
// component part
(byte, start_location) => {
let mut bytes = 1;
let mut end_location = start_location;
let is_whitespace = byte.is_ascii_whitespace();
while let Some((_, location)) = iter.next_if(|&(byte, _)| {
!matches!(byte, b'\\' | b'[' | b']')
&& is_whitespace == byte.is_ascii_whitespace()
}) {
end_location = location;
bytes += 1;
}
let value = &input[..bytes];
input = &input[bytes..];
Token::ComponentPart {
kind: if is_whitespace {
ComponentKind::Whitespace
} else {
ComponentKind::NotWhitespace
},
value: value.spanned(start_location.to(end_location)),
}
}
}))
});
Lexed {
iter: iter.peekable(),
}
}