/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
use self::Token::*;
use crate::cow_rc_str::CowRcStr;
use crate::parser::ParserState;
use std::char;
use std::ops::Range;
#[cfg(not(feature = "dummy_match_byte"))]
use cssparser_macros::match_byte;
#[cfg(feature = "dummy_match_byte")]
macro_rules! match_byte {
($value:expr, $($rest:tt)* ) => {
match $value {
$(
$rest
)+
}
};
}
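// `match_byte!` drives every dispatch loop in this file: the proc-macro
// version from `cssparser_macros` is intended to expand byte-range patterns
// into an efficient byte-indexed dispatch, while the `dummy_match_byte`
// fallback above degrades to the plain `match` shown. Illustrative use
// (hypothetical snippet, mirroring the call sites below):
//
//     match_byte! { b,
//         b'0'..=b'9' => { /* digit */ }
//         b' ' | b'\t' => { /* whitespace */ }
//         _ => { /* anything else */ }
//     }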
/// One of the pieces the CSS input is broken into.
///
/// Some components use `CowRcStr` in order to borrow from the original input string
/// and avoid allocating/copying when possible.
#[derive(PartialEq, Debug, Clone)]
pub enum Token<'a> {
/// A [`<ident-token>`](https://drafts.csswg.org/css-syntax/#ident-token-diagram)
Ident(CowRcStr<'a>),
/// A [`<at-keyword-token>`](https://drafts.csswg.org/css-syntax/#at-keyword-token-diagram)
///
/// The value does not include the `@` marker.
AtKeyword(CowRcStr<'a>),
/// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "unrestricted"
///
/// The value does not include the `#` marker.
Hash(CowRcStr<'a>),
/// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "id"
///
/// The value does not include the `#` marker.
IDHash(CowRcStr<'a>), // Hash that is a valid ID selector.
/// A [`<string-token>`](https://drafts.csswg.org/css-syntax/#string-token-diagram)
///
/// The value does not include the quotes.
QuotedString(CowRcStr<'a>),
/// A [`<url-token>`](https://drafts.csswg.org/css-syntax/#url-token-diagram)
///
/// The value does not include the `url(` `)` markers. Note that `url( <string-token> )` is represented by a
/// `Function` token.
UnquotedUrl(CowRcStr<'a>),
/// A `<delim-token>`
Delim(char),
/// A [`<number-token>`](https://drafts.csswg.org/css-syntax/#number-token-diagram)
Number {
/// Whether the number had a `+` or `-` sign.
///
/// This is used in some cases like the `<An+B>` micro syntax. (See the `parse_nth` function.)
has_sign: bool,
/// The value as a float
value: f32,
/// If the original source did not include a fractional part, the value as an integer.
int_value: Option<i32>,
},
/// A [`<percentage-token>`](https://drafts.csswg.org/css-syntax/#percentage-token-diagram)
Percentage {
/// Whether the number had a `+` or `-` sign.
has_sign: bool,
/// The value as a float, divided by 100 so that the nominal range is 0.0 to 1.0.
unit_value: f32,
/// If the original source did not include a fractional part, the value as an integer.
/// It is **not** divided by 100.
int_value: Option<i32>,
},
/// A [`<dimension-token>`](https://drafts.csswg.org/css-syntax/#dimension-token-diagram)
Dimension {
/// Whether the number had a `+` or `-` sign.
///
/// This is used in some cases like the `<An+B>` micro syntax. (See the `parse_nth` function.)
has_sign: bool,
/// The value as a float
value: f32,
/// If the original source did not include a fractional part, the value as an integer.
int_value: Option<i32>,
/// The unit, e.g. "px" in `12px`
unit: CowRcStr<'a>,
},
/// A [`<whitespace-token>`](https://drafts.csswg.org/css-syntax/#whitespace-token-diagram)
WhiteSpace(&'a str),
/// A comment.
///
/// The CSS Syntax spec does not generate tokens for comments,
/// but we do because we can (a borrowed `&str` makes it cheap).
///
/// The value does not include the `/*` `*/` markers.
Comment(&'a str),
/// A `:` `<colon-token>`
Colon, // :
/// A `;` `<semicolon-token>`
Semicolon, // ;
/// A `,` `<comma-token>`
Comma, // ,
/// A `~=` `<include-match-token>`
IncludeMatch,
/// A `|=` `<dash-match-token>`
DashMatch,
/// A `^=` `<prefix-match-token>`
PrefixMatch,
/// A `$=` `<suffix-match-token>`
SuffixMatch,
/// A `*=` `<substring-match-token>`
SubstringMatch,
/// A `<!--` `<CDO-token>`
CDO,
/// A `-->` `<CDC-token>`
CDC,
/// A [`<function-token>`](https://drafts.csswg.org/css-syntax/#function-token-diagram)
///
/// The value (name) does not include the `(` marker.
Function(CowRcStr<'a>),
/// A `<(-token>`
ParenthesisBlock,
/// A `<[-token>`
SquareBracketBlock,
/// A `<{-token>`
CurlyBracketBlock,
/// A `<bad-url-token>`
///
/// This token always indicates a parse error.
BadUrl(CowRcStr<'a>),
/// A `<bad-string-token>`
///
/// This token always indicates a parse error.
BadString(CowRcStr<'a>),
/// A `<)-token>`
///
/// When obtained from one of the `Parser::next*` methods,
/// this token is always unmatched and indicates a parse error.
CloseParenthesis,
/// A `<]-token>`
///
/// When obtained from one of the `Parser::next*` methods,
/// this token is always unmatched and indicates a parse error.
CloseSquareBracket,
/// A `<}-token>`
///
/// When obtained from one of the `Parser::next*` methods,
/// this token is always unmatched and indicates a parse error.
CloseCurlyBracket,
}
impl<'a> Token<'a> {
/// Return whether this token represents a parse error.
///
/// `BadUrl` and `BadString` are tokenizer-level parse errors.
///
/// `CloseParenthesis`, `CloseSquareBracket`, and `CloseCurlyBracket` are *unmatched*
/// and therefore parse errors when returned by one of the `Parser::next*` methods.
pub fn is_parse_error(&self) -> bool {
matches!(
*self,
BadUrl(_) | BadString(_) | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket
)
}
}
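// Minimal usage sketch for the tokenizer defined below (illustrative only,
// using just the API declared in this file):
//
//     let mut tokenizer = Tokenizer::new("margin: 12px");
//     while let Ok(token) = tokenizer.next() {
//         if token.is_parse_error() {
//             // e.g. report tokenizer.current_source_location()
//         }
//     }
//
// `next()` yields `Err(())` only once the input is exhausted.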
#[derive(Clone)]
pub struct Tokenizer<'a> {
input: &'a str,
/// Counted in bytes, not code points. From 0.
position: usize,
/// The position at the start of the current line; but adjusted to
/// ensure that computing the column will give the result in units
/// of UTF-16 characters.
current_line_start_position: usize,
current_line_number: u32,
var_or_env_functions: SeenStatus,
source_map_url: Option<&'a str>,
source_url: Option<&'a str>,
}
#[derive(Copy, Clone, PartialEq, Eq)]
enum SeenStatus {
DontCare,
LookingForThem,
SeenAtLeastOne,
}
impl<'a> Tokenizer<'a> {
#[inline]
pub fn new(input: &str) -> Tokenizer {
Tokenizer {
input,
position: 0,
current_line_start_position: 0,
current_line_number: 0,
var_or_env_functions: SeenStatus::DontCare,
source_map_url: None,
source_url: None,
}
}
#[inline]
pub fn look_for_var_or_env_functions(&mut self) {
self.var_or_env_functions = SeenStatus::LookingForThem;
}
#[inline]
pub fn seen_var_or_env_functions(&mut self) -> bool {
let seen = self.var_or_env_functions == SeenStatus::SeenAtLeastOne;
self.var_or_env_functions = SeenStatus::DontCare;
seen
}
#[inline]
pub fn see_function(&mut self, name: &str) {
if self.var_or_env_functions == SeenStatus::LookingForThem
&& (name.eq_ignore_ascii_case("var") || name.eq_ignore_ascii_case("env"))
{
self.var_or_env_functions = SeenStatus::SeenAtLeastOne;
}
}
#[inline]
pub fn next(&mut self) -> Result<Token<'a>, ()> {
next_token(self)
}
#[inline]
pub fn position(&self) -> SourcePosition {
debug_assert!(self.input.is_char_boundary(self.position));
SourcePosition(self.position)
}
#[inline]
pub fn current_source_location(&self) -> SourceLocation {
SourceLocation {
line: self.current_line_number,
column: (self.position - self.current_line_start_position + 1) as u32,
}
}
#[inline]
pub fn current_source_map_url(&self) -> Option<&'a str> {
self.source_map_url
}
#[inline]
pub fn current_source_url(&self) -> Option<&'a str> {
self.source_url
}
#[inline]
pub fn state(&self) -> ParserState {
ParserState {
position: self.position,
current_line_start_position: self.current_line_start_position,
current_line_number: self.current_line_number,
at_start_of: None,
}
}
#[inline]
pub fn reset(&mut self, state: &ParserState) {
self.position = state.position;
self.current_line_start_position = state.current_line_start_position;
self.current_line_number = state.current_line_number;
}
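// Together, `state()` and `reset()` give callers cheap backtracking:
// snapshot the position, try one interpretation, and rewind on failure.
// Illustrative sketch (assuming the two methods above):
//
//     let saved = tokenizer.state();
//     if tokenizer.next() != Ok(Token::Colon) {
//         tokenizer.reset(&saved); // rewind and try something else
//     }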
#[inline]
pub(crate) fn slice_from(&self, start_pos: SourcePosition) -> &'a str {
self.slice(start_pos..self.position())
}
#[inline]
pub(crate) fn slice(&self, range: Range<SourcePosition>) -> &'a str {
debug_assert!(self.input.is_char_boundary(range.start.0));
debug_assert!(self.input.is_char_boundary(range.end.0));
unsafe { self.input.get_unchecked(range.start.0..range.end.0) }
}
pub fn current_source_line(&self) -> &'a str {
let current = self.position();
let start = self
.slice(SourcePosition(0)..current)
.rfind(|c| matches!(c, '\r' | '\n' | '\x0C'))
.map_or(0, |start| start + 1);
let end = self
.slice(current..SourcePosition(self.input.len()))
.find(|c| matches!(c, '\r' | '\n' | '\x0C'))
.map_or(self.input.len(), |end| current.0 + end);
self.slice(SourcePosition(start)..SourcePosition(end))
}
#[inline]
pub fn next_byte(&self) -> Option<u8> {
if self.is_eof() {
None
} else {
Some(self.input.as_bytes()[self.position])
}
}
// If false, `tokenizer.next_char()` will not panic.
#[inline]
fn is_eof(&self) -> bool {
!self.has_at_least(0)
}
// If true, the input has at least `n` bytes left *after* the current one.
// That is, `tokenizer.byte_at(n)` will not panic.
#[inline]
fn has_at_least(&self, n: usize) -> bool {
self.position + n < self.input.len()
}
// Advance over N bytes in the input. This function can advance
// over ASCII bytes (excluding newlines), or UTF-8 sequence
// leaders (excluding leaders for 4-byte sequences).
#[inline]
pub fn advance(&mut self, n: usize) {
if cfg!(debug_assertions) {
// Each byte must either be an ASCII byte or a sequence
// leader, but not a 4-byte leader; also newlines are
// rejected.
for i in 0..n {
let b = self.byte_at(i);
debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
}
}
self.position += n
}
// Assumes non-EOF
#[inline]
fn next_byte_unchecked(&self) -> u8 {
self.byte_at(0)
}
#[inline]
fn byte_at(&self, offset: usize) -> u8 {
self.input.as_bytes()[self.position + offset]
}
// Advance over a single byte; the byte must be a UTF-8 sequence
// leader for a 4-byte sequence.
#[inline]
fn consume_4byte_intro(&mut self) {
debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
// This takes two UTF-16 characters to represent, so we
// actually have an undercount.
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
self.position += 1;
}
// Advance over a single byte; the byte must be a UTF-8
// continuation byte.
#[inline]
fn consume_continuation_byte(&mut self) {
debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
// Continuation bytes contribute to column overcount. Note
// that due to the special case for the 4-byte sequence intro,
// we must use wrapping add here.
self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
self.position += 1;
}
// Advance over any kind of byte, excluding newlines.
#[inline(never)]
fn consume_known_byte(&mut self, byte: u8) {
debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
self.position += 1;
// Continuation bytes contribute to column overcount.
if byte & 0xF0 == 0xF0 {
// This takes two UTF-16 characters to represent, so we
// actually have an undercount.
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
} else if byte & 0xC0 == 0x80 {
// Note that due to the special case for the 4-byte
// sequence intro, we must use wrapping add here.
self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
}
}
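// Worked example of the column bookkeeping in the helpers above. The column
// is later computed as `position - current_line_start_position` (plus 1),
// in UTF-16 code units:
//
// * A 2- or 3-byte UTF-8 scalar is 1 UTF-16 unit: its leading byte advances
//   `position` by 1 with no adjustment, and each continuation byte bumps
//   both `position` and `current_line_start_position` by 1, so the
//   difference grows by exactly 1.
// * A 4-byte UTF-8 scalar is 2 UTF-16 units: the intro byte *subtracts* 1
//   from `current_line_start_position` and the three continuation bytes add
//   3 back, so the difference grows by 4 - 2 = 2, as required.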
#[inline]
fn next_char(&self) -> char {
unsafe { self.input.get_unchecked(self.position().0..) }
.chars()
.next()
.unwrap()
}
// Given that a newline has been seen, advance over the newline
// and update the state.
#[inline]
fn consume_newline(&mut self) {
let byte = self.next_byte_unchecked();
debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
self.position += 1;
if byte == b'\r' && self.next_byte() == Some(b'\n') {
self.position += 1;
}
self.current_line_start_position = self.position;
self.current_line_number += 1;
}
#[inline]
fn has_newline_at(&self, offset: usize) -> bool {
self.position + offset < self.input.len()
&& matches!(self.byte_at(offset), b'\n' | b'\r' | b'\x0C')
}
#[inline]
fn consume_char(&mut self) -> char {
let c = self.next_char();
let len_utf8 = c.len_utf8();
self.position += len_utf8;
// Note that due to the special case for the 4-byte sequence
// intro, we must use wrapping add here.
self.current_line_start_position = self
.current_line_start_position
.wrapping_add(len_utf8 - c.len_utf16());
c
}
#[inline]
fn starts_with(&self, needle: &[u8]) -> bool {
self.input.as_bytes()[self.position..].starts_with(needle)
}
pub fn skip_whitespace(&mut self) {
while !self.is_eof() {
match_byte! { self.next_byte_unchecked(),
b' ' | b'\t' => {
self.advance(1)
},
b'\n' | b'\x0C' | b'\r' => {
self.consume_newline();
},
b'/' => {
if self.starts_with(b"/*") {
consume_comment(self);
} else {
return
}
}
_ => return,
}
}
}
pub fn skip_cdc_and_cdo(&mut self) {
while !self.is_eof() {
match_byte! { self.next_byte_unchecked(),
b' ' | b'\t' => {
self.advance(1)
},
b'\n' | b'\x0C' | b'\r' => {
self.consume_newline();
},
b'/' => {
if self.starts_with(b"/*") {
consume_comment(self);
} else {
return
}
}
b'<' => {
if self.starts_with(b"<!--") {
self.advance(4)
} else {
return
}
}
b'-' => {
if self.starts_with(b"-->") {
self.advance(3)
} else {
return
}
}
_ => {
return
}
}
}
}
}
/// A position from the start of the input, counted in UTF-8 bytes.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
pub struct SourcePosition(pub(crate) usize);
impl SourcePosition {
/// Returns the current byte index in the original input.
#[inline]
pub fn byte_index(&self) -> usize {
self.0
}
}
/// The line and column number for a given position within the input.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct SourceLocation {
/// The line number, starting at 0 for the first line.
pub line: u32,
/// The column number within a line, starting at 1 for first the character of the line.
/// Column numbers are counted in UTF-16 code units.
pub column: u32,
}
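// Illustrative example of the units above: in the input "a😀b", the emoji
// occupies 4 UTF-8 bytes but 2 UTF-16 code units. Consuming it advances
// `SourcePosition` by 4 raw bytes, while the `column` reported by
// `current_source_location()` advances by only 2.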
fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
if tokenizer.is_eof() {
return Err(());
}
let b = tokenizer.next_byte_unchecked();
let token = match_byte! { b,
b' ' | b'\t' => {
consume_whitespace(tokenizer, false)
},
b'\n' | b'\x0C' | b'\r' => consume_whitespace(tokenizer, true),
b'"' => consume_string(tokenizer, false),
b'#' => {
tokenizer.advance(1);
if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) }
else if !tokenizer.is_eof() &&
matches!(tokenizer.next_byte_unchecked(), b'0'..=b'9' | b'-') {
// Any other valid case here already resulted in IDHash.
Hash(consume_name(tokenizer))
}
else { Delim('#') }
},
b'$' => {
if tokenizer.starts_with(b"$=") { tokenizer.advance(2); SuffixMatch }
else { tokenizer.advance(1); Delim('$') }
},
b'\'' => consume_string(tokenizer, true),
b'(' => { tokenizer.advance(1); ParenthesisBlock },
b')' => { tokenizer.advance(1); CloseParenthesis },
b'*' => {
if tokenizer.starts_with(b"*=") { tokenizer.advance(2); SubstringMatch }
else { tokenizer.advance(1); Delim('*') }
},
b'+' => {
if (
tokenizer.has_at_least(1)
&& tokenizer.byte_at(1).is_ascii_digit()
) || (
tokenizer.has_at_least(2)
&& tokenizer.byte_at(1) == b'.'
&& tokenizer.byte_at(2).is_ascii_digit()
) {
consume_numeric(tokenizer)
} else {
tokenizer.advance(1);
Delim('+')
}
},
b',' => { tokenizer.advance(1); Comma },
b'-' => {
if (
tokenizer.has_at_least(1)
&& tokenizer.byte_at(1).is_ascii_digit()
) || (
tokenizer.has_at_least(2)
&& tokenizer.byte_at(1) == b'.'
&& tokenizer.byte_at(2).is_ascii_digit()
) {
consume_numeric(tokenizer)
} else if tokenizer.starts_with(b"-->") {
tokenizer.advance(3);
CDC
} else if is_ident_start(tokenizer) {
consume_ident_like(tokenizer)
} else {
tokenizer.advance(1);
Delim('-')
}
},
b'.' => {
if tokenizer.has_at_least(1)
&& tokenizer.byte_at(1).is_ascii_digit() {
consume_numeric(tokenizer)
} else {
tokenizer.advance(1);
Delim('.')
}
}
b'/' => {
if tokenizer.starts_with(b"/*") {
Comment(consume_comment(tokenizer))
} else {
tokenizer.advance(1);
Delim('/')
}
}
b'0'..=b'9' => consume_numeric(tokenizer),
b':' => { tokenizer.advance(1); Colon },
b';' => { tokenizer.advance(1); Semicolon },
b'<' => {
if tokenizer.starts_with(b"<!--") {
tokenizer.advance(4);
CDO
} else {
tokenizer.advance(1);
Delim('<')
}
},
b'@' => {
tokenizer.advance(1);
if is_ident_start(tokenizer) { AtKeyword(consume_name(tokenizer)) }
else { Delim('@') }
},
b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => consume_ident_like(tokenizer),
b'[' => { tokenizer.advance(1); SquareBracketBlock },
b'\\' => {
if !tokenizer.has_newline_at(1) { consume_ident_like(tokenizer) }
else { tokenizer.advance(1); Delim('\\') }
},
b']' => { tokenizer.advance(1); CloseSquareBracket },
b'^' => {
if tokenizer.starts_with(b"^=") { tokenizer.advance(2); PrefixMatch }
else { tokenizer.advance(1); Delim('^') }
},
b'{' => { tokenizer.advance(1); CurlyBracketBlock },
b'|' => {
if tokenizer.starts_with(b"|=") { tokenizer.advance(2); DashMatch }
else { tokenizer.advance(1); Delim('|') }
},
b'}' => { tokenizer.advance(1); CloseCurlyBracket },
b'~' => {
if tokenizer.starts_with(b"~=") { tokenizer.advance(2); IncludeMatch }
else { tokenizer.advance(1); Delim('~') }
},
_ => {
if !b.is_ascii() {
consume_ident_like(tokenizer)
} else {
tokenizer.advance(1);
Delim(b as char)
}
},
};
Ok(token)
}
fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool) -> Token<'a> {
let start_position = tokenizer.position();
if newline {
tokenizer.consume_newline();
} else {
tokenizer.advance(1);
}
while !tokenizer.is_eof() {
let b = tokenizer.next_byte_unchecked();
match_byte! { b,
b' ' | b'\t' => {
tokenizer.advance(1);
}
b'\n' | b'\x0C' | b'\r' => {
tokenizer.consume_newline();
}
_ => {
break
}
}
}
WhiteSpace(tokenizer.slice_from(start_position))
}
// Check for sourceMappingURL or sourceURL comments and update the
// tokenizer appropriately.
fn check_for_source_map<'a>(tokenizer: &mut Tokenizer<'a>, contents: &'a str) {
let directive = "# sourceMappingURL=";
let directive_old = "@ sourceMappingURL=";
// If there is a source map directive, extract the URL.
if contents.starts_with(directive) || contents.starts_with(directive_old) {
let contents = &contents[directive.len()..];
tokenizer.source_map_url = contents
.split(|c| c == ' ' || c == '\t' || c == '\x0C' || c == '\r' || c == '\n')
.next()
}
let directive = "# sourceURL=";
let directive_old = "@ sourceURL=";
// If there is a source URL directive, extract the URL.
if contents.starts_with(directive) || contents.starts_with(directive_old) {
let contents = &contents[directive.len()..];
tokenizer.source_url = contents
.split(|c| c == ' ' || c == '\t' || c == '\x0C' || c == '\r' || c == '\n')
.next()
}
}
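// Example comments this function recognizes (the `contents` argument
// excludes the `/*` and `*/` markers):
//
//     /*# sourceMappingURL=style.css.map */
//     /*@ sourceURL=style.css */
//
// In each case the extracted URL runs up to the first whitespace character.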
fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
tokenizer.advance(2); // consume "/*"
let start_position = tokenizer.position();
while !tokenizer.is_eof() {
match_byte! { tokenizer.next_byte_unchecked(),
b'*' => {
let end_position = tokenizer.position();
tokenizer.advance(1);
if tokenizer.next_byte() == Some(b'/') {
tokenizer.advance(1);
let contents = tokenizer.slice(start_position..end_position);
check_for_source_map(tokenizer, contents);
return contents
}
}
b'\n' | b'\x0C' | b'\r' => {
tokenizer.consume_newline();
}
b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
_ => {
// ASCII or other leading byte.
tokenizer.advance(1);
}
}
}
let contents = tokenizer.slice_from(start_position);
check_for_source_map(tokenizer, contents);
contents
}
fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
match consume_quoted_string(tokenizer, single_quote) {
Ok(value) => QuotedString(value),
Err(value) => BadString(value),
}
}
/// Returns `Err(value)` on syntax error (i.e. an unescaped newline),
/// where `value` is the string consumed up to that point.
fn consume_quoted_string<'a>(
tokenizer: &mut Tokenizer<'a>,
single_quote: bool,
) -> Result<CowRcStr<'a>, CowRcStr<'a>> {
tokenizer.advance(1); // Skip the initial quote
// start_pos is at code point boundary, after " or '
let start_pos = tokenizer.position();
let mut string_bytes;
loop {
if tokenizer.is_eof() {
return Ok(tokenizer.slice_from(start_pos).into());
}
match_byte! { tokenizer.next_byte_unchecked(),
b'"' => {
if !single_quote {
let value = tokenizer.slice_from(start_pos);
tokenizer.advance(1);
return Ok(value.into())
}
tokenizer.advance(1);
}
b'\'' => {
if single_quote {
let value = tokenizer.slice_from(start_pos);
tokenizer.advance(1);
return Ok(value.into())
}
tokenizer.advance(1);
}
b'\\' | b'\0' => {
// * The tokenizer’s input is UTF-8 since it’s `&str`.
// * start_pos is at a code point boundary
// * so is the current position (which is before '\\' or '\0')
//
// So `string_bytes` is well-formed UTF-8.
string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
break
}
b'\n' | b'\r' | b'\x0C' => {
return Err(tokenizer.slice_from(start_pos).into())
},
b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
_ => {
// ASCII or other leading byte.
tokenizer.advance(1);
}
}
}
while !tokenizer.is_eof() {
let b = tokenizer.next_byte_unchecked();
match_byte! { b,
b'\n' | b'\r' | b'\x0C' => {
return Err(
// string_bytes is well-formed UTF-8, see other comments.
unsafe {
from_utf8_release_unchecked(string_bytes)
}.into()
);
}
b'"' => {
tokenizer.advance(1);
if !single_quote {
break;
}
}
b'\'' => {
tokenizer.advance(1);
if single_quote {
break;
}
}
b'\\' => {
tokenizer.advance(1);
if !tokenizer.is_eof() {
match tokenizer.next_byte_unchecked() {
// Escaped newline
b'\n' | b'\x0C' | b'\r' => {
tokenizer.consume_newline();
}
// This pushes one well-formed code point
_ => consume_escape_and_write(tokenizer, &mut string_bytes)
}
}
// else: escaped EOF, do nothing.
continue;
}
b'\0' => {
tokenizer.advance(1);
string_bytes.extend("\u{FFFD}".as_bytes());
continue;
}
b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
_ => {
// ASCII or other leading byte.
tokenizer.advance(1);
},
}
// If this byte is part of a multi-byte code point,
// we’ll end up copying the whole code point before this loop does something else.
string_bytes.push(b);
}
Ok(
// string_bytes is well-formed UTF-8, see other comments.
unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
)
}
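// Note the copy-on-write structure above: the first loop only advances over
// the borrowed input, and a heap buffer is allocated only once an escape or
// NUL byte forces rewriting. Illustrative results (assuming this tokenizer):
//
//     "abc"    -> QuotedString("abc")   (borrowed slice, no allocation)
//     "a\62 c" -> QuotedString("abc")   (owned buffer; `\62 ` decodes to 'b')
//     "abc     -> BadString("abc")      (an unescaped newline follows)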
#[inline]
fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
!tokenizer.is_eof()
&& match_byte! { tokenizer.next_byte_unchecked(),
b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => true,
b'-' => {
tokenizer.has_at_least(1) && match_byte! { tokenizer.byte_at(1),
b'a'..=b'z' | b'A'..=b'Z' | b'-' | b'_' | b'\0' => {
true
}
b'\\' => !tokenizer.has_newline_at(1),
b => !b.is_ascii(),
}
},
b'\\' => !tokenizer.has_newline_at(1),
b => !b.is_ascii(),
}
}
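// Examples of what the predicate above classifies as an identifier start
// (illustrative, read off the byte patterns):
//
//     "width"    -> true  (ASCII letter)
//     "_private" -> true  (underscore)
//     "-webkit-" -> true  ('-' followed by a letter)
//     "--custom" -> true  ('-' followed by another '-')
//     "héllo"    -> true  (non-ASCII leading byte)
//     "-5px"     -> false ('-' followed by a digit is a number instead)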
fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
let value = consume_name(tokenizer);
if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'(' {
tokenizer.advance(1);
if value.eq_ignore_ascii_case("url") {
consume_unquoted_url(tokenizer).unwrap_or(Function(value))
} else {
tokenizer.see_function(&value);
Function(value)
}
} else {
Ident(value)
}
}
fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
// start_pos is the end of the previous token, therefore at a code point boundary
let start_pos = tokenizer.position();
let mut value_bytes;
loop {
if tokenizer.is_eof() {
return tokenizer.slice_from(start_pos).into();
}
match_byte! { tokenizer.next_byte_unchecked(),
b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => tokenizer.advance(1),
b'\\' | b'\0' => {
// * The tokenizer’s input is UTF-8 since it’s `&str`.
// * start_pos is at a code point boundary
// * so is the current position (which is before '\\' or '\0')
//
// So `value_bytes` is well-formed UTF-8.
value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
break
}
b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
b'\xC0'..=b'\xEF' => { tokenizer.advance(1); }
b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
_b => {
return tokenizer.slice_from(start_pos).into();
}
}
}
while !tokenizer.is_eof() {
let b = tokenizer.next_byte_unchecked();
match_byte! { b,
b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => {
tokenizer.advance(1);
value_bytes.push(b) // ASCII
}
b'\\' => {
if tokenizer.has_newline_at(1) { break }
tokenizer.advance(1);
// This pushes one well-formed code point
consume_escape_and_write(tokenizer, &mut value_bytes)
}
b'\0' => {
tokenizer.advance(1);
value_bytes.extend("\u{FFFD}".as_bytes());
},
b'\x80'..=b'\xBF' => {
// This byte *is* part of a multi-byte code point,
// we’ll end up copying the whole code point before this loop does something else.
tokenizer.consume_continuation_byte();
value_bytes.push(b)
}
b'\xC0'..=b'\xEF' => {
// This byte *is* part of a multi-byte code point,
// we’ll end up copying the whole code point before this loop does something else.
tokenizer.advance(1);
value_bytes.push(b)
}
b'\xF0'..=b'\xFF' => {
tokenizer.consume_4byte_intro();
value_bytes.push(b)
}
_ => {
// ASCII
break;
}
}
}
// string_bytes is well-formed UTF-8, see other comments.
unsafe { from_utf8_release_unchecked(value_bytes) }.into()
}
fn byte_to_hex_digit(b: u8) -> Option<u32> {
Some(match_byte! { b,
b'0' ..= b'9' => b - b'0',
b'a' ..= b'f' => b - b'a' + 10,
b'A' ..= b'F' => b - b'A' + 10,
_ => {
return None
}
} as u32)
}
fn byte_to_decimal_digit(b: u8) -> Option<u32> {
if b.is_ascii_digit() {
Some((b - b'0') as u32)
} else {
None
}
}
fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
// Parse [+-]?\d*(\.\d+)?([eE][+-]?\d+)?
// But this is always called so that there is at least one digit in \d*(\.\d+)?
// Do all the math in f64 so that large numbers overflow to +/-inf
// and i32::{MIN, MAX} are within range.
let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
b'-' => (true, -1.),
b'+' => (true, 1.),
_ => (false, 1.),
};
if has_sign {
tokenizer.advance(1);
}
let mut integral_part: f64 = 0.;
while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
integral_part = integral_part * 10. + digit as f64;
tokenizer.advance(1);
if tokenizer.is_eof() {
break;
}
}
let mut is_integer = true;
let mut fractional_part: f64 = 0.;
if tokenizer.has_at_least(1)
&& tokenizer.next_byte_unchecked() == b'.'
&& tokenizer.byte_at(1).is_ascii_digit()
{
is_integer = false;
tokenizer.advance(1); // Consume '.'
let mut factor = 0.1;
while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
fractional_part += digit as f64 * factor;
factor *= 0.1;
tokenizer.advance(1);
if tokenizer.is_eof() {
break;
}
}
}
let mut value = sign * (integral_part + fractional_part);
if tokenizer.has_at_least(1)
&& matches!(tokenizer.next_byte_unchecked(), b'e' | b'E')
&& (tokenizer.byte_at(1).is_ascii_digit()
|| (tokenizer.has_at_least(2)
&& matches!(tokenizer.byte_at(1), b'+' | b'-')
&& tokenizer.byte_at(2).is_ascii_digit()))
{
is_integer = false;
tokenizer.advance(1);
let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
b'-' => (true, -1.),
b'+' => (true, 1.),
_ => (false, 1.),
};
if has_sign {
tokenizer.advance(1);
}
let mut exponent: f64 = 0.;
while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
exponent = exponent * 10. + digit as f64;
tokenizer.advance(1);
if tokenizer.is_eof() {
break;
}
}
value *= f64::powf(10., sign * exponent);
}
let int_value = if is_integer {
Some(if value >= i32::MAX as f64 {
i32::MAX
} else if value <= i32::MIN as f64 {
i32::MIN
} else {
value as i32
})
} else {
None
};
if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'%' {
tokenizer.advance(1);
return Percentage {
unit_value: (value / 100.) as f32,
int_value,
has_sign,
};
}
let value = value as f32;
if is_ident_start(tokenizer) {
let unit = consume_name(tokenizer);
Dimension {
value,
int_value,
has_sign,
unit,
}
} else {
Number {
value,
int_value,
has_sign,
}
}
}
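// Worked examples for the scanner above (values follow the arithmetic in
// this function):
//
//     "42"    -> Number { has_sign: false, value: 42.0, int_value: Some(42) }
//     "+12.5" -> Number { has_sign: true, value: 12.5, int_value: None }
//     "1e3"   -> Number { has_sign: false, value: 1000.0, int_value: None }
//     "30%"   -> Percentage { has_sign: false, unit_value: 0.3, int_value: Some(30) }
//     "12px"  -> Dimension { has_sign: false, value: 12.0, int_value: Some(12), unit: "px" }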
#[inline]
unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {
if cfg!(debug_assertions) {
String::from_utf8(string_bytes).unwrap()
} else {
String::from_utf8_unchecked(string_bytes)
}
}
fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
// This is only called after "url(", so the current position is a code point boundary.
let start_position = tokenizer.position;
let from_start = &tokenizer.input[tokenizer.position..];
let mut newlines = 0;
let mut last_newline = 0;
let mut found_printable_char = false;
let mut iter = from_start.bytes().enumerate();
loop {
let (offset, b) = match iter.next() {
Some(item) => item,
None => {
tokenizer.position = tokenizer.input.len();
break;
}
};
match_byte! { b,
b' ' | b'\t' => {},
b'\n' | b'\x0C' => {
newlines += 1;
last_newline = offset;
}
b'\r' => {
if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
newlines += 1;
last_newline = offset;
}
}
b'"' | b'\'' => return Err(()), // Do not advance
b')' => {
// Don't use advance, because we may be skipping
// newlines here, and we want to avoid the assert.
tokenizer.position += offset + 1;
break
}
_ => {
// Don't use advance, because we may be skipping
// newlines here, and we want to avoid the assert.
tokenizer.position += offset;
found_printable_char = true;
break
}
}
}
if newlines > 0 {
tokenizer.current_line_number += newlines;
// No need for wrapping_add here, because there's no possible
// way to wrap.
tokenizer.current_line_start_position = start_position + last_newline + 1;
}
if found_printable_char {
// This function only consumed ASCII (whitespace) bytes,
// so the current position is a code point boundary.
return Ok(consume_unquoted_url_internal(tokenizer));
} else {
return Ok(UnquotedUrl("".into()));
}
fn consume_unquoted_url_internal<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
// This function is only called with start_pos at a code point boundary.
let start_pos = tokenizer.position();
let mut string_bytes: Vec<u8>;
loop {
if tokenizer.is_eof() {
return UnquotedUrl(tokenizer.slice_from(start_pos).into());
}
match_byte! { tokenizer.next_byte_unchecked(),
b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
let value = tokenizer.slice_from(start_pos);
return consume_url_end(tokenizer, start_pos, value.into())
}
b')' => {
let value = tokenizer.slice_from(start_pos);
tokenizer.advance(1);
return UnquotedUrl(value.into())
}
b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F' // non-printable
| b'"' | b'\'' | b'(' => {
tokenizer.advance(1);
return consume_bad_url(tokenizer, start_pos)
},
b'\\' | b'\0' => {
// * The tokenizer’s input is UTF-8 since it’s `&str`.
// * start_pos is at a code point boundary
// * so is the current position (which is before '\\' or '\0')
//
// So `string_bytes` is well-formed UTF-8.
string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
break
}
b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
_ => {
// ASCII or other leading byte.
tokenizer.advance(1);
}
}
}
while !tokenizer.is_eof() {
let b = tokenizer.next_byte_unchecked();
match_byte! { b,
b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
// string_bytes is well-formed UTF-8, see other comments.
let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
return consume_url_end(tokenizer, start_pos, string)
}
b')' => {
tokenizer.advance(1);
break;
}
b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F' // non-printable
| b'"' | b'\'' | b'(' => {
tokenizer.advance(1);
return consume_bad_url(tokenizer, start_pos);
}
b'\\' => {
tokenizer.advance(1);
if tokenizer.has_newline_at(0) {
return consume_bad_url(tokenizer, start_pos)
}
// This pushes one well-formed code point to string_bytes
consume_escape_and_write(tokenizer, &mut string_bytes)
},
b'\0' => {
tokenizer.advance(1);
string_bytes.extend("\u{FFFD}".as_bytes());
}
b'\x80'..=b'\xBF' => {
// We’ll end up copying the whole code point
// before this loop does something else.
tokenizer.consume_continuation_byte();
string_bytes.push(b);
}
b'\xF0'..=b'\xFF' => {
// We’ll end up copying the whole code point
// before this loop does something else.
tokenizer.consume_4byte_intro();
string_bytes.push(b);
}
// If this byte is part of a multi-byte code point,
// we’ll end up copying the whole code point before this loop does something else.
b => {
// ASCII or other leading byte.
tokenizer.advance(1);
string_bytes.push(b)
}
}
}
UnquotedUrl(
// string_bytes is well-formed UTF-8, see other comments.
unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
)
}
fn consume_url_end<'a>(
tokenizer: &mut Tokenizer<'a>,
start_pos: SourcePosition,
string: CowRcStr<'a>,
) -> Token<'a> {
while !tokenizer.is_eof() {
match_byte! { tokenizer.next_byte_unchecked(),
b')' => {
tokenizer.advance(1);
break
}
b' ' | b'\t' => { tokenizer.advance(1); }
b'\n' | b'\x0C' | b'\r' => {
tokenizer.consume_newline();
}
b => {
tokenizer.consume_known_byte(b);
return consume_bad_url(tokenizer, start_pos);
}
}
}
UnquotedUrl(string)
}
fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>, start_pos: SourcePosition) -> Token<'a> {
// Consume up to the closing )
while !tokenizer.is_eof() {
match_byte! { tokenizer.next_byte_unchecked(),
b')' => {
let contents = tokenizer.slice_from(start_pos).into();
tokenizer.advance(1);
return BadUrl(contents)
}
b'\\' => {
tokenizer.advance(1);
if matches!(tokenizer.next_byte(), Some(b')') | Some(b'\\')) {
tokenizer.advance(1); // Skip an escaped ')' or '\'
}
}
b'\n' | b'\x0C' | b'\r' => {
tokenizer.consume_newline();
}
b => {
tokenizer.consume_known_byte(b);
}
}
}
BadUrl(tokenizer.slice_from(start_pos).into())
}
}
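// Illustrative outcomes for the unquoted-url machinery above:
//
//     url(foo.png)   -> UnquotedUrl("foo.png")
//     url(   )       -> UnquotedUrl("")       (whitespace only)
//     url("foo.png") -> Err(()), so consume_ident_like falls back to
//                       Function("url") and the string is parsed normally
//     url(foo(bar)   -> BadUrl(..)            (unescaped '(' is not allowed)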
// Returns `(value, digits)`: the decoded value and the number of hex digits
// consumed (at most 6).
fn consume_hex_digits(tokenizer: &mut Tokenizer<'_>) -> (u32, u32) {
let mut value = 0;
let mut digits = 0;
while digits < 6 && !tokenizer.is_eof() {
match byte_to_hex_digit(tokenizer.next_byte_unchecked()) {
Some(digit) => {
value = value * 16 + digit;
digits += 1;
tokenizer.advance(1);
}
None => break,
}
}
(value, digits)
}
// Same constraints as consume_escape, except that it writes the result into
// `bytes` instead of returning it.
fn consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>) {
bytes.extend(
consume_escape(tokenizer)
.encode_utf8(&mut [0; 4])
.as_bytes(),
)
}
// Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
// and that the next input character has already been verified
// to not be a newline.
fn consume_escape(tokenizer: &mut Tokenizer) -> char {
if tokenizer.is_eof() {
return '\u{FFFD}';
} // Escaped EOF
match_byte! { tokenizer.next_byte_unchecked(),
b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => {
let (c, _) = consume_hex_digits(tokenizer);
if !tokenizer.is_eof() {
match_byte! { tokenizer.next_byte_unchecked(),
b' ' | b'\t' => {
tokenizer.advance(1)
}
b'\n' | b'\x0C' | b'\r' => {
tokenizer.consume_newline();
}
_ => {}
}
}
static REPLACEMENT_CHAR: char = '\u{FFFD}';
if c != 0 {
let c = char::from_u32(c);
c.unwrap_or(REPLACEMENT_CHAR)
} else {
REPLACEMENT_CHAR
}
},
b'\0' => {
tokenizer.advance(1);
'\u{FFFD}'
}
_ => tokenizer.consume_char(),
}
}
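// Worked examples for the escape decoder above (illustrative):
//
//     \26 x  -> '&' then 'x'  (hex escape; one following space is consumed)
//     \E9    -> 'é'
//     \0     -> '\u{FFFD}'    (NUL is never produced)
//     \D800  -> '\u{FFFD}'    (surrogates are not valid scalar values)
//     \q     -> 'q'           (a non-hex character escapes to itself)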