lib.rs - mozsearch

mozilla-central/modules/libpref/parser/src/lib.rs (file symbol)

Enable keyboard shortcuts

Source code

File a bug in Core :: Preferences: Backend

Revision control

Copy as Markdown

Other Tools

/* This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

//! This crate implements a prefs file parser.

//!

//! Pref files have the following grammar. Note that there are slight

//! differences between the grammar for a default prefs file and a user prefs

//! file.

//!

//! ```text

//! <pref-file>   = <pref>*

//! <pref>        = <pref-spec> "(" <pref-name> "," <pref-value> <pref-attrs> ")" ";"

//! <pref-spec>   = "user_pref" | "pref" | "sticky_pref" // in default pref files

//! <pref-spec>   = "user_pref"                          // in user pref files

//! <pref-name>   = <string-literal>

//! <pref-value>  = <string-literal> | "true" | "false" | <int-value>

//! <int-value>   = <sign>? <int-literal>

//! <sign>        = "+" | "-"

//! <int-literal> = [0-9]+ (and cannot be followed by [A-Za-z_])

//! <string-literal> =

//!   A single or double-quoted string, with the following escape sequences

//!   allowed: \", \', \\, \n, \r, \xNN, \uNNNN, where \xNN gives a raw byte

//!   value that is copied directly into an 8-bit string value, and \uNNNN

//!   gives a UTF-16 code unit that is converted to UTF-8 before being copied

//!   into an 8-bit string value. \x00 and \u0000 are disallowed because they

//!   would cause C++ code handling such strings to misbehave.

//! <pref-attrs>  = ("," <pref-attr>)*      // in default pref files

//!               = <empty>                 // in user pref files

//! <pref-attr>   = "sticky" | "locked"     // default pref files only

//! ```

//!

//! Comments can take three forms:

//! - `# Python-style comments`

//! - `// C++ style comments`

//! - `/* C style comments (non-nested) */`

//!

//! Non-end-of-line whitespace chars are `\t`, `\v`, `\f`, and space.

//!

//! End-of-line sequences can take three forms, each of which is considered as

//! a single EOL:

//! - `\n`

//! - `\r` (without subsequent `\n`)

//! - `\r\n`

//!

//! The valid range for `<int-value>` is -2,147,483,648..2,147,483,647. Values

//! outside that range will result in a parse error.

//!

//! A `\0` char is interpreted as the end of the file. The use of this character

//! in a prefs file is not recommended. Note that the escape sequences `\x00`
//! and `\u0000` are also disallowed within string literals (see above).

//!

//! The parser performs error recovery. On a syntax error, it will scan forward

//! to the next `;` token and then continue parsing. If the syntax error occurs

//! in the middle of a token, it will first finish obtaining the current token

//! in an appropriate fashion.

// This parser uses several important optimizations.

//

// - Because "`\0` means EOF" is part of the grammar (see above), EOF is

//   representable by a u8. If EOF was represented by an out-of-band value such

//   as -1 or 256, we'd have to return a larger type such as `u16` or `i16`

//   from `get_char()`.

//

// - When starting a new token, it uses a lookup table with the first char,

//   which quickly identifies what kind of token it will be. Furthermore, if

//   that token is an unambiguous single-char token (e.g. `(`, `)`, `+`, `,`,

//   `-`, `;`), the parser will return the appropriate token kind value at

//   minimal cost because the single-char tokens have a uniform representation.

//

// - It has a lookup table that identifies chars in string literals that need

//   special handling. This means non-special chars (the common case) can be

//   handled with a single test, rather than testing for the multiple special

//   cases.

//

// - It pre-scans string literals for special chars. If none are present, it

//   bulk copies the string literal into a Vec, which is faster than doing a

//   char-by-char copy.

//

// - It reuses Vecs to avoid creating a new one for each string literal.

use std::os::raw::{c_char, c_uchar};

//---------------------------------------------------------------------------

// The public interface

//---------------------------------------------------------------------------

/// The type of value a pref holds, as reported to the pref callback.
///
/// Keep this in sync with PrefType in Preferences.cpp.
#[derive(Clone, Copy, Debug, PartialEq)]
#[repr(u8)]
pub enum PrefType {
    None,
    String,
    Int,
    Bool,
}

/// Whether a pref value is a default value or a user value. Also used to
/// identify whether the file being parsed is a default prefs file or a user
/// prefs file.
///
/// Keep this in sync with PrefValueKind in Preferences.h.
#[derive(Clone, Copy, Debug, PartialEq)]
#[repr(u8)]
pub enum PrefValueKind {
    Default,
    User,
}

/// The value of a pref. Which member is valid is indicated by the `PrefType`
/// that accompanies the value.
///
/// Keep this in sync with PrefValue in Preferences.cpp.
#[repr(C)]
pub union PrefValue {
    pub string_val: *const c_char,
    pub int_val: i32,
    pub bool_val: bool,
}

/// Callback invoked by the parser once for each successfully parsed pref.
///
/// `pref_type` says which member of `pref_value` is meaningful: `bool_val`
/// for `PrefType::Bool`, `int_val` for `PrefType::Int` and `string_val` for
/// `PrefType::String`. The `pref_name` and `string_val` pointers refer to
/// buffers that the parser reuses for later prefs, so they should be treated
/// as valid only for the duration of the call.
///
/// Keep this in sync with PrefsParserPrefFn in Preferences.cpp.
type PrefFn = unsafe extern "C" fn(
    pref_name: *const c_char,
    pref_type: PrefType,
    pref_value_kind: PrefValueKind,
    pref_value: PrefValue,
    is_sticky: bool,
    is_locked: bool,
);

/// Callback invoked by the parser once for each parse error. `msg` is a
/// null-terminated message of the form `<path>:<line>: prefs parse error:
/// <description>`; it is freed when the callback returns.
///
/// Keep this in sync with PrefsParserErrorFn in Preferences.cpp.
type ErrorFn = unsafe extern "C" fn(msg: *const c_char);

/// Parse the contents of a prefs file.

///

/// `buf` is a null-terminated string. `len` is its length, excluding the

/// null terminator.

///

/// `pref_fn` is called once for each successfully parsed pref.

///

/// `error_fn` is called once for each parse error detected.

///

/// Keep this in sync with the prefs_parser_parse() declaration in

/// Preferences.cpp.

#[no_mangle]

pub unsafe extern "C" fn prefs_parser_parse(

    path: *const c_char,

    kind: PrefValueKind,

    buf: *const c_char,

    len: usize,

    pref_fn: PrefFn,

    error_fn: ErrorFn,

) -> bool {

    let path = std::ffi::CStr::from_ptr(path)

        .to_string_lossy()

        .into_owned();

    // Make sure `buf` ends in a '\0', and include that in the length, because

    // it represents EOF.

    let buf = std::slice::from_raw_parts(buf as *const c_uchar, len + 1);

    assert!(buf.last() == Some(&EOF));

    let mut parser = Parser::new(&path, kind, &buf, pref_fn, error_fn);

    parser.parse()

//---------------------------------------------------------------------------

// The implementation

//---------------------------------------------------------------------------

/// A lexical token produced by the tokenizer.
#[derive(Clone, Copy, Debug, PartialEq)]
enum Token {
    // Unambiguous single-char tokens.
    SingleChar(u8),

    // Keywords
    Pref,       // pref
    StickyPref, // sticky_pref
    UserPref,   // user_pref
    True,       // true
    False,      // false
    Sticky,     // sticky
    Locked,     // locked

    // String literal, e.g. '"string"'. The value is stored elsewhere.
    String,

    // Unsigned integer literal, e.g. '123'. Although libpref uses i32 values,
    // any '-' and '+' before an integer literal are treated as separate
    // tokens, so these token values are always positive. Furthermore, we
    // tokenize int literals as u32 so that 2147483648 (which doesn't fit into
    // an i32) can be subsequently negated to -2147483648 (which does fit into
    // an i32) if a '-' token precedes it.
    Int(u32),

    // Malformed token.
    Error(&'static str),

    // Malformed token at a particular line number. For use when
    // Parser::line_num might not be the right line number when the error is
    // reported. E.g. if a multi-line string has a bad escape sequence on the
    // first line, we don't report the error until the string's end has been
    // reached.
    ErrorAtLine(&'static str, u32),
}

// We categorize every char by what action should be taken when it appears at
// the start of a new token.
#[derive(Clone, Copy, PartialEq)]
enum CharKind {
    // These are ordered by frequency. See the comment in get_token().
    SingleChar, // Unambiguous single-char tokens: `(`, `)`, `+`, `,`, `-`, `;` or EOF
    SpaceNL,    // [\t\v\f \n]
    Keyword,    // [A-Za-z_]
    Quote,      // ["']
    Slash,      // /
    Digit,      // [0-9]
    Hash,       // #
    CR,         // \r
    Other,      // Everything else; invalid except within strings and comments.
}

// Fixed-width aliases for the CharKind variants, so that the entries of the
// CHAR_KINDS table line up in columns.
const C_SINGL: CharKind = CharKind::SingleChar;
const C_SPCNL: CharKind = CharKind::SpaceNL;
const C_KEYWD: CharKind = CharKind::Keyword;
const C_QUOTE: CharKind = CharKind::Quote;
const C_SLASH: CharKind = CharKind::Slash;
const C_DIGIT: CharKind = CharKind::Digit;
const C_HASH_: CharKind = CharKind::Hash;
const C_CR___: CharKind = CharKind::CR;
const C______: CharKind = CharKind::Other;

// Maps every possible byte value to the CharKind that decides how a token
// starting with that byte is handled. The row label plus the column header
// gives the byte's decimal value, e.g. row "40+" column 7 is byte 47 ('/').
#[rustfmt::skip]
const CHAR_KINDS: [CharKind; 256] = [
/*         0        1        2        3        4        5        6        7        8        9    */
/*   0+ */ C_SINGL, C______, C______, C______, C______, C______, C______, C______, C______, C_SPCNL,
/*  10+ */ C_SPCNL, C_SPCNL, C_SPCNL, C_CR___, C______, C______, C______, C______, C______, C______,
/*  20+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/*  30+ */ C______, C______, C_SPCNL, C______, C_QUOTE, C_HASH_, C______, C______, C______, C_QUOTE,
/*  40+ */ C_SINGL, C_SINGL, C______, C_SINGL, C_SINGL, C_SINGL, C______, C_SLASH, C_DIGIT, C_DIGIT,
/*  50+ */ C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C______, C_SINGL,
/*  60+ */ C______, C______, C______, C______, C______, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/*  70+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/*  80+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/*  90+ */ C_KEYWD, C______, C______, C______, C______, C_KEYWD, C______, C_KEYWD, C_KEYWD, C_KEYWD,
/* 100+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/* 110+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/* 120+ */ C_KEYWD, C_KEYWD, C_KEYWD, C______, C______, C______, C______, C______, C______, C______,
/* 130+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 140+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 150+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 160+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 170+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 180+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 190+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 200+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 210+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 220+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 230+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 240+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 250+ */ C______, C______, C______, C______, C______, C______
];

// Fixed-width alias for `false`, so that the few `true` entries in the table
// below stand out.
const _______: bool = false;

// Marks the bytes that need special handling when they appear inside a string
// literal. The `true` entries are bytes 0 (EOF), 10 (`\n`), 13 (`\r`),
// 34 (`"`), 39 (`'`) and 92 (`\`). The row label plus the column header gives
// the byte's decimal value.
#[rustfmt::skip]
const SPECIAL_STRING_CHARS: [bool; 256] = [
/*         0        1        2        3        4        5        6        7        8        9    */
/*   0+ */    true, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  10+ */    true, _______, _______,    true, _______, _______, _______, _______, _______, _______,
/*  20+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  30+ */ _______, _______, _______, _______,    true, _______, _______, _______, _______,    true,
/*  40+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  50+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  60+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  70+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  80+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  90+ */ _______, _______,    true, _______, _______, _______, _______, _______, _______, _______,
/* 100+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 110+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 120+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 130+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 140+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 150+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 160+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 170+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 180+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 190+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 200+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 210+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 220+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 230+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 240+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 250+ */ _______, _______, _______, _______, _______, _______
];

struct KeywordInfo {

    string: &'static [u8],

    token: Token,

// All the keywords the tokenizer recognizes. The tokenizer compares a
// candidate keyword against these entries in order and returns the token of
// the first exact match.
const KEYWORD_INFOS: [KeywordInfo; 7] = [
    // These are ordered by frequency.
    KeywordInfo {
        string: b"pref",
        token: Token::Pref,
    },
    KeywordInfo {
        string: b"true",
        token: Token::True,
    },
    KeywordInfo {
        string: b"false",
        token: Token::False,
    },
    KeywordInfo {
        string: b"user_pref",
        token: Token::UserPref,
    },
    KeywordInfo {
        string: b"sticky",
        token: Token::Sticky,
    },
    KeywordInfo {
        string: b"locked",
        token: Token::Locked,
    },
    KeywordInfo {
        string: b"sticky_pref",
        token: Token::StickyPref,
    },
];

struct Parser<'t> {

    path: &'t str,       // Path to the file being parsed. Used in error messages.

    kind: PrefValueKind, // Default prefs file or user prefs file?

    buf: &'t [u8],       // Text being parsed.

    i: usize,            // Index of next char to be read.

    line_num: u32,       // Current line number within the text.

    pref_fn: PrefFn,     // Callback for processing each pref.

    error_fn: ErrorFn,   // Callback for parse errors.

    has_errors: bool,    // Have we encountered errors?

// As described above, we use 0 to represent EOF. The buffer handed to the
// parser always ends with this byte.
const EOF: u8 = b'\0';

impl<'t> Parser<'t> {

    fn new(

        path: &'t str,

        kind: PrefValueKind,

        buf: &'t [u8],

        pref_fn: PrefFn,

        error_fn: ErrorFn,

    ) -> Parser<'t> {

        // Make sure these tables take up 1 byte per entry.

        assert!(std::mem::size_of_val(&CHAR_KINDS) == 256);

        assert!(std::mem::size_of_val(&SPECIAL_STRING_CHARS) == 256);

        Parser {

            path: path,

            kind: kind,

            buf: buf,

            i: 0,

            line_num: 1,

            pref_fn: pref_fn,

            error_fn: error_fn,

            has_errors: false,

    fn parse(&mut self) -> bool {

        // These are reused, because allocating a new Vec for every string is slow.

        let mut name_str = Vec::with_capacity(128); // For pref names.

        let mut value_str = Vec::with_capacity(512); // For string pref values.

        let mut none_str = Vec::with_capacity(0); // For tokens that shouldn't be strings.

        let mut token = self.get_token(&mut none_str);

        // At the top of the loop we already have a token. In a valid input

        // this will be either the first token of a new pref, or EOF.

        loop {

            // <pref-spec>

            let (pref_value_kind, mut is_sticky) = match token {

                Token::Pref if self.kind == PrefValueKind::Default => {

                    (PrefValueKind::Default, false)

                Token::StickyPref if self.kind == PrefValueKind::Default => {

                    (PrefValueKind::Default, true)

                Token::UserPref => (PrefValueKind::User, false),

                Token::SingleChar(EOF) => return !self.has_errors,

                _ => {

                    token = self.error_and_recover(

                        token,

                        if self.kind == PrefValueKind::Default {

                            "expected pref specifier at start of pref definition"

                        } else {

                            "expected 'user_pref' at start of pref definition"

},

);

                    continue;

};

            // "("

            token = self.get_token(&mut none_str);

            if token != Token::SingleChar(b'(') {

                token = self.error_and_recover(token, "expected '(' after pref specifier");

                continue;

            // <pref-name>

            token = self.get_token(&mut name_str);

            let pref_name = if token == Token::String {

                &name_str

            } else {

                token = self.error_and_recover(token, "expected pref name after '('");

                continue;

};

            // ","

            token = self.get_token(&mut none_str);

            if token != Token::SingleChar(b',') {

                token = self.error_and_recover(token, "expected ',' after pref name");

                continue;

            // <pref-value>

            token = self.get_token(&mut value_str);

            let (pref_type, pref_value) = match token {

                Token::True => (PrefType::Bool, PrefValue { bool_val: true }),

                Token::False => (PrefType::Bool, PrefValue { bool_val: false }),

                Token::String => (

                    PrefType::String,

                    PrefValue {

                        string_val: value_str.as_ptr() as *const c_char,

},

),

                Token::Int(u) => {

                    // Accept u <= 2147483647; anything larger will overflow i32.

                    if u <= std::i32::MAX as u32 {

                        (PrefType::Int, PrefValue { int_val: u as i32 })

                    } else {

                        token =

                            self.error_and_recover(Token::Error("integer literal overflowed"), "");

                        continue;

                Token::SingleChar(b'-') => {

                    token = self.get_token(&mut none_str);

                    if let Token::Int(u) = token {

                        // Accept u <= 2147483648; anything larger will overflow i32 once negated.

                        if u <= std::i32::MAX as u32 {

                                PrefType::Int,

                                PrefValue {

                                    int_val: -(u as i32),

},

                        } else if u == std::i32::MAX as u32 + 1 {

                                PrefType::Int,

                                PrefValue {

                                    int_val: std::i32::MIN,

},

                        } else {

                            token = self

                                .error_and_recover(Token::Error("integer literal overflowed"), "");

                            continue;

                    } else {

                        token = self.error_and_recover(token, "expected integer literal after '-'");

                        continue;

                Token::SingleChar(b'+') => {

                    token = self.get_token(&mut none_str);

                    if let Token::Int(u) = token {

                        // Accept u <= 2147483647; anything larger will overflow i32.

                        if u <= std::i32::MAX as u32 {

                            (PrefType::Int, PrefValue { int_val: u as i32 })

                        } else {

                            token = self

                                .error_and_recover(Token::Error("integer literal overflowed"), "");

                            continue;

                    } else {

                        token = self.error_and_recover(token, "expected integer literal after '+'");

                        continue;

                _ => {

                    token = self.error_and_recover(token, "expected pref value after ','");

                    continue;

};

            // ("," <pref-attr>)*   // default pref files only

            let mut is_locked = false;

            let mut has_attrs = false;

            if self.kind == PrefValueKind::Default {

                let ok = loop {

                    // ","

                    token = self.get_token(&mut none_str);

                    if token != Token::SingleChar(b',') {

                        break true;

                    // <pref-attr>

                    token = self.get_token(&mut none_str);

                    match token {

                        Token::Sticky => is_sticky = true,

                        Token::Locked => is_locked = true,

                        _ => {

                            token =

                                self.error_and_recover(token, "expected pref attribute after ','");

                            break false;

                    has_attrs = true;

};

                if !ok {

                    continue;

            } else {

                token = self.get_token(&mut none_str);

            // ")"

            if token != Token::SingleChar(b')') {

                let expected_msg = if self.kind == PrefValueKind::Default {

                    if has_attrs {

                        "expected ',' or ')' after pref attribute"

                    } else {

                        "expected ',' or ')' after pref value"

                } else {

                    "expected ')' after pref value"

};

                token = self.error_and_recover(token, expected_msg);

                continue;

            // ";"

            token = self.get_token(&mut none_str);

            if token != Token::SingleChar(b';') {

                token = self.error_and_recover(token, "expected ';' after ')'");

                continue;

            unsafe {

                (self.pref_fn)(

                    pref_name.as_ptr() as *const c_char,

                    pref_type,

                    pref_value_kind,

                    pref_value,

                    is_sticky,

                    is_locked,

};

            token = self.get_token(&mut none_str);

    fn error_and_recover(&mut self, token: Token, msg: &str) -> Token {

        self.has_errors = true;

        // If `token` is a Token::{Error,ErrorAtLine}, it's a lexing error and

        // the error message is within `token`. Otherwise, it's a parsing error

        // and the error message is in `msg`.

        let (msg, line_num) = match token {

            Token::Error(token_msg) => (token_msg, self.line_num),

            Token::ErrorAtLine(token_msg, line_num) => (token_msg, line_num),

            _ => (msg, self.line_num),

};

        let msg = format!("{}:{}: prefs parse error: {}", self.path, line_num, msg);

        let msg = std::ffi::CString::new(msg).unwrap();

        unsafe { (self.error_fn)(msg.as_ptr() as *const c_char) };

        // "Panic-mode" recovery: consume tokens until one of the following

        // occurs.

        // - We hit a semicolon, whereupon we return the following token.

        // - We hit EOF, whereupon we return EOF.

//

        // For this to work, if the lexing functions hit EOF in an error case

        // they must unget it so we can safely reget it here.

//

        // If the starting token (passed in above) is EOF we must not get

        // another token otherwise we will read past the end of `self.buf`.

        let mut dummy_str = Vec::with_capacity(128);

        let mut token = token;

        loop {

            match token {

                Token::SingleChar(b';') => return self.get_token(&mut dummy_str),

                Token::SingleChar(EOF) => return token,

                _ => {}

            token = self.get_token(&mut dummy_str);

    #[inline(always)]

    fn get_char(&mut self) -> u8 {

        // We do the bounds check ourselves so we can return EOF on failure.

        // (Although the buffer is guaranteed to end in an EOF char, we might

        // go one char past that, whereupon we must return EOF again.)

        if self.i < self.buf.len() {

            let c = unsafe { *self.buf.get_unchecked(self.i) };

            self.i += 1;

        } else {

            debug_assert!(self.i == self.buf.len());

EOF

    // This function skips the bounds check in optimized builds. Using it at

    // the hottest two call sites gives a ~15% parsing speed boost.

    #[inline(always)]

    unsafe fn get_char_unchecked(&mut self) -> u8 {

        debug_assert!(self.i < self.buf.len());

        let c = *self.buf.get_unchecked(self.i);

        self.i += 1;

    #[inline(always)]

    fn unget_char(&mut self) {

        debug_assert!(self.i > 0);

        self.i -= 1;

    #[inline(always)]

    fn match_char(&mut self, c: u8) -> bool {

        if self.buf[self.i] == c {

            self.i += 1;

            return true;

        false

    #[inline(always)]

    fn match_single_line_comment(&mut self) {

        loop {

            // To reach here, the previous char must have been '/' (if this is

            // the first loop iteration) or non-special (if this is the second

            // or subsequent iteration), and assertions elsewhere ensure that

            // there must be at least one subsequent char after those chars

            // (the '\0' for EOF).

            let c = unsafe { self.get_char_unchecked() };

            // All the special chars have value <= b'\r'.

            if c > b'\r' {

                continue;

            match c {

                b'\n' => {

                    self.line_num += 1;

                    break;

                b'\r' => {

                    self.line_num += 1;

                    self.match_char(b'\n');

                    break;

                EOF => {

                    break;

                _ => continue,

    // Returns false if we hit EOF without closing the comment.

    fn match_multi_line_comment(&mut self) -> bool {

        loop {

            match self.get_char() {

                b'*' => {

                    if self.match_char(b'/') {

                        return true;

                b'\n' => {

                    self.line_num += 1;

                b'\r' => {

                    self.line_num += 1;

                    self.match_char(b'\n');

                EOF => return false,

                _ => continue,

    fn match_hex_digits(&mut self, ndigits: i32) -> Option<u16> {

        debug_assert!(ndigits == 2 || ndigits == 4);

        let mut value: u16 = 0;

        for _ in 0..ndigits {

            value = value << 4;

            match self.get_char() {

                c @ b'0'..=b'9' => value += (c - b'0') as u16,

                c @ b'A'..=b'F' => value += (c - b'A') as u16 + 10,

                c @ b'a'..=b'f' => value += (c - b'a') as u16 + 10,

                _ => {

                    self.unget_char();

                    return None;

        Some(value)

    #[inline(always)]

    fn char_kind(c: u8) -> CharKind {

        // Use get_unchecked() because a u8 index cannot exceed this table's

        // bounds.

        unsafe { *CHAR_KINDS.get_unchecked(c as usize) }

    #[inline(always)]

    fn is_special_string_char(c: u8) -> bool {

        // Use get_unchecked() because a u8 index cannot exceed this table's

        // bounds.

        unsafe { *SPECIAL_STRING_CHARS.get_unchecked(c as usize) }

    // If the obtained Token has a value, it is put within the Token, unless

    // it's a string, in which case it's put in `str_buf`. This avoids

    // allocating a new Vec for every string, which is slow.

    fn get_token(&mut self, str_buf: &mut Vec<u8>) -> Token {

        loop {

            // Note: the following tests are ordered by frequency when parsing

            // greprefs.js:

            // - SingleChar      36.7%

            // - SpaceNL         27.7% (14.9% for spaces, 12.8% for NL)

            // - Keyword         13.4%

            // - Quote           11.4%

            // - Slash            8.1%

            // - Digit            2.7%

            // - Hash, CR, Other  0.0%

            let c = self.get_char();

            match Parser::char_kind(c) {

                CharKind::SingleChar => {

                    return Token::SingleChar(c);

                CharKind::SpaceNL => {

                    // It's slightly faster to combine the handling of the

                    // space chars with NL than to handle them separately; we

                    // have an extra test for this case, but one fewer test for

                    // all the subsequent CharKinds.

                    if c == b'\n' {

                        self.line_num += 1;

                    continue;

                CharKind::Keyword => {

                    let start = self.i - 1;

                    loop {

                        let c = self.get_char();

                        if Parser::char_kind(c) != CharKind::Keyword {

                            self.unget_char();

                            break;

                    for info in KEYWORD_INFOS.iter() {

                        if &self.buf[start..self.i] == info.string {

                            return info.token;

                    return Token::Error("unknown keyword");

                CharKind::Quote => {

                    return self.get_string_token(c, str_buf);

                CharKind::Slash => {

                    match self.get_char() {

                        b'/' => {

                            self.match_single_line_comment();

                        b'*' => {

                            if !self.match_multi_line_comment() {

                                return Token::Error("unterminated /* comment");

                        c @ _ => {

                            if c == b'\n' || c == b'\r' {

                                // Unget the newline char; the outer loop will

                                // reget it and adjust self.line_num

                                // appropriately.

                                self.unget_char();

                            return Token::Error("expected '/' or '*' after '/'");

                    continue;

                CharKind::Digit => {

                    let mut value = Some((c - b'0') as u32);

                    loop {

                        let c = self.get_char();

                        match Parser::char_kind(c) {

                            CharKind::Digit => {

                                fn add_digit(value: Option<u32>, c: u8) -> Option<u32> {

                                    value?.checked_mul(10)?.checked_add((c - b'0') as u32)

                                value = add_digit(value, c);

                            CharKind::Keyword => {

                                // Reject things like "123foo". Error recovery

                                // will retokenize from "foo" onward.

                                self.unget_char();

                                return Token::Error("unexpected character in integer literal");

                            _ => {

                                self.unget_char();

                                break;

                    return match value {

                        Some(v) => Token::Int(v),

                        None => Token::Error("integer literal overflowed"),

};

                CharKind::Hash => {

                    self.match_single_line_comment();

                    continue;

                CharKind::CR => {

                    self.match_char(b'\n');

                    self.line_num += 1;

                    continue;

                // Error recovery will retokenize from the next character.

                _ => return Token::Error("unexpected character"),

    fn string_error_token(&self, token: &mut Token, msg: &'static str) {

        // We only want to capture the first tokenization error within a string.

        if *token == Token::String {

            *token = Token::ErrorAtLine(msg, self.line_num);

    // Always inline this because it has a single call site.

    #[inline(always)]

    fn get_string_token(&mut self, quote_char: u8, str_buf: &mut Vec<u8>) -> Token {

        // First scan through the string to see if it contains any chars that

        // need special handling.

        let start = self.i;

        let has_special_chars = loop {

            // To reach here, the previous char must have been a quote

            // (quote_char), and assertions elsewhere ensure that there must be

            // at least one subsequent char (the '\0' for EOF).

            let c = unsafe { self.get_char_unchecked() };

            if Parser::is_special_string_char(c) {

                break c != quote_char;

};

        // Clear str_buf's contents without changing its capacity.

        str_buf.clear();

        // If there are no special chars (the common case), we can bulk copy it

        // to str_buf. This is a lot faster than the char-by-char loop below.

        if !has_special_chars {

            str_buf.extend(&self.buf[start..self.i - 1]);

            str_buf.push(b'\0');

            return Token::String;

        // There were special chars. Re-scan the string, filling in str_buf one

        // char at a time.

//

        // On error, we change `token` to an error token and then keep going to

        // the end of the string literal. `str_buf` won't be used in that case.

        self.i = start;

        let mut token = Token::String;

        loop {

            let c = self.get_char();

            let c2 = if !Parser::is_special_string_char(c) {

            } else if c == quote_char {

                break;

            } else if c == b'\\' {

                match self.get_char() {

                    b'\"' => b'\"',

                    b'\'' => b'\'',

                    b'\\' => b'\\',

                    b'n' => b'\n',

                    b'r' => b'\r',

                    b'x' => {

                        if let Some(value) = self.match_hex_digits(2) {

                            debug_assert!(value <= 0xff);

                            if value != 0 {

                                value as u8

                            } else {

                                self.string_error_token(&mut token, "\\x00 is not allowed");

                                continue;

                        } else {

                            self.string_error_token(&mut token, "malformed \\x escape sequence");

                            continue;

                    b'u' => {

                        if let Some(value) = self.match_hex_digits(4) {

                            let mut utf16 = vec![value];

                            if 0xd800 == (0xfc00 & value) {

                                // High surrogate value. Look for the low surrogate value.

                                if self.match_char(b'\\') && self.match_char(b'u') {

                                    if let Some(lo) = self.match_hex_digits(4) {

                                        if 0xdc00 == (0xfc00 & lo) {

                                            // Found a valid low surrogate.

                                            utf16.push(lo);

                                        } else {

                                            self.string_error_token(

                                                &mut token,

                                                "invalid low surrogate after high surrogate",

);

                                            continue;

                                if utf16.len() != 2 {

                                    self.string_error_token(

                                        &mut token,

                                        "expected low surrogate after high surrogate",

);

                                    continue;

                            } else if 0xdc00 == (0xfc00 & value) {

                                // Unaccompanied low surrogate value.

                                self.string_error_token(

                                    &mut token,

                                    "expected high surrogate before low surrogate",

);

                                continue;

                            } else if value == 0 {

                                self.string_error_token(&mut token, "\\u0000 is not allowed");

                                continue;

                            // Insert the UTF-16 sequence as UTF-8.

                            let utf8 = String::from_utf16(&utf16).unwrap();

                            str_buf.extend(utf8.as_bytes());

                        } else {

                            self.string_error_token(&mut token, "malformed \\u escape sequence");

                            continue;

                        continue; // We don't want to str_buf.push(c2) below.

                    c @ _ => {

                        if c == b'\n' || c == b'\r' {

                            // Unget the newline char; the outer loop will

                            // reget it and adjust self.line_num appropriately.

                            self.unget_char();

                        self.string_error_token(

                            &mut token,

                            "unexpected escape sequence character after '\\'",

);

                        continue;

            } else if c == b'\n' {

                self.line_num += 1;

            } else if c == b'\r' {

                self.line_num += 1;

                if self.match_char(b'\n') {

                    str_buf.push(b'\r');

                    b'\n'

                } else {

            } else if c == EOF {

                self.string_error_token(&mut token, "unterminated string literal");

                break;

            } else {

                // This case is only hit for the non-closing quote char.

                debug_assert!((c == b'\'' || c == b'\"') && c != quote_char);

};

            str_buf.push(c2);

        str_buf.push(b'\0');

        token