Source code

Revision control

Other Tools

1
/* This Source Code Form is subject to the terms of the Mozilla Public
2
* License, v. 2.0. If a copy of the MPL was not distributed with this
3
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5
//! This crate implements a prefs file parser.
6
//!
7
//! Pref files have the following grammar. Note that there are slight
8
//! differences between the grammar for a default prefs files and a user prefs
9
//! file.
10
//!
11
//! <pref-file> = <pref>*
12
//! <pref> = <pref-spec> "(" <pref-name> "," <pref-value> <pref-attrs> ")" ";"
13
//! <pref-spec> = "user_pref" | "pref" | "sticky_pref"
14
//! <pref-name> = <string-literal>
15
//! <pref-value> = <string-literal> | "true" | "false" | <int-value>
16
//! <int-value> = <sign>? <int-literal>
17
//! <sign> = "+" | "-"
18
//! <int-literal> = [0-9]+ (and cannot be followed by [A-Za-z_])
19
//! <string-literal> =
20
//! A single or double-quoted string, with the following escape sequences
21
//! allowed: \", \', \\, \n, \r, \xNN, \uNNNN, where \xNN gives a raw byte
22
//! value that is copied directly into an 8-bit string value, and \uNNNN
23
//! gives a UTF-16 code unit that is converted to UTF-8 before being copied
24
//! into an 8-bit string value. \x00 and \u0000 are disallowed because they
25
//! would cause C++ code handling such strings to misbehave.
26
//! <pref-attrs> = ("," <pref-attr>)* // in default pref files
27
//! = <empty> // in user pref files
28
//! <pref-attr> = "sticky" | "locked" // default pref files only
29
//!
30
//! Comments can take three forms:
31
//! - # Python-style comments
32
//! - // C++ style comments
33
//! - /* C style comments (non-nested) */
34
//!
35
//! Non-end-of-line whitespace chars are \t, \v, \f, and space.
36
//!
37
//! End-of-line sequences can take three forms, each of which is considered as a
38
//! single EoL:
39
//! - \n
40
//! - \r (without subsequent \n)
41
//! - \r\n
42
//!
43
//! The valid range for <int-value> is -2,147,483,648..2,147,483,647. Values
44
//! outside that range will result in a parse error.
45
//!
46
//! A '\0' char is interpreted as the end of the file. The use of this character
47
//! in a prefs file is not recommended. Note that the \x00 and \u0000 escapes
//! are not a substitute within string literals: both are rejected (see
//! <string-literal> above).
49
//!
50
//! The parser performs error recovery. On a syntax error, it will scan forward
51
//! to the next ';' token and then continue parsing. If the syntax error occurs
52
//! in the middle of a token, it will first finish obtaining the current token
53
//! in an appropriate fashion.
54
55
// This parser uses several important optimizations.
56
//
57
// - Because "'\0' means EOF" is part of the grammar (see above), EOF is
58
// representable by a u8. If EOF was represented by an out-of-band value such
59
// as -1 or 256, we'd have to return a larger type such as u16 or i16 from
60
// get_char().
61
//
62
// - When starting a new token, it uses a lookup table with the first char,
63
// which quickly identifies what kind of token it will be. Furthermore, if
64
// that token is an unambiguous single-char token (e.g. '(', ')', '+', ',',
65
// '-', ';'), the parser will return the appropriate token kind value at
66
// minimal cost because the single-char tokens have a uniform representation.
67
//
68
// - It has a lookup table that identifies chars in string literals that need
69
// special handling. This means non-special chars (the common case) can be
70
// handled with a single test, rather than testing for the multiple special
71
// cases.
72
//
73
// - It pre-scans string literals for special chars. If none are present, it
74
// bulk copies the string literal into a Vec, which is faster than doing a
75
// char-by-char copy.
76
//
77
// - It reuses Vecs to avoid creating a new one for each string literal.
78
79
use std::os::raw::{c_char, c_uchar};
80
81
//---------------------------------------------------------------------------
82
// The public interface
83
//---------------------------------------------------------------------------
84
85
/// The type tag passed to the pref callback alongside each parsed value.
///
/// Keep this in sync with PrefType in Preferences.cpp. The enum is
/// `#[repr(u8)]`, so the variant order below fixes the byte values
/// (`None` = 0, `String` = 1, `Int` = 2, `Bool` = 3).
#[repr(u8)]
#[derive(Clone, Copy, Debug)]
pub enum PrefType {
    None,
    String,
    Int,
    Bool,
}
94
95
/// Distinguishes default pref values from user pref values.
///
/// Keep this in sync with PrefValueKind in Preferences.h. The enum is
/// `#[repr(u8)]`: `Default` = 0, `User` = 1.
#[repr(u8)]
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum PrefValueKind {
    Default,
    User,
}
102
103
/// The payload of a parsed pref, handed to the pref callback by value.
///
/// Keep this in sync with PrefValue in Preferences.cpp. This is an untagged
/// C-layout union; which field is valid is indicated by the accompanying
/// `PrefType`.
#[repr(C)]
pub union PrefValue {
    // Pointer to a '\0'-terminated byte string.
    string_val: *const c_char,
    // A signed 32-bit integer value.
    int_val: i32,
    // A boolean value.
    bool_val: bool,
}
110
111
/// Keep this in sync with PrefsParserPrefFn in Preferences.cpp.
112
type PrefFn = unsafe extern "C" fn(pref_name: *const c_char, pref_type: PrefType,
113
pref_value_kind: PrefValueKind, pref_value: PrefValue,
114
is_sticky: bool, is_locked: bool);
115
116
/// Callback invoked with a '\0'-terminated message for each parse error.
///
/// Keep this in sync with PrefsParserErrorFn in Preferences.cpp.
type ErrorFn = unsafe extern "C" fn(msg: *const c_char);
118
119
/// Parse the contents of a prefs file.
120
///
121
/// `buf` is a null-terminated string. `len` is its length, excluding the
122
/// null terminator.
123
///
124
/// `pref_fn` is called once for each successfully parsed pref.
125
///
126
/// `error_fn` is called once for each parse error detected.
127
///
128
/// Keep this in sync with the prefs_parser_parse() declaration in
129
/// Preferences.cpp.
130
#[no_mangle]
131
pub extern "C" fn prefs_parser_parse(path: *const c_char, kind: PrefValueKind, buf: *const c_char,
132
len: usize, pref_fn: PrefFn, error_fn: ErrorFn) -> bool {
133
let path = unsafe { std::ffi::CStr::from_ptr(path).to_string_lossy().into_owned() };
134
135
// Make sure `buf` ends in a '\0', and include that in the length, because
136
// it represents EOF.
137
let buf = unsafe { std::slice::from_raw_parts(buf as *const c_uchar, len + 1) };
138
assert!(buf.last() == Some(&EOF));
139
140
let mut parser = Parser::new(&path, kind, &buf, pref_fn, error_fn);
141
parser.parse()
142
}
143
144
//---------------------------------------------------------------------------
145
// The implementation
146
//---------------------------------------------------------------------------
147
148
// The tokens produced by the lexer.
#[derive(Clone, Copy, Debug, PartialEq)]
enum Token {
    // An unambiguous single-char token; the payload is the char itself.
    SingleChar(u8),

    // Keywords, each named after its spelling in the file.
    Pref,       // pref
    StickyPref, // sticky_pref
    UserPref,   // user_pref
    True,       // true
    False,      // false
    Sticky,     // sticky
    Locked,     // locked

    // A string literal, e.g. '"string"'. The token carries no payload; the
    // string's bytes are stored elsewhere.
    String,

    // An unsigned integer literal, e.g. '123'. libpref values are i32, but a
    // leading '-' or '+' is lexed as a separate token, so the payload here is
    // never negative. It is a u32 so that 2147483648 (too big for an i32) can
    // still be lexed and later negated to -2147483648 (which does fit) when a
    // '-' token precedes it.
    Int(u32),

    // A malformed token, with a description of the problem.
    Error(&'static str),

    // A malformed token together with the line number to report it at. Used
    // when Parser::line_num may have moved on by the time the error is
    // reported, e.g. a bad escape sequence on the first line of a multi-line
    // string is only reported once the end of the string has been reached.
    ErrorAtLine(&'static str, u32),
}
183
184
// Every possible byte is classified by the action to take when it appears as
// the first char of a new token.
#[derive(Clone, Copy, PartialEq)]
enum CharKind {
    // The variants are ordered by frequency. See the comment in get_token().
    SingleChar, // Unambiguous single-char tokens: [()+,-;] or EOF
    SpaceNL,    // [\t\v\f \n]
    Keyword,    // [A-Za-z_]
    Quote,      // ["']
    Slash,      // /
    Digit,      // [0-9]
    Hash,       // #
    CR,         // \r
    Other,      // Everything else; invalid except within strings and comments.
}
199
200
const C_SINGL: CharKind = CharKind::SingleChar;
201
const C_SPCNL: CharKind = CharKind::SpaceNL;
202
const C_KEYWD: CharKind = CharKind::Keyword;
203
const C_QUOTE: CharKind = CharKind::Quote;
204
const C_SLASH: CharKind = CharKind::Slash;
205
const C_DIGIT: CharKind = CharKind::Digit;
206
const C_HASH : CharKind = CharKind::Hash;
207
const C_CR : CharKind = CharKind::CR;
208
const C______: CharKind = CharKind::Other;
209
210
const CHAR_KINDS: [CharKind; 256] = [
211
/* 0 1 2 3 4 5 6 7 8 9 */
212
/* 0+ */ C_SINGL, C______, C______, C______, C______, C______, C______, C______, C______, C_SPCNL,
213
/* 10+ */ C_SPCNL, C_SPCNL, C_SPCNL, C_CR , C______, C______, C______, C______, C______, C______,
214
/* 20+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
215
/* 30+ */ C______, C______, C_SPCNL, C______, C_QUOTE, C_HASH , C______, C______, C______, C_QUOTE,
216
/* 40+ */ C_SINGL, C_SINGL, C______, C_SINGL, C_SINGL, C_SINGL, C______, C_SLASH, C_DIGIT, C_DIGIT,
217
/* 50+ */ C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C______, C_SINGL,
218
/* 60+ */ C______, C______, C______, C______, C______, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
219
/* 70+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
220
/* 80+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
221
/* 90+ */ C_KEYWD, C______, C______, C______, C______, C_KEYWD, C______, C_KEYWD, C_KEYWD, C_KEYWD,
222
/* 100+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
223
/* 110+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
224
/* 120+ */ C_KEYWD, C_KEYWD, C_KEYWD, C______, C______, C______, C______, C______, C______, C______,
225
/* 130+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
226
/* 140+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
227
/* 150+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
228
/* 160+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
229
/* 170+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
230
/* 180+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
231
/* 190+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
232
/* 200+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
233
/* 210+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
234
/* 220+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
235
/* 230+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
236
/* 240+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
237
/* 250+ */ C______, C______, C______, C______, C______, C______
238
];
239
240
// Fixed-width shorthand for `false`, so that the `true` entries stand out in
// the table below.
const _______: bool = false;

// Flags the bytes that need special handling inside a string literal. The row
// label gives the tens part of the index and the column header gives the
// units digit; the trailing comment on a row names its `true` entries.
const SPECIAL_STRING_CHARS: [bool; 256] = [
    /*            0        1        2        3        4        5        6        7        8        9    */
    /*   0+ */ true,    _______, _______, _______, _______, _______, _______, _______, _______, _______, // NUL (EOF)
    /*  10+ */ true,    _______, _______, true,    _______, _______, _______, _______, _______, _______, // \n \r
    /*  20+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /*  30+ */ _______, _______, _______, _______, true,    _______, _______, _______, _______, true,    // double quote, single quote
    /*  40+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /*  50+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /*  60+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /*  70+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /*  80+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /*  90+ */ _______, _______, true,    _______, _______, _______, _______, _______, _______, _______, // backslash
    /* 100+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /* 110+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /* 120+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /* 130+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /* 140+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /* 150+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /* 160+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /* 170+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /* 180+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /* 190+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /* 200+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /* 210+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /* 220+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /* 230+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /* 240+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
    /* 250+ */ _______, _______, _______, _______, _______, _______,
];
270
271
struct KeywordInfo {
272
string: &'static [u8],
273
token: Token,
274
}
275
276
const KEYWORD_INFOS: [KeywordInfo; 7] = [
277
// These are ordered by frequency.
278
KeywordInfo { string: b"pref", token: Token::Pref },
279
KeywordInfo { string: b"true", token: Token::True },
280
KeywordInfo { string: b"false", token: Token::False },
281
KeywordInfo { string: b"user_pref", token: Token::UserPref },
282
KeywordInfo { string: b"sticky", token: Token::Sticky },
283
KeywordInfo { string: b"locked", token: Token::Locked },
284
KeywordInfo { string: b"sticky_pref", token: Token::StickyPref },
285
];
286
287
struct Parser<'t> {
288
path: &'t str, // Path to the file being parsed. Used in error messages.
289
kind: PrefValueKind, // Default prefs file or user prefs file?
290
buf: &'t [u8], // Text being parsed.
291
i: usize, // Index of next char to be read.
292
line_num: u32, // Current line number within the text.
293
pref_fn: PrefFn, // Callback for processing each pref.
294
error_fn: ErrorFn, // Callback for parse errors.
295
has_errors: bool, // Have we encountered errors?
296
}
297
298
// End-of-file marker. As described above, EOF is represented in-band by the
// byte value 0.
const EOF: u8 = 0;
300
301
impl<'t> Parser<'t> {
302
fn new(path: &'t str, kind: PrefValueKind, buf: &'t [u8], pref_fn: PrefFn, error_fn: ErrorFn)
303
-> Parser<'t> {
304
// Make sure these tables take up 1 byte per entry.
305
assert!(std::mem::size_of_val(&CHAR_KINDS) == 256);
306
assert!(std::mem::size_of_val(&SPECIAL_STRING_CHARS) == 256);
307
308
Parser {
309
path: path,
310
kind: kind,
311
buf: buf,
312
i: 0,
313
line_num: 1,
314
pref_fn: pref_fn,
315
error_fn: error_fn,
316
has_errors: false,
317
}
318
}
319
320
fn parse(&mut self) -> bool {
321
// These are reused, because allocating a new Vec for every string is slow.
322
let mut name_str = Vec::with_capacity(128); // For pref names.
323
let mut value_str = Vec::with_capacity(512); // For string pref values.
324
let mut none_str = Vec::with_capacity(0); // For tokens that shouldn't be strings.
325
326
let mut token = self.get_token(&mut none_str);
327
328
// At the top of the loop we already have a token. In a valid input
329
// this will be either the first token of a new pref, or EOF.
330
loop {
331
// <pref-spec>
332
let (pref_value_kind, mut is_sticky) = match token {
333
Token::Pref => (PrefValueKind::Default, false),
334
Token::StickyPref => (PrefValueKind::Default, true),
335
Token::UserPref => (PrefValueKind::User, false),
336
Token::SingleChar(EOF) => return !self.has_errors,
337
_ => {
338
token = self.error_and_recover(
339
token, "expected pref specifier at start of pref definition");
340
continue;
341
}
342
};
343
344
// "("
345
token = self.get_token(&mut none_str);
346
if token != Token::SingleChar(b'(') {
347
token = self.error_and_recover(token, "expected '(' after pref specifier");
348
continue;
349
}
350
351
// <pref-name>
352
token = self.get_token(&mut name_str);
353
let pref_name = if token == Token::String {
354
&name_str
355
} else {
356
token = self.error_and_recover(token, "expected pref name after '('");
357
continue;
358
};
359
360
// ","
361
token = self.get_token(&mut none_str);
362
if token != Token::SingleChar(b',') {
363
token = self.error_and_recover(token, "expected ',' after pref name");
364
continue;
365
}
366
367
// <pref-value>
368
token = self.get_token(&mut value_str);
369
let (pref_type, pref_value) = match token {
370
Token::True => {
371
(PrefType::Bool, PrefValue { bool_val: true })
372
}
373
Token::False => {
374
(PrefType::Bool, PrefValue { bool_val: false })
375
}
376
Token::String => {
377
(PrefType::String,
378
PrefValue { string_val: value_str.as_ptr() as *const c_char })
379
}
380
Token::Int(u) => {
381
// Accept u <= 2147483647; anything larger will overflow i32.
382
if u <= std::i32::MAX as u32 {
383
(PrefType::Int, PrefValue { int_val: u as i32 })
384
} else {
385
token = self.error_and_recover(
386
Token::Error("integer literal overflowed"), "");
387
continue;
388
}
389
}
390
Token::SingleChar(b'-') => {
391
token = self.get_token(&mut none_str);
392
if let Token::Int(u) = token {
393
// Accept u <= 2147483648; anything larger will overflow i32 once negated.
394
if u <= std::i32::MAX as u32 {
395
(PrefType::Int, PrefValue { int_val: -(u as i32) })
396
} else if u == std::i32::MAX as u32 + 1 {
397
(PrefType::Int, PrefValue { int_val: std::i32::MIN })
398
} else {
399
token = self.error_and_recover(
400
Token::Error("integer literal overflowed"), "");
401
continue;
402
}
403
} else {
404
token = self.error_and_recover(
405
token, "expected integer literal after '-'");
406
continue;
407
}
408
409
}
410
Token::SingleChar(b'+') => {
411
token = self.get_token(&mut none_str);
412
if let Token::Int(u) = token {
413
// Accept u <= 2147483647; anything larger will overflow i32.
414
if u <= std::i32::MAX as u32 {
415
(PrefType::Int, PrefValue { int_val: u as i32 })
416
} else {
417
token = self.error_and_recover(
418
Token::Error("integer literal overflowed"), "");
419
continue;
420
}
421
} else {
422
token = self.error_and_recover(token, "expected integer literal after '+'");
423
continue;
424
}
425
426
}
427
_ => {
428
token = self.error_and_recover(token, "expected pref value after ','");
429
continue;
430
}
431
};
432
433
// ("," <pref-attr>)* // default pref files only
434
let mut is_locked = false;
435
let mut has_attrs = false;
436
if self.kind == PrefValueKind::Default {
437
let ok = loop {
438
// ","
439
token = self.get_token(&mut none_str);
440
if token != Token::SingleChar(b',') {
441
break true;
442
}
443
444
// <pref-attr>
445
token = self.get_token(&mut none_str);
446
match token {
447
Token::Sticky => is_sticky = true,
448
Token::Locked => is_locked = true,
449
_ => {
450
token =
451
self.error_and_recover(token, "expected pref attribute after ','");
452
break false;
453
}
454
}
455
has_attrs = true;
456
};
457
if !ok {
458
continue;
459
}
460
} else {
461
token = self.get_token(&mut none_str);
462
}
463
464
// ")"
465
if token != Token::SingleChar(b')') {
466
let expected_msg = if self.kind == PrefValueKind::Default {
467
if has_attrs {
468
"expected ',' or ')' after pref attribute"
469
} else {
470
"expected ',' or ')' after pref value"
471
}
472
} else {
473
"expected ')' after pref value"
474
};
475
token = self.error_and_recover(token, expected_msg);
476
continue;
477
}
478
479
// ";"
480
token = self.get_token(&mut none_str);
481
if token != Token::SingleChar(b';') {
482
token = self.error_and_recover(token, "expected ';' after ')'");
483
continue;
484
}
485
486
unsafe { (self.pref_fn)(pref_name.as_ptr() as *const c_char, pref_type, pref_value_kind,
487
pref_value, is_sticky, is_locked) };
488
489
token = self.get_token(&mut none_str);
490
}
491
}
492
493
fn error_and_recover(&mut self, token: Token, msg: &str) -> Token {
494
self.has_errors = true;
495
496
// If `token` is a Token::{Error,ErrorAtLine}, it's a lexing error and
497
// the error message is within `token`. Otherwise, it's a parsing error
498
// and the error message is in `msg`.
499
let (msg, line_num) = match token {
500
Token::Error(token_msg) => (token_msg, self.line_num),
501
Token::ErrorAtLine(token_msg, line_num) => (token_msg, line_num),
502
_ => (msg, self.line_num),
503
};
504
let msg = format!("{}:{}: prefs parse error: {}", self.path, line_num, msg);
505
let msg = std::ffi::CString::new(msg).unwrap();
506
unsafe { (self.error_fn)(msg.as_ptr() as *const c_char) };
507
508
// "Panic-mode" recovery: consume tokens until one of the following
509
// occurs.
510
// - We hit a semicolon, whereupon we return the following token.
511
// - We hit EOF, whereupon we return EOF.
512
//
513
// For this to work, if the lexing functions hit EOF in an error case
514
// they must unget it so we can safely reget it here.
515
//
516
// If the starting token (passed in above) is EOF we must not get
517
// another token otherwise we will read past the end of `self.buf`.
518
let mut dummy_str = Vec::with_capacity(128);
519
let mut token = token;
520
loop {
521
match token {
522
Token::SingleChar(b';') => return self.get_token(&mut dummy_str),
523
Token::SingleChar(EOF) => return token,
524
_ => {}
525
}
526
token = self.get_token(&mut dummy_str);
527
}
528
}
529
530
#[inline(always)]
531
fn get_char(&mut self) -> u8 {
532
// We do the bounds check ourselves so we can return EOF on failure.
533
// (Although the buffer is guaranteed to end in an EOF char, we might
534
// go one char past that, whereupon we must return EOF again.)
535
if self.i < self.buf.len() {
536
let c = unsafe { *self.buf.get_unchecked(self.i) };
537
self.i += 1;
538
c
539
} else {
540
debug_assert!(self.i == self.buf.len());
541
EOF
542
}
543
}
544
545
// This function skips the bounds check in optimized builds. Using it at
546
// the hottest two call sites gives a ~15% parsing speed boost.
547
#[inline(always)]
548
unsafe fn get_char_unchecked(&mut self) -> u8 {
549
debug_assert!(self.i < self.buf.len());
550
let c = *self.buf.get_unchecked(self.i);
551
self.i += 1;
552
c
553
}
554
555
#[inline(always)]
556
fn unget_char(&mut self) {
557
debug_assert!(self.i > 0);
558
self.i -= 1;
559
}
560
561
#[inline(always)]
562
fn match_char(&mut self, c: u8) -> bool {
563
if self.buf[self.i] == c {
564
self.i += 1;
565
return true;
566
}
567
false
568
}
569
570
#[inline(always)]
571
fn match_single_line_comment(&mut self) {
572
loop {
573
// To reach here, the previous char must have been '/' (if this is
574
// the first loop iteration) or non-special (if this is the second
575
// or subsequent iteration), and assertions elsewhere ensure that
576
// there must be at least one subsequent char after those chars
577
// (the '\0' for EOF).
578
let c = unsafe { self.get_char_unchecked() };
579
580
// All the special chars have value <= b'\r'.
581
if c > b'\r' {
582
continue;
583
}
584
match c {
585
b'\n' => {
586
self.line_num += 1;
587
break;
588
}
589
b'\r' => {
590
self.line_num += 1;
591
self.match_char(b'\n');
592
break;
593
}
594
EOF => {
595
break;
596
}
597
_ => continue
598
}
599
}
600
}
601
602
// Returns false if we hit EOF without closing the comment.
603
fn match_multi_line_comment(&mut self) -> bool {
604
loop {
605
match self.get_char() {
606
b'*' => {
607
if self.match_char(b'/') {
608
return true;
609
}
610
}
611
b'\n' => {
612
self.line_num += 1;
613
}
614
b'\r' => {
615
self.line_num += 1;
616
self.match_char(b'\n');
617
}
618
EOF => {
619
return false
620
}
621
_ => continue
622
}
623
}
624
}
625
626
fn match_hex_digits(&mut self, ndigits: i32) -> Option<u16> {
627
debug_assert!(ndigits == 2 || ndigits == 4);
628
let mut value: u16 = 0;
629
for _ in 0..ndigits {
630
value = value << 4;
631
match self.get_char() {
632
c @ b'0' ..= b'9' => value += (c - b'0') as u16,
633
c @ b'A' ..= b'F' => value += (c - b'A') as u16 + 10,
634
c @ b'a' ..= b'f' => value += (c - b'a') as u16 + 10,
635
_ => {
636
self.unget_char();
637
return None;
638
}
639
}
640
}
641
Some(value)
642
}
643
644
#[inline(always)]
645
fn char_kind(c: u8) -> CharKind {
646
// Use get_unchecked() because a u8 index cannot exceed this table's
647
// bounds.
648
unsafe { *CHAR_KINDS.get_unchecked(c as usize) }
649
}
650
651
#[inline(always)]
652
fn is_special_string_char(c: u8) -> bool {
653
// Use get_unchecked() because a u8 index cannot exceed this table's
654
// bounds.
655
unsafe { *SPECIAL_STRING_CHARS.get_unchecked(c as usize) }
656
}
657
658
// If the obtained Token has a value, it is put within the Token, unless
659
// it's a string, in which case it's put in `str_buf`. This avoids
660
// allocating a new Vec for every string, which is slow.
661
fn get_token(&mut self, str_buf: &mut Vec<u8>) -> Token {
662
loop {
663
// Note: the following tests are ordered by frequency when parsing
664
// greprefs.js:
665
// - SingleChar 36.7%
666
// - SpaceNL 27.7% (14.9% for spaces, 12.8% for NL)
667
// - Keyword 13.4%
668
// - Quote 11.4%
669
// - Slash 8.1%
670
// - Digit 2.7%
671
// - Hash, CR, Other 0.0%
672
673
let c = self.get_char();
674
match Parser::char_kind(c) {
675
CharKind::SingleChar => {
676
return Token::SingleChar(c);
677
}
678
CharKind::SpaceNL => {
679
// It's slightly faster to combine the handling of the
680
// space chars with NL than to handle them separately; we
681
// have an extra test for this case, but one fewer test for
682
// all the subsequent CharKinds.
683
if c == b'\n' {
684
self.line_num += 1;
685
}
686
continue;
687
}
688
CharKind::Keyword => {
689
let start = self.i - 1;
690
loop {
691
let c = self.get_char();
692
if Parser::char_kind(c) != CharKind::Keyword {
693
self.unget_char();
694
break;
695
}
696
}
697
for info in KEYWORD_INFOS.iter() {
698
if &self.buf[start..self.i] == info.string {
699
return info.token;
700
}
701
}
702
return Token::Error("unknown keyword");
703
}
704
CharKind::Quote => {
705
return self.get_string_token(c, str_buf);
706
}
707
CharKind::Slash => {
708
match self.get_char() {
709
b'/' => {
710
self.match_single_line_comment();
711
}
712
b'*' => {
713
if !self.match_multi_line_comment() {
714
return Token::Error("unterminated /* comment");
715
}
716
}
717
c @ _ => {
718
if c == b'\n' || c == b'\r' {
719
// Unget the newline char; the outer loop will
720
// reget it and adjust self.line_num
721
// appropriately.
722
self.unget_char();
723
}
724
return Token::Error("expected '/' or '*' after '/'");
725
}
726
}
727
continue;
728
}
729
CharKind::Digit => {
730
let mut value = Some((c - b'0') as u32);
731
loop {
732
let c = self.get_char();
733
match Parser::char_kind(c) {
734
CharKind::Digit => {
735
fn add_digit(value: Option<u32>, c: u8) -> Option<u32> {
736
value?.checked_mul(10)?.checked_add((c - b'0') as u32)
737
}
738
value = add_digit(value, c);
739
}
740
CharKind::Keyword => {
741
// Reject things like "123foo". Error recovery
742
// will retokenize from "foo" onward.
743
self.unget_char();
744
return Token::Error("unexpected character in integer literal");
745
}
746
_ => {
747
self.unget_char();
748
break;
749
}
750
}
751
}
752
return match value {
753
Some(v) => Token::Int(v),
754
None => Token::Error("integer literal overflowed"),
755
};
756
}
757
CharKind::Hash => {
758
self.match_single_line_comment();
759
continue;
760
}
761
CharKind::CR => {
762
self.match_char(b'\n');
763
self.line_num += 1;
764
continue;
765
}
766
// Error recovery will retokenize from the next character.
767
_ => return Token::Error("unexpected character")
768
}
769
}
770
}
771
772
fn string_error_token(&self, token: &mut Token, msg: &'static str) {
773
// We only want to capture the first tokenization error within a string.
774
if *token == Token::String {
775
*token = Token::ErrorAtLine(msg, self.line_num);
776
}
777
}
778
779
// Always inline this because it has a single call site.
780
#[inline(always)]
781
fn get_string_token(&mut self, quote_char: u8, str_buf: &mut Vec<u8>) -> Token {
782
// First scan through the string to see if it contains any chars that
783
// need special handling.
784
let start = self.i;
785
let has_special_chars = loop {
786
// To reach here, the previous char must have been a quote
787
// (quote_char), and assertions elsewhere ensure that there must be
788
// at least one subsequent char (the '\0' for EOF).
789
let c = unsafe { self.get_char_unchecked() };
790
if Parser::is_special_string_char(c) {
791
break c != quote_char;
792
}
793
};
794
795
// Clear str_buf's contents without changing its capacity.
796
str_buf.clear();
797
798
// If there are no special chars (the common case), we can bulk copy it
799
// to str_buf. This is a lot faster than the char-by-char loop below.
800
if !has_special_chars {
801
str_buf.extend(&self.buf[start..self.i - 1]);
802
str_buf.push(b'\0');
803
return Token::String;
804
}
805
806
// There were special chars. Re-scan the string, filling in str_buf one
807
// char at a time.
808
//
809
// On error, we change `token` to an error token and then keep going to
810
// the end of the string literal. `str_buf` won't be used in that case.
811
self.i = start;
812
let mut token = Token::String;
813
814
loop {
815
let c = self.get_char();
816
let c2 = if !Parser::is_special_string_char(c) {
817
c
818
819
} else if c == quote_char {
820
break;
821
822
} else if c == b'\\' {
823
match self.get_char() {
824
b'\"' => b'\"',
825
b'\'' => b'\'',
826
b'\\' => b'\\',
827
b'n' => b'\n',
828
b'r' => b'\r',
829
b'x' => {
830
if let Some(value) = self.match_hex_digits(2) {
831
debug_assert!(value <= 0xff);
832
if value != 0 {
833
value as u8
834
} else {
835
self.string_error_token(&mut token, "\\x00 is not allowed");
836
continue;
837
}
838
} else {
839
self.string_error_token(&mut token, "malformed \\x escape sequence");
840
continue;
841
}
842
}
843
b'u' => {
844
if let Some(value) = self.match_hex_digits(4) {
845
let mut utf16 = vec![value];
846
if 0xd800 == (0xfc00 & value) {
847
// High surrogate value. Look for the low surrogate value.
848
if self.match_char(b'\\') && self.match_char(b'u') {
849
if let Some(lo) = self.match_hex_digits(4) {
850
if 0xdc00 == (0xfc00 & lo) {
851
// Found a valid low surrogate.
852
utf16.push(lo);
853
} else {
854
self.string_error_token(
855
&mut token,
856
"invalid low surrogate after high surrogate");
857
continue;
858
}
859
}
860
}
861
if utf16.len() != 2 {
862
self.string_error_token(
863
&mut token, "expected low surrogate after high surrogate");
864
continue;
865
}
866
} else if 0xdc00 == (0xfc00 & value) {
867
// Unaccompanied low surrogate value.
868
self.string_error_token(
869
&mut token, "expected high surrogate before low surrogate");
870
continue;
871
} else if value == 0 {
872
self.string_error_token(&mut token, "\\u0000 is not allowed");
873
continue;
874
}
875
876
// Insert the UTF-16 sequence as UTF-8.
877
let utf8 = String::from_utf16(&utf16).unwrap();
878
str_buf.extend(utf8.as_bytes());
879
} else {
880
self.string_error_token(&mut token, "malformed \\u escape sequence");
881
continue;
882
}
883
continue; // We don't want to str_buf.push(c2) below.
884
}
885
c @ _ => {
886
if c == b'\n' || c == b'\r' {
887
// Unget the newline char; the outer loop will
888
// reget it and adjust self.line_num appropriately.
889
self.unget_char();
890
}
891
self.string_error_token(
892
&mut token, "unexpected escape sequence character after '\\'");
893
continue;
894
}
895
}
896
897
} else if c == b'\n' {
898
self.line_num += 1;
899
c
900
901
} else if c == b'\r' {
902
self.line_num += 1;
903
if self.match_char(b'\n') {
904
str_buf.push(b'\r');
905
b'\n'
906
} else {
907
c
908
}
909
910
} else if c == EOF {
911
self.string_error_token(&mut token, "unterminated string literal");
912
break;
913
914
} else {
915
// This case is only hit for the non-closing quote char.
916
debug_assert!((c == b'\'' || c == b'\"') && c != quote_char);
917
c
918
};
919
str_buf.push(c2);
920
}
921
str_buf.push(b'\0');
922
923
token
924
}
925
}