Source code

Revision control

Other Tools

1
/* This Source Code Form is subject to the terms of the Mozilla Public
2
* License, v. 2.0. If a copy of the MPL was not distributed with this
3
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5
//! This crate implements a prefs file parser.
6
//!
7
//! Pref files have the following grammar. Note that there are slight
8
//! differences between the grammar for a default prefs files and a user prefs
9
//! file.
10
//!
11
//! <pref-file> = <pref>*
12
//! <pref> = <pref-spec> "(" <pref-name> "," <pref-value> <pref-attrs> ")" ";"
13
//! <pref-spec> = "user_pref" | "pref" | "sticky_pref"
14
//! <pref-name> = <string-literal>
15
//! <pref-value> = <string-literal> | "true" | "false" | <int-value>
16
//! <int-value> = <sign>? <int-literal>
17
//! <sign> = "+" | "-"
18
//! <int-literal> = [0-9]+ (and cannot be followed by [A-Za-z_])
19
//! <string-literal> =
20
//! A single or double-quoted string, with the following escape sequences
21
//! allowed: \", \', \\, \n, \r, \xNN, \uNNNN, where \xNN gives a raw byte
22
//! value that is copied directly into an 8-bit string value, and \uNNNN
23
//! gives a UTF-16 code unit that is converted to UTF-8 before being copied
24
//! into an 8-bit string value. \x00 and \u0000 are disallowed because they
25
//! would cause C++ code handling such strings to misbehave.
26
//! <pref-attrs> = ("," <pref-attr>)* // in default pref files
27
//! = <empty> // in user pref files
28
//! <pref-attr> = "sticky" | "locked" // default pref files only
29
//!
30
//! Comments can take three forms:
31
//! - # Python-style comments
32
//! - // C++ style comments
33
//! - /* C style comments (non-nested) */
34
//!
35
//! Non-end-of-line whitespace chars are \t, \v, \f, and space.
36
//!
37
//! End-of-line sequences can take three forms, each of which is considered as a
38
//! single EoL:
39
//! - \n
40
//! - \r (without subsequent \n)
41
//! - \r\n
42
//!
43
//! The valid range for <int-value> is -2,147,483,648..2,147,483,647. Values
44
//! outside that range will result in a parse error.
45
//!
46
//! A '\0' char is interpreted as the end of the file. The use of this character
//! in a prefs file is not recommended. It cannot be written inside a string
//! literal either: as noted above, the \x00 and \u0000 escapes are rejected.
49
//!
50
//! The parser performs error recovery. On a syntax error, it will scan forward
51
//! to the next ';' token and then continue parsing. If the syntax error occurs
52
//! in the middle of a token, it will first finish obtaining the current token
53
//! in an appropriate fashion.
54
55
// This parser uses several important optimizations.
56
//
57
// - Because "'\0' means EOF" is part of the grammar (see above), EOF is
58
// representable by a u8. If EOF was represented by an out-of-band value such
59
// as -1 or 256, we'd have to return a larger type such as u16 or i16 from
60
// get_char().
61
//
62
// - When starting a new token, it uses a lookup table with the first char,
63
// which quickly identifies what kind of token it will be. Furthermore, if
64
// that token is an unambiguous single-char token (e.g. '(', ')', '+', ',',
65
// '-', ';'), the parser will return the appropriate token kind value at
66
// minimal cost because the single-char tokens have a uniform representation.
67
//
68
// - It has a lookup table that identifies chars in string literals that need
69
// special handling. This means non-special chars (the common case) can be
70
// handled with a single test, rather than testing for the multiple special
71
// cases.
72
//
73
// - It pre-scans string literals for special chars. If none are present, it
74
// bulk copies the string literal into a Vec, which is faster than doing a
75
// char-by-char copy.
76
//
77
// - It reuses Vecs to avoid creating a new one for each string literal.
78
79
use std::os::raw::{c_char, c_uchar};
80
81
//---------------------------------------------------------------------------
82
// The public interface
83
//---------------------------------------------------------------------------
84
85
/// The type tag handed to the `PrefFn` callback alongside a `PrefValue`.
///
/// The discriminants are spelled out because the enum crosses the FFI
/// boundary as a `u8`. Keep this in sync with PrefType in Preferences.cpp.
#[repr(u8)]
#[derive(Clone, Copy, Debug)]
pub enum PrefType {
    None = 0,
    String = 1,
    Int = 2,
    Bool = 3,
}
94
95
/// Distinguishes default pref values from user pref values.
///
/// The discriminants are spelled out because the enum crosses the FFI
/// boundary as a `u8`. Keep this in sync with PrefValueKind in Preferences.h.
#[repr(u8)]
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum PrefValueKind {
    Default = 0,
    User = 1,
}
102
103
/// The value of a parsed pref, passed by value to the `PrefFn` callback.
///
/// Only one field is meaningful; the `PrefType` passed alongside it says
/// which one.
///
/// Keep this in sync with PrefValue in Preferences.cpp.
#[repr(C)]
pub union PrefValue {
    // Points at a '\0'-terminated byte string held in a buffer that the parser
    // reuses for the next string literal.
    string_val: *const c_char,
    int_val: i32,
    bool_val: bool,
}
110
111
/// Callback invoked once for each successfully parsed pref.
///
/// `pref_name` (and `pref_value.string_val`, for string prefs) point into
/// buffers that the parser reuses for subsequent prefs.
///
/// `pref_value_kind` reflects the pref specifier keyword that introduced the
/// pref (`pref`/`sticky_pref` give `Default`, `user_pref` gives `User`).
///
/// Keep this in sync with PrefsParserPrefFn in Preferences.cpp.
type PrefFn = unsafe extern "C" fn(
    pref_name: *const c_char,
    pref_type: PrefType,
    pref_value_kind: PrefValueKind,
    pref_value: PrefValue,
    is_sticky: bool,
    is_locked: bool,
);
120
121
/// Callback invoked once for each parse error.
///
/// `msg` is a '\0'-terminated string of the form
/// "<path>:<line>: prefs parse error: <description>". It is owned by the
/// parser and is freed after the callback returns.
///
/// Keep this in sync with PrefsParserErrorFn in Preferences.cpp.
type ErrorFn = unsafe extern "C" fn(msg: *const c_char);
123
124
/// Parse the contents of a prefs file.
125
///
126
/// `buf` is a null-terminated string. `len` is its length, excluding the
127
/// null terminator.
128
///
129
/// `pref_fn` is called once for each successfully parsed pref.
130
///
131
/// `error_fn` is called once for each parse error detected.
132
///
133
/// Keep this in sync with the prefs_parser_parse() declaration in
134
/// Preferences.cpp.
135
#[no_mangle]
136
pub extern "C" fn prefs_parser_parse(
137
path: *const c_char,
138
kind: PrefValueKind,
139
buf: *const c_char,
140
len: usize,
141
pref_fn: PrefFn,
142
error_fn: ErrorFn,
143
) -> bool {
144
let path = unsafe {
145
std::ffi::CStr::from_ptr(path)
146
.to_string_lossy()
147
.into_owned()
148
};
149
150
// Make sure `buf` ends in a '\0', and include that in the length, because
151
// it represents EOF.
152
let buf = unsafe { std::slice::from_raw_parts(buf as *const c_uchar, len + 1) };
153
assert!(buf.last() == Some(&EOF));
154
155
let mut parser = Parser::new(&path, kind, &buf, pref_fn, error_fn);
156
parser.parse()
157
}
158
159
//---------------------------------------------------------------------------
160
// The implementation
161
//---------------------------------------------------------------------------
162
163
// The tokens produced by Parser::get_token().
#[derive(Clone, Copy, Debug, PartialEq)]
enum Token {
    // Unambiguous single-char tokens. The payload is the char itself; EOF is
    // reported as SingleChar(EOF).
    SingleChar(u8),

    // Keywords
    Pref,       // pref
    StickyPref, // sticky_pref
    UserPref,   // user_pref
    True,       // true
    False,      // false
    Sticky,     // sticky
    Locked,     // locked

    // String literal, e.g. '"string"'. The value is stored elsewhere (in the
    // buffer passed to get_token()).
    String,

    // Unsigned integer literal, e.g. '123'. Although libpref uses i32 values,
    // any '-' and '+' before an integer literal are treated as separate
    // tokens, so these token values are never negative. Furthermore, we
    // tokenize int literals as u32 so that 2147483648 (which doesn't fit into
    // an i32) can be subsequently negated to -2147483648 (which does fit into
    // an i32) if a '-' token precedes it.
    Int(u32),

    // Malformed token. The payload is the error message.
    Error(&'static str),

    // Malformed token at a particular line number. For use when
    // Parser::line_num might not be the right line number when the error is
    // reported. E.g. if a multi-line string has a bad escape sequence on the
    // first line, we don't report the error until the string's end has been
    // reached.
    ErrorAtLine(&'static str, u32),
}
198
199
// We categorize every char by what action should be taken when it appears at
// the start of a new token. Parser::new() asserts that the CHAR_KINDS table
// built from this enum takes one byte per entry.
#[derive(Clone, Copy, PartialEq)]
enum CharKind {
    // These are ordered by frequency. See the comment in get_token().
    SingleChar, // Unambiguous single-char tokens: [()+,-;] or EOF
    SpaceNL,    // [\t\v\f \n]
    Keyword,    // [A-Za-z_]
    Quote,      // ["']
    Slash,      // /
    Digit,      // [0-9]
    Hash,       // #
    CR,         // \r
    Other,      // Everything else; invalid except within strings and comments.
}
214
215
// Fixed-width aliases for the CharKind variants. They all have the same
// length so that the rows of the CHAR_KINDS table line up in columns.
const C_SINGL: CharKind = CharKind::SingleChar;
const C_SPCNL: CharKind = CharKind::SpaceNL;
const C_KEYWD: CharKind = CharKind::Keyword;
const C_QUOTE: CharKind = CharKind::Quote;
const C_SLASH: CharKind = CharKind::Slash;
const C_DIGIT: CharKind = CharKind::Digit;
const C_HASH_: CharKind = CharKind::Hash;
const C_CR___: CharKind = CharKind::CR;
const C______: CharKind = CharKind::Other;
224
225
// Maps every possible byte value to the CharKind that decides how a token
// starting with that byte is lexed. The entry for byte `b` is at index `b`;
// each row covers ten consecutive byte values.
#[rustfmt::skip]
const CHAR_KINDS: [CharKind; 256] = [
/*         0        1        2        3        4        5        6        7        8        9    */
/*   0+ */ C_SINGL, C______, C______, C______, C______, C______, C______, C______, C______, C_SPCNL,
/*  10+ */ C_SPCNL, C_SPCNL, C_SPCNL, C_CR___, C______, C______, C______, C______, C______, C______,
/*  20+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/*  30+ */ C______, C______, C_SPCNL, C______, C_QUOTE, C_HASH_, C______, C______, C______, C_QUOTE,
/*  40+ */ C_SINGL, C_SINGL, C______, C_SINGL, C_SINGL, C_SINGL, C______, C_SLASH, C_DIGIT, C_DIGIT,
/*  50+ */ C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C______, C_SINGL,
/*  60+ */ C______, C______, C______, C______, C______, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/*  70+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/*  80+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/*  90+ */ C_KEYWD, C______, C______, C______, C______, C_KEYWD, C______, C_KEYWD, C_KEYWD, C_KEYWD,
/* 100+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/* 110+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/* 120+ */ C_KEYWD, C_KEYWD, C_KEYWD, C______, C______, C______, C______, C______, C______, C______,
/* 130+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 140+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 150+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 160+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 170+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 180+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 190+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 200+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 210+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 220+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 230+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 240+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 250+ */ C______, C______, C______, C______, C______, C______
];
255
256
// Fixed-width alias for `false`, so that the `true` entries stand out in the
// table below.
const _______: bool = false;

// Marks the bytes that need special handling inside a string literal: '\0'
// (EOF), '\n', '\r', '"', '\'' and '\\'. The entry for byte `b` is at index
// `b`; each row covers ten consecutive byte values.
#[rustfmt::skip]
const SPECIAL_STRING_CHARS: [bool; 256] = [
/*         0        1        2        3        4        5        6        7        8        9    */
/*   0+ */ true,    _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  10+ */ true,    _______, _______, true,    _______, _______, _______, _______, _______, _______,
/*  20+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  30+ */ _______, _______, _______, _______, true,    _______, _______, _______, _______, true,
/*  40+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  50+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  60+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  70+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  80+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  90+ */ _______, _______, true,    _______, _______, _______, _______, _______, _______, _______,
/* 100+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 110+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 120+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 130+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 140+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 150+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 160+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 170+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 180+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 190+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 200+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 210+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 220+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 230+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 240+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 250+ */ _______, _______, _______, _______, _______, _______
];
287
288
// One entry of the KEYWORD_INFOS table: the exact spelling of a keyword and
// the token that get_token() returns for it.
struct KeywordInfo {
    string: &'static [u8],
    token: Token,
}
292
293
const KEYWORD_INFOS: [KeywordInfo; 7] = [
294
// These are ordered by frequency.
295
KeywordInfo {
296
string: b"pref",
297
token: Token::Pref,
298
},
299
KeywordInfo {
300
string: b"true",
301
token: Token::True,
302
},
303
KeywordInfo {
304
string: b"false",
305
token: Token::False,
306
},
307
KeywordInfo {
308
string: b"user_pref",
309
token: Token::UserPref,
310
},
311
KeywordInfo {
312
string: b"sticky",
313
token: Token::Sticky,
314
},
315
KeywordInfo {
316
string: b"locked",
317
token: Token::Locked,
318
},
319
KeywordInfo {
320
string: b"sticky_pref",
321
token: Token::StickyPref,
322
},
323
];
324
325
// The state of one parse of one prefs file. `'t` is the lifetime of the
// borrowed path and text.
struct Parser<'t> {
    // Path to the file being parsed. Used in error messages.
    path: &'t str,
    // Default prefs file or user prefs file?
    kind: PrefValueKind,
    // Text being parsed. Its last byte is '\0' (EOF).
    buf: &'t [u8],
    // Index of next char to be read.
    i: usize,
    // Current line number within the text. Starts at 1.
    line_num: u32,
    // Callback for processing each pref.
    pref_fn: PrefFn,
    // Callback for parse errors.
    error_fn: ErrorFn,
    // Have we encountered errors?
    has_errors: bool,
}
335
336
// As described above, we use 0 to represent EOF. Because it fits in a u8 it
// can be returned from get_char() like any other char, and it is lexed as
// Token::SingleChar(EOF).
const EOF: u8 = b'\0';
338
339
impl<'t> Parser<'t> {
340
fn new(
341
path: &'t str,
342
kind: PrefValueKind,
343
buf: &'t [u8],
344
pref_fn: PrefFn,
345
error_fn: ErrorFn,
346
) -> Parser<'t> {
347
// Make sure these tables take up 1 byte per entry.
348
assert!(std::mem::size_of_val(&CHAR_KINDS) == 256);
349
assert!(std::mem::size_of_val(&SPECIAL_STRING_CHARS) == 256);
350
351
Parser {
352
path: path,
353
kind: kind,
354
buf: buf,
355
i: 0,
356
line_num: 1,
357
pref_fn: pref_fn,
358
error_fn: error_fn,
359
has_errors: false,
360
}
361
}
362
363
fn parse(&mut self) -> bool {
364
// These are reused, because allocating a new Vec for every string is slow.
365
let mut name_str = Vec::with_capacity(128); // For pref names.
366
let mut value_str = Vec::with_capacity(512); // For string pref values.
367
let mut none_str = Vec::with_capacity(0); // For tokens that shouldn't be strings.
368
369
let mut token = self.get_token(&mut none_str);
370
371
// At the top of the loop we already have a token. In a valid input
372
// this will be either the first token of a new pref, or EOF.
373
loop {
374
// <pref-spec>
375
let (pref_value_kind, mut is_sticky) = match token {
376
Token::Pref => (PrefValueKind::Default, false),
377
Token::StickyPref => (PrefValueKind::Default, true),
378
Token::UserPref => (PrefValueKind::User, false),
379
Token::SingleChar(EOF) => return !self.has_errors,
380
_ => {
381
token = self.error_and_recover(
382
token,
383
"expected pref specifier at start of pref definition",
384
);
385
continue;
386
}
387
};
388
389
// "("
390
token = self.get_token(&mut none_str);
391
if token != Token::SingleChar(b'(') {
392
token = self.error_and_recover(token, "expected '(' after pref specifier");
393
continue;
394
}
395
396
// <pref-name>
397
token = self.get_token(&mut name_str);
398
let pref_name = if token == Token::String {
399
&name_str
400
} else {
401
token = self.error_and_recover(token, "expected pref name after '('");
402
continue;
403
};
404
405
// ","
406
token = self.get_token(&mut none_str);
407
if token != Token::SingleChar(b',') {
408
token = self.error_and_recover(token, "expected ',' after pref name");
409
continue;
410
}
411
412
// <pref-value>
413
token = self.get_token(&mut value_str);
414
let (pref_type, pref_value) = match token {
415
Token::True => (PrefType::Bool, PrefValue { bool_val: true }),
416
Token::False => (PrefType::Bool, PrefValue { bool_val: false }),
417
Token::String => (
418
PrefType::String,
419
PrefValue {
420
string_val: value_str.as_ptr() as *const c_char,
421
},
422
),
423
Token::Int(u) => {
424
// Accept u <= 2147483647; anything larger will overflow i32.
425
if u <= std::i32::MAX as u32 {
426
(PrefType::Int, PrefValue { int_val: u as i32 })
427
} else {
428
token =
429
self.error_and_recover(Token::Error("integer literal overflowed"), "");
430
continue;
431
}
432
}
433
Token::SingleChar(b'-') => {
434
token = self.get_token(&mut none_str);
435
if let Token::Int(u) = token {
436
// Accept u <= 2147483648; anything larger will overflow i32 once negated.
437
if u <= std::i32::MAX as u32 {
438
(
439
PrefType::Int,
440
PrefValue {
441
int_val: -(u as i32),
442
},
443
)
444
} else if u == std::i32::MAX as u32 + 1 {
445
(
446
PrefType::Int,
447
PrefValue {
448
int_val: std::i32::MIN,
449
},
450
)
451
} else {
452
token = self
453
.error_and_recover(Token::Error("integer literal overflowed"), "");
454
continue;
455
}
456
} else {
457
token = self.error_and_recover(token, "expected integer literal after '-'");
458
continue;
459
}
460
}
461
Token::SingleChar(b'+') => {
462
token = self.get_token(&mut none_str);
463
if let Token::Int(u) = token {
464
// Accept u <= 2147483647; anything larger will overflow i32.
465
if u <= std::i32::MAX as u32 {
466
(PrefType::Int, PrefValue { int_val: u as i32 })
467
} else {
468
token = self
469
.error_and_recover(Token::Error("integer literal overflowed"), "");
470
continue;
471
}
472
} else {
473
token = self.error_and_recover(token, "expected integer literal after '+'");
474
continue;
475
}
476
}
477
_ => {
478
token = self.error_and_recover(token, "expected pref value after ','");
479
continue;
480
}
481
};
482
483
// ("," <pref-attr>)* // default pref files only
484
let mut is_locked = false;
485
let mut has_attrs = false;
486
if self.kind == PrefValueKind::Default {
487
let ok = loop {
488
// ","
489
token = self.get_token(&mut none_str);
490
if token != Token::SingleChar(b',') {
491
break true;
492
}
493
494
// <pref-attr>
495
token = self.get_token(&mut none_str);
496
match token {
497
Token::Sticky => is_sticky = true,
498
Token::Locked => is_locked = true,
499
_ => {
500
token =
501
self.error_and_recover(token, "expected pref attribute after ','");
502
break false;
503
}
504
}
505
has_attrs = true;
506
};
507
if !ok {
508
continue;
509
}
510
} else {
511
token = self.get_token(&mut none_str);
512
}
513
514
// ")"
515
if token != Token::SingleChar(b')') {
516
let expected_msg = if self.kind == PrefValueKind::Default {
517
if has_attrs {
518
"expected ',' or ')' after pref attribute"
519
} else {
520
"expected ',' or ')' after pref value"
521
}
522
} else {
523
"expected ')' after pref value"
524
};
525
token = self.error_and_recover(token, expected_msg);
526
continue;
527
}
528
529
// ";"
530
token = self.get_token(&mut none_str);
531
if token != Token::SingleChar(b';') {
532
token = self.error_and_recover(token, "expected ';' after ')'");
533
continue;
534
}
535
536
unsafe {
537
(self.pref_fn)(
538
pref_name.as_ptr() as *const c_char,
539
pref_type,
540
pref_value_kind,
541
pref_value,
542
is_sticky,
543
is_locked,
544
)
545
};
546
547
token = self.get_token(&mut none_str);
548
}
549
}
550
551
fn error_and_recover(&mut self, token: Token, msg: &str) -> Token {
552
self.has_errors = true;
553
554
// If `token` is a Token::{Error,ErrorAtLine}, it's a lexing error and
555
// the error message is within `token`. Otherwise, it's a parsing error
556
// and the error message is in `msg`.
557
let (msg, line_num) = match token {
558
Token::Error(token_msg) => (token_msg, self.line_num),
559
Token::ErrorAtLine(token_msg, line_num) => (token_msg, line_num),
560
_ => (msg, self.line_num),
561
};
562
let msg = format!("{}:{}: prefs parse error: {}", self.path, line_num, msg);
563
let msg = std::ffi::CString::new(msg).unwrap();
564
unsafe { (self.error_fn)(msg.as_ptr() as *const c_char) };
565
566
// "Panic-mode" recovery: consume tokens until one of the following
567
// occurs.
568
// - We hit a semicolon, whereupon we return the following token.
569
// - We hit EOF, whereupon we return EOF.
570
//
571
// For this to work, if the lexing functions hit EOF in an error case
572
// they must unget it so we can safely reget it here.
573
//
574
// If the starting token (passed in above) is EOF we must not get
575
// another token otherwise we will read past the end of `self.buf`.
576
let mut dummy_str = Vec::with_capacity(128);
577
let mut token = token;
578
loop {
579
match token {
580
Token::SingleChar(b';') => return self.get_token(&mut dummy_str),
581
Token::SingleChar(EOF) => return token,
582
_ => {}
583
}
584
token = self.get_token(&mut dummy_str);
585
}
586
}
587
588
#[inline(always)]
589
fn get_char(&mut self) -> u8 {
590
// We do the bounds check ourselves so we can return EOF on failure.
591
// (Although the buffer is guaranteed to end in an EOF char, we might
592
// go one char past that, whereupon we must return EOF again.)
593
if self.i < self.buf.len() {
594
let c = unsafe { *self.buf.get_unchecked(self.i) };
595
self.i += 1;
596
c
597
} else {
598
debug_assert!(self.i == self.buf.len());
599
EOF
600
}
601
}
602
603
// This function skips the bounds check in optimized builds. Using it at
604
// the hottest two call sites gives a ~15% parsing speed boost.
605
#[inline(always)]
606
unsafe fn get_char_unchecked(&mut self) -> u8 {
607
debug_assert!(self.i < self.buf.len());
608
let c = *self.buf.get_unchecked(self.i);
609
self.i += 1;
610
c
611
}
612
613
// Steps back one char, so that the next get_char() call returns the char
// before the current position. Must not be called at the very start of the
// buffer.
#[inline(always)]
fn unget_char(&mut self) {
    debug_assert!(self.i > 0);
    self.i -= 1;
}
618
619
#[inline(always)]
620
fn match_char(&mut self, c: u8) -> bool {
621
if self.buf[self.i] == c {
622
self.i += 1;
623
return true;
624
}
625
false
626
}
627
628
#[inline(always)]
629
fn match_single_line_comment(&mut self) {
630
loop {
631
// To reach here, the previous char must have been '/' (if this is
632
// the first loop iteration) or non-special (if this is the second
633
// or subsequent iteration), and assertions elsewhere ensure that
634
// there must be at least one subsequent char after those chars
635
// (the '\0' for EOF).
636
let c = unsafe { self.get_char_unchecked() };
637
638
// All the special chars have value <= b'\r'.
639
if c > b'\r' {
640
continue;
641
}
642
match c {
643
b'\n' => {
644
self.line_num += 1;
645
break;
646
}
647
b'\r' => {
648
self.line_num += 1;
649
self.match_char(b'\n');
650
break;
651
}
652
EOF => {
653
break;
654
}
655
_ => continue,
656
}
657
}
658
}
659
660
// Returns false if we hit EOF without closing the comment.
661
fn match_multi_line_comment(&mut self) -> bool {
662
loop {
663
match self.get_char() {
664
b'*' => {
665
if self.match_char(b'/') {
666
return true;
667
}
668
}
669
b'\n' => {
670
self.line_num += 1;
671
}
672
b'\r' => {
673
self.line_num += 1;
674
self.match_char(b'\n');
675
}
676
EOF => return false,
677
_ => continue,
678
}
679
}
680
}
681
682
fn match_hex_digits(&mut self, ndigits: i32) -> Option<u16> {
683
debug_assert!(ndigits == 2 || ndigits == 4);
684
let mut value: u16 = 0;
685
for _ in 0..ndigits {
686
value = value << 4;
687
match self.get_char() {
688
c @ b'0'..=b'9' => value += (c - b'0') as u16,
689
c @ b'A'..=b'F' => value += (c - b'A') as u16 + 10,
690
c @ b'a'..=b'f' => value += (c - b'a') as u16 + 10,
691
_ => {
692
self.unget_char();
693
return None;
694
}
695
}
696
}
697
Some(value)
698
}
699
700
#[inline(always)]
701
fn char_kind(c: u8) -> CharKind {
702
// Use get_unchecked() because a u8 index cannot exceed this table's
703
// bounds.
704
unsafe { *CHAR_KINDS.get_unchecked(c as usize) }
705
}
706
707
#[inline(always)]
708
fn is_special_string_char(c: u8) -> bool {
709
// Use get_unchecked() because a u8 index cannot exceed this table's
710
// bounds.
711
unsafe { *SPECIAL_STRING_CHARS.get_unchecked(c as usize) }
712
}
713
714
// If the obtained Token has a value, it is put within the Token, unless
715
// it's a string, in which case it's put in `str_buf`. This avoids
716
// allocating a new Vec for every string, which is slow.
717
fn get_token(&mut self, str_buf: &mut Vec<u8>) -> Token {
718
loop {
719
// Note: the following tests are ordered by frequency when parsing
720
// greprefs.js:
721
// - SingleChar 36.7%
722
// - SpaceNL 27.7% (14.9% for spaces, 12.8% for NL)
723
// - Keyword 13.4%
724
// - Quote 11.4%
725
// - Slash 8.1%
726
// - Digit 2.7%
727
// - Hash, CR, Other 0.0%
728
729
let c = self.get_char();
730
match Parser::char_kind(c) {
731
CharKind::SingleChar => {
732
return Token::SingleChar(c);
733
}
734
CharKind::SpaceNL => {
735
// It's slightly faster to combine the handling of the
736
// space chars with NL than to handle them separately; we
737
// have an extra test for this case, but one fewer test for
738
// all the subsequent CharKinds.
739
if c == b'\n' {
740
self.line_num += 1;
741
}
742
continue;
743
}
744
CharKind::Keyword => {
745
let start = self.i - 1;
746
loop {
747
let c = self.get_char();
748
if Parser::char_kind(c) != CharKind::Keyword {
749
self.unget_char();
750
break;
751
}
752
}
753
for info in KEYWORD_INFOS.iter() {
754
if &self.buf[start..self.i] == info.string {
755
return info.token;
756
}
757
}
758
return Token::Error("unknown keyword");
759
}
760
CharKind::Quote => {
761
return self.get_string_token(c, str_buf);
762
}
763
CharKind::Slash => {
764
match self.get_char() {
765
b'/' => {
766
self.match_single_line_comment();
767
}
768
b'*' => {
769
if !self.match_multi_line_comment() {
770
return Token::Error("unterminated /* comment");
771
}
772
}
773
c @ _ => {
774
if c == b'\n' || c == b'\r' {
775
// Unget the newline char; the outer loop will
776
// reget it and adjust self.line_num
777
// appropriately.
778
self.unget_char();
779
}
780
return Token::Error("expected '/' or '*' after '/'");
781
}
782
}
783
continue;
784
}
785
CharKind::Digit => {
786
let mut value = Some((c - b'0') as u32);
787
loop {
788
let c = self.get_char();
789
match Parser::char_kind(c) {
790
CharKind::Digit => {
791
fn add_digit(value: Option<u32>, c: u8) -> Option<u32> {
792
value?.checked_mul(10)?.checked_add((c - b'0') as u32)
793
}
794
value = add_digit(value, c);
795
}
796
CharKind::Keyword => {
797
// Reject things like "123foo". Error recovery
798
// will retokenize from "foo" onward.
799
self.unget_char();
800
return Token::Error("unexpected character in integer literal");
801
}
802
_ => {
803
self.unget_char();
804
break;
805
}
806
}
807
}
808
return match value {
809
Some(v) => Token::Int(v),
810
None => Token::Error("integer literal overflowed"),
811
};
812
}
813
CharKind::Hash => {
814
self.match_single_line_comment();
815
continue;
816
}
817
CharKind::CR => {
818
self.match_char(b'\n');
819
self.line_num += 1;
820
continue;
821
}
822
// Error recovery will retokenize from the next character.
823
_ => return Token::Error("unexpected character"),
824
}
825
}
826
}
827
828
fn string_error_token(&self, token: &mut Token, msg: &'static str) {
829
// We only want to capture the first tokenization error within a string.
830
if *token == Token::String {
831
*token = Token::ErrorAtLine(msg, self.line_num);
832
}
833
}
834
835
// Always inline this because it has a single call site.
836
#[inline(always)]
837
fn get_string_token(&mut self, quote_char: u8, str_buf: &mut Vec<u8>) -> Token {
838
// First scan through the string to see if it contains any chars that
839
// need special handling.
840
let start = self.i;
841
let has_special_chars = loop {
842
// To reach here, the previous char must have been a quote
843
// (quote_char), and assertions elsewhere ensure that there must be
844
// at least one subsequent char (the '\0' for EOF).
845
let c = unsafe { self.get_char_unchecked() };
846
if Parser::is_special_string_char(c) {
847
break c != quote_char;
848
}
849
};
850
851
// Clear str_buf's contents without changing its capacity.
852
str_buf.clear();
853
854
// If there are no special chars (the common case), we can bulk copy it
855
// to str_buf. This is a lot faster than the char-by-char loop below.
856
if !has_special_chars {
857
str_buf.extend(&self.buf[start..self.i - 1]);
858
str_buf.push(b'\0');
859
return Token::String;
860
}
861
862
// There were special chars. Re-scan the string, filling in str_buf one
863
// char at a time.
864
//
865
// On error, we change `token` to an error token and then keep going to
866
// the end of the string literal. `str_buf` won't be used in that case.
867
self.i = start;
868
let mut token = Token::String;
869
870
loop {
871
let c = self.get_char();
872
let c2 = if !Parser::is_special_string_char(c) {
873
c
874
} else if c == quote_char {
875
break;
876
} else if c == b'\\' {
877
match self.get_char() {
878
b'\"' => b'\"',
879
b'\'' => b'\'',
880
b'\\' => b'\\',
881
b'n' => b'\n',
882
b'r' => b'\r',
883
b'x' => {
884
if let Some(value) = self.match_hex_digits(2) {
885
debug_assert!(value <= 0xff);
886
if value != 0 {
887
value as u8
888
} else {
889
self.string_error_token(&mut token, "\\x00 is not allowed");
890
continue;
891
}
892
} else {
893
self.string_error_token(&mut token, "malformed \\x escape sequence");
894
continue;
895
}
896
}
897
b'u' => {
898
if let Some(value) = self.match_hex_digits(4) {
899
let mut utf16 = vec![value];
900
if 0xd800 == (0xfc00 & value) {
901
// High surrogate value. Look for the low surrogate value.
902
if self.match_char(b'\\') && self.match_char(b'u') {
903
if let Some(lo) = self.match_hex_digits(4) {
904
if 0xdc00 == (0xfc00 & lo) {
905
// Found a valid low surrogate.
906
utf16.push(lo);
907
} else {
908
self.string_error_token(
909
&mut token,
910
"invalid low surrogate after high surrogate",
911
);
912
continue;
913
}
914
}
915
}
916
if utf16.len() != 2 {
917
self.string_error_token(
918
&mut token,
919
"expected low surrogate after high surrogate",
920
);
921
continue;
922
}
923
} else if 0xdc00 == (0xfc00 & value) {
924
// Unaccompanied low surrogate value.
925
self.string_error_token(
926
&mut token,
927
"expected high surrogate before low surrogate",
928
);
929
continue;
930
} else if value == 0 {
931
self.string_error_token(&mut token, "\\u0000 is not allowed");
932
continue;
933
}
934
935
// Insert the UTF-16 sequence as UTF-8.
936
let utf8 = String::from_utf16(&utf16).unwrap();
937
str_buf.extend(utf8.as_bytes());
938
} else {
939
self.string_error_token(&mut token, "malformed \\u escape sequence");
940
continue;
941
}
942
continue; // We don't want to str_buf.push(c2) below.
943
}
944
c @ _ => {
945
if c == b'\n' || c == b'\r' {
946
// Unget the newline char; the outer loop will
947
// reget it and adjust self.line_num appropriately.
948
self.unget_char();
949
}
950
self.string_error_token(
951
&mut token,
952
"unexpected escape sequence character after '\\'",
953
);
954
continue;
955
}
956
}
957
} else if c == b'\n' {
958
self.line_num += 1;
959
c
960
} else if c == b'\r' {
961
self.line_num += 1;
962
if self.match_char(b'\n') {
963
str_buf.push(b'\r');
964
b'\n'
965
} else {
966
c
967
}
968
} else if c == EOF {
969
self.string_error_token(&mut token, "unterminated string literal");
970
break;
971
} else {
972
// This case is only hit for the non-closing quote char.
973
debug_assert!((c == b'\'' || c == b'\"') && c != quote_char);
974
c
975
};
976
str_buf.push(c2);
977
}
978
str_buf.push(b'\0');
979
980
token
981
}
982
}