Source code

Revision control

Other Tools

1
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2
* vim: set ts=8 sts=2 et sw=2 tw=80:
3
* This Source Code Form is subject to the terms of the Mozilla Public
4
* License, v. 2.0. If a copy of the MPL was not distributed with this
5
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7
// JS lexical scanner.
8
9
#include "frontend/TokenStream.h"
10
11
#include "mozilla/ArrayUtils.h"
12
#include "mozilla/Attributes.h"
13
#include "mozilla/IntegerTypeTraits.h"
14
#include "mozilla/Likely.h"
15
#include "mozilla/Maybe.h"
16
#include "mozilla/MemoryChecking.h"
17
#include "mozilla/ScopeExit.h"
18
#include "mozilla/Span.h"
19
#include "mozilla/TextUtils.h"
20
#include "mozilla/Utf8.h"
21
22
#include <algorithm>
23
#include <stdarg.h>
24
#include <stdint.h>
25
#include <stdio.h>
26
#include <string.h>
27
#include <type_traits>
28
#include <utility>
29
30
#include "jsexn.h"
31
#include "jsnum.h"
32
33
#include "frontend/BytecodeCompiler.h"
34
#include "frontend/Parser.h"
35
#include "frontend/ReservedWords.h"
36
#include "js/CharacterEncoding.h"
37
#include "js/RegExpFlags.h" // JS::RegExpFlags
38
#include "js/UniquePtr.h"
39
#include "util/StringBuffer.h"
40
#include "util/Unicode.h"
41
#include "vm/HelperThreads.h"
42
#include "vm/JSAtom.h"
43
#include "vm/JSContext.h"
44
#include "vm/Realm.h"
45
46
using mozilla::ArrayLength;
47
using mozilla::AsciiAlphanumericToNumber;
48
using mozilla::AssertedCast;
49
using mozilla::DecodeOneUtf8CodePoint;
50
using mozilla::IsAscii;
51
using mozilla::IsAsciiAlpha;
52
using mozilla::IsAsciiDigit;
53
using mozilla::IsAsciiHexDigit;
54
using mozilla::IsTrailingUnit;
55
using mozilla::MakeScopeExit;
56
using mozilla::MakeSpan;
57
using mozilla::Maybe;
58
using mozilla::PointerRangeSize;
59
using mozilla::Utf8Unit;
60
61
using JS::ReadOnlyCompileOptions;
62
using JS::RegExpFlag;
63
using JS::RegExpFlags;
64
65
// There's some very preliminary support for private fields in this file. It's
66
// disabled in all builds, for now.
67
//#define JS_PRIVATE_FIELDS 1
68
69
struct ReservedWordInfo {
70
const char* chars; // C string with reserved word text
71
js::frontend::TokenKind tokentype;
72
};
73
74
// Table with one entry per reserved word, expanded from
// FOR_EACH_JAVASCRIPT_RESERVED_WORD. Each entry pairs the |js_<word>_str|
// string with the corresponding |js::frontend| token kind.
static const ReservedWordInfo reservedWords[] = {
#define MAKE_RESERVED_WORD_ENTRY(word, name, type) \
  {js_##word##_str, js::frontend::type},
    FOR_EACH_JAVASCRIPT_RESERVED_WORD(MAKE_RESERVED_WORD_ENTRY)
#undef MAKE_RESERVED_WORD_ENTRY
};
80
81
// Looks up the |length| code units starting at |s| in |reservedWords|.
//
// Returns the matching table entry, or nullptr when the units do not spell a
// reserved word. |length| must be nonzero.
template <typename CharT>
static const ReservedWordInfo* FindReservedWord(const CharT* s, size_t length) {
  MOZ_ASSERT(length != 0);

  // These are declared (uninitialized) ahead of the generated matcher because
  // that code jumps to the labels below, and a jump may not skip over an
  // initialization.
  size_t i;
  const ReservedWordInfo* candidate;
  const char* expected;

  // Hooks consumed by the generated header: each one either records a table
  // index in |i| and jumps to a label below, or jumps straight to |no_match|.
#define JSRW_LENGTH() length
#define JSRW_AT(column) s[column]
#define JSRW_GOT_MATCH(index) \
  i = (index);                \
  goto got_match;
#define JSRW_TEST_GUESS(index) \
  i = (index);                 \
  goto test_guess;
#define JSRW_NO_MATCH() goto no_match;
#include "frontend/ReservedWordsGenerated.h"
#undef JSRW_NO_MATCH
#undef JSRW_TEST_GUESS
#undef JSRW_GOT_MATCH
#undef JSRW_AT
#undef JSRW_LENGTH

got_match:
  // The generated code fully identified entry |i|.
  return &reservedWords[i];

test_guess:
  // Entry |i| is only a guess: compare all |length| units against its text.
  candidate = &reservedWords[i];
  expected = candidate->chars;
  do {
    const auto want = static_cast<unsigned char>(*expected++);
    if (*s++ != want) {
      goto no_match;
    }
  } while (--length != 0);
  return candidate;

no_match:
  return nullptr;
}
123
124
template <>
125
MOZ_ALWAYS_INLINE const ReservedWordInfo* FindReservedWord<Utf8Unit>(
126
const Utf8Unit* units, size_t length) {
127
return FindReservedWord(Utf8AsUnsignedChars(units), length);
128
}
129
130
// Looks up the characters of |str| in the reserved-word table and reports the
// name's visibility through |*visibility|.
//
// With JS_PRIVATE_FIELDS defined, a name starting with '#' sets |*visibility|
// to Private and yields nullptr. Otherwise a leading '#' is asserted not to
// occur, |*visibility| is set to Public, and the result of the character
// lookup is returned.
static const ReservedWordInfo* FindReservedWord(
    JSLinearString* str, js::frontend::NameVisibility* visibility) {
  JS::AutoCheckCannotGC nogc;
  const size_t length = str->length();

  // Shared by the Latin-1 and two-byte representations.
  auto classify = [length,
                   visibility](const auto* chars) -> const ReservedWordInfo* {
#ifdef JS_PRIVATE_FIELDS
    if (length > 0 && chars[0] == '#') {
      *visibility = js::frontend::NameVisibility::Private;
      return nullptr;
    }
#else
    MOZ_ASSERT_IF(length > 0, chars[0] != '#');
#endif
    *visibility = js::frontend::NameVisibility::Public;
    return FindReservedWord(chars, length);
  };

  if (str->hasLatin1Chars()) {
    return classify(str->latin1Chars(nogc));
  }
  return classify(str->twoByteChars(nogc));
}
161
162
static uint32_t GetSingleCodePoint(const char16_t** p, const char16_t* end) {
163
using namespace js;
164
165
uint32_t codePoint;
166
if (MOZ_UNLIKELY(unicode::IsLeadSurrogate(**p)) && *p + 1 < end) {
167
char16_t lead = **p;
168
char16_t maybeTrail = *(*p + 1);
169
if (unicode::IsTrailSurrogate(maybeTrail)) {
170
*p += 2;
171
return unicode::UTF16Decode(lead, maybeTrail);
172
}
173
}
174
175
codePoint = **p;
176
(*p)++;
177
return codePoint;
178
}
179
180
// True iff |c| is the ASCII digit '0' or '1'. The value is compared as the
// unsigned counterpart of |CharT|.
template <typename CharT>
static constexpr bool IsAsciiBinary(CharT c) {
  const auto unit = static_cast<std::make_unsigned_t<CharT>>(c);
  return !(unit != '0' && unit != '1');
}
186
187
// True iff |c| is an ASCII digit in the range '0' through '7'. The value is
// compared as the unsigned counterpart of |CharT|.
template <typename CharT>
static constexpr bool IsAsciiOctal(CharT c) {
  const auto unit = static_cast<std::make_unsigned_t<CharT>>(c);
  return !(unit < '0' || unit > '7');
}
193
194
// Returns the distance of |c| from '0', truncated to uint8_t. No range check
// is performed on |c|.
template <typename CharT>
static constexpr uint8_t AsciiOctalToNumber(CharT c) {
  const auto unit = static_cast<std::make_unsigned_t<CharT>>(c);
  return static_cast<uint8_t>(unit - '0');
}
200
201
namespace js {
202
203
namespace frontend {
204
205
// Dispatches to the Latin-1 or two-byte |IsIdentifier| overload according to
// how |str| stores its characters. |str| must be non-null.
bool IsIdentifier(JSLinearString* str) {
  JS::AutoCheckCannotGC nogc;
  MOZ_ASSERT(str);
  return str->hasLatin1Chars()
             ? IsIdentifier(str->latin1Chars(nogc), str->length())
             : IsIdentifier(str->twoByteChars(nogc), str->length());
}
213
214
// Dispatches to the Latin-1 or two-byte |IsIdentifierNameOrPrivateName|
// overload according to how |str| stores its characters. |str| must be
// non-null.
bool IsIdentifierNameOrPrivateName(JSLinearString* str) {
  JS::AutoCheckCannotGC nogc;
  MOZ_ASSERT(str);
  return str->hasLatin1Chars()
             ? IsIdentifierNameOrPrivateName(str->latin1Chars(nogc),
                                             str->length())
             : IsIdentifierNameOrPrivateName(str->twoByteChars(nogc),
                                             str->length());
}
222
223
bool IsIdentifier(const Latin1Char* chars, size_t length) {
224
if (length == 0) {
225
return false;
226
}
227
228
if (!unicode::IsIdentifierStart(char16_t(*chars))) {
229
return false;
230
}
231
232
const Latin1Char* end = chars + length;
233
while (++chars != end) {
234
if (!unicode::IsIdentifierPart(char16_t(*chars))) {
235
return false;
236
}
237
}
238
239
return true;
240
}
241
242
bool IsIdentifierNameOrPrivateName(const Latin1Char* chars, size_t length) {
243
if (length == 0) {
244
return false;
245
}
246
247
if (char16_t(*chars) == '#') {
248
#ifdef JS_PRIVATE_FIELDS
249
++chars;
250
--length;
251
#else
252
return false;
253
#endif
254
}
255
256
return IsIdentifier(chars, length);
257
}
258
259
bool IsIdentifier(const char16_t* chars, size_t length) {
260
if (length == 0) {
261
return false;
262
}
263
264
const char16_t* p = chars;
265
const char16_t* end = chars + length;
266
uint32_t codePoint;
267
268
codePoint = GetSingleCodePoint(&p, end);
269
if (!unicode::IsIdentifierStart(codePoint)) {
270
return false;
271
}
272
273
while (p < end) {
274
codePoint = GetSingleCodePoint(&p, end);
275
if (!unicode::IsIdentifierPart(codePoint)) {
276
return false;
277
}
278
}
279
280
return true;
281
}
282
283
bool IsIdentifierNameOrPrivateName(const char16_t* chars, size_t length) {
284
if (length == 0) {
285
return false;
286
}
287
288
const char16_t* p = chars;
289
const char16_t* end = chars + length;
290
uint32_t codePoint;
291
292
codePoint = GetSingleCodePoint(&p, end);
293
if (codePoint == '#') {
294
#ifdef JS_PRIVATE_FIELDS
295
if (length == 1) {
296
return false;
297
}
298
299
codePoint = GetSingleCodePoint(&p, end);
300
#else
301
return false;
302
#endif
303
}
304
305
if (!unicode::IsIdentifierStart(codePoint)) {
306
return false;
307
}
308
309
while (p < end) {
310
codePoint = GetSingleCodePoint(&p, end);
311
if (!unicode::IsIdentifierPart(codePoint)) {
312
return false;
313
}
314
}
315
316
return true;
317
}
318
319
// Returns true iff |str| is found in the reserved-word table and its token
// kind satisfies |TokenKindIsKeyword|.
bool IsKeyword(JSLinearString* str) {
  NameVisibility visibility;
  const ReservedWordInfo* rw = FindReservedWord(str, &visibility);
  if (!rw) {
    return false;
  }

  return TokenKindIsKeyword(rw->tokentype);
}
327
328
// Maps |str| to a token kind: the table's kind when |str| is a reserved word,
// otherwise PrivateName or Name depending on the visibility reported by the
// lookup.
TokenKind ReservedWordTokenKind(PropertyName* str) {
  NameVisibility visibility;
  const ReservedWordInfo* rw = FindReservedWord(str, &visibility);
  if (rw) {
    return rw->tokentype;
  }

  if (visibility == NameVisibility::Private) {
    return TokenKind::PrivateName;
  }
  return TokenKind::Name;
}
337
338
// Returns the C string for |str| when it is a reserved word (via the
// TokenKind overload), or nullptr when it is not.
const char* ReservedWordToCharZ(PropertyName* str) {
  NameVisibility visibility;
  const ReservedWordInfo* rw = FindReservedWord(str, &visibility);
  if (!rw) {
    return nullptr;
  }

  return ReservedWordToCharZ(rw->tokentype);
}
346
347
// Returns the |js_<word>_str| string for the reserved word whose token kind
// is |tt|. |tt| must not be TokenKind::Name; any kind that is not a reserved
// word trips a debug assertion and yields nullptr.
const char* ReservedWordToCharZ(TokenKind tt) {
  MOZ_ASSERT(tt != TokenKind::Name);

  switch (tt) {
#define RESERVED_WORD_TO_CHARS(word, name, type) \
  case type:                                      \
    return js_##word##_str;
    FOR_EACH_JAVASCRIPT_RESERVED_WORD(RESERVED_WORD_TO_CHARS)
#undef RESERVED_WORD_TO_CHARS
    default:
      break;
  }

  MOZ_ASSERT_UNREACHABLE("Not a reserved word PropertyName.");
  return nullptr;
}
360
361
// Returns the |cx->names()| entry for the reserved word whose token kind is
// |tt|. |tt| must not be TokenKind::Name; any kind that is not a reserved word
// trips a debug assertion and yields nullptr.
PropertyName* TokenStreamAnyChars::reservedWordToPropertyName(
    TokenKind tt) const {
  MOZ_ASSERT(tt != TokenKind::Name);

  switch (tt) {
#define RESERVED_WORD_TO_NAME(word, name, type) \
  case type:                                     \
    return cx->names().name;
    FOR_EACH_JAVASCRIPT_RESERVED_WORD(RESERVED_WORD_TO_NAME)
#undef RESERVED_WORD_TO_NAME
    default:
      break;
  }

  MOZ_ASSERT_UNREACHABLE("Not a reserved word TokenKind.");
  return nullptr;
}
375
376
// Initializes the line-start table with two entries: |initialOffset| for the
// first line, followed by the MAX_PTR sentinel.
TokenStreamAnyChars::SourceCoords::SourceCoords(JSContext* cx,
                                                uint32_t initialLineNumber,
                                                uint32_t initialOffset)
    : lineStartOffsets_(cx), initialLineNum_(initialLineNumber), lastIndex_(0) {
  // MAX_PTR is deliberately copied into a local before being appended.
  // Passing the constant directly fails to compile on GCC and clang, and the
  // alternative of adding an out-of-line definition
  //
  //   const uint32_t TokenStreamAnyChars::SourceCoords::MAX_PTR;
  //
  // breaks the Windows build instead.
  uint32_t sentinel = MAX_PTR;

  // Neither append can fail: the vector's statically-allocated storage
  // already holds at least two elements.
  MOZ_ASSERT(lineStartOffsets_.capacity() >= 2);
  MOZ_ALWAYS_TRUE(lineStartOffsets_.reserve(2));
  lineStartOffsets_.infallibleAppend(initialOffset);
  lineStartOffsets_.infallibleAppend(sentinel);
}
397
398
// Records that line |lineNum| starts at |lineStartOffset|.
//
// Returns false only when growing the table fails; in that case the table is
// left unchanged so that the sentinel remains its last entry.
MOZ_ALWAYS_INLINE bool TokenStreamAnyChars::SourceCoords::add(
    uint32_t lineNum, uint32_t lineStartOffset) {
  const uint32_t index = indexFromLineNumber(lineNum);
  const uint32_t sentinelIndex = lineStartOffsets_.length() - 1;

  MOZ_ASSERT(lineStartOffsets_[0] <= lineStartOffset);
  MOZ_ASSERT(lineStartOffsets_[sentinelIndex] == MAX_PTR);

  if (index != sentinelIndex) {
    // This line start was recorded earlier (and the newline later ungotten),
    // so there is nothing to store; just verify it hasn't changed. The index
    // is checked because this path can also run after an earlier OOM.
    MOZ_ASSERT_IF(index < sentinelIndex,
                  lineStartOffsets_[index] == lineStartOffset);
    return true;
  }

  // First time this line start is seen. Append a fresh sentinel first, and
  // only overwrite the old sentinel slot once that append has succeeded.
  uint32_t maxPtr = MAX_PTR;
  if (!lineStartOffsets_.append(maxPtr)) {
    static_assert(mozilla::IsSame<decltype(lineStartOffsets_.allocPolicy()),
                                  TempAllocPolicy&>::value,
                  "this function's caller depends on it reporting an "
                  "error on failure, as TempAllocPolicy ensures");
    return false;
  }

  lineStartOffsets_[index] = lineStartOffset;
  return true;
}
429
430
// Extends this table with any line starts that |other| has and this one
// lacks. Both tables must begin at the same offset and end with the sentinel.
//
// Returns false if an append fails.
MOZ_ALWAYS_INLINE bool TokenStreamAnyChars::SourceCoords::fill(
    const TokenStreamAnyChars::SourceCoords& other) {
  MOZ_ASSERT(lineStartOffsets_[0] == other.lineStartOffsets_[0]);
  MOZ_ASSERT(lineStartOffsets_.back() == MAX_PTR);
  MOZ_ASSERT(other.lineStartOffsets_.back() == MAX_PTR);

  const size_t otherLength = other.lineStartOffsets_.length();
  if (otherLength <= lineStartOffsets_.length()) {
    // Nothing new to copy.
    return true;
  }

  // Replace our sentinel with |other|'s entry at the same position, then
  // copy everything after it (which ends with |other|'s sentinel).
  uint32_t sentinelIndex = lineStartOffsets_.length() - 1;
  lineStartOffsets_[sentinelIndex] = other.lineStartOffsets_[sentinelIndex];

  size_t i = sentinelIndex + 1;
  while (i < otherLength) {
    if (!lineStartOffsets_.append(other.lineStartOffsets_[i])) {
      return false;
    }
    i++;
  }
  return true;
}
451
452
// Returns the index in |lineStartOffsets_| of the line containing |offset|,
// and caches it in |lastIndex_| to speed up the next lookup.
MOZ_ALWAYS_INLINE uint32_t
TokenStreamAnyChars::SourceCoords::indexFromOffset(uint32_t offset) const {
  uint32_t low = 0;

  if (lineStartOffsets_[lastIndex_] <= offset) {
    // |offset| is on the same line as the previous lookup or a later one.
    // Probe the same line, then the next, then the one after that: these
    // three cases typically cover 85--98% of lookups.
    if (offset < lineStartOffsets_[lastIndex_ + 1]) {
      return lastIndex_;
    }

    // Each failed probe implies at least one more real entry exists before
    // the sentinel, so advancing is safe.
    for (int step = 0; step < 2; step++) {
      lastIndex_++;
      if (offset < lineStartOffsets_[lastIndex_ + 1]) {
        return lastIndex_;
      }
    }

    // Still not found, but the binary search can at least begin past the
    // lines just ruled out.
    low = lastIndex_ + 1;
    MOZ_ASSERT(low <
               lineStartOffsets_.length() - 1);  // -1 due to the sentinel
  }

  // Binary search with deferred equality detection (marginally faster here
  // than the textbook form). The upper bound is the last real entry: the
  // element at |length() - 1| is the sentinel.
  uint32_t high = lineStartOffsets_.length() - 2;
  while (low < high) {
    const uint32_t mid = low + (high - low) / 2;
    if (offset < lineStartOffsets_[mid + 1]) {
      high = mid;  // offset is below or within line |mid|
    } else {
      low = mid + 1;  // offset is above line |mid|
    }
  }

  MOZ_ASSERT(high == low);
  MOZ_ASSERT(lineStartOffsets_[low] <= offset);
  MOZ_ASSERT(offset < lineStartOffsets_[low + 1]);

  lastIndex_ = low;
  return low;
}
508
509
// Builds the LineToken for |offset| from its index in the line-start table.
TokenStreamAnyChars::SourceCoords::LineToken
TokenStreamAnyChars::SourceCoords::lineToken(uint32_t offset) const {
  const uint32_t index = indexFromOffset(offset);
  return LineToken(index, offset);
}
513
514
// Sets up character-type-independent tokenizer state from |options|, and
// marks the token kinds that can end an expression.
TokenStreamAnyChars::TokenStreamAnyChars(JSContext* cx,
                                         const ReadOnlyCompileOptions& options,
                                         StrictModeGetter* smg)
    : srcCoords(cx, options.lineno, options.scriptSourceOffset),
#if JS_COLUMN_DIMENSION_IS_CODE_POINTS()
      longLineColumnInfo_(cx),
#endif  // JS_COLUMN_DIMENSION_IS_CODE_POINTS()
      options_(options),
      tokens(),
      cursor_(0),
      lookahead(),
      lineno(options.lineno),
      flags(),
      linebase(0),
      prevLinebase(size_t(-1)),
      filename_(options.filename()),
      displayURL_(nullptr),
      sourceMapURL_(nullptr),
      cx(cx),
      mutedErrors(options.mutedErrors()),
      strictModeGetter(smg) {
  // |isExprEnding| starts out zeroed; flip on only the entries that are true.
  const TokenKind exprEndingKinds[] = {
      TokenKind::Comma,      TokenKind::Semi,         TokenKind::Colon,
      TokenKind::RightParen, TokenKind::RightBracket, TokenKind::RightCurly,
  };
  for (TokenKind kind : exprEndingKinds) {
    isExprEnding[size_t(kind)] = true;
  }
}
543
544
template <typename Unit>
545
TokenStreamCharsBase<Unit>::TokenStreamCharsBase(JSContext* cx,
546
const Unit* units,
547
size_t length,
548
size_t startOffset)
549
: TokenStreamCharsShared(cx), sourceUnits(units, length, startOffset) {}
550
551
template <>
552
MOZ_MUST_USE bool TokenStreamCharsBase<char16_t>::
553
fillCharBufferFromSourceNormalizingAsciiLineBreaks(const char16_t* cur,
554
const char16_t* end) {
555
MOZ_ASSERT(this->charBuffer.length() == 0);
556
557
while (cur < end) {
558
char16_t ch = *cur++;
559
if (ch == '\r') {
560
ch = '\n';
561
if (cur < end && *cur == '\n') {
562
cur++;
563
}
564
}
565
566
if (!this->charBuffer.append(ch)) {
567
return false;
568
}
569
}
570
571
MOZ_ASSERT(cur == end);
572
return true;
573
}
574
575
template <>
576
MOZ_MUST_USE bool TokenStreamCharsBase<Utf8Unit>::
577
fillCharBufferFromSourceNormalizingAsciiLineBreaks(const Utf8Unit* cur,
578
const Utf8Unit* end) {
579
MOZ_ASSERT(this->charBuffer.length() == 0);
580
581
while (cur < end) {
582
Utf8Unit unit = *cur++;
583
if (MOZ_LIKELY(IsAscii(unit))) {
584
char16_t ch = unit.toUint8();
585
if (ch == '\r') {
586
ch = '\n';
587
if (cur < end && *cur == Utf8Unit('\n')) {
588
cur++;
589
}
590
}
591
592
if (!this->charBuffer.append(ch)) {
593
return false;
594
}
595
596
continue;
597
}
598
599
Maybe<char32_t> ch = DecodeOneUtf8CodePoint(unit, &cur, end);
600
MOZ_ASSERT(ch.isSome(),
601
"provided source text should already have been validated");
602
603
if (!appendCodePointToCharBuffer(ch.value())) {
604
return false;
605
}
606
}
607
608
MOZ_ASSERT(cur == end);
609
return true;
610
}
611
612
template <typename Unit, class AnyCharsAccess>
613
TokenStreamSpecific<Unit, AnyCharsAccess>::TokenStreamSpecific(
614
JSContext* cx, const ReadOnlyCompileOptions& options, const Unit* units,
615
size_t length)
616
: TokenStreamChars<Unit, AnyCharsAccess>(cx, units, length,
617
options.scriptSourceOffset) {}
618
619
bool TokenStreamAnyChars::checkOptions() {
620
// Constrain starting columns to half of the range of a signed 32-bit value,
621
// to avoid overflow.
622
if (options().column >= mozilla::MaxValue<int32_t>::value / 2 + 1) {
623
reportErrorNoOffset(JSMSG_BAD_COLUMN_NUMBER);
624
return false;
625
}
626
627
return true;
628
}
629
630
void TokenStreamAnyChars::reportErrorNoOffset(unsigned errorNumber, ...) {
631
va_list args;
632
va_start(args, errorNumber);
633
634
reportErrorNoOffsetVA(errorNumber, &args);
635
636
va_end(args);
637
}
638
639
void TokenStreamAnyChars::reportErrorNoOffsetVA(unsigned errorNumber,
640
va_list* args) {
641
ErrorMetadata metadata;
642
computeErrorMetadataNoOffset(&metadata);
643
644
ReportCompileError(cx, std::move(metadata), nullptr, JSREPORT_ERROR,
645
errorNumber, args);
646
}
647
648
// Map |fast_getc| onto the cheapest getc variant the build reports as
// available, falling back to plain getc.
#ifdef HAVE_GETC_UNLOCKED
#  define fast_getc getc_unlocked
#elif defined(HAVE__GETC_NOLOCK)
#  define fast_getc _getc_nolock
#else
#  define fast_getc getc
#endif
656
657
// Advances line bookkeeping past an end-of-line: remembers the old line base,
// installs |lineStartOffset| as the new one, bumps the line number, and
// records the new line start in |srcCoords|.
//
// Returns the result of |srcCoords.add|.
MOZ_MUST_USE MOZ_ALWAYS_INLINE bool
TokenStreamAnyChars::internalUpdateLineInfoForEOL(uint32_t lineStartOffset) {
  prevLinebase = linebase;
  linebase = lineStartOffset;
  ++lineno;
  return srcCoords.add(lineno, lineStartOffset);
}
664
665
#ifdef DEBUG
666
667
template <>
668
inline void SourceUnits<char16_t>::assertNextCodePoint(
669
const PeekedCodePoint<char16_t>& peeked) {
670
char32_t c = peeked.codePoint();
671
if (c < unicode::NonBMPMin) {
672
MOZ_ASSERT(peeked.lengthInUnits() == 1);
673
MOZ_ASSERT(ptr[0] == c);
674
} else {
675
MOZ_ASSERT(peeked.lengthInUnits() == 2);
676
char16_t lead, trail;
677
unicode::UTF16Encode(c, &lead, &trail);
678
MOZ_ASSERT(ptr[0] == lead);
679
MOZ_ASSERT(ptr[1] == trail);
680
}
681
}
682
683
template <>
684
inline void SourceUnits<Utf8Unit>::assertNextCodePoint(
685
const PeekedCodePoint<Utf8Unit>& peeked) {
686
char32_t c = peeked.codePoint();
687
688
// This is all roughly indulgence of paranoia only for assertions, so the
689
// reimplementation of UTF-8 encoding a code point is (we think) a virtue.
690
uint8_t expectedUnits[4] = {};
691
if (c < 0x80) {
692
expectedUnits[0] = AssertedCast<uint8_t>(c);
693
} else if (c < 0x800) {
694
expectedUnits[0] = 0b1100'0000 | (c >> 6);
695
expectedUnits[1] = 0b1000'0000 | (c & 0b11'1111);
696
} else if (c < 0x10000) {
697
expectedUnits[0] = 0b1110'0000 | (c >> 12);
698
expectedUnits[1] = 0b1000'0000 | ((c >> 6) & 0b11'1111);
699
expectedUnits[2] = 0b1000'0000 | (c & 0b11'1111);
700
} else {
701
expectedUnits[0] = 0b1111'0000 | (c >> 18);
702
expectedUnits[1] = 0b1000'0000 | ((c >> 12) & 0b11'1111);
703
expectedUnits[2] = 0b1000'0000 | ((c >> 6) & 0b11'1111);
704
expectedUnits[3] = 0b1000'0000 | (c & 0b11'1111);
705
}
706
707
MOZ_ASSERT(peeked.lengthInUnits() <= 4);
708
for (uint8_t i = 0; i < peeked.lengthInUnits(); i++) {
709
MOZ_ASSERT(expectedUnits[i] == ptr[i].toUint8());
710
}
711
}
712
713
#endif // DEBUG
714
715
#if JS_COLUMN_DIMENSION_IS_CODE_POINTS()
716
717
// Moves |*ptr| backward, if needed, so that it points at the first unit of a
// UTF-8 code point. |limit| is itself a code point boundary, so a pointer
// equal to |limit| is left alone.
static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary(
    const Utf8Unit** ptr, const Utf8Unit* limit) {
  MOZ_ASSERT(*ptr <= limit);

  const Utf8Unit* cursor = *ptr;
  if (MOZ_UNLIKELY(cursor == limit)) {
    return;
  }

#  ifdef DEBUG
  const Utf8Unit* const initial = cursor;
#  endif

  // Step back over trailing units until reaching a non-trailing one.
  while (MOZ_UNLIKELY(IsTrailingUnit(*cursor))) {
    cursor--;
  }

  MOZ_ASSERT(initial - cursor < 4,
             "the longest UTF-8 code point is four units, so this should never "
             "retract more than three units");

  *ptr = cursor;
}
741
742
// Moves |*ptr| back by one unit iff it currently splits a UTF-16 surrogate
// pair. |limit| is itself a code point boundary, so a pointer equal to
// |limit| is left alone.
static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary(
    const char16_t** ptr, const char16_t* limit) {
  MOZ_ASSERT(*ptr <= limit);

  if (MOZ_UNLIKELY(*ptr == limit)) {
    return;
  }

  // A trail surrogate here is almost always the second half of a pair; only
  // test suites feeding garbage WTF-16 are expected to violate that, hence
  // the explicit check of the preceding unit.
  if (MOZ_UNLIKELY(unicode::IsTrailSurrogate((*ptr)[0])) &&
      MOZ_LIKELY(unicode::IsLeadSurrogate((*ptr)[-1]))) {
    --*ptr;
  }
}
761
762
template <typename Unit>
763
uint32_t TokenStreamAnyChars::computePartialColumn(
764
const LineToken lineToken, const uint32_t offset,
765
const SourceUnits<Unit>& sourceUnits) const {
766
lineToken.assertConsistentOffset(offset);
767
768
const uint32_t line = lineNumber(lineToken);
769
const uint32_t start = srcCoords.lineStart(lineToken);
770
771
// Reset the previous offset/column cache for this line, if the previous
772
// lookup wasn't on this line.
773
if (line != lineOfLastColumnComputation_) {
774
lineOfLastColumnComputation_ = line;
775
lastChunkVectorForLine_ = nullptr;
776
lastOffsetOfComputedColumn_ = start;
777
lastComputedColumn_ = 0;
778
}
779
780
// Compute and return the final column number from a partial offset/column,
781
// using the last-cached offset/column if they're more optimal.
782
auto ColumnFromPartial = [this, offset, &sourceUnits](uint32_t partialOffset,
783
uint32_t partialCols,
784
UnitsType unitsType) {
785
MOZ_ASSERT(partialOffset <= offset);
786
787
// If the last lookup on this line was closer to |offset|, use it.
788
if (partialOffset < this->lastOffsetOfComputedColumn_ &&
789
this->lastOffsetOfComputedColumn_ <= offset) {
790
partialOffset = this->lastOffsetOfComputedColumn_;
791
partialCols = this->lastComputedColumn_;
792
}
793
794
const Unit* begin = sourceUnits.codeUnitPtrAt(partialOffset);
795
const Unit* end = sourceUnits.codeUnitPtrAt(offset);
796
797
size_t offsetDelta = AssertedCast<uint32_t>(PointerRangeSize(begin, end));
798
partialOffset += offsetDelta;
799
800
if (unitsType == UnitsType::GuaranteedSingleUnit) {
801
MOZ_ASSERT(unicode::CountCodePoints(begin, end) == offsetDelta,
802
"guaranteed-single-units also guarantee pointer distance "
803
"equals code point count");
804
partialCols += offsetDelta;
805
} else {
806
partialCols +=
807
AssertedCast<uint32_t>(unicode::CountCodePoints(begin, end));
808
}
809
810
this->lastOffsetOfComputedColumn_ = partialOffset;
811
this->lastComputedColumn_ = partialCols;
812
return partialCols;
813
};
814
815
const uint32_t offsetInLine = offset - start;
816
817
// The index within any associated |Vector<ChunkInfo>| of |offset|'s chunk.
818
const uint32_t chunkIndex = offsetInLine / ColumnChunkLength;
819
if (chunkIndex == 0) {
820
// We don't know from an |offset| in the zeroth chunk that this line is even
821
// long. First-chunk info is mostly useless, anyway -- we have |start|
822
// already. So if we have *easy* access to that zeroth chunk, use it --
823
// otherwise just count pessimally. (This will still benefit from caching
824
// the last column/offset for computations for successive offsets, so it's
825
// not *always* worst-case.)
826
UnitsType unitsType;
827
if (lastChunkVectorForLine_ && lastChunkVectorForLine_->length() > 0) {
828
MOZ_ASSERT((*lastChunkVectorForLine_)[0].column() == 0);
829
unitsType = (*lastChunkVectorForLine_)[0].unitsType();
830
} else {
831
unitsType = UnitsType::PossiblyMultiUnit;
832
}
833
834
return ColumnFromPartial(start, 0, unitsType);
835
}
836
837
// If this line has no chunk vector yet, insert one in the hash map. (The
838
// required index is allocated and filled further down.)
839
if (!lastChunkVectorForLine_) {
840
auto ptr = longLineColumnInfo_.lookupForAdd(line);
841
if (!ptr) {
842
// This could rehash and invalidate a cached vector pointer, but the outer
843
// condition means we don't have a cached pointer.
844
if (!longLineColumnInfo_.add(ptr, line, Vector<ChunkInfo>(cx))) {
845
// In case of OOM, just count columns from the start of the line.
846
cx->recoverFromOutOfMemory();
847
return ColumnFromPartial(start, 0, UnitsType::PossiblyMultiUnit);
848
}
849
}
850
851
// Note that adding elements to this vector won't invalidate this pointer.
852
lastChunkVectorForLine_ = &ptr->value();
853
}
854
855
const Unit* const limit = sourceUnits.codeUnitPtrAt(offset);
856
857
auto RetractedOffsetOfChunk = [
858
# ifdef DEBUG
859
this,
860
# endif
861
start, limit,
862
&sourceUnits](uint32_t index) {
863
MOZ_ASSERT(index < this->lastChunkVectorForLine_->length());
864
865
uint32_t naiveOffset = start + index * ColumnChunkLength;
866
const Unit* naivePtr = sourceUnits.codeUnitPtrAt(naiveOffset);
867
868
const Unit* actualPtr = naivePtr;
869
RetractPointerToCodePointBoundary(&actualPtr, limit);
870
871
# ifdef DEBUG
872
if ((*this->lastChunkVectorForLine_)[index].unitsType() ==
873
UnitsType::GuaranteedSingleUnit) {
874
MOZ_ASSERT(naivePtr == actualPtr, "miscomputed unitsType value");
875
}
876
# endif
877
878
return naiveOffset - PointerRangeSize(actualPtr, naivePtr);
879
};
880
881
uint32_t partialOffset;
882
uint32_t partialColumn;
883
UnitsType unitsType;
884
885
auto entriesLen = AssertedCast<uint32_t>(lastChunkVectorForLine_->length());
886
if (chunkIndex < entriesLen) {
887
// We've computed the chunk |offset| resides in. Compute the column number
888
// from the chunk.
889
partialOffset = RetractedOffsetOfChunk(chunkIndex);
890
partialColumn = (*lastChunkVectorForLine_)[chunkIndex].column();
891
892
// This is exact if |chunkIndex| isn't the last chunk.
893
unitsType = (*lastChunkVectorForLine_)[chunkIndex].unitsType();
894
895
// Otherwise the last chunk is pessimistically assumed to contain multi-unit
896
// code points because we haven't fully examined its contents yet -- they
897
// may not have been tokenized yet, they could contain encoding errors, or
898
// they might not even exist.
899
MOZ_ASSERT_IF(chunkIndex == entriesLen - 1,
900
(*lastChunkVectorForLine_)[chunkIndex].unitsType() ==
901
UnitsType::PossiblyMultiUnit);
902
} else {
903
// Extend the vector from its last entry or the start of the line. (This is
904
// also a suitable partial start point if we must recover from OOM.)
905
if (entriesLen > 0) {
906
partialOffset = RetractedOffsetOfChunk(entriesLen - 1);
907
partialColumn = (*lastChunkVectorForLine_)[entriesLen - 1].column();
908
} else {
909
partialOffset = start;
910
partialColumn = 0;
911
}
912
913
if (!lastChunkVectorForLine_->reserve(chunkIndex + 1)) {
914
// As earlier, just start from the greatest offset/column in case of OOM.
915
cx->recoverFromOutOfMemory();
916
return ColumnFromPartial(partialOffset, partialColumn,
917
UnitsType::PossiblyMultiUnit);
918
}
919
920
// OOM is no longer possible now. \o/
921
922
// The vector always begins with the column of the line start, i.e. zero,
923
// with chunk units pessimally assumed not single-unit.
924
if (entriesLen == 0) {
925
lastChunkVectorForLine_->infallibleAppend(
926
ChunkInfo(0, UnitsType::PossiblyMultiUnit));
927
entriesLen++;
928
}
929
930
do {
931
const Unit* const begin = sourceUnits.codeUnitPtrAt(partialOffset);
932
const Unit* chunkLimit = sourceUnits.codeUnitPtrAt(
933
start + std::min(entriesLen++ * ColumnChunkLength, offsetInLine));
934
935
MOZ_ASSERT(begin < chunkLimit);
936
MOZ_ASSERT(chunkLimit <= limit);
937
938
static_assert(ColumnChunkLength > SourceUnitTraits<Unit>::maxUnitsLength,
939
"chunk length in code units must be able to contain the "
940
"largest encoding of a code point, for retracting below to "
941
"never underflow");
942
943
// Prior tokenizing ensured that [begin, limit) is validly encoded, and
944
// |begin < chunkLimit|, so any retraction here can't underflow.
945
RetractPointerToCodePointBoundary(&chunkLimit, limit);
946
947
MOZ_ASSERT(begin < chunkLimit);
948
MOZ_ASSERT(chunkLimit <= limit);
949
950
size_t numUnits = PointerRangeSize(begin, chunkLimit);
951
size_t numCodePoints = unicode::CountCodePoints(begin, chunkLimit);
952
953
// If this chunk (which will become non-final at the end of the loop) is
954
// all single-unit code points, annotate the chunk accordingly.
955
if (numUnits == numCodePoints) {
956
lastChunkVectorForLine_->back().guaranteeSingleUnits();
957
}
958
959
partialOffset += numUnits;
960
partialColumn += numCodePoints;
961
962
lastChunkVectorForLine_->infallibleEmplaceBack(
963
partialColumn, UnitsType::PossiblyMultiUnit);
964
} while (entriesLen < chunkIndex + 1);
965
966
// We're at a spot in the current final chunk, and final chunks never have
967
// complete units information, so be pessimistic.
968
unitsType = UnitsType::PossiblyMultiUnit;
969
}
970
971
return ColumnFromPartial(partialOffset, partialColumn, unitsType);
972
}
973
974
#endif // JS_COLUMN_DIMENSION_IS_CODE_POINTS()
975
976
template <typename Unit, class AnyCharsAccess>
977
uint32_t GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeColumn(
978
LineToken lineToken, uint32_t offset) const {
979
lineToken.assertConsistentOffset(offset);
980
981
const TokenStreamAnyChars& anyChars = anyCharsAccess();
982
983
uint32_t partialCols =
984
#if JS_COLUMN_DIMENSION_IS_CODE_POINTS()
985
anyChars.computePartialColumn(lineToken, offset, this->sourceUnits)
986
#else
987
offset - anyChars.lineStart(lineToken)
988
#endif // JS_COLUMN_DIMENSION_IS_CODE_POINTS()
989
;
990
991
return (lineToken.isFirstLine() ? anyChars.options_.column : 0) + partialCols;
992
}
993
994
template <typename Unit, class AnyCharsAccess>
995
void GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeLineAndColumn(
996
uint32_t offset, uint32_t* line, uint32_t* column) const {
997
const TokenStreamAnyChars& anyChars = anyCharsAccess();
998
999
auto lineToken = anyChars.lineToken(offset);
1000
*line = anyChars.lineNumber(lineToken);
1001
*column = computeColumn(lineToken, offset);
1002
}
1003
1004
template <class AnyCharsAccess>
1005
MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::internalEncodingError(
1006
uint8_t relevantUnits, unsigned errorNumber, ...) {
1007
va_list args;
1008
va_start(args, errorNumber);
1009
1010
do {
1011
size_t offset = this->sourceUnits.offset();
1012
1013
ErrorMetadata err;
1014
1015
TokenStreamAnyChars& anyChars = anyCharsAccess();
1016
1017
bool canAddLineOfContext = fillExceptingContext(&err, offset);
1018
if (canAddLineOfContext) {
1019
if (!internalComputeLineOfContext(&err, offset)) {
1020
break;
1021
}
1022
1023
// As this is an encoding error, the computed window-end must be
1024
// identical to the location of the error -- any further on and the
1025
// window would contain invalid Unicode.
1026
MOZ_ASSERT_IF(err.lineOfContext != nullptr,
1027
err.lineLength == err.tokenOffset);
1028
}
1029
1030
auto notes = MakeUnique<JSErrorNotes>();
1031
if (!notes) {
1032
ReportOutOfMemory(anyChars.cx);
1033
break;
1034
}
1035
1036
// The largest encoding of a UTF-8 code point is 4 units. (Encoding an
1037
// obsolete 5- or 6-byte code point will complain only about a bad lead
1038
// code unit.)
1039
constexpr size_t MaxWidth = sizeof("0xHH 0xHH 0xHH 0xHH");
1040
1041
MOZ_ASSERT(relevantUnits > 0);
1042
1043
char badUnitsStr[MaxWidth];
1044
char* ptr = badUnitsStr;
1045
while (relevantUnits > 0) {
1046
byteToString(this->sourceUnits.getCodeUnit().toUint8(), ptr);
1047
ptr[4] = ' ';
1048
1049
ptr += 5;
1050
relevantUnits--;
1051
}
1052
1053
ptr[-1] = '\0';
1054
1055
uint32_t line, column;
1056
computeLineAndColumn(offset, &line, &column);
1057
1058
if (!notes->addNoteASCII(anyChars.cx, anyChars.getFilename(), 0, line,
1059
column, GetErrorMessage, nullptr,
1060
JSMSG_BAD_CODE_UNITS, badUnitsStr)) {
1061
break;
1062
}
1063
1064
ReportCompileError(anyChars.cx, std::move(err), std::move(notes),
1065
JSREPORT_ERROR, errorNumber, &args);
1066
} while (false);
1067
1068
va_end(args);
1069
}
1070
1071
template <class AnyCharsAccess>
1072
MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::badLeadUnit(
1073
Utf8Unit lead) {
1074
uint8_t leadValue = lead.toUint8();
1075
1076
char leadByteStr[5];
1077
byteToTerminatedString(leadValue, leadByteStr);
1078
1079
internalEncodingError(1, JSMSG_BAD_LEADING_UTF8_UNIT, leadByteStr);
1080
}
1081
1082
template <class AnyCharsAccess>
1083
MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::notEnoughUnits(
1084
Utf8Unit lead, uint8_t remaining, uint8_t required) {
1085
uint8_t leadValue = lead.toUint8();
1086
1087
MOZ_ASSERT(required == 2 || required == 3 || required == 4);
1088
MOZ_ASSERT(remaining < 4);
1089
MOZ_ASSERT(remaining < required);
1090
1091
char leadByteStr[5];
1092
byteToTerminatedString(leadValue, leadByteStr);
1093
1094
// |toHexChar| produces the desired decimal numbers for values < 4.
1095
const char expectedStr[] = {toHexChar(required - 1), '\0'};
1096
const char actualStr[] = {toHexChar(remaining - 1), '\0'};
1097
1098
internalEncodingError(remaining, JSMSG_NOT_ENOUGH_CODE_UNITS, leadByteStr,
1099
expectedStr, required == 2 ? "" : "s", actualStr,
1100
remaining == 2 ? " was" : "s were");
1101
}
1102
1103
template <class AnyCharsAccess>
1104
MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::badTrailingUnit(
1105
uint8_t unitsObserved) {
1106
Utf8Unit badUnit =
1107
this->sourceUnits.addressOfNextCodeUnit()[unitsObserved - 1];
1108
1109
char badByteStr[5];
1110
byteToTerminatedString(badUnit.toUint8(), badByteStr);
1111
1112
internalEncodingError(unitsObserved, JSMSG_BAD_TRAILING_UTF8_UNIT,
1113
badByteStr);
1114
}
1115
1116
template <class AnyCharsAccess>
1117
MOZ_COLD void
1118
TokenStreamChars<Utf8Unit, AnyCharsAccess>::badStructurallyValidCodePoint(
1119
uint32_t codePoint, uint8_t codePointLength, const char* reason) {
1120
// Construct a string like "0x203D" (including null terminator) to include
1121
// in the error message. Write the string end-to-start from end to start
1122
// of an adequately sized |char| array, shifting least significant nibbles
1123
// off the number and writing the corresponding hex digits until done, then
1124
// prefixing with "0x". |codePointStr| points at the incrementally
1125
// computed string, within |codePointCharsArray|'s bounds.
1126
1127
// 0x1F'FFFF is the maximum value that can fit in 3+6+6+6 unconstrained
1128
// bits in a four-byte UTF-8 code unit sequence.
1129
constexpr size_t MaxHexSize = sizeof(
1130
"0x1F"
1131
"FFFF"); // including '\0'
1132
char codePointCharsArray[MaxHexSize];
1133
1134
char* codePointStr = codePointCharsArray + ArrayLength(codePointCharsArray);
1135
*--codePointStr = '\0';
1136
1137
// Note that by do-while looping here rather than while-looping, this
1138
// writes a '0' when |codePoint == 0|.
1139
do {
1140
MOZ_ASSERT(codePointCharsArray < codePointStr);
1141
*--codePointStr = toHexChar(codePoint & 0xF);
1142
codePoint >>= 4;
1143
} while (codePoint);
1144
1145
MOZ_ASSERT(codePointCharsArray + 2 <= codePointStr);
1146
*--codePointStr = 'x';
1147
*--codePointStr = '0';
1148
1149
internalEncodingError(codePointLength, JSMSG_FORBIDDEN_UTF8_CODE_POINT,
1150
codePointStr, reason);
1151
}
1152
1153
template <class AnyCharsAccess>
1154
MOZ_MUST_USE bool
1155
TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePointDontNormalize(
1156
Utf8Unit lead, char32_t* codePoint) {
1157
auto onBadLeadUnit = [this, &lead]() { this->badLeadUnit(lead); };
1158
1159
auto onNotEnoughUnits = [this, &lead](uint8_t remaining, uint8_t required) {
1160
this->notEnoughUnits(lead, remaining, required);
1161
};
1162
1163
auto onBadTrailingUnit = [this](uint8_t unitsObserved) {
1164
this->badTrailingUnit(unitsObserved);
1165
};
1166
1167
auto onBadCodePoint = [this](char32_t badCodePoint, uint8_t unitsObserved) {
1168
this->badCodePoint(badCodePoint, unitsObserved);
1169
};
1170
1171
auto onNotShortestForm = [this](char32_t badCodePoint,
1172
uint8_t unitsObserved) {
1173
this->notShortestForm(badCodePoint, unitsObserved);
1174
};
1175
1176
// If a valid code point is decoded, this function call consumes its code
1177
// units. If not, it ungets the lead code unit and invokes the right error
1178
// handler, so on failure we must immediately return false.
1179
SourceUnitsIterator iter(this->sourceUnits);
1180
Maybe<char32_t> maybeCodePoint = DecodeOneUtf8CodePointInline(
1181
lead, &iter, SourceUnitsEnd(), onBadLeadUnit, onNotEnoughUnits,
1182
onBadTrailingUnit, onBadCodePoint, onNotShortestForm);
1183
if (maybeCodePoint.isNothing()) {
1184
return false;
1185
}
1186
1187
*codePoint = maybeCodePoint.value();
1188
return true;
1189
}
1190
1191
template <class AnyCharsAccess>
1192
bool TokenStreamChars<char16_t, AnyCharsAccess>::getNonAsciiCodePoint(
1193
int32_t lead, int32_t* codePoint) {
1194
MOZ_ASSERT(lead != EOF);
1195
MOZ_ASSERT(!isAsciiCodePoint(lead),
1196
"ASCII code unit/point must be handled separately");
1197
MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
1198
"getNonAsciiCodePoint called incorrectly");
1199
1200
// The code point is usually |lead|: overwrite later if needed.
1201
*codePoint = lead;
1202
1203
// ECMAScript specifically requires that unpaired UTF-16 surrogates be
1204
// treated as the corresponding code point and not as an error. See
1206
// Thus this function does not consider any sequence of 16-bit numbers to
1207
// be intrinsically in error.
1208
1209
// Dispense with single-unit code points and lone trailing surrogates.
1210
if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead))) {
1211
if (MOZ_UNLIKELY(lead == unicode::LINE_SEPARATOR ||
1212
lead == unicode::PARA_SEPARATOR)) {
1213
if (!updateLineInfoForEOL()) {
1214
#ifdef DEBUG
1215
*codePoint = EOF; // sentinel value to hopefully cause errors
1216
#endif
1217
MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
1218
return false;
1219
}
1220
1221
*codePoint = '\n';
1222
} else {
1223
MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(*codePoint)));
1224
}
1225
1226
return true;
1227
}
1228
1229
// Also handle a lead surrogate not paired with a trailing surrogate.
1230
if (MOZ_UNLIKELY(
1231
this->sourceUnits.atEnd() ||
1232
!unicode::IsTrailSurrogate(this->sourceUnits.peekCodeUnit()))) {
1233
MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(*codePoint)));
1234
return true;
1235
}
1236
1237
// Otherwise we have a multi-unit code point.
1238
*codePoint = unicode::UTF16Decode(lead, this->sourceUnits.getCodeUnit());
1239
MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(*codePoint)));
1240
return true;
1241
}
1242
1243
template <typename Unit, class AnyCharsAccess>
1244
bool TokenStreamSpecific<Unit, AnyCharsAccess>::getCodePoint(int32_t* cp) {
1245
int32_t unit = getCodeUnit();
1246
if (unit == EOF) {
1247
MOZ_ASSERT(anyCharsAccess().flags.isEOF,
1248
"flags.isEOF should have been set by getCodeUnit()");
1249
*cp = EOF;
1250
return true;
1251
}
1252
1253
if (isAsciiCodePoint(unit)) {
1254
return getFullAsciiCodePoint(unit, cp);
1255
}
1256
1257
return getNonAsciiCodePoint(unit, cp);
1258
}
1259
1260
template <class AnyCharsAccess>
1261
bool TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePoint(
1262
int32_t unit, int32_t* codePoint) {
1263
MOZ_ASSERT(unit != EOF);
1264
MOZ_ASSERT(!isAsciiCodePoint(unit),
1265
"ASCII code unit/point must be handled separately");
1266
1267
Utf8Unit lead = Utf8Unit(static_cast<unsigned char>(unit));
1268
MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
1269
"getNonAsciiCodePoint called incorrectly");
1270
1271
auto onBadLeadUnit = [this, &lead]() { this->badLeadUnit(lead); };
1272
1273
auto onNotEnoughUnits = [this, &lead](uint_fast8_t remaining,
1274
uint_fast8_t required) {
1275
this->notEnoughUnits(lead, remaining, required);
1276
};
1277
1278
auto onBadTrailingUnit = [this](uint_fast8_t unitsObserved) {
1279
this->badTrailingUnit(unitsObserved);
1280
};
1281
1282
auto onBadCodePoint = [this](char32_t badCodePoint,
1283
uint_fast8_t unitsObserved) {
1284
this->badCodePoint(badCodePoint, unitsObserved);
1285
};
1286
1287
auto onNotShortestForm = [this](char32_t badCodePoint,
1288
uint_fast8_t unitsObserved) {
1289
this->notShortestForm(badCodePoint, unitsObserved);
1290
};
1291
1292
// This consumes the full, valid code point or ungets |lead| and calls the
1293
// appropriate error functor on failure.
1294
SourceUnitsIterator iter(this->sourceUnits);
1295
Maybe<char32_t> maybeCodePoint = DecodeOneUtf8CodePoint(
1296
lead, &iter, SourceUnitsEnd(), onBadLeadUnit, onNotEnoughUnits,
1297
onBadTrailingUnit, onBadCodePoint, onNotShortestForm);
1298
if (maybeCodePoint.isNothing()) {
1299
return false;
1300
}
1301
1302
char32_t cp = maybeCodePoint.value();
1303
if (MOZ_UNLIKELY(cp == unicode::LINE_SEPARATOR ||
1304
cp == unicode::PARA_SEPARATOR)) {
1305
if (!updateLineInfoForEOL()) {
1306
#ifdef DEBUG
1307
*codePoint = EOF; // sentinel value to hopefully cause errors
1308
#endif
1309
MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
1310
return false;
1311
}
1312
1313
*codePoint = '\n';
1314
} else {
1315
MOZ_ASSERT(!IsLineTerminator(cp));
1316
*codePoint = AssertedCast<int32_t>(cp);
1317
}
1318
1319
return true;
1320
}
1321
1322
template <>
1323
size_t SourceUnits<char16_t>::findWindowStart(size_t offset) const {
1324
// This is JS's understanding of UTF-16 that allows lone surrogates, so
1325
// we have to exclude lone surrogates from [windowStart, offset) ourselves.
1326
1327
const char16_t* const earliestPossibleStart = codeUnitPtrAt(startOffset_);
1328
1329
const char16_t* const initial = codeUnitPtrAt(offset);
1330
const char16_t* p = initial;
1331
1332
auto HalfWindowSize = [&p, &initial]() {
1333
return PointerRangeSize(p, initial);
1334
};
1335
1336
while (true) {
1337
MOZ_ASSERT(earliestPossibleStart <= p);
1338
MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1339
if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) {
1340
break;
1341
}
1342
1343
char16_t c = p[-1];
1344
1345
// This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in
1346
// string and template literals. These code points do affect line and
1347
// column coordinates, even as they encode their literal values.
1348
if (IsLineTerminator(c)) {
1349
break;
1350
}
1351
1352
// Don't allow invalid UTF-16 in pre-context. (Current users don't
1353
// require this, and this behavior isn't currently imposed on
1354
// pre-context, but these facts might change someday.)
1355
1356
if (MOZ_UNLIKELY(unicode::IsLeadSurrogate(c))) {
1357
break;
1358
}
1359
1360
// Optimistically include the code unit, reverting below if needed.
1361
p--;
1362
1363
// If it's not a surrogate at all, keep going.
1364
if (MOZ_LIKELY(!unicode::IsTrailSurrogate(c))) {
1365
continue;
1366
}
1367
1368
// Stop if we don't have a usable surrogate pair.
1369
if (HalfWindowSize() >= WindowRadius ||
1370
p <= earliestPossibleStart || // trail surrogate at low end
1371
!unicode::IsLeadSurrogate(p[-1])) // no paired lead surrogate
1372
{
1373
p++;
1374
break;
1375
}
1376
1377
p--;
1378
}
1379
1380
MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1381
return offset - HalfWindowSize();
1382
}
1383
1384
template <>
1385
size_t SourceUnits<Utf8Unit>::findWindowStart(size_t offset) const {
1386
// |offset| must be the location of the error or somewhere before it, so we
1387
// know preceding data is valid UTF-8.
1388
1389
const Utf8Unit* const earliestPossibleStart = codeUnitPtrAt(startOffset_);
1390
1391
const Utf8Unit* const initial = codeUnitPtrAt(offset);
1392
const Utf8Unit* p = initial;
1393
1394
auto HalfWindowSize = [&p, &initial]() {
1395
return PointerRangeSize(p, initial);
1396
};
1397
1398
while (true) {
1399
MOZ_ASSERT(earliestPossibleStart <= p);
1400
MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1401
if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) {
1402
break;
1403
}
1404
1405
// Peek backward for a line break, and only decrement if there is none.
1406
uint8_t prev = p[-1].toUint8();
1407
1408
// First check for the ASCII LineTerminators.
1409
if (prev == '\r' || prev == '\n') {
1410
break;
1411
}
1412
1413
// Now check for the non-ASCII LineTerminators U+2028 LINE SEPARATOR
1414
// (0xE2 0x80 0xA8) and U+2029 PARAGRAPH (0xE2 0x80 0xA9). If there
1415
// aren't three code units available, some comparison here will fail
1416
// before we'd underflow.
1417
if (MOZ_UNLIKELY((prev == 0xA8 || prev == 0xA9) &&
1418
p[-2].toUint8() == 0x80 && p[-3].toUint8() == 0xE2)) {
1419
break;
1420
}
1421
1422
// Rewind over the non-LineTerminator. This can't underflow
1423
// |earliestPossibleStart| because it begins a code point.
1424
while (IsTrailingUnit(*--p)) {
1425
continue;
1426
}
1427
1428
MOZ_ASSERT(earliestPossibleStart <= p);
1429
1430
// But if we underflowed |WindowRadius|, adjust forward and stop.
1431
if (HalfWindowSize() > WindowRadius) {
1432
static_assert(WindowRadius > 3,
1433
"skipping over non-lead code units below must not "
1434
"advance past |offset|");
1435
1436
while (IsTrailingUnit(*++p)) {
1437
continue;
1438
}
1439
1440
MOZ_ASSERT(HalfWindowSize() < WindowRadius);
1441
break;
1442
}
1443
}
1444
1445
MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1446
return offset - HalfWindowSize();
1447
}
1448
1449
template <>
1450
size_t SourceUnits<char16_t>::findWindowEnd(size_t offset) const {
1451
const char16_t* const initial = codeUnitPtrAt(offset);
1452
const char16_t* p = initial;
1453
1454
auto HalfWindowSize = [&initial, &p]() {
1455
return PointerRangeSize(initial, p);
1456
};
1457
1458
while (true) {
1459
MOZ_ASSERT(p <= limit_);
1460
MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1461
if (p >= limit_ || HalfWindowSize() >= WindowRadius) {
1462
break;
1463
}
1464
1465
char16_t c = *p;
1466
1467
// This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in
1468
// string and template literals. These code points do affect line and
1469
// column coordinates, even as they encode their literal values.
1470
if (IsLineTerminator(c)) {
1471
break;
1472
}
1473
1474
// Don't allow invalid UTF-16 in post-context. (Current users don't
1475
// require this, and this behavior isn't currently imposed on
1476
// pre-context, but these facts might change someday.)
1477
1478
if (MOZ_UNLIKELY(unicode::IsTrailSurrogate(c))) {
1479
break;
1480
}
1481
1482
// Optimistically consume the code unit, ungetting it below if needed.
1483
p++;
1484
1485
// If it's not a surrogate at all, keep going.
1486
if (MOZ_LIKELY(!unicode::IsLeadSurrogate(c))) {
1487
continue;
1488
}
1489
1490
// Retract if the lead surrogate would stand alone at the end of the
1491
// window.
1492
if (HalfWindowSize() >= WindowRadius || // split pair
1493
p >= limit_ || // half-pair at end of source
1494
!unicode::IsTrailSurrogate(*p)) // no paired trail surrogate
1495
{
1496
p--;
1497
break;
1498
}
1499
1500
p++;
1501
}
1502
1503
return offset + HalfWindowSize();
1504
}
1505
1506
template <>
1507
size_t SourceUnits<Utf8Unit>::findWindowEnd(size_t offset) const {
1508
const Utf8Unit* const initial = codeUnitPtrAt(offset);
1509
const Utf8Unit* p = initial;
1510
1511
auto HalfWindowSize = [&initial, &p]() {
1512
return PointerRangeSize(initial, p);
1513
};
1514
1515
while (true) {
1516
MOZ_ASSERT(p <= limit_);
1517
MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1518
if (p >= limit_ || HalfWindowSize() >= WindowRadius) {
1519
break;
1520
}
1521
1522
// A non-encoding error might be followed by an encoding error within
1523
// |maxEnd|, so we must validate as we go to not include invalid UTF-8
1524
// in the computed window. What joy!
1525
1526
Utf8Unit lead = *p;
1527
if (mozilla::IsAscii(lead)) {
1528
if (IsSingleUnitLineTerminator(lead)) {
1529
break;
1530
}
1531
1532
p++;
1533
continue;
1534
}
1535
1536
PeekedCodePoint<Utf8Unit> peeked = PeekCodePoint(p, limit_);
1537
if (peeked.isNone()) {
1538
break; // encoding error
1539
}
1540
1541
char32_t c = peeked.codePoint();
1542
if (MOZ_UNLIKELY(c == unicode::LINE_SEPARATOR ||
1543
c == unicode::PARA_SEPARATOR)) {
1544
break;
1545
}
1546
1547
MOZ_ASSERT(!IsLineTerminator(c));
1548
1549
uint8_t len = peeked.lengthInUnits();
1550
if (HalfWindowSize() + len > WindowRadius) {
1551
break;
1552
}
1553
1554
p += len;
1555
}
1556
1557
MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1558
return offset + HalfWindowSize();
1559
}
1560
1561
template <typename Unit, class AnyCharsAccess>
1562
bool TokenStreamSpecific<Unit, AnyCharsAccess>::advance(size_t position) {
1563
const Unit* end = this->sourceUnits.codeUnitPtrAt(position);
1564
while (this->sourceUnits.addressOfNextCodeUnit() < end) {
1565
int32_t c;
1566
if (!getCodePoint(&c)) {
1567
return false;
1568
}
1569
}
1570
1571
TokenStreamAnyChars& anyChars = anyCharsAccess();
1572
Token* cur = const_cast<Token*>(&anyChars.currentToken());
1573
cur->pos.begin = this->sourceUnits.offset();
1574
cur->pos.end = cur->pos.begin;
1575
MOZ_MAKE_MEM_UNDEFINED(&cur->type, sizeof(cur->type));
1576
anyChars.lookahead = 0;
1577
return true;
1578
}
1579
1580
template <typename Unit, class AnyCharsAccess>
1581
void TokenStreamSpecific<Unit, AnyCharsAccess>::seek(const Position& pos) {
1582
TokenStreamAnyChars& anyChars = anyCharsAccess();
1583
1584
this->sourceUnits.setAddressOfNextCodeUnit(pos.buf,
1585
/* allowPoisoned = */ true);
1586
anyChars.flags = pos.flags;
1587
anyChars.lineno = pos.lineno;
1588
anyChars.linebase = pos.linebase;
1589
anyChars.prevLinebase = pos.prevLinebase;
1590
anyChars.lookahead = pos.lookahead;
1591
1592
anyChars.tokens[anyChars.cursor()] = pos.currentToken;
1593
for (unsigned i = 0; i < anyChars.lookahead; i++) {
1594
anyChars.tokens[anyChars.aheadCursor(1 + i)] = pos.lookaheadTokens[i];
1595
}
1596
}
1597
1598
template <typename Unit, class AnyCharsAccess>
1599
bool TokenStreamSpecific<Unit, AnyCharsAccess>::seek(
1600
const Position& pos, const TokenStreamAnyChars& other) {
1601
if (!anyCharsAccess().srcCoords.fill(other.srcCoords)) {
1602
return false;
1603
}
1604
1605
seek(pos);
1606
return true;
1607
}
1608
1609
// Fill |err| for an error that has no source offset: the muted flag and
// filename come from this stream, while line and column are both zero.  No
// line of context is provided, and |err| must not already carry one.
void TokenStreamAnyChars::computeErrorMetadataNoOffset(ErrorMetadata* err) {
  MOZ_ASSERT(err->lineOfContext == nullptr);

  err->isMuted = mutedErrors;
  err->filename = filename_;

  // With no offset there is no position to report.
  err->lineNumber = 0;
  err->columnNumber = 0;
}
1617
1618
bool TokenStreamAnyChars::fillExceptingContext(ErrorMetadata* err,
1619
uint32_t offset) {
1620
err->isMuted = mutedErrors;
1621
1622
// If this TokenStreamAnyChars doesn't have location information, try to
1623
// get it from the caller.
1624
if (!filename_ && !cx->isHelperThreadContext()) {
1625
NonBuiltinFrameIter iter(cx, FrameIter::FOLLOW_DEBUGGER_EVAL_PREV_LINK,
1626
cx->realm()->principals());
1627
if (!iter.done() && iter.filename()) {
1628
err->filename = iter.filename();
1629
err->lineNumber = iter.computeLine(&err->columnNumber);
1630
return false;
1631
}
1632
}
1633
1634
// Otherwise use this TokenStreamAnyChars's location information.
1635
err->filename = filename_;
1636
return true;
1637
}
1638
1639
template <typename Unit, class AnyCharsAccess>
1640
bool TokenStreamSpecific<Unit, AnyCharsAccess>::hasTokenizationStarted() const {
1641
const TokenStreamAnyChars& anyChars = anyCharsAccess();
1642
return anyChars.isCurrentTokenType(TokenKind::Eof) && !anyChars.isEOF();
1643
}
1644
1645
template <>
1646
inline void SourceUnits<char16_t>::computeWindowOffsetAndLength(
1647
const char16_t* encodedWindow, size_t encodedTokenOffset,
1648
size_t* utf16TokenOffset, size_t encodedWindowLength,
1649
size_t* utf16WindowLength) {
1650
MOZ_ASSERT_UNREACHABLE("shouldn't need to recompute for UTF-16");
1651
}
1652
1653
template <>
1654
inline void SourceUnits<Utf8Unit>::computeWindowOffsetAndLength(
1655
const Utf8Unit* encodedWindow, size_t encodedTokenOffset,
1656
size_t* utf16TokenOffset, size_t encodedWindowLength,
1657
size_t* utf16WindowLength) {
1658
MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength,
1659
"token offset must be within the window, and the two lambda "
1660
"calls below presume this ordering of values");
1661
1662
const Utf8Unit* const encodedWindowEnd = encodedWindow + encodedWindowLength;
1663
1664
size_t i = 0;
1665
auto ComputeUtf16Count = [&i, &encodedWindow](const Utf8Unit* limit) {
1666
while (encodedWindow < limit) {
1667
Utf8Unit lead = *encodedWindow++;
1668
if (MOZ_LIKELY(IsAscii(lead))) {
1669
// ASCII contributes a single UTF-16 code unit.
1670
i++;
1671
continue;
1672
}
1673
1674
Maybe<char32_t> cp = DecodeOneUtf8CodePoint(lead, &encodedWindow, limit);
1675
MOZ_ASSERT(cp.isSome(),
1676
"computed window should only contain valid UTF-8");
1677
1678
i += unicode::IsSupplementary(cp.value()) ? 2 : 1;
1679
}
1680
1681
return i;
1682
};
1683
1684
// Compute the token offset from |i == 0| and the initial |encodedWindow|.
1685
const Utf8Unit* token = encodedWindow + encodedTokenOffset;
1686
MOZ_ASSERT(token <= encodedWindowEnd);
1687
*utf16TokenOffset = ComputeUtf16Count(token);
1688
1689
// Compute the window length, picking up from |i| and |encodedWindow| that,
1690
// in general, were modified just above.
1691
*utf16WindowLength = ComputeUtf16Count(encodedWindowEnd);
1692
}
1693
1694
template <typename Unit>
1695
bool TokenStreamCharsBase<Unit>::addLineOfContext(ErrorMetadata* err,
1696
uint32_t offset) {
1697
// Rename the variable to make meaning clearer: an offset into source units
1698
// in Unit encoding.
1699
size_t encodedOffset = offset;
1700
1701
// These are also offsets into source units in Unit encoding.
1702
size_t encodedWindowStart = sourceUnits.findWindowStart(encodedOffset);
1703
size_t encodedWindowEnd = sourceUnits.findWindowEnd(encodedOffset);
1704
1705
size_t encodedWindowLength = encodedWindowEnd - encodedWindowStart;
1706
MOZ_ASSERT(encodedWindowLength <= SourceUnits::WindowRadius * 2);
1707
1708
// Don't add a useless "line" of context when the window ends up empty
1709
// because of an invalid encoding at the start of a line.
1710
if (encodedWindowLength == 0) {
1711
MOZ_ASSERT(err->lineOfContext == nullptr,
1712
"ErrorMetadata::lineOfContext must be null so we don't "
1713
"have to set the lineLength/tokenOffset fields");
1714
return true;
1715
}
1716
1717
// We might have hit an error while processing some source code feature
1718
// that's accumulating text into |this->charBuffer| -- e.g. we could be
1719
// halfway into a regular expression literal, then encounter invalid UTF-8.
1720
// Thus we must clear |this->charBuffer| of prior work.
1721
this->charBuffer.clear();
1722
1723
const Unit* encodedWindow = sourceUnits.codeUnitPtrAt(encodedWindowStart);
1724
if (!fillCharBufferFromSourceNormalizingAsciiLineBreaks(
1725
encodedWindow, encodedWindow + encodedWindowLength)) {
1726
return false;
1727
}
1728
1729
size_t utf16WindowLength = this->charBuffer.length();
1730
1731
// The windowed string is null-terminated.
1732
if (!this->charBuffer.append('\0')) {
1733
return false;
1734
}
1735
1736
err->lineOfContext.reset(this->charBuffer.extractOrCopyRawBuffer());
1737
if (!err->lineOfContext) {
1738
return false;
1739
}
1740
1741
size_t encodedTokenOffset = encodedOffset - encodedWindowStart;
1742
1743
MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength,
1744
"token offset must be inside the window");
1745
1746
// The length in UTF-8 code units of a code point is always greater than or
1747
// equal to the same code point's length in UTF-16 code points. ASCII code
1748
// points are 1 unit in either encoding. Code points in [U+0080, U+10000)
1749
// are 2-3 UTF-8 code units to 1 UTF-16 code unit. And code points in
1750
// [U+10000, U+10FFFF] are 4 UTF-8 code units to 2 UTF-16 code units.
1751
//
1752
// Therefore, if encoded window length equals the length in UTF-16 (this is
1753
// always the case for Unit=char16_t), the UTF-16 offsets are exactly the
1754
// encoded offsets. Otherwise we must convert offset/length from UTF-8 to
1755
// UTF-16.
1756
if (std::is_same<Unit, char16_t>::value) {
1757
MOZ_ASSERT(utf16WindowLength == encodedWindowLength,
1758
"UTF-16 to UTF-16 shouldn't change window length");
1759
err->tokenOffset = encodedTokenOffset;
1760
err->lineLength = encodedWindowLength;
1761
} else {
1762
MOZ_ASSERT((std::is_same<Unit, Utf8Unit>::value),
1763
"should only see UTF-8 here");
1764
1765
bool simple = utf16WindowLength == encodedWindowLength;
1766
MOZ_ASSERT(std::all_of(encodedWindow, encodedWindow + encodedWindowLength,
1767
IsAscii<Unit>) == simple,
1768
"equal window lengths in UTF-8 should correspond only to "
1769
"wholly-ASCII text");
1770
1771
if (simple) {
1772
err->tokenOffset = encodedTokenOffset;
1773
err->lineLength = encodedWindowLength;
1774
} else {
1775
sourceUnits.computeWindowOffsetAndLength(
1776
encodedWindow, encodedTokenOffset, &err->tokenOffset,
1777
encodedWindowLength, &err->lineLength);
1778
}
1779
}
1780
1781
return true;
1782
}
1783
1784
template <typename Unit, class AnyCharsAccess>
1785
bool TokenStreamSpecific<Unit, AnyCharsAccess>::computeErrorMetadata(
1786
ErrorMetadata* err, const ErrorOffset& errorOffset) {
1787
if (errorOffset.is<NoOffset>()) {
1788
anyCharsAccess().computeErrorMetadataNoOffset(err);
1789
return true;
1790
}
1791
1792
uint32_t offset;
1793
if (errorOffset.is<uint32_t>()) {
1794
offset = errorOffset.as<uint32_t>();
1795
} else {
1796
offset = this->sourceUnits.offset();
1797
}
1798
1799
// This function's return value isn't a success/failure indication: it
1800
// returns true if this TokenStream can be used to provide a line of
1801
// context.
1802
if (fillExceptingContext(err, offset)) {
1803
// Add a line of context from this TokenStream to help with debugging.
1804
return internalComputeLineOfContext(err, offset);
1805
}
1806
1807
// We can't fill in any more here.
1808
return true;
1809
}
1810
1811
// We have encountered a '\': check for a Unicode escape sequence after it.
1812
// Return the length of the escape sequence and the encoded code point (by
1813
// value) if we found a Unicode escape sequence, and skip all code units
1814
// involved. Otherwise, return 0 and don't advance along the buffer.
1815
template <typename Unit, class AnyCharsAccess>
1816
uint32_t GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscape(
1817
uint32_t* codePoint) {
1818
MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1819
1820
int32_t unit = getCodeUnit();
1821
if (unit != 'u') {
1822
// NOTE: |unit| may be EOF here.
1823
ungetCodeUnit(unit);
1824
MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1825
return 0;
1826
}
1827
1828
char16_t v;
1829
unit = getCodeUnit();
1830
if (IsAsciiHexDigit(unit) && this->sourceUnits.matchHexDigits(3, &v)) {
1831
*codePoint = (AsciiAlphanumericToNumber(unit) << 12) | v;
1832
return 5;
1833
}
1834
1835
if (unit == '{') {
1836
return matchExtendedUnicodeEscape(codePoint);