Source code

Revision control

Copy as Markdown

Other Tools

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
* vim: set ts=8 sts=2 et sw=2 tw=80:
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// JS lexical scanner.
#include "frontend/TokenStream.h"
#include "mozilla/ArrayUtils.h"
#include "mozilla/Attributes.h"
#include "mozilla/Likely.h"
#include "mozilla/Maybe.h"
#include "mozilla/MemoryChecking.h"
#include "mozilla/ScopeExit.h"
#include "mozilla/Span.h"
#include "mozilla/TemplateLib.h"
#include "mozilla/TextUtils.h"
#include "mozilla/Utf8.h"
#include <algorithm>
#include <iterator>
#include <limits>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <type_traits>
#include <utility>
#include "jsnum.h"
#include "frontend/FrontendContext.h"
#include "frontend/Parser.h"
#include "frontend/ParserAtom.h"
#include "frontend/ReservedWords.h"
#include "js/CharacterEncoding.h" // JS::ConstUTF8CharsZ
#include "js/ColumnNumber.h" // JS::LimitedColumnNumberOneOrigin, JS::ColumnNumberOneOrigin, JS::TaggedColumnNumberOneOrigin
#include "js/ErrorReport.h" // JSErrorBase
#include "js/friend/ErrorMessages.h" // js::GetErrorMessage, JSMSG_*
#include "js/Printf.h" // JS_smprintf
#include "js/RegExpFlags.h" // JS::RegExpFlags
#include "js/UniquePtr.h"
#include "util/Text.h"
#include "util/Unicode.h"
#include "vm/FrameIter.h" // js::{,NonBuiltin}FrameIter
#include "vm/JSContext.h"
#include "vm/Realm.h"
using mozilla::AsciiAlphanumericToNumber;
using mozilla::AssertedCast;
using mozilla::DecodeOneUtf8CodePoint;
using mozilla::IsAscii;
using mozilla::IsAsciiAlpha;
using mozilla::IsAsciiDigit;
using mozilla::IsAsciiHexDigit;
using mozilla::IsTrailingUnit;
using mozilla::MakeScopeExit;
using mozilla::Maybe;
using mozilla::PointerRangeSize;
using mozilla::Span;
using mozilla::Utf8Unit;
using JS::ReadOnlyCompileOptions;
using JS::RegExpFlag;
using JS::RegExpFlags;
// Pairs the spelling of a reserved word with the token kind stored for it.
struct ReservedWordInfo {
const char* chars; // C string with reserved word text
js::frontend::TokenKind tokentype; // token kind associated with |chars|
};
// Table holding one ReservedWordInfo per entry produced by
// FOR_EACH_JAVASCRIPT_RESERVED_WORD, in the order that macro lists them.
static const ReservedWordInfo reservedWords[] = {
// Each (word, name, type) triple becomes {"word", js::frontend::type}.
#define RESERVED_WORD_INFO(word, name, type) {#word, js::frontend::type},
FOR_EACH_JAVASCRIPT_RESERVED_WORD(RESERVED_WORD_INFO)
#undef RESERVED_WORD_INFO
};
// One enumerator per reserved word, generated from the same macro (and hence
// in the same order) as |reservedWords|, so size_t(ReservedWordsIndex::NAME)
// is the index of NAME's entry in that table.
enum class ReservedWordsIndex : size_t {
// Only the second macro argument (the name) is used here.
#define ENTRY_(_1, NAME, _3) NAME,
FOR_EACH_JAVASCRIPT_RESERVED_WORD(ENTRY_)
#undef ENTRY_
};
// Returns a ReservedWordInfo for the specified characters, or nullptr if the
// string is not a reserved word.
//
// |s| points at |length| code units; |length| must be nonzero. The matching
// logic comes from the generated header included below. It is written in
// terms of the JSRW_* macros defined here, each of which either reads the
// input or jumps to one of the three labels at the end of this function.
// (Presumably every path through the generated code ends in one of those
// jumps -- verify against the generator.)
template <typename CharT>
static const ReservedWordInfo* FindReservedWord(const CharT* s, size_t length) {
MOZ_ASSERT(length != 0);
size_t i;
const ReservedWordInfo* rw;
const char* chars;
// Accessors the generated code uses to inspect the input.
#define JSRW_LENGTH() length
#define JSRW_AT(column) s[column]
// The generated code is certain entry |index| matches: return it directly.
#define JSRW_GOT_MATCH(index) \
i = (index); \
goto got_match;
// The generated code picked entry |index| as a candidate: compare it unit by
// unit before accepting it.
#define JSRW_TEST_GUESS(index) \
i = (index); \
goto test_guess;
// The generated code ruled out every reserved word.
#define JSRW_NO_MATCH() goto no_match;
#include "frontend/ReservedWordsGenerated.h"
#undef JSRW_NO_MATCH
#undef JSRW_TEST_GUESS
#undef JSRW_GOT_MATCH
#undef JSRW_AT
#undef JSRW_LENGTH
got_match:
return &reservedWords[i];
test_guess:
rw = &reservedWords[i];
chars = rw->chars;
// Compare all |length| input units against the candidate's text. The cast
// makes each |char| of the table compare as a non-negative value.
do {
if (*s++ != static_cast<unsigned char>(*chars++)) {
goto no_match;
}
} while (--length != 0);
return rw;
no_match:
return nullptr;
}
// UTF-8 specialization: view the units as unsigned chars and defer to the
// generic lookup for that character type.
template <>
MOZ_ALWAYS_INLINE const ReservedWordInfo* FindReservedWord<Utf8Unit>(
    const Utf8Unit* units, size_t length) {
  const auto* rawChars = Utf8AsUnsignedChars(units);
  return FindReservedWord(rawChars, length);
}
// Atom-based lookup: returns the |reservedWords| entry whose well-known atom
// has the same raw data as |atom|, or nullptr when no reserved word matches.
static const ReservedWordInfo* FindReservedWord(
const js::frontend::TaggedParserAtomIndex atom) {
switch (atom.rawData()) {
// One case per reserved word, mapping the well-known atom's raw data to the
// table entry at index ReservedWordsIndex::NAME.
#define CASE_(_1, NAME, _3) \
case js::frontend::TaggedParserAtomIndex::WellKnownRawData::NAME(): \
return &reservedWords[size_t(ReservedWordsIndex::NAME)];
FOR_EACH_JAVASCRIPT_RESERVED_WORD(CASE_)
#undef CASE_
}
// Not the atom of any reserved word.
return nullptr;
}
// True iff |c| is the ASCII digit '0' or '1'. The value is first converted to
// the unsigned counterpart of CharT so negative inputs never compare equal.
template <typename CharT>
static constexpr bool IsAsciiBinary(CharT c) {
  const auto unit = static_cast<std::make_unsigned_t<CharT>>(c);
  if (unit == '0') {
    return true;
  }
  return unit == '1';
}
// True iff |c| is one of the ASCII digits '0' through '7'. The comparison is
// done on the unsigned counterpart of CharT.
template <typename CharT>
static constexpr bool IsAsciiOctal(CharT c) {
  const auto unit = static_cast<std::make_unsigned_t<CharT>>(c);
  return !(unit < '0' || unit > '7');
}
// Converts an ASCII digit to its numeric value by subtracting '0' from the
// unsigned form of |c| and narrowing the difference to uint8_t. No range
// check is performed here.
template <typename CharT>
static constexpr uint8_t AsciiOctalToNumber(CharT c) {
  const auto unit = static_cast<std::make_unsigned_t<CharT>>(c);
  return static_cast<uint8_t>(unit - '0');
}
namespace js {
namespace frontend {
// Reports whether |atom| names a reserved word whose token kind satisfies
// TokenKindIsKeyword. Atoms that are not reserved words yield false.
bool IsKeyword(TaggedParserAtomIndex atom) {
  const ReservedWordInfo* info = FindReservedWord(atom);
  return info != nullptr && TokenKindIsKeyword(info->tokentype);
}
// Returns the token kind recorded for the reserved word |name|, or
// TokenKind::Limit when |name| is not a reserved word.
TokenKind ReservedWordTokenKind(TaggedParserAtomIndex name) {
  const ReservedWordInfo* info = FindReservedWord(name);
  return info ? info->tokentype : TokenKind::Limit;
}
// Returns the null-terminated spelling of the reserved word named by |name|,
// or nullptr when |name| is not a reserved word.
const char* ReservedWordToCharZ(TaggedParserAtomIndex name) {
  const ReservedWordInfo* info = FindReservedWord(name);
  if (!info) {
    return nullptr;
  }
  return ReservedWordToCharZ(info->tokentype);
}
// Returns the null-terminated spelling of the reserved word whose token kind
// is |tt|. |tt| must not be TokenKind::Name. For a token kind that belongs to
// no reserved word this asserts in debug builds and returns nullptr.
const char* ReservedWordToCharZ(TokenKind tt) {
MOZ_ASSERT(tt != TokenKind::Name);
switch (tt) {
// One case per reserved word: the token kind maps to the stringified word.
#define EMIT_CASE(word, name, type) \
case type: \
return #word;
FOR_EACH_JAVASCRIPT_RESERVED_WORD(EMIT_CASE)
#undef EMIT_CASE
default:
// NOTE(review): the input here is a TokenKind, so the "PropertyName" wording
// of this message looks stale (the sibling reservedWordToPropertyName says
// "TokenKind") -- confirm before changing the string.
MOZ_ASSERT_UNREACHABLE("Not a reserved word PropertyName.");
}
return nullptr;
}
// Returns the well-known atom for the reserved word whose token kind is |tt|.
// |tt| must not be TokenKind::Name. For a token kind that belongs to no
// reserved word this asserts in debug builds and returns the null atom index.
TaggedParserAtomIndex TokenStreamAnyChars::reservedWordToPropertyName(
TokenKind tt) const {
MOZ_ASSERT(tt != TokenKind::Name);
switch (tt) {
// One case per reserved word: the token kind maps to WellKnown::name().
#define EMIT_CASE(word, name, type) \
case type: \
return TaggedParserAtomIndex::WellKnown::name();
FOR_EACH_JAVASCRIPT_RESERVED_WORD(EMIT_CASE)
#undef EMIT_CASE
default:
MOZ_ASSERT_UNREACHABLE("Not a reserved word TokenKind.");
}
return TaggedParserAtomIndex::null();
}
// Sets up the line-start table with two entries: the offset at which the
// first line begins, followed by the MAX_PTR sentinel.
SourceCoords::SourceCoords(FrontendContext* fc, uint32_t initialLineNumber,
                           uint32_t initialOffset)
    : lineStartOffsets_(fc), initialLineNum_(initialLineNumber), lastIndex_(0) {
  // MAX_PTR has to be copied into a local before it is appended. Using it
  // directly causes compile errors on GCC and clang. Declaring
  //
  //   const uint32_t SourceCoords::MAX_PTR;
  //
  // cures those errors but breaks the Windows build instead, so the local
  // copy stays.
  uint32_t sentinel = MAX_PTR;

  // Neither append can fail: |lineStartOffsets_| has statically-allocated
  // room for both elements.
  MOZ_ASSERT(lineStartOffsets_.capacity() >= 2);
  MOZ_ALWAYS_TRUE(lineStartOffsets_.reserve(2));

  // First line start, then the sentinel.
  lineStartOffsets_.infallibleAppend(initialOffset);
  lineStartOffsets_.infallibleAppend(sentinel);
}
// Records that line |lineNum| starts at |lineStartOffset|.
//
// Returns false only when growing the table fails; the allocation policy has
// then already reported the error. Returns true otherwise, including when the
// line was recorded by an earlier call.
MOZ_ALWAYS_INLINE bool SourceCoords::add(uint32_t lineNum,
                                         uint32_t lineStartOffset) {
  const uint32_t slot = indexFromLineNumber(lineNum);
  const uint32_t sentinelSlot = lineStartOffsets_.length() - 1;

  MOZ_ASSERT(lineStartOffsets_[0] <= lineStartOffset);
  MOZ_ASSERT(lineStartOffsets_[sentinelSlot] == MAX_PTR);

  if (slot != sentinelSlot) {
    // This newline was seen before (and then ungotten). Nothing to record;
    // just check the stored offset hasn't mysteriously changed. This path
    // can also run after an earlier OOM, hence the index guard.
    MOZ_ASSERT_IF(slot < sentinelSlot,
                  lineStartOffsets_[slot] == lineStartOffset);
    return true;
  }

  // A newline not seen before. Append a fresh sentinel first, and overwrite
  // the old sentinel slot only once that append has succeeded, so the table
  // always ends in a sentinel. On failure, tell TokenStream about the OOM.
  uint32_t sentinel = MAX_PTR;
  if (!lineStartOffsets_.append(sentinel)) {
    static_assert(std::is_same_v<decltype(lineStartOffsets_.allocPolicy()),
                                 TempAllocPolicy&>,
                  "this function's caller depends on it reporting an "
                  "error on failure, as TempAllocPolicy ensures");
    return false;
  }

  lineStartOffsets_[slot] = lineStartOffset;
  return true;
}
// Copies from |other| every line-start entry this table doesn't have yet.
// Both tables must begin at the same offset and end with the sentinel.
// Returns false if an append fails, true otherwise (including when this
// table is already at least as long as |other|'s).
MOZ_ALWAYS_INLINE bool SourceCoords::fill(const SourceCoords& other) {
  MOZ_ASSERT(lineStartOffsets_[0] == other.lineStartOffsets_[0]);
  MOZ_ASSERT(lineStartOffsets_.back() == MAX_PTR);
  MOZ_ASSERT(other.lineStartOffsets_.back() == MAX_PTR);

  const size_t otherLength = other.lineStartOffsets_.length();
  if (lineStartOffsets_.length() >= otherLength) {
    // Nothing new to learn from |other|.
    return true;
  }

  // Replace our sentinel with |other|'s entry at that position...
  uint32_t sentinelIndex = lineStartOffsets_.length() - 1;
  lineStartOffsets_[sentinelIndex] = other.lineStartOffsets_[sentinelIndex];

  // ...then append everything after it, which ends with |other|'s sentinel.
  size_t next = sentinelIndex + 1;
  while (next < otherLength) {
    if (!lineStartOffsets_.append(other.lineStartOffsets_[next])) {
      return false;
    }
    next++;
  }
  return true;
}
// Returns the index into |lineStartOffsets_| of the line containing |offset|,
// i.e. the index i with lineStartOffsets_[i] <= offset <
// lineStartOffsets_[i + 1] (asserted at the end of the function).
//
// |lastIndex_| caches the previous answer. Lookups at or after the cached
// line first probe that line and the two following it, and only then fall
// back to a binary search. |lastIndex_| is updated on every path that moves
// past the cached line.
MOZ_ALWAYS_INLINE uint32_t
SourceCoords::indexFromOffset(uint32_t offset) const {
uint32_t iMin, iMax, iMid;
if (lineStartOffsets_[lastIndex_] <= offset) {
// If we reach here, offset is on a line the same as or higher than
// last time. Check first for the +0, +1, +2 cases, because they
// typically cover 85--98% of cases.
if (offset < lineStartOffsets_[lastIndex_ + 1]) {
return lastIndex_; // index is same as last time
}
// If we reach here, there must be at least one more entry (plus the
// sentinel). Try it.
lastIndex_++;
if (offset < lineStartOffsets_[lastIndex_ + 1]) {
return lastIndex_; // index is one higher than last time
}
// The same logic applies here.
lastIndex_++;
if (offset < lineStartOffsets_[lastIndex_ + 1]) {
return lastIndex_; // index is two higher than last time
}
// No luck. Oh well, we have a better-than-default starting point for
// the binary search.
iMin = lastIndex_ + 1;
MOZ_ASSERT(iMin <
lineStartOffsets_.length() - 1); // -1 due to the sentinel
} else {
// |offset| precedes the cached line: search from the first line.
iMin = 0;
}
// This is a binary search with deferred detection of equality, which was
// marginally faster in this case than a standard binary search.
// The -2 is because |lineStartOffsets_.length() - 1| is the sentinel, and we
// want one before that.
iMax = lineStartOffsets_.length() - 2;
while (iMax > iMin) {
iMid = iMin + (iMax - iMin) / 2;
if (offset >= lineStartOffsets_[iMid + 1]) {
iMin = iMid + 1; // offset is above lineStartOffsets_[iMid]
} else {
iMax = iMid; // offset is below or within lineStartOffsets_[iMid]
}
}
MOZ_ASSERT(iMax == iMin);
MOZ_ASSERT(lineStartOffsets_[iMin] <= offset);
MOZ_ASSERT(offset < lineStartOffsets_[iMin + 1]);
// Remember the answer for the next lookup.
lastIndex_ = iMin;
return iMin;
}
// Builds a LineToken from the index of the line containing |offset| and
// |offset| itself.
SourceCoords::LineToken SourceCoords::lineToken(uint32_t offset) const {
  const uint32_t lineIndex = indexFromOffset(offset);
  return LineToken(lineIndex, offset);
}
// Initializes the character-type-independent token stream state from the
// compile options, then marks the token kinds that end an expression.
TokenStreamAnyChars::TokenStreamAnyChars(FrontendContext* fc,
                                         const ReadOnlyCompileOptions& options,
                                         StrictModeGetter* smg)
    : fc(fc),
      options_(options),
      strictModeGetter_(smg),
      filename_(options.filename()),
      longLineColumnInfo_(fc),
      srcCoords(fc, options.lineno, options.scriptSourceOffset),
      lineno(options.lineno),
      mutedErrors(options.mutedErrors()) {
  // |isExprEnding| was initially zeroed, so only the true entries need to be
  // written.
  constexpr TokenKind exprEndingKinds[] = {
      TokenKind::Comma,      TokenKind::Semi,         TokenKind::Colon,
      TokenKind::RightParen, TokenKind::RightBracket, TokenKind::RightCurly,
  };
  for (TokenKind kind : exprEndingKinds) {
    isExprEnding[size_t(kind)] = true;
  }
}
// Constructs the shared base from |fc| and |parserAtoms|, and |sourceUnits|
// from the |length| code units at |units| together with |startOffset|.
template <typename Unit>
TokenStreamCharsBase<Unit>::TokenStreamCharsBase(FrontendContext* fc,
ParserAtomsTable* parserAtoms,
const Unit* units,
size_t length,
size_t startOffset)
: TokenStreamCharsShared(fc, parserAtoms),
sourceUnits(units, length, startOffset) {}
// Appends the UTF-16 units in [cur, end) to the (initially empty)
// |charBuffer|, turning every "\r\n" pair and every lone '\r' into a single
// '\n'. Returns false if an append fails.
bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer& charBuffer,
                                                        const char16_t* cur,
                                                        const char16_t* end) {
  MOZ_ASSERT(charBuffer.length() == 0);

  while (cur < end) {
    char16_t unit = *cur;
    ++cur;

    if (unit == '\r') {
      // Swallow the '\n' of a "\r\n" pair so only one line break is emitted.
      if (cur < end && *cur == '\n') {
        ++cur;
      }
      unit = '\n';
    }

    if (!charBuffer.append(unit)) {
      return false;
    }
  }

  MOZ_ASSERT(cur == end);
  return true;
}
// Decodes the UTF-8 units in [cur, end) into the (initially empty)
// |charBuffer|, turning every "\r\n" pair and every lone '\r' into a single
// '\n'. The text is expected to have been validated already. Returns false
// if an append fails.
bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer& charBuffer,
                                                        const Utf8Unit* cur,
                                                        const Utf8Unit* end) {
  MOZ_ASSERT(charBuffer.length() == 0);

  while (cur < end) {
    const Utf8Unit lead = *cur++;

    if (MOZ_UNLIKELY(!IsAscii(lead))) {
      // Multi-unit code point: decode it and append its UTF-16 form.
      Maybe<char32_t> decoded = DecodeOneUtf8CodePoint(lead, &cur, end);
      MOZ_ASSERT(decoded.isSome(),
                 "provided source text should already have been validated");
      if (!AppendCodePointToCharBuffer(charBuffer, decoded.value())) {
        return false;
      }
      continue;
    }

    // ASCII: only carriage returns need special treatment.
    char16_t ch = lead.toUint8();
    if (ch == '\r') {
      // Swallow the '\n' of a "\r\n" pair so only one line break is emitted.
      if (cur < end && *cur == Utf8Unit('\n')) {
        cur++;
      }
      ch = '\n';
    }

    if (!charBuffer.append(ch)) {
      return false;
    }
  }

  MOZ_ASSERT(cur == end);
  return true;
}
// Forwards everything to the TokenStreamChars base, passing
// |options.scriptSourceOffset| as its final argument. No other use is made of
// |options| here.
template <typename Unit, class AnyCharsAccess>
TokenStreamSpecific<Unit, AnyCharsAccess>::TokenStreamSpecific(
FrontendContext* fc, ParserAtomsTable* parserAtoms,
const ReadOnlyCompileOptions& options, const Unit* units, size_t length)
: TokenStreamChars<Unit, AnyCharsAccess>(fc, parserAtoms, units, length,
options.scriptSourceOffset) {}
// Validates the compile options. A starting column above the saturation
// limit is reported as JSMSG_BAD_COLUMN_NUMBER and makes this return false.
bool TokenStreamAnyChars::checkOptions() {
  // Constrain starting columns to where they will saturate.
  const bool columnWithinLimit = options().column.oneOriginValue() <=
                                 JS::LimitedColumnNumberOneOrigin::Limit;
  if (columnWithinLimit) {
    return true;
  }

  reportErrorNoOffset(JSMSG_BAD_COLUMN_NUMBER);
  return false;
}
// Variadic front end for reportErrorNoOffsetVA: packages the arguments after
// |errorNumber| into a va_list and hands it over.
void TokenStreamAnyChars::reportErrorNoOffset(unsigned errorNumber, ...) const {
  va_list messageArgs;
  va_start(messageArgs, errorNumber);

  reportErrorNoOffsetVA(errorNumber, &messageArgs);

  va_end(messageArgs);
}
void TokenStreamAnyChars::reportErrorNoOffsetVA(unsigned errorNumber,
va_list* args) const {
ErrorMetadata metadata;
computeErrorMetadataNoOffset(&metadata);
ReportCompileErrorLatin1VA(fc, std::move(metadata), nullptr, errorNumber,
args);
}
// Updates line bookkeeping after an end-of-line: the new line begins at
// |lineStartOffset|. Returns false, after reporting JSMSG_BAD_LINE_NUMBER, if
// the line counter wraps to zero, or if recording the line start fails.
[[nodiscard]] MOZ_ALWAYS_INLINE bool
TokenStreamAnyChars::internalUpdateLineInfoForEOL(uint32_t lineStartOffset) {
  // The current line becomes the previous line.
  prevLinebase = linebase;
  linebase = lineStartOffset;

  // Advance the line number; on overflow, report error.
  if (MOZ_UNLIKELY(++lineno == 0)) {
    reportErrorNoOffset(JSMSG_BAD_LINE_NUMBER);
    return false;
  }

  return srcCoords.add(lineno, linebase);
}
#ifdef DEBUG
// Debug-only check that the units at |ptr| are the UTF-16 encoding of the
// code point described by |peeked|.
template <>
inline void SourceUnits<char16_t>::assertNextCodePoint(
    const PeekedCodePoint<char16_t>& peeked) {
  const char32_t codePoint = peeked.codePoint();

  if (codePoint >= unicode::NonBMPMin) {
    // Non-BMP code point: expect the lead/trail surrogate pair.
    MOZ_ASSERT(peeked.lengthInUnits() == 2);

    char16_t lead, trail;
    unicode::UTF16Encode(codePoint, &lead, &trail);
    MOZ_ASSERT(ptr[0] == lead);
    MOZ_ASSERT(ptr[1] == trail);
    return;
  }

  // BMP code point: a single unit equal to the code point itself.
  MOZ_ASSERT(peeked.lengthInUnits() == 1);
  MOZ_ASSERT(ptr[0] == codePoint);
}
// Debug-only check that the units at |ptr| are the UTF-8 encoding of the code
// point described by |peeked|. The expected bytes are computed here by hand
// and compared against the first |peeked.lengthInUnits()| units at |ptr|.
template <>
inline void SourceUnits<Utf8Unit>::assertNextCodePoint(
const PeekedCodePoint<Utf8Unit>& peeked) {
char32_t c = peeked.codePoint();
// This is all roughly indulgence of paranoia only for assertions, so the
// reimplementation of UTF-8 encoding a code point is (we think) a virtue.
uint8_t expectedUnits[4] = {};
if (c < 0x80) {
// One unit: the code point itself.
expectedUnits[0] = AssertedCast<uint8_t>(c);
} else if (c < 0x800) {
// Two units: 110xxxxx 10xxxxxx.
expectedUnits[0] = 0b1100'0000 | (c >> 6);
expectedUnits[1] = 0b1000'0000 | (c & 0b11'1111);
} else if (c < 0x10000) {
// Three units: 1110xxxx 10xxxxxx 10xxxxxx.
expectedUnits[0] = 0b1110'0000 | (c >> 12);
expectedUnits[1] = 0b1000'0000 | ((c >> 6) & 0b11'1111);
expectedUnits[2] = 0b1000'0000 | (c & 0b11'1111);
} else {
// Four units: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
expectedUnits[0] = 0b1111'0000 | (c >> 18);
expectedUnits[1] = 0b1000'0000 | ((c >> 12) & 0b11'1111);
expectedUnits[2] = 0b1000'0000 | ((c >> 6) & 0b11'1111);
expectedUnits[3] = 0b1000'0000 | (c & 0b11'1111);
}
MOZ_ASSERT(peeked.lengthInUnits() <= 4);
for (uint8_t i = 0; i < peeked.lengthInUnits(); i++) {
MOZ_ASSERT(expectedUnits[i] == ptr[i].toUint8());
}
}
#endif // DEBUG
// Moves |*ptr| backward, if necessary, so that it points at the first unit of
// a UTF-8 code point. |limit| is itself a code point boundary, so a pointer
// equal to |limit| is left alone.
static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary(
    const Utf8Unit** ptr, const Utf8Unit* limit) {
  const Utf8Unit* cursor = *ptr;
  MOZ_ASSERT(cursor <= limit);

  // |limit| is a code point boundary.
  if (MOZ_UNLIKELY(cursor == limit)) {
    return;
  }

  // Otherwise step back over trailing units until a non-trailing unit, the
  // start of the code point, is reached.
#ifdef DEBUG
  size_t stepsBack = 0;
#endif
  for (; MOZ_UNLIKELY(IsTrailingUnit(*cursor)); --cursor) {
#ifdef DEBUG
    stepsBack++;
#endif
  }
  *ptr = cursor;

  MOZ_ASSERT(stepsBack < 4,
             "the longest UTF-8 code point is four units, so this should never "
             "retract more than three units");
}
// Moves |*ptr| back by one unit iff it points at the trailing half of a
// UTF-16 surrogate pair. |limit| is itself a code point boundary, so a
// pointer equal to |limit| is left alone.
static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary(
    const char16_t** ptr, const char16_t* limit) {
  MOZ_ASSERT(*ptr <= limit);

  // |limit| is a code point boundary.
  if (MOZ_UNLIKELY(*ptr == limit)) {
    return;
  }

  // A trail surrogate splits a code point only when a lead surrogate
  // precedes it. Outside test suites feeding garbage WTF-16 that is basically
  // always so, but it is checked rather than assumed. The preceding unit is
  // read only once the current one is known to be a trail surrogate.
  const char16_t* const here = *ptr;
  if (MOZ_UNLIKELY(unicode::IsTrailSurrogate(here[0])) &&
      MOZ_LIKELY(unicode::IsLeadSurrogate(here[-1]))) {
    *ptr = here - 1;
  }
}
// Returns the column offset of |offset| within the line identified by
// |lineToken|. For char16_t source this is the distance in code units from
// the line start; other unit types go through computeColumnOffsetForUTF8.
template <typename Unit>
JS::ColumnNumberUnsignedOffset TokenStreamAnyChars::computeColumnOffset(
    const LineToken lineToken, const uint32_t offset,
    const SourceUnits<Unit>& sourceUnits) const {
  lineToken.assertConsistentOffset(offset);

  const uint32_t lineStartOffset = srcCoords.lineStart(lineToken);
  const uint32_t unitsIntoLine = offset - lineStartOffset;

  if constexpr (std::is_same_v<Unit, char16_t>) {
    // Column offset is in UTF-16 code units, so the unit distance is the
    // answer.
    return JS::ColumnNumberUnsignedOffset(unitsIntoLine);
  }

  return computeColumnOffsetForUTF8(lineToken, offset, lineStartOffset,
                                    unitsIntoLine, sourceUnits);
}
// Computes the column offset, counted in UTF-16 code units, of |offset| on
// the line identified by |lineToken|, for source text whose code points may
// occupy several units.
//
// |start| is the offset of the line's first unit and |offsetInLine| is
// |offset - start| (both as computed by computeColumnOffset).
//
// To avoid recounting from the line start on every call, this keeps two
// caches: the offset/column pair of the last computation on this line, and,
// for lines that reach past the first chunk, a per-line vector in
// |longLineColumnInfo_| holding the column offset at each
// |ColumnChunkLength|-unit chunk boundary, plus whether the chunk is known to
// contain only single-unit code points. If growing those structures fails,
// the OOM is recovered from and the column is counted from the best starting
// point already known.
template <typename Unit>
JS::ColumnNumberUnsignedOffset TokenStreamAnyChars::computeColumnOffsetForUTF8(
const LineToken lineToken, const uint32_t offset, const uint32_t start,
const uint32_t offsetInLine, const SourceUnits<Unit>& sourceUnits) const {
const uint32_t line = lineNumber(lineToken);
// Reset the previous offset/column number offset cache for this line, if the
// previous lookup wasn't on this line.
if (line != lineOfLastColumnComputation_) {
lineOfLastColumnComputation_ = line;
lastChunkVectorForLine_ = nullptr;
lastOffsetOfComputedColumn_ = start;
lastComputedColumnOffset_ = JS::ColumnNumberUnsignedOffset::zero();
}
// Compute and return the final column number offset from a partially
// calculated offset/column number offset, using the last-cached
// offset/column number offset if they're more optimal.
auto OffsetFromPartial =
[this, offset, &sourceUnits](
uint32_t partialOffset,
JS::ColumnNumberUnsignedOffset partialColumnOffset,
UnitsType unitsType) {
MOZ_ASSERT(partialOffset <= offset);
// If the last lookup on this line was closer to |offset|, use it.
if (partialOffset < this->lastOffsetOfComputedColumn_ &&
this->lastOffsetOfComputedColumn_ <= offset) {
partialOffset = this->lastOffsetOfComputedColumn_;
partialColumnOffset = this->lastComputedColumnOffset_;
}
const Unit* begin = sourceUnits.codeUnitPtrAt(partialOffset);
const Unit* end = sourceUnits.codeUnitPtrAt(offset);
size_t offsetDelta =
AssertedCast<uint32_t>(PointerRangeSize(begin, end));
partialOffset += offsetDelta;
// With single-unit code points only, the unit distance is the column
// distance. Otherwise the UTF-16 length of [begin, end) has to be counted.
if (unitsType == UnitsType::GuaranteedSingleUnit) {
MOZ_ASSERT(unicode::CountUTF16CodeUnits(begin, end) == offsetDelta,
"guaranteed-single-units also guarantee pointer distance "
"equals UTF-16 code unit count");
partialColumnOffset += JS::ColumnNumberUnsignedOffset(offsetDelta);
} else {
partialColumnOffset += JS::ColumnNumberUnsignedOffset(
AssertedCast<uint32_t>(unicode::CountUTF16CodeUnits(begin, end)));
}
// Cache the result for the next lookup on this line.
this->lastOffsetOfComputedColumn_ = partialOffset;
this->lastComputedColumnOffset_ = partialColumnOffset;
return partialColumnOffset;
};
// We won't add an entry to |longLineColumnInfo_| for lines where the maximum
// column has offset less than this value. The most common (non-minified)
// long line length is likely 80ch, maybe 100ch, so we use that, rounded up to
// the next power of two for efficient division/multiplication below.
constexpr uint32_t ColumnChunkLength = mozilla::tl::RoundUpPow2<100>::value;
// The index within any associated |Vector<ChunkInfo>| of |offset|'s chunk.
const uint32_t chunkIndex = offsetInLine / ColumnChunkLength;
if (chunkIndex == 0) {
// We don't know from an |offset| in the zeroth chunk that this line is even
// long. First-chunk info is mostly useless, anyway -- we have |start|
// already. So if we have *easy* access to that zeroth chunk, use it --
// otherwise just count pessimally. (This will still benefit from caching
// the last column/offset for computations for successive offsets, so it's
// not *always* worst-case.)
UnitsType unitsType;
if (lastChunkVectorForLine_ && lastChunkVectorForLine_->length() > 0) {
MOZ_ASSERT((*lastChunkVectorForLine_)[0].columnOffset() ==
JS::ColumnNumberUnsignedOffset::zero());
unitsType = (*lastChunkVectorForLine_)[0].unitsType();
} else {
unitsType = UnitsType::PossiblyMultiUnit;
}
return OffsetFromPartial(start, JS::ColumnNumberUnsignedOffset::zero(),
unitsType);
}
// If this line has no chunk vector yet, insert one in the hash map. (The
// required index is allocated and filled further down.)
if (!lastChunkVectorForLine_) {
auto ptr = longLineColumnInfo_.lookupForAdd(line);
if (!ptr) {
// This could rehash and invalidate a cached vector pointer, but the outer
// condition means we don't have a cached pointer.
if (!longLineColumnInfo_.add(ptr, line, Vector<ChunkInfo>(fc))) {
// In case of OOM, just count columns from the start of the line.
fc->recoverFromOutOfMemory();
return OffsetFromPartial(start, JS::ColumnNumberUnsignedOffset::zero(),
UnitsType::PossiblyMultiUnit);
}
}
// Note that adding elements to this vector won't invalidate this pointer.
lastChunkVectorForLine_ = &ptr->value();
}
const Unit* const limit = sourceUnits.codeUnitPtrAt(offset);
// Returns the offset at which chunk |index| really begins: the naive chunk
// boundary, moved back to a code point boundary if it falls inside a code
// point.
auto RetractedOffsetOfChunk = [
#ifdef DEBUG
this,
#endif
start, limit,
&sourceUnits](uint32_t index) {
MOZ_ASSERT(index < this->lastChunkVectorForLine_->length());
uint32_t naiveOffset = start + index * ColumnChunkLength;
const Unit* naivePtr = sourceUnits.codeUnitPtrAt(naiveOffset);
const Unit* actualPtr = naivePtr;
RetractPointerToCodePointBoundary(&actualPtr, limit);
#ifdef DEBUG
if ((*this->lastChunkVectorForLine_)[index].unitsType() ==
UnitsType::GuaranteedSingleUnit) {
MOZ_ASSERT(naivePtr == actualPtr, "miscomputed unitsType value");
}
#endif
return naiveOffset - PointerRangeSize(actualPtr, naivePtr);
};
uint32_t partialOffset;
JS::ColumnNumberUnsignedOffset partialColumnOffset;
UnitsType unitsType;
auto entriesLen = AssertedCast<uint32_t>(lastChunkVectorForLine_->length());
if (chunkIndex < entriesLen) {
// We've computed the chunk |offset| resides in. Compute the column number
// from the chunk.
partialOffset = RetractedOffsetOfChunk(chunkIndex);
partialColumnOffset = (*lastChunkVectorForLine_)[chunkIndex].columnOffset();
// This is exact if |chunkIndex| isn't the last chunk.
unitsType = (*lastChunkVectorForLine_)[chunkIndex].unitsType();
// Otherwise the last chunk is pessimistically assumed to contain multi-unit
// code points because we haven't fully examined its contents yet -- they
// may not have been tokenized yet, they could contain encoding errors, or
// they might not even exist.
MOZ_ASSERT_IF(chunkIndex == entriesLen - 1,
(*lastChunkVectorForLine_)[chunkIndex].unitsType() ==
UnitsType::PossiblyMultiUnit);
} else {
// Extend the vector from its last entry or the start of the line. (This is
// also a suitable partial start point if we must recover from OOM.)
if (entriesLen > 0) {
partialOffset = RetractedOffsetOfChunk(entriesLen - 1);
partialColumnOffset =
(*lastChunkVectorForLine_)[entriesLen - 1].columnOffset();
} else {
partialOffset = start;
partialColumnOffset = JS::ColumnNumberUnsignedOffset::zero();
}
if (!lastChunkVectorForLine_->reserve(chunkIndex + 1)) {
// As earlier, just start from the greatest offset/column in case of OOM.
fc->recoverFromOutOfMemory();
return OffsetFromPartial(partialOffset, partialColumnOffset,
UnitsType::PossiblyMultiUnit);
}
// OOM is no longer possible now. \o/
// The vector always begins with the column of the line start, i.e. zero,
// with chunk units pessimally assumed not single-unit.
if (entriesLen == 0) {
lastChunkVectorForLine_->infallibleAppend(
ChunkInfo(JS::ColumnNumberUnsignedOffset::zero(),
UnitsType::PossiblyMultiUnit));
entriesLen++;
}
// Each iteration measures the current final chunk, annotates it if it turned
// out to be single-unit only, and appends an entry for the chunk after it.
// Note that |entriesLen| is incremented inside the std::min expression.
do {
const Unit* const begin = sourceUnits.codeUnitPtrAt(partialOffset);
const Unit* chunkLimit = sourceUnits.codeUnitPtrAt(
start + std::min(entriesLen++ * ColumnChunkLength, offsetInLine));
MOZ_ASSERT(begin < chunkLimit);
MOZ_ASSERT(chunkLimit <= limit);
static_assert(
ColumnChunkLength > SourceUnitTraits<Unit>::maxUnitsLength - 1,
"any retraction below is assumed to never underflow to the "
"preceding chunk, even for the longest code point");
// Prior tokenizing ensured that [begin, limit) is validly encoded, and
// |begin < chunkLimit|, so any retraction here can't underflow.
RetractPointerToCodePointBoundary(&chunkLimit, limit);
MOZ_ASSERT(begin < chunkLimit);
MOZ_ASSERT(chunkLimit <= limit);
size_t numUnits = PointerRangeSize(begin, chunkLimit);
size_t numUTF16CodeUnits =
unicode::CountUTF16CodeUnits(begin, chunkLimit);
// If this chunk (which will become non-final at the end of the loop) is
// all single-unit code points, annotate the chunk accordingly.
if (numUnits == numUTF16CodeUnits) {
lastChunkVectorForLine_->back().guaranteeSingleUnits();
}
partialOffset += numUnits;
partialColumnOffset += JS::ColumnNumberUnsignedOffset(numUTF16CodeUnits);
lastChunkVectorForLine_->infallibleEmplaceBack(
partialColumnOffset, UnitsType::PossiblyMultiUnit);
} while (entriesLen < chunkIndex + 1);
// We're at a spot in the current final chunk, and final chunks never have
// complete units information, so be pessimistic.
unitsType = UnitsType::PossiblyMultiUnit;
}
return OffsetFromPartial(partialOffset, partialColumnOffset, unitsType);
}
// Returns the one-origin, limit-saturated column number of |offset| on the
// line identified by |lineToken|. On the first line the column is measured
// from the starting column given in the compile options.
template <typename Unit, class AnyCharsAccess>
JS::LimitedColumnNumberOneOrigin
GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeColumn(
    LineToken lineToken, uint32_t offset) const {
  lineToken.assertConsistentOffset(offset);

  const TokenStreamAnyChars& anyChars = anyCharsAccess();
  const JS::ColumnNumberUnsignedOffset offsetInColumns =
      anyChars.computeColumnOffset(lineToken, offset, this->sourceUnits);

  if (lineToken.isFirstLine()) {
    // If the offset alone already exceeds the limit, saturate before adding
    // the starting column.
    if (1 + offsetInColumns.value() >
        JS::LimitedColumnNumberOneOrigin::Limit) {
      return JS::LimitedColumnNumberOneOrigin::limit();
    }

    return JS::LimitedColumnNumberOneOrigin::fromUnlimited(
        (anyChars.options_.column + offsetInColumns).oneOriginValue());
  }

  // Later lines always start at column one.
  return JS::LimitedColumnNumberOneOrigin::fromUnlimited(
      JS::ColumnNumberOneOrigin() + offsetInColumns);
}
// Stores in |*line| and |*column| the line number and the limited one-origin
// column number of |offset|.
template <typename Unit, class AnyCharsAccess>
void GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeLineAndColumn(
    uint32_t offset, uint32_t* line,
    JS::LimitedColumnNumberOneOrigin* column) const {
  const TokenStreamAnyChars& anyChars = anyCharsAccess();

  // Locate the line once and reuse the token for both results.
  const auto token = anyChars.lineToken(offset);
  *line = anyChars.lineNumber(token);
  *column = computeColumn(token, offset);
}
// Reports the UTF-8 encoding error |errorNumber| (formatted with the variadic
// arguments) at the current source position, attaching a
// JSMSG_BAD_CODE_UNITS note that lists the offending units.
//
// |relevantUnits| (which must be nonzero) is the number of code units,
// starting at the current position, that are consumed via getCodeUnit() and
// listed in the note.
//
// The body is wrapped in do { ... } while (false) so that every early exit is
// a |break| that still reaches the va_end at the bottom.
template <class AnyCharsAccess>
MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::internalEncodingError(
uint8_t relevantUnits, unsigned errorNumber, ...) {
va_list args;
va_start(args, errorNumber);
do {
size_t offset = this->sourceUnits.offset();
ErrorMetadata err;
TokenStreamAnyChars& anyChars = anyCharsAccess();
bool canAddLineOfContext = fillExceptingContext(&err, offset);
if (canAddLineOfContext) {
if (!internalComputeLineOfContext(&err, offset)) {
break;
}
// As this is an encoding error, the computed window-end must be
// identical to the location of the error -- any further on and the
// window would contain invalid Unicode.
MOZ_ASSERT_IF(err.lineOfContext != nullptr,
err.lineLength == err.tokenOffset);
}
auto notes = MakeUnique<JSErrorNotes>();
if (!notes) {
ReportOutOfMemory(anyChars.fc);
break;
}
// The largest encoding of a UTF-8 code point is 4 units. (Encoding an
// obsolete 5- or 6-byte code point will complain only about a bad lead
// code unit.)
constexpr size_t MaxWidth = sizeof("0xHH 0xHH 0xHH 0xHH");
MOZ_ASSERT(relevantUnits > 0);
char badUnitsStr[MaxWidth];
char* ptr = badUnitsStr;
// Write one five-character group per unit: the byteToString output
// (presumably the four characters "0xHH" -- see that helper) followed by a
// space.
while (relevantUnits > 0) {
byteToString(this->sourceUnits.getCodeUnit().toUint8(), ptr);
ptr[4] = ' ';
ptr += 5;
relevantUnits--;
}
// Turn the final separator into the string terminator.
ptr[-1] = '\0';
uint32_t line;
JS::LimitedColumnNumberOneOrigin column;
computeLineAndColumn(offset, &line, &column);
if (!notes->addNoteASCII(anyChars.fc, anyChars.getFilename().c_str(), 0,
line, JS::ColumnNumberOneOrigin(column),
GetErrorMessage, nullptr, JSMSG_BAD_CODE_UNITS,
badUnitsStr)) {
break;
}
ReportCompileErrorLatin1VA(anyChars.fc, std::move(err), std::move(notes),
errorNumber, &args);
} while (false);
va_end(args);
}
// Reports JSMSG_BAD_LEADING_UTF8_UNIT for |lead|, passing the byte's string
// form as the message argument and one unit as the relevant-unit count.
template <class AnyCharsAccess>
MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::badLeadUnit(
    Utf8Unit lead) {
  char leadByteStr[5];
  byteToTerminatedString(lead.toUint8(), leadByteStr);

  internalEncodingError(1, JSMSG_BAD_LEADING_UTF8_UNIT, leadByteStr);
}
// Reports JSMSG_NOT_ENOUGH_CODE_UNITS: the code point started by |lead| needs
// |required| units in total but only |remaining| are available.
template <class AnyCharsAccess>
MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::notEnoughUnits(
    Utf8Unit lead, uint8_t remaining, uint8_t required) {
  MOZ_ASSERT(required == 2 || required == 3 || required == 4);
  MOZ_ASSERT(remaining < 4);
  MOZ_ASSERT(remaining < required);

  char leadByteStr[5];
  byteToTerminatedString(lead.toUint8(), leadByteStr);

  // The message arguments are these counts minus one. |toHexChar| produces
  // the desired decimal numbers for values < 4.
  const char expectedStr[] = {toHexChar(required - 1), '\0'};
  const char actualStr[] = {toHexChar(remaining - 1), '\0'};

  // Suffix fragments spliced into the message after each count.
  const char* const expectedSuffix = required == 2 ? "" : "s";
  const char* const actualSuffix = remaining == 2 ? " was" : "s were";

  internalEncodingError(remaining, JSMSG_NOT_ENOUGH_CODE_UNITS, leadByteStr,
                        expectedStr, expectedSuffix, actualStr, actualSuffix);
}
// Reports JSMSG_BAD_TRAILING_UTF8_UNIT for the last of the |unitsObserved|
// units starting at the next code unit.
template <class AnyCharsAccess>
MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::badTrailingUnit(
    uint8_t unitsObserved) {
  const Utf8Unit* const nextUnits = this->sourceUnits.addressOfNextCodeUnit();
  const Utf8Unit offender = nextUnits[unitsObserved - 1];

  char badByteStr[5];
  byteToTerminatedString(offender.toUint8(), badByteStr);

  internalEncodingError(unitsObserved, JSMSG_BAD_TRAILING_UTF8_UNIT,
                        badByteStr);
}
// Reports JSMSG_FORBIDDEN_UTF8_CODE_POINT for |codePoint|, which was encoded
// in |codePointLength| units, passing its "0x..." hex spelling and |reason|
// as the message arguments.
template <class AnyCharsAccess>
MOZ_COLD void
TokenStreamChars<Utf8Unit, AnyCharsAccess>::badStructurallyValidCodePoint(
    char32_t codePoint, uint8_t codePointLength, const char* reason) {
  // The hex string (e.g. "0x203D", null-terminated) is built back to front:
  // the terminator goes in the array's last slot, hex digits are peeled off
  // the low end of the number and written leftward, and "0x" is prefixed at
  // the end. |cursor| always points at the start of the text written so far.
  //
  // 0x1F'FFFF is the maximum value that can fit in 3+6+6+6 unconstrained
  // bits in a four-byte UTF-8 code unit sequence.
  constexpr size_t MaxHexSize = sizeof("0x1FFFFF");  // including '\0'
  char hexBuffer[MaxHexSize];

  char* cursor = hexBuffer + MaxHexSize;
  --cursor;
  *cursor = '\0';

  // A do-while rather than a while, so that |codePoint == 0| still produces
  // a single '0' digit.
  do {
    MOZ_ASSERT(hexBuffer < cursor);
    --cursor;
    *cursor = toHexChar(codePoint & 0xF);
    codePoint >>= 4;
  } while (codePoint != 0);

  MOZ_ASSERT(hexBuffer + 2 <= cursor);
  cursor -= 2;
  cursor[0] = '0';
  cursor[1] = 'x';

  internalEncodingError(codePointLength, JSMSG_FORBIDDEN_UTF8_CODE_POINT,
                        cursor, reason);
}
// Decodes the non-ASCII code point whose first unit is |lead|. On success the
// code point is stored in |*codePoint| and true is returned. On failure an
// encoding error has been reported and false is returned.
template <class AnyCharsAccess>
[[nodiscard]] bool
TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePointDontNormalize(
    Utf8Unit lead, char32_t* codePoint) {
  // One reporter per way the decoder can fail. Each forwards to the matching
  // member function.
  auto reportBadLead = [this, &lead]() { this->badLeadUnit(lead); };
  auto reportTooFewUnits = [this, &lead](uint8_t remaining,
                                         uint8_t required) {
    this->notEnoughUnits(lead, remaining, required);
  };
  auto reportBadTrailing = [this](uint8_t unitsObserved) {
    this->badTrailingUnit(unitsObserved);
  };
  auto reportBadCodePoint = [this](char32_t badCodePoint,
                                   uint8_t unitsObserved) {
    this->badCodePoint(badCodePoint, unitsObserved);
  };
  auto reportNotShortest = [this](char32_t badCodePoint,
                                  uint8_t unitsObserved) {
    this->notShortestForm(badCodePoint, unitsObserved);
  };

  // If a valid code point is decoded, this function call consumes its code
  // units. If not, it ungets the lead code unit and invokes the right error
  // handler, so on failure we must immediately return false.
  SourceUnitsIterator iter(this->sourceUnits);
  Maybe<char32_t> decoded = DecodeOneUtf8CodePointInline(
      lead, &iter, SourceUnitsEnd(), reportBadLead, reportTooFewUnits,
      reportBadTrailing, reportBadCodePoint, reportNotShortest);
  if (decoded.isSome()) {
    *codePoint = decoded.value();
    return true;
  }

  return false;
}
// Compute the code point beginning with the non-ASCII UTF-16 code unit
// |lead|, which must be the code unit most recently consumed.
//
// U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR are reported as '\n'
// once line information has been updated; if that update fails this returns
// false.  Every other input succeeds and stores its code point in
// |*codePoint|.
template <class AnyCharsAccess>
bool TokenStreamChars<char16_t, AnyCharsAccess>::getNonAsciiCodePoint(
    int32_t lead, char32_t* codePoint) {
  MOZ_ASSERT(lead != EOF);
  MOZ_ASSERT(!isAsciiCodePoint(lead),
             "ASCII code unit/point must be handled separately");
  MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
             "getNonAsciiCodePoint called incorrectly");

  // Begin by assuming |lead| is the entire code point; the surrogate-pair and
  // line-separator cases below replace this value.
  *codePoint = AssertedCast<char32_t>(lead);

  // ECMAScript requires unpaired UTF-16 surrogates to be treated as the
  // corresponding code point rather than as an error.  See
  // <https://tc39.es/ecma262/#sec-ecmascript-language-types-string-type>.
  // Consequently no sequence of 16-bit units is intrinsically erroneous here.

  if (MOZ_UNLIKELY(unicode::IsLeadSurrogate(lead))) {
    // Combine with a following trail surrogate when there is one; otherwise
    // the lone lead surrogate stands for itself.
    const bool pairedWithTrail =
        !this->sourceUnits.atEnd() &&
        unicode::IsTrailSurrogate(this->sourceUnits.peekCodeUnit());
    if (MOZ_LIKELY(pairedWithTrail)) {
      *codePoint = unicode::UTF16Decode(lead, this->sourceUnits.getCodeUnit());
    }

    MOZ_ASSERT(!IsLineTerminator(*codePoint));
    return true;
  }

  // From here on |lead| is a single-unit code point or a lone trail
  // surrogate.
  const bool isUnicodeLineBreak =
      lead == unicode::LINE_SEPARATOR || lead == unicode::PARA_SEPARATOR;
  if (MOZ_LIKELY(!isUnicodeLineBreak)) {
    MOZ_ASSERT(!IsLineTerminator(*codePoint));
    return true;
  }

  if (updateLineInfoForEOL()) {
    *codePoint = '\n';
    return true;
  }

#ifdef DEBUG
  // Store a sentinel so accidental use of the result is likelier to be
  // noticed.
  *codePoint = std::numeric_limits<char32_t>::max();
#endif
  MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
  return false;
}
// Compute the code point beginning with the non-ASCII UTF-8 code unit |unit|,
// which must be the code unit most recently consumed.
//
// Returns false if the UTF-8 is invalid (after the matching error handler has
// run) or if updating line information for U+2028/U+2029 fails.  Otherwise
// returns true with |*codePoint| set to the decoded code point, where
// U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR are reported as '\n'.
template <class AnyCharsAccess>
bool TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePoint(
    int32_t unit, char32_t* codePoint) {
  MOZ_ASSERT(unit != EOF);
  MOZ_ASSERT(!isAsciiCodePoint(unit),
             "ASCII code unit/point must be handled separately");

  Utf8Unit lead(static_cast<unsigned char>(unit));
  MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
             "getNonAsciiCodePoint called incorrectly");

  // Each decoder failure mode is forwarded to the member function that
  // reports it.
  auto reportBadLead = [this, &lead]() { this->badLeadUnit(lead); };

  auto reportTruncated = [this, &lead](uint_fast8_t remaining,
                                       uint_fast8_t required) {
    this->notEnoughUnits(lead, remaining, required);
  };

  auto reportBadTrail = [this](uint_fast8_t unitsObserved) {
    this->badTrailingUnit(unitsObserved);
  };

  auto reportForbidden = [this](char32_t badCodePoint,
                                uint_fast8_t unitsObserved) {
    this->badCodePoint(badCodePoint, unitsObserved);
  };

  auto reportOverlong = [this](char32_t badCodePoint,
                               uint_fast8_t unitsObserved) {
    this->notShortestForm(badCodePoint, unitsObserved);
  };

  // Either the whole valid code point is consumed, or |lead| is ungotten and
  // the appropriate error functor is called.
  SourceUnitsIterator cursor(this->sourceUnits);
  Maybe<char32_t> decoded = DecodeOneUtf8CodePoint(
      lead, &cursor, SourceUnitsEnd(), reportBadLead, reportTruncated,
      reportBadTrail, reportForbidden, reportOverlong);
  if (decoded.isNothing()) {
    return false;
  }

  const char32_t decodedPoint = decoded.value();
  const bool isUnicodeLineBreak = decodedPoint == unicode::LINE_SEPARATOR ||
                                  decodedPoint == unicode::PARA_SEPARATOR;
  if (MOZ_LIKELY(!isUnicodeLineBreak)) {
    MOZ_ASSERT(!IsLineTerminator(decodedPoint));
    *codePoint = decodedPoint;
    return true;
  }

  if (updateLineInfoForEOL()) {
    *codePoint = '\n';
    return true;
  }

#ifdef DEBUG
  // Store a sentinel so accidental use of the result is likelier to be
  // noticed.
  *codePoint = std::numeric_limits<char32_t>::max();
#endif
  MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
  return false;
}
// Find where the window of context preceding |offset| begins, for UTF-16
// source text.
//
// Walks backward from |offset| over at most |WindowRadius| code units, never
// moving before |startOffset_|.  The walk stops early at a LineTerminator, at
// a lead surrogate, or at a trail surrogate that cannot be included together
// with its lead surrogate.  Returns the offset of the first code unit of the
// resulting window.
template <>
size_t SourceUnits<char16_t>::findWindowStart(size_t offset) const {
// This is JS's understanding of UTF-16 that allows lone surrogates, so
// we have to exclude lone surrogates from [windowStart, offset) ourselves.
const char16_t* const earliestPossibleStart = codeUnitPtrAt(startOffset_);
const char16_t* const initial = codeUnitPtrAt(offset);
const char16_t* p = initial;
// Number of code units currently between |p| and |initial|.
auto HalfWindowSize = [&p, &initial]() {
return PointerRangeSize(p, initial);
};
while (true) {
MOZ_ASSERT(earliestPossibleStart <= p);
MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
// Stop at the earliest permitted position or once the window is full.
if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) {
break;
}
// Look at the code unit just before |p| without including it yet.
char16_t c = p[-1];
// This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in
// string and template literals. These code points do affect line and
// column coordinates, even as they encode their literal values.
if (IsLineTerminator(c)) {
break;
}
// Don't allow invalid UTF-16 in pre-context. (Current users don't
// require this, and this behavior isn't currently imposed on
// pre-context, but these facts might change someday.)
// Walking backward, a lead surrogate reached here has no trail surrogate
// inside the window after it, so it is excluded.
if (MOZ_UNLIKELY(unicode::IsLeadSurrogate(c))) {
break;
}
// Optimistically include the code unit, reverting below if needed.
p--;
// If it's not a surrogate at all, keep going.
if (MOZ_LIKELY(!unicode::IsTrailSurrogate(c))) {
continue;
}
// Stop if we don't have a usable surrogate pair.
if (HalfWindowSize() >= WindowRadius ||
p <= earliestPossibleStart || // trail surrogate at low end
!unicode::IsLeadSurrogate(p[-1])) // no paired lead surrogate
{
// Undo the optimistic inclusion of the trail surrogate.
p++;
break;
}
// Include the lead surrogate that completes the pair.
p--;
}
MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
return offset - HalfWindowSize();
}
// Find where the window of context preceding |offset| begins, for UTF-8
// source text.
//
// Walks backward from |offset| one whole code point at a time, covering at
// most |WindowRadius| code units and never moving before |startOffset_|.  The
// walk stops early at any LineTerminator ('\r', '\n', U+2028, U+2029).
// Returns the offset of the first code unit of the resulting window.
template <>
size_t SourceUnits<Utf8Unit>::findWindowStart(size_t offset) const {
// |offset| must be the location of the error or somewhere before it, so we
// know preceding data is valid UTF-8.
const Utf8Unit* const earliestPossibleStart = codeUnitPtrAt(startOffset_);
const Utf8Unit* const initial = codeUnitPtrAt(offset);
const Utf8Unit* p = initial;
// Number of code units currently between |p| and |initial|.
auto HalfWindowSize = [&p, &initial]() {
return PointerRangeSize(p, initial);
};
while (true) {
MOZ_ASSERT(earliestPossibleStart <= p);
MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
// Stop at the earliest permitted position or once the window is full.
if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) {
break;
}
// Peek backward for a line break, and only decrement if there is none.
uint8_t prev = p[-1].toUint8();
// First check for the ASCII LineTerminators.
if (prev == '\r' || prev == '\n') {
break;
}
// Now check for the non-ASCII LineTerminators U+2028 LINE SEPARATOR
// (0xE2 0x80 0xA8) and U+2029 PARAGRAPH SEPARATOR (0xE2 0x80 0xA9). If
// there aren't three code units available, some comparison here will fail
// before we'd underflow.
if (MOZ_UNLIKELY((prev == 0xA8 || prev == 0xA9) &&
p[-2].toUint8() == 0x80 && p[-3].toUint8() == 0xE2)) {
break;
}
// Rewind over the non-LineTerminator. This can't underflow
// |earliestPossibleStart| because it begins a code point.
while (IsTrailingUnit(*--p)) {
continue;
}
MOZ_ASSERT(earliestPossibleStart <= p);
// But if we underflowed |WindowRadius|, adjust forward and stop.
if (HalfWindowSize() > WindowRadius) {
static_assert(WindowRadius > 3,
"skipping over non-lead code units below must not "
"advance past |offset|");
// Step forward past the code point just rewound over, so that it is
// excluded from the window as a whole.
while (IsTrailingUnit(*++p)) {
continue;
}
MOZ_ASSERT(HalfWindowSize() < WindowRadius);
break;
}
}
MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
return offset - HalfWindowSize();
}
// Find where the window of context following |offset| ends, for UTF-16
// source text.
//
// Walks forward from |offset| over at most |WindowRadius| code units, never
// passing |limit_|.  The walk stops early at a LineTerminator, at a trail
// surrogate, or at a lead surrogate that cannot be included together with its
// trail surrogate.  Returns the offset one past the last code unit of the
// resulting window.
template <>
size_t SourceUnits<char16_t>::findWindowEnd(size_t offset) const {
  const char16_t* const windowOrigin = codeUnitPtrAt(offset);
  const char16_t* cursor = windowOrigin;

  // Number of code units currently between |windowOrigin| and |cursor|.
  auto unitsCovered = [&windowOrigin, &cursor]() {
    return PointerRangeSize(windowOrigin, cursor);
  };

  for (;;) {
    MOZ_ASSERT(cursor <= limit_);
    MOZ_ASSERT(unitsCovered() <= WindowRadius);
    if (cursor >= limit_ || unitsCovered() >= WindowRadius) {
      break;
    }

    const char16_t unit = *cursor;

    // U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR end the window
    // even inside string and template literals: they affect line and column
    // coordinates despite encoding their literal values there.
    if (IsLineTerminator(unit)) {
      break;
    }

    // Keep invalid UTF-16 out of the post-context: walking forward, a trail
    // surrogate reached here has no lead surrogate before it in the window.
    if (MOZ_UNLIKELY(unicode::IsTrailSurrogate(unit))) {
      break;
    }

    // Tentatively take the code unit; this is undone below if it turns out
    // to be an unusable lead surrogate.
    cursor++;

    // Anything that isn't a surrogate needs no further checking.
    if (MOZ_LIKELY(!unicode::IsLeadSurrogate(unit))) {
      continue;
    }

    // The lead surrogate is kept only when its trail surrogate both exists
    // and fits inside the window and the source.
    const bool pairFits = unitsCovered() < WindowRadius &&  // pair not split
                          cursor < limit_ &&  // not at end of source
                          unicode::IsTrailSurrogate(*cursor);
    if (!pairFits) {
      cursor--;
      break;
    }

    cursor++;
  }

  return offset + unitsCovered();
}
// Find where the window of context following |offset| ends, for UTF-8 source
// text.
//
// Walks forward from |offset| one whole code point at a time, covering at
// most |WindowRadius| code units and never passing |limit_|.  The walk stops
// early at any LineTerminator or at the first invalidly-encoded code point.
// Returns the offset one past the last code unit of the resulting window.
template <>
size_t SourceUnits<Utf8Unit>::findWindowEnd(size_t offset) const {
  const Utf8Unit* const windowOrigin = codeUnitPtrAt(offset);
  const Utf8Unit* cursor = windowOrigin;

  // Number of code units currently between |windowOrigin| and |cursor|.
  auto unitsCovered = [&windowOrigin, &cursor]() {
    return PointerRangeSize(windowOrigin, cursor);
  };

  for (;;) {
    MOZ_ASSERT(cursor <= limit_);
    MOZ_ASSERT(unitsCovered() <= WindowRadius);
    if (cursor >= limit_ || unitsCovered() >= WindowRadius) {
      break;
    }

    // Text after |offset| has not necessarily been validated yet: a
    // non-encoding error can be followed by an encoding error.  Validate each
    // code point before admitting it so the window holds only valid UTF-8.
    Utf8Unit lead = *cursor;
    if (mozilla::IsAscii(lead)) {
      if (IsSingleUnitLineTerminator(lead)) {
        break;
      }

      cursor++;
      continue;
    }

    PeekedCodePoint<Utf8Unit> peeked = PeekCodePoint(cursor, limit_);
    if (peeked.isNone()) {
      // Encoding error: the window ends just before it.
      break;
    }

    const char32_t decodedPoint = peeked.codePoint();
    if (MOZ_UNLIKELY(decodedPoint == unicode::LINE_SEPARATOR ||
                     decodedPoint == unicode::PARA_SEPARATOR)) {
      break;
    }

    MOZ_ASSERT(!IsLineTerminator(decodedPoint));

    // Admit the code point only if all of its units fit in the window.
    const uint8_t unitCount = peeked.lengthInUnits();
    if (unitsCovered() + unitCount > WindowRadius) {
      break;
    }

    cursor += unitCount;
  }

  MOZ_ASSERT(unitsCovered() <= WindowRadius);
  return offset + unitsCovered();
}
// Consume code points until the next code unit to be read is at or beyond
// |position|, then reset the current token to an empty span at the reached
// offset and discard all lookahead.
//
// Returns false as soon as getting a code point fails, true otherwise.
template <typename Unit, class AnyCharsAccess>
bool TokenStreamSpecific<Unit, AnyCharsAccess>::advance(size_t position) {
  const Unit* const target = this->sourceUnits.codeUnitPtrAt(position);
  while (this->sourceUnits.addressOfNextCodeUnit() < target) {
    if (!getCodePoint()) {
      return false;
    }
  }

  TokenStreamAnyChars& shared = anyCharsAccess();

  // The current token is exposed as const, so constness is cast away in order
  // to overwrite its position in place.
  Token& current = const_cast<Token&>(shared.currentToken());
  current.pos.begin = this->sourceUnits.offset();
  current.pos.end = current.pos.begin;

  // The token's kind is no longer meaningful: use a placeholder in debug
  // builds and mark the memory undefined for memory checkers.
#ifdef DEBUG
  current.type = TokenKind::Limit;
#endif
  MOZ_MAKE_MEM_UNDEFINED(&current.type, sizeof(current.type));

  shared.lookahead = 0;
  return true;
}
// Restore this token stream to the state recorded in |pos|: the read
// position in the source units, the flags and line bookkeeping, the current
// token, and every lookahead token.
template <typename Unit, class AnyCharsAccess>
void TokenStreamSpecific<Unit, AnyCharsAccess>::seekTo(const Position& pos) {
  this->sourceUnits.setAddressOfNextCodeUnit(pos.buf,
                                             /* allowPoisoned = */ true);

  TokenStreamAnyChars& shared = anyCharsAccess();
  shared.flags = pos.flags;
  shared.lineno = pos.lineno;
  shared.linebase = pos.linebase;
  shared.prevLinebase = pos.prevLinebase;
  shared.lookahead = pos.lookahead;

  // Put the saved tokens back: the current one at the cursor, then each
  // lookahead token in the slot following it.
  shared.tokens[shared.cursor()] = pos.currentToken;
  for (unsigned ahead = 0; ahead < shared.lookahead; ++ahead) {
    shared.tokens[shared.aheadCursor(ahead + 1)] = pos.lookaheadTokens[ahead];
  }
}
// Seek to |pos| after first filling this stream's source coordinates from
// those of |other|.
//
// Returns false, without seeking, if filling the coordinates fails.
template <typename Unit, class AnyCharsAccess>
bool TokenStreamSpecific<Unit, AnyCharsAccess>::seekTo(
    const Position& pos, const TokenStreamAnyChars& other) {
  const bool coordsFilled = anyCharsAccess().srcCoords.fill(other.srcCoords);
  if (coordsFilled) {
    seekTo(pos);
  }

  return coordsFilled;
}
// Fill |err| for an error that has no source offset: the muted flag and
// filename come from this stream, the line number is zero, and the column is
// a default-constructed JS::ColumnNumberOneOrigin.  No line of context is
// added, so |err->lineOfContext| is expected to still be null.
void TokenStreamAnyChars::computeErrorMetadataNoOffset(
    ErrorMetadata* err) const {
  ErrorMetadata& metadata = *err;

  metadata.filename = filename_;
  metadata.isMuted = mutedErrors;

  metadata.columnNumber = JS::ColumnNumberOneOrigin();
  metadata.lineNumber = 0;

  MOZ_ASSERT(metadata.lineOfContext == nullptr);
}
// Fill every field of |err| other than the line of context.
//
// The return value is not a success/failure signal.  It is true when |err|
// was given this stream's own filename, and false when this stream has no
// filename and the filename, line, and column were taken from the calling
// script's frame instead.
bool TokenStreamAnyChars::fillExceptingContext(ErrorMetadata* err,
                                               uint32_t offset) const {
  err->isMuted = mutedErrors;

  // With no filename of our own, try to borrow location information from the
  // nearest non-builtin caller frame, if a JSContext is available.
  if (!filename_) {
    if (JSContext* cx = context()->maybeCurrentJSContext()) {
      NonBuiltinFrameIter callerFrames(
          cx, FrameIter::FOLLOW_DEBUGGER_EVAL_PREV_LINK,
          cx->realm()->principals());
      if (!callerFrames.done() && callerFrames.filename()) {
        JS::TaggedColumnNumberOneOrigin taggedColumn;
        err->filename = JS::ConstUTF8CharsZ(callerFrames.filename());
        err->lineNumber = callerFrames.computeLine(&taggedColumn);
        err->columnNumber =
            JS::ColumnNumberOneOrigin(taggedColumn.oneOriginValue());
        return false;
      }
    }
  }

  // No caller location was used: fall back on this stream's filename (which
  // may be null).
  err->filename = filename_;
  return true;
}
// UTF-16 specialization of the window offset/length conversion.
//
// The body consists solely of an unreachable-assertion: this specialization
// is not expected to be called, it reads none of its arguments, and it
// writes nothing to |*utf16TokenOffset| or |*utf16WindowLength|.
template <>
inline void SourceUnits<char16_t>::computeWindowOffsetAndLength(
const char16_t* encodedWindow, size_t encodedTokenOffset,
size_t* utf16TokenOffset, size_t encodedWindowLength,
size_t* utf16WindowLength) const {
MOZ_ASSERT_UNREACHABLE("shouldn't need to recompute for UTF-16");
}
// Convert a token offset and a window length, both measured in UTF-8 code
// units from |encodedWindow|, into the equivalent UTF-16 code unit counts.
//
// |encodedTokenOffset| must not exceed |encodedWindowLength|.  The results
// are written to |*utf16TokenOffset| and |*utf16WindowLength|.
template <>
inline void SourceUnits<Utf8Unit>::computeWindowOffsetAndLength(
    const Utf8Unit* encodedWindow, size_t encodedTokenOffset,
    size_t* utf16TokenOffset, size_t encodedWindowLength,
    size_t* utf16WindowLength) const {
  MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength,
             "token offset must be within the window, and the two lambda "
             "calls below presume this ordering of values");

  // Record both boundaries now: |encodedWindow| is advanced by the counting
  // lambda below.
  const Utf8Unit* const tokenStart = encodedWindow + encodedTokenOffset;
  const Utf8Unit* const windowEnd = encodedWindow + encodedWindowLength;
  MOZ_ASSERT(tokenStart <= windowEnd);

  // Running total of UTF-16 code units seen so far.  Together with
  // |encodedWindow| it persists across calls of the lambda, so the second
  // call resumes where the first one stopped.
  size_t utf16Units = 0;
  auto countUtf16UnitsUpTo = [&utf16Units,
                              &encodedWindow](const Utf8Unit* limit) {
    while (encodedWindow < limit) {
      Utf8Unit lead = *encodedWindow++;
      if (MOZ_UNLIKELY(!IsAscii(lead))) {
        Maybe<char32_t> cp =
            DecodeOneUtf8CodePoint(lead, &encodedWindow, limit);
        MOZ_ASSERT(cp.isSome(),
                   "computed window should only contain valid UTF-8");

        // Supplementary code points take a surrogate pair in UTF-16.
        utf16Units += unicode::IsSupplementary(cp.value()) ? 2 : 1;
      } else {
        // ASCII is one code unit in UTF-16 as well.
        utf16Units++;
      }
    }

    return utf16Units;
  };

  // First count from the start of the window up to the token...
  *utf16TokenOffset = countUtf16UnitsUpTo(tokenStart);

  // ...then continue on to the end of the window.
  *utf16WindowLength = countUtf16UnitsUpTo(windowEnd);
}
// Attach to |err| a window of source text surrounding |offset|, along with
// the window's length and the position of |offset| inside it, both measured
// in UTF-16 code units.
//
// Returns false if building or copying the window fails.  Returns true
// otherwise, including when the window is empty and nothing is attached.
template <typename Unit>
bool TokenStreamCharsBase<Unit>::addLineOfContext(ErrorMetadata* err,
                                                  uint32_t offset) const {
  // Everything named "encoded" below is measured in source units of the
  // Unit encoding, starting with the offset itself.
  const size_t encodedOffset = offset;

  const size_t windowStart = sourceUnits.findWindowStart(encodedOffset);
  const size_t windowEnd = sourceUnits.findWindowEnd(encodedOffset);
  const size_t windowLength = windowEnd - windowStart;
  MOZ_ASSERT(windowLength <= SourceUnits::WindowRadius * 2);

  // An invalid encoding at the very start of a line yields an empty window;
  // there is no point attaching an empty "line" of context.
  if (windowLength == 0) {
    MOZ_ASSERT(err->lineOfContext == nullptr,
               "ErrorMetadata::lineOfContext must be null so we don't "
               "have to set the lineLength/tokenOffset fields");
    return true;
  }

  const Unit* const windowUnits = sourceUnits.codeUnitPtrAt(windowStart);

  CharBuffer contextChars(fc);
  if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks(
          contextChars, windowUnits, windowUnits + windowLength)) {
    return false;
  }

  // Measure the UTF-16 length before the terminating null is appended.
  const size_t utf16Length = contextChars.length();
  if (!contextChars.append('\0')) {
    return false;
  }

  err->lineOfContext.reset(contextChars.extractOrCopyRawBuffer());
  if (!err->lineOfContext) {
    return false;
  }

  const size_t tokenOffsetInWindow = encodedOffset - windowStart;
  MOZ_ASSERT(tokenOffsetInWindow <= windowLength,
             "token offset must be inside the window");

  // A code point never takes fewer code units in UTF-8 than in UTF-16:
  //
  //   ASCII                  1 UTF-8 unit     1 UTF-16 unit
  //   [U+0080, U+10000)      2-3 UTF-8 units  1 UTF-16 unit
  //   [U+10000, U+10FFFF]    4 UTF-8 units    2 UTF-16 units
  //
  // So when the encoded window length equals the UTF-16 length (which always
  // holds for Unit=char16_t), the encoded offset and length are already the
  // UTF-16 ones.  Otherwise they must be converted from UTF-8 to UTF-16.
  if constexpr (std::is_same_v<Unit, char16_t>) {
    MOZ_ASSERT(utf16Length == windowLength,
               "UTF-16 to UTF-16 shouldn't change window length");
    err->tokenOffset = tokenOffsetInWindow;
    err->lineLength = windowLength;
  } else {
    static_assert(std::is_same_v<Unit, Utf8Unit>, "should only see UTF-8 here");

    const bool sameLength = utf16Length == windowLength;
#ifdef DEBUG
    auto isAscii = [](Unit u) { return IsAscii(u); };
    MOZ_ASSERT(std::all_of(windowUnits, windowUnits + windowLength, isAscii) ==
                   sameLength,
               "equal window lengths in UTF-8 should correspond only to "
               "wholly-ASCII text");
#endif

    if (!sameLength) {
      sourceUnits.computeWindowOffsetAndLength(
          windowUnits, tokenOffsetInWindow, &err->tokenOffset, windowLength,
          &err->lineLength);
    } else {
      err->tokenOffset = tokenOffsetInWindow;
      err->lineLength = windowLength;
    }
  }

  return true;
}
// Fill |err| for an error located at |errorOffset|, which holds either no
// offset, an explicit uint32_t offset, or a request to use the current
// offset in the source units.
//
// Returns false only if computing the line of context fails.
template <typename Unit, class AnyCharsAccess>
bool TokenStreamSpecific<Unit, AnyCharsAccess>::computeErrorMetadata(
    ErrorMetadata* err, const ErrorOffset& errorOffset) const {
  if (errorOffset.is<NoOffset>()) {
    anyCharsAccess().computeErrorMetadataNoOffset(err);
    return true;
  }

  // Use the explicit offset when one was supplied, else the current one.
  uint32_t offset;
  if (!errorOffset.is<uint32_t>()) {
    offset = this->sourceUnits.offset();
  } else {
    offset = errorOffset.as<uint32_t>();
  }

  // fillExceptingContext's result says whether this TokenStream can supply a
  // line of context; it is not a success/failure indication.  When it can't,
  // there is nothing further to fill in.
  if (!fillExceptingContext(err, offset)) {
    return true;
  }

  // Add a line of context from this TokenStream to aid debugging.
  return internalComputeLineOfContext(err, offset);
}
// Report JSMSG_ILLEGAL_CHARACTER for the code point |cp|, rendered in the
// message as "U+" followed by at least four uppercase hexadecimal digits.
// If the rendered string can't be allocated, out-of-memory is reported
// instead.
template <typename Unit, class AnyCharsAccess>
void TokenStreamSpecific<Unit, AnyCharsAccess>::reportIllegalCharacter(
    int32_t cp) {
  UniqueChars rendered = JS_smprintf("U+%04X", cp);
  if (rendered) {
    error(JSMSG_ILLEGAL_CHARACTER, rendered.get());
    return;
  }

  ReportOutOfMemory(anyCharsAccess().fc);
}
// A '\' has just been consumed: check whether a Unicode escape sequence
// follows it.
//
// If so, consume every code unit involved, store the encoded code point in
// |*codePoint|, and return the number of code units consumed after the '\'.
// Otherwise return 0 with the read position left directly after the '\'.
template <typename Unit, class AnyCharsAccess>
uint32_t GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscape(
    char32_t* codePoint) {
  MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));

  int32_t unit = getCodeUnit();
  if (unit == 'u') {
    unit = getCodeUnit();

    // \uXXXX: |unit| is the first hex digit and the next three supply the
    // low twelve bits.
    char16_t lowDigits;
    if (IsAsciiHexDigit(unit) &&
        this->sourceUnits.matchHexDigits(3, &lowDigits)) {
      *codePoint = (AsciiAlphanumericToNumber(unit) << 12) | lowDigits;
      return 5;
    }

    // \u{...}: the extended form is matched separately.
    if (unit == '{') {
      return matchExtendedUnicodeEscape(codePoint);
    }

    // Not an escape after all.  |unit| may be EOF here, so this ungets
    // either one or two units.
    ungetCodeUnit(unit);
    ungetCodeUnit('u');
  } else {
    // No 'u' after the backslash.  |unit| may be EOF here.
    ungetCodeUnit(unit);
  }

  MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
  return 0;
}
// The units '\', 'u', '{' have just been consumed: try to match the rest of
// an extended Unicode escape, i.e. hex digits followed by '}'.
//
// On success, store the code point in |*codePoint| and return the number of
// code units consumed after the '\'.  On failure, rewind so that the read
// position is directly after the '\' and return 0.
template <typename Unit, class AnyCharsAccess>
uint32_t
GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchExtendedUnicodeEscape(
    char32_t* codePoint) {
  MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('{'));

  int32_t unit = getCodeUnit();

  // Leading zeroes don't count toward the limit on significant digits.
  uint32_t zeroCount = 0;
  for (; unit == '0'; unit = getCodeUnit()) {
    zeroCount++;
  }

  // Accumulate at most six significant hex digits.
  size_t digitCount = 0;
  uint32_t value = 0;
  for (; IsAsciiHexDigit(unit) && digitCount < 6; digitCount++) {
    value = (value << 4) | AsciiAlphanumericToNumber(unit);
    unit = getCodeUnit();
  }

  // Total code units consumed since the '\'.
  const uint32_t consumed =
      2 +                       // 'u{'
      zeroCount + digitCount +  // hex digits
      (unit != EOF);  // the final get, which consumed nothing at EOF

  // A valid escape is closed by '}', contains at least one digit, and names
  // a code point no greater than unicode::NonBMPMax.
  const bool wellFormed = unit == '}' && (zeroCount > 0 || digitCount > 0) &&
                          value <= unicode::NonBMPMax;
  if (!wellFormed) {
    this->sourceUnits.unskipCodeUnits(consumed);
    MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
    return 0;
  }

  *codePoint = value;
  return consumed;
}
// After a '\', match a Unicode escape that encodes an identifier-start code
// point.
//
// On success, return the escape's length and leave the code point in
// |*codePoint|.  Otherwise return 0 with the read position directly after
// the '\'.
template <typename Unit, class AnyCharsAccess>
uint32_t
GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscapeIdStart(
    char32_t* codePoint) {
  const uint32_t escapeLength = matchUnicodeEscape(codePoint);
  if (MOZ_UNLIKELY(escapeLength == 0)) {
    // No escape at all: nothing was consumed.
    MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
    return 0;
  }

  if (MOZ_UNLIKELY(!unicode::IsIdentifierStart(*codePoint))) {
    // A well-formed escape, but not one that may start an identifier: put
    // its code units back.
    this->sourceUnits.unskipCodeUnits(escapeLength);
    MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
    return 0;
  }

  return escapeLength;
}
// After a '\', match a Unicode escape that encodes an identifier-part code
// point.
//
// On success, return true and leave the code point in |*codePoint|.
// Otherwise return false with the read position directly after the '\'.
template <typename Unit, class AnyCharsAccess>
bool GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscapeIdent(
    char32_t* codePoint) {
  const uint32_t escapeLength = matchUnicodeEscape(codePoint);
  if (MOZ_UNLIKELY(escapeLength == 0)) {
    // No escape at all: nothing was consumed.
    MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
    return false;
  }

  if (MOZ_UNLIKELY(!unicode::IsIdentifierPart(*codePoint))) {
    // A well-formed escape, but not one that may continue an identifier: put
    // its code units back.
    this->sourceUnits.unskipCodeUnits(escapeLength);
    MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
    return false;
  }

  return true;
}
template <typename Unit, class AnyCharsAccess>
[[nodiscard]] bool