Source code

Revision control

Copy as Markdown

Other Tools

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set sw=2 ts=2 et tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsHtml5StreamParser.h"
#include <stdlib.h>
#include <string.h>
#include <utility>
#include "ErrorList.h"
#include "GeckoProfiler.h"
#include "js/GCAPI.h"
#include "mozilla/Buffer.h"
#include "mozilla/CheckedInt.h"
#include "mozilla/Encoding.h"
#include "mozilla/EncodingDetector.h"
#include "mozilla/Likely.h"
#include "mozilla/Maybe.h"
#include "mozilla/SchedulerGroup.h"
#include "mozilla/ScopeExit.h"
#include "mozilla/Services.h"
#include "mozilla/StaticPrefs_html5.h"
#include "mozilla/StaticPrefs_network.h"
#include "mozilla/TextUtils.h"
#include "mozilla/glean/NetwerkMetrics.h"
#include "mozilla/Unused.h"
#include "mozilla/dom/BindingDeclarations.h"
#include "mozilla/dom/BrowsingContext.h"
#include "mozilla/dom/DebuggerUtilsBinding.h"
#include "mozilla/dom/Document.h"
#include "mozilla/Vector.h"
#include "nsContentSink.h"
#include "nsContentUtils.h"
#include "nsCycleCollectionTraversalCallback.h"
#include "nsHtml5AtomTable.h"
#include "nsHtml5Highlighter.h"
#include "nsHtml5Module.h"
#include "nsHtml5OwningUTF16Buffer.h"
#include "nsHtml5Parser.h"
#include "nsHtml5Speculation.h"
#include "nsHtml5StreamParserPtr.h"
#include "nsHtml5Tokenizer.h"
#include "nsHtml5TreeBuilder.h"
#include "nsHtml5TreeOpExecutor.h"
#include "nsIChannel.h"
#include "nsIContentSink.h"
#include "nsID.h"
#include "nsIDTD.h"
#include "nsIDocShell.h"
#include "nsIHttpChannel.h"
#include "nsIInputStream.h"
#include "nsINestedURI.h"
#include "nsIObserverService.h"
#include "nsIRequest.h"
#include "nsIRunnable.h"
#include "nsIScriptError.h"
#include "nsIThread.h"
#include "nsIThreadRetargetableRequest.h"
#include "nsITimer.h"
#include "nsIURI.h"
#include "nsJSEnvironment.h"
#include "nsLiteralString.h"
#include "nsNetUtil.h"
#include "nsString.h"
#include "nsTPromiseFlatString.h"
#include "nsThreadUtils.h"
#include "nsXULAppAPI.h"
// C-linkage declaration of the XML declaration scanner that
// SniffStreamBytes() uses in this file.
extern "C" {
// Defined in intl/encoding_glue/src/lib.rs
// Takes a pointer to a byte buffer and the buffer's length. Callers in this
// file null-check the returned pointer and treat null as "no encoding found".
const mozilla::Encoding* xmldecl_parse(const uint8_t* buf, size_t buf_len);
};
using namespace mozilla;
using namespace mozilla::dom;
/*
* Note that nsHtml5StreamParser implements cycle collecting AddRef and
* Release. Therefore, nsHtml5StreamParser must never be refcounted from
* the parser thread!
*
* To work around this limitation, runnables posted by the main thread to the
* parser thread hold their reference to the stream parser in an
* nsHtml5StreamParserPtr. Upon creation, nsHtml5StreamParserPtr addrefs the
* object it holds
* just like a regular nsRefPtr. This is OK, since the creation of the
* runnable and the nsHtml5StreamParserPtr happens on the main thread.
*
* When the runnable is done on the parser thread, the destructor of
* nsHtml5StreamParserPtr runs there. It doesn't call Release on the held object
* directly. Instead, it posts another runnable back to the main thread where
* that runnable calls Release on the wrapped object.
*
* When posting runnables in the other direction, the runnables have to be
* created on the main thread when nsHtml5StreamParser is instantiated and
* held for the lifetime of the nsHtml5StreamParser. This works, because the
* same runnable can be dispatched multiple times and currently runnables
* posted from the parser thread to main thread don't need to wrap any
* runnable-specific data. (In the other direction, the runnables most notably
* wrap the byte data of the stream.)
*/
// XPCOM boilerplate for nsHtml5StreamParser: cycle-collecting AddRef/Release,
// an interface table that exposes nsISupports only, and the cycle collection
// unlink/traverse implementations.
NS_IMPL_CYCLE_COLLECTING_ADDREF(nsHtml5StreamParser)
NS_IMPL_CYCLE_COLLECTING_RELEASE(nsHtml5StreamParser)
NS_INTERFACE_TABLE_HEAD(nsHtml5StreamParser)
NS_INTERFACE_TABLE(nsHtml5StreamParser, nsISupports)
NS_INTERFACE_TABLE_TO_MAP_SEGUE_CYCLE_COLLECTION(nsHtml5StreamParser)
NS_INTERFACE_MAP_END
NS_IMPL_CYCLE_COLLECTION_CLASS(nsHtml5StreamParser)
// Unlink: drop the flush timer, unlink mRequest and mOwner, and clear the two
// flusher runnables and the executor pointer.
NS_IMPL_CYCLE_COLLECTION_UNLINK_BEGIN(nsHtml5StreamParser)
tmp->DropTimer();
NS_IMPL_CYCLE_COLLECTION_UNLINK(mRequest)
NS_IMPL_CYCLE_COLLECTION_UNLINK(mOwner)
tmp->mExecutorFlusher = nullptr;
tmp->mLoadFlusher = nullptr;
tmp->mExecutor = nullptr;
NS_IMPL_CYCLE_COLLECTION_UNLINK_END
// Traverse: report mRequest and mOwner, plus one executor edge for each
// flusher runnable that is still set. Each flusher class holds a
// RefPtr<nsHtml5TreeOpExecutor>, which is the edge being reported on the
// runnable's behalf.
NS_IMPL_CYCLE_COLLECTION_TRAVERSE_BEGIN(nsHtml5StreamParser)
NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mRequest)
NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mOwner)
// hack: count the strongly owned edge wrapped in the runnable
if (tmp->mExecutorFlusher) {
NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mExecutorFlusher->mExecutor");
cb.NoteXPCOMChild(static_cast<nsIContentSink*>(tmp->mExecutor));
}
// hack: count the strongly owned edge wrapped in the runnable
if (tmp->mLoadFlusher) {
NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mLoadFlusher->mExecutor");
cb.NoteXPCOMChild(static_cast<nsIContentSink*>(tmp->mExecutor));
}
NS_IMPL_CYCLE_COLLECTION_TRAVERSE_END
/**
 * Runnable that asks the tree op executor to run its flush loop.
 *
 * The runnable holds a strong reference to the executor. If the executor is
 * currently in a list, Run() does nothing. In a content process, when a
 * high-priority event is pending for the top-level document before its
 * contentful paint, the runnable re-dispatches itself via the document
 * instead of flushing right away.
 */
class nsHtml5ExecutorFlusher : public Runnable {
 public:
  explicit nsHtml5ExecutorFlusher(nsHtml5TreeOpExecutor* aExecutor)
      : Runnable("nsHtml5ExecutorFlusher"), mExecutor(aExecutor) {}

  NS_IMETHOD Run() override {
    if (mExecutor->isInList()) {
      // Nothing to do while the executor is in a list.
      return NS_OK;
    }

    Document* document = mExecutor->GetDocument();
    const bool yieldToPendingPaint =
        XRE_IsContentProcess() &&
        nsContentUtils::
            HighPriorityEventPendingForTopLevelDocumentBeforeContentfulPaint(
                document);
    if (yieldToPendingPaint) {
      // An early paint may be pending: re-queue this same runnable and try
      // RunFlushLoop later. If dispatching fails, fall through and flush now.
      nsCOMPtr<nsIRunnable> self = this;
      if (NS_SUCCEEDED(document->Dispatch(self.forget()))) {
        PROFILER_MARKER_UNTYPED("HighPrio blocking parser flushing(1)", DOM);
        return NS_OK;
      }
    }

    mExecutor->RunFlushLoop();
    return NS_OK;
  }

 private:
  RefPtr<nsHtml5TreeOpExecutor> mExecutor;
};
/**
 * Runnable that makes the tree op executor flush its speculative loads.
 * Holds a strong reference to the executor for as long as it lives.
 */
class nsHtml5LoadFlusher : public Runnable {
 public:
  explicit nsHtml5LoadFlusher(nsHtml5TreeOpExecutor* aExecutor)
      : Runnable("nsHtml5LoadFlusher"), mExecutor(aExecutor) {}

  // Always reports success; the flush call's outcome is not inspected.
  NS_IMETHOD Run() override {
    mExecutor->FlushSpeculativeLoads();
    return NS_OK;
  }

 private:
  RefPtr<nsHtml5TreeOpExecutor> mExecutor;
};
// Constructs a stream parser bound to aExecutor and aOwner for the given
// parser mode. Must be called on the main thread (asserted below).
//
// Besides initializing the state flags, this creates the tree builder, the
// tokenizer, the two flusher runnables and the flush timer. In the two
// view-source modes it also creates a highlighter that is handed to both the
// tokenizer and the tree builder.
nsHtml5StreamParser::nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
nsHtml5Parser* aOwner,
eParserMode aMode)
: mBomState(eBomState::BOM_SNIFFING_NOT_STARTED),
mCharsetSource(kCharsetUninitialized),
mEncodingSwitchSource(kCharsetUninitialized),
mEncoding(X_USER_DEFINED_ENCODING), // Obviously bogus value to notice if
// not updated
mNeedsEncodingSwitchTo(nullptr),
mSeenEligibleMetaCharset(false),
mChardetEof(false),
#ifdef DEBUG
mStartedFeedingDetector(false),
mStartedFeedingDevTools(false),
#endif
mReparseForbidden(false),
mForceAutoDetection(false),
mChannelHadCharset(false),
mLookingForMetaCharset(false),
mStartsWithLtQuestion(false),
mLookingForXmlDeclarationForXmlViewSource(false),
mTemplatePushedOrHeadPopped(false),
mGtBuffer(nullptr),
mGtPos(0),
mLastBuffer(nullptr), // Will be filled when starting
mExecutor(aExecutor),
// In the view-source modes the tree builder's first argument is nullptr
// instead of the executor's stage; the second argument is always the stage.
mTreeBuilder(new nsHtml5TreeBuilder(
(aMode == VIEW_SOURCE_HTML || aMode == VIEW_SOURCE_XML)
? nullptr
: mExecutor->GetStage(),
mExecutor->GetStage(), aMode == NORMAL)),
mTokenizer(
new nsHtml5Tokenizer(mTreeBuilder.get(), aMode == VIEW_SOURCE_XML)),
mTokenizerMutex("nsHtml5StreamParser mTokenizerMutex"),
mOwner(aOwner),
mLastWasCR(false),
mStreamState(eHtml5StreamState::STREAM_NOT_STARTED),
mSpeculating(false),
mAtEOF(false),
mSpeculationMutex("nsHtml5StreamParser mSpeculationMutex"),
mSpeculationFailureCount(0),
mNumBytesBuffered(0),
mTerminated(false),
mInterrupted(false),
mEventTarget(nsHtml5Module::GetStreamParserEventTarget()),
// Both flusher runnables are created here, once, and wrap aExecutor.
mExecutorFlusher(new nsHtml5ExecutorFlusher(aExecutor)),
mLoadFlusher(new nsHtml5LoadFlusher(aExecutor)),
mInitialEncodingWasFromParentFrame(false),
mHasHadErrors(false),
mDetectorHasSeenNonAscii(false),
mDecodingLocalFileWithoutTokenizing(false),
mBufferingBytes(false),
// The flush timer is created against the stream parser event target.
mFlushTimer(NS_NewTimer(mEventTarget)),
mFlushTimerMutex("nsHtml5StreamParser mFlushTimerMutex"),
mFlushTimerArmed(false),
mFlushTimerEverFired(false),
mMode(aMode) {
NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
#ifdef DEBUG
mAtomTable.SetPermittedLookupEventTarget(mEventTarget);
#endif
// The tokenizer interns through this parser's atom table and reports
// encoding declarations back to this object.
mTokenizer->setInterner(&mAtomTable);
mTokenizer->setEncodingDeclarationHandler(this);
if (aMode == VIEW_SOURCE_HTML || aMode == VIEW_SOURCE_XML) {
nsHtml5Highlighter* highlighter =
new nsHtml5Highlighter(mExecutor->GetStage());
mTokenizer->EnableViewSource(highlighter); // takes ownership
mTreeBuilder->EnableViewSource(highlighter); // doesn't own
}
// There's a zeroing operator new for everything else
}
// Destroys the stream parser. Must run on the main thread (asserted).
// Always ends the tokenizer. In DEBUG builds it additionally asserts, under
// the flush timer mutex, that the flush timer was already dropped, and then
// explicitly nulls out the members listed below.
nsHtml5StreamParser::~nsHtml5StreamParser() {
NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
mTokenizer->end();
#ifdef DEBUG
{
mozilla::MutexAutoLock flushTimerLock(mFlushTimerMutex);
MOZ_ASSERT(!mFlushTimer, "Flush timer was not dropped before dtor!");
}
// DEBUG-only explicit clearing; release builds leave these members to the
// implicit member destruction.
mRequest = nullptr;
mUnicodeDecoder = nullptr;
mFirstBuffer = nullptr;
mExecutor = nullptr;
mTreeBuilder = nullptr;
mTokenizer = nullptr;
mOwner = nullptr;
#endif
}
/**
 * Hands out the current request as an nsIChannel.
 *
 * @param aChannel out-param that receives the channel on success.
 * @return NS_ERROR_NOT_AVAILABLE when there is no request; otherwise the
 *         result of querying the request for nsIChannel.
 *
 * Main thread only (asserted).
 */
nsresult nsHtml5StreamParser::GetChannel(nsIChannel** aChannel) {
  NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
  if (!mRequest) {
    return NS_ERROR_NOT_AVAILABLE;
  }
  return CallQueryInterface(mRequest, aChannel);
}
// Asks mDetector for its current guess and classifies how that guess was
// arrived at.
//
// aInitial selects between the "Initial" and the "Final" family of
// nsCharsetSource values.
//
// Returns the guessed encoding together with the nsCharsetSource describing
// it. This function assigns no member variables itself; callers store the
// result.
std::tuple<NotNull<const Encoding*>, nsCharsetSource>
nsHtml5StreamParser::GuessEncoding(bool aInitial) {
// Guessing must not happen once any of the "Final" detection sources has
// already been committed to.
MOZ_ASSERT(
mCharsetSource != kCharsetFromFinalUserForcedAutoDetection &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8InitialWasASCII &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8GenericInitialWasASCII &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8ContentInitialWasASCII &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLDInitialWasASCII &&
mCharsetSource != kCharsetFromFinalAutoDetectionFile);
// The guess made with an empty TLD and `true` as the second argument. It is
// used as the actual answer when detection is forced, and otherwise only as
// a comparison point for the classification below.
auto ifHadBeenForced = mDetector->Guess(EmptyCString(), true);
auto encoding =
mForceAutoDetection
? ifHadBeenForced
: mDetector->Guess(mTLD, mDecodingLocalFileWithoutTokenizing);
// Coarse classification: forced / local file / generic, for the Initial or
// the Final family.
// NOTE(review): the local-file case yields the ...FinalAutoDetectionFile
// source even when aInitial is true; presumably deliberate -- confirm.
nsCharsetSource source =
aInitial
? (mForceAutoDetection
? kCharsetFromInitialUserForcedAutoDetection
: (mDecodingLocalFileWithoutTokenizing
? kCharsetFromFinalAutoDetectionFile
: kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic))
: (mForceAutoDetection
? kCharsetFromFinalUserForcedAutoDetection
: (mDecodingLocalFileWithoutTokenizing
? kCharsetFromFinalAutoDetectionFile
: kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic));
// Refine the generic Final classification.
if (source == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic) {
if (encoding == ISO_2022_JP_ENCODING) {
if (EncodingDetector::TldMayAffectGuess(mTLD)) {
source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content;
}
} else if (!mDetectorHasSeenNonAscii) {
source = kCharsetFromInitialAutoDetectionASCII; // deliberately Initial
} else if (ifHadBeenForced == UTF_8_ENCODING) {
MOZ_ASSERT(mCharsetSource == kCharsetFromInitialAutoDetectionASCII ||
mCharsetSource ==
kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8 ||
mEncoding == ISO_2022_JP_ENCODING);
source = kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8InitialWasASCII;
} else if (encoding != ifHadBeenForced) {
// The TLD-aware guess differs from the TLD-less one.
if (mCharsetSource == kCharsetFromInitialAutoDetectionASCII) {
source =
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLDInitialWasASCII;
} else {
source =
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
}
} else if (EncodingDetector::TldMayAffectGuess(mTLD)) {
if (mCharsetSource == kCharsetFromInitialAutoDetectionASCII) {
source =
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8ContentInitialWasASCII;
} else {
source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content;
}
} else if (mCharsetSource == kCharsetFromInitialAutoDetectionASCII) {
source =
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8GenericInitialWasASCII;
}
// Refine the generic Initial classification; same ladder as above but
// without the "InitialWasASCII" variants.
} else if (source ==
kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic) {
if (encoding == ISO_2022_JP_ENCODING) {
if (EncodingDetector::TldMayAffectGuess(mTLD)) {
source = kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content;
}
} else if (!mDetectorHasSeenNonAscii) {
source = kCharsetFromInitialAutoDetectionASCII;
} else if (ifHadBeenForced == UTF_8_ENCODING) {
source = kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8;
} else if (encoding != ifHadBeenForced) {
source =
kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
} else if (EncodingDetector::TldMayAffectGuess(mTLD)) {
source = kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content;
}
}
return {encoding, source};
}
/**
 * Feeds a chunk of raw stream bytes to the encoding detector and records the
 * detector's answer in mDetectorHasSeenNonAscii.
 *
 * Must not be called after DetectorEof() has signalled the end of input
 * (asserted).
 *
 * @param aBuffer the bytes to feed.
 */
void nsHtml5StreamParser::FeedDetector(Span<const uint8_t> aBuffer) {
#ifdef DEBUG
  mStartedFeedingDetector = true;
#endif
  MOZ_ASSERT(!mChardetEof);
  // The end of input is only ever signalled by DetectorEof().
  const bool isLastChunk = false;
  mDetectorHasSeenNonAscii = mDetector->Feed(aBuffer, isLastChunk);
}
void nsHtml5StreamParser::DetectorEof() {
#ifdef DEBUG
mStartedFeedingDetector = true;
#endif
if (mChardetEof) {
return;
}
mChardetEof = true;
mDetectorHasSeenNonAscii = mDetector->Feed(Span<const uint8_t>(), true);
}
/**
 * Records the title to show for a view-source document and, when the
 * document's browsing context is watched by DevTools, remembers the URI and
 * a freshly generated UUID for reporting content to DevTools.
 *
 * For a view-source: URL the title comes from the inner URI. data: URLs are
 * abbreviated to "data:" followed by an ellipsis. If the spec cannot be
 * obtained the title is a lone ellipsis.
 *
 * @param aURL the URL being viewed; may be null, in which case the title is
 *             left untouched.
 *
 * Main thread only (asserted).
 */
void nsHtml5StreamParser::SetViewSourceTitle(nsIURI* aURL) {
  MOZ_ASSERT(NS_IsMainThread());

  BrowsingContext* bc = mExecutor->GetDocument()->GetBrowsingContext();
  if (bc && bc->WatchedByDevTools()) {
    mURIToSendToDevtools = aURL;
    nsID parserId;
    if (NS_SUCCEEDED(nsID::GenerateUUIDInPlace(parserId))) {
      char idChars[NSID_LENGTH];
      parserId.ToProvidedString(idChars);
      mUUIDForDevtools = NS_ConvertASCIItoUTF16(idChars);
    }
  }

  if (!aURL) {
    return;
  }

  // Pick the URI whose spec becomes the title.
  nsCOMPtr<nsIURI> titleURI;
  if (aURL->SchemeIs("view-source")) {
    nsCOMPtr<nsINestedURI> nested = do_QueryInterface(aURL);
    nested->GetInnerURI(getter_AddRefs(titleURI));
  } else {
    titleURI = aURL;
  }

  if (titleURI->SchemeIs("data")) {
    // Avoid showing potentially huge data: URLs. The three last bytes are
    // UTF-8 for an ellipsis.
    mViewSourceTitle.AssignLiteral("data:\xE2\x80\xA6");
    return;
  }

  if (NS_FAILED(titleURI->GetSpec(mViewSourceTitle))) {
    mViewSourceTitle.AssignLiteral("\xE2\x80\xA6");
  }
}
// Creates a BOM-removing decoder for the current mEncoding, then decodes
// aPrefix followed by aFromSegment through WriteStreamBytes().
//
// aPrefix:      bytes to decode first. SniffStreamBytes() passes the bytes it
//               consumed in earlier calls while sniffing.
// aFromSegment: the current segment of stream bytes.
//
// Returns the failure from writing aPrefix if that fails; otherwise the
// result of writing aFromSegment. Parser thread only (asserted).
nsresult
nsHtml5StreamParser::SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
Span<const uint8_t> aPrefix, Span<const uint8_t> aFromSegment) {
NS_ASSERTION(IsParserThread(), "Wrong thread!");
mUnicodeDecoder = mEncoding->NewDecoderWithBOMRemoval();
nsresult rv = WriteStreamBytes(aPrefix);
NS_ENSURE_SUCCESS(rv, rv);
return WriteStreamBytes(aFromSegment);
}
/**
 * Commits to the encoding announced by a byte order mark.
 *
 * Sets the encoding and charset source, creates a decoder that does no BOM
 * handling of its own, switches off the other encoding-discovery modes,
 * informs the tree builder and marks BOM sniffing as over. In
 * VIEW_SOURCE_HTML mode it also starts the view-source body contents.
 *
 * @param aEncoding the encoding indicated by the BOM.
 *
 * Parser thread only (asserted).
 */
void nsHtml5StreamParser::SetupDecodingFromBom(
    NotNull<const Encoding*> aEncoding) {
  MOZ_ASSERT(IsParserThread(), "Wrong thread!");

  // Encoding, its provenance and the matching decoder. SniffStreamBytes()
  // only writes the bytes after the BOM, so the decoder must not look for
  // one.
  mEncoding = aEncoding;
  mCharsetSource = kCharsetFromByteOrderMark;
  mUnicodeDecoder = mEncoding->NewDecoderWithoutBOMHandling();

  // Switch off the other ways of determining the encoding.
  mForceAutoDetection = false;
  mLookingForMetaCharset = false;
  mBufferingBytes = false;
  mDecodingLocalFileWithoutTokenizing = false;

  mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource, false);
  mBomState = BOM_SNIFFING_OVER;

  if (mMode == VIEW_SOURCE_HTML) {
    mTokenizer->StartViewSourceBodyContents();
  }
}
/**
 * Commits to a UTF-16 encoding that was recognized from a BOM-less "<?x"
 * byte pattern at the start of the stream.
 *
 * Mirrors SetupDecodingFromBom() with kCharsetFromXmlDeclarationUtf16 as the
 * source. SniffStreamBytes() consumed the bytes of "<?x" while sniffing and
 * only writes what follows them, so the three code units are placed into
 * mLastBuffer here and reported through OnNewContent().
 *
 * @param aEncoding the UTF-16 variant that was recognized.
 *
 * Parser thread only (asserted).
 */
void nsHtml5StreamParser::SetupDecodingFromUtf16BogoXml(
    NotNull<const Encoding*> aEncoding) {
  MOZ_ASSERT(IsParserThread(), "Wrong thread!");

  // Encoding, its provenance and a decoder without BOM handling.
  mEncoding = aEncoding;
  mCharsetSource = kCharsetFromXmlDeclarationUtf16;
  mUnicodeDecoder = mEncoding->NewDecoderWithoutBOMHandling();

  // Switch off the other ways of determining the encoding.
  mForceAutoDetection = false;
  mLookingForMetaCharset = false;
  mBufferingBytes = false;
  mDecodingLocalFileWithoutTokenizing = false;

  mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource, false);
  mBomState = BOM_SNIFFING_OVER;

  if (mMode == VIEW_SOURCE_HTML) {
    mTokenizer->StartViewSourceBodyContents();
  }

  // Put the already-consumed "<?x" into the output buffer as UTF-16.
  const size_t replayedLength = 3;
  auto tail = mLastBuffer->TailAsSpan(READ_BUFFER_SIZE);
  tail[0] = '<';
  tail[1] = '?';
  tail[2] = 'x';
  mLastBuffer->AdvanceEnd(replayedLength);
  MOZ_ASSERT(!mStartedFeedingDevTools);
  OnNewContent(tail.To(replayedLength));
}
size_t nsHtml5StreamParser::LengthOfLtContainingPrefixInSecondBuffer() {
MOZ_ASSERT(mBufferedBytes.Length() <= 2);
if (mBufferedBytes.Length() < 2) {
return 0;
}
Buffer<uint8_t>& second = mBufferedBytes[1];
const uint8_t* elements = second.Elements();
const uint8_t* lt = (const uint8_t*)memchr(elements, '>', second.Length());
if (lt) {
return (lt - elements) + 1;
}
return 0;
}
// Runs the BOM / BOM-less UTF-16 "<?x" sniffing state machine over
// aFromSegment and, once sniffing is over, sets up decoding and writes the
// bytes through the decoder.
//
// aFromSegment: the next chunk of stream bytes; must be empty when aEof is
//               true (asserted).
// aEof:         true when the call signals the end of the stream.
//
// Returns NS_OK when more bytes are needed before a decision can be made or
// when writing succeeded; otherwise an error from flushing, from
// WriteStreamBytes(), or NS_ERROR_OUT_OF_MEMORY.
//
// Parser thread only (asserted). The state machine position is kept in
// mBomState across calls, so a candidate pattern may span several segments.
nsresult nsHtml5StreamParser::SniffStreamBytes(Span<const uint8_t> aFromSegment,
bool aEof) {
MOZ_ASSERT(IsParserThread(), "Wrong thread!");
MOZ_ASSERT_IF(aEof, aFromSegment.IsEmpty());
// If the charset source is in the range of "Final" detection sources, this
// sniffing pass is happening with a detector-chosen encoding already in
// place; ask the tree builder to maybe complain about that.
if (mCharsetSource >=
kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8InitialWasASCII &&
mCharsetSource <= kCharsetFromFinalUserForcedAutoDetection) {
if (mMode == PLAIN_TEXT || mMode == VIEW_SOURCE_PLAIN) {
mTreeBuilder->MaybeComplainAboutCharset("EncDetectorReloadPlain", true,
0);
} else {
mTreeBuilder->MaybeComplainAboutCharset("EncDetectorReload", true, 0);
}
}
// mEncoding and mCharsetSource potentially have come from channel or higher
// by now. If we find a BOM, SetupDecodingFromBom() will overwrite them.
// If we don't find a BOM, the previously set values of mEncoding and
// mCharsetSource are not modified by the BOM sniffing here.
// Byte patterns that a partially matched candidate may have consumed. When a
// candidate fails, the consumed bytes that are not part of aFromSegment are
// replayed from the matching array.
static uint8_t utf8[] = {0xEF, 0xBB};
static uint8_t utf16le[] = {0xFF};
static uint8_t utf16be[] = {0xFE};
static uint8_t utf16leXml[] = {'<', 0x00, '?', 0x00, 'x'};
static uint8_t utf16beXml[] = {0x00, '<', 0x00, '?', 0x00};
// Buffer for replaying past bytes based on state machine state. If
// writing this from scratch, probably wouldn't do it this way, but
// let's keep the changes to a minimum.
const uint8_t* prefix = utf8;
size_t prefixLength = 0;
if (aEof && mBomState == BOM_SNIFFING_NOT_STARTED) {
// Avoid handling aEof in the BOM_SNIFFING_NOT_STARTED state below.
mBomState = BOM_SNIFFING_OVER;
}
// With aEof set the loop body runs exactly once (see the break at the end of
// the body) so that the current state can give up on its candidate.
for (size_t i = 0;
(i < aFromSegment.Length() && mBomState != BOM_SNIFFING_OVER) || aEof;
i++) {
switch (mBomState) {
case BOM_SNIFFING_NOT_STARTED:
MOZ_ASSERT(i == 0, "Bad BOM sniffing state.");
MOZ_ASSERT(!aEof, "Should have checked for aEof above!");
switch (aFromSegment[0]) {
case 0xEF:
mBomState = SEEN_UTF_8_FIRST_BYTE;
break;
case 0xFF:
mBomState = SEEN_UTF_16_LE_FIRST_BYTE;
break;
case 0xFE:
mBomState = SEEN_UTF_16_BE_FIRST_BYTE;
break;
case 0x00:
// The BOM-less UTF-16 "<?x" candidates are only considered when the
// charset source is below kCharsetFromXmlDeclarationUtf16 and did not
// come from the channel.
if (mCharsetSource < kCharsetFromXmlDeclarationUtf16 &&
mCharsetSource != kCharsetFromChannel) {
mBomState = SEEN_UTF_16_BE_XML_FIRST;
} else {
mBomState = BOM_SNIFFING_OVER;
}
break;
case '<':
if (mCharsetSource < kCharsetFromXmlDeclarationUtf16 &&
mCharsetSource != kCharsetFromChannel) {
mBomState = SEEN_UTF_16_LE_XML_FIRST;
} else {
mBomState = BOM_SNIFFING_OVER;
}
break;
default:
mBomState = BOM_SNIFFING_OVER;
break;
}
break;
case SEEN_UTF_16_LE_FIRST_BYTE:
if (!aEof && aFromSegment[i] == 0xFE) {
SetupDecodingFromBom(UTF_16LE_ENCODING);
return WriteStreamBytes(aFromSegment.From(i + 1));
}
prefix = utf16le;
// In each failure branch, prefixLength = N - i where N is the number of
// candidate bytes matched so far. i of those bytes are at the start of
// aFromSegment; the other N - i arrived in earlier calls and are
// replayed from `prefix`.
prefixLength = 1 - i;
mBomState = BOM_SNIFFING_OVER;
break;
case SEEN_UTF_16_BE_FIRST_BYTE:
if (!aEof && aFromSegment[i] == 0xFF) {
SetupDecodingFromBom(UTF_16BE_ENCODING);
return WriteStreamBytes(aFromSegment.From(i + 1));
}
prefix = utf16be;
prefixLength = 1 - i;
mBomState = BOM_SNIFFING_OVER;
break;
case SEEN_UTF_8_FIRST_BYTE:
// `prefix` still points at utf8 here (its initial value).
if (!aEof && aFromSegment[i] == 0xBB) {
mBomState = SEEN_UTF_8_SECOND_BYTE;
} else {
prefixLength = 1 - i;
mBomState = BOM_SNIFFING_OVER;
}
break;
case SEEN_UTF_8_SECOND_BYTE:
if (!aEof && aFromSegment[i] == 0xBF) {
SetupDecodingFromBom(UTF_8_ENCODING);
return WriteStreamBytes(aFromSegment.From(i + 1));
}
prefixLength = 2 - i;
mBomState = BOM_SNIFFING_OVER;
break;
case SEEN_UTF_16_BE_XML_FIRST:
if (!aEof && aFromSegment[i] == '<') {
mBomState = SEEN_UTF_16_BE_XML_SECOND;
} else {
prefix = utf16beXml;
prefixLength = 1 - i;
mBomState = BOM_SNIFFING_OVER;
}
break;
case SEEN_UTF_16_BE_XML_SECOND:
if (!aEof && aFromSegment[i] == 0x00) {
mBomState = SEEN_UTF_16_BE_XML_THIRD;
} else {
prefix = utf16beXml;
prefixLength = 2 - i;
mBomState = BOM_SNIFFING_OVER;
}
break;
case SEEN_UTF_16_BE_XML_THIRD:
if (!aEof && aFromSegment[i] == '?') {
mBomState = SEEN_UTF_16_BE_XML_FOURTH;
} else {
prefix = utf16beXml;
prefixLength = 3 - i;
mBomState = BOM_SNIFFING_OVER;
}
break;
case SEEN_UTF_16_BE_XML_FOURTH:
if (!aEof && aFromSegment[i] == 0x00) {
mBomState = SEEN_UTF_16_BE_XML_FIFTH;
} else {
prefix = utf16beXml;
prefixLength = 4 - i;
mBomState = BOM_SNIFFING_OVER;
}
break;
case SEEN_UTF_16_BE_XML_FIFTH:
if (!aEof && aFromSegment[i] == 'x') {
SetupDecodingFromUtf16BogoXml(UTF_16BE_ENCODING);
return WriteStreamBytes(aFromSegment.From(i + 1));
}
prefix = utf16beXml;
prefixLength = 5 - i;
mBomState = BOM_SNIFFING_OVER;
break;
case SEEN_UTF_16_LE_XML_FIRST:
if (!aEof && aFromSegment[i] == 0x00) {
mBomState = SEEN_UTF_16_LE_XML_SECOND;
} else {
// Not UTF-16LE, but remember whether the stream starts with "<?" in
// the non-plain-text modes; this is consulted below for
// VIEW_SOURCE_XML.
if (!aEof && aFromSegment[i] == '?' &&
!(mMode == PLAIN_TEXT || mMode == VIEW_SOURCE_PLAIN)) {
mStartsWithLtQuestion = true;
}
prefix = utf16leXml;
prefixLength = 1 - i;
mBomState = BOM_SNIFFING_OVER;
}
break;
case SEEN_UTF_16_LE_XML_SECOND:
if (!aEof && aFromSegment[i] == '?') {
mBomState = SEEN_UTF_16_LE_XML_THIRD;
} else {
prefix = utf16leXml;
prefixLength = 2 - i;
mBomState = BOM_SNIFFING_OVER;
}
break;
case SEEN_UTF_16_LE_XML_THIRD:
if (!aEof && aFromSegment[i] == 0x00) {
mBomState = SEEN_UTF_16_LE_XML_FOURTH;
} else {
prefix = utf16leXml;
prefixLength = 3 - i;
mBomState = BOM_SNIFFING_OVER;
}
break;
case SEEN_UTF_16_LE_XML_FOURTH:
if (!aEof && aFromSegment[i] == 'x') {
mBomState = SEEN_UTF_16_LE_XML_FIFTH;
} else {
prefix = utf16leXml;
prefixLength = 4 - i;
mBomState = BOM_SNIFFING_OVER;
}
break;
case SEEN_UTF_16_LE_XML_FIFTH:
if (!aEof && aFromSegment[i] == 0x00) {
SetupDecodingFromUtf16BogoXml(UTF_16LE_ENCODING);
return WriteStreamBytes(aFromSegment.From(i + 1));
}
prefix = utf16leXml;
prefixLength = 5 - i;
mBomState = BOM_SNIFFING_OVER;
break;
default:
mBomState = BOM_SNIFFING_OVER;
break;
}
if (aEof) {
break;
}
}
// if we get here, there either was no BOM or the BOM sniffing isn't complete
// yet
MOZ_ASSERT(mCharsetSource != kCharsetFromByteOrderMark,
"Should not come here if BOM was found.");
MOZ_ASSERT(mCharsetSource != kCharsetFromXmlDeclarationUtf16,
"Should not come here if UTF-16 bogo-XML declaration was found.");
MOZ_ASSERT(mCharsetSource != kCharsetFromOtherComponent,
"kCharsetFromOtherComponent is for XSLT.");
if (mBomState == BOM_SNIFFING_OVER) {
// Three ways to proceed: (1) XML view source that starts with "<?" looks
// for an XML declaration, (2) other modes without a channel-or-higher
// charset (or with forced detection) start a speculation for the meta
// scan, (3) everything else commits to the current encoding right away.
if (mMode == VIEW_SOURCE_XML && mStartsWithLtQuestion &&
mCharsetSource < kCharsetFromChannel) {
// Sniff for XML declaration only.
MOZ_ASSERT(!mLookingForXmlDeclarationForXmlViewSource);
MOZ_ASSERT(!aEof);
MOZ_ASSERT(!mLookingForMetaCharset);
MOZ_ASSERT(!mDecodingLocalFileWithoutTokenizing);
// Maybe we've already buffered a '>'.
MOZ_ASSERT(!mBufferedBytes.IsEmpty(),
"How did at least <? not get buffered?");
Buffer<uint8_t>& first = mBufferedBytes[0];
const Encoding* encoding =
xmldecl_parse(first.Elements(), first.Length());
if (encoding) {
mEncoding = WrapNotNull(encoding);
mCharsetSource = kCharsetFromXmlDeclaration;
} else if (memchr(first.Elements(), '>', first.Length())) {
// There was a '>', but an encoding still wasn't found.
; // fall through to commit to the UTF-8 default.
} else if (size_t lengthOfPrefix =
LengthOfLtContainingPrefixInSecondBuffer()) {
// This can only happen if the first buffer was a lone '<', because
// we come here upon seeing the second byte '?' if the first two bytes
// were "<?". That is, the only way how we aren't dealing with the first
// buffer is if the first buffer only contained a single '<' and we are
// dealing with the second buffer that starts with '?'.
MOZ_ASSERT(first.Length() == 1);
MOZ_ASSERT(mBufferedBytes[1][0] == '?');
// Our scanner for XML declaration-like syntax wants to see a contiguous
// buffer, so let's linearize the data. (Ideally, the XML declaration
// scanner would be incremental, but this is the rare path anyway.)
Vector<uint8_t> contiguous;
if (!contiguous.append(first.Elements(), first.Length())) {
MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
return NS_ERROR_OUT_OF_MEMORY;
}
if (!contiguous.append(mBufferedBytes[1].Elements(), lengthOfPrefix)) {
MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
return NS_ERROR_OUT_OF_MEMORY;
}
encoding = xmldecl_parse(contiguous.begin(), contiguous.length());
if (encoding) {
mEncoding = WrapNotNull(encoding);
mCharsetSource = kCharsetFromXmlDeclaration;
}
// else no XML decl, commit to the UTF-8 default.
} else {
// No '>' buffered yet: keep buffering and return without setting up
// decoding.
MOZ_ASSERT(mBufferingBytes);
mLookingForXmlDeclarationForXmlViewSource = true;
return NS_OK;
}
} else if (mMode != VIEW_SOURCE_XML &&
(mForceAutoDetection || mCharsetSource < kCharsetFromChannel)) {
// In order to use the buffering logic for meta with mForceAutoDetection,
// we set mLookingForMetaCharset but still actually potentially ignore the
// meta.
mFirstBufferOfMetaScan = mFirstBuffer;
MOZ_ASSERT(mLookingForMetaCharset);
if (mMode == VIEW_SOURCE_HTML) {
auto r = mTokenizer->FlushViewSource();
if (r.isErr()) {
return r.unwrapErr();
}
}
auto r = mTreeBuilder->Flush();
if (r.isErr()) {
return r.unwrapErr();
}
// Encoding committer flushes the ops on the main thread.
// The lock is held until the end of this branch while the speculation is
// created, installed as the op sink and appended to mSpeculations.
mozilla::MutexAutoLock speculationAutoLock(mSpeculationMutex);
nsHtml5Speculation* speculation = new nsHtml5Speculation(
mFirstBuffer, mFirstBuffer->getStart(), mTokenizer->getLineNumber(),
mTokenizer->getColumnNumber(), mTreeBuilder->newSnapshot());
MOZ_ASSERT(!mFlushTimerArmed, "How did we end up arming the timer?");
if (mMode == VIEW_SOURCE_HTML) {
mTokenizer->SetViewSourceOpSink(speculation);
mTokenizer->StartViewSourceBodyContents();
} else {
MOZ_ASSERT(mMode != VIEW_SOURCE_XML);
mTreeBuilder->SetOpSink(speculation);
}
mSpeculations.AppendElement(speculation); // adopts the pointer
mSpeculating = true;
} else {
// The encoding is already settled: stop the meta scan and byte
// buffering.
mLookingForMetaCharset = false;
mBufferingBytes = false;
mDecodingLocalFileWithoutTokenizing = false;
if (mMode == VIEW_SOURCE_HTML) {
mTokenizer->StartViewSourceBodyContents();
}
}
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource, false);
// Decode the replayed prefix bytes first, then the whole current segment.
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
Span(prefix, prefixLength), aFromSegment);
}
// Sniffing is still in the middle of a candidate; wait for more bytes.
return NS_OK;
}
class AddContentRunnable : public Runnable {
public:
AddContentRunnable(const nsAString& aParserID, nsIURI* aURI,
Span<const char16_t> aData, bool aComplete)
: Runnable("AddContent") {
nsAutoCString spec;
aURI->GetSpec(spec);
mData.mUri.Construct(NS_ConvertUTF8toUTF16(spec));
mData.mParserID.Construct(aParserID);
mData.mContents.Construct(aData.Elements(), aData.Length());
mData.mComplete.Construct(aComplete);
}
NS_IMETHOD Run() override {
nsAutoString json;
if (!mData.ToJSON(json)) {
return NS_ERROR_FAILURE;
}
nsCOMPtr<nsIObserverService> obsService = services::GetObserverService();
if (obsService) {
obsService->NotifyObservers(nullptr, "devtools-html-content",
PromiseFlatString(json).get());
}
return NS_OK;
}
HTMLContent mData;
};
/**
 * Forwards newly decoded content to DevTools, if a URI to report to was
 * recorded. Empty data is not reported.
 *
 * @param aData the decoded UTF-16 code units; the runnable copies them.
 */
inline void nsHtml5StreamParser::OnNewContent(Span<const char16_t> aData) {
#ifdef DEBUG
  mStartedFeedingDevTools = true;
#endif
  // Nothing to do without a DevTools target; an empty chunk is not worth a
  // runnable.
  if (!mURIToSendToDevtools || aData.IsEmpty()) {
    return;
  }
  NS_DispatchToMainThread(new AddContentRunnable(mUUIDForDevtools,
                                                 mURIToSendToDevtools, aData,
                                                 /* aComplete */ false));
}
/**
 * Tells DevTools that the content is complete, if a URI to report to was
 * recorded, and then forgets that URI so that nothing further is reported.
 */
inline void nsHtml5StreamParser::OnContentComplete() {
#ifdef DEBUG
  mStartedFeedingDevTools = true;
#endif
  if (!mURIToSendToDevtools) {
    return;
  }
  // The completion notification carries no data.
  NS_DispatchToMainThread(new AddContentRunnable(
      mUUIDForDevtools, mURIToSendToDevtools, Span<const char16_t>(),
      /* aComplete */ true));
  mURIToSendToDevtools = nullptr;
}
/**
 * Decodes aFromSegment to UTF-16 into the chain of buffers ending at
 * mLastBuffer, appending new buffers whenever the current one fills up.
 *
 * Decoded output is reported through OnNewContent() unless the parser is
 * still looking for a meta charset or decoding a local file without
 * tokenizing. The first decoding error is recorded in mHasHadErrors.
 *
 * Once all input is consumed, and the local-file buffer has reached exactly
 * LOCAL_FILE_UTF_8_BUFFER_SIZE bytes while decoding a local file without
 * tokenizing, the buffered bytes are fed to the detector and the file is
 * either re-decoded in the guessed encoding or committed to the current one.
 *
 * @param aFromSegment the bytes to decode.
 * @return NS_OK on success; NS_ERROR_NULL_POINTER if mLastBuffer is null;
 *         NS_ERROR_OUT_OF_MEMORY if a new buffer cannot be allocated; or the
 *         failure from re-decoding / committing the local file.
 *
 * Parser thread only, with the tokenizer mutex held (both asserted).
 */
nsresult nsHtml5StreamParser::WriteStreamBytes(
    Span<const uint8_t> aFromSegment) {
  NS_ASSERTION(IsParserThread(), "Wrong thread!");
  mTokenizerMutex.AssertCurrentThreadOwns();
  // mLastBuffer should always point to a buffer of the size
  // READ_BUFFER_SIZE.
  if (!mLastBuffer) {
    NS_WARNING("mLastBuffer should not be null!");
    MarkAsBroken(NS_ERROR_NULL_POINTER);
    return NS_ERROR_NULL_POINTER;
  }

  size_t totalRead = 0;
  auto remaining = aFromSegment;
  while (true) {
    auto tail = mLastBuffer->TailAsSpan(READ_BUFFER_SIZE);
    auto [result, read, written, hadErrors] =
        mUnicodeDecoder->DecodeToUTF16(remaining, tail, false);

    const bool holdingOutputBack =
        mLookingForMetaCharset || mDecodingLocalFileWithoutTokenizing;
    if (!holdingOutputBack) {
      OnNewContent(tail.To(written));
    }

    if (hadErrors && !mHasHadErrors) {
      mHasHadErrors = true;
      if (mEncoding == UTF_8_ENCODING) {
        mTreeBuilder->TryToEnableEncodingMenu();
      }
    }

    remaining = remaining.From(read);
    totalRead += read;
    mLastBuffer->AdvanceEnd(written);

    if (result == kOutputFull) {
      // The current buffer is full: chain a fresh one and keep decoding.
      RefPtr<nsHtml5OwningUTF16Buffer> fresh =
          nsHtml5OwningUTF16Buffer::FalliblyCreate(READ_BUFFER_SIZE);
      if (!fresh) {
        MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
        return NS_ERROR_OUT_OF_MEMORY;
      }
      mLastBuffer->next = std::move(fresh);
      mLastBuffer = mLastBuffer->next;
      continue;
    }

    // All input has been consumed.
    MOZ_ASSERT(totalRead == aFromSegment.Length(),
               "The Unicode decoder consumed the wrong number of bytes.");
    (void)totalRead;

    const bool localFileBufferFull =
        !mLookingForMetaCharset && mDecodingLocalFileWithoutTokenizing &&
        mNumBytesBuffered == LOCAL_FILE_UTF_8_BUFFER_SIZE;
    if (!localFileBufferFull) {
      return NS_OK;
    }

    MOZ_ASSERT(!mStartedFeedingDetector);
    for (auto&& buffer : mBufferedBytes) {
      FeedDetector(buffer);
    }
    // If the file is exactly LOCAL_FILE_UTF_8_BUFFER_SIZE bytes long
    // we end up not considering the EOF. That's not fatal, since we
    // don't consider the EOF if the file is
    // LOCAL_FILE_UTF_8_BUFFER_SIZE + 1 bytes long.
    auto [encoding, source] = GuessEncoding(true);
    mCharsetSource = source;

    nsresult status;
    if (encoding != mEncoding) {
      mEncoding = encoding;
      status = ReDecodeLocalFile();
    } else {
      MOZ_ASSERT(mEncoding == UTF_8_ENCODING);
      status = CommitLocalFileToEncoding();
    }
    if (NS_FAILED(status)) {
      return status;
    }
    return NS_OK;
  }
}
[[nodiscard]] nsresult nsHtml5StreamParser::ReDecodeLocalFile() {
MOZ_ASSERT(mDecodingLocalFileWithoutTokenizing && !mLookingForMetaCharset);