Source code
Revision control
Copy as Markdown
Other Tools
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
#include "mozilla/widget/nsGtkHtmlUtils.h"
#include "mozilla/Encoding.h"
#include "mozilla/Logging.h"
#include "nsReadableUtils.h"
using namespace mozilla;
static mozilla::LazyLogModule sGtkHtmlLog("GtkHtmlUtils");
#define GTK_HTML_LOG(...) \
MOZ_LOG(sGtkHtmlLog, mozilla::LogLevel::Debug, (__VA_ARGS__))
namespace mozilla::widget {
/*
* This function extracts the encoding label from the subset of HTML internal
* encoding declaration syntax that uses the old long form with double quotes
* and without spaces around the equals sign between the "content" attribute
* name and the attribute value.
*
* This was added for the sake of an ancient version of StarOffice
* non-UTF-8 encodings is still necessary and if this function
* still needs to exist.
*
* As of December 2022, both Gecko and LibreOffice emit an UTF-8
* declaration that this function successfully extracts "UTF-8" from,
* but that's also the default that we fall back on if this function
* fails to extract a label.
*/
bool GetHTMLCharset(Span<const char> aData, nsCString& aFoundCharset) {
// Assume ASCII first to find "charset" info
const nsDependentCSubstring htmlStr(aData);
nsACString::const_iterator start, end;
htmlStr.BeginReading(start);
htmlStr.EndReading(end);
nsACString::const_iterator valueStart(start), valueEnd(start);
if (CaseInsensitiveFindInReadable("CONTENT=\"text/html;"_ns, start, end)) {
start = end;
htmlStr.EndReading(end);
if (CaseInsensitiveFindInReadable("charset="_ns, start, end)) {
valueStart = end;
start = end;
htmlStr.EndReading(end);
if (FindCharInReadable('"', start, end)) {
valueEnd = start;
}
}
}
// find "charset" in HTML
if (valueStart != valueEnd) {
aFoundCharset = Substring(valueStart, valueEnd);
ToUpperCase(aFoundCharset);
return true;
}
return false;
}
bool DecodeHTMLData(Span<const char> aData, nsString& aOutDecoded) {
nsAutoCString charset;
if (!GetHTMLCharset(aData, charset)) {
GTK_HTML_LOG("DecodeHTMLData: charset not found, falling back to utf-8");
charset.AssignLiteral("utf-8");
} else {
GTK_HTML_LOG("DecodeHTMLData: detected charset %s", charset.get());
}
auto encoding = Encoding::ForLabelNoReplacement(charset);
if (!encoding) {
GTK_HTML_LOG("DecodeHTMLData: unrecognised charset label \"%s\"",
charset.get());
return false;
}
// Per spec, UTF-16BE/LE in HTML should be treated as UTF-8.
if (encoding == UTF_16LE_ENCODING || encoding == UTF_16BE_ENCODING) {
GTK_HTML_LOG("DecodeHTMLData: UTF-16LE/BE overridden to UTF-8 per spec");
encoding = UTF_8_ENCODING;
}
// Remove kHTMLMarkupPrefix again, it won't necessarily cause any
// issues, but might confuse other users.
const size_t prefixLen = std::size(kHTMLMarkupPrefix) - 1;
if (aData.Length() >= prefixLen && nsDependentCSubstring(aData.To(prefixLen))
.EqualsLiteral(kHTMLMarkupPrefix)) {
aData = aData.From(prefixLen);
}
auto [rv, enc] = encoding->Decode(AsBytes(aData), aOutDecoded);
if (enc != UTF_8_ENCODING &&
MOZ_LOG_TEST(sGtkHtmlLog, mozilla::LogLevel::Debug)) {
nsCString decoderName;
enc->Name(decoderName);
GTK_HTML_LOG("DecodeHTMLData: expected UTF-8 decoder but got %s",
decoderName.get());
}
if (NS_FAILED(rv)) {
GTK_HTML_LOG("DecodeHTMLData: decoding failed");
}
return NS_SUCCEEDED(rv);
}
} // namespace mozilla::widget
#undef GTK_HTML_LOG