Source code

Revision control

Copy as Markdown

Other Tools

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
// @ts-check
/**
 * Extracts the text of a document as a single string.
 *
 * The document's body is split into blocks (elements or text nodes). The trimmed text
 * of each block is appended to the result, and every non-empty piece is preceded by a
 * newline character, so a non-empty result always begins with "\n".
 *
 * @param {Document} document
 * @returns {string}
 */
export function extractTextFromDOM(document) {
  /** @type {string[]} */
  const pieces = [];

  for (const block of subdivideNodeIntoBlocks(document.body)) {
    const htmlElement = asHTMLElement(block);
    // HTML elements contribute their innerText, text nodes contribute their nodeValue,
    // and any other kind of block contributes nothing.
    const rawText = htmlElement
      ? htmlElement.innerText
      : (asTextNode(block)?.nodeValue ?? "");

    const trimmed = rawText.trim();
    if (trimmed) {
      pieces.push(`\n${trimmed}`);
    }
  }

  return pieces.join("");
}
/**
 * Upper-cased tag names of the elements that are left out of text extraction.
 */
const CONTENT_EXCLUDED_TAGS = new Set([
  // TODO - We should add this and write some tests.
  "CODE",

  // Deprecated tags.
  "DIR",
  "APPLET",

  // Embedded content, which is not supported (yet).
  "MATH",
  "EMBED",
  "OBJECT",
  "IFRAME",

  // An SVG tag that can contain arbitrary XML, so it is ignored.
  "METADATA",

  // Firefox treats these elements as opaque, which causes their innerHTML property to
  // be just the raw text node behind it. Any text that is sent as HTML must be valid,
  // and there is no guarantee that this innerHTML is valid.
  "NOSCRIPT",
  "NOEMBED",
  "NOFRAMES",

  // The HEAD tag is not parsed.
  "HEAD",

  // Tags that are not user-visible.
  "STYLE",
  "SCRIPT",
  "TEMPLATE",
]);

/**
 * The excluded tag names joined with commas, in the order they appear in the set above.
 */
const CONTENT_EXCLUDED_NODE_SELECTOR = Array.from(CONTENT_EXCLUDED_TAGS).join(",");
/**
 * Looks up the ShadowRoot attached to a node through the chrome-only
 * `openOrClosedShadowRoot` API. This makes it possible to extract content from
 * WebComponents, which is not normally feasible in non-privileged contexts.
 *
 * @param {Node} node
 *
 * @returns {ShadowRoot | null} The shadow root, or null when the node is not an
 *   element or has no shadow root.
 */
function getShadowRoot(node) {
  const host = asElement(node);
  if (!host) {
    return null;
  }
  return host.openOrClosedShadowRoot ?? null;
}
/**
 * Determines if a node is ready for text extraction, or if it should be subdivided
 * further. It doesn't check if the node has already been processed. This is done
 * at the block level.
 *
 * @param {Node} node
 * @returns {number} - NodeFilter acceptance status.
 */
function determineBlockStatus(node) {
  if (!node) {
    return NodeFilter.FILTER_REJECT;
  }

  // Shadow hosts are always accepted, even before the exclusion checks below.
  if (getShadowRoot(node)) {
    return NodeFilter.FILTER_ACCEPT;
  }

  // This node is explicitly excluded.
  if (isExcludedNode(node)) {
    return NodeFilter.FILTER_REJECT;
  }

  // A node that contains excluded elements and has no direct text of its own is only
  // a wrapper. Skip it, and dig deeper into its tree to cut off smaller pieces.
  const wrapsExcludedContent =
    containsExcludedNode(node, CONTENT_EXCLUDED_NODE_SELECTOR) &&
    !hasNonWhitespaceTextNodes(node);

  // A node that needs subdividing is presumed to be a wrapper of block elements, so it
  // is skipped as well in favor of its smaller pieces.
  if (wrapsExcludedContent || nodeNeedsSubdividing(node)) {
    return NodeFilter.FILTER_SKIP;
  }

  // This textContent call is fairly expensive.
  const hasText = Boolean(node.textContent?.trim().length);
  if (hasText) {
    // This node can be treated as an entire block and is ready for text extraction.
    return NodeFilter.FILTER_ACCEPT;
  }

  // Do not use subtrees that are empty of text.
  return node.hasChildNodes()
    ? NodeFilter.FILTER_SKIP
    : NodeFilter.FILTER_REJECT;
}
/**
 * Determines whether a node has to be split into smaller pieces before its text can
 * be extracted.
 *
 * Only elements can need subdividing. An element needs it when any of its child nodes:
 *  - is an element that is block-like (see `getIsBlockLike`),
 *  - is an element that itself needs subdividing, or
 *  - is neither a text node nor an element.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function nodeNeedsSubdividing(node) {
  const element = asElement(node);
  if (!element) {
    // Only elements need to be further subdivided.
    return false;
  }

  for (const child of element.childNodes) {
    if (!child) {
      continue;
    }

    const { nodeType } = child;
    if (nodeType === Node.TEXT_NODE) {
      // Text never forces a subdivision, keep checking the remaining children.
      continue;
    }
    if (nodeType !== Node.ELEMENT_NODE) {
      // Any other kind of child node forces a subdivision.
      return true;
    }
    // A block-like child needs further subdividing, and a non-block-like child may
    // still contain other block-like nodes.
    if (getIsBlockLike(child) || nodeNeedsSubdividing(child)) {
      return true;
    }
  }

  return false;
}
/**
 * Returns true if an HTML element is hidden based on factors such as collapsed state and
 * computed style, otherwise false.
 *
 * The checks run from cheapest to most expensive: the `hidden` property, closed
 * <details> ancestors, the layout box, and finally the computed style.
 *
 * @param {HTMLElement} element
 * @returns {boolean}
 */
function isHTMLElementHidden(element) {
  // This is a cheap and easy check that will not compute style or force reflow.
  if (element.hidden) {
    // The element is explicitly hidden.
    return true;
  }

  // Handle open/closed <details> elements. This will also not compute style or force reflow.
  //
  // The search starts at the parent, because Element.closest() also matches the element
  // it is called on, and a closed <details> is not itself hidden: its summary is still shown.
  // Every closed <details> ancestor is checked, since an outer one can hide the element
  // even when it sits in the summary of an inner one.
  const closedDetailsSelector = "details:not([open])";
  for (
    let closedDetails = element.parentElement?.closest(closedDetailsSelector);
    closedDetails;
    closedDetails = closedDetails.parentElement?.closest(closedDetailsSelector)
  ) {
    // Only the first <summary> child of a <details> is its summary, which is always
    // visible, even when closed.
    const summary = closedDetails.querySelector(":scope > summary");
    if (!summary?.contains(element)) {
      // The element is within a closed <details> and is not part of its <summary>,
      // therefore it is not visible.
      return true;
    }
  }

  // This forces reflow, which has a performance cost, but this is also what JQuery uses for its :hidden and :visible.
  const hasLayoutBox =
    element.offsetWidth ||
    element.offsetHeight ||
    element.getClientRects().length;
  if (!hasLayoutBox) {
    return true;
  }

  const { ownerGlobal } = element;
  if (!ownerGlobal) {
    // We cannot compute the style without ownerGlobal, so we will assume it is not visible.
    return true;
  }

  // This flushes the style, which is a performance cost.
  const style = ownerGlobal.getComputedStyle(element);
  if (!style) {
    // We were unable to compute the style, so we will assume it is not visible.
    return true;
  }

  // This is an issue with the DOM library generation.
  // @ts-expect-error Property 'display' does not exist on type 'CSSStyleDeclaration'.ts(2339)
  const { display, visibility, opacity } = style;

  return (
    display === "none" ||
    visibility === "hidden" ||
    visibility === "collapse" ||
    opacity === "0"
  );
}
/**
 * Decides whether a single node is excluded from text extraction. Text nodes are never
 * excluded, elements are excluded when their tag is in `CONTENT_EXCLUDED_TAGS`, and
 * every other kind of node is always excluded.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function isExcludedNode(node) {
  if (node.nodeType === Node.TEXT_NODE) {
    // Text nodes are never excluded.
    return false;
  }

  const element = asElement(node);
  if (!element) {
    // Only elements and text nodes should be considered.
    return true;
  }

  // SVG tags can be lowercased, so ensure the name is uppercased before the lookup.
  const tagName = element.nodeName.toUpperCase();
  return CONTENT_EXCLUDED_TAGS.has(tagName);
}
/**
 * Like `isExcludedNode`, but checks whether anything below the node matches the
 * selector. Used to see whether we can consider a subtree as a whole, or whether we
 * should split it into smaller branches first to try to exclude more of the content.
 *
 * @param {Node} node
 * @param {string} excludedNodeSelector
 *
 * @returns {boolean} Always false for nodes that are not elements.
 */
function containsExcludedNode(node, excludedNodeSelector) {
  const element = asElement(node);
  if (!element) {
    return false;
  }
  const firstExcluded = element.querySelector(excludedNodeSelector);
  return Boolean(firstExcluded);
}
/**
 * Tests whether any of the direct child text nodes of a node are non-whitespace text
 * nodes. Text nested inside child elements is not considered.
 *
 * For example:
 *  - `<p>test</p>`: yes
 *  - `<p> </p>`: no
 *  - `<p><b>test</b></p>`: no
 *
 * @param {Node} node
 *
 * @returns {boolean}
 */
function hasNonWhitespaceTextNodes(node) {
  if (node.nodeType !== Node.ELEMENT_NODE) {
    // Only check element nodes.
    return false;
  }

  for (const child of node.childNodes) {
    // Children that are not text nodes, and text nodes that hold only whitespace, are
    // passed over.
    if (asTextNode(child)?.textContent?.trim()) {
      // A text node with content was found.
      return true;
    }
  }

  // No text nodes with content were found.
  return false;
}
/**
 * Start walking down through a node's subtree and decide which nodes to extract content
 * from. This first node is the root of the page.
 *
 * The nodes go through a process of subdivision until an appropriate sized chunk
 * of inline text can be found.
 *
 * @param {Node} node
 * @returns {Set<Node>} The nodes whose text should be extracted. The set is empty when
 *   the root is rejected, or is accepted as a block but hidden.
 */
function subdivideNodeIntoBlocks(node) {
  /** @type {Set<Node>} */
  const blocks = new Set();
  const status = determineBlockStatus(node);

  if (status === NodeFilter.FILTER_SKIP) {
    // This node may have text to extract, but it needs to be subdivided into smaller
    // pieces. Walk the subtree to find the subtrees/nodes that contain enough inline
    // elements to extract.
    processSubdivide(node, blocks);
    return blocks;
  }

  if (status !== NodeFilter.FILTER_ACCEPT) {
    // This node is rejected as it shouldn't be used for text extraction.
    return blocks;
  }

  // An accepted node is either a shadow host or a block element.
  const shadowRoot = getShadowRoot(node);
  if (shadowRoot) {
    processSubdivide(shadowRoot, blocks);
    return blocks;
  }

  const element = asHTMLElement(node);
  const isHidden = Boolean(element && isHTMLElementHidden(element));
  if (!isHidden && noAncestorsAdded(node, blocks)) {
    blocks.add(node);
  }

  return blocks;
}
/**
 * Add qualified nodes to have their text content extracted by recursively walking
 * through the DOM tree of nodes, including elements in the Shadow DOM.
 *
 * Nothing is added when the node has no `ownerDocument`.
 *
 * @param {Node} node - The root of the subtree to walk.
 * @param {Set<Node>} blocks - Collects the nodes to extract text from. It is mutated.
 */
function processSubdivide(node, blocks) {
  const doc = node.ownerDocument;
  if (!doc) {
    return;
  }

  // The walker only yields nodes that `determineBlockStatus` accepts, which are the
  // nodes that have been subdivided enough to have their text extracted.
  const walker = doc.createTreeWalker(
    node,
    NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
    determineBlockStatus
  );

  for (
    let current = walker.nextNode();
    current;
    current = walker.nextNode()
  ) {
    const shadowRoot = getShadowRoot(current);
    if (shadowRoot) {
      // A shadow host contributes the contents of its shadow tree rather than itself.
      processSubdivide(shadowRoot, blocks);
      continue;
    }
    if (noAncestorsAdded(current, blocks)) {
      blocks.add(current);
    }
  }
}
/**
 * Checks that none of a node's ancestors, as yielded by `getAncestorsIterator`, has
 * already been added to the blocks.
 *
 * TODO - The original TranslationsDocument algorithm didn't require this, so perhaps
 * something was not ported correctly. This should be removed to see if the error
 * can be reproduced, and this mitigation removed.
 *
 * @param {Node} node
 * @param {Set<Node>} blocks
 * @returns {boolean} False as soon as an ancestor is found in `blocks`, otherwise true.
 */
function noAncestorsAdded(node, blocks) {
  let isAncestorAdded = false;
  for (const ancestor of getAncestorsIterator(node)) {
    if (blocks.has(ancestor)) {
      isAncestorAdded = true;
      break;
    }
  }
  return !isAncestorAdded;
}
/**
 * Returns an iterator of a node's ancestors, starting with its parent and moving up.
 * Iteration stops when there is no further parent or when the owner document's
 * `documentElement` is reached, which is itself not yielded. Nothing is yielded for a
 * node without an `ownerDocument`.
 *
 * @param {Node} node
 *
 * @yields {Node}
 */
function* getAncestorsIterator(node) {
  const { ownerDocument } = node;
  if (!ownerDocument) {
    return;
  }

  let ancestor = node.parentNode;
  while (ancestor && ancestor !== ownerDocument.documentElement) {
    yield ancestor;
    ancestor = ancestor.parentNode;
  }
}
/**
 * Reads the element's computed style and determines if the element is a block-like
 * element or not. Every element that lays out like a block should be used as a unit
 * for text extraction.
 *
 * An element counts as block-like when its computed `display` is anything other than
 * "inline" or "none". SVG elements always count as block-like. Nodes that are not
 * elements, and elements without an `ownerGlobal`, never do.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function getIsBlockLike(node) {
  const element = asElement(node);
  const ownerGlobal = element?.ownerGlobal;
  if (!element || !ownerGlobal) {
    return false;
  }

  const isSVG = element.namespaceURI === "http://www.w3.org/2000/svg";
  if (isSVG) {
    // SVG elements will report as inline, but there is no block layout in SVG.
    // Treat every SVG element as being block so that every node will be subdivided.
    return true;
  }

  /** @type {Record<string, string>} */
  // @ts-expect-error - This is a workaround for the CSSStyleDeclaration not being indexable.
  const style = ownerGlobal.getComputedStyle(element) ?? { display: null };

  const { display } = style;
  return !(display === "inline" || display === "none");
}
/**
 * Narrows a Node to an Element for TypeScript, based on its `nodeType`.
 *
 * @param {Node | null | undefined} node
 * @returns {Element | null} The same node when it is an element, otherwise null.
 */
function asElement(node) {
  const isElement = node?.nodeType === Node.ELEMENT_NODE;
  return isElement ? /** @type {Element} */ (node) : null;
}
/**
 * Narrows a Node to a Text node for TypeScript, based on its `nodeType`.
 *
 * @param {Node | null} node
 *
 * @returns {Text | null} The same node when it is a text node, otherwise null.
 */
function asTextNode(node) {
  const isText = node?.nodeType === Node.TEXT_NODE;
  return isText ? /** @type {Text} */ (node) : null;
}
/**
 * Narrows a Node to an HTMLElement for TypeScript, using `HTMLElement.isInstance`.
 *
 * @param {Node | null} node
 *
 * @returns {HTMLElement | null} The same node when it is an HTMLElement, otherwise null.
 */
function asHTMLElement(node) {
  return HTMLElement.isInstance(node) ? node : null;
}