DOMExtractor.sys.mjs

firefox-main/toolkit/components/pageextractor/DOMExtractor.sys.mjs (file symbol)

Enable keyboard shortcuts

Source code

File a bug in Core :: Machine Learning

Revision control

Copy as Markdown

Other Tools

/* This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */

// @ts-check

/**
 * Extracts the text of a document by subdividing its body into blocks and
 * concatenating the trimmed text of every block.
 *
 * Each block that still has text after trimming is appended as "\n" followed by
 * that text, so a non-empty result always starts with a newline. Blocks that are
 * neither an HTMLElement nor a text node, or that are empty after trimming, are
 * skipped.
 *
 * @param {Document} document
 * @returns {string} The extracted text, or "" when no block yields any text.
 */
export function extractTextFromDOM(document) {
  const blocks = subdivideNodeIntoBlocks(document.body);

  let textContent = "";
  for (const block of blocks) {
    let innerText = "";
    const element = asHTMLElement(block);
    const text = asTextNode(block);

    if (element) {
      // Elements contribute their rendered text.
      innerText = element.innerText.trim();
    } else if (text?.nodeValue) {
      // Bare text nodes contribute their raw value.
      innerText = text.nodeValue.trim();
    }

    if (innerText) {
      textContent += "\n" + innerText;
    }
  }

  return textContent;
}

/**
 * Upper-cased tag names whose elements are left out of text extraction.
 */
const CONTENT_EXCLUDED_TAGS = new Set([
  // TODO - We should add this and write some tests.
  // NOTE(review): the tag is already listed even though the TODO says it "should"
  // be added; confirm whether only the tests are still outstanding.
  "CODE",

  // Deprecated tags.
  "DIR",
  "APPLET",

  // Embedded elements, which are not supported (yet).
  "MATH",
  "EMBED",
  "OBJECT",
  "IFRAME",

  // An SVG tag that can hold arbitrary XML, so it is ignored.
  "METADATA",

  // Firefox treats these elements as opaque, so their innerHTML is only the raw
  // text node behind them. Text sent as HTML must be valid, and nothing guarantees
  // that this innerHTML is.
  "NOSCRIPT",
  "NOEMBED",
  "NOFRAMES",

  // The HEAD tag is never parsed.
  "HEAD",

  // Tags that are not visible to the user.
  "STYLE",
  "SCRIPT",
  "TEMPLATE",
]);

/**
 * The excluded tag names joined into one comma-separated selector list, in the
 * same order as the set above.
 */
const CONTENT_EXCLUDED_NODE_SELECTOR = Array.from(CONTENT_EXCLUDED_TAGS).join(
  ","
);

/**
 * Get the ShadowRoot from the chrome-only openOrClosedShadowRoot API.
 *
 * This allows for extracting the content from WebComponents, which is not
 * normally feasible in non-privileged contexts.
 *
 * @param {Node} node
 * @returns {ShadowRoot | null} The shadow root, or null when the node is not an
 *   element or has no shadow root.
 */
function getShadowRoot(node) {
  return asElement(node)?.openOrClosedShadowRoot ?? null;
}

/**
 * Determines if a node is ready for text extraction, or if it should be subdivided
 * further. It doesn't check if the node has already been processed. This is done
 * at the block level.
 *
 * The checks run in this order, and the first match wins:
 *   1. A missing node is rejected.
 *   2. A shadow host is accepted (the caller descends into its shadow root).
 *   3. An excluded node is rejected.
 *   4. A node that contains an excluded descendant and has no non-whitespace text
 *      of its own is skipped so its children are examined individually.
 *   5. A node that wraps block-like content is skipped.
 *   6. A node with no non-whitespace text is rejected when it has no children,
 *      otherwise skipped.
 *   7. Anything else is accepted as a block.
 *
 * @param {Node} node
 * @returns {number} - NodeFilter acceptance status.
 */
function determineBlockStatus(node) {
  if (!node) {
    return NodeFilter.FILTER_REJECT;
  }

  if (getShadowRoot(node)) {
    return NodeFilter.FILTER_ACCEPT;
  }

  if (isExcludedNode(node)) {
    // This node is explicitly excluded.
    return NodeFilter.FILTER_REJECT;
  }

  if (
    containsExcludedNode(node, CONTENT_EXCLUDED_NODE_SELECTOR) &&
    !hasNonWhitespaceTextNodes(node)
  ) {
    // Skip this node, and dig deeper into its tree to cut off smaller pieces to extract.
    return NodeFilter.FILTER_SKIP;
  }

  if (nodeNeedsSubdividing(node)) {
    // Skip this node, and dig deeper into its tree to cut off smaller pieces
    // to extract. It is presumed to be a wrapper of block elements.
    return NodeFilter.FILTER_SKIP;
  }

  // This textContent call is fairly expensive.
  if (!node.textContent?.trim().length) {
    // Do not use subtrees that are empty of text.
    return !node.hasChildNodes()
      ? NodeFilter.FILTER_REJECT
      : NodeFilter.FILTER_SKIP;
  }

  // This node can be treated as entire block and is ready for text extraction.
  return NodeFilter.FILTER_ACCEPT;
}

/**
 * Determine whether a node wraps block-level content and therefore has to be
 * split into smaller pieces before its text can be extracted.
 *
 * The node's own layout is not inspected; only its descendants are. The answer
 * is true as soon as a child is block-like, a child element itself needs
 * subdividing, or a child is neither a text node nor an element.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function nodeNeedsSubdividing(node) {
  const element = asElement(node);
  if (!element) {
    // Only elements need to be further subdivided.
    return false;
  }

  for (const childNode of element.childNodes) {
    if (!childNode) {
      continue;
    }

    switch (childNode.nodeType) {
      case Node.TEXT_NODE: {
        // Keep checking for more inline or text nodes.
        continue;
      }
      case Node.ELEMENT_NODE: {
        // Either the child is a block node itself, or it is a non-block-like node
        // that may contain other block-like nodes.
        if (getIsBlockLike(childNode) || nodeNeedsSubdividing(childNode)) {
          return true;
        }
        // Keep checking for more inline or text nodes.
        continue;
      }
      default: {
        // Any other kind of child node forces subdivision.
        return true;
      }
    }
  }

  return false;
}

/**
 * Returns true if an HTML element is hidden based on factors such as collapsed state and
 * computed style, otherwise false.
 *
 * The checks are ordered from cheapest to most expensive: the `hidden` property,
 * closed <details> ancestry, layout box size (forces reflow), and finally the
 * computed style (flushes style). When the style cannot be obtained the element
 * is assumed to be hidden.
 *
 * @param {HTMLElement} element
 * @returns {boolean}
 */
function isHTMLElementHidden(element) {
  // This is a cheap and easy check that will not compute style or force reflow.
  if (element.hidden) {
    // The element is explicitly hidden.
    return true;
  }

  // Handle open/closed <details> elements. This will also not compute style or force reflow.
  // https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/details
  if (
    // The element is within a closed <details>
    element.closest("details:not([open])") &&
    // The element is not part of the <summary> of the <details>, which is always visible, even when closed.
    !element.closest("summary")
  ) {
    // The element is within a closed <details> and is not part of the <summary>, therefore it is not visible.
    return true;
  }

  // This forces reflow, which has a performance cost, but this is also what JQuery uses for its :hidden and :visible.
  // https://github.com/jquery/jquery/blob/bd6b453b7effa78b292812dbe218491624994526/src/css/hiddenVisibleSelectors.js#L1-L10
  if (
    !(
      element.offsetWidth ||
      element.offsetHeight ||
      element.getClientRects().length
    )
  ) {
    return true;
  }

  const { ownerGlobal } = element;
  if (!ownerGlobal) {
    // We cannot compute the style without ownerGlobal, so we will assume it is not visible.
    return true;
  }

  // This flushes the style, which is a performance cost.
  const style = ownerGlobal.getComputedStyle(element);
  if (!style) {
    // We were unable to compute the style, so we will assume it is not visible.
    return true;
  }

  // This is an issue with the DOM library generation.
  // @ts-expect-error Property 'display' does not exist on type 'CSSStyleDeclaration'.ts(2339)
  const { display, visibility, opacity } = style;

  return (
    display === "none" ||
    visibility === "hidden" ||
    visibility === "collapse" ||
    opacity === "0"
  );
}

/**
 * Determines whether a single node is excluded from text extraction.
 *
 * Text nodes are never excluded. Nodes that are neither text nodes nor elements
 * are always excluded. Elements are excluded when their tag name, compared
 * case-insensitively, is listed in CONTENT_EXCLUDED_TAGS.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function isExcludedNode(node) {
  // Property access can be expensive, so destructure required properties so they
  // are not accessed multiple times.
  const { nodeType } = node;

  if (nodeType === Node.TEXT_NODE) {
    // Text nodes are never excluded.
    return false;
  }

  const element = asElement(node);
  if (!element) {
    // Only elements and text nodes should be considered.
    return true;
  }

  const { nodeName } = element;

  // SVG tags can be lowercased, so ensure everything is uppercased before checking
  // whether this is an excluded tag.
  return CONTENT_EXCLUDED_TAGS.has(nodeName.toUpperCase());
}

/**
 * Like `isExcludedNode` but looks at the full subtree. Used to see whether
 * we can consider a subtree, or whether we should split it into smaller
 * branches first to try to exclude more of the content.
 *
 * Only descendants are matched by the selector; non-element nodes always yield
 * false.
 *
 * @param {Node} node
 * @param {string} excludedNodeSelector - Selector list matching excluded nodes.
 * @returns {boolean}
 */
function containsExcludedNode(node, excludedNodeSelector) {
  return Boolean(asElement(node)?.querySelector(excludedNodeSelector));
}

/**
 * Test whether any of the direct child text nodes of a node are non-whitespace
 * text nodes. Text nested inside child elements is not considered.
 *
 * For example:
 *   - `<p>test</p>`: yes
 *   - `<p> </p>`: no
 *   - `<p><b>test</b></p>`: no
 *
 * @param {Node} node
 * @returns {boolean}
 */
function hasNonWhitespaceTextNodes(node) {
  if (node.nodeType !== Node.ELEMENT_NODE) {
    // Only check element nodes.
    return false;
  }

  for (const child of node.childNodes) {
    // Non-text children and whitespace-only text nodes both come out falsy here.
    if (asTextNode(child)?.textContent?.trim()) {
      // A text node with content was found.
      return true;
    }
  }

  // No text nodes with content were found.
  return false;
}

/**
 * Start walking down through a node's subtree and decide which nodes to extract content
 * from. This first node is the root of the page.
 *
 * The nodes go through a process of subdivision until an appropriate sized chunk
 * of inline text can be found.
 *
 * @param {Node} node
 * @returns {Set<Node>} The nodes whose text should be extracted. Empty when the
 *   root is rejected, or when it is accepted as a block but is a hidden HTMLElement.
 */
function subdivideNodeIntoBlocks(node) {
  /** @type {Set<Node>} */
  const blocks = new Set();

  switch (determineBlockStatus(node)) {
    case NodeFilter.FILTER_REJECT: {
      // This node is rejected as it shouldn't be used for text extraction.
      return blocks;
    }

    // Either a shadow host or a block element
    case NodeFilter.FILTER_ACCEPT: {
      const shadowRoot = getShadowRoot(node);

      if (shadowRoot) {
        // Shadow hosts are never added themselves; their shadow tree is walked.
        processSubdivide(shadowRoot, blocks);
      } else {
        const element = asHTMLElement(node);
        if (element && isHTMLElementHidden(element)) {
          // A hidden root block contributes nothing.
          break;
        }
        if (noAncestorsAdded(node, blocks)) {
          blocks.add(node);
        }
      }
      break;
    }

    case NodeFilter.FILTER_SKIP: {
      // This node may have text to extract, but it needs to be subdivided into smaller
      // pieces. Create a TreeWalker to walk the subtree, and find the subtrees/nodes
      // that contain enough inline elements to extract.
      processSubdivide(node, blocks);
      break;
    }
  }

  return blocks;
}

/**
 * Add qualified nodes to have their text content extracted by recursively walking
 * through the DOM tree of nodes, including elements in the Shadow DOM.
 *
 * The subtree is walked with a TreeWalker filtered by `determineBlockStatus`.
 * Shadow hosts are not added themselves; their shadow root is walked instead.
 * Other nodes are added unless one of their ancestors is already in `blocks`.
 *
 * @param {Node} node - Root of the subtree to walk.
 * @param {Set<Node>} blocks - Mutated in place with the nodes to extract.
 */
function processSubdivide(node, blocks) {
  const { ownerDocument } = node;
  if (!ownerDocument) {
    // Without a document no TreeWalker can be created.
    return;
  }

  // This iterator will contain each node that has been subdivided enough to have its
  // text extracted.
  const nodeIterator = ownerDocument.createTreeWalker(
    node,
    NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
    determineBlockStatus
  );

  let currentNode;
  while ((currentNode = nodeIterator.nextNode())) {
    const shadowRoot = getShadowRoot(currentNode);
    if (shadowRoot) {
      processSubdivide(shadowRoot, blocks);
    } else if (noAncestorsAdded(currentNode, blocks)) {
      blocks.add(currentNode);
    }
  }
}

/**
 * Checks that none of a node's ancestors has already been added to the set of
 * blocks, so that the same text is not extracted twice.
 *
 * TODO - The original TranslationsDocument algorithm didn't require this, so perhaps
 * something was not ported correctly. This should be removed to see if the error
 * can be reproduced, and this mitigation removed.
 *
 * @param {Node} node
 * @param {Set<Node>} blocks
 * @returns {boolean} False if any ancestor is in `blocks`, otherwise true.
 */
function noAncestorsAdded(node, blocks) {
  for (const ancestor of getAncestorsIterator(node)) {
    if (blocks.has(ancestor)) {
      return false;
    }
  }
  return true;
}

/**
 * Returns an iterator of a node's ancestors, nearest first.
 *
 * Iteration follows `parentNode` and stops when there is no further parent or
 * when the owner document's root element is reached; that root element is not
 * yielded. Nothing is yielded for a node without an owner document.
 *
 * @param {Node} node
 * @yields {Node}
 */
function* getAncestorsIterator(node) {
  const document = node.ownerDocument;
  if (!document) {
    return;
  }

  for (
    let parent = node.parentNode;
    parent && parent !== document.documentElement;
    parent = parent.parentNode
  ) {
    yield parent;
  }
}

/**
 * Reads the element's computed style and determines if the element is a block-like
 * element or not. Every element that lays out like a block should be used as a unit
 * for text extraction.
 *
 * Any computed `display` other than "inline" or "none" counts as block-like, and
 * so does an element whose computed style is unavailable. Non-elements and
 * elements without an `ownerGlobal` are never block-like.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function getIsBlockLike(node) {
  const element = asElement(node);
  if (!element) {
    return false;
  }

  const { ownerGlobal } = element;
  if (!ownerGlobal) {
    return false;
  }

  if (element.namespaceURI === "http://www.w3.org/2000/svg") {
    // SVG elements will report as inline, but there is no block layout in SVG.
    // Treat every SVG element as being block so that every node will be subdivided.
    return true;
  }

  /** @type {Record<string, string>} */
  // @ts-expect-error - This is a workaround for the CSSStyleDeclaration not being indexable.
  const style = ownerGlobal.getComputedStyle(element) ?? { display: null };

  return style.display !== "inline" && style.display !== "none";
}

/**
 * Use TypeScript to determine if the Node is an Element.
 *
 * @param {Node | null | undefined} node
 * @returns {Element | null} The same node typed as an Element, or null when the
 *   node is missing or is not an element node.
 */
function asElement(node) {
  if (node?.nodeType === Node.ELEMENT_NODE) {
    return /** @type {Element} */ (node);
  }
  return null;
}

/**
 * Use TypeScript to determine if the Node is a Text node.
 *
 * @param {Node | null} node
 * @returns {Text | null} The same node typed as Text, or null when the node is
 *   missing or is not a text node.
 */
function asTextNode(node) {
  if (node?.nodeType === Node.TEXT_NODE) {
    return /** @type {Text} */ (node);
  }
  return null;
}

/**
 * Use TypeScript to determine if the Node is an HTMLElement.
 *
 * The check is delegated to `HTMLElement.isInstance`.
 *
 * @param {Node | null} node
 * @returns {HTMLElement | null} The same node when it is an HTMLElement,
 *   otherwise null.
 */
function asHTMLElement(node) {
  if (HTMLElement.isInstance(node)) {
    return node;
  }
  return null;
}