# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""
description:
    A CLI tool to extract perftest metadata from a Translations HTML test file.

example:
    ❯ python3 toolkit/components/translations/tests/scripts/translations-perf-data.py \\
        --page_path="toolkit/components/translations/tests/browser/translations-bencher-es.html" \\
        --model_path="~/Downloads/cab5e093-7b55-47ea-a247-9747cc0109e3.spm"

note:
    The vocab model file can be downloaded from the following page:
"""
import argparse
import sys
from pathlib import Path

import sentencepiece as spm
from bs4 import BeautifulSoup
from icu import BreakIterator, Locale


class CustomArgumentParser(argparse.ArgumentParser):
    """Custom argument parser to display help on errors."""

    def error(self, message):
        """Override error to display the help message."""
        print(f"\nerror: {message}\n", file=sys.stderr)
        self.print_help()
        sys.exit(2)


def parse_arguments() -> argparse.ArgumentParser:
    """Build the parser for the CLI arguments."""
    parser = CustomArgumentParser(
        description=__doc__,  # Use the module's docstring as the description.
        formatter_class=argparse.RawDescriptionHelpFormatter,  # Preserve the docstring's formatting.
    )
    parser.add_argument(
        "--page_path",
        required=True,
        type=Path,
        help="The HTML test file from which to extract perftest metadata.",
    )
    parser.add_argument(
        "--model_path",
        required=True,
        type=Path,
        help="The SentencePiece vocab model file for the test page's language.",
    )
    return parser


def extract_page_language(html_path: Path) -> str:
    """Extract the lang attribute from the HTML file."""
    with html_path.open("r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    lang = soup.find("html").get("lang")
    if not lang:
        raise ValueError(f"Language not specified in the HTML file at {html_path}.")

    return lang


def extract_body_text(page_language: str, html_path: Path) -> str:
    """Extract text content from the <body> element of an HTML file,
    ignoring sub-elements with a lang attribute not matching the page language."""
    with html_path.open("r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    body = soup.find("body")
    if body is None:
        raise ValueError(f"No <body> element found in the HTML file at {html_path}.")
    # Remove all sub-elements whose `lang` attribute does not match the page language.
    for element in body.find_all(attrs={"lang": True}):
        if element["lang"] != page_language:
            element.decompose()  # Remove the element and its children.

    return body.get_text()


def is_word_like(segment: str) -> bool:
    """Determine if a segment is word-like."""
    segment = segment.strip()

    if not segment:
        # A word-like segment should not be only whitespace.
        return False

    # A word-like segment should not be only punctuation.
    return any(char.isalnum() for char in segment)


def count_words(text: str, language: str) -> int:
    """Count the words in text using ICU BreakIterator."""
    locale = Locale(language)
    break_iterator = BreakIterator.createWordInstance(locale)
    break_iterator.setText(text)

    word_count = 0
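    # Walk consecutive boundary pairs: each slice text[lhs_boundary:rhs_boundary]
    # is one segment (a word, a run of whitespace, or punctuation), and only
    # word-like segments are counted.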
    lhs_boundary = break_iterator.first()
    rhs_boundary = break_iterator.nextBoundary()

    while rhs_boundary != BreakIterator.DONE:
        if is_word_like(text[lhs_boundary:rhs_boundary]):
            word_count += 1
        lhs_boundary = rhs_boundary
        rhs_boundary = break_iterator.nextBoundary()

    return word_count


def count_tokens(text: str, model_path: Path) -> int:
    """Count the tokens in the text using SentencePiece."""
    processor = spm.SentencePieceProcessor(model_file=str(model_path))
    return len(processor.encode(text))


def main() -> None:
    """Extract the perftest metadata from the test page and print it to stdout."""
    parser = parse_arguments()
    args = parser.parse_args()

    args.page_path = args.page_path.expanduser()
    args.model_path = args.model_path.expanduser()

    page_language = extract_page_language(args.page_path)
    body_text = extract_body_text(page_language, args.page_path)
    token_count = count_tokens(body_text, args.model_path)
    word_count = count_words(body_text, page_language)

    print()
    print(f'pageLanguage: "{page_language}",')
    print(f"tokenCount: {token_count},")
    print(f"wordCount: {word_count},")

    print("\n⏩ NEXT STEPS ⏩\n")
    print(
        "These metadata should be added to the TranslationsBencher static #PAGE_DATA located in:\n"
    )
    print("browser/components/translations/tests/browser/head.js")
    print()
if __name__ == "__main__":
main()