verify-new-dict.sh

firefox-main/extensions/spellcheck/locales/en-US/hunspell/dictionary-sources/verify-new-dict.sh

Enable keyboard shortcuts

Source code

File a bug in Core :: Spelling checker

Revision control

Copy as Markdown

Other Tools

HG Web

#! /usr/bin/env sh

# This Source Code Form is subject to the terms of the Mozilla Public

# License, v. 2.0. If a copy of the MPL was not distributed with this

# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Sanity checks for the regenerated en_US-mozilla dictionary, run after

# make-new-dict.sh and before install-new-dict.sh. Written for POSIX sh

# so it runs unchanged on macOS (BSD userland) and Linux.

# Stop at the first command that fails outside of a conditional context.
set -e

# All inputs are resolved relative to the directory the script is started in.
WKDIR="$(pwd)"

# Regenerated dictionary / affix pair that the checks below inspect.
DICT="${WKDIR}/en_US-mozilla.dic"
AFF="${WKDIR}/en_US-mozilla.aff"

# Baseline dictionary / affix pair that the regenerated files are compared to.
BASELINE_DIC="${WKDIR}/utf8/en-US-utf8.dic"
BASELINE_AFF="${WKDIR}/utf8/en-US-utf8.aff"

# Word list whose entries must all survive in the regenerated dictionary.
MOZ_SPECIFIC="${WKDIR}/mozilla-specific.txt"

# SCOWL checkout and the munch-list tool used to expand affixes.
SCOWL_DIR="${WKDIR}/scowl"
MUNCH_LIST="${SCOWL_DIR}/speller/munch-list"

# Words removed from the upstream list on purpose.
MOZ_REMOVED="${WKDIR}/5-mozilla-removed.txt"

# Base URL the reference en_US.txt is downloaded from (tag name is appended).
WORDLIST_DIFF_URL_BASE="https://raw.githubusercontent.com/en-wl/wordlist-diff"

# Nothing can be verified unless both regenerated files exist as regular files.
if ! { [ -f "$DICT" ] && [ -f "$AFF" ]; }; then
  echo "ERROR: $DICT or $AFF not found. Run make-new-dict.sh first."
  exit 1
fi

# The comparison checks additionally need the baseline pair under utf8/.
if ! { [ -f "$BASELINE_DIC" ] && [ -f "$BASELINE_AFF" ]; }; then
  echo "ERROR: baseline files missing under $WKDIR/utf8/."
  exit 1
fi

# Running totals maintained by fail() and warn(); read again in the summary.
errors=0 warnings=0

# fail MESSAGE
# Report a failed check: print "FAIL: MESSAGE" on stdout and add one to the
# global `errors` counter.
fail() {
  printf 'FAIL: %s\n' "$1"
  errors=$((errors + 1))
}

# warn MESSAGE
# Report a non-fatal concern: print "WARN: MESSAGE" on stdout and add one to
# the global `warnings` counter.
warn() {
  printf 'WARN: %s\n' "$1"
  warnings=$((warnings + 1))
}

# ok MESSAGE
# Report a passed check: print "OK:   MESSAGE" on stdout. The padding after
# the colon lines the text up with the FAIL:/WARN: prefixes.
ok() {
  printf 'OK:   %s\n' "$1"
}

printf '\n=== 1. ISO-8859-1 round-trip ===\n'

# Only iconv's exit status is of interest: the converted text and any
# diagnostics are thrown away. Note that only the UTF-8 -> ISO-8859-1
# direction is exercised.
if ! iconv -f utf-8 -t iso-8859-1 < "$DICT" > /dev/null 2>&1; then
  fail "Dictionary contains characters outside ISO-8859-1 (install-new-dict.sh would mangle them)"
else
  ok "Dictionary fits in ISO-8859-1"
fi

printf '\n=== 2. Mozilla-specific words preserved ===\n'

# Every non-blank, non-comment line of $MOZ_SPECIFIC is treated as "word" or
# "word/FLAGS"; the bare word must still head an entry of the regenerated
# dictionary. The `|| [ -n "$line" ]` guard keeps a final line that is not
# terminated by a newline from being skipped.
while IFS= read -r line || [ -n "$line" ]; do
  case "$line" in
    '' | '#'*) continue ;;
  esac

  # Everything before the first "/" is the word itself.
  word=${line%%/*}

  # Match the word literally against the headword (text before the first "/")
  # of each dictionary line. A fixed-string, whole-line match is used so that
  # characters such as ".", "+" or "[" in a word are not interpreted as regex
  # operators, which could report a missing word as present or vice versa.
  if cut -d/ -f1 "$DICT" | grep -qxF -e "$word"; then
    ok "$word"
  else
    fail "Missing Mozilla-specific word: $word"
  fi
done < "$MOZ_SPECIFIC"

printf '\n=== 3. Suggestion exclusions preserved ===\n'

# Scratch files live in $TMPDIR (or /tmp) and are removed when the script
# exits, whatever the exit path.
TMPD="${TMPDIR:-/tmp}"
old_nosug="$TMPD/old-nosug-$$"
new_nosug="$TMPD/new-nosug-$$"
trap 'rm -f "$old_nosug" "$new_nosug"' EXIT

# Collect every entry whose line ends in "!" (presumably the no-suggest flag;
# confirm against the .aff) from the baseline and the regenerated dictionary.
# grep exits 1 when nothing matches, but the pipeline's status is sort's, so
# an empty result does not trip `set -e`.
grep '!$' "$BASELINE_DIC" | LC_ALL=C sort > "$old_nosug"
grep '!$' "$DICT" | LC_ALL=C sort > "$new_nosug"

# comm requires its inputs to be ordered by the collation of the locale it
# runs in. The lists were sorted byte-wise, so comm must run under LC_ALL=C as
# well; otherwise it may complain that the input is unsorted, produce wrong
# results, and exit non-zero (which would abort the script under `set -e`).
missing=$(LC_ALL=C comm -23 "$old_nosug" "$new_nosug")
added=$(LC_ALL=C comm -13 "$old_nosug" "$new_nosug")

# Entries only in the baseline were lost by the regeneration: that is an error.
if [ -z "$missing" ]; then
  ok "All previous suggestion exclusions preserved"
else
  fail "Missing suggestion exclusions:"
  printf '%s\n' "$missing" | sed 's/^/  /'
fi

# Entries only in the new dictionary are merely reported.
if [ -n "$added" ]; then
  printf 'INFO: New suggestion exclusions:\n'
  printf '%s\n' "$added" | sed 's/^/  /'
fi

printf '\n=== 4. Diff stats ===\n'

# Line counts of both dictionaries; tr strips the padding some wc
# implementations put in front of the number.
baseline_count=$(wc -l < "$BASELINE_DIC" | tr -d ' ')
current_count=$(wc -l < "$DICT" | tr -d ' ')

# Signed difference, its magnitude, and a display form that always carries an
# explicit sign.
change=$((current_count - baseline_count))
if [ "$change" -lt 0 ]; then
  magnitude=$((0 - change))
  signed_change=$change
else
  magnitude=$change
  signed_change="+$change"
fi

# Integer percentage of the baseline; an empty baseline is reported as 0% to
# avoid dividing by zero.
if [ "$baseline_count" -le 0 ]; then
  percent=0
else
  percent=$((magnitude * 100 / baseline_count))
fi

printf 'Baseline lines: %s\n' "$baseline_count"
printf 'New lines:      %s (delta %s, %s%%)\n' "$current_count" "$signed_change" "$percent"

# A swing above 25% is suspicious but not an error.
[ "$percent" -le 25 ] ||
  warn "Line count changed by more than 25% of the baseline; double-check the output"

printf '\n=== 5. Upstream en_US.txt subset check ===\n'

# The Mozilla dictionary should equal upstream en_US.txt minus Mozilla
# removals, plus Mozilla additions, variants and accented words. So every
# word in upstream en_US.txt that isn't in 5-mozilla-removed.txt should be
# present in the regenerated wordlist obtained by expanding en_US-mozilla.dic
# through its affix file.
#
# The reference en_US.txt lives in the wordlist-diff mirror, which carries
# the same release tags as SCOWL itself. We only need that one file, so
# fetch it directly from raw.githubusercontent.com instead of requiring a
# clone of the repo.

# Exact tag of the SCOWL checkout; empty when HEAD is not on a tag (the
# `|| true` keeps `set -e` from aborting in that case).
scowl_version=$(git -C "$SCOWL_DIR" describe --tags --exact-match 2>/dev/null || true)

# Each missing prerequisite downgrades the check to a warning and skips it.
if [ -z "$scowl_version" ]; then
  warn "$SCOWL_DIR is not on a tagged release; skipping upstream subset check."
elif ! command -v curl >/dev/null 2>&1; then
  warn "curl not available; skipping upstream subset check."
elif [ ! -x "$MUNCH_LIST" ]; then
  warn "$MUNCH_LIST not available; skipping upstream subset check."
elif [ ! -f "$MOZ_REMOVED" ]; then
  warn "$MOZ_REMOVED not found; run make-new-dict.sh first."
else
  upstream_raw="$TMPD/wordlist-diff-en_US-$$.txt"
  upstream_sorted="$TMPD/upstream-$$"
  final_wordlist="$TMPD/final-wordlist-$$"
  removed_sorted="$TMPD/removed-$$"
  expected_subset="$TMPD/expected-$$"
  unexpected_missing="$TMPD/unexpected-$$"

  # Replaces the earlier EXIT trap, so it has to list the earlier scratch
  # files as well as the ones created here.
  trap 'rm -f "$old_nosug" "$new_nosug" "$upstream_raw" "$upstream_sorted" "$final_wordlist" "$removed_sorted" "$expected_subset" "$unexpected_missing"' EXIT

  url="$WORDLIST_DIFF_URL_BASE/$scowl_version/en_US.txt"
  printf 'Fetching %s ...\n' "$url"

  if ! curl -fsSL "$url" -o "$upstream_raw"; then
    warn "Could not download $url; skipping upstream subset check."
  else
    # Expand the regenerated dictionary through its affix file to get the
    # full wordlist. The .dic is UTF-8 at this point; munch-list operates
    # on ISO-8859-1, so pipe it through iconv. Strip the count line at
    # the top (only digits). The pattern is spelled `[0-9][0-9]*` because
    # `\+` is not part of POSIX basic regular expressions.
    iconv -f utf-8 -t iso-8859-1 "$DICT" \
      | grep -v '^[0-9][0-9]*$' \
      | LC_ALL=C "$MUNCH_LIST" expand "$AFF" \
      | LC_ALL=C sort -u > "$final_wordlist"

    # Normalize the upstream baseline and Mozilla removals to ISO-8859-1
    # to match the regenerated wordlist. //TRANSLIT asks iconv to substitute
    # an approximation for characters that have no ISO-8859-1 equivalent
    # (such characters can't be in the shipped .dic either; check 1 catches
    # that case separately). Conversion diagnostics are discarded on purpose.
    iconv -f utf-8 -t iso-8859-1//TRANSLIT "$upstream_raw" 2>/dev/null \
      | LC_ALL=C sort -u > "$upstream_sorted"
    iconv -f utf-8 -t iso-8859-1//TRANSLIT "$MOZ_REMOVED" 2>/dev/null \
      | LC_ALL=C sort -u > "$removed_sorted"

    # Drop words Mozilla intentionally removed from the upstream baseline.
    LC_ALL=C comm -23 "$upstream_sorted" "$removed_sorted" > "$expected_subset"

    # Anything still missing from the regenerated wordlist is unexpected.
    LC_ALL=C comm -23 "$expected_subset" "$final_wordlist" > "$unexpected_missing"

    if [ ! -s "$unexpected_missing" ]; then
      ok "Upstream en_US.txt at $scowl_version is a subset of the regenerated wordlist (minus Mozilla removals)"
    else
      total=$(wc -l < "$unexpected_missing" | tr -d ' ')
      fail "$total upstream words from $scowl_version missing from the regenerated wordlist and not in 5-mozilla-removed.txt:"

      # Show at most the first 20 offenders, then summarize the remainder.
      head -n 20 "$unexpected_missing" | sed 's/^/  /'
      if [ "$total" -gt 20 ]; then
        printf '  ... and %s more\n' "$((total - 20))"
      fi
    fi
  fi
fi

printf '\n=== Summary ===\n'
printf 'Errors: %d  Warnings: %d\n' "$errors" "$warnings"

# Only recorded failures affect the exit status; warnings alone still yield 0.
exit_code=0
if [ "$errors" -gt 0 ]; then
  exit_code=1
fi
exit "$exit_code"