Source code

Revision control

Copy as Markdown

Other Tools

#! /usr/bin/env sh
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Sanity checks for the regenerated en_US-mozilla dictionary, run after
# make-new-dict.sh and before install-new-dict.sh. Written for POSIX sh
# so it runs unchanged on macOS (BSD userland) and Linux.
# Stop at the first command that fails outside of a condition.
set -e

# Every input is resolved relative to the directory the script is run from.
WKDIR="$(pwd)"

# Regenerated dictionary and affix file under test.
DICT="$WKDIR/en_US-mozilla.dic"
AFF="$WKDIR/en_US-mozilla.aff"

# Baseline files the regenerated dictionary is compared against.
BASELINE_DIC="$WKDIR/utf8/en-US-utf8.dic"
BASELINE_AFF="$WKDIR/utf8/en-US-utf8.aff"

# Word list consulted by check 2.
MOZ_SPECIFIC="$WKDIR/mozilla-specific.txt"

# SCOWL checkout, its munch-list tool, and the removal list used by check 5.
SCOWL_DIR="$WKDIR/scowl"
MUNCH_LIST="$SCOWL_DIR/speller/munch-list"
MOZ_REMOVED="$WKDIR/5-mozilla-removed.txt"

# Nothing can be checked unless both regenerated files are present.
if ! { [ -f "$DICT" ] && [ -f "$AFF" ]; }; then
  echo "ERROR: $DICT or $AFF not found. Run make-new-dict.sh first."
  exit 1
fi

# The comparisons below also need both baseline files.
if ! { [ -f "$BASELINE_DIC" ] && [ -f "$BASELINE_AFF" ]; }; then
  echo "ERROR: baseline files missing under $WKDIR/utf8/."
  exit 1
fi

# Running totals maintained by fail() and warn(), reported in the summary.
errors=0
warnings=0
# Report a failed check and count it.
#   $1  message printed after the "FAIL: " prefix
# Writes one line to stdout and increments the global `errors` counter.
fail() {
  printf 'FAIL: %s\n' "${1}"
  : "$((errors += 1))"
}
# Report a non-fatal problem and count it.
#   $1  message printed after the "WARN: " prefix
# Writes one line to stdout and increments the global `warnings` counter.
warn() {
  printf 'WARN: %s\n' "${1}"
  : "$((warnings += 1))"
}
# Report a passed check.
#   $1  message printed after the "OK: " prefix
# Writes one line to stdout; no counter is touched.
ok() { printf 'OK: %s\n' "${1}"; }
printf '\n=== 1. ISO-8859-1 round-trip ===\n'
# Convert the UTF-8 dictionary to ISO-8859-1 and throw the result away: only
# iconv's exit status matters here, so both output streams are discarded.
if ! iconv -f utf-8 -t iso-8859-1 < "$DICT" > /dev/null 2>&1; then
  fail "Dictionary contains characters outside ISO-8859-1 (install-new-dict.sh would mangle them)"
else
  ok "Dictionary fits in ISO-8859-1"
fi
printf '\n=== 2. Mozilla-specific words preserved ===\n'

# dict_has_word WORD DICFILE
# Succeeds (exit 0) when DICFILE contains a line that is exactly WORD, or
# WORD immediately followed by "/" (the start of the affix flags).
# The comparison is a literal string match: WORD is handed to awk through
# the environment so that characters such as ".", "+" or "[" are never
# interpreted as regular-expression syntax or as awk escape sequences.
dict_has_word() {
  DICT_WORD=$1 awk '
    BEGIN { w = ENVIRON["DICT_WORD"]; n = length(w) }
    substr($0, 1, n) == w && (length($0) == n || substr($0, n + 1, 1) == "/") {
      found = 1
      exit
    }
    END { exit !found }
  ' "$2"
}

if [ ! -f "$MOZ_SPECIFIC" ]; then
  # Count a missing list as a failed check instead of letting the input
  # redirection below abort the whole script with a bare shell error.
  fail "Mozilla-specific word list not found: $MOZ_SPECIFIC"
else
  # `|| [ -n "$line" ]` keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [ -n "$line" ]; do
    # Skip blank lines and "#" comments.
    case "$line" in
      '' | '#'*) continue ;;
    esac
    # Only the word itself is looked up; anything from the first "/" on
    # (the affix flags) is ignored.
    word=${line%%/*}
    if dict_has_word "$word" "$DICT"; then
      ok "$word"
    else
      fail "Missing Mozilla-specific word: $word"
    fi
  done < "$MOZ_SPECIFIC"
fi
printf '\n=== 3. Suggestion exclusions preserved ===\n'
# Scratch files holding the sorted "!"-terminated entries of the baseline
# and of the regenerated dictionary. They are created with mktemp so their
# names are not predictable, and removed on exit. TMPD, old_nosug and
# new_nosug stay set for the rest of the script.
TMPD="${TMPDIR:-/tmp}"
old_nosug=$(mktemp "$TMPD/old-nosug.XXXXXX")
trap 'rm -f "$old_nosug"' EXIT
new_nosug=$(mktemp "$TMPD/new-nosug.XXXXXX")
trap 'rm -f "$old_nosug" "$new_nosug"' EXIT

# Collect every dictionary line that ends in "!" and sort byte-wise.
grep '!$' "$BASELINE_DIC" | LC_ALL=C sort > "$old_nosug"
grep '!$' "$DICT" | LC_ALL=C sort > "$new_nosug"

# comm must use the same collation the inputs were sorted with; in another
# locale it can consider them unsorted and give wrong or failing results.
missing=$(LC_ALL=C comm -23 "$old_nosug" "$new_nosug")
added=$(LC_ALL=C comm -13 "$old_nosug" "$new_nosug")

# Entries that were in the baseline but are gone now are an error.
if [ -z "$missing" ]; then
  ok "All previous suggestion exclusions preserved"
else
  fail "Missing suggestion exclusions:"
  printf '%s\n' "$missing" | sed 's/^/ /'
fi

# Newly added entries are only reported for information.
if [ -n "$added" ]; then
  printf 'INFO: New suggestion exclusions:\n'
  printf '%s\n' "$added" | sed 's/^/ /'
fi
printf '\n=== 4. Diff stats ===\n'
# Line counts of both .dic files; tr removes any blanks around wc's number.
old_lines=$(wc -l < "$BASELINE_DIC" | tr -d ' ')
new_lines=$(wc -l < "$DICT" | tr -d ' ')

# Signed difference, its magnitude, and a display form that always carries
# an explicit sign.
delta=$((new_lines - old_lines))
if [ "$delta" -lt 0 ]; then
  abs=$((0 - delta))
  delta_str=$delta
else
  abs=$delta
  delta_str="+$delta"
fi

# Change as a whole-number percentage of the baseline (integer division);
# reported as 0 when the baseline is empty to avoid dividing by zero.
pct=0
if [ "$old_lines" -gt 0 ]; then
  pct=$((abs * 100 / old_lines))
fi

printf 'Baseline lines: %s\n' "$old_lines"
printf 'New lines: %s (delta %s, %s%%)\n' "$new_lines" "$delta_str" "$pct"

# A large swing is suspicious but not necessarily wrong, so only warn.
if [ "$pct" -gt 25 ]; then
  warn "Line count changed by more than 25% of the baseline; double-check the output"
fi
printf '\n=== 5. Upstream en_US.txt subset check ===\n'
# The Mozilla dictionary should equal upstream en_US.txt minus Mozilla
# removals, plus Mozilla additions, variants and accented words. So every
# word in upstream en_US.txt that isn't in 5-mozilla-removed.txt should be
# present in the regenerated wordlist obtained by expanding en_US-mozilla.dic
# through its affix file.
#
# The reference en_US.txt lives in the wordlist-diff mirror, which carries
# the same release tags as SCOWL itself. Only that one file is needed, so it
# is fetched directly from raw.githubusercontent.com instead of requiring a
# clone of the repo.
#
# Base URL of the mirror; the release tag and file name are appended below.
# It can be overridden from the environment.
# NOTE(review): the default is assumed to be the en-wl/wordlist-diff
# repository -- confirm before relying on it.
: "${WORDLIST_DIFF_URL_BASE:=https://raw.githubusercontent.com/en-wl/wordlist-diff}"

# Exact tag of the SCOWL checkout; empty when HEAD is not on a tag or git
# cannot answer (errors are silenced and `|| true` keeps set -e from firing).
scowl_version=$(git -C "$SCOWL_DIR" describe --tags --exact-match 2>/dev/null || true)

if [ -z "$scowl_version" ]; then
  warn "$SCOWL_DIR is not on a tagged release; skipping upstream subset check."
elif ! command -v curl >/dev/null 2>&1; then
  warn "curl not available; skipping upstream subset check."
elif [ ! -x "$MUNCH_LIST" ]; then
  warn "$MUNCH_LIST not available; skipping upstream subset check."
elif [ ! -f "$MOZ_REMOVED" ]; then
  warn "$MOZ_REMOVED not found; run make-new-dict.sh first."
elif ! upstream_tmp=$(mktemp -d "${TMPD:-${TMPDIR:-/tmp}}/upstream-check.XXXXXX"); then
  warn "Could not create a temporary directory; skipping upstream subset check."
else
  # All scratch files live in a private mktemp directory so their names are
  # not predictable; the directory is removed on exit together with the
  # scratch files of the suggestion-exclusion check.
  upstream_raw="$upstream_tmp/wordlist-diff-en_US.txt"
  upstream_sorted="$upstream_tmp/upstream"
  final_wordlist="$upstream_tmp/final-wordlist"
  removed_sorted="$upstream_tmp/removed"
  expected_subset="$upstream_tmp/expected"
  unexpected_missing="$upstream_tmp/unexpected"
  trap 'rm -f "$old_nosug" "$new_nosug"; rm -rf "$upstream_tmp"' EXIT

  url="$WORDLIST_DIFF_URL_BASE/$scowl_version/en_US.txt"
  printf 'Fetching %s ...\n' "$url"
  if ! curl -fsSL "$url" -o "$upstream_raw"; then
    warn "Could not download $url; skipping upstream subset check."
  else
    # Expand the regenerated dictionary through its affix file to get the
    # full wordlist. The .dic is UTF-8 at this point; munch-list operates
    # on ISO-8859-1, so pipe it through iconv. Strip the count line at
    # the top (only digits). The pattern avoids `\+`, which is not part of
    # POSIX basic regular expressions.
    iconv -f utf-8 -t iso-8859-1 "$DICT" \
      | grep -v '^[0-9][0-9]*$' \
      | LC_ALL=C "$MUNCH_LIST" expand "$AFF" \
      | LC_ALL=C sort -u > "$final_wordlist"

    # Normalize the upstream baseline and Mozilla removals to ISO-8859-1
    # to match the regenerated wordlist. //TRANSLIT asks iconv to
    # approximate characters that have no ISO-8859-1 equivalent (the exact
    # result depends on the iconv implementation); its diagnostics are
    # discarded. Such characters can't be in the shipped .dic either;
    # check 1 catches that case separately.
    iconv -f utf-8 -t iso-8859-1//TRANSLIT "$upstream_raw" 2>/dev/null \
      | LC_ALL=C sort -u > "$upstream_sorted"
    iconv -f utf-8 -t iso-8859-1//TRANSLIT "$MOZ_REMOVED" 2>/dev/null \
      | LC_ALL=C sort -u > "$removed_sorted"

    # Drop words Mozilla intentionally removed from the upstream baseline.
    LC_ALL=C comm -23 "$upstream_sorted" "$removed_sorted" > "$expected_subset"
    # Anything still missing from the regenerated wordlist is unexpected.
    LC_ALL=C comm -23 "$expected_subset" "$final_wordlist" > "$unexpected_missing"

    if [ ! -s "$unexpected_missing" ]; then
      ok "Upstream en_US.txt at $scowl_version is a subset of the regenerated wordlist (minus Mozilla removals)"
    else
      total=$(wc -l < "$unexpected_missing" | tr -d ' ')
      fail "$total upstream words from $scowl_version missing from the regenerated wordlist and not in 5-mozilla-removed.txt:"
      # Show at most the first 20 offenders to keep the report readable.
      head -n 20 "$unexpected_missing" | sed 's/^/ /'
      if [ "$total" -gt 20 ]; then
        printf ' ... and %s more\n' "$((total - 20))"
      fi
    fi
  fi
fi
printf '\n=== Summary ===\n'
printf 'Errors: %d Warnings: %d\n' "$errors" "$warnings"
# The exit status reflects failed checks only; warnings never change it.
[ "$errors" -gt 0 ] && exit 1
exit 0