Source code
Revision control
Copy as Markdown
Other Tools
#!/usr/bin/env python3
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Convert the SCOWLv2 affix file to the Mozilla-shipped ISO-8859-1 form.
SCOWLv2's affix file is UTF-8, includes ICONV rules pairing the curly
apostrophe (U+2019) with the ASCII apostrophe, and adds U+2019 to
WORDCHARS. Mozilla ships the file as ISO-8859-1, which can't represent
U+2019, so this script:
1. Strips ICONV rules.
2. Changes "SET UTF-8" to "SET ISO8859-1".
3. Drops U+2019 from WORDCHARS.
4. Rewrites the file in ISO-8859-1.
"""
import argparse
import pathlib
import sys
# RIGHT SINGLE QUOTATION MARK (curly apostrophe); it has no ISO-8859-1 code
# point, so it must not survive into the converted file.
_CURLY_APOSTROPHE = "\u2019"


def _convert_aff_lines(lines):
    """Return affix *lines* rewritten for an ISO-8859-1 dictionary.

    ICONV lines are dropped, the ``SET UTF-8`` declaration becomes
    ``SET ISO8859-1``, and U+2019 is removed from WORDCHARS lines.  Every
    other line is passed through unchanged.
    """
    converted = []
    for line in lines:
        if line.startswith("ICONV"):
            continue
        # Compare tokens rather than the raw line so that trailing or repeated
        # whitespace cannot leave a stale "SET UTF-8" in a Latin-1 file.
        if line.split() == ["SET", "UTF-8"]:
            converted.append("SET ISO8859-1")
        elif line.startswith("WORDCHARS"):
            converted.append(line.replace(_CURLY_APOSTROPHE, ""))
        else:
            converted.append(line)
    return converted


def main(argv=None):
    """Convert an affix file from UTF-8 to ISO-8859-1 in place.

    Args:
        argv: Command-line arguments (without the program name).  ``None``
            makes argparse read ``sys.argv[1:]``.

    Returns:
        0 on success, suitable for passing to ``sys.exit``.

    Raises:
        UnicodeDecodeError: If the input file is not valid UTF-8.
        UnicodeEncodeError: If a remaining line contains a character that
            ISO-8859-1 cannot represent.  The input file is left untouched
            in that case.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "aff_file",
        type=pathlib.Path,
        help="affix file to convert in place (read as UTF-8, written as ISO-8859-1)",
    )
    args = parser.parse_args(argv)

    source_text = args.aff_file.read_text(encoding="utf-8")
    out_lines = _convert_aff_lines(source_text.splitlines())

    # Encode *before* opening the file for writing: opening in "w" mode
    # truncates it, so an encoding failure discovered afterwards would destroy
    # the file being converted in place.
    payload = ("\n".join(out_lines) + "\n").encode("iso-8859-1")

    # Bytes are written verbatim, so the output uses LF line endings on every
    # platform instead of depending on text-mode newline translation.
    args.aff_file.write_bytes(payload)
    return 0
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status when the file is executed directly.
if __name__ == "__main__":
    raise SystemExit(main())