#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
""" Usage:
make_intl_data.py langtags [cldr_common.zip]
make_intl_data.py tzdata
make_intl_data.py currency
make_intl_data.py units
make_intl_data.py numbering
Target "langtags":
This script extracts information about 1) mappings between deprecated and
current Unicode BCP 47 locale identifiers, and 2) deprecated and current
    BCP 47 Unicode extension values from CLDR, and converts it to C++ mapping
code in intl/components/LocaleGenerated.cpp. The code is used in
intl/components/Locale.cpp.
Target "tzdata":
    This script computes which time zone information is not up-to-date in ICU
    and provides the necessary mappings to work around this problem.
Target "currency":
Generates the mapping from currency codes to decimal digits used for them.
Target "units":
    Generates source and test files using the list of so-called "sanctioned unit
identifiers" and verifies that the ICU data filter includes these units.
Target "numbering":
    Generates source and test files using the list of numbering systems with
simple digit mappings and verifies that it's in sync with ICU/CLDR.
"""
import io
import json
import os
import re
import sys
import tarfile
import tempfile
from contextlib import closing
from functools import partial, total_ordering
from itertools import chain, groupby, tee
from operator import attrgetter, itemgetter
from zipfile import ZipFile
import yaml
if sys.version_info.major == 2:
from itertools import ifilter as filter
from itertools import ifilterfalse as filterfalse
from itertools import imap as map
from itertools import izip_longest as zip_longest
from urllib2 import Request as UrlRequest
from urllib2 import urlopen
from urlparse import urlsplit
else:
from itertools import filterfalse, zip_longest
from urllib.parse import urlsplit
from urllib.request import Request as UrlRequest
from urllib.request import urlopen
def grouper(iterable, n, fillvalue=None):
"Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
args = [iter(iterable)] * n
return zip_longest(*args, fillvalue=fillvalue)
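# Illustrative example (hypothetical REPL session, not part of this script):
# with the default fillvalue=None the final chunk is padded with None, which
# callers such as write_array() below filter out again.
#
#   >>> list(grouper(["ab", "cd", "ef", "gh"], 3))
#   [('ab', 'cd', 'ef'), ('gh', None, None)]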
def writeMappingHeader(println, description, source, url):
if type(description) is not list:
description = [description]
for desc in description:
println("// {0}".format(desc))
println("// Derived from {0}.".format(source))
println("// {0}".format(url))
def writeMappingsVar(println, mapping, name, description, source, url):
"""Writes a variable definition with a mapping table.
Writes the contents of dictionary |mapping| through the |println|
function with the given variable name and a comment with description,
    source, and URL.
"""
println("")
writeMappingHeader(println, description, source, url)
println("var {0} = {{".format(name))
for key, value in sorted(mapping.items(), key=itemgetter(0)):
println(' "{0}": "{1}",'.format(key, value))
println("};")
def writeMappingsBinarySearch(
println,
fn_name,
type_name,
name,
validate_fn,
validate_case_fn,
mappings,
tag_maxlength,
description,
source,
url,
):
"""Emit code to perform a binary search on language tag subtags.
    Uses the contents of |mappings|, which can be either a dictionary or a
    set, to emit a mapping function to find subtag replacements.
"""
println("")
writeMappingHeader(println, description, source, url)
println(
"""
bool mozilla::intl::Locale::{0}({1} {2}) {{
MOZ_ASSERT({3}({2}.Span()));
MOZ_ASSERT({4}({2}.Span()));
""".format(
fn_name, type_name, name, validate_fn, validate_case_fn
).strip()
)
writeMappingsBinarySearchBody(println, name, name, mappings, tag_maxlength)
println(
"""
}""".lstrip(
"\n"
)
)
def writeMappingsBinarySearchBody(
println, source_name, target_name, mappings, tag_maxlength
):
def write_array(subtags, name, length, fixed):
if fixed:
println(
" static const char {}[{}][{}] = {{".format(
name, len(subtags), length + 1
)
)
else:
println(" static const char* {}[{}] = {{".format(name, len(subtags)))
        # Group into rows of ten to stay within the 80-column line limit.
for entries in grouper(subtags, 10):
entries = (
'"{}"'.format(tag).rjust(length + 2)
for tag in entries
if tag is not None
)
println(" {},".format(", ".join(entries)))
println(" };")
trailing_return = True
    # Sort the subtags by length. That enables using an optimized comparator
    # for the binary search, which performs only a single |memcmp| per
    # comparison because all subtags in a group have the same length.
mappings_keys = mappings.keys() if type(mappings) == dict else mappings
for length, subtags in groupby(sorted(mappings_keys, key=len), len):
# Omit the length check if the current length is the maximum length.
if length != tag_maxlength:
println(
"""
if ({}.Length() == {}) {{
""".format(
source_name, length
).rstrip(
"\n"
)
)
else:
trailing_return = False
println(
"""
{
""".rstrip(
"\n"
)
)
# The subtags need to be sorted for binary search to work.
subtags = sorted(subtags)
def equals(subtag):
return """{}.EqualTo("{}")""".format(source_name, subtag)
# Don't emit a binary search for short lists.
if len(subtags) == 1:
if type(mappings) == dict:
println(
"""
if ({}) {{
{}.Set(mozilla::MakeStringSpan("{}"));
return true;
}}
return false;
""".format(
equals(subtags[0]), target_name, mappings[subtags[0]]
).strip(
"\n"
)
)
else:
println(
"""
return {};
""".format(
equals(subtags[0])
).strip(
"\n"
)
)
elif len(subtags) <= 4:
if type(mappings) == dict:
for subtag in subtags:
println(
"""
if ({}) {{
{}.Set("{}");
return true;
}}
""".format(
equals(subtag), target_name, mappings[subtag]
).strip(
"\n"
)
)
println(
"""
return false;
""".strip(
"\n"
)
)
else:
cond = (equals(subtag) for subtag in subtags)
cond = (" ||\n" + " " * (4 + len("return "))).join(cond)
println(
"""
return {};
""".format(
cond
).strip(
"\n"
)
)
else:
write_array(subtags, source_name + "s", length, True)
if type(mappings) == dict:
write_array([mappings[k] for k in subtags], "aliases", length, False)
println(
"""
if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{
{1}.Set(mozilla::MakeStringSpan(replacement));
return true;
}}
return false;
""".format(
source_name, target_name
).rstrip()
)
else:
println(
"""
return HasReplacement({0}s, {0});
""".format(
source_name
).rstrip()
)
println(
"""
}
""".strip(
"\n"
)
)
if trailing_return:
println(
"""
return false;"""
)
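# Illustrative sketch of the C++ this helper emits for a dict mapping with a
# single two-character entry ("in" -> "id" is an assumed example; identifiers
# depend on the arguments):
#
#   if (language.Length() == 2) {
#     if (language.EqualTo("in")) {
#       language.Set(mozilla::MakeStringSpan("id"));
#       return true;
#     }
#     return false;
#   }
#   return false;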
def writeComplexLanguageTagMappings(
println, complex_language_mappings, description, source, url
):
println("")
writeMappingHeader(println, description, source, url)
println(
"""
void mozilla::intl::Locale::PerformComplexLanguageMappings() {
MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span()));
MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span()));
""".lstrip()
)
# Merge duplicate language entries.
language_aliases = {}
for deprecated_language, (language, script, region) in sorted(
complex_language_mappings.items(), key=itemgetter(0)
):
key = (language, script, region)
if key not in language_aliases:
language_aliases[key] = []
else:
language_aliases[key].append(deprecated_language)
first_language = True
for deprecated_language, (language, script, region) in sorted(
complex_language_mappings.items(), key=itemgetter(0)
):
key = (language, script, region)
if deprecated_language in language_aliases[key]:
continue
if_kind = "if" if first_language else "else if"
first_language = False
cond = (
'Language().EqualTo("{}")'.format(lang)
for lang in [deprecated_language] + language_aliases[key]
)
cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)
println(
"""
{} ({}) {{""".format(
if_kind, cond
).strip(
"\n"
)
)
println(
"""
SetLanguage("{}");""".format(
language
).strip(
"\n"
)
)
if script is not None:
println(
"""
if (Script().Missing()) {{
SetScript("{}");
}}""".format(
script
).strip(
"\n"
)
)
if region is not None:
println(
"""
if (Region().Missing()) {{
SetRegion("{}");
}}""".format(
region
).strip(
"\n"
)
)
println(
"""
}""".strip(
"\n"
)
)
println(
"""
}
""".strip(
"\n"
)
)
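# Illustrative sketch: a complex mapping such as "sh" -> ("sr", "Latn", None)
# (see readSupplementalData below) results in emitted C++ of this shape:
#
#   if (Language().EqualTo("sh")) {
#     SetLanguage("sr");
#     if (Script().Missing()) {
#       SetScript("Latn");
#     }
#   }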
def writeComplexRegionTagMappings(
println, complex_region_mappings, description, source, url
):
println("")
writeMappingHeader(println, description, source, url)
println(
"""
void mozilla::intl::Locale::PerformComplexRegionMappings() {
MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span()));
MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span()));
MOZ_ASSERT(IsStructurallyValidRegionTag(Region().Span()));
MOZ_ASSERT(IsCanonicallyCasedRegionTag(Region().Span()));
""".lstrip()
)
# |non_default_replacements| is a list and hence not hashable. Convert it
# to a string to get a proper hashable value.
def hash_key(default, non_default_replacements):
return (default, str(sorted(str(v) for v in non_default_replacements)))
# Merge duplicate region entries.
region_aliases = {}
for deprecated_region, (default, non_default_replacements) in sorted(
complex_region_mappings.items(), key=itemgetter(0)
):
key = hash_key(default, non_default_replacements)
if key not in region_aliases:
region_aliases[key] = []
else:
region_aliases[key].append(deprecated_region)
first_region = True
for deprecated_region, (default, non_default_replacements) in sorted(
complex_region_mappings.items(), key=itemgetter(0)
):
key = hash_key(default, non_default_replacements)
if deprecated_region in region_aliases[key]:
continue
if_kind = "if" if first_region else "else if"
first_region = False
cond = (
'Region().EqualTo("{}")'.format(region)
for region in [deprecated_region] + region_aliases[key]
)
cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)
println(
"""
{} ({}) {{""".format(
if_kind, cond
).strip(
"\n"
)
)
replacement_regions = sorted(
{region for (_, _, region) in non_default_replacements}
)
first_case = True
for replacement_region in replacement_regions:
replacement_language_script = sorted(
(language, script)
for (language, script, region) in (non_default_replacements)
if region == replacement_region
)
if_kind = "if" if first_case else "else if"
first_case = False
def compare_tags(language, script):
if script is None:
return 'Language().EqualTo("{}")'.format(language)
return '(Language().EqualTo("{}") && Script().EqualTo("{}"))'.format(
language, script
)
cond = (
compare_tags(language, script)
for (language, script) in replacement_language_script
)
cond = (" ||\n" + " " * (4 + len(if_kind) + 2)).join(cond)
println(
"""
{} ({}) {{
SetRegion("{}");
}}""".format(
if_kind, cond, replacement_region
)
.rstrip()
.strip("\n")
)
println(
"""
else {{
SetRegion("{}");
}}
}}""".format(
default
)
.rstrip()
.strip("\n")
)
println(
"""
}
""".strip(
"\n"
)
)
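# Illustrative sketch (assumed data): a complex region mapping such as
# "SU" -> ("RU", [("hy", None, "AM"), ...]) emits C++ of this shape:
#
#   if (Region().EqualTo("SU")) {
#     if (Language().EqualTo("hy")) {
#       SetRegion("AM");
#     }
#     else {
#       SetRegion("RU");
#     }
#   }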
def writeVariantTagMappings(println, variant_mappings, description, source, url):
"""Writes a function definition that maps variant subtags."""
println(
"""
static const char* ToCharPointer(const char* str) {
return str;
}
static const char* ToCharPointer(const mozilla::intl::UniqueChars& str) {
return str.get();
}
template <typename T, typename U = T>
static bool IsLessThan(const T& a, const U& b) {
return strcmp(ToCharPointer(a), ToCharPointer(b)) < 0;
}
"""
)
writeMappingHeader(println, description, source, url)
println(
"""
bool mozilla::intl::Locale::PerformVariantMappings() {
// The variant subtags need to be sorted for binary search.
MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(),
IsLessThan<decltype(mVariants)::ElementType>));
auto removeVariantAt = [&](size_t index) {
mVariants.erase(mVariants.begin() + index);
};
auto insertVariantSortedIfNotPresent = [&](const char* variant) {
auto* p = std::lower_bound(
mVariants.begin(), mVariants.end(), variant,
IsLessThan<decltype(mVariants)::ElementType, decltype(variant)>);
// Don't insert the replacement when already present.
if (p != mVariants.end() && strcmp(p->get(), variant) == 0) {
return true;
}
// Insert the preferred variant in sort order.
auto preferred = DuplicateStringToUniqueChars(variant);
return !!mVariants.insert(p, std::move(preferred));
};
for (size_t i = 0; i < mVariants.length();) {
const char* variant = mVariants[i].get();
MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant)));
""".lstrip()
)
(no_alias, with_alias) = partition(
variant_mappings.items(), lambda item: item[1] is None
)
no_replacements = " ||\n ".join(
f"""strcmp(variant, "{deprecated_variant}") == 0"""
for (deprecated_variant, _) in sorted(no_alias, key=itemgetter(0))
)
println(
f"""
if ({no_replacements}) {{
removeVariantAt(i);
}}
""".strip(
"\n"
)
)
for deprecated_variant, (type, replacement) in sorted(
with_alias, key=itemgetter(0)
):
println(
f"""
else if (strcmp(variant, "{deprecated_variant}") == 0) {{
removeVariantAt(i);
""".strip(
"\n"
)
)
if type == "language":
println(
f"""
SetLanguage("{replacement}");
""".strip(
"\n"
)
)
elif type == "region":
println(
f"""
SetRegion("{replacement}");
""".strip(
"\n"
)
)
else:
assert type == "variant"
println(
f"""
if (!insertVariantSortedIfNotPresent("{replacement}")) {{
return false;
}}
""".strip(
"\n"
)
)
println(
"""
}
""".strip(
"\n"
)
)
println(
"""
else {
i++;
}
}
return true;
}
""".strip(
"\n"
)
)
def writeLegacyMappingsFunction(println, legacy_mappings, description, source, url):
"""Writes a function definition that maps legacy language tags."""
println("")
writeMappingHeader(println, description, source, url)
println(
"""\
bool mozilla::intl::Locale::UpdateLegacyMappings() {
// We're mapping legacy tags to non-legacy form here.
// Other tags remain unchanged.
//
// Legacy tags are either sign language tags ("sgn") or have one or multiple
// variant subtags. Therefore we can quickly exclude most tags by checking
// these two subtags.
MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span()));
if (!Language().EqualTo("sgn") && mVariants.length() == 0) {
return true;
}
#ifdef DEBUG
for (const auto& variant : Variants()) {
MOZ_ASSERT(IsStructurallyValidVariantTag(variant));
MOZ_ASSERT(IsCanonicallyCasedVariantTag(variant));
}
#endif
// The variant subtags need to be sorted for binary search.
MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(),
IsLessThan<decltype(mVariants)::ElementType>));
auto findVariant = [this](const char* variant) {
auto* p = std::lower_bound(mVariants.begin(), mVariants.end(), variant,
IsLessThan<decltype(mVariants)::ElementType,
decltype(variant)>);
if (p != mVariants.end() && strcmp(p->get(), variant) == 0) {
return p;
}
return static_cast<decltype(p)>(nullptr);
};
auto insertVariantSortedIfNotPresent = [&](const char* variant) {
auto* p = std::lower_bound(mVariants.begin(), mVariants.end(), variant,
IsLessThan<decltype(mVariants)::ElementType,
decltype(variant)>);
// Don't insert the replacement when already present.
if (p != mVariants.end() && strcmp(p->get(), variant) == 0) {
return true;
}
// Insert the preferred variant in sort order.
auto preferred = DuplicateStringToUniqueChars(variant);
return !!mVariants.insert(p, std::move(preferred));
};
auto removeVariant = [&](auto* p) {
size_t index = std::distance(mVariants.begin(), p);
mVariants.erase(mVariants.begin() + index);
};
auto removeVariants = [&](auto* p, auto* q) {
size_t pIndex = std::distance(mVariants.begin(), p);
size_t qIndex = std::distance(mVariants.begin(), q);
MOZ_ASSERT(pIndex < qIndex, "variant subtags are sorted");
mVariants.erase(mVariants.begin() + qIndex);
mVariants.erase(mVariants.begin() + pIndex);
};"""
)
# Helper class for pattern matching.
class AnyClass:
def __eq__(self, obj):
return obj is not None
Any = AnyClass()
# Group the mappings by language.
legacy_mappings_by_language = {}
for type, replacement in legacy_mappings.items():
(language, _, _, _) = type
legacy_mappings_by_language.setdefault(language, {})[type] = replacement
# Handle the empty language case first.
if None in legacy_mappings_by_language:
# Get the mappings and remove them from the dict.
mappings = legacy_mappings_by_language.pop(None)
# This case only applies for the "hepburn-heploc" -> "alalc97"
# mapping, so just inline it here.
from_tag = (None, None, None, "hepburn-heploc")
to_tag = (None, None, None, "alalc97")
assert len(mappings) == 1
assert mappings[from_tag] == to_tag
println(
"""
if (mVariants.length() >= 2) {
if (auto* hepburn = findVariant("hepburn")) {
if (auto* heploc = findVariant("heploc")) {
removeVariants(hepburn, heploc);
if (!insertVariantSortedIfNotPresent("alalc97")) {
return false;
}
}
}
}
"""
)
# Handle sign languages next.
if "sgn" in legacy_mappings_by_language:
mappings = legacy_mappings_by_language.pop("sgn")
# Legacy sign language mappings have the form "sgn-XX" where "XX" is
# some region code.
assert all(type == ("sgn", None, Any, None) for type in mappings.keys())
# Legacy sign languages are mapped to a single language subtag.
assert all(
replacement == (Any, None, None, None) for replacement in mappings.values()
)
println(
"""
if (Language().EqualTo("sgn")) {
if (Region().Present() && SignLanguageMapping(mLanguage, Region())) {
mRegion.Set(mozilla::MakeStringSpan(""));
}
}
""".rstrip().lstrip(
"\n"
)
)
# Finally handle all remaining cases.
# The remaining mappings have neither script nor region subtags in the source locale.
assert all(
type == (Any, None, None, Any)
for mappings in legacy_mappings_by_language.values()
for type in mappings.keys()
)
# And they have neither script nor region nor variant subtags in the target locale.
assert all(
replacement == (Any, None, None, None)
for mappings in legacy_mappings_by_language.values()
for replacement in mappings.values()
)
# Compact the mappings table by removing empty fields.
legacy_mappings_by_language = {
lang: {
variants: r_language
for ((_, _, _, variants), (r_language, _, _, _)) in mappings.items()
}
for (lang, mappings) in legacy_mappings_by_language.items()
}
# Try to combine the remaining cases.
legacy_mappings_compact = {}
# Python can't hash dicts or lists, so use the string representation as the hash key.
def hash_key(mappings):
return str(sorted(mappings.items(), key=itemgetter(0)))
for lang, mappings in sorted(
legacy_mappings_by_language.items(), key=itemgetter(0)
):
key = hash_key(mappings)
legacy_mappings_compact.setdefault(key, []).append(lang)
for langs in legacy_mappings_compact.values():
language_equal_to = (
f"""Language().EqualTo("{lang}")""" for lang in sorted(langs)
)
cond = f""" ||\n{" " * len(" else if (")}""".join(language_equal_to)
println(
f"""
else if ({cond}) {{
""".rstrip().lstrip(
"\n"
)
)
mappings = legacy_mappings_by_language[langs[0]]
# Count the variant subtags to determine the sort order.
def variant_size(m):
(k, _) = m
return len(k.split("-"))
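        # e.g. variant_size(("guoyu-hakka", "zh")) == 2, because the key
        # "guoyu-hakka" consists of two variant subtags.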
# Alias rules are applied by largest union size first.
for size, mappings_by_size in groupby(
sorted(mappings.items(), key=variant_size, reverse=True), key=variant_size
):
# Convert grouper object to dict.
mappings_by_size = dict(mappings_by_size)
is_first = True
chain_if = size == 1
# Alias rules are applied in alphabetical order
for variants, r_language in sorted(
mappings_by_size.items(), key=itemgetter(0)
):
sorted_variants = sorted(variants.split("-"))
len_variants = len(sorted_variants)
maybe_else = "else " if chain_if and not is_first else ""
is_first = False
for i, variant in enumerate(sorted_variants):
println(
f"""
{" " * i}{maybe_else}if (auto* {variant} = findVariant("{variant}")) {{
""".rstrip().lstrip(
"\n"
)
)
indent = " " * len_variants
println(
f"""
{indent}removeVariant{"s" if len_variants > 1 else ""}({", ".join(sorted_variants)});
{indent}SetLanguage("{r_language}");
{indent}{"return true;" if not chain_if else ""}
""".rstrip().lstrip(
"\n"
)
)
for i in range(len_variants, 0, -1):
println(
f"""
{" " * (i - 1)}}}
""".rstrip().lstrip(
"\n"
)
)
println(
"""
}
""".rstrip().lstrip(
"\n"
)
)
println(
"""
return true;
}"""
)
def writeSignLanguageMappingsFunction(
println, legacy_mappings, description, source, url
):
"""Writes a function definition that maps legacy sign language tags."""
println("")
writeMappingHeader(println, description, source, url)
println(
"""\
bool mozilla::intl::Locale::SignLanguageMapping(LanguageSubtag& language,
const RegionSubtag& region) {
MOZ_ASSERT(language.EqualTo("sgn"));
MOZ_ASSERT(IsStructurallyValidRegionTag(region.Span()));
MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.Span()));
""".rstrip()
)
region_mappings = {
rg: lg
for ((lang, _, rg, _), (lg, _, _, _)) in legacy_mappings.items()
if lang == "sgn"
}
source_name = "region"
target_name = "language"
tag_maxlength = 3
writeMappingsBinarySearchBody(
println, source_name, target_name, region_mappings, tag_maxlength
)
println(
"""
}""".lstrip()
)
def readSupplementalData(core_file):
"""Reads CLDR Supplemental Data and extracts information for Intl.js.
Information extracted:
- legacyMappings: mappings from legacy tags to preferred complete language tags
- languageMappings: mappings from language subtags to preferred subtags
- complexLanguageMappings: mappings from language subtags with complex rules
- regionMappings: mappings from region subtags to preferred subtags
- complexRegionMappings: mappings from region subtags with complex rules
- variantMappings: mappings from variant subtags to preferred subtags
- likelySubtags: likely subtags used for generating test data only
Returns these mappings as dictionaries.
"""
import xml.etree.ElementTree as ET
# From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>.
re_unicode_language_id = re.compile(
r"""
^
# unicode_language_id = unicode_language_subtag
# unicode_language_subtag = alpha{2,3} | alpha{5,8}
(?P<language>[a-z]{2,3}|[a-z]{5,8})
# (sep unicode_script_subtag)?
# unicode_script_subtag = alpha{4}
(?:-(?P<script>[a-z]{4}))?
# (sep unicode_region_subtag)?
# unicode_region_subtag = (alpha{2} | digit{3})
(?:-(?P<region>([a-z]{2}|[0-9]{3})))?
# (sep unicode_variant_subtag)*
# unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
(?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)?
$
""",
        re.IGNORECASE | re.VERBOSE,
)
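    # For example, the (illustrative) tag "sr-Latn-RS-wadegile" parses into
    # language="sr", script="Latn", region="RS", variants="-wadegile"; the
    # leading "-" on |variants| is stripped in |bcp47_canonical| below.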
# CLDR uses "_" as the separator for some elements. Replace it with "-".
def bcp47_id(cldr_id):
return cldr_id.replace("_", "-")
# Return the tuple (language, script, region, variants) and assert all
# subtags are in canonical case.
def bcp47_canonical(language, script, region, variants):
# Canonical case for language subtags is lower case.
assert language is None or language.lower() == language
# Canonical case for script subtags is title case.
assert script is None or script.title() == script
# Canonical case for region subtags is upper case.
assert region is None or region.upper() == region
# Canonical case for variant subtags is lower case.
assert variants is None or variants.lower() == variants
return (language, script, region, variants[1:] if variants else None)
    # Language ids are interpreted as multi-maps.
#
# See UTS35, §Annex C, Definitions - 1. Multimap interpretation.
def language_id_to_multimap(language_id):
match = re_unicode_language_id.match(language_id)
assert (
match is not None
), f"{language_id} invalid Unicode BCP 47 locale identifier"
canonical_language_id = bcp47_canonical(
*match.group("language", "script", "region", "variants")
)
(language, _, _, _) = canonical_language_id
# Normalize "und" language to None, but keep the rest as is.
return (language if language != "und" else None,) + canonical_language_id[1:]
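    # e.g. language_id_to_multimap("und-Qaai") == (None, "Qaai", None, None)
    # and language_id_to_multimap("sh") == ("sh", None, None, None).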
rules = {}
territory_exception_rules = {}
tree = ET.parse(core_file.open("common/supplemental/supplementalMetadata.xml"))
# Load the rules from supplementalMetadata.xml.
#
# See UTS35, §Annex C, Definitions - 2. Alias elements.
# See UTS35, §Annex C, Preprocessing.
for alias_name in [
"languageAlias",
"scriptAlias",
"territoryAlias",
"variantAlias",
]:
for alias in tree.iterfind(".//" + alias_name):
# Replace '_' by '-'.
type = bcp47_id(alias.get("type"))
replacement = bcp47_id(alias.get("replacement"))
# Prefix with "und-".
if alias_name != "languageAlias":
type = "und-" + type
# Discard all rules where the type is an invalid languageId.
if re_unicode_language_id.match(type) is None:
continue
type = language_id_to_multimap(type)
# Multiple, whitespace-separated territory replacements may be present.
if alias_name == "territoryAlias" and " " in replacement:
replacements = replacement.split(" ")
replacement_list = [
language_id_to_multimap("und-" + r) for r in replacements
]
assert (
type not in territory_exception_rules
), f"Duplicate alias rule: {type}"
territory_exception_rules[type] = replacement_list
# The first element is the default territory replacement.
replacement = replacements[0]
# Prefix with "und-".
if alias_name != "languageAlias":
replacement = "und-" + replacement
replacement = language_id_to_multimap(replacement)
assert type not in rules, f"Duplicate alias rule: {type}"
rules[type] = replacement
# Helper class for pattern matching.
class AnyClass:
def __eq__(self, obj):
return obj is not None
Any = AnyClass()
modified_rules = True
loop_count = 0
while modified_rules:
modified_rules = False
loop_count += 1
# UTS 35 defines that canonicalization is applied until a fixed point has
# been reached. This iterative application of the canonicalization algorithm
# is only needed for a relatively small set of rules, so we can precompute
# the transitive closure of all rules here and then perform a single pass
# when canonicalizing language tags at runtime.
transitive_rules = {}
# Compute the transitive closure.
# Any case which currently doesn't occur in the CLDR sources isn't supported
# and will lead to throwing an error.
for type, replacement in rules.items():
(language, script, region, variants) = type
(r_language, r_script, r_region, r_variants) = replacement
for i_type, i_replacement in rules.items():
(i_language, i_script, i_region, i_variants) = i_type
(i_r_language, i_r_script, i_r_region, i_r_variants) = i_replacement
if i_language is not None and i_language == r_language:
# This case currently only occurs when neither script nor region
                    # subtags are present. A single variant subtag may be present
# in |type|. And |i_type| definitely has a single variant subtag.
# Should this ever change, update this code accordingly.
assert type == (Any, None, None, None) or type == (
Any,
None,
None,
Any,
)
assert replacement == (Any, None, None, None)
assert i_type == (Any, None, None, Any)
assert i_replacement == (Any, None, None, None)
# This case happens for the rules
# "zh-guoyu -> zh",
# "zh-hakka -> hak", and
# "und-hakka -> und".
# Given the possible input "zh-guoyu-hakka", the first rule will
# change it to "zh-hakka", and then the second rule can be
# applied. (The third rule isn't applied ever.)
#
# Let's assume there's a hypothetical rule
# "zh-aaaaa" -> "en"
# And we have the input "zh-aaaaa-hakka", then "zh-aaaaa -> en"
# is applied before "zh-hakka -> hak", because rules are sorted
# alphabetically. That means the overall result is "en":
# "zh-aaaaa-hakka" is first canonicalized to "en-hakka" and then
# "hakka" is removed through the third rule.
#
                    # No current rule requires handling this special case, so we
# don't yet support it.
assert variants is None or variants <= i_variants
# Combine all variants and remove duplicates.
vars = set(
i_variants.split("-")
+ (variants.split("-") if variants else [])
)
# Add the variants alphabetically sorted.
n_type = (language, None, None, "-".join(sorted(vars)))
assert (
n_type not in transitive_rules
or transitive_rules[n_type] == i_replacement
)
transitive_rules[n_type] = i_replacement
continue
if i_script is not None and i_script == r_script:
# This case currently doesn't occur, so we don't yet support it.
raise ValueError(
f"{type} -> {replacement} :: {i_type} -> {i_replacement}"
)
if i_region is not None and i_region == r_region:
# This case currently only applies for sign language
# replacements. Similar to the language subtag case any other
# combination isn't currently supported.
assert type == (None, None, Any, None)
assert replacement == (None, None, Any, None)
assert i_type == ("sgn", None, Any, None)
assert i_replacement == (Any, None, None, None)
n_type = ("sgn", None, region, None)
assert n_type not in transitive_rules
transitive_rules[n_type] = i_replacement
continue
if i_variants is not None and i_variants == r_variants:
# This case currently doesn't occur, so we don't yet support it.
raise ValueError(
f"{type} -> {replacement} :: {i_type} -> {i_replacement}"
)
# Ensure there are no contradicting rules.
assert all(
rules[type] == replacement
for (type, replacement) in transitive_rules.items()
if type in rules
)
# If |transitive_rules| is not a subset of |rules|, new rules will be added.
modified_rules = not (transitive_rules.keys() <= rules.keys())
        # Ensure we only have to iterate more than once for the "guoyu-{hakka,xiang}"
        # cases. Failing this assertion means either there's a bug when computing the
        # stop condition of this loop or a new kind of legacy language tag was added.
if modified_rules and loop_count > 1:
new_rules = {k for k in transitive_rules.keys() if k not in rules}
for k in new_rules:
assert k == (Any, None, None, "guoyu-hakka") or k == (
Any,
None,
None,
"guoyu-xiang",
)
# Merge the transitive rules.
rules.update(transitive_rules)
# Computes the size of the union of all field value sets.
def multi_map_size(locale_id):
(language, script, region, variants) = locale_id
return (
(1 if language is not None else 0)
+ (1 if script is not None else 0)
+ (1 if region is not None else 0)
+ (len(variants.split("-")) if variants is not None else 0)
)
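    # e.g. multi_map_size(("zh", None, None, "guoyu-hakka")) == 3: one
    # language subtag plus two variant subtags.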
# Dictionary of legacy mappings, contains raw rules, e.g.
# (None, None, None, "hepburn-heploc") -> (None, None, None, "alalc97").
legacy_mappings = {}
# Dictionary of simple language subtag mappings, e.g. "in" -> "id".
language_mappings = {}
# Dictionary of complex language subtag mappings, modifying more than one
# subtag, e.g. "sh" -> ("sr", "Latn", None) and "cnr" -> ("sr", None, "ME").
complex_language_mappings = {}
# Dictionary of simple script subtag mappings, e.g. "Qaai" -> "Zinh".
script_mappings = {}
# Dictionary of simple region subtag mappings, e.g. "DD" -> "DE".
region_mappings = {}
# Dictionary of complex region subtag mappings, containing more than one
# replacement, e.g. "SU" -> ("RU", ["AM", "AZ", "BY", ...]).
complex_region_mappings = {}
# Dictionary of aliased variant subtags to a tuple of preferred replacement
# type and replacement, e.g. "arevela" -> ("language", "hy") or
# "aaland" -> ("region", "AX") or "heploc" -> ("variant", "alalc97").
variant_mappings = {}
# Preprocess all rules so we can perform a single lookup per subtag at runtime.
for type, replacement in rules.items():
(language, script, region, variants) = type
(r_language, r_script, r_region, r_variants) = replacement
type_map_size = multi_map_size(type)
# Most mappings are one-to-one and can be encoded through lookup tables.
if type_map_size == 1:
if language is not None:
assert r_language is not None, "Can't remove a language subtag"
# We don't yet support this case.
assert (
r_variants is None
), f"Unhandled variant replacement in language alias: {replacement}"
if replacement == (Any, None, None, None):
language_mappings[language] = r_language
else:
complex_language_mappings[language] = replacement[:-1]
elif script is not None:
# We don't support removing script subtags.
assert (
r_script is not None
), f"Can't remove a script subtag: {replacement}"
# We only support one-to-one script mappings for now.
assert replacement == (
None,
Any,
None,
None,
), f"Unhandled replacement in script alias: {replacement}"
script_mappings[script] = r_script
elif region is not None:
# We don't support removing region subtags.
assert (
r_region is not None
), f"Can't remove a region subtag: {replacement}"
# We only support one-to-one region mappings for now.
assert replacement == (
None,
None,
Any,
None,
), f"Unhandled replacement in region alias: {replacement}"
if type not in territory_exception_rules:
region_mappings[region] = r_region
else:
complex_region_mappings[region] = [
r_region
for (_, _, r_region, _) in territory_exception_rules[type]
]
else:
assert variants is not None
assert len(variants.split("-")) == 1
# We only support one-to-one variant mappings for now.
assert (
multi_map_size(replacement) <= 1
), f"Unhandled replacement in variant alias: {replacement}"
if r_language is not None:
variant_mappings[variants] = ("language", r_language)
elif r_script is not None:
variant_mappings[variants] = ("script", r_script)
elif r_region is not None:
variant_mappings[variants] = ("region", r_region)
elif r_variants is not None:
assert len(r_variants.split("-")) == 1
variant_mappings[variants] = ("variant", r_variants)
else:
variant_mappings[variants] = None
else:
# Alias rules which have multiple input fields must be processed
# first. This applies only to a handful of rules, so our generated
# code adds fast paths to skip these rules in the common case.
# Case 1: Language and at least one variant subtag.
if language is not None and variants is not None:
pass
# Case 2: Sign language and a region subtag.
elif language == "sgn" and region is not None:
pass
# Case 3: "hepburn-heploc" to "alalc97" canonicalization.
elif (
language is None
and variants is not None
and len(variants.split("-")) == 2
):
pass
# Any other combination is currently unsupported.
else:
raise ValueError(f"{type} -> {replacement}")
legacy_mappings[type] = replacement
tree = ET.parse(core_file.open("common/supplemental/likelySubtags.xml"))
likely_subtags = {}
for likely_subtag in tree.iterfind(".//likelySubtag"):
from_tag = bcp47_id(likely_subtag.get("from"))
from_match = re_unicode_language_id.match(from_tag)
assert (
from_match is not None
), f"{from_tag} invalid Unicode BCP 47 locale identifier"
assert (
from_match.group("variants") is None
), f"unexpected variant subtags in {from_tag}"
to_tag = bcp47_id(likely_subtag.get("to"))
to_match = re_unicode_language_id.match(to_tag)
assert (
to_match is not None
), f"{to_tag} invalid Unicode BCP 47 locale identifier"
assert (
to_match.group("variants") is None
), f"unexpected variant subtags in {to_tag}"
from_canonical = bcp47_canonical(
*from_match.group("language", "script", "region", "variants")
)
to_canonical = bcp47_canonical(
*to_match.group("language", "script", "region", "variants")
)
# Remove the empty variant subtags.
from_canonical = from_canonical[:-1]
to_canonical = to_canonical[:-1]
likely_subtags[from_canonical] = to_canonical
complex_region_mappings_final = {}
for deprecated_region, replacements in complex_region_mappings.items():
# Find all likely subtag entries which don't already contain a region
# subtag and whose target region is in the list of replacement regions.
region_likely_subtags = [
(from_language, from_script, to_region)
for (
(from_language, from_script, from_region),
(_, _, to_region),
) in likely_subtags.items()
if from_region is None and to_region in replacements
]
# The first replacement entry is the default region.
default = replacements[0]
# Find all likely subtag entries whose region matches the default region.
default_replacements = {
(language, script)
for (language, script, region) in region_likely_subtags
if region == default
}
# And finally find those entries which don't use the default region.
# These are the entries we're actually interested in, because those need
# to be handled specially when selecting the correct preferred region.
non_default_replacements = [
(language, script, region)
for (language, script, region) in region_likely_subtags
if (language, script) not in default_replacements
]
# Remove redundant mappings.
#
        # For example, starting with CLDR 43, the deprecated region "SU" has the
# following non-default replacement entries for "GE":
# - ('sva', None, 'GE')
# - ('sva', 'Cyrl', 'GE')
# - ('sva', 'Latn', 'GE')
#
# The latter two entries are redundant, because they're already handled
# by the first entry.
non_default_replacements = [
(language, script, region)
for (language, script, region) in non_default_replacements
if script is None
or (language, None, region) not in non_default_replacements
]
# If there are no non-default replacements, we can handle the region as
# part of the simple region mapping.
if non_default_replacements:
complex_region_mappings_final[deprecated_region] = (
default,
non_default_replacements,
)
else:
region_mappings[deprecated_region] = default
return {
"legacyMappings": legacy_mappings,
"languageMappings": language_mappings,
"complexLanguageMappings": complex_language_mappings,
"scriptMappings": script_mappings,
"regionMappings": region_mappings,
"complexRegionMappings": complex_region_mappings_final,
"variantMappings": variant_mappings,
"likelySubtags": likely_subtags,
}
def readUnicodeExtensions(core_file):
import xml.etree.ElementTree as ET
# Match all xml-files in the BCP 47 directory.
bcpFileRE = re.compile(r"^common/bcp47/.+\.xml$")
#
# type = alphanum{3,8} (sep alphanum{3,8})* ;
typeRE = re.compile(r"^[a-z0-9]{3,8}(-[a-z0-9]{3,8})*$")
#
# unicode_region_subtag = alpha{2} ;
alphaRegionRE = re.compile(r"^[A-Z]{2}$", re.IGNORECASE)
# Mapping from Unicode extension types to dict of deprecated to
# preferred values.
mapping = {
# Unicode BCP 47 U Extension
"u": {},
# Unicode BCP 47 T Extension
"t": {},
}
def readBCP47File(file):
tree = ET.parse(file)
for keyword in tree.iterfind(".//keyword/key"):
extension = keyword.get("extension", "u")
assert (
extension == "u" or extension == "t"
), "unknown extension type: {}".format(extension)
extension_name = keyword.get("name")
for type in keyword.iterfind("type"):
#
# The key or type name used by Unicode locale extension with 'u' extension
                # syntax or the 't' extension syntax. When alias below is absent, this name
                # can also be used with the old style "@key=type" syntax.
name = type.get("name")
                # Ignore the special names:
if name in (
"CODEPOINTS",
"REORDER_CODE",
"RG_KEY_VALUE",
"SCRIPT_CODE",
"SUBDIVISION_CODE",
"PRIVATE_USE",
):
continue
# All other names should match the 'type' production.
assert (
typeRE.match(name) is not None
), "{} matches the 'type' production".format(name)
#
# The preferred value of the deprecated key, type or attribute element.
# When a key, type or attribute element is deprecated, this attribute is
# used for specifying a new canonical form if available.
preferred = type.get("preferred")
#
# The BCP 47 form is the canonical form, and recommended. Other aliases are
# included only for backwards compatibility.
alias = type.get("alias")
#
# Use the bcp47 data to replace keys, types, tfields, and tvalues by their
                # canonical forms. See Section 3.6.4 (U Extension Data Files) and Section
                # 3.7.1 (T Extension Data Files). The aliases are in the alias attribute
# value, while the canonical is in the name attribute value.
# 'preferred' contains the new preferred name, 'alias' the compatibility
# name, but then there's this entry where 'preferred' and 'alias' are the
# same. So which one to choose? Assume 'preferred' is the actual canonical
# name.
#
# <type name="islamicc"
# description="Civil (algorithmic) Arabic calendar"
# deprecated="true"
# preferred="islamic-civil"
# alias="islamic-civil"/>
if preferred is not None:
assert typeRE.match(preferred), preferred
mapping[extension].setdefault(extension_name, {})[name] = preferred
if alias is not None:
for alias_name in alias.lower().split(" "):
# Ignore alias entries which don't match the 'type' production.
if typeRE.match(alias_name) is None:
continue
# See comment above when 'alias' and 'preferred' are both present.
if (
preferred is not None
and name in mapping[extension][extension_name]
):
continue
# Skip over entries where 'name' and 'alias' are equal.
#
# <type name="pst8pdt"
# description="POSIX style time zone for US Pacific Time"
# alias="PST8PDT"
# since="1.8"/>
if name == alias_name:
continue
mapping[extension].setdefault(extension_name, {})[
alias_name
] = name
def readSupplementalMetadata(file):
# Find subdivision and region replacements.
        #
# Replace aliases in special key values:
# - If there is an 'sd' or 'rg' key, replace any subdivision alias
# in its value in the same way, using subdivisionAlias data.
tree = ET.parse(file)
for alias in tree.iterfind(".//subdivisionAlias"):
type = alias.get("type")
assert (
typeRE.match(type) is not None
), "{} matches the 'type' production".format(type)
# Take the first replacement when multiple ones are present.
replacement = alias.get("replacement").split(" ")[0].lower()
# Append "zzzz" if the replacement is a two-letter region code.
if alphaRegionRE.match(replacement) is not None:
replacement += "zzzz"
# Assert the replacement is syntactically correct.
assert (
typeRE.match(replacement) is not None
), "replacement {} matches the 'type' production".format(replacement)
# 'subdivisionAlias' applies to 'rg' and 'sd' keys.
mapping["u"].setdefault("rg", {})[type] = replacement
mapping["u"].setdefault("sd", {})[type] = replacement
for name in core_file.namelist():
if bcpFileRE.match(name):
readBCP47File(core_file.open(name))
readSupplementalMetadata(
core_file.open("common/supplemental/supplementalMetadata.xml")
)
return {
"unicodeMappings": mapping["u"],
"transformMappings": mapping["t"],
}
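# Illustrative shape of the returned value (the "islamicc" entry is the
# example documented above; actual entries vary by CLDR version):
#
#   {
#     "unicodeMappings": {"ca": {"islamicc": "islamic-civil"}, ...},
#     "transformMappings": {...},
#   }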
def writeCLDRLanguageTagData(println, data, url):
"""Writes the language tag data to the Intl data file."""
println(generatedFileWarning)
println("// Version: CLDR-{}".format(data["version"]))
println("// URL: {}".format(url))
println(
"""
#include "mozilla/Assertions.h"
#include "mozilla/Span.h"
#include "mozilla/TextUtils.h"
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <string>
#include <type_traits>
#include "mozilla/intl/Locale.h"
using namespace mozilla::intl::LanguageTagLimits;
template <size_t Length, size_t TagLength, size_t SubtagLength>
static inline bool HasReplacement(
const char (&subtags)[Length][TagLength],
const mozilla::intl::LanguageTagSubtag<SubtagLength>& subtag) {
MOZ_ASSERT(subtag.Length() == TagLength - 1,
"subtag must have the same length as the list of subtags");
const char* ptr = subtag.Span().data();
return std::binary_search(std::begin(subtags), std::end(subtags), ptr,
[](const char* a, const char* b) {
return memcmp(a, b, TagLength - 1) < 0;
});
}
template <size_t Length, size_t TagLength, size_t SubtagLength>
static inline const char* SearchReplacement(
const char (&subtags)[Length][TagLength], const char* (&aliases)[Length],
const mozilla::intl::LanguageTagSubtag<SubtagLength>& subtag) {
MOZ_ASSERT(subtag.Length() == TagLength - 1,
"subtag must have the same length as the list of subtags");
const char* ptr = subtag.Span().data();
auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr,
[](const char* a, const char* b) {
return memcmp(a, b, TagLength - 1) < 0;
});
if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) {
return aliases[std::distance(std::begin(subtags), p)];
}
return nullptr;
}
#ifdef DEBUG
static bool IsAsciiLowercaseAlphanumeric(char c) {
return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
}
static bool IsAsciiLowercaseAlphanumericOrDash(char c) {
return IsAsciiLowercaseAlphanumeric(c) || c == '-';
}
static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) {
return std::all_of(span.begin(), span.end(),
mozilla::IsAsciiLowercaseAlpha<char>);
}
static bool IsCanonicallyCasedScriptTag(mozilla::Span<const char> span) {
return mozilla::IsAsciiUppercaseAlpha(span[0]) &&
std::all_of(span.begin() + 1, span.end(),
mozilla::IsAsciiLowercaseAlpha<char>);
}
static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) {
return std::all_of(span.begin(), span.end(),
mozilla::IsAsciiUppercaseAlpha<char>) ||
std::all_of(span.begin(), span.end(), mozilla::IsAsciiDigit<char>);
}
static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) {
return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric);
}
static bool IsCanonicallyCasedUnicodeKey(mozilla::Span<const char> key) {
return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
}
static bool IsCanonicallyCasedUnicodeType(mozilla::Span<const char> type) {
return std::all_of(type.begin(), type.end(),
IsAsciiLowercaseAlphanumericOrDash);
}
static bool IsCanonicallyCasedTransformKey(mozilla::Span<const char> key) {
return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
}
static bool IsCanonicallyCasedTransformType(mozilla::Span<const char> type) {
return std::all_of(type.begin(), type.end(),
IsAsciiLowercaseAlphanumericOrDash);
}
#endif
""".rstrip()
)
source = "CLDR Supplemental Data, version {}".format(data["version"])
legacy_mappings = data["legacyMappings"]
language_mappings = data["languageMappings"]
complex_language_mappings = data["complexLanguageMappings"]
script_mappings = data["scriptMappings"]
region_mappings = data["regionMappings"]
complex_region_mappings = data["complexRegionMappings"]
variant_mappings = data["variantMappings"]
unicode_mappings = data["unicodeMappings"]
transform_mappings = data["transformMappings"]
# unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
language_maxlength = 8
# unicode_script_subtag = alpha{4} ;
script_maxlength = 4
# unicode_region_subtag = (alpha{2} | digit{3}) ;
region_maxlength = 3
writeMappingsBinarySearch(
println,
"LanguageMapping",
"LanguageSubtag&",
"language",
"IsStructurallyValidLanguageTag",
"IsCanonicallyCasedLanguageTag",
language_mappings,
language_maxlength,
"Mappings from language subtags to preferred values.",
source,
url,
)
writeMappingsBinarySearch(
println,
"ComplexLanguageMapping",
"const LanguageSubtag&",
"language",
"IsStructurallyValidLanguageTag",
"IsCanonicallyCasedLanguageTag",
complex_language_mappings.keys(),
language_maxlength,
"Language subtags with complex mappings.",
source,
url,
)
writeMappingsBinarySearch(
println,
"ScriptMapping",
"ScriptSubtag&",
"script",
"IsStructurallyValidScriptTag",
"IsCanonicallyCasedScriptTag",
script_mappings,
script_maxlength,
"Mappings from script subtags to preferred values.",
source,
url,
)
writeMappingsBinarySearch(
println,
"RegionMapping",
"RegionSubtag&",
"region",
"IsStructurallyValidRegionTag",
"IsCanonicallyCasedRegionTag",
region_mappings,
region_maxlength,
"Mappings from region subtags to preferred values.",
source,
url,
)
writeMappingsBinarySearch(
println,
"ComplexRegionMapping",
"const RegionSubtag&",
"region",
"IsStructurallyValidRegionTag",
"IsCanonicallyCasedRegionTag",
complex_region_mappings.keys(),
region_maxlength,
"Region subtags with complex mappings.",
source,
url,
)
writeComplexLanguageTagMappings(
println,
complex_language_mappings,
"Language subtags with complex mappings.",
source,
url,
)
writeComplexRegionTagMappings(
println,
complex_region_mappings,
"Region subtags with complex mappings.",
source,
url,
)
writeVariantTagMappings(
println,
variant_mappings,
"Mappings from variant subtags to preferred values.",
source,
url,
)
writeLegacyMappingsFunction(
println, legacy_mappings, "Canonicalize legacy locale identifiers.", source, url
)
writeSignLanguageMappingsFunction(
println, legacy_mappings, "Mappings from legacy sign languages.", source, url
)
writeUnicodeExtensionsMappings(println, unicode_mappings, "Unicode")
writeUnicodeExtensionsMappings(println, transform_mappings, "Transform")
def writeCLDRLanguageTagLikelySubtagsTest(println, data, url):
"""Writes the likely-subtags test file."""
println(generatedFileWarning)
source = "CLDR Supplemental Data, version {}".format(data["version"])
language_mappings = data["languageMappings"]
complex_language_mappings = data["complexLanguageMappings"]
script_mappings = data["scriptMappings"]
region_mappings = data["regionMappings"]
complex_region_mappings = data["complexRegionMappings"]
likely_subtags = data["likelySubtags"]
def bcp47(tag):
(language, script, region) = tag
return "{}{}{}".format(
language, "-" + script if script else "", "-" + region if region else ""
)
def canonical(tag):
(language, script, region) = tag
# Map deprecated language subtags.
if language in language_mappings:
language = language_mappings[language]
elif language in complex_language_mappings:
(language2, script2, region2) = complex_language_mappings[language]
(language, script, region) = (
language2,
script if script else script2,
region if region else region2,
)
# Map deprecated script subtags.
if script in script_mappings:
script = script_mappings[script]
# Map deprecated region subtags.
if region in region_mappings:
region = region_mappings[region]
else:
# Assume no complex region mappings are needed for now.
assert (
region not in complex_region_mappings
), "unexpected region with complex mappings: {}".format(region)
return (language, script, region)
def addLikelySubtags(tag):
# Step 1: Canonicalize.
(language, script, region) = canonical(tag)
if script == "Zzzz":
script = None
if region == "ZZ":
region = None
# Step 2: Lookup.
searches = (
(language, script, region),
(language, None, region),
(language, script, None),
(language, None, None),
("und", script, None),
)
search = next(search for search in searches if search in likely_subtags)
(language_s, script_s, region_s) = search
(language_m, script_m, region_m) = likely_subtags[search]
# Step 3: Return.
return (
language if language != language_s else language_m,
script if script != script_s else script_m,
region if region != region_s else region_m,
)
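    # e.g. addLikelySubtags(("de", None, None)) returns ("de", "Latn", "DE"),
    # assuming the usual CLDR likely-subtags entry "de" -> "de-Latn-DE".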
def removeLikelySubtags(tag):
# Step 1: Add likely subtags.
max = addLikelySubtags(tag)
# Step 2: Remove variants (doesn't apply here).
# Step 3: Find a match.
(language, script, region) = max
for trial in (
(language, None, None),
(language, None, region),
(language, script, None),
):
if addLikelySubtags(trial) == max:
return trial
# Step 4: Return maximized if no match found.
return max
def likely_canonical(from_tag, to_tag):
# Canonicalize the input tag.
from_tag = canonical(from_tag)
# Update the expected result if necessary.
if from_tag in likely_subtags:
to_tag = likely_subtags[from_tag]
# Canonicalize the expected output.
to_canonical = canonical(to_tag)
# Sanity check: This should match the result of |addLikelySubtags|.
assert to_canonical == addLikelySubtags(from_tag)
return to_canonical
# |likely_subtags| contains non-canonicalized tags, so canonicalize it first.
likely_subtags_canonical = {
k: likely_canonical(k, v) for (k, v) in likely_subtags.items()
}
# Add test data for |Intl.Locale.prototype.maximize()|.
writeMappingsVar(
println,
{bcp47(k): bcp47(v) for (k, v) in likely_subtags_canonical.items()},
"maxLikelySubtags",
"Extracted from likelySubtags.xml.",
source,
url,
)
    # Use the maximized tags as the input for the remove likely-subtags test.
minimized = {
tag: removeLikelySubtags(tag) for tag in likely_subtags_canonical.values()
}
# Add test data for |Intl.Locale.prototype.minimize()|.
writeMappingsVar(
println,
{bcp47(k): bcp47(v) for (k, v) in minimized.items()},
"minLikelySubtags",
"Extracted from likelySubtags.xml.",
source,
url,
)
println(
"""
for (let [tag, maximal] of Object.entries(maxLikelySubtags)) {
assertEq(new Intl.Locale(tag).maximize().toString(), maximal);
}"""
)
println(
"""
for (let [tag, minimal] of Object.entries(minLikelySubtags)) {
assertEq(new Intl.Locale(tag).minimize().toString(), minimal);
}"""
)
println(
"""
if (typeof reportCompare === "function")
reportCompare(0, 0);"""
)
def readCLDRVersionFromICU():
icuDir = os.path.join(topsrcdir, "intl/icu/source")
if not os.path.isdir(icuDir):
raise RuntimeError("not a directory: {}".format(icuDir))
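    # The CLDR version appears in ICU's supplementalData.txt on a line of
    # (roughly) this shape, which the regular expression below matches:
    #
    #   cldrVersion{"43.0"}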
    reVersion = re.compile(r'\s*cldrVersion\{"(\d+(?:\.\d+)?)"\}')
    # Guard against the loop below finding no match; otherwise the |version is
    # None| check would raise a NameError instead of the intended RuntimeError.
    version = None
for line in flines(os.path.join(icuDir, "data/misc/supplementalData.txt")):
m = reVersion.match(line)
if m:
version = m.group(1)
break
if version is None:
raise RuntimeError("can't resolve CLDR version")
return version
def updateCLDRLangTags(args):
"""Update the LanguageTagGenerated.cpp file."""
version = args.version
url = args.url
out = args.out
filename = args.file
# Determine current CLDR version from ICU.
if version is None:
version = readCLDRVersionFromICU()
url = url.replace("<VERSION>", version)
print("Arguments:")
print("\tCLDR version: %s" % version)
print("\tDownload url: %s" % url)
if filename is not None:
print("\tLocal CLDR common.zip file: %s" % filename)
print("\tOutput file: %s" % out)
print("")
data = {
"version": version,
}
def readFiles(cldr_file):
with ZipFile(cldr_file) as zip_file:
data.update(readSupplementalData(zip_file))
data.update(readUnicodeExtensions(zip_file))
print("Processing CLDR data...")
if filename is not None:
print("Always make sure you have the newest CLDR common.zip!")
with open(filename, "rb") as cldr_file:
readFiles(cldr_file)
else:
print("Downloading CLDR common.zip...")
with closing(urlopen(url)) as cldr_file:
cldr_data = io.BytesIO(cldr_file.read())
readFiles(cldr_data)