#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

""" Usage:
    make_intl_data.py langtags [cldr_core.zip]
    make_intl_data.py tzdata
    make_intl_data.py currency


    Target "langtags":
    This script extracts information about 1) mappings between deprecated and
    current Unicode BCP 47 locale identifiers, and 2) deprecated and current
    BCP 47 Unicode extension values from CLDR, and converts it to C++ mapping
    code in LanguageTagGenerated.cpp. The code is used in LanguageTag.cpp.


    Target "tzdata":
    This script computes which time zone information is not up-to-date in ICU
    and provides the necessary mappings to work around this problem.


    Target "currency":
    Generates the mapping from currency codes to decimal digits used for them.
"""

from __future__ import print_function
import os
import re
import io
import sys
import tarfile
import tempfile
from contextlib import closing
from functools import partial, total_ordering
from itertools import chain, groupby, tee
from operator import attrgetter, itemgetter
from zipfile import ZipFile

if sys.version_info.major == 2:
    from itertools import ifilter as filter, ifilterfalse as filterfalse, imap as map,\
        izip_longest as zip_longest
    from urllib2 import urlopen, Request as UrlRequest
    from urlparse import urlsplit
else:
    from itertools import filterfalse, zip_longest
    from urllib.request import urlopen, Request as UrlRequest
    from urllib.parse import urlsplit


def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)


def writeMappingHeader(println, description, source, url):
    if type(description) is not list:
        description = [description]
    for desc in description:
        println(u"// {0}".format(desc))
    println(u"// Derived from {0}.".format(source))
    println(u"// {0}".format(url))


def writeMappingsVar(println, mapping, name, description, source, url):
    """ Writes a variable definition with a mapping table.

        Writes the contents of dictionary |mapping| through the |println|
        function with the given variable name and a comment with description,
        source, and URL.
    """
    println(u"")
    writeMappingHeader(println, description, source, url)
    println(u"var {0} = {{".format(name))
    for (key, value) in sorted(mapping.items(), key=itemgetter(0)):
        println(u'    "{0}": "{1}",'.format(key, value))
    println(u"};")
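

# Illustrative sketch (not emitted by the script itself): for a hypothetical
# call with mapping={"in": "id"} and name="languageMappings", writeMappingsVar
# above prints a JS table of the form
#
#   var languageMappings = {
#       "in": "id",
#   };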


def writeMappingsBinarySearch(println, fn_name, type_name, name, validate_fn, validate_case_fn,
                              mappings, tag_maxlength, description, source, url):
    """ Emit code to perform a binary search on language tag subtags.

        Uses the contents of |mappings|, which can either be a dictionary or set,
        to emit a mapping function to find subtag replacements.
    """
    println(u"")
    writeMappingHeader(println, description, source, url)
    println(u"""
bool js::intl::LanguageTag::{0}({1} {2}) {{
  MOZ_ASSERT({3}({2}.span()));
  MOZ_ASSERT({4}({2}.span()));
""".format(fn_name, type_name, name, validate_fn, validate_case_fn).strip())

    def write_array(subtags, name, length, fixed):
        if fixed:
            println(u"    static const char {}[{}][{}] = {{".format(name, len(subtags),
                                                                    length + 1))
        else:
            println(u"    static const char* {}[{}] = {{".format(name, len(subtags)))

        # Group in chunks of ten to not exceed the 80-column line limit.
        for entries in grouper(subtags, 10):
            entries = (u"\"{}\"".format(tag).rjust(length + 2)
                       for tag in entries if tag is not None)
            println(u"      {},".format(u", ".join(entries)))

        println(u"    };")

    trailing_return = True

    # Sort the subtags by length. That enables using an optimized comparator
    # for the binary search, which only needs a single |memcmp| per comparison,
    # because all subtags within a group share the same length.
    mappings_keys = mappings.keys() if type(mappings) == dict else mappings
    for (length, subtags) in groupby(sorted(mappings_keys, key=len), len):
        # Omit the length check if the current length is the maximum length.
        if length != tag_maxlength:
            println(u"""
  if ({}.length() == {}) {{
""".format(name, length).rstrip("\n"))
        else:
            trailing_return = False
            println(u"""
  {
""".rstrip("\n"))

        # The subtags need to be sorted for binary search to work.
        subtags = sorted(subtags)

        def equals(subtag):
            return u"""{}.equalTo("{}")""".format(name, subtag)

        # Don't emit a binary search for short lists.
        if len(subtags) == 1:
            if type(mappings) == dict:
                println(u"""
    if ({}) {{
      {}.set("{}");
      return true;
    }}
    return false;
""".format(equals(subtags[0]), name, mappings[subtags[0]]).strip("\n"))
            else:
                println(u"""
    return {};
""".format(equals(subtags[0])).strip("\n"))
        elif len(subtags) <= 4:
            if type(mappings) == dict:
                for subtag in subtags:
                    println(u"""
    if ({}) {{
      {}.set("{}");
      return true;
    }}
""".format(equals(subtag), name, mappings[subtag]).strip("\n"))

                println(u"""
    return false;
""".strip("\n"))
            else:
                cond = (equals(subtag) for subtag in subtags)
                cond = (u" ||\n" + u" " * (4 + len("return "))).join(cond)
                println(u"""
    return {};
""".format(cond).strip("\n"))
        else:
            write_array(subtags, name + "s", length, True)

            if type(mappings) == dict:
                write_array([mappings[k] for k in subtags], u"aliases", length, False)

                println(u"""
    if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{
      {0}.set(mozilla::MakeStringSpan(replacement));
      return true;
    }}
    return false;
""".format(name).rstrip())
            else:
                println(u"""
    return HasReplacement({0}s, {0});
""".format(name).rstrip())

        println(u"""
  }
""".strip("\n"))

    if trailing_return:
        println(u"""
  return false;""")

    println(u"""
}""".lstrip("\n"))


def writeComplexLanguageTagMappings(println, complex_language_mappings,
                                    description, source, url):
    println(u"")
    writeMappingHeader(println, description, source, url)
    println(u"""
void js::intl::LanguageTag::performComplexLanguageMappings() {
  MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span()));
  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span()));
""".lstrip())

    # Merge duplicate language entries.
    language_aliases = {}
    for (deprecated_language, (language, script, region)) in (
        sorted(complex_language_mappings.items(), key=itemgetter(0))
    ):
        key = (language, script, region)
        if key not in language_aliases:
            language_aliases[key] = []
        else:
            language_aliases[key].append(deprecated_language)

    first_language = True
    for (deprecated_language, (language, script, region)) in (
        sorted(complex_language_mappings.items(), key=itemgetter(0))
    ):
        key = (language, script, region)
        if deprecated_language in language_aliases[key]:
            continue

        if_kind = u"if" if first_language else u"else if"
        first_language = False

        cond = (u"language().equalTo(\"{}\")".format(lang)
                for lang in [deprecated_language] + language_aliases[key])
        cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond)

        println(u"""
  {} ({}) {{""".format(if_kind, cond).strip("\n"))

        println(u"""
    setLanguage("{}");""".format(language).strip("\n"))

        if script is not None:
            println(u"""
    if (script().missing()) {{
      setScript("{}");
    }}""".format(script).strip("\n"))
        if region is not None:
            println(u"""
    if (region().missing()) {{
      setRegion("{}");
    }}""".format(region).strip("\n"))
        println(u"""
  }""".strip("\n"))

    println(u"""
}
""".strip("\n"))


def writeComplexRegionTagMappings(println, complex_region_mappings,
                                  description, source, url):
    println(u"")
    writeMappingHeader(println, description, source, url)
    println(u"""
void js::intl::LanguageTag::performComplexRegionMappings() {
  MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span()));
  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span()));
  MOZ_ASSERT(IsStructurallyValidRegionTag(region().span()));
  MOZ_ASSERT(IsCanonicallyCasedRegionTag(region().span()));
""".lstrip())

    # |non_default_replacements| is a list and hence not hashable. Convert it
    # to a string to get a proper hashable value.
    def hash_key(default, non_default_replacements):
        return (default, str(sorted(str(v) for v in non_default_replacements)))

    # Merge duplicate region entries.
    region_aliases = {}
    for (deprecated_region, (default, non_default_replacements)) in (
        sorted(complex_region_mappings.items(), key=itemgetter(0))
    ):
        key = hash_key(default, non_default_replacements)
        if key not in region_aliases:
            region_aliases[key] = []
        else:
            region_aliases[key].append(deprecated_region)

    first_region = True
    for (deprecated_region, (default, non_default_replacements)) in (
        sorted(complex_region_mappings.items(), key=itemgetter(0))
    ):
        key = hash_key(default, non_default_replacements)
        if deprecated_region in region_aliases[key]:
            continue

        if_kind = u"if" if first_region else u"else if"
        first_region = False

        cond = (u"region().equalTo(\"{}\")".format(region)
                for region in [deprecated_region] + region_aliases[key])
        cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond)

        println(u"""
  {} ({}) {{""".format(if_kind, cond).strip("\n"))

        replacement_regions = sorted({region for (_, _, region) in non_default_replacements})

        first_case = True
        for replacement_region in replacement_regions:
            replacement_language_script = sorted(((language, script)
                                                  for (language, script, region) in (
                                                      non_default_replacements
                                                  )
                                                  if region == replacement_region),
                                                 key=itemgetter(0))

            if_kind = u"if" if first_case else u"else if"
            first_case = False

            def compare_tags(language, script):
                if script is None:
                    return u"language().equalTo(\"{}\")".format(language)
                return u"(language().equalTo(\"{}\") && script().equalTo(\"{}\"))".format(
                    language, script)

            cond = (compare_tags(language, script)
                    for (language, script) in replacement_language_script)
            cond = (u" ||\n" + u" " * (4 + len(if_kind) + 2)).join(cond)

            println(u"""
    {} ({}) {{
      setRegion("{}");
    }}""".format(if_kind, cond, replacement_region).rstrip().strip("\n"))

        println(u"""
    else {{
      setRegion("{}");
    }}
  }}""".format(default).rstrip().strip("\n"))

    println(u"""
}
""".strip("\n"))


def writeGrandfatheredMappingsFunction(println, grandfathered_mappings,
                                       description, source, url):
    """ Writes a function definition that maps grandfathered language tags. """
    println(u"")
    writeMappingHeader(println, description, source, url)
    println(u"""\
bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) {
  // We're mapping regular grandfathered tags to non-grandfathered form here.
  // Other tags remain unchanged.
  //
  // regular       = "art-lojban"
  //               / "cel-gaulish"
  //               / "no-bok"
  //               / "no-nyn"
  //               / "zh-guoyu"
  //               / "zh-hakka"
  //               / "zh-min"
  //               / "zh-min-nan"
  //               / "zh-xiang"
  //
  // Therefore we can quickly exclude most tags by checking every
  // |unicode_locale_id| subcomponent for characteristics not shared by any of
  // the regular grandfathered (RG) tags:
  //
  //   * Real-world |unicode_language_subtag|s are all two or three letters,
  //     so don't waste time running a useless |language.length > 3| fast-path.
  //   * No RG tag has a "script"-looking component.
  //   * No RG tag has a "region"-looking component.
  //   * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish,
  //     zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok,
  //     no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag
  //     that |unicode_locale_id| doesn't support.)
  //   * No RG tag contains |extensions| or |pu_extensions|.
  if (script().present() ||
      region().present() ||
      variants().length() != 1 ||
      extensions().length() != 0 ||
      privateuse()) {
    return true;
  }

  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span()));
  MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variants()[0].get())));

  auto variantEqualTo = [this](const char* variant) {
    return strcmp(variants()[0].get(), variant) == 0;
  };""")

    # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>.
    #
    # Doesn't allow any 'extensions' subtags.
    re_unicode_locale_id = re.compile(
        r"""
        ^
        # unicode_language_id = unicode_language_subtag
        # unicode_language_subtag = alpha{2,3} | alpha{5,8}
        (?P<language>[a-z]{2,3}|[a-z]{5,8})

        # (sep unicode_script_subtag)?
        # unicode_script_subtag = alpha{4}
        (?:-(?P<script>[a-z]{4}))?

        # (sep unicode_region_subtag)?
        # unicode_region_subtag = (alpha{2} | digit{3})
        (?:-(?P<region>([a-z]{2}|[0-9]{3})))?

        # (sep unicode_variant_subtag)*
        # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
        (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)?

        # pu_extensions?
        # pu_extensions = sep [xX] (sep alphanum{1,8})+
        (?:-(?P<privateuse>x(-[a-z0-9]{1,8})+))?
        $
        """, re.IGNORECASE | re.VERBOSE)
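
    # For illustration: on a regular grandfathered tag the regex above splits
    # out exactly the parts used below, e.g.
    #   m = re_unicode_locale_id.match("art-lojban")
    #   m.group("language")  == "art"
    #   m.group("variants")  == "-lojban"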

    is_first = True

    for (tag, modern) in sorted(grandfathered_mappings.items(), key=itemgetter(0)):
        tag_match = re_unicode_locale_id.match(tag)
        assert tag_match is not None

        tag_language = tag_match.group("language")
        assert tag_match.group("script") is None, (
            "{} does not contain a script subtag".format(tag))
        assert tag_match.group("region") is None, (
            "{} does not contain a region subtag".format(tag))
        tag_variants = tag_match.group("variants")
        assert tag_variants is not None, (
            "{} contains a variant subtag".format(tag))
        assert tag_match.group("privateuse") is None, (
            "{} does not contain a privateuse subtag".format(tag))

        tag_variant = tag_variants[1:]
        assert "-" not in tag_variant, (
            "{} contains only a single variant".format(tag))

        modern_match = re_unicode_locale_id.match(modern)
        assert modern_match is not None

        modern_language = modern_match.group("language")
        modern_script = modern_match.group("script")
        modern_region = modern_match.group("region")
        modern_variants = modern_match.group("variants")
        modern_privateuse = modern_match.group("privateuse")

        println(u"""
  // {} -> {}
""".format(tag, modern).rstrip())

        println(u"""
  {}if (language().equalTo("{}") && variantEqualTo("{}")) {{
""".format("" if is_first else "else ",
           tag_language,
           tag_variant).rstrip().strip("\n"))

        is_first = False

        println(u"""
    setLanguage("{}");
""".format(modern_language).rstrip().strip("\n"))

        if modern_script is not None:
            println(u"""
    setScript("{}");
""".format(modern_script).rstrip().strip("\n"))

        if modern_region is not None:
            println(u"""
    setRegion("{}");
""".format(modern_region).rstrip().strip("\n"))

        assert modern_variants is None, (
            "all regular grandfathered tags' modern forms do not contain variant subtags")

        println(u"""
    clearVariants();
""".rstrip().strip("\n"))

        if modern_privateuse is not None:
            println(u"""
    auto privateuse = DuplicateString(cx, "{}");
    if (!privateuse) {{
      return false;
    }}
    setPrivateuse(std::move(privateuse));
""".format(modern_privateuse).rstrip().rstrip("\n"))

        println(u"""
    return true;
  }""".rstrip().strip("\n"))

    println(u"""
  return true;
}""")


def readSupplementalData(core_file):
    """ Reads CLDR Supplemental Data and extracts information for Intl.js.

        Information extracted:
        - grandfatheredMappings: mappings from grandfathered tags to preferred
          complete language tags
        - languageMappings: mappings from language subtags to preferred subtags
        - complexLanguageMappings: mappings from language subtags with complex rules
        - regionMappings: mappings from region subtags to preferred subtags
        - complexRegionMappings: mappings from region subtags with complex rules
        - likelySubtags: likely subtags used for generating test data only
        Returns these mappings as dictionaries.
    """
    import xml.etree.ElementTree as ET

    # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>.
    re_unicode_language_id = re.compile(
        r"""
        ^
        # unicode_language_id = unicode_language_subtag
        # unicode_language_subtag = alpha{2,3} | alpha{5,8}
        (?P<language>[a-z]{2,3}|[a-z]{5,8})

        # (sep unicode_script_subtag)?
        # unicode_script_subtag = alpha{4}
        (?:-(?P<script>[a-z]{4}))?

        # (sep unicode_region_subtag)?
        # unicode_region_subtag = (alpha{2} | digit{3})
        (?:-(?P<region>([a-z]{2}|[0-9]{3})))?

        # (sep unicode_variant_subtag)*
        # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
        (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)?
        $
        """, re.IGNORECASE | re.VERBOSE)

    re_unicode_language_subtag = re.compile(
        r"""
        ^
        # unicode_language_subtag = alpha{2,3} | alpha{5,8}
        ([a-z]{2,3}|[a-z]{5,8})
        $
        """, re.IGNORECASE | re.VERBOSE)

    re_unicode_region_subtag = re.compile(
        r"""
        ^
        # unicode_region_subtag = (alpha{2} | digit{3})
        ([a-z]{2}|[0-9]{3})
        $
        """, re.IGNORECASE | re.VERBOSE)

    # The fixed list of BCP 47 grandfathered language tags.
    grandfathered_tags = (
        "art-lojban",
        "cel-gaulish",
        "en-GB-oed",
        "i-ami",
        "i-bnn",
        "i-default",
        "i-enochian",
        "i-hak",
        "i-klingon",
        "i-lux",
        "i-mingo",
        "i-navajo",
        "i-pwn",
        "i-tao",
        "i-tay",
        "i-tsu",
        "no-bok",
        "no-nyn",
        "sgn-BE-FR",
        "sgn-BE-NL",
        "sgn-CH-DE",
        "zh-guoyu",
        "zh-hakka",
        "zh-min",
        "zh-min-nan",
        "zh-xiang",
    )

    # The list of grandfathered tags which are valid Unicode BCP 47 locale identifiers.
    unicode_bcp47_grandfathered_tags = {tag for tag in grandfathered_tags
                                        if re_unicode_language_id.match(tag)}

    # Dictionary of simple language subtag mappings, e.g. "in" -> "id".
    language_mappings = {}

    # Dictionary of complex language subtag mappings, modifying more than one
    # subtag, e.g. "sh" -> ("sr", "Latn", None) and "cnr" -> ("sr", None, "ME").
    complex_language_mappings = {}

    # Dictionary of simple region subtag mappings, e.g. "DD" -> "DE".
    region_mappings = {}

    # Dictionary of complex region subtag mappings, containing more than one
    # replacement, e.g. "SU" -> ("RU", ["AM", "AZ", "BY", ...]).
    complex_region_mappings = {}

    # Dictionary of grandfathered mappings to preferred values.
    grandfathered_mappings = {}

    # CLDR uses "_" as the separator for some elements. Replace it with "-".
    def bcp47_id(cldr_id):
        return cldr_id.replace("_", "-")

    # CLDR uses the canonical case for most entries, but there are some
    # exceptions, like:
    #   <languageAlias type="drw" replacement="fa_af" reason="deprecated"/>
    # Therefore canonicalize all tags to be on the safe side.
    def bcp47_canonical(language, script, region):
        # Canonical case for language subtags is lower case.
        # Canonical case for script subtags is title case.
        # Canonical case for region subtags is upper case.
        return (language.lower() if language else None,
                script.title() if script else None,
                region.upper() if region else None)
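
    # For illustration (pure string casing, independent of the CLDR data):
    #   bcp47_canonical("sr", "latn", "me") -> ("sr", "Latn", "ME")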

    tree = ET.parse(core_file.open("common/supplemental/supplementalMetadata.xml"))

    for language_alias in tree.iterfind(".//languageAlias"):
        type = bcp47_id(language_alias.get("type"))
        replacement = bcp47_id(language_alias.get("replacement"))

        # Handle grandfathered mappings first.
        if type in unicode_bcp47_grandfathered_tags:
            grandfathered_mappings[type] = replacement
            continue

        # We're only interested in language subtag matches, so ignore any
        # entries which have additional subtags.
        if re_unicode_language_subtag.match(type) is None:
            continue

        if re_unicode_language_subtag.match(replacement) is not None:
            # Canonical case for language subtags is lower-case.
            language_mappings[type] = replacement.lower()
        else:
            replacement_match = re_unicode_language_id.match(replacement)
            assert replacement_match is not None, (
                "{} invalid Unicode BCP 47 locale identifier".format(replacement))
            assert replacement_match.group("variants") is None, (
                "{}: unexpected variant subtags in {}".format(type, replacement))

            complex_language_mappings[type] = bcp47_canonical(replacement_match.group("language"),
                                                              replacement_match.group("script"),
                                                              replacement_match.group("region"))

    for territory_alias in tree.iterfind(".//territoryAlias"):
        type = territory_alias.get("type")
        replacement = territory_alias.get("replacement")

        # We're only interested in region subtag matches, so ignore any entries
        # which contain legacy formats, e.g. three letter region codes.
        if re_unicode_region_subtag.match(type) is None:
            continue

        if re_unicode_region_subtag.match(replacement) is not None:
            # Canonical case for region subtags is upper-case.
            region_mappings[type] = replacement.upper()
        else:
            # Canonical case for region subtags is upper-case.
            replacements = [r.upper() for r in replacement.split(" ")]
            assert all(
                re_unicode_region_subtag.match(loc) is not None for loc in replacements
            ), "{} invalid region subtags".format(replacement)
            complex_region_mappings[type] = replacements

    tree = ET.parse(core_file.open("common/supplemental/likelySubtags.xml"))

    likely_subtags = {}

    for likely_subtag in tree.iterfind(".//likelySubtag"):
        from_tag = bcp47_id(likely_subtag.get("from"))
        from_match = re_unicode_language_id.match(from_tag)
        assert from_match is not None, (
            "{} invalid Unicode BCP 47 locale identifier".format(from_tag))
        assert from_match.group("variants") is None, (
            "unexpected variant subtags in {}".format(from_tag))

        to_tag = bcp47_id(likely_subtag.get("to"))
        to_match = re_unicode_language_id.match(to_tag)
        assert to_match is not None, (
            "{} invalid Unicode BCP 47 locale identifier".format(to_tag))
        assert to_match.group("variants") is None, (
            "unexpected variant subtags in {}".format(to_tag))

        from_canonical = bcp47_canonical(from_match.group("language"),
                                         from_match.group("script"),
                                         from_match.group("region"))

        to_canonical = bcp47_canonical(to_match.group("language"),
                                       to_match.group("script"),
                                       to_match.group("region"))

        likely_subtags[from_canonical] = to_canonical

    complex_region_mappings_final = {}

    for (deprecated_region, replacements) in complex_region_mappings.items():
        # Find all likely subtag entries which don't already contain a region
        # subtag and whose target region is in the list of replacement regions.
        region_likely_subtags = [(from_language, from_script, to_region)
                                 for ((from_language, from_script, from_region),
                                      (_, _, to_region)) in likely_subtags.items()
                                 if from_region is None and to_region in replacements]

        # The first replacement entry is the default region.
        default = replacements[0]

        # Find all likely subtag entries whose region matches the default region.
        default_replacements = {(language, script)
                                for (language, script, region) in region_likely_subtags
                                if region == default}

        # And finally find those entries which don't use the default region.
        # These are the entries we're actually interested in, because those need
        # to be handled specially when selecting the correct preferred region.
        non_default_replacements = [(language, script, region)
                                    for (language, script, region) in region_likely_subtags
                                    if (language, script) not in default_replacements]

        # If there are no non-default replacements, we can handle the region as
        # part of the simple region mapping.
        if non_default_replacements:
            complex_region_mappings_final[deprecated_region] = (default, non_default_replacements)
        else:
            region_mappings[deprecated_region] = default

    return {"grandfatheredMappings": grandfathered_mappings,
            "languageMappings": language_mappings,
            "complexLanguageMappings": complex_language_mappings,
            "regionMappings": region_mappings,
            "complexRegionMappings": complex_region_mappings_final,
            "likelySubtags": likely_subtags,
            }


def readUnicodeExtensions(core_file):
    import xml.etree.ElementTree as ET

    # Match all xml-files in the BCP 47 directory.
    bcpFileRE = re.compile(r"^common/bcp47/.+\.xml$")

    #
    # type = alphanum{3,8} (sep alphanum{3,8})* ;
    typeRE = re.compile(r"^[a-z0-9]{3,8}(-[a-z0-9]{3,8})*$")

    # Mapping from Unicode extension types to dict of deprecated to
    # preferred values.
    mapping = {}

    def readBCP47File(file):
        tree = ET.parse(file)
        for keyword in tree.iterfind(".//keyword/key"):
            # Skip over keywords whose extension is not "u".
            if keyword.get("extension", "u") != "u":
                continue

            extension_name = keyword.get("name")

            for type in keyword.iterfind("type"):
                #
                # The key or type name used by Unicode locale extension with 'u' extension
                # syntax or the 't' extensions syntax. When alias below is absent, this name
                # can be also used with the old style "@key=type" syntax.
                name = type.get("name")

                # Ignore the special names:
                if name in ("CODEPOINTS", "REORDER_CODE", "RG_KEY_VALUE", "SUBDIVISION_CODE",
                            "PRIVATE_USE"):
                    continue

                # All other names should match the 'type' production.
                assert typeRE.match(name) is not None, (
                    "{} matches the 'type' production".format(name))

                #
                # The preferred value of the deprecated key, type or attribute element.
                # When a key, type or attribute element is deprecated, this attribute is
                # used for specifying a new canonical form if available.
                preferred = type.get("preferred")

                #
                # The BCP 47 form is the canonical form, and recommended. Other aliases are
                # included only for backwards compatibility.
                alias = type.get("alias")

                #
                # Use the bcp47 data to replace keys, types, tfields, and tvalues by their
                # canonical forms. See Section 3.6.4 U Extension Data Files) and Section
                # 3.7.1 T Extension Data Files. The aliases are in the alias attribute
                # value, while the canonical is in the name attribute value.

                # 'preferred' contains the new preferred name, 'alias' the compatibility
                # name, but then there's this entry where 'preferred' and 'alias' are the
                # same. So which one to choose? Assume 'preferred' is the actual canonical
                # name.
                #
                #   <type name="islamicc"
                #         description="Civil (algorithmic) Arabic calendar"
                #         deprecated="true"
                #         preferred="islamic-civil"
                #         alias="islamic-civil"/>

                if preferred is not None:
                    assert typeRE.match(preferred), preferred
                    mapping.setdefault(extension_name, {})[name] = preferred

                if alias is not None:
                    for alias_name in alias.lower().split(" "):
                        # Ignore alias entries which don't match the 'type' production.
                        if typeRE.match(alias_name) is None:
                            continue

                        # See comment above when 'alias' and 'preferred' are both present.
                        if (preferred is not None and
                                name in mapping[extension_name]):
                            continue

                        # Skip over entries where 'name' and 'alias' are equal.
                        #
                        #   <type name="pst8pdt"
                        #         description="POSIX style time zone for US Pacific Time"
                        #         alias="PST8PDT"
                        #         since="1.8"/>
                        if name == alias_name:
                            continue

                        mapping.setdefault(extension_name, {})[alias_name] = name

    def readSupplementalMetadata(file):
        # Find subdivision and region replacements.
        #
        # Replace aliases in special key values:
        #   - If there is an 'sd' or 'rg' key, replace any subdivision alias
        #     in its value in the same way, using subdivisionAlias data.
        tree = ET.parse(file)
        for alias in tree.iterfind(".//subdivisionAlias"):
            type = alias.get("type")
            assert typeRE.match(type) is not None, (
                "{} matches the 'type' production".format(type))

            # Take the first replacement when multiple ones are present.
            replacement = alias.get("replacement").split(" ")[0].lower()

            # Skip over invalid replacements.
            #
            #   <subdivisionAlias type="fi01" replacement="AX" reason="overlong"/>
            #
            # It's not entirely clear to me if CLDR actually wants to use
            # "axzzzz" as the replacement for this case.
            if typeRE.match(replacement) is None:
                continue

            # 'subdivisionAlias' applies to 'rg' and 'sd' keys.
            mapping.setdefault("rg", {})[type] = replacement
            mapping.setdefault("sd", {})[type] = replacement

    for name in core_file.namelist():
        if bcpFileRE.match(name):
            readBCP47File(core_file.open(name))

    readSupplementalMetadata(core_file.open("common/supplemental/supplementalMetadata.xml"))

    return mapping
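

# The returned mapping is keyed by extension key, then by deprecated value,
# e.g. (illustrative, using the "islamicc" entry quoted in the comments above;
# "ca" is the calendar key):
#   { "ca": { "islamicc": "islamic-civil", ... }, ... }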


def writeCLDRLanguageTagData(println, data, url):
    """ Writes the language tag data to the Intl data file. """

    println(generatedFileWarning)
    println(u"// Version: CLDR-{}".format(data["version"]))
    println(u"// URL: {}".format(url))

    println(u"""
#include "mozilla/Assertions.h"
#include "mozilla/Span.h"
#include "mozilla/TextUtils.h"

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <type_traits>

#include "builtin/intl/LanguageTag.h"
#include "util/Text.h"
#include "vm/JSContext.h"

using namespace js::intl::LanguageTagLimits;

template <size_t Length, size_t TagLength, size_t SubtagLength>
static inline bool HasReplacement(
    const char (&subtags)[Length][TagLength],
    const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
  MOZ_ASSERT(subtag.length() == TagLength - 1,
             "subtag must have the same length as the list of subtags");

  const char* ptr = subtag.span().data();
  return std::binary_search(std::begin(subtags), std::end(subtags), ptr,
                            [](const char* a, const char* b) {
                              return memcmp(a, b, TagLength - 1) < 0;
                            });
}

template <size_t Length, size_t TagLength, size_t SubtagLength>
static inline const char* SearchReplacement(
    const char (&subtags)[Length][TagLength],
    const char* (&aliases)[Length],
    const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
  MOZ_ASSERT(subtag.length() == TagLength - 1,
             "subtag must have the same length as the list of subtags");

  const char* ptr = subtag.span().data();
  auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr,
                            [](const char* a, const char* b) {
                              return memcmp(a, b, TagLength - 1) < 0;
                            });
  if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) {
    return aliases[std::distance(std::begin(subtags), p)];
  }
  return nullptr;
}

#ifdef DEBUG
static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) {
  // Tell the analysis the |std::all_of| function can't GC.
  JS::AutoSuppressGCAnalysis nogc;

  return std::all_of(span.begin(), span.end(), mozilla::IsAsciiLowercaseAlpha<char>);
}

static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) {
  // Tell the analysis the |std::all_of| function can't GC.
  JS::AutoSuppressGCAnalysis nogc;

  return std::all_of(span.begin(), span.end(), mozilla::IsAsciiUppercaseAlpha<char>) ||
         std::all_of(span.begin(), span.end(), mozilla::IsAsciiDigit<char>);
}

static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) {
  auto isAsciiLowercaseAlphaOrDigit = [](char c) {
    return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
  };

  // Tell the analysis the |std::all_of| function can't GC.
  JS::AutoSuppressGCAnalysis nogc;

  return std::all_of(span.begin(), span.end(), isAsciiLowercaseAlphaOrDigit);
}
#endif
""".rstrip())

    source = u"CLDR Supplemental Data, version {}".format(data["version"])
    grandfathered_mappings = data["grandfatheredMappings"]
    language_mappings = data["languageMappings"]
    complex_language_mappings = data["complexLanguageMappings"]
    region_mappings = data["regionMappings"]
    complex_region_mappings = data["complexRegionMappings"]
    unicode_mappings = data["unicodeMappings"]

    # unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
    language_maxlength = 8

    # unicode_region_subtag = (alpha{2} | digit{3}) ;
    region_maxlength = 3

    writeMappingsBinarySearch(println, "languageMapping",
                              "LanguageSubtag&", "language",
                              "IsStructurallyValidLanguageTag",
                              "IsCanonicallyCasedLanguageTag",
                              language_mappings, language_maxlength,
                              "Mappings from language subtags to preferred values.", source, url)
    writeMappingsBinarySearch(println, "complexLanguageMapping",
                              "const LanguageSubtag&", "language",
                              "IsStructurallyValidLanguageTag",
                              "IsCanonicallyCasedLanguageTag",
                              complex_language_mappings.keys(), language_maxlength,
                              "Language subtags with complex mappings.", source, url)
    writeMappingsBinarySearch(println, "regionMapping",
                              "RegionSubtag&", "region",
                              "IsStructurallyValidRegionTag",
                              "IsCanonicallyCasedRegionTag",
                              region_mappings, region_maxlength,
                              "Mappings from region subtags to preferred values.", source, url)
    writeMappingsBinarySearch(println, "complexRegionMapping",
                              "const RegionSubtag&", "region",
                              "IsStructurallyValidRegionTag",
                              "IsCanonicallyCasedRegionTag",
                              complex_region_mappings.keys(), region_maxlength,
                              "Region subtags with complex mappings.", source, url)

    writeComplexLanguageTagMappings(println, complex_language_mappings,
                                    "Language subtags with complex mappings.", source, url)
    writeComplexRegionTagMappings(println, complex_region_mappings,
                                  "Region subtags with complex mappings.", source, url)

    writeGrandfatheredMappingsFunction(println, grandfathered_mappings,
                                       "Canonicalize grandfathered locale identifiers.", source,
                                       url)

    writeUnicodeExtensionsMappings(println, unicode_mappings)


def writeCLDRLanguageTagLikelySubtagsTest(println, data, url):
    """ Writes the likely-subtags test file. """

    println(generatedFileWarning)

    source = u"CLDR Supplemental Data, version {}".format(data["version"])
    language_mappings = data["languageMappings"]
    complex_language_mappings = data["complexLanguageMappings"]
    region_mappings = data["regionMappings"]
    complex_region_mappings = data["complexRegionMappings"]
    likely_subtags = data["likelySubtags"]

    def bcp47(tag):
        (language, script, region) = tag
        return "{}{}{}".format(language,
                               "-" + script if script else "",
                               "-" + region if region else "")

    def canonical(tag):
        (language, script, region) = tag

        # Map deprecated language subtags.
        if language in language_mappings:
            language = language_mappings[language]
        elif language in complex_language_mappings:
            (language2, script2, region2) = complex_language_mappings[language]
            (language, script, region) = (language2,
                                          script if script else script2,
                                          region if region else region2)

        # Map deprecated region subtags.
        if region in region_mappings:
            region = region_mappings[region]
        else:
            # Assume no complex region mappings are needed for now.
            assert region not in complex_region_mappings,\
                "unexpected region with complex mappings: {}".format(region)

        return (language, script, region)

    def addLikelySubtags(tag):
        # Step 1: Canonicalize.
        (language, script, region) = canonical(tag)
        if script == "Zzzz":
            script = None
        if region == "ZZ":
            region = None

        # Step 2: Lookup.
        searches = ((language, script, region),
                    (language, None, region),
                    (language, script, None),
                    (language, None, None),
                    ("und", script, None))
        search = next(search for search in searches if search in likely_subtags)

        (language_s, script_s, region_s) = search
        (language_m, script_m, region_m) = likely_subtags[search]

        # Step 3: Return.
        return (language if language != language_s else language_m,
                script if script != script_s else script_m,
                region if region != region_s else region_m)

    def removeLikelySubtags(tag):
        # Step 1: Add likely subtags.
        max = addLikelySubtags(tag)

        # Step 2: Remove variants (doesn't apply here).

        # Step 3: Find a match.
        (language, script, region) = max
        for trial in ((language, None, None), (language, None, region), (language, script, None)):
            if addLikelySubtags(trial) == max:
                return trial

        # Step 4: Return maximized if no match found.
        return max
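
    # For illustration (the exact results depend on the CLDR data in
    # |likely_subtags|; with current CLDR the canonical example is):
    #   addLikelySubtags(("en", None, None))      -> ("en", "Latn", "US")
    #   removeLikelySubtags(("en", "Latn", "US")) -> ("en", None, None)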

    def likely_canonical(from_tag, to_tag):
        # Canonicalize the input tag.
        from_tag = canonical(from_tag)

        # Update the expected result if necessary.
        if from_tag in likely_subtags:
            to_tag = likely_subtags[from_tag]

        # Canonicalize the expected output.
        to_canonical = canonical(to_tag)

        # Sanity check: This should match the result of |addLikelySubtags|.
        assert to_canonical == addLikelySubtags(from_tag)

        return to_canonical

    # |likely_subtags| contains non-canonicalized tags, so canonicalize it first.
    likely_subtags_canonical = {k: likely_canonical(k, v) for (k, v) in likely_subtags.items()}

    # Add test data for |Intl.Locale.prototype.maximize()|.
    writeMappingsVar(println, {bcp47(k): bcp47(v) for (k, v) in likely_subtags_canonical.items()},
                     "maxLikelySubtags", "Extracted from likelySubtags.xml.", source, url)

    # Use the maximized tags as the input for the remove likely-subtags test.
    minimized = {tag: removeLikelySubtags(tag) for tag in likely_subtags_canonical.values()}

    # Add test data for |Intl.Locale.prototype.minimize()|.
    writeMappingsVar(println, {bcp47(k): bcp47(v) for (k, v) in minimized.items()},
                     "minLikelySubtags", "Extracted from likelySubtags.xml.", source, url)

    println(u"""
for (let [tag, maximal] of Object.entries(maxLikelySubtags)) {
    assertEq(new Intl.Locale(tag).maximize().toString(), maximal);
}""")

    println(u"""
for (let [tag, minimal] of Object.entries(minLikelySubtags)) {
    assertEq(new Intl.Locale(tag).minimize().toString(), minimal);
}""")

    println(u"""
if (typeof reportCompare === "function")
    reportCompare(0, 0);""")
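

# The generated test file then looks roughly like this (illustrative shape;
# the actual entries come from likelySubtags.xml):
#
#   // Generated by make_intl_data.py. DO NOT EDIT.
#   var maxLikelySubtags = {
#       "aa": "aa-Latn-ET",
#       ...
#   };
#   for (let [tag, maximal] of Object.entries(maxLikelySubtags)) {
#       assertEq(new Intl.Locale(tag).maximize().toString(), maximal);
#   }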


def updateCLDRLangTags(args):
    """ Update the LanguageTagGenerated.cpp file. """
    version = args.version
    url = args.url
    out = args.out
    filename = args.file

    url = url.replace("<VERSION>", version)

    print("Arguments:")
    print("\tCLDR version: %s" % version)
    print("\tDownload url: %s" % url)
    if filename is not None:
        print("\tLocal CLDR core.zip file: %s" % filename)
    print("\tOutput file: %s" % out)
    print("")

    data = {
        "version": version,
    }

    def readFiles(cldr_file):
        with ZipFile(cldr_file) as zip_file:
            data.update(readSupplementalData(zip_file))
            data["unicodeMappings"] = readUnicodeExtensions(zip_file)

    print("Processing CLDR data...")
    if filename is not None:
        print("Always make sure you have the newest CLDR core.zip!")
        with open(filename, "rb") as cldr_file:
            readFiles(cldr_file)
    else:
        print("Downloading CLDR core.zip...")
        with closing(urlopen(url)) as cldr_file:
            cldr_data = io.BytesIO(cldr_file.read())
            readFiles(cldr_data)

    print("Writing Intl data...")
    with io.open(out, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        writeCLDRLanguageTagData(println, data, url)

    print("Writing Intl test data...")
    test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "../../tests/non262/Intl/Locale/likely-subtags-generated.js")
    with io.open(test_file, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        println(u"// |reftest| skip-if(!this.hasOwnProperty('Intl')||"
                u"(!this.Intl.Locale&&!this.hasOwnProperty('addIntlExtras')))")
        writeCLDRLanguageTagLikelySubtagsTest(println, data, url)


def flines(filepath, encoding="utf-8"):
    """ Open filepath and iterate over its content. """
    with io.open(filepath, mode="r", encoding=encoding) as f:
        for line in f:
            yield line


@total_ordering
class Zone(object):
    """ Time zone with optional file name. """

    def __init__(self, name, filename=""):
        self.name = name
        self.filename = filename

    def __eq__(self, other):
        return hasattr(other, "name") and self.name == other.name

    def __lt__(self, other):
        return self.name < other.name

    def __hash__(self):
        return hash(self.name)

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name
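

# Note: Zone equality and hashing consider only |name|, so e.g.
#   Zone("America/New_York") == Zone("America/New_York", "northamerica")
# holds. This lets the same zone compare equal regardless of which tzdata
# file it was found in ("northamerica" is just an illustrative file name).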


class TzDataDir(object):
    """ tzdata source from a directory. """

    def __init__(self, obj):
        self.name = partial(os.path.basename, obj)
        self.resolve = partial(os.path.join, obj)
        self.basename = os.path.basename
        self.isfile = os.path.isfile
        self.listdir = partial(os.listdir, obj)
        self.readlines = flines


class TzDataFile(object):
    """ tzdata source from a file (tar or gzipped). """

    def __init__(self, obj):
        self.name = lambda: os.path.splitext(os.path.splitext(os.path.basename(obj))[0])[0]
        self.resolve = obj.getmember
        self.basename = attrgetter("name")
        self.isfile = tarfile.TarInfo.isfile
        self.listdir = obj.getnames
        self.readlines = partial(self._tarlines, obj)

    def _tarlines(self, tar, m):
        with closing(tar.extractfile(m)) as f:
            for line in f:
                yield line.decode("utf-8")


def validateTimeZones(zones, links):
    """ Validate the zone and link entries. """
    linkZones = set(links.keys())
    intersect = linkZones.intersection(zones)
    if intersect:
        raise RuntimeError("Links also present in zones: %s" % intersect)

    zoneNames = {z.name for z in zones}
    linkTargets = set(links.values())
    if not linkTargets.issubset(zoneNames):
        raise RuntimeError("Link targets not found: %s" % linkTargets.difference(zoneNames))


def partition(iterable, *predicates):
    def innerPartition(pred, it):
        it1, it2 = tee(it)
        return (filter(pred, it1), filterfalse(pred, it2))
    if len(predicates) == 0:
        return iterable
    (left, right) = innerPartition(predicates[0], iterable)
    if len(predicates) == 1:
        return (left, right)
    return tuple([left] + list(partition(right, *predicates[1:])))
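

# For illustration (hypothetical call): partition splits an iterable into
# len(predicates) + 1 lazy groups, consumed in order, e.g.
#   (small, medium, rest) = partition(range(10), lambda x: x < 3, lambda x: x < 6)
#   list(small)   -> [0, 1, 2]
#   list(medium)  -> [3, 4, 5]
#   list(rest)    -> [6, 7, 8, 9]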


def listIANAFiles(tzdataDir):
    def isTzFile(d, m, f):
        return m(f) and d.isfile(d.resolve(f))
    return filter(partial(isTzFile, tzdataDir, re.compile("^[a-z0-9]+$").match),
                  tzdataDir.listdir())


def readIANAFiles(tzdataDir, files):
    """ Read all IANA time zone files from the given iterable. """
    nameSyntax = r"[\w/+\-]+"
    pZone = re.compile(r"Zone\s+(?P<name>%s)\s+.*" % nameSyntax)
    pLink = re.compile(r"Link\s+(?P<target>%s)\s+(?P<name>%s)(?:\s+#.*)?" %
                       (nameSyntax, nameSyntax))

    def createZone(line, fname):
        match = pZone.match(line)
        name = match.group("name")
        return Zone(name, fname)

    def createLink(line, fname):
        match = pLink.match(line)
        (name, target) = match.group("name", "target")
        return (Zone(name, fname), target)

    zones = set()
    links = dict()
    for filename in files:
        filepath = tzdataDir.resolve(filename)
        for line in tzdataDir.readlines(filepath):
            if line.startswith("Zone"):
                zones.add(createZone(line, filename))
            if line.startswith("Link"):
                (link, target) = createLink(line, filename)
                links[link] = target

    return (zones, links)
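

# The tzdata source lines matched by pZone and pLink look roughly like this
# (illustrative excerpt, not actual file contents):
#   Zone America/New_York  -5:00  US  E%sT
#   Link America/New_York  US/Eastern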


def readIANATimeZones(tzdataDir, ignoreBackzone, ignoreFactory):
    """ Read the IANA time zone information from `tzdataDir`. """

    backzoneFiles = {"backzone"}
    (bkfiles, tzfiles) = partition(listIANAFiles(tzdataDir), backzoneFiles.__contains__)

    # Read zone and link infos.
    (zones, links) = readIANAFiles(tzdataDir, tzfiles)
    (backzones, backlinks) = readIANAFiles(tzdataDir, bkfiles)

    # Remove the placeholder time zone "Factory".
    if ignoreFactory:
        zones.remove(Zone("Factory"))

    # Merge with backzone data.
    if not ignoreBackzone:
        zones |= backzones
        links = {name: target for name, target in links.items() if name not in backzones}
        links.update(backlinks)

    validateTimeZones(zones, links)

    return (zones, links)


def readICUResourceFile(filename):
    """ Read an ICU resource file.

        Yields (<table-name>, <value>) for each table.
    """

    numberValue = r"-?\d+"
    stringValue = r'".+?"'

    def asVector(val): return r"%s(?:\s*,\s*%s)*" % (val, val)
    numberVector = asVector(numberValue)
    stringVector = asVector(stringValue)

    reNumberVector = re.compile(numberVector)
    reStringVector = re.compile(stringVector)
    reNumberValue = re.compile(numberValue)
    reStringValue = re.compile(stringValue)

    def parseValue(value):
        m = reNumberVector.match(value)
        if m:
            return [int(v) for v in reNumberValue.findall(value)]
        m = reStringVector.match(value)
        if m:
            return [v[1:-1] for v in reStringValue.findall(value)]
        raise RuntimeError("unknown value type: %s" % value)

    def extractValue(values):
        if len(values) == 0:
            return None
        if len(values) == 1:
            return values[0]
        return values

    def line(*args):
        maybeMultiComments = r"(?:/\*[^*]*\*/)*"
        maybeSingleComment = r"(?://.*)?"
        lineStart = "^%s" % maybeMultiComments
        lineEnd = r"%s\s*%s$" % (maybeMultiComments, maybeSingleComment)
        return re.compile(r"\s*".join(chain([lineStart], args, [lineEnd])))

    tableName = r'(?P<quote>"?)(?P<name>.+?)(?P=quote)'
    tableValue = r"(?P<value>%s|%s)" % (numberVector, stringVector)

    reStartTable = line(tableName, r"\{")
    reEndTable = line(r"\}")
    reSingleValue = line(r",?", tableValue, r",?")
    reCompactTable = line(tableName, r"\{", tableValue, r"\}")
    reEmptyLine = line()

    tables = []

    def currentTable(): return "|".join(tables)
    values = []
    for line in flines(filename, "utf-8-sig"):
        line = line.strip()
        if line == "":
            continue

        m = reEmptyLine.match(line)
        if m:
            continue

        m = reStartTable.match(line)
        if m:
            assert len(values) == 0
            tables.append(m.group("name"))
            continue

        m = reEndTable.match(line)
        if m:
            yield (currentTable(), extractValue(values))
            tables.pop()
            values = []
            continue

        m = reCompactTable.match(line)
        if m:
            assert len(values) == 0
            tables.append(m.group("name"))
            yield (currentTable(), extractValue(parseValue(m.group("value"))))
            tables.pop()
            continue

        m = reSingleValue.match(line)
        if m and tables:
            values.extend(parseValue(m.group("value")))
            continue

        raise RuntimeError("unknown entry: %s" % line)
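

# The ICU resource files parsed above use a nested table syntax; an
# illustrative fragment (not a verbatim excerpt):
#   timezoneTypes:table(nofallback){
#       typeAlias{
#           timezone{
#               "Africa:Asmara"{"Africa/Asmera"}
#           }
#       }
#   }
# For the innermost entry, readICUResourceFile yields
#   ("timezoneTypes:table(nofallback)|typeAlias|timezone|Africa:Asmara",
#    "Africa/Asmera")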


def readICUTimeZonesFromTimezoneTypes(icuTzDir):
    """ Read the ICU time zone information from `icuTzDir`/timezoneTypes.txt
        and returns the tuple (zones, links).
    """
    typeMapTimeZoneKey = "timezoneTypes:table(nofallback)|typeMap|timezone|"
    typeAliasTimeZoneKey = "timezoneTypes:table(nofallback)|typeAlias|timezone|"

    def toTimeZone(name): return Zone(name.replace(":", "/"))

    zones = set()
    links = dict()

    for name, value in readICUResourceFile(os.path.join(icuTzDir, "timezoneTypes.txt")):
        if name.startswith(typeMapTimeZoneKey):
            zones.add(toTimeZone(name[len(typeMapTimeZoneKey):]))
        if name.startswith(typeAliasTimeZoneKey):
            links[toTimeZone(name[len(typeAliasTimeZoneKey):])] = value

    # Remove the ICU placeholder time zone "Etc/Unknown".
    zones.remove(Zone("Etc/Unknown"))

    # tzdata2017c removed the link Canada/East-Saskatchewan -> America/Regina,
    # but it is still present in ICU sources. Manually remove it to keep our
    # tables consistent with IANA.
    del links[Zone("Canada/East-Saskatchewan")]

    validateTimeZones(zones, links)

    return (zones, links)


def readICUTimeZonesFromZoneInfo(icuTzDir, ignoreFactory):
    """ Read the ICU time zone information from `icuTzDir`/zoneinfo64.txt
        and returns the tuple (zones, links).
    """
    zoneKey = "zoneinfo64:table(nofallback)|Zones:array|:table"
    linkKey = "zoneinfo64:table(nofallback)|Zones:array|:int"
    namesKey = "zoneinfo64:table(nofallback)|Names"

    tzId = 0
    tzLinks = dict()
    tzNames = []

    for name, value in readICUResourceFile(os.path.join(icuTzDir, "zoneinfo64.txt")):
        if name == zoneKey:
            tzId += 1
        elif name == linkKey:
            tzLinks[tzId] = int(value)
            tzId += 1
        elif name == namesKey:
            tzNames.extend(value)

    links = {Zone(tzNames[zone]): tzNames[target] for (zone, target) in tzLinks.items()}
    zones = {Zone(v) for v in tzNames if Zone(v) not in links}

    # Remove the ICU placeholder time zone "Etc/Unknown".
    zones.remove(Zone("Etc/Unknown"))

    # tzdata2017c removed the link Canada/East-Saskatchewan -> America/Regina,
    # but it is still present in ICU sources. Manually remove it to keep our
    # tables consistent with IANA.
    del links[Zone("Canada/East-Saskatchewan")]

    # Remove the placeholder time zone "Factory".
    if ignoreFactory:
        zones.remove(Zone("Factory"))

    validateTimeZones(zones, links)

    return (zones, links)


def readICUTimeZones(icuDir, icuTzDir, ignoreFactory):
    # zoneinfo64.txt contains the time zones supported by ICU. This data is
    # generated from tzdata files; it doesn't include "backzone" in stock ICU.
    (zoneinfoZones, zoneinfoLinks) = readICUTimeZonesFromZoneInfo(icuTzDir, ignoreFactory)

    # timezoneTypes.txt contains the canonicalization information for ICU. This
    # data is generated from CLDR files. It includes data about time zones from
    # tzdata's "backzone" file.
    (typesZones, typesLinks) = readICUTimeZonesFromTimezoneTypes(icuTzDir)

    # Information in zoneinfo64 should be a superset of timezoneTypes.
    def inZoneInfo64(zone): return zone in zoneinfoZones or zone in zoneinfoLinks

    # Remove legacy ICU time zones from zoneinfo64 data.
    (legacyZones, legacyLinks) = readICULegacyZones(icuDir)
    zoneinfoZones = {zone for zone in zoneinfoZones if zone not in legacyZones}
    zoneinfoLinks = {zone: target for (zone, target) in zoneinfoLinks.items()
                     if zone not in legacyLinks}

    notFoundInZoneInfo64 = [zone for zone in typesZones if not inZoneInfo64(zone)]
    if notFoundInZoneInfo64:
        raise RuntimeError("Missing time zones in zoneinfo64.txt: %s" % notFoundInZoneInfo64)

    notFoundInZoneInfo64 = [zone for zone in typesLinks.keys() if not inZoneInfo64(zone)]
    if notFoundInZoneInfo64:
        raise RuntimeError("Missing time zones in zoneinfo64.txt: %s" % notFoundInZoneInfo64)

    # zoneinfo64.txt only defines the time zones supported by ICU; the
    # canonicalization rules are defined through timezoneTypes.txt. Merge both
    # to get the actual zones and links used by ICU.
    icuZones = set(chain(
        (zone for zone in zoneinfoZones if zone not in typesLinks),
        (zone for zone in typesZones)
    ))
    icuLinks = dict(chain(
        ((zone, target) for (zone, target) in zoneinfoLinks.items() if zone not in typesZones),
        ((zone, target) for (zone, target) in typesLinks.items())
    ))

    return (icuZones, icuLinks)


def readICULegacyZones(icuDir):
    """ Read the ICU legacy time zones from `icuDir`/tools/tzcode/icuzones
        and returns the tuple (zones, links).
    """
    tzdir = TzDataDir(os.path.join(icuDir, "tools/tzcode"))
    (zones, links) = readIANAFiles(tzdir, ["icuzones"])

    # Remove the ICU placeholder time zone "Etc/Unknown".
    zones.remove(Zone("Etc/Unknown"))

    # tzdata2017c removed the link Canada/East-Saskatchewan -> America/Regina,
    # but it is still present in ICU sources. Manually tag it as a legacy time
    # zone so our tables are kept consistent with IANA.
    links[Zone("Canada/East-Saskatchewan")] = "America/Regina"

    return (zones, links)


def icuTzDataVersion(icuTzDir):
    """ Read the ICU time zone version from `icuTzDir`/zoneinfo64.txt. """
    def searchInFile(pattern, f):
        p = re.compile(pattern)
        for line in flines(f, "utf-8-sig"):
            m = p.search(line)
            if m:
                return m.group(1)
        return None

    zoneinfo = os.path.join(icuTzDir, "zoneinfo64.txt")
    if not os.path.isfile(zoneinfo):
        raise RuntimeError("file not found: %s" % zoneinfo)
    version = searchInFile(r"^//\s+tz version:\s+([0-9]{4}[a-z])$", zoneinfo)
    if version is None:
        raise RuntimeError("%s does not contain a valid tzdata version string" % zoneinfo)
    return version


def findIncorrectICUZones(ianaZones, ianaLinks, icuZones, icuLinks, ignoreBackzone):
    """ Find incorrect ICU zone entries. """
    def isIANATimeZone(zone): return zone in ianaZones or zone in ianaLinks

    def isICUTimeZone(zone): return zone in icuZones or zone in icuLinks

    def isICULink(zone): return zone in icuLinks

    # All IANA zones should be present in ICU.
    missingTimeZones = [zone for zone in ianaZones if not isICUTimeZone(zone)]
    # Normally zones in backzone are also present as links in one of the other
    # time zone files. The only exception to this rule is the Asia/Hanoi time
    # zone, which is only present in the backzone file.
    expectedMissing = [] if ignoreBackzone else [Zone("Asia/Hanoi")]
    if missingTimeZones != expectedMissing:
        raise RuntimeError("Not all zones are present in ICU, did you forget "
                           "to run intl/update-tzdata.sh? %s" % missingTimeZones)

    # Zones which are only present in ICU?
    additionalTimeZones = [zone for zone in icuZones if not isIANATimeZone(zone)]
    if additionalTimeZones:
        raise RuntimeError("Additional zones present in ICU, did you forget "
                           "to run intl/update-tzdata.sh? %s" % additionalTimeZones)

    # Zones which are marked as links in ICU.
    result = ((zone, icuLinks[zone]) for zone in ianaZones if isICULink(zone))

    # Remove unnecessary UTC mappings.
    utcnames = ["Etc/UTC", "Etc/UCT", "Etc/GMT"]
    result = ((zone, target) for (zone, target) in result if zone.name not in utcnames)

    return sorted(result, key=itemgetter(0))
1632
def findIncorrectICULinks(ianaZones, ianaLinks, icuZones, icuLinks):
    """ Find incorrect ICU link entries. """
    def isIANATimeZone(zone):
        return zone in ianaZones or zone in ianaLinks

    def isICUTimeZone(zone):
        return zone in icuZones or zone in icuLinks

    def isICULink(zone):
        return zone in icuLinks

    def isICUZone(zone):
        return zone in icuZones

    # All links should be present in ICU.
    missingTimeZones = [zone for zone in ianaLinks.keys() if not isICUTimeZone(zone)]
    if missingTimeZones:
        raise RuntimeError("Not all links are present in ICU, did you forget "
                           "to run intl/update-tzdata.sh? %s" % missingTimeZones)

    # Links which are only present in ICU?
    additionalTimeZones = [zone for zone in icuLinks.keys() if not isIANATimeZone(zone)]
    if additionalTimeZones:
        raise RuntimeError("Additional links present in ICU, did you forget "
                           "to run intl/update-tzdata.sh? %s" % additionalTimeZones)

    result = chain(
        # IANA links which have a different target in ICU.
        ((zone, target, icuLinks[zone]) for (zone, target) in ianaLinks.items()
         if isICULink(zone) and target != icuLinks[zone]),

        # IANA links which are zones in ICU.
        ((zone, target, zone.name) for (zone, target) in ianaLinks.items() if isICUZone(zone))
    )

    # Remove unnecessary UTC mappings.
    utcnames = ["Etc/UTC", "Etc/UCT", "Etc/GMT"]
    result = ((zone, target, icuTarget)
              for (zone, target, icuTarget) in result
              if target not in utcnames or icuTarget not in utcnames)

    return sorted(result, key=itemgetter(0))


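# Each entry of the returned list is a triple (link name, IANA target,
# ICU target) describing an IANA link which ICU either resolves to a
# different target or treats as a plain zone.

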
generatedFileWarning = u"// Generated by make_intl_data.py. DO NOT EDIT."
tzdataVersionComment = u"// tzdata version = {0}"


def processTimeZones(tzdataDir, icuDir, icuTzDir, version, ignoreBackzone, ignoreFactory, out):
    """ Read the time zone info and create a new time zone cpp file. """
    print("Processing tzdata mapping...")
    (ianaZones, ianaLinks) = readIANATimeZones(tzdataDir, ignoreBackzone, ignoreFactory)
    (icuZones, icuLinks) = readICUTimeZones(icuDir, icuTzDir, ignoreFactory)
    (legacyZones, legacyLinks) = readICULegacyZones(icuDir)

    incorrectZones = findIncorrectICUZones(
        ianaZones, ianaLinks, icuZones, icuLinks, ignoreBackzone)
    if not incorrectZones:
        print("<<< No incorrect ICU time zones found, please update Intl.js! >>>")
        print("<<< Maybe https://ssl.icu-project.org/trac/ticket/12044 was fixed? >>>")

    incorrectLinks = findIncorrectICULinks(ianaZones, ianaLinks, icuZones, icuLinks)
    if not incorrectLinks:
        print("<<< No incorrect ICU time zone links found, please update Intl.js! >>>")
        print("<<< Maybe https://ssl.icu-project.org/trac/ticket/12044 was fixed? >>>")

    print("Writing Intl tzdata file...")
    with io.open(out, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))
        println(u"")

        println(u"#ifndef builtin_intl_TimeZoneDataGenerated_h")
        println(u"#define builtin_intl_TimeZoneDataGenerated_h")
        println(u"")

        println(u"namespace js {")
        println(u"namespace timezone {")
        println(u"")

        println(u"// Format:")
        println(u'// "ZoneName" // ICU-Name [time zone file]')
        println(u"const char* const ianaZonesTreatedAsLinksByICU[] = {")
        for (zone, icuZone) in incorrectZones:
            println(u'    "%s", // %s [%s]' % (zone, icuZone, zone.filename))
        println(u"};")
        println(u"")

        println(u"// Format:")
        println(u'// "LinkName", "Target" // ICU-Target [time zone file]')
        println(u"struct LinkAndTarget")
        println(u"{")
        println(u"    const char* const link;")
        println(u"    const char* const target;")
        println(u"};")
        println(u"")
        println(u"const LinkAndTarget ianaLinksCanonicalizedDifferentlyByICU[] = {")
        for (zone, target, icuTarget) in incorrectLinks:
            println(u'    { "%s", "%s" }, // %s [%s]' % (zone, target, icuTarget, zone.filename))
        println(u"};")
        println(u"")

        println(u"// Legacy ICU time zones, these are not valid IANA time zone names. We also")
        println(u"// disallow the old and deprecated System V time zones.")
        println(u"// https://ssl.icu-project.org/repos/icu/trunk/icu4c/source/tools/tzcode/icuzones")
        println(u"const char* const legacyICUTimeZones[] = {")
        for zone in chain(sorted(legacyLinks.keys()), sorted(legacyZones)):
            println(u'    "%s",' % zone)
        println(u"};")
        println(u"")

        println(u"} // namespace timezone")
        println(u"} // namespace js")
        println(u"")
        println(u"#endif /* builtin_intl_TimeZoneDataGenerated_h */")


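# The generated header has roughly this shape (abridged, illustrative):
#
#   // Generated by make_intl_data.py. DO NOT EDIT.
#   // tzdata version = 2019c
#   #ifndef builtin_intl_TimeZoneDataGenerated_h
#   #define builtin_intl_TimeZoneDataGenerated_h
#   namespace js { namespace timezone {
#   const char* const ianaZonesTreatedAsLinksByICU[] = { /* ... */ };
#   const LinkAndTarget ianaLinksCanonicalizedDifferentlyByICU[] = { /* ... */ };
#   const char* const legacyICUTimeZones[] = { /* ... */ };
#   } } // namespaces
#   #endif

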
def updateBackzoneLinks(tzdataDir, links):
    def withZone(fn):
        return lambda zone_target: fn(zone_target[0])

    (backzoneZones, backzoneLinks) = readIANAFiles(tzdataDir, ["backzone"])
    (stableZones, updatedLinks, updatedZones) = partition(
        links.items(),
        # Link not changed in backzone.
        withZone(lambda zone: zone not in backzoneLinks and zone not in backzoneZones),
        # Link has a new target.
        withZone(lambda zone: zone in backzoneLinks),
    )
    # Keep stable zones and links with updated target.
    return dict(chain(
        stableZones,
        map(withZone(lambda zone: (zone, backzoneLinks[zone])), updatedLinks)
    ))


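# A toy illustration of the partition above (hypothetical data): given
#
#   links = {Zone("A"): "X", Zone("B"): "Y"}
#
# where backzone redefines the target of A to "Z" and turns B into a proper
# zone, the returned dict is {Zone("A"): "Z"}; B is dropped because it is no
# longer a link.

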
def generateTzDataLinkTestContent(testDir, version, fileName, description, links):
    with io.open(os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        println(u'// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
        println(u"")
        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))
        println(u"""
const tzMapper = [
    x => x,
    x => x.toUpperCase(),
    x => x.toLowerCase(),
];
""")

        println(description)
        println(u"const links = {")
        for (zone, target) in sorted(links, key=itemgetter(0)):
            println(u'    "%s": "%s",' % (zone, target))
        println(u"};")

        println(u"""
for (let [linkName, target] of Object.entries(links)) {
    if (target === "Etc/UTC" || target === "Etc/GMT")
        target = "UTC";

    for (let map of tzMapper) {
        let dtf = new Intl.DateTimeFormat(undefined, {timeZone: map(linkName)});
        let resolvedTimeZone = dtf.resolvedOptions().timeZone;
        assertEq(resolvedTimeZone, target, `${linkName} -> ${target}`);
    }
}
""")
        println(u"""
if (typeof reportCompare === "function")
    reportCompare(0, 0, "ok");
""")


def generateTzDataTestBackwardLinks(tzdataDir, version, ignoreBackzone, testDir):
    (zones, links) = readIANAFiles(tzdataDir, ["backward"])
    assert len(zones) == 0

    if not ignoreBackzone:
        links = updateBackzoneLinks(tzdataDir, links)

    generateTzDataLinkTestContent(
        testDir, version,
        "timeZone_backward_links.js",
        u"// Link names derived from IANA Time Zone Database, backward file.",
        links.items()
    )


def generateTzDataTestNotBackwardLinks(tzdataDir, version, ignoreBackzone, testDir):
    tzfiles = filterfalse({"backward", "backzone"}.__contains__, listIANAFiles(tzdataDir))
    (zones, links) = readIANAFiles(tzdataDir, tzfiles)

    if not ignoreBackzone:
        links = updateBackzoneLinks(tzdataDir, links)

    generateTzDataLinkTestContent(
        testDir, version,
        "timeZone_notbackward_links.js",
        u"// Link names derived from IANA Time Zone Database, excluding backward file.",
        links.items()
    )


def generateTzDataTestBackzone(tzdataDir, version, ignoreBackzone, testDir):
    backzoneFiles = {"backzone"}
    (bkfiles, tzfiles) = partition(listIANAFiles(tzdataDir), backzoneFiles.__contains__)

    # Read zone and link infos.
    (zones, links) = readIANAFiles(tzdataDir, tzfiles)
    (backzones, backlinks) = readIANAFiles(tzdataDir, bkfiles)

    if not ignoreBackzone:
        comment = u"""\
// This file was generated with historical, pre-1970 backzone information
// respected. Therefore, every zone key listed below is its own Zone, not
// a Link to a modern-day target as IANA ignoring backzones would say.

"""
    else:
        comment = u"""\
// This file was generated while ignoring historical, pre-1970 backzone
// information. Therefore, every zone key listed below is part of a Link
// whose target is the corresponding value.

"""

    generateTzDataLinkTestContent(
        testDir, version,
        "timeZone_backzone.js",
        comment + u"// Backzone zones derived from IANA Time Zone Database.",
        ((zone, zone if not ignoreBackzone else links[zone])
         for zone in backzones if zone in links)
    )


def generateTzDataTestBackzoneLinks(tzdataDir, version, ignoreBackzone, testDir):
    backzoneFiles = {"backzone"}
    (bkfiles, tzfiles) = partition(listIANAFiles(tzdataDir), backzoneFiles.__contains__)

    # Read zone and link infos.
    (zones, links) = readIANAFiles(tzdataDir, tzfiles)
    (backzones, backlinks) = readIANAFiles(tzdataDir, bkfiles)

    if not ignoreBackzone:
        comment = u"""\
// This file was generated with historical, pre-1970 backzone information
// respected. Therefore, every zone key listed below points to a target
// in the backzone file and not to its modern-day target as IANA ignoring
// backzones would say.

"""
    else:
        comment = u"""\
// This file was generated while ignoring historical, pre-1970 backzone
// information. Therefore, every zone key listed below is part of a Link
// whose target is the corresponding value ignoring any backzone entries.

"""

    generateTzDataLinkTestContent(
        testDir, version,
        "timeZone_backzone_links.js",
        comment + u"// Backzone links derived from IANA Time Zone Database.",
        ((zone, target if not ignoreBackzone else links[zone])
         for (zone, target) in backlinks.items())
    )


def generateTzDataTests(tzdataDir, version, ignoreBackzone, testDir):
    generateTzDataTestBackwardLinks(tzdataDir, version, ignoreBackzone, testDir)
    generateTzDataTestNotBackwardLinks(tzdataDir, version, ignoreBackzone, testDir)
    generateTzDataTestBackzone(tzdataDir, version, ignoreBackzone, testDir)
    generateTzDataTestBackzoneLinks(tzdataDir, version, ignoreBackzone, testDir)


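# Taken together, the four generators above emit the reftest files
# timeZone_backward_links.js, timeZone_notbackward_links.js,
# timeZone_backzone.js, and timeZone_backzone_links.js into `testDir`.

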
def updateTzdata(topsrcdir, args):
    """ Update the time zone cpp file. """

    icuDir = os.path.join(topsrcdir, "intl/icu/source")
    if not os.path.isdir(icuDir):
        raise RuntimeError("not a directory: %s" % icuDir)

    icuTzDir = os.path.join(topsrcdir, "intl/tzdata/source")
    if not os.path.isdir(icuTzDir):
        raise RuntimeError("not a directory: %s" % icuTzDir)

    dateTimeFormatTestDir = os.path.join(topsrcdir, "js/src/tests/non262/Intl/DateTimeFormat")
    if not os.path.isdir(dateTimeFormatTestDir):
        raise RuntimeError("not a directory: %s" % dateTimeFormatTestDir)

    tzDir = args.tz
    if tzDir is not None and not (os.path.isdir(tzDir) or os.path.isfile(tzDir)):
        raise RuntimeError("not a directory or file: %s" % tzDir)
    ignoreBackzone = args.ignore_backzone
    # TODO: Accept or ignore the placeholder time zone "Factory"?
    ignoreFactory = False
    out = args.out

    version = icuTzDataVersion(icuTzDir)
    url = "https://www.iana.org/time-zones/repository/releases/tzdata%s.tar.gz" % version

    print("Arguments:")
    print("\ttzdata version: %s" % version)
    print("\ttzdata URL: %s" % url)
    print("\ttzdata directory|file: %s" % tzDir)
    print("\tICU directory: %s" % icuDir)
    print("\tICU timezone directory: %s" % icuTzDir)
    print("\tIgnore backzone file: %s" % ignoreBackzone)
    print("\tOutput file: %s" % out)
    print("")

    def updateFrom(f):
        if os.path.isfile(f) and tarfile.is_tarfile(f):
            with tarfile.open(f, "r:*") as tar:
                processTimeZones(TzDataFile(tar), icuDir, icuTzDir, version,
                                 ignoreBackzone, ignoreFactory, out)
                generateTzDataTests(TzDataFile(tar), version,
                                    ignoreBackzone, dateTimeFormatTestDir)
        elif os.path.isdir(f):
            processTimeZones(TzDataDir(f), icuDir, icuTzDir, version,
                             ignoreBackzone, ignoreFactory, out)
            generateTzDataTests(TzDataDir(f), version, ignoreBackzone, dateTimeFormatTestDir)
        else:
            raise RuntimeError("unknown format")

    if tzDir is None:
        print("Downloading tzdata file...")
        with closing(urlopen(url)) as tzfile:
            fname = urlsplit(tzfile.geturl()).path.split("/")[-1]
            with tempfile.NamedTemporaryFile(suffix=fname) as tztmpfile:
                print("File stored in %s" % tztmpfile.name)
                tztmpfile.write(tzfile.read())
                tztmpfile.flush()
                updateFrom(tztmpfile.name)
    else:
        updateFrom(tzDir)


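# Typical invocations (file and directory names are hypothetical):
#
#   make_intl_data.py tzdata
#   make_intl_data.py tzdata --tz tzdata2019c.tar.gz
#   make_intl_data.py tzdata --tz /path/to/tzdata --ignore-backzone

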
def readCurrencyFile(tree):
    reCurrency = re.compile(r"^[A-Z]{3}$")
    reIntMinorUnits = re.compile(r"^\d+$")

    for country in tree.iterfind(".//CcyNtry"):
        # Skip entry if no currency information is available.
        currency = country.findtext("Ccy")
        if currency is None:
            continue
        assert reCurrency.match(currency)

        minorUnits = country.findtext("CcyMnrUnts")
        assert minorUnits is not None

        # Skip all entries without minorUnits or which use the default minorUnits.
        if reIntMinorUnits.match(minorUnits) and int(minorUnits) != 2:
            currencyName = country.findtext("CcyNm")
            countryName = country.findtext("CtryNm")
            yield (currency, int(minorUnits), currencyName, countryName)


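# For example, an ISO 4217 entry of the following shape (abridged,
# illustrative) yields ("JPY", 0, "Yen", "JAPAN"):
#
#   <CcyNtry>
#     <CtryNm>JAPAN</CtryNm>
#     <CcyNm>Yen</CcyNm>
#     <Ccy>JPY</Ccy>
#     <CcyMnrUnts>0</CcyMnrUnts>
#   </CcyNtry>

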
def writeCurrencyFile(published, currencies, out):
    with io.open(out, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        println(generatedFileWarning)
        println(u"// Version: {}".format(published))

        println(u"""
/**
 * Mapping from currency codes to the number of decimal digits used for them.
 * Default is 2 digits.
 *
 * Spec: ISO 4217 Currency and Funds Code List.
 * http://www.currency-iso.org/en/home/tables/table-a1.html
 */""")
        println(u"var currencyDigits = {")
        for (currency, entries) in groupby(sorted(currencies, key=itemgetter(0)), itemgetter(0)):
            for (_, minorUnits, currencyName, countryName) in entries:
                println(u"    // {} ({})".format(currencyName, countryName))
            println(u"    {}: {},".format(currency, minorUnits))
        println(u"};")


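# The emitted JavaScript fragment looks like this (abridged, illustrative):
#
#   var currencyDigits = {
#       // Bahraini Dinar (BAHRAIN)
#       BHD: 3,
#       ...
#   };

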
def updateCurrency(topsrcdir, args):
    """ Update the CurrencyDataGenerated.js file. """
    import xml.etree.ElementTree as ET
    from random import randint

    url = args.url
    out = args.out
    filename = args.file

    print("Arguments:")
    print("\tDownload url: %s" % url)
    print("\tLocal currency file: %s" % filename)
    print("\tOutput file: %s" % out)
    print("")

    def updateFrom(currencyFile):
        print("Processing currency code list file...")
        tree = ET.parse(currencyFile)
        published = tree.getroot().attrib["Pblshd"]
        currencies = readCurrencyFile(tree)

        print("Writing CurrencyData file...")
        writeCurrencyFile(published, currencies, out)

    if filename is not None:
        print("Always make sure you have the newest currency code list file!")
        updateFrom(filename)
    else:
        print("Downloading currency & funds code list...")
        request = UrlRequest(url)
        request.add_header(
            "User-agent", "Mozilla/5.0 (Mobile; rv:{0}.0) Gecko/{0}.0 Firefox/{0}.0".format(
                randint(1, 999)))
        with closing(urlopen(request)) as currencyFile:
            fname = urlsplit(currencyFile.geturl()).path.split("/")[-1]
            with tempfile.NamedTemporaryFile(suffix=fname) as currencyTmpFile:
                print("File stored in %s" % currencyTmpFile.name)
                currencyTmpFile.write(currencyFile.read())
                currencyTmpFile.flush()
                updateFrom(currencyTmpFile.name)


def writeUnicodeExtensionsMappings(println, mapping):
    println(u"""
template <size_t Length>
static inline bool IsUnicodeKey(mozilla::Span<const char> key,
                                const char (&str)[Length]) {
  static_assert(Length == UnicodeKeyLength + 1,
                "Unicode extension key is two characters long");
  return memcmp(key.data(), str, Length - 1) == 0;
}

template <size_t Length>
static inline bool IsUnicodeType(mozilla::Span<const char> type,
                                 const char (&str)[Length]) {
  static_assert(Length > UnicodeKeyLength + 1,
                "Unicode extension type contains more than two characters");
  return type.size() == (Length - 1) &&
         memcmp(type.data(), str, Length - 1) == 0;
}

static int32_t CompareUnicodeType(const char* a, mozilla::Span<const char> b) {
#ifdef DEBUG
  auto isNull = [](char c) {
    return c == '\\0';
  };
#endif

  MOZ_ASSERT(std::none_of(b.begin(), b.end(), isNull),
             "unexpected null-character in string");

  using UnsignedChar = unsigned char;
  for (size_t i = 0; i < b.size(); i++) {
    // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if
    // we've reached the end of |a|, the below if-statement will always be true.
    // That ensures we don't read past the end of |a|.
    if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {
      return r;
    }
  }

  // Return zero if both strings are equal or a negative number if |b| is a
  // prefix of |a|.
  return -int32_t(UnsignedChar(a[b.size()]));
}

template <size_t Length>
static inline const char* SearchReplacement(const char* (&types)[Length],
                                            const char* (&aliases)[Length],
                                            mozilla::Span<const char> type) {

  auto p = std::lower_bound(std::begin(types), std::end(types), type,
                            [](const auto& a, const auto& b) {
                              return CompareUnicodeType(a, b) < 0;
                            });
  if (p != std::end(types) && CompareUnicodeType(*p, type) == 0) {
    return aliases[std::distance(std::begin(types), p)];
  }
  return nullptr;
}

/**
 * Mapping from deprecated BCP 47 Unicode extension types to their preferred
 * values.
 *
 */
const char* js::intl::LanguageTag::replaceUnicodeExtensionType(
    mozilla::Span<const char> key, mozilla::Span<const char> type) {
#ifdef DEBUG
  static auto isAsciiLowercaseAlphanumeric = [](char c) {
    return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
  };

  static auto isAsciiLowercaseAlphanumericOrDash = [](char c) {
    return isAsciiLowercaseAlphanumeric(c) || c == '-';
  };
#endif

  MOZ_ASSERT(key.size() == UnicodeKeyLength);
  MOZ_ASSERT(std::all_of(key.begin(), key.end(),
                         isAsciiLowercaseAlphanumeric));

  MOZ_ASSERT(type.size() > UnicodeKeyLength);
  MOZ_ASSERT(std::all_of(type.begin(), type.end(),
                         isAsciiLowercaseAlphanumericOrDash));
""")

    def to_hash_key(replacements):
        return str(sorted(replacements.items()))

    def write_array(subtags, name, length):
        max_entries = (80 - len("        ")) // (length + len('"", '))

        println(u"    static const char* {}[{}] = {{".format(name, len(subtags)))

        for entries in grouper(subtags, max_entries):
            entries = (u"\"{}\"".format(tag).rjust(length + 2)
                       for tag in entries if tag is not None)
            println(u"        {},".format(u", ".join(entries)))

        println(u"    };")

    # Merge duplicate keys.
    key_aliases = {}
    for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
        hash_key = to_hash_key(replacements)
        if hash_key not in key_aliases:
            key_aliases[hash_key] = []
        else:
            key_aliases[hash_key].append(key)

    first_key = True
    for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
        hash_key = to_hash_key(replacements)
        if key in key_aliases[hash_key]:
            continue

        cond = (u"IsUnicodeKey(key, \"{}\")".format(k) for k in [key] + key_aliases[hash_key])

        if_kind = u"if" if first_key else u"else if"
        cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond)
        println(u"""
  {} ({}) {{""".format(if_kind, cond).strip("\n"))
        first_key = False

        replacements = sorted(replacements.items(), key=itemgetter(0))

        if len(replacements) > 4:
            types = [t for (t, _) in replacements]
            preferred = [r for (_, r) in replacements]
            max_len = max(len(k) for k in types + preferred)

            write_array(types, "types", max_len)
            write_array(preferred, "aliases", max_len)
            println(u"""
    return SearchReplacement(types, aliases, type);
""".strip("\n"))
        else:
            for (type, replacement) in replacements:
                println(u"""
    if (IsUnicodeType(type, "{}")) {{
      return "{}";
    }}""".format(type, replacement).strip("\n"))

        println(u"""
  }""".lstrip("\n"))

    println(u"""
  return nullptr;
}
""".strip("\n"))


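# `mapping` is expected to be a dict of per-key replacement tables, keyed by
# Unicode extension key, e.g. (hypothetical data, for illustration only):
#
#   {"ca": {"islamicc": "islamic-civil"}, "tz": {"aqams": "nzakl"}}
#
# Keys with identical replacement tables are merged into a single `if`
# condition; tables with more than four entries are emitted as sorted arrays
# searched via SearchReplacement, smaller ones as chained IsUnicodeType tests.

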
if __name__ == "__main__":
    import argparse

    # This script must reside in js/src/builtin/intl to work correctly.
    (thisDir, thisFile) = os.path.split(os.path.abspath(sys.argv[0]))
    dirPaths = os.path.normpath(thisDir).split(os.sep)
    if "/".join(dirPaths[-4:]) != "js/src/builtin/intl":
        raise RuntimeError("%s must reside in js/src/builtin/intl" % sys.argv[0])
    topsrcdir = "/".join(dirPaths[:-4])

    def EnsureHttps(v):
        if not v.startswith("https:"):
            raise argparse.ArgumentTypeError("URL protocol must be https: %s" % v)
        return v

    parser = argparse.ArgumentParser(description="Update intl data.")
    subparsers = parser.add_subparsers(help="Select update mode")

    parser_cldr_tags = subparsers.add_parser("langtags",
                                             help="Update CLDR language tags data")
    parser_cldr_tags.add_argument("--version",
                                  metavar="VERSION",
                                  required=True,
                                  help="CLDR version number")
    parser_cldr_tags.add_argument("--url",
                                  metavar="URL",
                                  default="https://unicode.org/Public/cldr/<VERSION>/core.zip",
                                  type=EnsureHttps,
                                  help="Download url for CLDR data (default: %(default)s)")
    parser_cldr_tags.add_argument("--out",
                                  default="LanguageTagGenerated.cpp",
                                  help="Output file (default: %(default)s)")
    parser_cldr_tags.add_argument("file",
                                  nargs="?",
                                  help="Local cldr-core.zip file, if omitted uses <URL>")
    parser_cldr_tags.set_defaults(func=updateCLDRLangTags)

    parser_tz = subparsers.add_parser("tzdata", help="Update tzdata")
    parser_tz.add_argument("--tz",
                           help="Local tzdata directory or file, if omitted downloads tzdata "
                                "distribution from https://www.iana.org/time-zones/")
    # ICU doesn't include the backzone file by default, but we still like to
    # use the backzone time zone names to avoid user confusion. This does lead
    # to formatting "historic" dates (pre-1970 era) with the wrong time zone,
    # but that's probably acceptable for now.
    parser_tz.add_argument("--ignore-backzone",
                           action="store_true",
                           help="Ignore tzdata's 'backzone' file. Can be enabled to generate more "
                                "accurate time zone canonicalization reflecting the actual time "
                                "zones as used by ICU.")
    parser_tz.add_argument("--out",
                           default="TimeZoneDataGenerated.h",
                           help="Output file (default: %(default)s)")
    parser_tz.set_defaults(func=partial(updateTzdata, topsrcdir))

    parser_currency = subparsers.add_parser("currency", help="Update currency digits mapping")
    parser_currency.add_argument("--url",
                                 metavar="URL",
                                 default="https://www.currency-iso.org/dam/downloads/lists/list_one.xml",
                                 type=EnsureHttps,
                                 help="Download url for the currency & funds code list (default: "
                                      "%(default)s)")
    parser_currency.add_argument("--out",
                                 default="CurrencyDataGenerated.js",
                                 help="Output file (default: %(default)s)")