Source code

Revision control

Other Tools

1
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2
* vim: set ts=8 sts=2 et sw=2 tw=80:
3
* This Source Code Form is subject to the terms of the Mozilla Public
4
* License, v. 2.0. If a copy of the MPL was not distributed with this
5
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7
#include "builtin/intl/LanguageTag.h"
8
9
#include "mozilla/Assertions.h"
10
#include "mozilla/MathAlgorithms.h"
11
#include "mozilla/Span.h"
12
#include "mozilla/TextUtils.h"
13
#include "mozilla/Variant.h"
14
15
#include <algorithm>
16
#include <iterator>
17
#include <stddef.h>
18
#include <stdint.h>
19
#include <string>
20
#include <string.h>
21
#include <type_traits>
22
#include <utility>
23
24
#include "jsapi.h"
25
#include "jsfriendapi.h"
26
27
#include "builtin/intl/CommonFunctions.h"
28
#include "ds/Sort.h"
29
#include "gc/Tracer.h"
30
#include "js/Result.h"
31
#include "js/TracingAPI.h"
32
#include "js/Utility.h"
33
#include "js/Vector.h"
34
#include "unicode/uloc.h"
35
#include "unicode/utypes.h"
36
#include "util/StringBuffer.h"
37
#include "util/Text.h"
38
#include "vm/JSContext.h"
39
#include "vm/Printer.h"
40
#include "vm/StringType.h"
41
42
namespace js {
43
namespace intl {
44
45
using namespace js::intl::LanguageTagLimits;
46
47
template <typename CharT>
48
bool IsStructurallyValidLanguageTag(mozilla::Span<const CharT> language) {
49
// Tell the analysis the |std::all_of| function can't GC.
50
JS::AutoSuppressGCAnalysis nogc;
51
52
// unicode_language_subtag = alpha{2,3} | alpha{5,8};
53
size_t length = language.size();
54
const CharT* str = language.data();
55
return ((2 <= length && length <= 3) || (5 <= length && length <= 8)) &&
56
std::all_of(str, str + length, mozilla::IsAsciiAlpha<CharT>);
57
}
58
59
template bool IsStructurallyValidLanguageTag(
60
mozilla::Span<const char> language);
61
template bool IsStructurallyValidLanguageTag(
62
mozilla::Span<const Latin1Char> language);
63
template bool IsStructurallyValidLanguageTag(
64
mozilla::Span<const char16_t> language);
65
66
template <typename CharT>
67
bool IsStructurallyValidScriptTag(mozilla::Span<const CharT> script) {
68
// Tell the analysis the |std::all_of| function can't GC.
69
JS::AutoSuppressGCAnalysis nogc;
70
71
// unicode_script_subtag = alpha{4} ;
72
size_t length = script.size();
73
const CharT* str = script.data();
74
return length == 4 &&
75
std::all_of(str, str + length, mozilla::IsAsciiAlpha<CharT>);
76
}
77
78
template bool IsStructurallyValidScriptTag(
79
mozilla::Span<const char> script);
80
template bool IsStructurallyValidScriptTag(
81
mozilla::Span<const Latin1Char> script);
82
template bool IsStructurallyValidScriptTag(
83
mozilla::Span<const char16_t> script);
84
85
template <typename CharT>
86
bool IsStructurallyValidRegionTag(mozilla::Span<const CharT> region) {
87
// Tell the analysis the |std::all_of| function can't GC.
88
JS::AutoSuppressGCAnalysis nogc;
89
90
// unicode_region_subtag = (alpha{2} | digit{3}) ;
91
size_t length = region.size();
92
const CharT* str = region.data();
93
return (length == 2 &&
94
std::all_of(str, str + length, mozilla::IsAsciiAlpha<CharT>)) ||
95
(length == 3 &&
96
std::all_of(str, str + length, mozilla::IsAsciiDigit<CharT>));
97
}
98
99
template bool IsStructurallyValidRegionTag(
100
mozilla::Span<const char> region);
101
template bool IsStructurallyValidRegionTag(
102
mozilla::Span<const Latin1Char> region);
103
template bool IsStructurallyValidRegionTag(
104
mozilla::Span<const char16_t> region);
105
106
#ifdef DEBUG
107
// Returns true iff |variant| has the shape of a unicode_variant_subtag.
bool IsStructurallyValidVariantTag(mozilla::Span<const char> variant) {
  // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ;
  const size_t count = variant.size();
  const char* const first = variant.data();

  const bool hasLongForm = count >= 5 && count <= 8;
  if (!hasLongForm) {
    // The only other accepted shape is four characters with a leading digit.
    if (count != 4 || !mozilla::IsAsciiDigit(first[0])) {
      return false;
    }
  }

  return std::all_of(first, first + count,
                     mozilla::IsAsciiAlphanumeric<char>);
}
115
116
bool IsStructurallyValidUnicodeExtensionTag(
117
mozilla::Span<const char> extension) {
118
return LanguageTagParser::canParseUnicodeExtension(extension);
119
}
120
121
static bool IsStructurallyValidExtensionTag(
122
mozilla::Span<const char> extension) {
123
// other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
124
// NB: Allow any extension, including Unicode and Transform here, because
125
// this function is only used for an assertion.
126
127
size_t length = extension.size();
128
const char* str = extension.data();
129
const char* const end = extension.data() + length;
130
if (length <= 2) {
131
return false;
132
}
133
if (!mozilla::IsAsciiAlphanumeric(str[0]) || str[0] == 'x' || str[0] == 'X') {
134
return false;
135
}
136
str++;
137
if (*str++ != '-') {
138
return false;
139
}
140
while (true) {
141
const char* sep =
142
reinterpret_cast<const char*>(memchr(str, '-', end - str));
143
size_t len = (sep ? sep : end) - str;
144
if (len < 2 || len > 8 ||
145
!std::all_of(str, str + len, mozilla::IsAsciiAlphanumeric<char>)) {
146
return false;
147
}
148
if (!sep) {
149
return true;
150
}
151
str = sep + 1;
152
}
153
}
154
155
// Returns true iff |privateUse| is 'x' or 'X' followed by one or more
// "-"-separated alphanumeric subtags, each one to eight characters long.
bool IsStructurallyValidPrivateUseTag(mozilla::Span<const char> privateUse) {
  // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;

  const size_t total = privateUse.size();
  if (total <= 2) {
    return false;
  }

  const char* cursor = privateUse.data();
  const char* const limit = cursor + total;

  // Leading private-use singleton.
  const char singleton = *cursor++;
  if (singleton != 'x' && singleton != 'X') {
    return false;
  }

  // Separator between the singleton and the first subtag.
  if (*cursor++ != '-') {
    return false;
  }

  // Validate each remaining subtag in turn.
  for (;;) {
    const char* const dash =
        static_cast<const char*>(memchr(cursor, '-', limit - cursor));
    const char* const subtagEnd = dash ? dash : limit;
    const size_t subtagLength = subtagEnd - cursor;

    if (subtagLength == 0 || subtagLength > 8) {
      return false;
    }
    if (!std::all_of(cursor, subtagEnd, mozilla::IsAsciiAlphanumeric<char>)) {
      return false;
    }
    if (!dash) {
      return true;
    }
    cursor = dash + 1;
  }
}
185
#endif
186
187
ptrdiff_t LanguageTag::unicodeExtensionIndex() const {
188
// The extension subtags aren't necessarily sorted, so we can't use binary
189
// search here.
190
auto p = std::find_if(
191
extensions().begin(), extensions().end(),
192
[](const auto& ext) { return ext[0] == 'u' || ext[0] == 'U'; });
193
if (p != extensions().end()) {
194
return std::distance(extensions().begin(), p);
195
}
196
return -1;
197
}
198
199
const char* LanguageTag::unicodeExtension() const {
200
ptrdiff_t index = unicodeExtensionIndex();
201
if (index >= 0) {
202
return extensions()[index].get();
203
}
204
return nullptr;
205
}
206
207
// Installs |extension| as the Unicode extension subtag: an existing Unicode
// extension is overwritten in place, otherwise the new one is appended.
// Returns false if appending fails.
bool LanguageTag::setUnicodeExtension(UniqueChars extension) {
  MOZ_ASSERT(IsStructurallyValidUnicodeExtensionTag(
      mozilla::MakeStringSpan(extension.get())));

  const ptrdiff_t position = unicodeExtensionIndex();
  if (position < 0) {
    // No Unicode extension present yet.
    return extensions_.append(std::move(extension));
  }

  extensions_[position] = std::move(extension);
  return true;
}
219
220
void LanguageTag::clearUnicodeExtension() {
221
ptrdiff_t index = unicodeExtensionIndex();
222
if (index >= 0) {
223
extensions_.erase(extensions_.begin() + index);
224
}
225
}
226
227
template <size_t InitialCapacity>
228
static bool SortAlphabetically(JSContext* cx,
229
Vector<UniqueChars, InitialCapacity>& subtags) {
230
size_t length = subtags.length();
231
232
// Zero or one element lists are already sorted.
233
if (length < 2) {
234
return true;
235
}
236
237
// Handle two element lists inline.
238
if (length == 2) {
239
if (strcmp(subtags[0].get(), subtags[1].get()) > 0) {
240
subtags[0].swap(subtags[1]);
241
}
242
return true;
243
}
244
245
Vector<char*, 8> scratch(cx);
246
if (!scratch.resizeUninitialized(length * 2)) {
247
return false;
248
}
249
for (size_t i = 0; i < length; i++) {
250
scratch[i] = subtags[i].release();
251
}
252
253
MOZ_ALWAYS_TRUE(
254
MergeSort(scratch.begin(), length, scratch.begin() + length,
255
[](const char* a, const char* b, bool* lessOrEqualp) {
256
*lessOrEqualp = strcmp(a, b) <= 0;
257
return true;
258
}));
259
260
for (size_t i = 0; i < length; i++) {
261
subtags[i] = UniqueChars(scratch[i]);
262
}
263
return true;
264
}
265
266
// Canonicalizes the language, script, region, and variant subtags: adjusts
// their case, sorts the variants, rejects duplicate variants (reporting
// JSMSG_DUPLICATE_VARIANT_SUBTAG), and replaces deprecated language and region
// subtags. Returns false on failure.
bool LanguageTag::canonicalizeBaseName(JSContext* cx) {
  // Per UTS 35, 3.3.1, the very first step is to canonicalize the syntax by
  // normalizing the case and ordering all subtags. The canonical syntax form
  // itself is specified in UTS 35, 3.2.1.

  // Language: lower case. "JA" -> "ja"
  language_.toLowerCase();
  MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span()));

  // Script: title case. "hans" -> "Hans"
  script_.toTitleCase();
  MOZ_ASSERT(script().missing() ||
             IsStructurallyValidScriptTag(script().span()));

  // Region: upper case. "bu" -> "BU"
  region_.toUpperCase();
  MOZ_ASSERT(region().missing() ||
             IsStructurallyValidRegionTag(region().span()));

  // Variants: lower case.
  for (UniqueChars& variant : variants_) {
    char* chars = variant.get();
    size_t charsLength = strlen(chars);
    AsciiToLowerCase(chars, charsLength, chars);

    MOZ_ASSERT(IsStructurallyValidVariantTag({chars, charsLength}));
  }

  // Extensions and privateuse subtags are case normalized in the
  // |canonicalizeExtensions| method.

  // The second step in UTS 35, 3.2.1, is to order all subtags.

  if (variants_.length() > 1) {
    // 1. Any variants are in alphabetical order.
    if (!SortAlphabetically(cx, variants_)) {
      return false;
    }

    // After sorting, equal variants are adjacent. Reject the Locale identifier
    // if a duplicate variant was found, e.g. "en-variant-Variant".
    auto sameVariant = [](const auto& left, const auto& right) {
      return strcmp(left.get(), right.get()) == 0;
    };
    const UniqueChars* duplicate = std::adjacent_find(
        variants().begin(), variants().end(), sameVariant);
    if (duplicate != variants().end()) {
      JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                                JSMSG_DUPLICATE_VARIANT_SUBTAG,
                                duplicate->get());
      return false;
    }
  }

  // 2. Any extensions are in alphabetical order by their singleton.
  //    - A subsequent call to canonicalizeExtensions() will perform this.

  // The next two steps in 3.3.1 replace deprecated language and region
  // subtags with their preferred mappings.

  if (!updateGrandfatheredMappings(cx)) {
    return false;
  }

  // Replace deprecated language subtags with their preferred values. The
  // complex mappings are only consulted when no simple mapping applied.
  const bool languageMapped = languageMapping(language_);
  if (!languageMapped && complexLanguageMapping(language_)) {
    performComplexLanguageMappings();
  }

  // No script replacements are currently present.

  // Replace deprecated region subtags with their preferred values.
  if (region().present() && !regionMapping(region_) &&
      complexRegionMapping(region_)) {
    performComplexRegionMappings();
  }

  // No variant subtag replacements are currently present.
  // No extension replacements are currently present.
  // Private use sequences are left as is.

  // The two final steps in 3.3.1, handling irregular grandfathered and
  // private-use only language tags, don't apply, because these two forms
  // can't occur in Unicode BCP 47 locale identifiers.

  return true;
}
354
355
bool LanguageTag::canonicalizeExtensions(
356
JSContext* cx, UnicodeExtensionCanonicalForm canonicalForm) {
357
// The canonical case for all extension subtags is lowercase.
358
for (UniqueChars& extension : extensions_) {
359
char* extensionChars = extension.get();
360
size_t extensionLength = strlen(extensionChars);
361
AsciiToLowerCase(extensionChars, extensionLength, extensionChars);
362
363
MOZ_ASSERT(
364
IsStructurallyValidExtensionTag({extensionChars, extensionLength}));
365
}
366
367
// Any extensions are in alphabetical order by their singleton.
368
// "u-ca-chinese-t-zh-latn" -> "t-zh-latn-u-ca-chinese"
369
if (!SortAlphabetically(cx, extensions_)) {
370
return false;
371
}
372
373
for (UniqueChars& extension : extensions_) {
374
if (extension[0] == 'u') {
375
if (!canonicalizeUnicodeExtension(cx, extension, canonicalForm)) {
376
return false;
377
}
378
} else if (extension[0] == 't') {
379
if (!canonicalizeTransformExtension(cx, extension)) {
380
return false;
381
}
382
}
383
}
384
385
// The canonical case for privateuse subtags is lowercase.
386
if (char* privateuse = privateuse_.get()) {
387
size_t privateuseLength = strlen(privateuse);
388
AsciiToLowerCase(privateuse, privateuseLength, privateuse);
389
390
MOZ_ASSERT(
391
IsStructurallyValidPrivateUseTag({privateuse, privateuseLength}));
392
}
393
return true;
394
}
395
396
/**
397
* CanonicalizeUnicodeExtension( attributes, keywords )
398
*
399
* Canonical syntax per
401
*
402
* - All attributes and keywords are in lowercase.
403
* - Note: The parser already converted keywords to lowercase.
404
* - All attributes are sorted in alphabetical order.
405
* - All keywords are sorted by alphabetical order of their keys.
406
* - Any type value "true" is removed.
407
*
408
* Canonical form:
409
* - All keys and types use the canonical form (from the name attribute;
410
* see Section 3.6.4 U Extension Data Files).
411
*/
412
bool LanguageTag::canonicalizeUnicodeExtension(
413
JSContext* cx, JS::UniqueChars& unicodeExtension,
414
UnicodeExtensionCanonicalForm canonicalForm) {
415
const char* const extension = unicodeExtension.get();
416
MOZ_ASSERT(extension[0] == 'u');
417
MOZ_ASSERT(extension[1] == '-');
418
MOZ_ASSERT(
419
IsStructurallyValidExtensionTag(mozilla::MakeStringSpan(extension)));
420
421
size_t length = strlen(extension);
422
423
LanguageTagParser::AttributesVector attributes(cx);
424
LanguageTagParser::KeywordsVector keywords(cx);
425
426
using Attribute = LanguageTagParser::AttributesVector::ElementType;
427
using Keyword = LanguageTagParser::KeywordsVector::ElementType;
428
429
bool ok;
430
JS_TRY_VAR_OR_RETURN_FALSE(
431
cx, ok,
432
LanguageTagParser::parseUnicodeExtension(
433
cx, mozilla::MakeSpan(extension, length), attributes, keywords));
434
MOZ_ASSERT(ok, "unexpected invalid Unicode extension subtag");
435
436
auto attributesLessOrEqual = [extension](const Attribute& a,
437
const Attribute& b) {
438
const char* astr = a.begin(extension);
439
const char* bstr = b.begin(extension);
440
size_t alen = a.length();
441
size_t blen = b.length();
442
443
if (int r =
444
std::char_traits<char>::compare(astr, bstr, std::min(alen, blen))) {
445
return r < 0;
446
}
447
return alen <= blen;
448
};
449
450
// All attributes are sorted in alphabetical order.
451
size_t attributesLength = attributes.length();
452
if (attributesLength > 1) {
453
if (!attributes.growByUninitialized(attributesLength)) {
454
return false;
455
}
456
457
MOZ_ALWAYS_TRUE(
458
MergeSort(attributes.begin(), attributesLength,
459
attributes.begin() + attributesLength,
460
[&](const auto& a, const auto& b, bool* lessOrEqualp) {
461
*lessOrEqualp = attributesLessOrEqual(a, b);
462
return true;
463
}));
464
465
attributes.shrinkBy(attributesLength);
466
}
467
468
auto keywordsLessOrEqual = [extension](const Keyword& a, const Keyword& b) {
469
const char* astr = a.begin(extension);
470
const char* bstr = b.begin(extension);
471
MOZ_ASSERT(a.length() >= UnicodeKeyLength);
472
MOZ_ASSERT(b.length() >= UnicodeKeyLength);
473
474
return std::char_traits<char>::compare(astr, bstr, UnicodeKeyLength) <= 0;
475
};
476
477
// All keywords are sorted by alphabetical order of keys.
478
size_t keywordsLength = keywords.length();
479
if (keywordsLength > 1) {
480
if (!keywords.growByUninitialized(keywordsLength)) {
481
return false;
482
}
483
484
// Using merge sort, being a stable sort algorithm, guarantees that two
485
// keywords using the same key are never reordered. That means for example
486
// when we have the input "u-nu-thai-kf-false-nu-latn", we are guaranteed to
487
// get the result "u-kf-false-nu-thai-nu-latn", i.e. "nu-thai" still occurs
488
// before "nu-latn".
489
// This is required so that deduplication below preserves the first keyword
490
// for a given key and discards the rest.
491
MOZ_ALWAYS_TRUE(MergeSort(
492
keywords.begin(), keywordsLength, keywords.begin() + keywordsLength,
493
[&](const auto& a, const auto& b, bool* lessOrEqualp) {
494
*lessOrEqualp = keywordsLessOrEqual(a, b);
495
return true;
496
}));
497
498
keywords.shrinkBy(keywordsLength);
499
}
500
501
Vector<char, 32> sb(cx);
502
if (!sb.append('u')) {
503
return false;
504
}
505
506
// Append all Unicode extension attributes.
507
for (size_t i = 0; i < attributes.length(); i++) {
508
const auto& attribute = attributes[i];
509
510
// Skip duplicate attributes.
511
if (canonicalForm == UnicodeExtensionCanonicalForm::Yes && i > 0) {
512
const auto& lastAttribute = attributes[i - 1];
513
if (attribute.length() == lastAttribute.length() &&
514
std::char_traits<char>::compare(attribute.begin(extension),
515
lastAttribute.begin(extension),
516
attribute.length()) == 0) {
517
continue;
518
}
519
MOZ_ASSERT(!attributesLessOrEqual(attribute, lastAttribute));
520
}
521
522
if (!sb.append('-')) {
523
return false;
524
}
525
if (!sb.append(attribute.begin(extension), attribute.length())) {
526
return false;
527
}
528
}
529
530
static constexpr size_t UnicodeKeyWithSepLength = UnicodeKeyLength + 1;
531
532
using StringSpan = mozilla::Span<const char>;
533
534
static auto isTrue = [](StringSpan type) {
535
constexpr char True[] = "true";
536
const size_t TrueLength = strlen(True);
537
return type.size() == TrueLength &&
538
std::char_traits<char>::compare(type.data(), True, TrueLength) == 0;
539
};
540
541
auto appendKey = [&sb, extension](const Keyword& keyword) {
542
MOZ_ASSERT(keyword.length() == UnicodeKeyLength);
543
return sb.append(keyword.begin(extension), UnicodeKeyLength);
544
};
545
546
auto appendKeyword = [&sb, extension](const Keyword& keyword,
547
StringSpan type) {
548
MOZ_ASSERT(keyword.length() > UnicodeKeyLength);
549
550
// Elide the Unicode extension type "true".
551
if (isTrue(type)) {
552
return sb.append(keyword.begin(extension), UnicodeKeyLength);
553
}
554
// Otherwise append the complete Unicode extension keyword.
555
return sb.append(keyword.begin(extension), keyword.length());
556
};
557
558
auto appendReplacement = [&sb, extension](const Keyword& keyword,
559
StringSpan replacement) {
560
MOZ_ASSERT(keyword.length() > UnicodeKeyLength);
561
562
// Elide the type "true" if present in the replacement.
563
if (isTrue(replacement)) {
564
return sb.append(keyword.begin(extension), UnicodeKeyLength);
565
}
566
// Otherwise append the Unicode key (including the separator) and the
567
// replaced type.
568
return sb.append(keyword.begin(extension), UnicodeKeyWithSepLength) &&
569
sb.append(replacement.data(), replacement.size());
570
};
571
572
// Append all Unicode extension keywords.
573
for (size_t i = 0; i < keywords.length(); i++) {
574
const auto& keyword = keywords[i];
575
576
// Skip duplicate keywords.
577
if (canonicalForm == UnicodeExtensionCanonicalForm::Yes && i > 0) {
578
const auto& lastKeyword = keywords[i - 1];
579
if (std::char_traits<char>::compare(keyword.begin(extension),
580
lastKeyword.begin(extension),
581
UnicodeKeyLength) == 0) {
582
continue;
583
}
584
MOZ_ASSERT(!keywordsLessOrEqual(keyword, lastKeyword));
585
}
586
587
if (!sb.append('-')) {
588
return false;
589
}
590
591
if (keyword.length() == UnicodeKeyLength) {
592
// Keyword without type value.
593
if (!appendKey(keyword)) {
594
return false;
595
}
596
} else {
597
StringSpan key(keyword.begin(extension), UnicodeKeyLength);
598
StringSpan type(keyword.begin(extension) + UnicodeKeyWithSepLength,
599
keyword.length() - UnicodeKeyWithSepLength);
600
601
if (canonicalForm == UnicodeExtensionCanonicalForm::Yes) {
602
// Search if there's a replacement for the current Unicode keyword.
603
if (const char* replacement = replaceUnicodeExtensionType(key, type)) {
604
if (!appendReplacement(keyword,
605
mozilla::MakeStringSpan(replacement))) {
606
return false;
607
}
608
} else {
609
if (!appendKeyword(keyword, type)) {
610
return false;
611
}
612
}
613
} else {
614
if (!appendKeyword(keyword, type)) {
615
return false;
616
}
617
}
618
}
619
}
620
621
// We can keep the previous extension when canonicalization didn't modify it.
622
if (sb.length() != length ||
623
std::char_traits<char>::compare(sb.begin(), extension, length) != 0) {
624
// Null-terminate the new string and replace the previous extension.
625
if (!sb.append('\0')) {
626
return false;
627
}
628
UniqueChars canonical(sb.extractOrCopyRawBuffer());
629
if (!canonical) {
630
return false;
631
}
632
unicodeExtension = std::move(canonical);
633
}
634
635
return true;
636
}
637
638
template <class Buffer>
639
static bool LanguageTagToString(JSContext* cx, const LanguageTag& tag,
640
Buffer& sb) {
641
auto appendSubtag = [&sb](const auto& subtag) {
642
auto span = subtag.span();
643
MOZ_ASSERT(span.size() > 0);
644
return sb.append(span.data(), span.size());
645
};
646
647
auto appendSubtagZ = [&sb](const char* subtag) {
648
MOZ_ASSERT(strlen(subtag) > 0);
649
return sb.append(subtag, strlen(subtag));
650
};
651
652
auto appendSubtagsZ = [&sb, &appendSubtagZ](const auto& subtags) {
653
for (const auto& subtag : subtags) {
654
if (!sb.append('-') || !appendSubtagZ(subtag.get())) {
655
return false;
656
}
657
}
658
return true;
659
};
660
661
// Append the language subtag.
662
if (!appendSubtag(tag.language())) {
663
return false;
664
}
665
666
// Append the script subtag if present.
667
if (tag.script().present()) {
668
if (!sb.append('-') || !appendSubtag(tag.script())) {
669
return false;
670
}
671
}
672
673
// Append the region subtag if present.
674
if (tag.region().present()) {
675
if (!sb.append('-') || !appendSubtag(tag.region())) {
676
return false;
677
}
678
}
679
680
// Append the variant subtags if present.
681
if (!appendSubtagsZ(tag.variants())) {
682
return false;
683
}
684
685
// Append the extensions subtags if present.
686
if (!appendSubtagsZ(tag.extensions())) {
687
return false;
688
}
689
690
// Append the private-use subtag if present.
691
if (tag.privateuse()) {
692
if (!sb.append('-') || !appendSubtagZ(tag.privateuse())) {
693
return false;
694
}
695
}
696
697
return true;
698
}
699
700
/**
701
* CanonicalizeTransformExtension
702
*
704
*
705
* - These subtags are all in lowercase (that is the canonical casing for these
706
* subtags), [...].
707
*
708
* And per
710
*
711
* - All keywords and tfields are sorted by alphabetical order of their keys,
712
* within their respective extensions.
713
*/
714
bool LanguageTag::canonicalizeTransformExtension(
715
JSContext* cx, JS::UniqueChars& transformExtension) {
716
const char* const extension = transformExtension.get();
717
MOZ_ASSERT(extension[0] == 't');
718
MOZ_ASSERT(extension[1] == '-');
719
MOZ_ASSERT(
720
IsStructurallyValidExtensionTag(mozilla::MakeStringSpan(extension)));
721
722
size_t length = strlen(extension);
723
724
LanguageTag tag(cx);
725
LanguageTagParser::TFieldVector fields(cx);
726
727
using TField = LanguageTagParser::TFieldVector::ElementType;
728
729
bool ok;
730
JS_TRY_VAR_OR_RETURN_FALSE(
731
cx, ok,
732
LanguageTagParser::parseTransformExtension(
733
cx, mozilla::MakeSpan(extension, length), tag, fields));
734
MOZ_ASSERT(ok, "unexpected invalid transform extension subtag");
735
736
auto tfieldLessOrEqual = [extension](const TField& a, const TField& b) {
737
MOZ_ASSERT(a.length() > TransformKeyLength);
738
MOZ_ASSERT(b.length() > TransformKeyLength);
739
const char* astr = a.begin(extension);
740
const char* bstr = b.begin(extension);
741
return std::char_traits<char>::compare(astr, bstr, TransformKeyLength) <= 0;
742
};
743
744
// All tfields are sorted by alphabetical order of their keys.
745
size_t fieldsLength = fields.length();
746
if (fieldsLength > 1) {
747
if (!fields.growByUninitialized(fieldsLength)) {
748
return false;
749
}
750
751
MOZ_ALWAYS_TRUE(
752
MergeSort(fields.begin(), fieldsLength, fields.begin() + fieldsLength,
753
[&](const auto& a, const auto& b, bool* lessOrEqualp) {
754
*lessOrEqualp = tfieldLessOrEqual(a, b);
755
return true;
756
}));
757
758
fields.shrinkBy(fieldsLength);
759
}
760
761
Vector<char, 32> sb(cx);
762
if (!sb.append('t')) {
763
return false;
764
}
765
766
// Append the language subtag if present.
767
//
768
// [1] is a bit unclear whether or not the `tlang` subtag also needs to be
769
// canonicalized (and case-adjusted). For now simply append it as is.
770
// (|parseTransformExtension| doesn't alter case from the lowercased form we
771
// have previously taken pains to ensure is present in the extension, so no
772
// special effort is required to ensure lowercasing.) If we switch to [2], the
773
// `tlang` subtag also needs to be canonicalized according to the same rules
774
// as `unicode_language_id` subtags are canonicalized. Also see [3].
775
//
779
if (tag.language().present()) {
780
if (!sb.append('-')) {
781
return false;
782
}
783
if (!LanguageTagToString(cx, tag, sb)) {
784
return false;
785
}
786
}
787
788
// Append all fields.
789
//
790
// UTS 35, 3.2.1 specifies:
791
// - Any type or tfield value "true" is removed.
792
//
793
// But the `tvalue` subtag is mandatory in `tfield: tkey tvalue`, so ignore
794
// this apparently invalid part of the UTS 35 specification and simply
795
// append all `tfield` subtags.
796
for (const auto& field : fields) {
797
if (!sb.append('-')) {
798
return false;
799
}
800
if (!sb.append(field.begin(extension), field.length())) {
801
return false;
802
}
803
}
804
805
// We can keep the previous extension when canonicalization didn't modify it.
806
if (sb.length() != length ||
807
std::char_traits<char>::compare(sb.begin(), extension, length) != 0) {
808
// Null-terminate the new string and replace the previous extension.
809
if (!sb.append('\0')) {
810
return false;
811
}
812
UniqueChars canonical(sb.extractOrCopyRawBuffer());
813
if (!canonical) {
814
return false;
815
}
816
transformExtension = std::move(canonical);
817
}
818
819
return true;
820
}
821
822
// Serializes this language tag into a new JSString. Returns nullptr when the
// tag can't be written to the string builder.
JSString* LanguageTag::toString(JSContext* cx) const {
  JSStringBuilder builder(cx);
  const bool serialized = LanguageTagToString(cx, *this, builder);
  return serialized ? builder.finishString() : nullptr;
}
830
831
// Serializes this language tag into a zero-terminated character buffer.
// Returns nullptr on failure.
UniqueChars LanguageTag::toStringZ(JSContext* cx) const {
  Vector<char, 16> chars(cx);

  // Write the tag first, then the terminating null character.
  const bool serialized =
      LanguageTagToString(cx, *this, chars) && chars.append('\0');
  if (!serialized) {
    return nullptr;
  }

  return UniqueChars(chars.extractOrCopyRawBuffer());
}
842
843
// Zero-terminated ICU Locale ID.
844
using LocaleId =
845
js::Vector<char, LanguageLength + 1 + ScriptLength + 1 + RegionLength + 1>;
846
847
enum class LikelySubtags : bool { Add, Remove };
848
849
// Return true iff the language tag is already maximized resp. minimized.
850
static bool HasLikelySubtags(LikelySubtags likelySubtags,
851
const LanguageTag& tag) {
852
// The language tag is already maximized if the language, script, and region
853
// subtags are present and no placeholder subtags ("und", "Zzzz", "ZZ") are
854
// used.
855
if (likelySubtags == LikelySubtags::Add) {
856
return !tag.language().equalTo("und") &&
857
(tag.script().present() && !tag.script().equalTo("Zzzz")) &&
858
(tag.region().present() && !tag.region().equalTo("ZZ"));
859
}
860
861
// The language tag is already minimized if it only contains a language
862
// subtag whose value is not the placeholder value "und".
863
return !tag.language().equalTo("und") && tag.script().missing() &&
864
tag.region().missing();
865
}
866
867
// Create an ICU locale ID from the given language tag.
868
static bool CreateLocaleForLikelySubtags(const LanguageTag& tag,
869
LocaleId& locale) {
870
MOZ_ASSERT(locale.length() == 0);
871
872
auto appendSubtag = [&locale](const auto& subtag) {
873
auto span = subtag.span();
874
MOZ_ASSERT(span.size() > 0);
875
return locale.append(span.data(), span.size());
876
};
877
878
// Append the language subtag.
879
if (!appendSubtag(tag.language())) {
880
return false;
881
}
882
883
// Append the script subtag if present.
884
if (tag.script().present()) {
885
if (!locale.append('_') || !appendSubtag(tag.script())) {
886
return false;
887
}
888
}
889
890
// Append the region subtag if present.
891
if (tag.region().present()) {
892
if (!locale.append('_') || !appendSubtag(tag.region())) {
893
return false;
894
}
895
}
896
897
// Zero-terminated for use with ICU.
898
return locale.append('\0');
899
}
900
901
// Assign the language, script, and region subtags from an ICU locale ID.
902
//
903
// ICU provides |uloc_getLanguage|, |uloc_getScript|, and |uloc_getCountry| to
904
// retrieve these subtags, but unfortunately these functions are rather slow, so
905
// we use our own implementation.
906
static bool AssignFromLocaleId(JSContext* cx, LocaleId& localeId,
907
LanguageTag& tag) {
908
MOZ_ASSERT(localeId.back() == '\0',
909
"Locale ID should be zero-terminated for ICU");
910
911
// Replace the ICU locale ID separator.
912
std::replace(localeId.begin(), localeId.end(), '_', '-');
913
914
// ICU replaces "und" with the empty string, which means "und" becomes "" and
915
// "und-Latn" becomes "-Latn". Handle this case separately.
916
if (localeId[0] == '\0' || localeId[0] == '-') {
917
static constexpr char und[] = "und";
918
size_t length = strlen(und);
919
920
// Insert "und" in front of the locale ID.
921
if (!localeId.growBy(length)) {
922
return false;
923
}
924
memmove(localeId.begin() + length, localeId.begin(), localeId.length());
925
memmove(localeId.begin(), und, length);
926
}
927
928
mozilla::Span<const char> localeSpan(localeId.begin(), localeId.length() - 1);
929
930
// Retrieve the language, script, and region subtags from the locale ID, but
931
// ignore any other subtags.
932
LanguageTag localeTag(cx);
933
if (!LanguageTagParser::parseBaseName(cx, localeSpan, localeTag)) {
934
return false;
935
}
936
937
tag.setLanguage(localeTag.language());
938
tag.setScript(localeTag.script());
939
tag.setRegion(localeTag.region());
940
941
return true;
942
}
943
944
template <decltype(uloc_addLikelySubtags) likelySubtagsFn>
945
static bool CallLikelySubtags(JSContext* cx, const LocaleId& localeId,
946
LocaleId& result) {
947
// Locale ID must be zero-terminated before passing it to ICU.
948
MOZ_ASSERT(localeId.back() == '\0');
949
MOZ_ASSERT(result.length() == 0);
950
951
// Ensure there's enough room for the result.
952
MOZ_ALWAYS_TRUE(result.resize(LocaleId::InlineLength));
953
954
int32_t length = intl::CallICU(
955
cx,
956
[&localeId](char* chars, int32_t size, UErrorCode* status) {
957
return likelySubtagsFn(localeId.begin(), chars, size, status);
958
},
959
result);
960
if (length < 0) {
961
return false;
962
}
963
964
MOZ_ASSERT(
965
size_t(length) <= LocaleId::InlineLength,
966
"Unexpected extra subtags were added by ICU. If this assertion ever "
967
"fails, simply remove it and move on like nothing ever happended.");
968
969
// Resize the vector to the actual string length.
970
result.shrinkTo(length);
971
972
// Zero-terminated for use with ICU.
973
return result.append('\0');
974
}
975
976
// The canonical way to compute the Unicode BCP 47 locale identifier with likely
977
// subtags is as follows:
978
//
979
// 1. Call uloc_forLanguageTag() to transform the locale identifier into an ICU
980
// locale ID.
981
// 2. Call uloc_addLikelySubtags() to add the likely subtags to the locale ID.
982
// 3. Call uloc_toLanguageTag() to transform the resulting locale ID back into
983
// a Unicode BCP 47 locale identifier.
984
//
985
// Since uloc_forLanguageTag() and uloc_toLanguageTag() are both kind of slow
986
// and we know, by construction, that the input Unicode BCP 47 locale identifier
987
// only contains valid language, script, and region subtags, we can avoid both
988
// calls if we implement them ourselves, see CreateLocaleForLikelySubtags() and
989
// AssignFromLocaleId(). (Where "slow" means about 50% of the execution time of
990
// |Intl.Locale.prototype.maximize|.)
991
static bool LikelySubtags(JSContext* cx, LikelySubtags likelySubtags,
992
LanguageTag& tag) {
993
// Return early if the input is already maximized/minimized.
994
if (HasLikelySubtags(likelySubtags, tag)) {
995
return true;
996
}
997
998
// Create the locale ID for the input argument.
999
LocaleId locale(cx);
1000
if (!CreateLocaleForLikelySubtags(tag, locale)) {
1001
return false;
1002
}
1003
1004
// UTS #35 requires that locale ID is maximized before its likely subtags are
1005
// removed, so we need to call uloc_addLikelySubtags() for both cases.
1008
1009
LocaleId localeLikelySubtags(cx);
1010
1011
// Add likely subtags to the locale ID. When minimizing we can skip adding the
1012
// likely subtags for already maximized tags. (When maximizing we've already
1013
// verified above that the tag is missing likely subtags.)
1014
bool addLikelySubtags = likelySubtags == LikelySubtags::Add ||
1015
!HasLikelySubtags(LikelySubtags::Add, tag);
1016
1017
if (addLikelySubtags) {
1018
if (!CallLikelySubtags<uloc_addLikelySubtags>(cx, locale,
1019
localeLikelySubtags)) {
1020
return false;
1021
}
1022
}
1023
1024
// Now that we've succesfully maximized the locale, we can minimize it.
1025
if (likelySubtags == LikelySubtags::Remove) {
1026
if (addLikelySubtags) {
1027
// Copy the maximized subtags back into |locale|.
1028
locale = std::move(localeLikelySubtags);
1029
localeLikelySubtags = LocaleId(cx);
1030
}
1031
1032
// Remove likely subtags from the locale ID.
1033
if (!CallLikelySubtags<uloc_minimizeSubtags>(cx, locale,
1034
localeLikelySubtags)) {
1035
return false;
1036
}
1037
}
1038
1039
// Assign the language, script, and region subtags from the locale ID.
1040
if (!AssignFromLocaleId(cx, localeLikelySubtags, tag)) {
1041
return false;
1042
}
1043
1044
// Update mappings in case ICU returned a non-canonical locale.
1045
return tag.canonicalizeBaseName(cx);
1046
}
1047
1048
// Add likely subtags to this language tag through the shared LikelySubtags()
// helper. Returns false on failure.
bool LanguageTag::addLikelySubtags(JSContext* cx) {
  const auto mode = LikelySubtags::Add;
  return LikelySubtags(cx, mode, *this);
}
1051
1052
// Remove likely subtags from this language tag through the shared
// LikelySubtags() helper. Returns false on failure.
bool LanguageTag::removeLikelySubtags(JSContext* cx) {
  const auto mode = LikelySubtags::Remove;
  return LikelySubtags(cx, mode, *this);
}
1055
1056
// Scan the subtag starting at |index_| and return it as a token. On success
// |index_| is advanced by the token length plus one, skipping the separator.
// An error token is returned, without advancing |index_|, when an invalid
// character is found.
LanguageTagParser::Token LanguageTagParser::nextToken() {
  MOZ_ASSERT(index_ <= length_ + 1, "called after 'None' token was read");

  // UTS 35, section 3.1.
  // alpha = [A-Z a-z] ;
  // digit = [0-9] ;
  TokenKind kind = TokenKind::None;
  size_t end = index_;
  while (end < length_) {
    char16_t c = charAtUnchecked(end);
    if (mozilla::IsAsciiAlpha(c)) {
      kind |= TokenKind::Alpha;
    } else if (mozilla::IsAsciiDigit(c)) {
      kind |= TokenKind::Digit;
    } else {
      // A "-" terminates the token, but only if it is neither the first
      // character of the token nor the last character of the input. Anything
      // else is invalid.
      bool isSeparator = c == '-' && end > index_ && end + 1 < length_;
      if (!isSeparator) {
        return {TokenKind::Error, 0, 0};
      }
      break;
    }
    end++;
  }

  size_t tokenLength = end - index_;
  Token token{kind, index_, tokenLength};
  index_ += tokenLength + 1;
  return token;
}
1082
1083
// Copy |length| characters of the input, starting at |index|, into a newly
// allocated, null-terminated char array. Returns null if the allocation fails.
UniqueChars LanguageTagParser::chars(JSContext* cx, size_t index,
                                     size_t length) const {
  // Reserve one extra slot for the terminating null character.
  auto buffer = cx->make_pod_array<char>(length + 1);
  if (!buffer) {
    return buffer;
  }

  char* out = buffer.get();
  if (locale_.is<const JS::Latin1Char*>()) {
    std::copy_n(locale_.as<const JS::Latin1Char*>() + index, length, out);
  } else {
    std::copy_n(locale_.as<const char16_t*>() + index, length, out);
  }
  out[length] = '\0';
  return buffer;
}
1098
1099
// Parse the `unicode_language_id` production.
1100
//
1101
// unicode_language_id = unicode_language_subtag
1102
// (sep unicode_script_subtag)?
1103
// (sep unicode_region_subtag)?
1104
// (sep unicode_variant_subtag)* ;
1105
//
1106
// sep = "-"
1107
//
1108
// Note: Unicode CLDR locale identifier backward compatibility extensions
1109
// removed from `unicode_language_id`.
1110
//
1111
// |tok| is the current token from |ts|.
1112
//
1113
// All subtags will be added unaltered to |tag|, without canonicalizing their
1114
// case or, in the case of variant subtags, detecting and rejecting duplicate
1115
// variants. Users must subsequently |canonicalizeBaseName| to perform these
1116
// actions.
1117
//
1118
// Do not use this function directly: use |parseBaseName| or
1119
// |parseTlangFromTransformExtension| instead.
1120
JS::Result<bool> LanguageTagParser::internalParseBaseName(JSContext* cx,
1121
LanguageTagParser& ts,
1122
LanguageTag& tag,
1123
Token& tok) {
1124
if (ts.isLanguage(tok)) {
1125
ts.copyChars(tok, tag.language_);
1126
1127
tok = ts.nextToken();
1128
} else {
1129
// The language subtag is mandatory.
1130
return false;
1131
}
1132
1133
if (ts.isScript(tok)) {
1134
ts.copyChars(tok, tag.script_);
1135
1136
tok = ts.nextToken();
1137
}
1138
1139
if (ts.isRegion(tok)) {
1140
ts.copyChars(tok, tag.region_);
1141
1142
tok = ts.nextToken();
1143
}
1144
1145
auto& variants = tag.variants_;
1146
MOZ_ASSERT(variants.length() == 0);
1147
while (ts.isVariant(tok)) {
1148
auto variant = ts.chars(cx, tok);
1149
if (!variant) {
1150
return cx->alreadyReportedOOM();
1151
}
1152
if (!variants.append(std::move(variant))) {
1153
return cx->alreadyReportedOOM();
1154
}
1155
1156
tok = ts.nextToken();
1157
}
1158
1159
return true;
1160
}
1161
1162
// Wrap the characters of the C string |locale| as Latin-1 characters.
static mozilla::Variant<const Latin1Char*, const char16_t*> StringChars(
    const char* locale) {
  const JS::Latin1Char* latin1 =
      reinterpret_cast<const JS::Latin1Char*>(locale);
  return mozilla::AsVariant(latin1);
}
1166
1167
// Wrap the characters of |linear|, either Latin-1 or two-byte depending on the
// string's representation.
static mozilla::Variant<const Latin1Char*, const char16_t*> StringChars(
    JSLinearString* linear, JS::AutoCheckCannotGC& nogc) {
  if (!linear->hasLatin1Chars()) {
    return mozilla::AsVariant(linear->twoByteChars(nogc));
  }
  return mozilla::AsVariant(linear->latin1Chars(nogc));
}
1174
1175
// Try to parse the linear string |locale| into |tag| by forwarding its
// characters and length to the |LocaleChars| overload of tryParse.
JS::Result<bool> LanguageTagParser::tryParse(JSContext* cx,
                                             JSLinearString* locale,
                                             LanguageTag& tag) {
  JS::AutoCheckCannotGC nogc;
  LocaleChars chars = StringChars(locale, nogc);
  const size_t length = locale->length();
  return tryParse(cx, chars, length, tag);
}
1182
1183
// Try to parse the characters in |locale| into |tag| by forwarding them and
// their count to the |LocaleChars| overload of tryParse.
JS::Result<bool> LanguageTagParser::tryParse(JSContext* cx,
                                             mozilla::Span<const char> locale,
                                             LanguageTag& tag) {
  LocaleChars chars = StringChars(locale.data());
  const size_t length = locale.size();
  return tryParse(cx, chars, length, tag);
}
1189
1190
// Parse the |localeLength| characters of |localeChars| as a complete
// `unicode_locale_id` and store the parsed components in |tag|. Returns true
// only if the complete input was successfully parsed.
JS::Result<bool> LanguageTagParser::tryParse(JSContext* cx,
                                             LocaleChars& localeChars,
                                             size_t localeLength,
                                             LanguageTag& tag) {
  // unicode_locale_id = unicode_language_id
  //                     extensions*
  //                     pu_extensions? ;

  LanguageTagParser ts(localeChars, localeLength);
  Token tok = ts.nextToken();

  bool baseNameOk;
  MOZ_TRY_VAR(baseNameOk, parseBaseName(cx, ts, tag, tok));
  if (!baseNameOk) {
    return false;
  }

  // extensions = unicode_locale_extensions
  //            | transformed_extensions
  //            | other_extensions ;

  // Bit set with one bit for every singleton encountered so far.
  uint64_t singletonsSeen = 0;

  auto& extensions = tag.extensions_;
  while (ts.isExtensionStart(tok)) {
    char singleton = ts.singletonKey(tok);

    // Each singleton may appear only once, reject duplicates.
    uint64_t bit = 1ULL << (mozilla::AsciiAlphanumericToNumber(singleton) + 1);
    if (singletonsSeen & bit) {
      return false;
    }
    singletonsSeen |= bit;

    Token first = tok;
    tok = ts.nextToken();

    // Remember where the subtags of this extension begin. Comparing this with
    // the position reached after the switch below detects a singleton which
    // isn't followed by any subtag.
    size_t valueStart = tok.index();

    switch (singleton) {
      case 'u': {
        while (ts.isUnicodeExtensionPart(tok)) {
          tok = ts.nextToken();
        }
        break;
      }

      case 't': {
        // transformed_extensions = sep [tT]
        //                          ((sep tlang (sep tfield)*)
        //                           | (sep tfield)+) ;

        // tlang = unicode_language_subtag
        //         (sep unicode_script_subtag)?
        //         (sep unicode_region_subtag)?
        //         (sep unicode_variant_subtag)* ;
        if (ts.isLanguage(tok)) {
          tok = ts.nextToken();

          if (ts.isScript(tok)) {
            tok = ts.nextToken();
          }

          if (ts.isRegion(tok)) {
            tok = ts.nextToken();
          }

          while (ts.isVariant(tok)) {
            tok = ts.nextToken();
          }
        }

        // tfield = tkey tvalue;
        while (ts.isTransformExtensionKey(tok)) {
          tok = ts.nextToken();

          size_t tvalueStart = tok.index();
          while (ts.isTransformExtensionPart(tok)) {
            tok = ts.nextToken();
          }

          // A `tfield` without any `tvalue` is invalid.
          if (tok.index() <= tvalueStart) {
            return false;
          }
        }
        break;
      }

      default: {
        while (ts.isOtherExtensionPart(tok)) {
          tok = ts.nextToken();
        }
        break;
      }
    }

    // Singletons must be followed by a non-singleton subtag, "en-a-b" is not
    // allowed.
    if (tok.index() <= valueStart) {
      return false;
    }

    UniqueChars extension = ts.extension(cx, first, tok);
    if (!extension || !extensions.append(std::move(extension))) {
      return cx->alreadyReportedOOM();
    }
  }

  // Trailing `pu_extension` component of the `unicode_locale_id` production.
  if (ts.isPrivateUseStart(tok)) {
    Token first = tok;
    tok = ts.nextToken();

    size_t valueStart = tok.index();
    while (ts.isPrivateUsePart(tok)) {
      tok = ts.nextToken();
    }

    // At least one subtag is required after the "-x-".
    if (tok.index() <= valueStart) {
      return false;
    }

    UniqueChars privateUse = ts.extension(cx, first, tok);
    if (!privateUse) {
      return cx->alreadyReportedOOM();
    }
    tag.privateuse_ = std::move(privateUse);
  }

  // Succeed only if the complete input was consumed.
  return tok.isNone();
}
1321
1322
bool LanguageTagParser::parse(JSContext* cx, JSLinearString* locale,
1323
LanguageTag& tag) {
1324
bool ok;
1325
JS_TRY_VAR_OR_RETURN_FALSE(cx, ok, tryParse(cx, locale, tag));
1326
if (ok) {
1327
return true;
1328
}
1329
if (UniqueChars localeChars = QuoteString(cx, locale, '"')) {
1330
JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
1331
JSMSG_INVALID_LANGUAGE_TAG, localeChars.get());
1332
}
1333
return false;
1334
}
1335
1336
bool LanguageTagParser::parse(JSContext* cx, mozilla::Span<const char> locale,
1337
LanguageTag& tag) {
1338
bool ok;
1339
JS_TRY_VAR_OR_RETURN_FALSE(cx, ok, tryParse(cx, locale, tag));
1340
if (ok) {
1341
return true;
1342
}
1343
if (UniqueChars localeChars =
1344
DuplicateString(cx, locale.data(), locale.size())) {
1345
JS_ReportErrorNumberUTF8(cx, GetErrorMessage, nullptr,
1346
JSMSG_INVALID_LANGUAGE_TAG, localeChars.get());
1347
}
1348
return false;
1349
}
1350
1351
bool LanguageTagParser::parseBaseName(JSContext* cx,
1352
mozilla::Span<const char> locale,
1353
LanguageTag& tag) {
1354
LocaleChars localeChars = StringChars(locale.data());
1355
LanguageTagParser ts(localeChars, locale.size());
1356
Token tok = ts.nextToken();
1357
1358
// Parse only the base-name part and ignore any trailing characters.
1359
bool ok;
1360
JS_TRY_VAR_OR_RETURN_FALSE(cx, ok, parseBaseName(cx, ts, tag, tok));
1361
if (ok) {
1362
return true;
1363
}
1364
if (UniqueChars localeChars = DuplicateString(cx, locale.data(),
1365
locale.size())) {
1366
JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
1367
JSMSG_INVALID_LANGUAGE_TAG, localeChars.get());
1368
}
1369
return false;
1370
}
1371
1372
// Parse |extension|, which must be a valid `transformed_extensions` subtag, and
1373
// fill |tag| and |fields| from the `tlang` and `tfield` components.
1374
JS::Result<bool> LanguageTagParser::parseTransformExtension(
1375
JSContext* cx, mozilla::Span<const char> extension, LanguageTag& tag,
1376
TFieldVector& fields) {
1377
LocaleChars extensionChars = StringChars(extension.data());
1378
LanguageTagParser ts(extensionChars, extension.size());
1379
Token tok = ts.nextToken();
1380
1381
if (!ts.isExtensionStart(tok) || ts.singletonKey(tok) != 't') {
1382
return false;
1383
}
1384
1385
tok = ts.nextToken();
1386
1387
if (tok.isNone()) {
1388
return false;
1389
}
1390
1391
if (ts.isLanguage(tok)) {
1392
// We're parsing a possible `tlang` in a known-valid transform extension, so
1393
// use the special-purpose function that takes advantage of this to compute
1394
// lowercased |tag| contents in an optimal manner.
1395
MOZ_TRY(parseTlangInTransformExtension(cx, ts, tag, tok));
1396
1397
// After `tlang` we must have a `tfield` and its `tkey`, or we're at the end
1398
// of the transform extension.
1399
MOZ_ASSERT(ts.isTransformExtensionKey(tok) || tok.isNone());
1400
} else {
1401
// If there's no `tlang` subtag, at least one `tfield` must be present.
1402
MOZ_ASSERT(ts.isTransformExtensionKey(tok));
1403
}
1404
1405
// Trailing `tfield` subtags. (Any other trailing subtags are an error,
1406
// because we're guaranteed to only see a valid tranform extension here.)
1407
while (ts.isTransformExtensionKey(tok)) {
1408
size_t begin = tok.index();
1409
tok = ts.nextToken();
1410
1411
size_t startTValue = tok.index();
1412
while (ts.isTransformExtensionPart(tok)) {
1413
tok = ts.nextToken();
1414
}
1415
1416
// `tfield` requires at least one `tvalue`.
1417
if (tok.index() <= startTValue) {
1418
return false;
1419
}
1420
1421
size_t length = tok.index() - 1 - begin;
1422
if (!fields.emplaceBack(begin, length)) {
1423
return cx->alreadyReportedOOM();
1424
}
1425
}
1426
1427
// Return true if the complete input was successfully parsed.
1428
return tok.isNone();
1429
}
1430
1431
// Parse |extension|, which must be a valid `unicode_locale_extensions` subtag,
1432
// and fill |attributes| and |keywords| from the `attribute` and `keyword`
1433
// components.
1434
JS::Result<bool> LanguageTagParser::parseUnicodeExtension(
1435
JSContext* cx, mozilla::Span<const char> extension,
1436
AttributesVector& attributes, KeywordsVector& keywords) {
1437
LocaleChars extensionChars = StringChars(extension.data());
1438
LanguageTagParser ts(extensionChars, extension.size());
1439
Token tok = ts.nextToken();
1440
1441
// unicode_locale_extensions = sep [uU] ((sep keyword)+ |
1442
// (sep attribute)+ (sep keyword)*) ;
1443
1444
if (!ts.isExtensionStart(tok) || ts.singletonKey(tok) != 'u') {
1445
return false;
1446
}
1447
1448
tok = ts.nextToken();
1449
1450
if (tok.isNone()) {
1451
return false;
1452
}
1453
1454
while (ts.isUnicodeExtensionAttribute(tok)) {
1455
if (!attributes.emplaceBack(tok.index(), tok.length())) {
1456
return cx->alreadyReportedOOM();
1457
}
1458
1459
tok = ts.nextToken();
1460
}
1461
1462
// keyword = key (sep type)? ;
1463
while (ts.isUnicodeExtensionKey(tok)) {
1464
size_t begin = tok.index();
1465
tok = ts.nextToken();
1466
1467
while (ts.isUnicodeExtensionType(tok)) {
1468
tok = ts.nextToken();
1469
}
1470
1471
if (tok.isError()) {
1472
return false;
1473
}
1474
1475
size_t length = tok.index() - 1 - begin;
1476
if (!keywords.emplaceBack(begin, length)) {
1477
return cx->alreadyReportedOOM();
1478
}
1479
}
1480
1481
// Return true if the complete input was successfully parsed.
1482
return tok.isNone();
1483
}
1484
1485
bool LanguageTagParser::canParseUnicodeExtension(
1486
mozilla::Span<const char> extension) {
1487
LocaleChars extensionChars = StringChars(extension.data());
1488
LanguageTagParser ts(extensionChars, extension.size());
1489
Token tok = ts.nextToken();
1490
1491
// unicode_locale_extensions = sep [uU] ((sep keyword)+ |
1492
// (sep attribute)+ (sep keyword)*) ;
1493
1494
if (!ts.isExtensionStart(tok) || ts.singletonKey(tok) != 'u') {
1495
return false;
1496
}
1497
1498
tok = ts.nextToken();
1499
1500
if (tok.isNone()) {
1501
return false;
1502
}
1503
1504
while (ts.isUnicodeExtensionAttribute(tok)) {
1505
tok = ts.nextToken();
1506
}
1507
1508
// keyword = key (sep type)? ;
1509
while (ts.isUnicodeExtensionKey(tok)) {
1510
tok = ts.nextToken();
1511
1512
while (ts.isUnicodeExtensionType(tok)) {
1513
tok = ts.nextToken();
1514
}
1515
1516
if (tok.isError()) {
1517
return false;
1518
}
1519
}
1520
1521
// Return true if the complete input was successfully parsed.
1522
return tok.isNone();
1523
}
1524
1525
bool LanguageTagParser::canParseUnicodeExtensionType(
1526
JSLinearString* unicodeType) {
1527
MOZ_ASSERT(unicodeType->length() > 0, "caller must exclude empty strings");
1528
1529
JS::AutoCheckCannotGC nogc;
1530
LocaleChars unicodeTypeChars = StringChars(unicodeType, nogc);
1531
1532
LanguageTagParser ts(unicodeTypeChars, unicodeType->length());
1533
Token tok = ts.nextToken();
1534
1535
while (ts.isUnicodeExtensionType(tok)) {
1536
tok = ts.nextToken();
1537
}
1538
1539
// Return true if the complete input was successfully parsed.
1540
return tok.isNone();
1541
}
1542
1543
bool ParseStandaloneLanguageTag(HandleLinearString str,
1544
LanguageSubtag& result) {
1545
JS::AutoCheckCannotGC nogc;
1546
if (str->hasLatin1Chars()) {
1547
if (!IsStructurallyValidLanguageTag<Latin1Char>(str->latin1Range(nogc))) {
1548
return false;
1549
}
1550
result.set<Latin1Char>(str->latin1Range(nogc));
1551
} else {
1552
if (!IsStructurallyValidLanguageTag<char16_t>(str->twoByteRange(nogc))) {
1553
return false;
1554
}
1555
result.set<char16_t>(str->twoByteRange(nogc));
1556
}
1557
return true;
1558
}
1559
1560
// If |str| is a structurally valid script subtag, copy its characters into
// |result| and return true. Otherwise return false and leave |result|
// untouched.
bool ParseStandaloneScriptTag(HandleLinearString str, ScriptSubtag& result) {
  JS::AutoCheckCannotGC nogc;

  if (str->hasLatin1Chars()) {
    const bool valid =
        IsStructurallyValidScriptTag<Latin1Char>(str->latin1Range(nogc));
    if (valid) {
      result.set<Latin1Char>(str->latin1Range(nogc));
    }
    return valid;
  }

  const bool valid =
      IsStructurallyValidScriptTag<char16_t>(str->twoByteRange(nogc));
  if (valid) {
    result.set<char16_t>(str->twoByteRange(nogc));
  }
  return valid;
}
1575
1576
// If |str| is a structurally valid region subtag, copy its characters into
// |result| and return true. Otherwise return false and leave |result|
// untouched.
bool ParseStandaloneRegionTag(HandleLinearString str, RegionSubtag& result) {
  JS::AutoCheckCannotGC nogc;

  if (str->hasLatin1Chars()) {
    const bool valid =
        IsStructurallyValidRegionTag<Latin1Char>(str->latin1Range(nogc));
    if (valid) {
      result.set<Latin1Char>(str->latin1Range(nogc));
    }
    return valid;
  }

  const bool valid =
      IsStructurallyValidRegionTag<char16_t>(str->twoByteRange(nogc));
  if (valid) {
    result.set<char16_t>(str->twoByteRange(nogc));
  }
  return valid;
}
1591
1592
template <typename CharT>
1593
static bool IsAsciiLowercaseAlpha(mozilla::Span<const CharT> span) {
1594
// Tell the analysis the |std::all_of| function can't GC.
1595
JS::AutoSuppressGCAnalysis nogc;
1596
1597
const CharT* ptr = span.data();
1598
size_t length = span.size();
1599
return std::all_of(ptr, ptr + length, mozilla::IsAsciiLowercaseAlpha<CharT>);
1600
}
1601
1602
// Returns true if |mozilla::IsAsciiLowercaseAlpha| holds for every character
// of |str|, whichever character representation the string uses.
static bool IsAsciiLowercaseAlpha(JSLinearString* str) {
  JS::AutoCheckCannotGC nogc;
  if (!str->hasLatin1Chars()) {
    return IsAsciiLowercaseAlpha<char16_t>(str->twoByteRange(nogc));
  }
  return IsAsciiLowercaseAlpha<Latin1Char>(str->latin1Range(nogc));
}
1609
1610
template <typename CharT>
1611
static bool IsAsciiAlpha(mozilla::Span<const CharT> span) {
1612
// Tell the analysis the |std::all_of| function can't GC.
1613
JS::AutoSuppressGCAnalysis nogc;
1614
1615
const CharT* ptr = span.data();
1616
size_t length = span.size();
1617
return std::all_of(ptr, ptr + length, mozilla::IsAsciiAlpha<CharT>);
1618
}
1619
1620
// Returns true if |mozilla::IsAsciiAlpha| holds for every character of |str|,
// whichever character representation the string uses.
static bool IsAsciiAlpha(JSLinearString* str) {
  JS::AutoCheckCannotGC nogc;
  if (!str->hasLatin1Chars()) {
    return IsAsciiAlpha<char16_t>(str->twoByteRange(nogc));
  }
  return IsAsciiAlpha<Latin1Char>(str->latin1Range(nogc));
}
1627
1628
// Parse |str| as a standalone ISO-639 language code. Returns nullptr if |str|
// is rejected, otherwise a string holding the lower-cased language subtag with
// any language mapping applied. |str| itself is returned when no new string is
// needed.
JS::Result<JSString*> ParseStandaloneISO639LanguageTag(JSContext* cx,
                                                       HandleLinearString str) {
  // ISO-639 language codes contain either two or three characters.
  const size_t length = str->length();
  if (length < 2 || length > 3) {
    return nullptr;
  }

  // Input which is already in lower case can be returned directly below. Any
  // other input must at least be an ASCII alpha string.
  const bool isLowerCase = IsAsciiLowercaseAlpha(str);
  if (!isLowerCase && !IsAsciiAlpha(str)) {
    return nullptr;
  }

  LanguageSubtag languageTag;
  {
    JS::AutoCheckCannotGC nogc;
    if (str->hasLatin1Chars()) {
      languageTag.set<Latin1Char>(str->latin1Range(nogc));
    } else {
      languageTag.set<char16_t>(str->twoByteRange(nogc));
    }
  }

  // The language subtag is canonicalized to lower case.
  if (!isLowerCase) {
    languageTag.toLowerCase();
  }

  // Reject the input if the canonical tag contains more than just a single
  // language subtag.
  if (LanguageTag::complexLanguageMapping(languageTag)) {
    return nullptr;
  }

  // Take care to replace deprecated subtags with their preferred values. A new
  // string is required whenever the subtag was replaced or lower-cased.
  const bool replaced = LanguageTag::languageMapping(languageTag);

  JSString* result;
  if (replaced || !isLowerCase) {
    auto chars = languageTag.span();
    result = NewStringCopyN<CanGC>(cx, chars.data(), chars.size());
  } else {
    result = str;
  }
  if (!result) {
    return cx->alreadyReportedOOM();
  }
  return result;
}
1678
1679
// Trace the |type_| member as a root labelled "UnicodeExtensionKeyword::type".
void js::intl::UnicodeExtensionKeyword::trace(JSTracer* trc) {
  auto* typeEdge = &type_;
  TraceRoot(trc, typeEdge, "UnicodeExtensionKeyword::type");
}
1682
1683
} // namespace intl
1684
} // namespace js