LanguageId.h - mozsearch

firefox-main/js/src/util/LanguageId.h (file symbol)

Enable keyboard shortcuts

Source code

File a bug in Core :: JavaScript Engine

Revision control

Copy as Markdown

Other Tools

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-

 * vim: set ts=8 sts=2 et sw=2 tw=80:

 * This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef util_LanguageId_h

#define util_LanguageId_h

#include "mozilla/Assertions.h"

#include "mozilla/HashFunctions.h"

#include "mozilla/Maybe.h"

#include "mozilla/Span.h"

#include "mozilla/TextUtils.h"

#include <algorithm>

#include <array>

#include <cstring>

#include <stdint.h>

#include <string_view>

#include <utility>

namespace js {

class LanguageIdString;

/**

 * Compact representation of language identifiers.

 * Language identifiers have the following limitations when compared to Unicode

 * BCP 47 locale identifiers:

 * - Language subtags can have at most three letters.

 * - Variant and extension subtags are not supported.

 * In other words, language identifiers contain only language, script, and

 * region subtags.

 * All locales supported by ICU4C can be represented as language identifiers,

 * except for "en_US_POSIX". "en_US_POSIX" canonicalizes to "en-US-u-va-posix",

 * which contains a Unicode extension sequence, so it's not a valid available

 * ECMA-402 locale, see also <https://tc39.es/ecma402/#available-locales-list>.

 * Features:

 * - Fixed-length fields to avoid any heap allocations.

 * - Minimal size to allow efficient storing in other data structures.

 * - Fast comparison support for prefix-based locale lookup operations.

 * - Methods optimized for fast generated assembly code. Verified by inspecting

 *   the (x86) assembly code for Clang with optimization level O3 and ensuring

 *   all methods generate only basic assembly instructions and don't require

 *   calls to other built-ins.

 * References:

 * https://tc39.es/ecma402/#sec-language-tags

 * https://unicode-org.github.io/icu/userguide/locale/

 * https://unicode.org/reports/tr35/tr35.html#Unicode_Language_and_Locale_Identifiers

*/

class LanguageId final {

  // GCC 10 doesn't support defaulted equality operators for plain arrays

  // (<https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93480>). So we can't write

  // this:

//

  // char language_[3] = {};

  // char script_[4] = {};

  // char region_[3] = {};

//

  // In addition to that GCC bug, Clang sometimes (!) generates worse code for

  // comparisons when separate arrays are used.

  std::array<char, 10> chars_{};

  constexpr auto as_span() { return mozilla::Span<char, 10>{chars_}; }

  constexpr auto language_span() { return as_span().Subspan<0, 3>(); }

  constexpr auto script_span() { return as_span().Subspan<3, 4>(); }

  constexpr auto region_span() { return as_span().Subspan<7, 3>(); }

  constexpr auto as_span() const {

    return mozilla::Span<const char, 10>{chars_};

  constexpr auto language_span() const { return as_span().Subspan<0, 3>(); }

  constexpr auto script_span() const { return as_span().Subspan<3, 4>(); }

  constexpr auto region_span() const { return as_span().Subspan<7, 3>(); }

  friend class LanguageIdString;

/**

   * Return true if |language| is a language subtag in canonical case.

   * Canonical case of language subtags is lower-case.

*/

  template <typename CharT>

  static constexpr bool IsValidLanguage(

      std::basic_string_view<CharT> language) {

    return (language.length() == 2 || language.length() == 3) &&

           std::all_of(language.begin(), language.end(),

                       mozilla::IsAsciiLowercaseAlpha<CharT>);

/**

   * Return true if |script| is a script subtag in canonical case.

   * Canonical case of script subtags is title-case.

*/

  template <typename CharT>

  static constexpr bool IsValidScript(std::basic_string_view<CharT> script) {

    return script.length() == 4 && mozilla::IsAsciiUppercaseAlpha(script[0]) &&

           std::all_of(std::next(script.begin()), script.end(),

                       mozilla::IsAsciiLowercaseAlpha<CharT>);

/**

   * Return true if |region| is a alpha region subtag in canonical case.

   * Canonical case of region subtags is upper-case.

*/

  template <typename CharT>

  static constexpr bool IsValidAlphaRegion(

      std::basic_string_view<CharT> region) {

    return region.length() == 2 &&

           std::all_of(region.begin(), region.end(),

                       mozilla::IsAsciiUppercaseAlpha<CharT>);

/**

   * Return true if |region| is a digit region subtag.

*/

  template <typename CharT>

  static constexpr bool IsValidDigitRegion(

      std::basic_string_view<CharT> region) {

    return region.length() == 3 && std::all_of(region.begin(), region.end(),

                                               mozilla::IsAsciiDigit<CharT>);

/**

   * Return true if |region| is a region subtag.

*/

  template <typename CharT>

  static constexpr bool IsValidRegion(std::basic_string_view<CharT> region) {

    return IsValidAlphaRegion(region) || IsValidDigitRegion(region);

  constexpr LanguageId() = default;

 public:

  constexpr bool operator==(const LanguageId&) const = default;

/**

   * Language subtag of this language identifier.

*/

  constexpr auto language() const {

    // Language subtags are two or three characters long.

    size_t length = 2 + (language_span()[2] != '\0');

    return std::string_view{std::data(language_span()), length};

/**

   * Script subtag of this language identifier or empty if no script subtag is

   * present.

*/

  constexpr auto script() const {

    // Script subtags are always four characters long.

    size_t length = hasScript() ? 4 : 0;

    return std::string_view{std::data(script_span()), length};

/**

   * Region subtag of this language identifier or empty if no region subtag is

   * present.

*/

  constexpr auto region() const {

    // Region subtags are two or three characters long.

    size_t length = hasRegion() ? (2 + (region_span()[2] != '\0')) : 0;

    return std::string_view{std::data(region_span()), length};

/**

   * Return true if this language identifier has a script subtag.

*/

  constexpr bool hasScript() const { return script_span()[0] != '\0'; }

/**

   * Return true if this language identifier has a region subtag.

*/

  constexpr bool hasRegion() const { return region_span()[0] != '\0'; }

/**

   * Hash number of this language identifier.

*/

  auto hash() const {

    auto [lead_span, trail_span] = as_span().SplitAt<8>();

    uint64_t lead = 0;

    std::memcpy(&lead, std::data(lead_span), std::size(lead_span));

    uint32_t trail = 0;

    std::memcpy(&trail, std::data(trail_span), std::size(trail_span));

    // Using HashGeneric is much faster than for example HashStringKnownLength.

    return mozilla::HashGeneric(lead, trail);

 private:

  template <char... separators, typename CharT>

  static constexpr mozilla::Maybe<std::pair<LanguageId, size_t>> from(

      std::basic_string_view<CharT> localeId) {

    // Return true iff |sv| starts with a subtag of length |len|.

    auto hasSubtag = [](std::basic_string_view<CharT> sv, size_t len) {

      if (sv.length() == len) {

        return true;

      if (sv.length() > len) {

        auto ch = sv[len];

        return (... || (separators == ch));

      return false;

};

    // Copy the subtag |tag| to |dest| and then removed the processed prefix

    // from |localeId|.

    auto copyAndRemovePrefix = [&](auto dest,

                                   std::basic_string_view<CharT> tag) {

      MOZ_ASSERT(localeId.starts_with(tag), "tag is a prefix");

      MOZ_ASSERT(std::size(dest) >= tag.length(), "dest is large enough");

      std::copy_n(tag.data(), tag.length(), std::data(dest));

      localeId.remove_prefix(tag.length() + (localeId.length() > tag.length()));

};

    LanguageId result{};

    // NB: Two and three letter language tags handled in separate branches to

    // ensure the compiler treats |lang.length()| as a compile-time constant.

    // This leads to smaller and faster generated assembly code, because memcpy

    // calls with a constant length can inlined.

    if (hasSubtag(localeId, 2)) {

      auto lang = localeId.substr(0, 2);

      if (!IsValidLanguage(lang)) [[unlikely]] {

        return mozilla::Nothing();

      copyAndRemovePrefix(result.language_span(), lang);

    } else if (hasSubtag(localeId, 3)) {

      auto lang = localeId.substr(0, 3);

      if (!IsValidLanguage(lang)) [[unlikely]] {

        return mozilla::Nothing();

      copyAndRemovePrefix(result.language_span(), lang);

    } else [[unlikely]] {

      return mozilla::Nothing();

    // Optional script subtag.

    if (hasSubtag(localeId, 4)) {

      auto script = localeId.substr(0, 4);

      if (IsValidScript(script)) [[likely]] {

        copyAndRemovePrefix(result.script_span(), script);

    // Optional region subtag.

    if (hasSubtag(localeId, 2)) {

      auto region = localeId.substr(0, 2);

      if (IsValidAlphaRegion(region)) [[likely]] {

        copyAndRemovePrefix(result.region_span(), region);

    } else if (hasSubtag(localeId, 3)) {

      auto region = localeId.substr(0, 3);

      if (IsValidDigitRegion(region)) [[likely]] {

        copyAndRemovePrefix(result.region_span(), region);

    return mozilla::Some(std::pair{result, localeId.length()});

 public:

/**

   * Create a language identifier from an ICU or Unicode locale identifier.

   * Returns the language identifier and the number of unprocessed characters

   * (trailing subtags or unparseable characters). Return Nothing if the input

   * doesn't start with a language subtag.

   * The language, script, and region subtags must be in canonical case.

   * Subtags in ICU and Unicode locale identifiers are separated by "-" or "_".

*/

  static constexpr auto fromId(std::string_view localeId) {

    return from<'-', '_'>(localeId);

/**

   * Create a language identifier from an ICU or Unicode locale identifier.

   * Returns the language identifier and the number of unprocessed characters

   * (trailing subtags or unparseable characters). Return Nothing if the input

   * doesn't start with a language subtag.

   * The language, script, and region subtags must be in canonical case.

   * Subtags in ICU and Unicode locale identifiers are separated by "-" or "_".

*/

  static constexpr auto fromId(mozilla::Span<const char> localeId) {

    return fromId(std::string_view{localeId.data(), localeId.size()});

/**

   * Create a language identifier from a Unicode BCP 47 locale identifier.

   * Returns the language identifier and the number of unprocessed characters

   * (trailing subtags or unparseable characters). Return Nothing if the input

   * doesn't start with a language subtag.

   * The language, script, and region subtags must be in canonical case.

   * Subtags in BCP 47 locale identifiers are separated by "-".

*/

  static constexpr auto fromBcp49(std::string_view localeId) {

    return from<'-'>(localeId);

/**

   * Create a language identifier from a Unicode BCP 47 locale identifier.

   * Returns the language identifier and the number of unprocessed characters

   * (trailing subtags or unparseable characters). Return Nothing if the input

   * doesn't start with a language subtag.

   * The language, script, and region subtags must be in canonical case.

   * Subtags in BCP 47 locale identifiers are separated by "-".

*/

  static constexpr auto fromBcp49(std::u16string_view localeId) {

    return from<u'-'>(localeId);

/**

   * Create a language identifier from a Unicode BCP 47 locale identifier.

   * Returns the language identifier and the number of unprocessed characters

   * (trailing subtags or unparseable characters). Return Nothing if the input

   * doesn't start with a language subtag.

   * The language, script, and region subtags must be in canonical case.

   * Subtags in BCP 47 locale identifiers are separated by "-".

*/

  template <typename CharT>

  static constexpr auto fromBcp49(mozilla::Span<const CharT> localeId) {

    return fromBcp49(std::basic_string_view{localeId.data(), localeId.size()});

/**

   * Create a language identifier from a valid Unicode BCP 47 locale identifier.

   * The language, script, and region subtags must be in canonical case.

   * Subtags in BCP 47 locale identifiers are separated by "-".

*/

  static consteval auto fromValidBcp49(std::string_view localeId) {

    return fromBcp49(localeId)->first;

/**

   * Create a language identifier from a valid subtags.

   * The language, script, and region subtags must be in canonical case.

*/

  static constexpr auto fromParts(std::string_view language,

                                  std::string_view script,

                                  std::string_view region) {

    MOZ_ASSERT(IsValidLanguage(language));

    MOZ_ASSERT_IF(!script.empty(), IsValidScript(script));

    MOZ_ASSERT_IF(!region.empty(), IsValidRegion(region));

    LanguageId result{};

    language.copy(std::data(result.language_span()), language.length());

    script.copy(std::data(result.script_span()), script.length());

    region.copy(std::data(result.region_span()), region.length());

    return result;

/**

   * Return the language identifier for the undetermined locale "und".

*/

  static constexpr auto und() {

    constexpr LanguageId locale = fromValidBcp49("und");

    return locale;

/**

   * Return the language identifier with any script subtag removed.

*/

  constexpr auto withoutScript() const {

    LanguageId result = *this;

    // mozilla::Span requires that the _same_ span is used for iteration.

    auto script = result.script_span();

    std::fill(std::begin(script), std::end(script), '\0');

    return result;

/**

   * Return the language identifier with any region subtag removed.

*/

  constexpr auto withoutRegion() const {

    LanguageId result = *this;

    // mozilla::Span requires that the _same_ span is used for iteration.

    auto region = result.region_span();

    std::fill(std::begin(region), std::end(region), '\0');

    return result;

/**

   * Return the parent language identifier or "und" if this language identifier

   * consists of a single language subtag.

*/

  constexpr auto parentLocale() const {

    if (hasRegion()) {

      return withoutRegion();

    if (hasScript()) {

      return withoutScript();

    return und();

/**

   * Return `true` if this language identifier is a prefix of `other`.

   * Examples:

   * - "en" is a prefix of "en", "en-Latn", "en-US", and "en-Latn-US".

   * - "en-Latn" is a prefix of "en-Latn" and "en-Latn-US".

   * - "en-US" is a prefix of "en-US".

   * - "en-US" is not a prefix of "en-Latn-US".

   * - "en-Latn-US" is a prefix "en-Latn-US".

*/

  constexpr bool isPrefixOf(LanguageId other) const {

    if (!hasRegion()) {

      // Remove region subtag if this language identifier has no region.

      other = other.withoutRegion();

      if (!hasScript()) {

        // Remove script subtag if this language identifier has no script.

        other = other.withoutScript();

    return *this == other;

/**

   * Return the language identifier string.

*/

  constexpr auto toString() const;

};

static_assert(sizeof(LanguageId) == 10,

              "LanguageId uses a compact language identifier representation");

/**

 * String representation of a language identifier as a Unicode BCP 47 locale

 * identifier.

*/

class LanguageIdString final {

  // Language subtag: 2-3 characters

  // Script subtag: 4 characters

  // Region subtag: 2-3 characters

  // Subtag separator: 1 character ("-")

//

  // Total: 12 + 1 (null terminated for ICU4C).

  std::array<char, 12 + 1> chars_ = {};

  // String length can't exceed 12 characters, so it fits into uint8_t.

  uint8_t length_ = 0;

  friend class LanguageId;

  constexpr explicit LanguageIdString(const LanguageId& langId) {

    static_assert(

        decltype(std::declval<LanguageId>().as_span())::extent +

                3 /* two subtag separators and a trailing NUL character */

            <= std::tuple_size_v<decltype(LanguageIdString::chars_)>,

        "LanguageIdString::chars_ is large enough to hold all subtags");

    auto out = std::begin(chars_);

    // Copy the language subtag.

//

    // Intentionally use `std::copy[_n]()` instead of `string_view::copy()` here

    // and below to copy a compile-time constant number of characters. This may

    // include a trailing NUL character, which will be overwritten if necessary.

    auto language = langId.language();

    MOZ_ASSERT(!language.empty(), "language subtag is never empty");

    // Generated assembly code of this constructor is 25% larger when calling

    // `std::copy` on a mozilla::Span instead of `std::copy_n`. `std::span`

    // generates the same assembly for `std::copy` and `std::copy_n`.

    auto language_span = langId.language_span();

    std::copy_n(std::data(language_span), std::size(language_span), out);

    out += language.length();

    // Copy the script subtag, if present.

    if (auto script = langId.script(); !script.empty()) {

      auto script_span = langId.script_span();

      *out++ = '-';

      std::copy_n(std::data(script_span), std::size(script_span), out);

      out += script.length();

    // Copy the region subtag, if present.

    if (auto region = langId.region(); !region.empty()) {

      auto region_span = langId.region_span();

      *out++ = '-';

      std::copy_n(std::data(region_span), std::size(region_span), out);

      out += region.length();

    length_ = std::distance(std::begin(chars_), out);

    MOZ_ASSERT(chars_[length_] == '\0', "chars_ is null-terminated");

 public:

/**

   * Auto-converts into a `std::string_view`.

*/

  constexpr operator std::string_view() const {

    return std::string_view{std::data(chars_), length_};

/**

   * Auto-converts into a `mozilla::Span`.

*/

  constexpr operator mozilla::Span<const char>() const {

    return mozilla::Span{std::data(chars_), length_};

/**

   * Return the length of the language identifier string.

*/

  constexpr size_t length() const { return length_; }

/**

   * Return a pointer to the language identifier string's characters.

*/

  constexpr const char* data() const { return std::data(chars_); }

/**

   * Return a pointer to a null-terminated character array.

   * Prefer this method over calling `data()` when passing the language

   * identifier string as a null-terminated string, because it gives stronger

   * signal that the characters are null-terminated.

   * The method name is borrowed from `std::string::c_str()`.

*/

  constexpr const char* c_str() const { return std::data(chars_); }

};

static_assert(sizeof(LanguageIdString) <= 2 * sizeof(uint64_t),

              "LanguageIdString fits into two 64-bit registers");

constexpr auto LanguageId::toString() const { return LanguageIdString{*this}; }

}  // namespace js

#endif /* util_LanguageId_h */