nsUnicharUtils.cpp

mozilla-central/intl/unicharutil/util/nsUnicharUtils.cpp (file symbol)

Enable keyboard shortcuts

Source code

Revision control

Copy as Markdown

Other Tools

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */

/* This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "nsUnicharUtils.h"

#include "nsUnicodeProperties.h"

#include "nsUTF8Utils.h"

#include "mozilla/Likely.h"

#include "mozilla/HashFunctions.h"

#include "mozilla/intl/UnicodeProperties.h"

#include "mozilla/StaticPrefs_layout.h"

// ASCII lower-casing lookup table, indexed by a 7-bit character value.
// Every entry is the identity mapping except 'A'..'Z' (0x41..0x5a), which
// map to 'a'..'z' (0x61..0x7a).
static const uint8_t gASCIIToLower[128] = {
    // 0x00 - 0x0f
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    // 0x10 - 0x1f
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    // 0x20 - 0x2f
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
    0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    // 0x30 - 0x3f
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    // 0x40 - 0x4f: '@' stays, 'A'..'O' become 'a'..'o'
    0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    // 0x50 - 0x5f: 'P'..'Z' become 'p'..'z', the rest stay
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    // 0x60 - 0x6f
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    // 0x70 - 0x7f
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
};

// We want ToLowerCase(uint32_t) and ToLowerCaseASCII(uint32_t) to be fast
// when they're called from within the case-insensitive comparators, so we
// define inlined versions.

// Returns the lower-case form of the code point aChar. Values for which
// IS_ASCII holds are looked up in gASCIIToLower; everything else is handed
// to mozilla::intl::UnicodeProperties::ToLower.
static MOZ_ALWAYS_INLINE uint32_t ToLowerCase_inline(uint32_t aChar) {
  if (IS_ASCII(aChar)) {
    return gASCIIToLower[aChar];
  }

  return mozilla::intl::UnicodeProperties::ToLower(aChar);
}

// Lower-cases aChar only when IS_ASCII holds for it (via the gASCIIToLower
// table); any other value is returned unchanged.
static MOZ_ALWAYS_INLINE uint32_t
ToLowerCaseASCII_inline(const uint32_t aChar) {
  if (IS_ASCII(aChar)) {
    return gASCIIToLower[aChar];
  }

  return aChar;
}

// Lower-cases aString in place by running the buffer-based ToLowerCase
// overload with the string's writable buffer as both input and output.
void ToLowerCase(nsAString& aString) {
  char16_t* buf = aString.BeginWriting();
  ToLowerCase(buf, buf, aString.Length());
}

// Lower-cases aString in place by running the buffer-based ToLowerCaseASCII
// overload with the string's writable buffer as both input and output.
void ToLowerCaseASCII(nsAString& aString) {
  char16_t* buf = aString.BeginWriting();
  ToLowerCaseASCII(buf, buf, aString.Length());
}

// Maps 'A'..'Z' to 'a'..'z'; every other char value is returned unchanged.
char ToLowerCaseASCII(char aChar) {
  if (aChar >= 'A' && aChar <= 'Z') {
    // The sum is computed as int; narrow back explicitly.
    return static_cast<char>(aChar + 0x20);
  }
  return aChar;
}

// Maps u'A'..u'Z' to u'a'..u'z'; every other UTF-16 code unit is returned
// unchanged.
char16_t ToLowerCaseASCII(char16_t aChar) {
  if (aChar >= 'A' && aChar <= 'Z') {
    // The sum is computed as int; narrow back explicitly.
    return static_cast<char16_t>(aChar + 0x20);
  }
  return aChar;
}

// Maps U'A'..U'Z' to U'a'..U'z'; every other value is returned unchanged.
char32_t ToLowerCaseASCII(char32_t aChar) {
  if (aChar >= 'A' && aChar <= 'Z') {
    return aChar + 0x20;
  }
  return aChar;
}

// Maps 'a'..'z' to 'A'..'Z'; every other char value is returned unchanged.
char ToUpperCaseASCII(char aChar) {
  if (aChar >= 'a' && aChar <= 'z') {
    // The difference is computed as int; narrow back explicitly.
    return static_cast<char>(aChar - 0x20);
  }
  return aChar;
}

// Maps u'a'..u'z' to u'A'..u'Z'; every other UTF-16 code unit is returned
// unchanged.
char16_t ToUpperCaseASCII(char16_t aChar) {
  if (aChar >= 'a' && aChar <= 'z') {
    // The difference is computed as int; narrow back explicitly.
    return static_cast<char16_t>(aChar - 0x20);
  }
  return aChar;
}

// Maps U'a'..U'z' to U'A'..U'Z'; every other value is returned unchanged.
char32_t ToUpperCaseASCII(char32_t aChar) {
  if (aChar >= 'a' && aChar <= 'z') {
    return aChar - 0x20;
  }
  return aChar;
}

// Writes the lower-cased form of aSource into aDest. aDest is resized to
// aSource's length first, then filled by the buffer-based ToLowerCase
// overload.
void ToLowerCase(const nsAString& aSource, nsAString& aDest) {
  const char16_t* in = aSource.BeginReading();
  size_t len = aSource.Length();

  aDest.SetLength(len);
  char16_t* out = aDest.BeginWriting();

  ToLowerCase(in, out, len);
}

// Writes aSource into aDest with ASCII letters lower-cased. aDest is resized
// to aSource's length first, then filled by the buffer-based
// ToLowerCaseASCII overload.
void ToLowerCaseASCII(const nsAString& aSource, nsAString& aDest) {
  const char16_t* in = aSource.BeginReading();
  size_t len = aSource.Length();

  aDest.SetLength(len);
  char16_t* out = aDest.BeginWriting();

  ToLowerCaseASCII(in, out, len);
}

// Out-of-line entry point that forwards to ToLowerCaseASCII_inline.
uint32_t ToLowerCaseASCII(const uint32_t aChar) {
  return ToLowerCaseASCII_inline(aChar);
}

// Upper-cases aString in place by running the buffer-based ToUpperCase
// overload with the string's writable buffer as both input and output.
void ToUpperCase(nsAString& aString) {
  char16_t* buf = aString.BeginWriting();
  ToUpperCase(buf, buf, aString.Length());
}

// Writes the upper-cased form of aSource into aDest. aDest is resized to
// aSource's length first, then filled by the buffer-based ToUpperCase
// overload.
void ToUpperCase(const nsAString& aSource, nsAString& aDest) {
  const char16_t* in = aSource.BeginReading();
  size_t len = aSource.Length();

  aDest.SetLength(len);
  char16_t* out = aDest.BeginWriting();

  ToUpperCase(in, out, len);
}

#ifdef MOZILLA_INTERNAL_API

// Returns the case-folded form of aChar. Values for which IS_ASCII holds are
// looked up in gASCIIToLower; everything else is delegated to
// mozilla::unicode::GetFoldedcase.
uint32_t ToFoldedCase(uint32_t aChar) {
  if (IS_ASCII(aChar)) {
    return gASCIIToLower[aChar];
  }
  return mozilla::unicode::GetFoldedcase(aChar);
}

// Case-folds aString in place by running the buffer-based ToFoldedCase
// overload with the string's writable buffer as both input and output.
void ToFoldedCase(nsAString& aString) {
  char16_t* buf = aString.BeginWriting();
  ToFoldedCase(buf, buf, aString.Length());
}

// Case-folds aLen UTF-16 code units from aIn into aOut.
//
// A surrogate pair is combined into one code point, folded with
// mozilla::unicode::GetFoldedcase and written back as two code units; any
// other code unit goes through ToFoldedCase(uint32_t). The in-place string
// overload in this file calls this with aIn == aOut.
void ToFoldedCase(const char16_t* aIn, char16_t* aOut, size_t aLen) {
  // Index with size_t so the counter has the same width as aLen.
  for (size_t i = 0; i < aLen; i++) {
    uint32_t ch = aIn[i];
    // Only look at aIn[i + 1] when it is still inside the buffer.
    if (i + 1 < aLen && NS_IS_SURROGATE_PAIR(ch, aIn[i + 1])) {
      ch = mozilla::unicode::GetFoldedcase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
      NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
      // Emit both halves; the post-increment plus the loop increment step
      // past the pair.
      aOut[i++] = H_SURROGATE(ch);
      aOut[i] = L_SURROGATE(ch);
      continue;
    }
    aOut[i] = ToFoldedCase(ch);
  }
}

// Returns aChar unchanged when IS_ASCII holds for it; otherwise returns the
// result of mozilla::unicode::GetNaked(aChar).
uint32_t ToNaked(uint32_t aChar) {
  if (IS_ASCII(aChar)) {
    return aChar;
  }
  return mozilla::unicode::GetNaked(aChar);
}

// Rewrites aString in place: code points for which
// mozilla::unicode::IsCombiningDiacritic is true are cut out, and every
// other code point is replaced by its ToNaked / GetNaked mapping.
//
// The string may shrink while it is walked, so Length() is re-read on every
// iteration and the index only advances past characters that are kept.
void ToNaked(nsAString& aString) {
  uint32_t i = 0;
  while (i < aString.Length()) {
    uint32_t ch = aString[i];

    // Only look at aString[i + 1] when it is still inside the string.
    if (i + 1 < aString.Length() &&
        NS_IS_SURROGATE_PAIR(ch, aString[i + 1])) {
      ch = SURROGATE_TO_UCS4(ch, aString[i + 1]);
      if (mozilla::unicode::IsCombiningDiacritic(ch)) {
        // Drop both halves; i now indexes whatever followed the pair.
        aString.Cut(i, 2);
      } else {
        ch = mozilla::unicode::GetNaked(ch);
        NS_ASSERTION(!IS_IN_BMP(ch), "stripping crossed BMP/SMP boundary!");
        aString.Replace(i++, 1, H_SURROGATE(ch));
        aString.Replace(i++, 1, L_SURROGATE(ch));
      }
      continue;
    }

    if (mozilla::unicode::IsCombiningDiacritic(ch)) {
      // Drop the code unit; i now indexes whatever followed it.
      aString.Cut(i, 1);
    } else {
      aString.Replace(i++, 1, ToNaked(ch));
    }
  }
}

// Comparator over two UTF-16 buffers. Buffers of different length are
// ordered by length alone (longer is greater); equal-length buffers are
// compared with CaseInsensitiveCompare.
int32_t nsCaseInsensitiveStringComparator(const char16_t* lhs,
                                          const char16_t* rhs, size_t lLength,
                                          size_t rLength) {
  if (lLength == rLength) {
    return CaseInsensitiveCompare(lhs, rhs, lLength);
  }
  return (lLength > rLength) ? 1 : -1;
}

// Comparator over two byte buffers; forwards to the four-argument
// CaseInsensitiveCompare overload.
int32_t nsCaseInsensitiveUTF8StringComparator(const char* lhs, const char* rhs,
                                              size_t lLength, size_t rLength) {
  return CaseInsensitiveCompare(lhs, rhs, lLength, rLength);
}

// Comparator over two UTF-16 buffers that ignores case only for code units
// handled by ToLowerCaseASCII_inline.
//
// Buffers of different length are ordered by length alone (longer is
// greater). Otherwise the first pair of code units that still differs after
// lower-casing decides the result; 0 means no such pair was found.
int32_t nsASCIICaseInsensitiveStringComparator(const char16_t* lhs,
                                               const char16_t* rhs,
                                               size_t lLength, size_t rLength) {
  if (lLength != rLength) {
    if (lLength > rLength) {
      return 1;
    }
    return -1;
  }

  while (rLength) {
    // we don't care about surrogates here, because we're only
    // lowercasing the ASCII range
    char16_t l = *lhs++;
    char16_t r = *rhs++;

    // Only pay for the lower-casing when the raw code units differ.
    if (l != r) {
      l = ToLowerCaseASCII_inline(l);
      r = ToLowerCaseASCII_inline(r);

      if (l > r) {
        return 1;
      } else if (r > l) {
        return -1;
      }
    }
    rLength--;
  }

  return 0;
}

#endif  // MOZILLA_INTERNAL_API

// Out-of-line entry point for single code points; the work is done by
// ToLowerCase_inline.
uint32_t ToLowerCase(uint32_t aChar) {
  const uint32_t lowered = ToLowerCase_inline(aChar);
  return lowered;
}

// Lower-cases aLen UTF-16 code units from aIn into aOut.
//
// A surrogate pair is combined into one code point, mapped with
// mozilla::intl::UnicodeProperties::ToLower and written back as two code
// units; any other code unit goes through ToLowerCase(uint32_t). The
// in-place string overload in this file calls this with aIn == aOut.
void ToLowerCase(const char16_t* aIn, char16_t* aOut, size_t aLen) {
  for (size_t i = 0; i < aLen; i++) {
    uint32_t ch = aIn[i];
    // Only look at aIn[i + 1] when it is still inside the buffer.
    if (i + 1 < aLen && NS_IS_SURROGATE_PAIR(ch, aIn[i + 1])) {
      ch = mozilla::intl::UnicodeProperties::ToLower(
          SURROGATE_TO_UCS4(ch, aIn[i + 1]));
      NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
      // Emit both halves; the post-increment plus the loop increment step
      // past the pair.
      aOut[i++] = H_SURROGATE(ch);
      aOut[i] = L_SURROGATE(ch);
      continue;
    }
    aOut[i] = ToLowerCase(ch);
  }
}

// Copies aLen UTF-16 code units from aIn to aOut, adding 0x20 to those for
// which IS_ASCII_UPPER holds and leaving the rest unchanged. The in-place
// string overload in this file calls this with aIn == aOut.
void ToLowerCaseASCII(const char16_t* aIn, char16_t* aOut, size_t aLen) {
  for (size_t i = 0; i < aLen; i++) {
    char16_t ch = aIn[i];
    aOut[i] = IS_ASCII_UPPER(ch) ? (ch + 0x20) : ch;
  }
}

// Returns the upper-case form of the code point aChar. Within the IS_ASCII
// range, values for which IS_ASCII_LOWER holds have 0x20 subtracted and the
// rest are returned unchanged; all other values are delegated to
// mozilla::intl::UnicodeProperties::ToUpper.
uint32_t ToUpperCase(uint32_t aChar) {
  if (IS_ASCII(aChar)) {
    if (IS_ASCII_LOWER(aChar)) {
      return aChar - 0x20;
    }
    return aChar;
  }

  return mozilla::intl::UnicodeProperties::ToUpper(aChar);
}

// Upper-cases aLen UTF-16 code units from aIn into aOut.
//
// A surrogate pair is combined into one code point, mapped with
// mozilla::intl::UnicodeProperties::ToUpper and written back as two code
// units; any other code unit goes through ToUpperCase(uint32_t). The
// in-place string overload in this file calls this with aIn == aOut.
void ToUpperCase(const char16_t* aIn, char16_t* aOut, size_t aLen) {
  for (size_t i = 0; i < aLen; i++) {
    uint32_t ch = aIn[i];
    // Only look at aIn[i + 1] when it is still inside the buffer.
    if (i + 1 < aLen && NS_IS_SURROGATE_PAIR(ch, aIn[i + 1])) {
      ch = mozilla::intl::UnicodeProperties::ToUpper(
          SURROGATE_TO_UCS4(ch, aIn[i + 1]));
      NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
      // Emit both halves; the post-increment plus the loop increment step
      // past the pair.
      aOut[i++] = H_SURROGATE(ch);
      aOut[i] = L_SURROGATE(ch);
      continue;
    }
    aOut[i] = ToUpperCase(ch);
  }
}

// Returns the title-case form of aChar: ToUpperCase(aChar) when IS_ASCII
// holds, otherwise mozilla::unicode::GetTitlecaseForLower(aChar).
uint32_t ToTitleCase(uint32_t aChar) {
  if (IS_ASCII(aChar)) {
    return ToUpperCase(aChar);
  }

  return mozilla::unicode::GetTitlecaseForLower(aChar);
}

// Compares len UTF-16 code units of a and b, ignoring case.
//
// Returns -1 or 1 at the first position where the lower-cased code points
// differ (-1 when a's is smaller), and 0 when none differ. Surrogate pairs
// are combined into a single code point before being compared.
int32_t CaseInsensitiveCompare(const char16_t* a, const char16_t* b,
                               size_t len) {
  NS_ASSERTION(a && b, "Do not pass in invalid pointers!");

  if (len) {
    do {
      uint32_t c1 = *a++;
      uint32_t c2 = *b++;

      // Unfortunately, we need to check for surrogates BEFORE we check
      // for equality, because we could have identical high surrogates
      // but non-identical characters, so we can't just skip them

      // If c1 isn't a surrogate, we don't bother to check c2;
      // in the case where it _is_ a surrogate, we're definitely going to get
      // a mismatch, and don't need to interpret and lowercase it

      if (len > 1 && NS_IS_SURROGATE_PAIR(c1, *a)) {
        c1 = SURROGATE_TO_UCS4(c1, *a++);
        if (NS_IS_SURROGATE_PAIR(c2, *b)) {
          c2 = SURROGATE_TO_UCS4(c2, *b++);
        }
        // If c2 wasn't a surrogate, decrementing len means we'd stop
        // short of the end of string b, but that doesn't actually matter
        // because we're going to find a mismatch and return early
        --len;
      }

      // Only pay for the lower-casing when the raw code points differ.
      if (c1 != c2) {
        c1 = ToLowerCase_inline(c1);
        c2 = ToLowerCase_inline(c2);
        if (c1 != c2) {
          if (c1 < c2) {
            return -1;
          }
          return 1;
        }
      }
    } while (--len != 0);
  }
  return 0;
}

// Inlined definition of GetLowerUTF8Codepoint, which we use because we want
// to be fast when called from the case-insensitive comparators.
//
// Decodes the UTF-8 sequence starting at aStr, lower-cases the resulting
// code point and returns it, storing the address just past the sequence in
// *aNext.
//
// Returns uint32_t(-1), leaving *aNext untouched, when the lead byte matches
// none of the 1- to 4-byte patterns or the sequence would run past aEnd.
// Continuation bytes are masked with 0x3F; they are not validated here.
// The lead byte is read without an aStr < aEnd check.
static MOZ_ALWAYS_INLINE uint32_t GetLowerUTF8Codepoint_inline(
    const char* aStr, const char* aEnd, const char** aNext) {
  // Convert to unsigned char so that stuffing chars into PRUint32s doesn't
  // sign extend.
  const unsigned char* str = reinterpret_cast<const unsigned char*>(aStr);

  if (UTF8traits::isASCII(str[0])) {
    // It's ASCII; just convert to lower-case and return it.
    *aNext = aStr + 1;
    return gASCIIToLower[*str];
  }

  if (UTF8traits::is2byte(str[0]) && MOZ_LIKELY(aStr + 1 < aEnd)) {
    // It's a two-byte sequence, so it looks like
    //  110XXXXX 10XXXXXX.
    // This is definitely in the BMP, so we can store straightaway into a
    // uint16_t.

    uint16_t c;
    c = (str[0] & 0x1F) << 6;
    c += (str[1] & 0x3F);

    // we don't go through ToLowerCase here, because we know this isn't
    // an ASCII character so the ASCII fast-path there is useless
    c = mozilla::intl::UnicodeProperties::ToLower(c);

    *aNext = aStr + 2;
    return c;
  }

  if (UTF8traits::is3byte(str[0]) && MOZ_LIKELY(aStr + 2 < aEnd)) {
    // It's a three-byte sequence, so it looks like
    //  1110XXXX 10XXXXXX 10XXXXXX.
    // This will just barely fit into 16-bits, so store into a uint16_t.

    uint16_t c;
    c = (str[0] & 0x0F) << 12;
    c += (str[1] & 0x3F) << 6;
    c += (str[2] & 0x3F);

    c = mozilla::intl::UnicodeProperties::ToLower(c);

    *aNext = aStr + 3;
    return c;
  }

  if (UTF8traits::is4byte(str[0]) && MOZ_LIKELY(aStr + 3 < aEnd)) {
    // It's a four-byte sequence, so it looks like
    //   11110XXX 10XXXXXX 10XXXXXX 10XXXXXX.

    uint32_t c;
    c = (str[0] & 0x07) << 18;
    c += (str[1] & 0x3F) << 12;
    c += (str[2] & 0x3F) << 6;
    c += (str[3] & 0x3F);

    c = mozilla::intl::UnicodeProperties::ToLower(c);

    *aNext = aStr + 4;
    return c;
  }

  // Hm, we don't understand this sequence.
  return -1;
}

// Out-of-line entry point that forwards to the three-argument
// GetLowerUTF8Codepoint_inline.
uint32_t GetLowerUTF8Codepoint(const char* aStr, const char* aEnd,
                               const char** aNext) {
  return GetLowerUTF8Codepoint_inline(aStr, aEnd, aNext);
}

// Compares two byte buffers as UTF-8, ignoring case.
//
// Code points are decoded and lower-cased pairwise with
// GetLowerUTF8Codepoint_inline. Returns 1 or -1 at the first pair that
// differs, or, when one buffer is exhausted first, 1 if aLeft has bytes
// remaining and -1 if aRight does; 0 when both end together.
// A decode failure on either side also returns -1.
int32_t CaseInsensitiveCompare(const char* aLeft, const char* aRight,
                               size_t aLeftBytes, size_t aRightBytes) {
  const char* leftEnd = aLeft + aLeftBytes;
  const char* rightEnd = aRight + aRightBytes;

  while (aLeft < leftEnd && aRight < rightEnd) {
    // Each decode advances its own cursor through the out-parameter.
    uint32_t leftChar = GetLowerUTF8Codepoint_inline(aLeft, leftEnd, &aLeft);
    if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) {
      return -1;
    }

    uint32_t rightChar =
        GetLowerUTF8Codepoint_inline(aRight, rightEnd, &aRight);
    if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) {
      return -1;
    }

    // Now leftChar and rightChar are lower-case, so we can compare them.
    if (leftChar != rightChar) {
      if (leftChar > rightChar) {
        return 1;
      }
      return -1;
    }
  }

  // Make sure that if one string is longer than the other we return the
  // correct result.
  if (aLeft < leftEnd) {
    return 1;
  }
  if (aRight < rightEnd) {
    return -1;
  }

  return 0;
}

// Variant of GetLowerUTF8Codepoint_inline that can skip combining
// diacritics.
//
// When aMatchDiacritics is true, the first decoded code point is returned.
// When it is false, decoding repeats for as long as
// mozilla::unicode::IsCombiningDiacritic holds for the result, and the first
// code point for which it does not hold is returned. *aNext is whatever the
// last decode stored.
//
// NOTE(review): the skip loop does not re-check the new aStr against aEnd
// before decoding again, so a diacritic that ends the buffer looks like it
// leads to a decode starting at aEnd. Confirm callers rule this out.
static MOZ_ALWAYS_INLINE uint32_t
GetLowerUTF8Codepoint_inline(const char* aStr, const char* aEnd,
                             const char** aNext, bool aMatchDiacritics) {
  uint32_t c;
  for (;;) {
    c = GetLowerUTF8Codepoint_inline(aStr, aEnd, aNext);
    if (aMatchDiacritics) {
      break;
    }
    if (!mozilla::unicode::IsCombiningDiacritic(c)) {
      break;
    }
    // Skip the diacritic and decode the code point after it.
    aStr = *aNext;
  }
  return c;
}

// Decodes one lower-cased code point from each of aLeft and aRight and
// reports whether they are equal.
//
// *aLeftNext and *aRightNext receive the positions stored by the decoder.
// When aMatchDiacritics is false, both code points are additionally passed
// through ToNaked before being compared.
//
// *aErr is set to true, and false is returned, when either side fails to
// decode; a failure on the left returns before the right side is decoded.
// Otherwise *aErr is set to false.
bool CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
                                   const char* aLeftEnd, const char* aRightEnd,
                                   const char** aLeftNext,
                                   const char** aRightNext, bool* aErr,
                                   bool aMatchDiacritics) {
  NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null.");
  NS_ASSERTION(aRightNext, "Out pointer shouldn't be null.");
  NS_ASSERTION(aErr, "Out pointer shouldn't be null.");
  NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd.");
  NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd.");

  uint32_t leftChar = GetLowerUTF8Codepoint_inline(aLeft, aLeftEnd, aLeftNext,
                                                   aMatchDiacritics);
  if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) {
    *aErr = true;
    return false;
  }

  uint32_t rightChar = GetLowerUTF8Codepoint_inline(
      aRight, aRightEnd, aRightNext, aMatchDiacritics);
  if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) {
    *aErr = true;
    return false;
  }

  // Can't have an error past this point.
  *aErr = false;

  if (!aMatchDiacritics) {
    leftChar = ToNaked(leftChar);
    rightChar = ToNaked(rightChar);
  }

  return leftChar == rightChar;
}

namespace mozilla {

uint32_t HashUTF8AsUTF16(const char* aUTF8, size_t aLength, bool* aErr) {

  uint32_t hash = 0;

  const char* s = aUTF8;

  const char* end = aUTF8 + aLength;

  *aErr = false;

  while (s < end) {

    uint32_t ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr);

    if (*aErr) {

      return 0;

    if (ucs4 < PLANE1_BASE) {

      hash = AddToHash(hash, ucs4);

    } else {

      hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4));

  return hash;

bool IsSegmentBreakSkipChar(uint32_t u) {

  return intl::UnicodeProperties::IsEastAsianWidthFHWexcludingEmoji(u) &&

         intl::UnicodeProperties::GetScriptCode(u) != intl::Script::HANGUL;

bool IsPunctuationForWordSelect(char16_t aCh) {

  const uint8_t cat = unicode::GetGeneralCategory(aCh);

  switch (cat) {

    case HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION: /* Pc */

      if (aCh == '_' && !StaticPrefs::layout_word_select_stop_at_underscore()) {

        return false;

      [[fallthrough]];

    case HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION:    /* Pd */

    case HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION:   /* Pe */

    case HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION:   /* Pf */

    case HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION: /* Pi */

    case HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION:   /* Po */

    case HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION:    /* Ps */

    case HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL:     /* Sc */

    // Deliberately omitted:

    // case HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL:     /* Sk */

    case HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL:  /* Sm */

    case HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL: /* So */

      return true;

    default:

      return false;

}  // namespace mozilla