special-case.h - mozsearch

Enable keyboard shortcuts

// Copyright 2019 the V8 project authors. All rights reserved.

// Use of this source code is governed by a BSD-style license that can be

// found in the LICENSE file.

#ifndef V8_REGEXP_SPECIAL_CASE_H_

#define V8_REGEXP_SPECIAL_CASE_H_

#ifdef V8_INTL_SUPPORT

#include "irregexp/RegExpShim.h"

#include "unicode/uchar.h"

#include "unicode/uniset.h"

#include "unicode/unistr.h"

namespace v8 {

namespace internal {

// Sets of Unicode characters that need special handling under "i" mode

// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262

// defines slightly different case-folding rules than Unicode. An

// input character should match a pattern character if the result of

// the Canonicalize algorithm is the same for both characters.

//

// Roughly speaking, for "i" regexps, Canonicalize(c) is the same as

// c.toUpperCase(), unless a) c.toUpperCase() is a multi-character

// string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See

// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for

// the precise definition.

//

// While compiling such regular expressions, we need to compute the

// set of characters that should match a given input character. (See

// GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.)

// For almost all characters, this can be efficiently computed using

// UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent

// the remaining special cases.

//

// For a character c, the rules are as follows:

//

// 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling

//    UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet

//    containing c will produce the set of characters that should

//    match /c/i (or /[c]/i), and only those characters.

//

// 2. If c is in IgnoreSet, then the only character it should match is

//    itself. However, closeOver will add additional incorrect

//    matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ'

//    (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is

//    "SS".  Step 3.e therefore requires that 'ß' canonicalizes to

//    itself, and should not match 'ẞ'. In these cases, we can skip

//    the closeOver entirely, because it will never add an equivalent

//    character.

//

// 3. If c is in SpecialAddSet, then it should match at least one

//    character other than itself. However, closeOver will add at

//    least one additional incorrect match. For example, consider the

//    letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase

//    K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN

//    SIGN should not match either of the other two characters. As a

//    result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in

//    IgnoreSet). To find the correct matches for characters in

//    SpecialAddSet, we closeOver the original character, but filter

//    out the results that do not have the same canonical value.

//

// The contents of these sets are calculated at build time by

// src/regexp/gen-regexp-special-case.cc, which generates

// gen/src/regexp/special-case.cc. This is done by iterating over the

// result of closeOver for each BMP character, and finding sets for

// which at least one character has a different canonical value than

// another character. Characters that match no other characters in

// their equivalence class are added to IgnoreSet. Characters that

// match at least one other character are added to SpecialAddSet.

class RegExpCaseFolding final : public AllStatic {

 public:

  static const icu::UnicodeSet& IgnoreSet();

  static const icu::UnicodeSet& SpecialAddSet();

  // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:

  // Canonicalize) step 3, which is used to determine whether

  // characters match when ignoreCase is true and unicode is false.

  static UChar32 Canonicalize(UChar32 ch) {

    // a. Assert: ch is a UTF-16 code unit.

    CHECK_LE(ch, 0xffff);

    // b. Let s be the String value consisting of the single code unit ch.

    icu::UnicodeString s(ch);

    // c. Let u be the same result produced as if by performing the algorithm

    // for String.prototype.toUpperCase using s as the this value.

    // d. Assert: Type(u) is String.

    icu::UnicodeString& u = s.toUpper();

    // e. If u does not consist of a single code unit, return ch.

    if (u.length() != 1) {

      return ch;

    // f. Let cu be u's single code unit element.

    UChar32 cu = u.char32At(0);

    // g. If the value of ch >= 128 and the value of cu < 128, return ch.

    if (ch >= 128 && cu < 128) {

      return ch;

    // h. Return cu.

    return cu;

};

}  // namespace internal

}  // namespace v8

#endif  // V8_INTL_SUPPORT

#endif  // V8_REGEXP_SPECIAL_CASE_H_