charconv_parse.cc - mozsearch

firefox-main/third_party/abseil-cpp/absl/strings/internal/charconv_parse.cc (file symbol)

Enable keyboard shortcuts

Source code

Revision control

Copy as Markdown

Other Tools

// Copyright 2018 The Abseil Authors.

//

// Licensed under the Apache License, Version 2.0 (the "License");

// you may not use this file except in compliance with the License.

// You may obtain a copy of the License at

//

//      https://www.apache.org/licenses/LICENSE-2.0

//

// Unless required by applicable law or agreed to in writing, software

// distributed under the License is distributed on an "AS IS" BASIS,

// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

// See the License for the specific language governing permissions and

// limitations under the License.

#include "absl/strings/internal/charconv_parse.h"

#include "absl/strings/charconv.h"

#include <cassert>

#include <cstdint>

#include <limits>

#include "absl/strings/internal/memutil.h"

namespace absl {

ABSL_NAMESPACE_BEGIN

namespace {

// ParseFloat<10> will read the first 19 significant digits of the mantissa.

// This number was chosen for multiple reasons.

//

// (a) First, for whatever integer type we choose to represent the mantissa, we

// want to choose the largest possible number of decimal digits for that integer

// type.  We are using uint64_t, which can express any 19-digit unsigned

// integer.

//

// (b) Second, we need to parse enough digits that the binary value of any

// mantissa we capture has more bits of resolution than the mantissa

// representation in the target float.  Our algorithm requires at least 3 bits

// of headway, but 19 decimal digits give a little more than that.

//

// The following static assertions verify the above comments:

constexpr int kDecimalMantissaDigitsMax = 19;

static_assert(std::numeric_limits<uint64_t>::digits10 ==

                  kDecimalMantissaDigitsMax,

              "(a) above");

// IEEE doubles, which we assume in Abseil, have 53 binary bits of mantissa.

static_assert(std::numeric_limits<double>::is_iec559, "IEEE double assumed");

static_assert(std::numeric_limits<double>::radix == 2, "IEEE double fact");

static_assert(std::numeric_limits<double>::digits == 53, "IEEE double fact");

// The lowest valued 19-digit decimal mantissa we can read still contains

// sufficient information to reconstruct a binary mantissa.

static_assert(1000000000000000000u > (uint64_t{1} << (53 + 3)), "(b) above");

// ParseFloat<16> will read the first 15 significant digits of the mantissa.

//

// Because a base-16-to-base-2 conversion can be done exactly, we do not need

// to maximize the number of scanned hex digits to improve our conversion.  What

// is required is to scan two more bits than the mantissa can represent, so that

// we always round correctly.

//

// (One extra bit does not suffice to perform correct rounding, since a number

// exactly halfway between two representable floats has unique rounding rules,

// so we need to differentiate between a "halfway between" number and a "closer

// to the larger value" number.)

constexpr int kHexadecimalMantissaDigitsMax = 15;

// The minimum number of significant bits that will be read from

// kHexadecimalMantissaDigitsMax hex digits.  We must subtract by three, since

// the most significant digit can be a "1", which only contributes a single

// significant bit.

constexpr int kGuaranteedHexadecimalMantissaBitPrecision =

    4 * kHexadecimalMantissaDigitsMax - 3;

static_assert(kGuaranteedHexadecimalMantissaBitPrecision >

                  std::numeric_limits<double>::digits + 2,

              "kHexadecimalMantissaDigitsMax too small");

// We also impose a limit on the number of significant digits we will read from

// an exponent, to avoid having to deal with integer overflow.  We use 9 for

// this purpose.

//

// If we read a 9 digit exponent, the end result of the conversion will

// necessarily be infinity or zero, depending on the sign of the exponent.

// Therefore we can just drop extra digits on the floor without any extra

// logic.

constexpr int kDecimalExponentDigitsMax = 9;

static_assert(std::numeric_limits<int>::digits10 >= kDecimalExponentDigitsMax,

              "int type too small");

// To avoid incredibly large inputs causing integer overflow for our exponent,

// we impose an arbitrary but very large limit on the number of significant

// digits we will accept.  The implementation refuses to match a string with

// more consecutive significant mantissa digits than this.

constexpr int kDecimalDigitLimit = 50000000;

// Corresponding limit for hexadecimal digit inputs.  This is one fourth the

// amount of kDecimalDigitLimit, since each dropped hexadecimal digit requires

// a binary exponent adjustment of 4.

constexpr int kHexadecimalDigitLimit = kDecimalDigitLimit / 4;

// The largest exponent we can read is 999999999 (per

// kDecimalExponentDigitsMax), and the largest exponent adjustment we can get

// from dropped mantissa digits is 2 * kDecimalDigitLimit, and the sum of these

// comfortably fits in an integer.

//

// We count kDecimalDigitLimit twice because there are independent limits for

// numbers before and after the decimal point.  (In the case where there are no

// significant digits before the decimal point, there are independent limits for

// post-decimal-point leading zeroes and for significant digits.)

static_assert(999999999 + 2 * kDecimalDigitLimit <

                  std::numeric_limits<int>::max(),

              "int type too small");

static_assert(999999999 + 2 * (4 * kHexadecimalDigitLimit) <

                  std::numeric_limits<int>::max(),

              "int type too small");

// Returns true if the provided bitfield allows parsing an exponent value

// (e.g., "1.5e100").

bool AllowExponent(chars_format flags) {

  bool fixed = (flags & chars_format::fixed) == chars_format::fixed;

  bool scientific =

      (flags & chars_format::scientific) == chars_format::scientific;

  return scientific || !fixed;

// Returns true if the provided bitfield requires an exponent value be present.

bool RequireExponent(chars_format flags) {

  bool fixed = (flags & chars_format::fixed) == chars_format::fixed;

  bool scientific =

      (flags & chars_format::scientific) == chars_format::scientific;

  return scientific && !fixed;

const int8_t kAsciiToInt[256] = {

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0,  1,  2,  3,  4,  5,  6,  7,  8,

    9,  -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1};

// Returns true if `ch` is a digit in the given base

template <int base>

bool IsDigit(char ch);

// Converts a valid `ch` to its digit value in the given base.

template <int base>

unsigned ToDigit(char ch);

// Returns true if `ch` is the exponent delimiter for the given base.

template <int base>

bool IsExponentCharacter(char ch);

// Returns the maximum number of significant digits we will read for a float

// in the given base.

template <int base>

constexpr int MantissaDigitsMax();

// Returns the largest consecutive run of digits we will accept when parsing a

// number in the given base.

template <int base>

constexpr int DigitLimit();

// Returns the amount the exponent must be adjusted by for each dropped digit.

// (For decimal this is 1, since the digits are in base 10 and the exponent base

// is also 10, but for hexadecimal this is 4, since the digits are base 16 but

// the exponent base is 2.)

template <int base>

constexpr int DigitMagnitude();

template <>

bool IsDigit<10>(char ch) {

  return ch >= '0' && ch <= '9';

template <>

bool IsDigit<16>(char ch) {

  return kAsciiToInt[static_cast<unsigned char>(ch)] >= 0;

template <>

unsigned ToDigit<10>(char ch) {

  return static_cast<unsigned>(ch - '0');

template <>

unsigned ToDigit<16>(char ch) {

  return static_cast<unsigned>(kAsciiToInt[static_cast<unsigned char>(ch)]);

template <>

bool IsExponentCharacter<10>(char ch) {

  return ch == 'e' || ch == 'E';

template <>

bool IsExponentCharacter<16>(char ch) {

  return ch == 'p' || ch == 'P';

template <>

constexpr int MantissaDigitsMax<10>() {

  return kDecimalMantissaDigitsMax;

template <>

constexpr int MantissaDigitsMax<16>() {

  return kHexadecimalMantissaDigitsMax;

template <>

constexpr int DigitLimit<10>() {

  return kDecimalDigitLimit;

template <>

constexpr int DigitLimit<16>() {

  return kHexadecimalDigitLimit;

template <>

constexpr int DigitMagnitude<10>() {

  return 1;

template <>

constexpr int DigitMagnitude<16>() {

  return 4;

// Reads decimal digits from [begin, end) into *out.  Returns the number of

// digits consumed.

//

// After max_digits has been read, keeps consuming characters, but no longer

// adjusts *out.  If a nonzero digit is dropped this way, *dropped_nonzero_digit

// is set; otherwise, it is left unmodified.

//

// If no digits are matched, returns 0 and leaves *out unchanged.

//

// ConsumeDigits does not protect against overflow on *out; max_digits must

// be chosen with respect to type T to avoid the possibility of overflow.

template <int base, typename T>

int ConsumeDigits(const char* begin, const char* end, int max_digits, T* out,

                  bool* dropped_nonzero_digit) {

  if (base == 10) {

    assert(max_digits <= std::numeric_limits<T>::digits10);

  } else if (base == 16) {

    assert(max_digits * 4 <= std::numeric_limits<T>::digits);

  const char* const original_begin = begin;

  // Skip leading zeros, but only if *out is zero.

  // They don't cause an overflow so we don't have to count them for

  // `max_digits`.

  while (!*out && end != begin && *begin == '0') ++begin;

  T accumulator = *out;

  const char* significant_digits_end =

      (end - begin > max_digits) ? begin + max_digits : end;

  while (begin < significant_digits_end && IsDigit<base>(*begin)) {

    // Do not guard against *out overflow; max_digits was chosen to avoid this.

    // Do assert against it, to detect problems in debug builds.

    auto digit = static_cast<T>(ToDigit<base>(*begin));

    assert(accumulator * base >= accumulator);

    accumulator *= base;

    assert(accumulator + digit >= accumulator);

    accumulator += digit;

    ++begin;

  bool dropped_nonzero = false;

  while (begin < end && IsDigit<base>(*begin)) {

    dropped_nonzero = dropped_nonzero || (*begin != '0');

    ++begin;

  if (dropped_nonzero && dropped_nonzero_digit != nullptr) {

    *dropped_nonzero_digit = true;

  *out = accumulator;

  return static_cast<int>(begin - original_begin);

// Returns true if `v` is one of the chars allowed inside parentheses following

// a NaN.

bool IsNanChar(char v) {

  return (v == '_') || (v >= '0' && v <= '9') || (v >= 'a' && v <= 'z') ||

         (v >= 'A' && v <= 'Z');

// Checks the range [begin, end) for a strtod()-formatted infinity or NaN.  If

// one is found, sets `out` appropriately and returns true.

bool ParseInfinityOrNan(const char* begin, const char* end,

                        strings_internal::ParsedFloat* out) {

  if (end - begin < 3) {

    return false;

  switch (*begin) {

    case 'i':

    case 'I': {

      // An infinity string consists of the characters "inf" or "infinity",

      // case insensitive.

      if (strings_internal::memcasecmp(begin + 1, "nf", 2) != 0) {

        return false;

      out->type = strings_internal::FloatType::kInfinity;

      if (end - begin >= 8 &&

          strings_internal::memcasecmp(begin + 3, "inity", 5) == 0) {

        out->end = begin + 8;

      } else {

        out->end = begin + 3;

      return true;

    case 'n':

    case 'N': {

      // A NaN consists of the characters "nan", case insensitive, optionally

      // followed by a parenthesized sequence of zero or more alphanumeric

      // characters and/or underscores.

      if (strings_internal::memcasecmp(begin + 1, "an", 2) != 0) {

        return false;

      out->type = strings_internal::FloatType::kNan;

      out->end = begin + 3;

      // NaN is allowed to be followed by a parenthesized string, consisting of

      // only the characters [a-zA-Z0-9_].  Match that if it's present.

      begin += 3;

      if (begin < end && *begin == '(') {

        const char* nan_begin = begin + 1;

        while (nan_begin < end && IsNanChar(*nan_begin)) {

          ++nan_begin;

        if (nan_begin < end && *nan_begin == ')') {

          // We found an extra NaN specifier range

          out->subrange_begin = begin + 1;

          out->subrange_end = nan_begin;

          out->end = nan_begin + 1;

      return true;

    default:

      return false;

}  // namespace

namespace strings_internal {

template <int base>

strings_internal::ParsedFloat ParseFloat(const char* begin, const char* end,

                                         chars_format format_flags) {

  strings_internal::ParsedFloat result;

  // Exit early if we're given an empty range.

  if (begin == end) return result;

  // Handle the infinity and NaN cases.

  if (ParseInfinityOrNan(begin, end, &result)) {

    return result;

  const char* const mantissa_begin = begin;

  while (begin < end && *begin == '0') {

    ++begin;  // skip leading zeros

  uint64_t mantissa = 0;

  int exponent_adjustment = 0;

  bool mantissa_is_inexact = false;

  int pre_decimal_digits = ConsumeDigits<base>(

      begin, end, MantissaDigitsMax<base>(), &mantissa, &mantissa_is_inexact);

  begin += pre_decimal_digits;

  int digits_left;

  if (pre_decimal_digits >= DigitLimit<base>()) {

    // refuse to parse pathological inputs

    return result;

  } else if (pre_decimal_digits > MantissaDigitsMax<base>()) {

    // We dropped some non-fraction digits on the floor.  Adjust our exponent

    // to compensate.

    exponent_adjustment =

        static_cast<int>(pre_decimal_digits - MantissaDigitsMax<base>());

    digits_left = 0;

  } else {

    digits_left =

        static_cast<int>(MantissaDigitsMax<base>() - pre_decimal_digits);

  if (begin < end && *begin == '.') {

    ++begin;

    if (mantissa == 0) {

      // If we haven't seen any nonzero digits yet, keep skipping zeros.  We

      // have to adjust the exponent to reflect the changed place value.

      const char* begin_zeros = begin;

      while (begin < end && *begin == '0') {

        ++begin;

      int zeros_skipped = static_cast<int>(begin - begin_zeros);

      if (zeros_skipped >= DigitLimit<base>()) {

        // refuse to parse pathological inputs

        return result;

      exponent_adjustment -= static_cast<int>(zeros_skipped);

    int post_decimal_digits = ConsumeDigits<base>(

        begin, end, digits_left, &mantissa, &mantissa_is_inexact);

    begin += post_decimal_digits;

    // Since `mantissa` is an integer, each significant digit we read after

    // the decimal point requires an adjustment to the exponent. "1.23e0" will

    // be stored as `mantissa` == 123 and `exponent` == -2 (that is,

    // "123e-2").

    if (post_decimal_digits >= DigitLimit<base>()) {

      // refuse to parse pathological inputs

      return result;

    } else if (post_decimal_digits > digits_left) {

      exponent_adjustment -= digits_left;

    } else {

      exponent_adjustment -= post_decimal_digits;

  // If we've found no mantissa whatsoever, this isn't a number.

  if (mantissa_begin == begin) {

    return result;

  // A bare "." doesn't count as a mantissa either.

  if (begin - mantissa_begin == 1 && *mantissa_begin == '.') {

    return result;

  if (mantissa_is_inexact) {

    // We dropped significant digits on the floor.  Handle this appropriately.

    if (base == 10) {

      // If we truncated significant decimal digits, store the full range of the

      // mantissa for future big integer math for exact rounding.

      result.subrange_begin = mantissa_begin;

      result.subrange_end = begin;

    } else if (base == 16) {

      // If we truncated hex digits, reflect this fact by setting the low

      // ("sticky") bit.  This allows for correct rounding in all cases.

      mantissa |= 1;

  result.mantissa = mantissa;

  const char* const exponent_begin = begin;

  result.literal_exponent = 0;

  bool found_exponent = false;

  if (AllowExponent(format_flags) && begin < end &&

      IsExponentCharacter<base>(*begin)) {

    bool negative_exponent = false;

    ++begin;

    if (begin < end && *begin == '-') {

      negative_exponent = true;

      ++begin;

    } else if (begin < end && *begin == '+') {

      ++begin;

    const char* const exponent_digits_begin = begin;

    // Exponent is always expressed in decimal, even for hexadecimal floats.

    begin += ConsumeDigits<10>(begin, end, kDecimalExponentDigitsMax,

                               &result.literal_exponent, nullptr);

    if (begin == exponent_digits_begin) {

      // there were no digits where we expected an exponent.  We failed to read

      // an exponent and should not consume the 'e' after all.  Rewind 'begin'.

      found_exponent = false;

      begin = exponent_begin;

    } else {

      found_exponent = true;

      if (negative_exponent) {

        result.literal_exponent = -result.literal_exponent;

  if (!found_exponent && RequireExponent(format_flags)) {

    // Provided flags required an exponent, but none was found.  This results

    // in a failure to scan.

    return result;

  // Success!

  result.type = strings_internal::FloatType::kNumber;

  if (result.mantissa > 0) {

    result.exponent = result.literal_exponent +

                      (DigitMagnitude<base>() * exponent_adjustment);

  } else {

    result.exponent = 0;

  result.end = begin;

  return result;

template ParsedFloat ParseFloat<10>(const char* begin, const char* end,

                                    chars_format format_flags);

template ParsedFloat ParseFloat<16>(const char* begin, const char* end,

                                    chars_format format_flags);

}  // namespace strings_internal

ABSL_NAMESPACE_END

}  // namespace absl