// Copyright 2018 The Abseil Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "y_absl/strings/internal/charconv_parse.h" #include "y_absl/strings/charconv.h" #include #include #include #include "y_absl/strings/internal/memutil.h" namespace y_absl { Y_ABSL_NAMESPACE_BEGIN namespace { // ParseFloat<10> will read the first 19 significant digits of the mantissa. // This number was chosen for multiple reasons. // // (a) First, for whatever integer type we choose to represent the mantissa, we // want to choose the largest possible number of decimal digits for that integer // type. We are using uint64_t, which can express any 19-digit unsigned // integer. // // (b) Second, we need to parse enough digits that the binary value of any // mantissa we capture has more bits of resolution than the mantissa // representation in the target float. Our algorithm requires at least 3 bits // of headway, but 19 decimal digits give a little more than that. // // The following static assertions verify the above comments: constexpr int kDecimalMantissaDigitsMax = 19; static_assert(std::numeric_limits::digits10 == kDecimalMantissaDigitsMax, "(a) above"); // IEEE doubles, which we assume in Abseil, have 53 binary bits of mantissa. static_assert(std::numeric_limits::is_iec559, "IEEE double assumed"); static_assert(std::numeric_limits::radix == 2, "IEEE double fact"); static_assert(std::numeric_limits::digits == 53, "IEEE double fact"); // The lowest valued 19-digit decimal mantissa we can read still contains // sufficient information to reconstruct a binary mantissa. static_assert(1000000000000000000u > (uint64_t{1} << (53 + 3)), "(b) above"); // ParseFloat<16> will read the first 15 significant digits of the mantissa. // // Because a base-16-to-base-2 conversion can be done exactly, we do not need // to maximize the number of scanned hex digits to improve our conversion. What // is required is to scan two more bits than the mantissa can represent, so that // we always round correctly. // // (One extra bit does not suffice to perform correct rounding, since a number // exactly halfway between two representable floats has unique rounding rules, // so we need to differentiate between a "halfway between" number and a "closer // to the larger value" number.) constexpr int kHexadecimalMantissaDigitsMax = 15; // The minimum number of significant bits that will be read from // kHexadecimalMantissaDigitsMax hex digits. We must subtract by three, since // the most significant digit can be a "1", which only contributes a single // significant bit. constexpr int kGuaranteedHexadecimalMantissaBitPrecision = 4 * kHexadecimalMantissaDigitsMax - 3; static_assert(kGuaranteedHexadecimalMantissaBitPrecision > std::numeric_limits::digits + 2, "kHexadecimalMantissaDigitsMax too small"); // We also impose a limit on the number of significant digits we will read from // an exponent, to avoid having to deal with integer overflow. We use 9 for // this purpose. // // If we read a 9 digit exponent, the end result of the conversion will // necessarily be infinity or zero, depending on the sign of the exponent. // Therefore we can just drop extra digits on the floor without any extra // logic. constexpr int kDecimalExponentDigitsMax = 9; static_assert(std::numeric_limits::digits10 >= kDecimalExponentDigitsMax, "int type too small"); // To avoid incredibly large inputs causing integer overflow for our exponent, // we impose an arbitrary but very large limit on the number of significant // digits we will accept. The implementation refuses to match a string with // more consecutive significant mantissa digits than this. constexpr int kDecimalDigitLimit = 50000000; // Corresponding limit for hexadecimal digit inputs. This is one fourth the // amount of kDecimalDigitLimit, since each dropped hexadecimal digit requires // a binary exponent adjustment of 4. constexpr int kHexadecimalDigitLimit = kDecimalDigitLimit / 4; // The largest exponent we can read is 999999999 (per // kDecimalExponentDigitsMax), and the largest exponent adjustment we can get // from dropped mantissa digits is 2 * kDecimalDigitLimit, and the sum of these // comfortably fits in an integer. // // We count kDecimalDigitLimit twice because there are independent limits for // numbers before and after the decimal point. (In the case where there are no // significant digits before the decimal point, there are independent limits for // post-decimal-point leading zeroes and for significant digits.) static_assert(999999999 + 2 * kDecimalDigitLimit < std::numeric_limits::max(), "int type too small"); static_assert(999999999 + 2 * (4 * kHexadecimalDigitLimit) < std::numeric_limits::max(), "int type too small"); // Returns true if the provided bitfield allows parsing an exponent value // (e.g., "1.5e100"). bool AllowExponent(chars_format flags) { bool fixed = (flags & chars_format::fixed) == chars_format::fixed; bool scientific = (flags & chars_format::scientific) == chars_format::scientific; return scientific || !fixed; } // Returns true if the provided bitfield requires an exponent value be present. bool RequireExponent(chars_format flags) { bool fixed = (flags & chars_format::fixed) == chars_format::fixed; bool scientific = (flags & chars_format::scientific) == chars_format::scientific; return scientific && !fixed; } const int8_t kAsciiToInt[256] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; // Returns true if `ch` is a digit in the given base template bool IsDigit(char ch); // Converts a valid `ch` to its digit value in the given base. template unsigned ToDigit(char ch); // Returns true if `ch` is the exponent delimiter for the given base. template bool IsExponentCharacter(char ch); // Returns the maximum number of significant digits we will read for a float // in the given base. template constexpr int MantissaDigitsMax(); // Returns the largest consecutive run of digits we will accept when parsing a // number in the given base. template constexpr int DigitLimit(); // Returns the amount the exponent must be adjusted by for each dropped digit. // (For decimal this is 1, since the digits are in base 10 and the exponent base // is also 10, but for hexadecimal this is 4, since the digits are base 16 but // the exponent base is 2.) template constexpr int DigitMagnitude(); template <> bool IsDigit<10>(char ch) { return ch >= '0' && ch <= '9'; } template <> bool IsDigit<16>(char ch) { return kAsciiToInt[static_cast(ch)] >= 0; } template <> unsigned ToDigit<10>(char ch) { return static_cast(ch - '0'); } template <> unsigned ToDigit<16>(char ch) { return static_cast(kAsciiToInt[static_cast(ch)]); } template <> bool IsExponentCharacter<10>(char ch) { return ch == 'e' || ch == 'E'; } template <> bool IsExponentCharacter<16>(char ch) { return ch == 'p' || ch == 'P'; } template <> constexpr int MantissaDigitsMax<10>() { return kDecimalMantissaDigitsMax; } template <> constexpr int MantissaDigitsMax<16>() { return kHexadecimalMantissaDigitsMax; } template <> constexpr int DigitLimit<10>() { return kDecimalDigitLimit; } template <> constexpr int DigitLimit<16>() { return kHexadecimalDigitLimit; } template <> constexpr int DigitMagnitude<10>() { return 1; } template <> constexpr int DigitMagnitude<16>() { return 4; } // Reads decimal digits from [begin, end) into *out. Returns the number of // digits consumed. // // After max_digits has been read, keeps consuming characters, but no longer // adjusts *out. If a nonzero digit is dropped this way, *dropped_nonzero_digit // is set; otherwise, it is left unmodified. // // If no digits are matched, returns 0 and leaves *out unchanged. // // ConsumeDigits does not protect against overflow on *out; max_digits must // be chosen with respect to type T to avoid the possibility of overflow. template int ConsumeDigits(const char* begin, const char* end, int max_digits, T* out, bool* dropped_nonzero_digit) { if (base == 10) { assert(max_digits <= std::numeric_limits::digits10); } else if (base == 16) { assert(max_digits * 4 <= std::numeric_limits::digits); } const char* const original_begin = begin; // Skip leading zeros, but only if *out is zero. // They don't cause an overflow so we don't have to count them for // `max_digits`. while (!*out && end != begin && *begin == '0') ++begin; T accumulator = *out; const char* significant_digits_end = (end - begin > max_digits) ? begin + max_digits : end; while (begin < significant_digits_end && IsDigit(*begin)) { // Do not guard against *out overflow; max_digits was chosen to avoid this. // Do assert against it, to detect problems in debug builds. auto digit = static_cast(ToDigit(*begin)); assert(accumulator * base >= accumulator); accumulator *= base; assert(accumulator + digit >= accumulator); accumulator += digit; ++begin; } bool dropped_nonzero = false; while (begin < end && IsDigit(*begin)) { dropped_nonzero = dropped_nonzero || (*begin != '0'); ++begin; } if (dropped_nonzero && dropped_nonzero_digit != nullptr) { *dropped_nonzero_digit = true; } *out = accumulator; return static_cast(begin - original_begin); } // Returns true if `v` is one of the chars allowed inside parentheses following // a NaN. bool IsNanChar(char v) { return (v == '_') || (v >= '0' && v <= '9') || (v >= 'a' && v <= 'z') || (v >= 'A' && v <= 'Z'); } // Checks the range [begin, end) for a strtod()-formatted infinity or NaN. If // one is found, sets `out` appropriately and returns true. bool ParseInfinityOrNan(const char* begin, const char* end, strings_internal::ParsedFloat* out) { if (end - begin < 3) { return false; } switch (*begin) { case 'i': case 'I': { // An infinity string consists of the characters "inf" or "infinity", // case insensitive. if (strings_internal::memcasecmp(begin + 1, "nf", 2) != 0) { return false; } out->type = strings_internal::FloatType::kInfinity; if (end - begin >= 8 && strings_internal::memcasecmp(begin + 3, "inity", 5) == 0) { out->end = begin + 8; } else { out->end = begin + 3; } return true; } case 'n': case 'N': { // A NaN consists of the characters "nan", case insensitive, optionally // followed by a parenthesized sequence of zero or more alphanumeric // characters and/or underscores. if (strings_internal::memcasecmp(begin + 1, "an", 2) != 0) { return false; } out->type = strings_internal::FloatType::kNan; out->end = begin + 3; // NaN is allowed to be followed by a parenthesized string, consisting of // only the characters [a-zA-Z0-9_]. Match that if it's present. begin += 3; if (begin < end && *begin == '(') { const char* nan_begin = begin + 1; while (nan_begin < end && IsNanChar(*nan_begin)) { ++nan_begin; } if (nan_begin < end && *nan_begin == ')') { // We found an extra NaN specifier range out->subrange_begin = begin + 1; out->subrange_end = nan_begin; out->end = nan_begin + 1; } } return true; } default: return false; } } } // namespace namespace strings_internal { template strings_internal::ParsedFloat ParseFloat(const char* begin, const char* end, chars_format format_flags) { strings_internal::ParsedFloat result; // Exit early if we're given an empty range. if (begin == end) return result; // Handle the infinity and NaN cases. if (ParseInfinityOrNan(begin, end, &result)) { return result; } const char* const mantissa_begin = begin; while (begin < end && *begin == '0') { ++begin; // skip leading zeros } uint64_t mantissa = 0; int exponent_adjustment = 0; bool mantissa_is_inexact = false; int pre_decimal_digits = ConsumeDigits( begin, end, MantissaDigitsMax(), &mantissa, &mantissa_is_inexact); begin += pre_decimal_digits; int digits_left; if (pre_decimal_digits >= DigitLimit()) { // refuse to parse pathological inputs return result; } else if (pre_decimal_digits > MantissaDigitsMax()) { // We dropped some non-fraction digits on the floor. Adjust our exponent // to compensate. exponent_adjustment = static_cast(pre_decimal_digits - MantissaDigitsMax()); digits_left = 0; } else { digits_left = static_cast(MantissaDigitsMax() - pre_decimal_digits); } if (begin < end && *begin == '.') { ++begin; if (mantissa == 0) { // If we haven't seen any nonzero digits yet, keep skipping zeros. We // have to adjust the exponent to reflect the changed place value. const char* begin_zeros = begin; while (begin < end && *begin == '0') { ++begin; } int zeros_skipped = static_cast(begin - begin_zeros); if (zeros_skipped >= DigitLimit()) { // refuse to parse pathological inputs return result; } exponent_adjustment -= static_cast(zeros_skipped); } int post_decimal_digits = ConsumeDigits( begin, end, digits_left, &mantissa, &mantissa_is_inexact); begin += post_decimal_digits; // Since `mantissa` is an integer, each significant digit we read after // the decimal point requires an adjustment to the exponent. "1.23e0" will // be stored as `mantissa` == 123 and `exponent` == -2 (that is, // "123e-2"). if (post_decimal_digits >= DigitLimit()) { // refuse to parse pathological inputs return result; } else if (post_decimal_digits > digits_left) { exponent_adjustment -= digits_left; } else { exponent_adjustment -= post_decimal_digits; } } // If we've found no mantissa whatsoever, this isn't a number. if (mantissa_begin == begin) { return result; } // A bare "." doesn't count as a mantissa either. if (begin - mantissa_begin == 1 && *mantissa_begin == '.') { return result; } if (mantissa_is_inexact) { // We dropped significant digits on the floor. Handle this appropriately. if (base == 10) { // If we truncated significant decimal digits, store the full range of the // mantissa for future big integer math for exact rounding. result.subrange_begin = mantissa_begin; result.subrange_end = begin; } else if (base == 16) { // If we truncated hex digits, reflect this fact by setting the low // ("sticky") bit. This allows for correct rounding in all cases. mantissa |= 1; } } result.mantissa = mantissa; const char* const exponent_begin = begin; result.literal_exponent = 0; bool found_exponent = false; if (AllowExponent(format_flags) && begin < end && IsExponentCharacter(*begin)) { bool negative_exponent = false; ++begin; if (begin < end && *begin == '-') { negative_exponent = true; ++begin; } else if (begin < end && *begin == '+') { ++begin; } const char* const exponent_digits_begin = begin; // Exponent is always expressed in decimal, even for hexadecimal floats. begin += ConsumeDigits<10>(begin, end, kDecimalExponentDigitsMax, &result.literal_exponent, nullptr); if (begin == exponent_digits_begin) { // there were no digits where we expected an exponent. We failed to read // an exponent and should not consume the 'e' after all. Rewind 'begin'. found_exponent = false; begin = exponent_begin; } else { found_exponent = true; if (negative_exponent) { result.literal_exponent = -result.literal_exponent; } } } if (!found_exponent && RequireExponent(format_flags)) { // Provided flags required an exponent, but none was found. This results // in a failure to scan. return result; } // Success! result.type = strings_internal::FloatType::kNumber; if (result.mantissa > 0) { result.exponent = result.literal_exponent + (DigitMagnitude() * exponent_adjustment); } else { result.exponent = 0; } result.end = begin; return result; } template ParsedFloat ParseFloat<10>(const char* begin, const char* end, chars_format format_flags); template ParsedFloat ParseFloat<16>(const char* begin, const char* end, chars_format format_flags); } // namespace strings_internal Y_ABSL_NAMESPACE_END } // namespace y_absl