decode_rust_punycode.cc 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258
  1. // Copyright 2024 The Abseil Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // https://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. #include "absl/debugging/internal/decode_rust_punycode.h"
  15. #include <cstddef>
  16. #include <cstdint>
  17. #include <cstring>
  18. #include "absl/base/config.h"
  19. #include "absl/base/nullability.h"
  20. #include "absl/debugging/internal/bounded_utf8_length_sequence.h"
  21. #include "absl/debugging/internal/utf8_for_code_point.h"
  22. namespace absl {
  23. ABSL_NAMESPACE_BEGIN
  24. namespace debugging_internal {
  25. namespace {
  26. // Decoding Punycode requires repeated random-access insertion into a stream of
  27. // variable-length UTF-8 code-point encodings. We need this to be tolerably
  28. // fast (no N^2 slowdown for unfortunate inputs), and we can't allocate any data
  29. // structures on the heap (async-signal-safety).
  30. //
  31. // It is pragmatic to impose a moderately low limit on the identifier length and
  32. // bail out if we ever hit it. Then BoundedUtf8LengthSequence efficiently
  33. // determines where to insert the next code point, and memmove efficiently makes
  34. // room for it.
  35. //
  36. // The chosen limit is a round number several times larger than identifiers
  37. // expected in practice, yet still small enough that a memmove of this many
  38. // UTF-8 characters is not much more expensive than the division and modulus
  39. // operations that Punycode decoding requires.
  40. constexpr uint32_t kMaxChars = 256;
  41. // Constants from RFC 3492 section 5.
  42. constexpr uint32_t kBase = 36, kTMin = 1, kTMax = 26, kSkew = 38, kDamp = 700;
  43. constexpr uint32_t kMaxCodePoint = 0x10ffff;
  44. // Overflow threshold in DecodeRustPunycode's inner loop; see comments there.
  45. constexpr uint32_t kMaxI = 1 << 30;
  46. // If punycode_begin .. punycode_end begins with a prefix matching the regular
  47. // expression [0-9a-zA-Z_]+_, removes that prefix, copies all but the final
  48. // underscore into out_begin .. out_end, sets num_ascii_chars to the number of
  49. // bytes copied, and returns true. (A prefix of this sort represents the
  50. // nonempty subsequence of ASCII characters in the corresponding plaintext.)
  51. //
  52. // If punycode_begin .. punycode_end does not contain an underscore, sets
  53. // num_ascii_chars to zero and returns true. (The encoding of a plaintext
  54. // without any ASCII characters does not carry such a prefix.)
  55. //
  56. // Returns false and zeroes num_ascii_chars on failure (either parse error or
  57. // not enough space in the output buffer).
  58. bool ConsumeOptionalAsciiPrefix(const char*& punycode_begin,
  59. const char* const punycode_end,
  60. char* const out_begin,
  61. char* const out_end,
  62. uint32_t& num_ascii_chars) {
  63. num_ascii_chars = 0;
  64. // Remember the last underscore if any. Also use the same string scan to
  65. // reject any ASCII bytes that do not belong in an identifier, including NUL,
  66. // as well as non-ASCII bytes, which should have been delta-encoded instead.
  67. int last_underscore = -1;
  68. for (int i = 0; i < punycode_end - punycode_begin; ++i) {
  69. const char c = punycode_begin[i];
  70. if (c == '_') {
  71. last_underscore = i;
  72. continue;
  73. }
  74. // We write out the meaning of absl::ascii_isalnum rather than call that
  75. // function because its documentation does not promise it will remain
  76. // async-signal-safe under future development.
  77. if ('a' <= c && c <= 'z') continue;
  78. if ('A' <= c && c <= 'Z') continue;
  79. if ('0' <= c && c <= '9') continue;
  80. return false;
  81. }
  82. // If there was no underscore, that means there were no ASCII characters in
  83. // the plaintext, so there is no prefix to consume. Our work is done.
  84. if (last_underscore < 0) return true;
  85. // Otherwise there will be an underscore delimiter somewhere. It can't be
  86. // initial because then there would be no ASCII characters to its left, and no
  87. // delimiter would have been added in that case.
  88. if (last_underscore == 0) return false;
  89. // Any other position is reasonable. Make sure there's room in the buffer.
  90. if (last_underscore + 1 > out_end - out_begin) return false;
  91. // Consume and write out the ASCII characters.
  92. num_ascii_chars = static_cast<uint32_t>(last_underscore);
  93. std::memcpy(out_begin, punycode_begin, num_ascii_chars);
  94. out_begin[num_ascii_chars] = '\0';
  95. punycode_begin += num_ascii_chars + 1;
  96. return true;
  97. }
  98. // Returns the value of `c` as a base-36 digit according to RFC 3492 section 5,
  99. // or -1 if `c` is not such a digit.
  100. int DigitValue(char c) {
  101. if ('0' <= c && c <= '9') return c - '0' + 26;
  102. if ('a' <= c && c <= 'z') return c - 'a';
  103. if ('A' <= c && c <= 'Z') return c - 'A';
  104. return -1;
  105. }
  106. // Consumes the next delta encoding from punycode_begin .. punycode_end,
  107. // updating i accordingly. Returns true on success. Returns false on parse
  108. // failure or arithmetic overflow.
  109. bool ScanNextDelta(const char*& punycode_begin, const char* const punycode_end,
  110. uint32_t bias, uint32_t& i) {
  111. uint64_t w = 1; // 64 bits to prevent overflow in w *= kBase - t
  112. // "for k = base to infinity in steps of base do begin ... end" in RFC 3492
  113. // section 6.2. Each loop iteration scans one digit of the delta.
  114. for (uint32_t k = kBase; punycode_begin != punycode_end; k += kBase) {
  115. const int digit_value = DigitValue(*punycode_begin++);
  116. if (digit_value < 0) return false;
  117. // Compute this in 64-bit arithmetic so we can check for overflow afterward.
  118. const uint64_t new_i = i + static_cast<uint64_t>(digit_value) * w;
  119. // Valid deltas are bounded by (#chars already emitted) * kMaxCodePoint, but
  120. // invalid input could encode an arbitrarily large delta. Nip that in the
  121. // bud here.
  122. static_assert(
  123. kMaxI >= kMaxChars * kMaxCodePoint,
  124. "kMaxI is too small to prevent spurious failures on good input");
  125. if (new_i > kMaxI) return false;
  126. static_assert(
  127. kMaxI < (uint64_t{1} << 32),
  128. "Make kMaxI smaller or i 64 bits wide to prevent silent wraparound");
  129. i = static_cast<uint32_t>(new_i);
  130. // Compute the threshold that determines whether this is the last digit and
  131. // (if not) what the next digit's place value will be. This logic from RFC
  132. // 3492 section 6.2 is explained in section 3.3.
  133. uint32_t t;
  134. if (k <= bias + kTMin) {
  135. t = kTMin;
  136. } else if (k >= bias + kTMax) {
  137. t = kTMax;
  138. } else {
  139. t = k - bias;
  140. }
  141. if (static_cast<uint32_t>(digit_value) < t) return true;
  142. // If this gets too large, the range check on new_i in the next iteration
  143. // will catch it. We know this multiplication will not overwrap because w
  144. // is 64 bits wide.
  145. w *= kBase - t;
  146. }
  147. return false;
  148. }
  149. } // namespace
  150. absl::Nullable<char*> DecodeRustPunycode(DecodeRustPunycodeOptions options) {
  151. const char* punycode_begin = options.punycode_begin;
  152. const char* const punycode_end = options.punycode_end;
  153. char* const out_begin = options.out_begin;
  154. char* const out_end = options.out_end;
  155. // Write a NUL terminator first. Later memcpy calls will keep bumping it
  156. // along to its new right place.
  157. const size_t out_size = static_cast<size_t>(out_end - out_begin);
  158. if (out_size == 0) return nullptr;
  159. *out_begin = '\0';
  160. // RFC 3492 section 6.2 begins here. We retain the names of integer variables
  161. // appearing in that text.
  162. uint32_t n = 128, i = 0, bias = 72, num_chars = 0;
  163. // If there are any ASCII characters, consume them and their trailing
  164. // underscore delimiter.
  165. if (!ConsumeOptionalAsciiPrefix(punycode_begin, punycode_end,
  166. out_begin, out_end, num_chars)) {
  167. return nullptr;
  168. }
  169. uint32_t total_utf8_bytes = num_chars;
  170. BoundedUtf8LengthSequence<kMaxChars> utf8_lengths;
  171. // "while the input is not exhausted do begin ... end"
  172. while (punycode_begin != punycode_end) {
  173. if (num_chars >= kMaxChars) return nullptr;
  174. const uint32_t old_i = i;
  175. if (!ScanNextDelta(punycode_begin, punycode_end, bias, i)) return nullptr;
  176. // Update bias as in RFC 3492 section 6.1. (We have inlined adapt.)
  177. uint32_t delta = i - old_i;
  178. delta /= (old_i == 0 ? kDamp : 2);
  179. delta += delta/(num_chars + 1);
  180. bias = 0;
  181. while (delta > ((kBase - kTMin) * kTMax)/2) {
  182. delta /= kBase - kTMin;
  183. bias += kBase;
  184. }
  185. bias += ((kBase - kTMin + 1) * delta)/(delta + kSkew);
  186. // Back in section 6.2, compute the new code point and insertion index.
  187. static_assert(
  188. kMaxI + kMaxCodePoint < (uint64_t{1} << 32),
  189. "Make kMaxI smaller or n 64 bits wide to prevent silent wraparound");
  190. n += i/(num_chars + 1);
  191. i %= num_chars + 1;
  192. // To actually insert, we need to convert the code point n to UTF-8 and the
  193. // character index i to an index into the byte stream emitted so far. First
  194. // prepare the UTF-8 encoding for n, rejecting surrogates, overlarge values,
  195. // and anything that won't fit into the remaining output storage.
  196. Utf8ForCodePoint utf8_for_code_point(n);
  197. if (!utf8_for_code_point.ok()) return nullptr;
  198. if (total_utf8_bytes + utf8_for_code_point.length + 1 > out_size) {
  199. return nullptr;
  200. }
  201. // Now insert the new character into both our length map and the output.
  202. uint32_t n_index =
  203. utf8_lengths.InsertAndReturnSumOfPredecessors(
  204. i, utf8_for_code_point.length);
  205. std::memmove(
  206. out_begin + n_index + utf8_for_code_point.length, out_begin + n_index,
  207. total_utf8_bytes + 1 - n_index);
  208. std::memcpy(out_begin + n_index, utf8_for_code_point.bytes,
  209. utf8_for_code_point.length);
  210. total_utf8_bytes += utf8_for_code_point.length;
  211. ++num_chars;
  212. // Finally, advance to the next state before continuing.
  213. ++i;
  214. }
  215. return out_begin + total_utf8_bytes;
  216. }
  217. } // namespace debugging_internal
  218. ABSL_NAMESPACE_END
  219. } // namespace absl