ascii.cc 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
  1. // Copyright 2017 The Abseil Authors.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // https://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. #include "y_absl/strings/ascii.h"
  15. #include <climits>
  16. #include <cstddef>
  17. #include <cstring>
  18. #include <util/generic/string.h>
  19. #include "y_absl/base/attributes.h"
  20. #include "y_absl/base/config.h"
  21. #include "y_absl/base/nullability.h"
  22. #include "y_absl/base/optimization.h"
  23. namespace y_absl {
  24. Y_ABSL_NAMESPACE_BEGIN
  25. namespace ascii_internal {
  26. // # Table generated by this Python code (bit 0x02 is currently unused):
  27. // TODO(mbar) Move Python code for generation of table to BUILD and link here.
  28. // NOTE: The kAsciiPropertyBits table used within this code was generated by
  29. // Python code of the following form. (Bit 0x02 is currently unused and
  30. // available.)
  31. //
  32. // def Hex2(n):
  33. // return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
  34. // def IsPunct(ch):
  35. // return (ord(ch) >= 32 and ord(ch) < 127 and
  36. // not ch.isspace() and not ch.isalnum())
  37. // def IsBlank(ch):
  38. // return ch in ' \t'
  39. // def IsCntrl(ch):
  40. // return ord(ch) < 32 or ord(ch) == 127
  41. // def IsXDigit(ch):
  42. // return ch.isdigit() or ch.lower() in 'abcdef'
  43. // for i in range(128):
  44. // ch = chr(i)
  45. // mask = ((ch.isalpha() and 0x01 or 0) |
  46. // (ch.isalnum() and 0x04 or 0) |
  47. // (ch.isspace() and 0x08 or 0) |
  48. // (IsPunct(ch) and 0x10 or 0) |
  49. // (IsBlank(ch) and 0x20 or 0) |
  50. // (IsCntrl(ch) and 0x40 or 0) |
  51. // (IsXDigit(ch) and 0x80 or 0))
  52. // print Hex2(mask) + ',',
  53. // if i % 16 == 7:
  54. // print ' //', Hex2(i & 0x78)
  55. // elif i % 16 == 15:
  56. // print
  57. // clang-format off
  58. // Array of bitfields holding character information. Each bit value corresponds
  59. // to a particular character feature. For readability, and because the value
  60. // of these bits is tightly coupled to this implementation, the individual bits
  61. // are not named. Note that bitfields for all characters above ASCII 127 are
  62. // zero-initialized.
  63. Y_ABSL_DLL const unsigned char kPropertyBits[256] = {
  64. 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x00
  65. 0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,
  66. 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x10
  67. 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
  68. 0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, // 0x20
  69. 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
  70. 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, // 0x30
  71. 0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
  72. 0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05, // 0x40
  73. 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
  74. 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0x50
  75. 0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,
  76. 0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05, // 0x60
  77. 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
  78. 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0x70
  79. 0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,
  80. };
  81. // Array of characters for the ascii_tolower() function. For values 'A'
  82. // through 'Z', return the lower-case character; otherwise, return the
  83. // identity of the passed character.
  84. Y_ABSL_DLL const char kToLower[256] = {
  85. '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
  86. '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
  87. '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
  88. '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
  89. '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
  90. '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
  91. '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
  92. '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
  93. '\x40', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
  94. 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
  95. 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
  96. 'x', 'y', 'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
  97. '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
  98. '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
  99. '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
  100. '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
  101. '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
  102. '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
  103. '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
  104. '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
  105. '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
  106. '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
  107. '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
  108. '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
  109. '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
  110. '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
  111. '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
  112. '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
  113. '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
  114. '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
  115. '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
  116. '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
  117. };
  118. // Array of characters for the ascii_toupper() function. For values 'a'
  119. // through 'z', return the upper-case character; otherwise, return the
  120. // identity of the passed character.
  121. Y_ABSL_DLL const char kToUpper[256] = {
  122. '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
  123. '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
  124. '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
  125. '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
  126. '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
  127. '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
  128. '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
  129. '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
  130. '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',
  131. '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
  132. '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',
  133. '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
  134. '\x60', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
  135. 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
  136. 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
  137. 'X', 'Y', 'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
  138. '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
  139. '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
  140. '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
  141. '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
  142. '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
  143. '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
  144. '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
  145. '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
  146. '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
  147. '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
  148. '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
  149. '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
  150. '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
  151. '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
  152. '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
  153. '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
  154. };
  155. // clang-format on
  156. // Returns whether `c` is in the a-z/A-Z range (w.r.t. `ToUpper`).
  157. // Implemented by:
  158. // 1. Pushing the a-z/A-Z range to [SCHAR_MIN, SCHAR_MIN + 26).
  159. // 2. Comparing to SCHAR_MIN + 26.
  160. template <bool ToUpper>
  161. constexpr bool AsciiInAZRange(unsigned char c) {
  162. constexpr unsigned char sub = (ToUpper ? 'a' : 'A') - SCHAR_MIN;
  163. constexpr signed char threshold = SCHAR_MIN + 26; // 26 = alphabet size.
  164. // Using unsigned arithmetic as overflows/underflows are well defined.
  165. unsigned char u = c - sub;
  166. // Using signed cmp, as SIMD unsigned cmp isn't available in many platforms.
  167. return static_cast<signed char>(u) < threshold;
  168. }
  169. // Force-inline so the compiler won't merge the short and long implementations.
  170. template <bool ToUpper>
  171. Y_ABSL_ATTRIBUTE_ALWAYS_INLINE inline constexpr void AsciiStrCaseFoldImpl(
  172. y_absl::Nonnull<char*> p, size_t size) {
  173. // The upper- and lowercase versions of ASCII characters differ by only 1 bit.
  174. // When we need to flip the case, we can xor with this bit to achieve the
  175. // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We
  176. // could have chosen 'z' and 'Z', or any other pair of characters as they all
  177. // have the same single bit difference.
  178. constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A';
  179. for (size_t i = 0; i < size; ++i) {
  180. unsigned char v = static_cast<unsigned char>(p[i]);
  181. v ^= AsciiInAZRange<ToUpper>(v) ? kAsciiCaseBitFlip : 0;
  182. p[i] = static_cast<char>(v);
  183. }
  184. }
  185. // The string size threshold for starting using the long string version.
  186. constexpr size_t kCaseFoldThreshold = 16;
  187. // No-inline so the compiler won't merge the short and long implementations.
  188. template <bool ToUpper>
  189. Y_ABSL_ATTRIBUTE_NOINLINE constexpr void AsciiStrCaseFoldLong(
  190. y_absl::Nonnull<char*> p, size_t size) {
  191. Y_ABSL_ASSUME(size >= kCaseFoldThreshold);
  192. AsciiStrCaseFoldImpl<ToUpper>(p, size);
  193. }
  194. // Splitting to short and long strings to allow vectorization decisions
  195. // to be made separately in the long and short cases.
  196. template <bool ToUpper>
  197. constexpr void AsciiStrCaseFold(y_absl::Nonnull<char*> p, size_t size) {
  198. size < kCaseFoldThreshold ? AsciiStrCaseFoldImpl<ToUpper>(p, size)
  199. : AsciiStrCaseFoldLong<ToUpper>(p, size);
  200. }
  201. static constexpr size_t ValidateAsciiCasefold() {
  202. constexpr size_t num_chars = 1 + CHAR_MAX - CHAR_MIN;
  203. size_t incorrect_index = 0;
  204. char lowered[num_chars] = {};
  205. char uppered[num_chars] = {};
  206. for (unsigned int i = 0; i < num_chars; ++i) {
  207. uppered[i] = lowered[i] = static_cast<char>(i);
  208. }
  209. AsciiStrCaseFold<false>(&lowered[0], num_chars);
  210. AsciiStrCaseFold<true>(&uppered[0], num_chars);
  211. for (size_t i = 0; i < num_chars; ++i) {
  212. const char ch = static_cast<char>(i),
  213. ch_upper = ('a' <= ch && ch <= 'z' ? 'A' + (ch - 'a') : ch),
  214. ch_lower = ('A' <= ch && ch <= 'Z' ? 'a' + (ch - 'A') : ch);
  215. if (uppered[i] != ch_upper || lowered[i] != ch_lower) {
  216. incorrect_index = i > 0 ? i : num_chars;
  217. break;
  218. }
  219. }
  220. return incorrect_index;
  221. }
  222. static_assert(ValidateAsciiCasefold() == 0, "error in case conversion");
  223. } // namespace ascii_internal
  224. void AsciiStrToLower(y_absl::Nonnull<TString*> s) {
  225. return ascii_internal::AsciiStrCaseFold<false>(&(*s)[0], s->size());
  226. }
  227. void AsciiStrToUpper(y_absl::Nonnull<TString*> s) {
  228. return ascii_internal::AsciiStrCaseFold<true>(&(*s)[0], s->size());
  229. }
  230. void RemoveExtraAsciiWhitespace(y_absl::Nonnull<TString*> str) {
  231. auto stripped = StripAsciiWhitespace(*str);
  232. if (stripped.empty()) {
  233. str->clear();
  234. return;
  235. }
  236. auto input_it = stripped.begin();
  237. auto input_end = stripped.end();
  238. auto output_it = &(*str)[0];
  239. bool is_ws = false;
  240. for (; input_it < input_end; ++input_it) {
  241. if (is_ws) {
  242. // Consecutive whitespace? Keep only the last.
  243. is_ws = y_absl::ascii_isspace(static_cast<unsigned char>(*input_it));
  244. if (is_ws) --output_it;
  245. } else {
  246. is_ws = y_absl::ascii_isspace(static_cast<unsigned char>(*input_it));
  247. }
  248. *output_it = *input_it;
  249. ++output_it;
  250. }
  251. str->erase(static_cast<size_t>(output_it - &(*str)[0]));
  252. }
  253. Y_ABSL_NAMESPACE_END
  254. } // namespace y_absl