utf8.cpp 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. #include "utf8.h"
  2. #include <util/charset/wide.h>
  3. #include <ctype.h>
  4. #include <vector>
  5. namespace NYql {
  6. namespace {
  7. unsigned char GetRange(unsigned char c) {
  8. // Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
  9. // With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types.
  10. static const unsigned char type[] = {
  11. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  12. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  13. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  14. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  15. 0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,
  16. 0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,
  17. 0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
  18. 0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
  19. 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  20. 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
  21. };
  22. return type[c];
  23. }
  24. struct TByteRange {
  25. ui8 First = 0;
  26. ui8 Last = 0;
  27. };
  28. struct TUtf8Ranges {
  29. size_t BytesCount = 0;
  30. TByteRange Bytes[4] = {};
  31. };
  32. // see https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8
  33. inline static const std::vector<TUtf8Ranges> Utf8Ranges = {
  34. { 1, { {0x00, 0x7f}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, } },
  35. { 2, { {0xc2, 0xdf}, {0x80, 0xbf}, {0x00, 0x00}, {0x00, 0x00}, } },
  36. { 3, { {0xe0, 0xe0}, {0xa0, 0xbf}, {0x80, 0xbf}, {0x00, 0x00}, } },
  37. { 3, { {0xe1, 0xec}, {0x80, 0xbf}, {0x80, 0xbf}, {0x00, 0x00}, } },
  38. { 3, { {0xed, 0xed}, {0x80, 0x9f}, {0x80, 0xbf}, {0x00, 0x00}, } },
  39. { 3, { {0xee, 0xef}, {0x80, 0xbf}, {0x80, 0xbf}, {0x00, 0x00}, } },
  40. { 4, { {0xf0, 0xf0}, {0x90, 0xbf}, {0x80, 0xbf}, {0x80, 0xbf}, } },
  41. { 4, { {0xf1, 0xf3}, {0x80, 0xbf}, {0x80, 0xbf}, {0x80, 0xbf}, } },
  42. { 4, { {0xf4, 0xf4}, {0x80, 0x8f}, {0x80, 0xbf}, {0x80, 0xbf}, } },
  43. };
  44. std::optional<std::string> RoundBadUtf8(size_t range, std::string_view inputString, size_t pos,
  45. bool roundDown)
  46. {
  47. Y_ENSURE(range > 0);
  48. Y_ENSURE(range < Utf8Ranges.size());
  49. const std::string prefix{inputString.substr(0, pos)};
  50. std::string_view suffix = inputString.substr(pos, Utf8Ranges[range].BytesCount);
  51. std::string newSuffix;
  52. if (roundDown) {
  53. std::optional<size_t> lastNonMin;
  54. for (size_t i = 0; i < suffix.size(); ++i) {
  55. if (Utf8Ranges[range].Bytes[i].First < ui8(suffix[i])) {
  56. lastNonMin = i;
  57. }
  58. }
  59. if (!lastNonMin) {
  60. for (size_t i = 0; i < Utf8Ranges[range - 1].BytesCount; ++i) {
  61. newSuffix.push_back(Utf8Ranges[range - 1].Bytes[i].Last);
  62. }
  63. } else {
  64. for (size_t i = 0; i < Utf8Ranges[range].BytesCount; ++i) {
  65. if (i < *lastNonMin) {
  66. ui8 c = suffix[i];
  67. newSuffix.push_back(c);
  68. } else if (i == *lastNonMin) {
  69. ui8 c = suffix[i];
  70. newSuffix.push_back(std::min<ui8>(c - 1, Utf8Ranges[range].Bytes[i].Last));
  71. } else {
  72. newSuffix.push_back(Utf8Ranges[range].Bytes[i].Last);
  73. }
  74. }
  75. }
  76. } else {
  77. std::optional<size_t> lastNonMax;
  78. bool valid = true;
  79. for (size_t i = 0; i < suffix.size(); ++i) {
  80. ui8 last = Utf8Ranges[range].Bytes[i].Last;
  81. ui8 first = Utf8Ranges[range].Bytes[i].First;
  82. ui8 curr = ui8(suffix[i]);
  83. valid = valid && curr <= last && curr >= first;
  84. if (curr < last) {
  85. lastNonMax = i;
  86. }
  87. }
  88. if (valid) {
  89. newSuffix = suffix;
  90. for (size_t i = suffix.size(); i < Utf8Ranges[range].BytesCount; ++i) {
  91. newSuffix.push_back(Utf8Ranges[range].Bytes[i].First);
  92. }
  93. } else if (!lastNonMax) {
  94. return NextValidUtf8(prefix);
  95. } else {
  96. for (size_t i = 0; i < Utf8Ranges[range].BytesCount; ++i) {
  97. if (i < *lastNonMax) {
  98. ui8 c = suffix[i];
  99. newSuffix.push_back(c);
  100. } else if (i == *lastNonMax) {
  101. ui8 c = suffix[i];
  102. newSuffix.push_back(std::max<ui8>(c + 1, Utf8Ranges[range].Bytes[i].First));
  103. } else {
  104. newSuffix.push_back(Utf8Ranges[range].Bytes[i].First);
  105. }
  106. }
  107. }
  108. }
  109. return prefix + newSuffix;
  110. }
  111. }
  112. bool IsUtf8(const std::string_view& str) {
  113. for (auto it = str.cbegin(); str.cend() != it;) {
  114. #define COPY() if (str.cend() != it) { c = *it++; } else { return false; }
  115. #define TRANS(mask) result &= ((GetRange(static_cast<unsigned char>(c)) & mask) != 0)
  116. #define TAIL() COPY(); TRANS(0x70)
  117. auto c = *it++;
  118. if (!(c & 0x80))
  119. continue;
  120. bool result = true;
  121. switch (GetRange(static_cast<unsigned char>(c))) {
  122. case 2: TAIL(); break;
  123. case 3: TAIL(); TAIL(); break;
  124. case 4: COPY(); TRANS(0x50); TAIL(); break;
  125. case 5: COPY(); TRANS(0x10); TAIL(); TAIL(); break;
  126. case 6: TAIL(); TAIL(); TAIL(); break;
  127. case 10: COPY(); TRANS(0x20); TAIL(); break;
  128. case 11: COPY(); TRANS(0x60); TAIL(); TAIL(); break;
  129. default: return false;
  130. }
  131. if (!result) return false;
  132. #undef COPY
  133. #undef TRANS
  134. #undef TAIL
  135. }
  136. return true;
  137. }
  138. unsigned char WideCharSize(char head) {
  139. switch (GetRange(static_cast<unsigned char>(head))) {
  140. case 0: return 1;
  141. case 2: return 2;
  142. case 3: return 3;
  143. case 4: return 3;
  144. case 5: return 4;
  145. case 6: return 4;
  146. case 10: return 3;
  147. case 11: return 4;
  148. default: return 0;
  149. }
  150. }
  151. std::optional<std::string> RoundToNearestValidUtf8(const std::string_view& str, bool roundDown) {
  152. const size_t ss = str.size();
  153. for (size_t pos = 0; pos < ss; ) {
  154. ui8 c = str[pos];
  155. for (size_t i = 0; i < Utf8Ranges.size(); ++i) {
  156. auto& range = Utf8Ranges[i];
  157. if (c < range.Bytes[0].First) {
  158. return RoundBadUtf8(i, str, pos, roundDown);
  159. }
  160. if (c <= range.Bytes[0].Last) {
  161. // valid UTF8 code point start
  162. for (size_t j = 1; j < range.BytesCount; ++j) {
  163. if (pos + j >= ss) {
  164. return RoundBadUtf8(i, str, pos, roundDown);
  165. }
  166. ui8 cur = str[pos + j];
  167. if (!(cur >= range.Bytes[j].First && cur <= range.Bytes[j].Last)) {
  168. return RoundBadUtf8(i, str, pos, roundDown);
  169. }
  170. }
  171. pos += range.BytesCount;
  172. break;
  173. } else if (i + 1 == Utf8Ranges.size()) {
  174. if (!roundDown) {
  175. return NextValidUtf8(str.substr(0, pos));
  176. }
  177. return RoundBadUtf8(i, str, pos, roundDown);
  178. }
  179. }
  180. }
  181. return std::string(str);
  182. }
  183. std::optional<std::string> NextValidUtf8(const std::string_view& str) {
  184. Y_ENSURE(IsUtf8(str));
  185. TUtf32String wide = UTF8ToUTF32<false>(str);
  186. bool incremented = false;
  187. size_t toDrop = 0;
  188. for (auto it = wide.rbegin(); it != wide.rend(); ++it) {
  189. auto& c = *it;
  190. if (c < 0x10ffff) {
  191. c = (c == 0xd7ff) ? 0xe000 : (c + 1);
  192. incremented = true;
  193. break;
  194. } else {
  195. ++toDrop;
  196. }
  197. }
  198. if (!incremented) {
  199. return {};
  200. }
  201. Y_ENSURE(toDrop < wide.size());
  202. wide.resize(wide.size() - toDrop);
  203. TString result = WideToUTF8(wide);
  204. return std::string(result.data(), result.size());
  205. }
  206. std::optional<std::string> NextLexicographicString(const std::string_view& str) {
  207. bool incremented = false;
  208. size_t toDrop = 0;
  209. std::string result{str};
  210. for (auto it = result.rbegin(); it != result.rend(); ++it) {
  211. auto& c = *it;
  212. if (ui8(c) < 0xff) {
  213. ++c;
  214. incremented = true;
  215. break;
  216. } else {
  217. ++toDrop;
  218. }
  219. }
  220. if (!incremented) {
  221. return {};
  222. }
  223. Y_ENSURE(toDrop < result.size());
  224. result.resize(result.size() - toDrop);
  225. return result;
  226. }
  227. }