// utf8.h
  1. #pragma once
  2. #include "recode_result.h"
  3. #include <util/generic/strbuf.h>
  4. #include <util/generic/string.h>
  5. #include <util/generic/yexception.h>
  6. #include <util/system/defaults.h>
  7. #include <util/system/yassert.h>
  8. extern const wchar32 BROKEN_RUNE;
  9. inline unsigned char UTF8LeadByteMask(size_t utf8_rune_len) {
  10. // Y_ASSERT (utf8_rune_len <= 4);
  11. return "\0\0\037\017\007"[utf8_rune_len];
  12. }
  13. inline size_t UTF8RuneLen(const unsigned char lead_byte) {
  14. // b0XXXXXXX
  15. if ((lead_byte & 0x80) == 0x00) {
  16. return 1;
  17. }
  18. // b110XXXXX
  19. if ((lead_byte & 0xe0) == 0xc0) {
  20. return 2;
  21. }
  22. // b1110XXXX
  23. if ((lead_byte & 0xf0) == 0xe0) {
  24. return 3;
  25. }
  26. // b11110XXX
  27. if ((lead_byte & 0xf8) == 0xf0) {
  28. return 4;
  29. }
  30. // b10XXXXXX
  31. return 0;
  32. }
  33. inline size_t UTF8RuneLenByUCS(wchar32 rune) {
  34. if (rune < 0x80) {
  35. return 1U;
  36. } else if (rune < 0x800) {
  37. return 2U;
  38. } else if (rune < 0x10000) {
  39. return 3U;
  40. } else if (rune < 0x200000) {
  41. return 4U;
  42. } else if (rune < 0x4000000) {
  43. return 5U;
  44. } else {
  45. return 6U;
  46. }
  47. }
  48. inline void PutUTF8LeadBits(wchar32& rune, unsigned char c, size_t len) {
  49. rune = c;
  50. rune &= UTF8LeadByteMask(len);
  51. }
  52. inline void PutUTF8SixBits(wchar32& rune, unsigned char c) {
  53. rune <<= 6;
  54. rune |= c & 0x3F;
  55. }
  56. inline bool IsUTF8ContinuationByte(unsigned char c) {
  57. return (c & static_cast<unsigned char>(0xC0)) == static_cast<unsigned char>(0x80);
  58. }
  59. //! returns length of the current UTF8 character
  60. //! @param n length of the current character, it is assigned in case of valid UTF8 byte sequence
  61. //! @param p pointer to the current character
  62. //! @param e end of the character sequence
  63. inline RECODE_RESULT GetUTF8CharLen(size_t& n, const unsigned char* p, const unsigned char* e) {
  64. Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
  65. switch (UTF8RuneLen(*p)) {
  66. case 0:
  67. return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in first byte
  68. case 1:
  69. n = 1;
  70. return RECODE_OK;
  71. case 2:
  72. if (p + 2 > e) {
  73. return RECODE_EOINPUT;
  74. } else if (!IsUTF8ContinuationByte(p[1])) {
  75. return RECODE_BROKENSYMBOL;
  76. } else {
  77. n = 2;
  78. return RECODE_OK;
  79. }
  80. case 3:
  81. if (p + 3 > e) {
  82. return RECODE_EOINPUT;
  83. } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2])) {
  84. return RECODE_BROKENSYMBOL;
  85. } else {
  86. n = 3;
  87. return RECODE_OK;
  88. }
  89. default: // actually 4
  90. if (p + 4 > e) {
  91. return RECODE_EOINPUT;
  92. } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2]) || !IsUTF8ContinuationByte(p[3])) {
  93. return RECODE_BROKENSYMBOL;
  94. } else {
  95. n = 4;
  96. return RECODE_OK;
  97. }
  98. }
  99. }
  100. //! returns number of characters in UTF8 encoded text, stops immediately if UTF8 byte sequence is wrong
  101. //! @param text UTF8 encoded text
  102. //! @param len the length of the text in bytes
  103. //! @param number number of encoded symbols in the text
  104. inline bool GetNumberOfUTF8Chars(const char* text, size_t len, size_t& number) {
  105. const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
  106. const unsigned char* const last = cur + len;
  107. number = 0;
  108. size_t runeLen;
  109. bool res = true;
  110. while (cur != last) {
  111. if (GetUTF8CharLen(runeLen, cur, last) != RECODE_OK) { // actually it could be RECODE_BROKENSYMBOL only
  112. res = false;
  113. break;
  114. }
  115. cur += runeLen;
  116. Y_ASSERT(cur <= last);
  117. ++number;
  118. }
  119. return res;
  120. }
  121. inline size_t GetNumberOfUTF8Chars(TStringBuf text) {
  122. size_t number;
  123. if (!GetNumberOfUTF8Chars(text.data(), text.size(), number)) {
  124. ythrow yexception() << "GetNumberOfUTF8Chars failed on invalid utf-8 " << TString(text.substr(0, 50)).Quote();
  125. }
  126. return number;
  127. }
  128. enum class StrictUTF8 {
  129. Yes,
  130. No
  131. };
  132. template <size_t runeLen, StrictUTF8 strictMode>
  133. inline bool IsValidUTF8Rune(wchar32 rune);
  134. template <>
  135. inline bool IsValidUTF8Rune<2, StrictUTF8::Yes>(wchar32 rune) {
  136. // check for overlong encoding
  137. return rune >= 0x80;
  138. }
  139. template <>
  140. inline bool IsValidUTF8Rune<2, StrictUTF8::No>(wchar32 rune) {
  141. return IsValidUTF8Rune<2, StrictUTF8::Yes>(rune);
  142. }
  143. template <>
  144. inline bool IsValidUTF8Rune<3, StrictUTF8::Yes>(wchar32 rune) {
  145. // surrogates are forbidden by RFC3629 section 3
  146. return rune >= 0x800 && (rune < 0xD800 || rune > 0xDFFF);
  147. }
  148. template <>
  149. inline bool IsValidUTF8Rune<3, StrictUTF8::No>(wchar32 rune) {
  150. // check for overlong encoding
  151. return rune >= 0x800;
  152. }
  153. template <>
  154. inline bool IsValidUTF8Rune<4, StrictUTF8::Yes>(wchar32 rune) {
  155. // check if this is a valid sumbod without overlong encoding
  156. return rune <= 0x10FFFF && rune >= 0x10000;
  157. }
  158. template <>
  159. inline bool IsValidUTF8Rune<4, StrictUTF8::No>(wchar32 rune) {
  160. return IsValidUTF8Rune<4, StrictUTF8::Yes>(rune);
  161. }
  162. //! reads one unicode symbol from a character sequence encoded UTF8 and checks for overlong encoding
  163. //! @param rune value of the current character
  164. //! @param rune_len length of the UTF8 bytes sequence that has been read
  165. //! @param s pointer to the current character
  166. //! @param end the end of the character sequence
  167. template <StrictUTF8 strictMode = StrictUTF8::No>
  168. inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const unsigned char* s, const unsigned char* end) {
  169. rune = BROKEN_RUNE;
  170. rune_len = 0;
  171. wchar32 _rune;
  172. size_t _len = UTF8RuneLen(*s);
  173. if (s + _len > end) {
  174. return RECODE_EOINPUT; // [EOINPUT]
  175. }
  176. if (_len == 0) {
  177. return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in first byte
  178. }
  179. _rune = *s++; // [00000000 0XXXXXXX]
  180. if (_len > 1) {
  181. _rune &= UTF8LeadByteMask(_len);
  182. unsigned char ch = *s++;
  183. if (!IsUTF8ContinuationByte(ch)) {
  184. return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in second byte
  185. }
  186. PutUTF8SixBits(_rune, ch); // [00000XXX XXYYYYYY]
  187. if (_len > 2) {
  188. ch = *s++;
  189. if (!IsUTF8ContinuationByte(ch)) {
  190. return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in third byte
  191. }
  192. PutUTF8SixBits(_rune, ch); // [XXXXYYYY YYZZZZZZ]
  193. if (_len > 3) {
  194. ch = *s;
  195. if (!IsUTF8ContinuationByte(ch)) {
  196. return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in fourth byte
  197. }
  198. PutUTF8SixBits(_rune, ch); // [XXXYY YYYYZZZZ ZZQQQQQQ]
  199. if (!IsValidUTF8Rune<4, strictMode>(_rune)) {
  200. return RECODE_BROKENSYMBOL;
  201. }
  202. } else {
  203. if (!IsValidUTF8Rune<3, strictMode>(_rune)) {
  204. return RECODE_BROKENSYMBOL;
  205. }
  206. }
  207. } else {
  208. if (!IsValidUTF8Rune<2, strictMode>(_rune)) {
  209. return RECODE_BROKENSYMBOL;
  210. }
  211. }
  212. }
  213. rune_len = _len;
  214. rune = _rune;
  215. return RECODE_OK;
  216. }
  217. //! reads one unicode symbol from a character sequence encoded UTF8 and moves pointer to the next character
  218. //! @param c value of the current character
  219. //! @param p pointer to the current character, it will be changed in case of valid UTF8 byte sequence
  220. //! @param e the end of the character sequence
  221. template <StrictUTF8 strictMode = StrictUTF8::No>
  222. Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigned char*& p, const unsigned char* e) noexcept {
  223. Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
  224. switch (UTF8RuneLen(*p)) {
  225. case 0:
  226. rune = BROKEN_RUNE;
  227. return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in first byte
  228. case 1:
  229. rune = *p; // [00000000 0XXXXXXX]
  230. ++p;
  231. return RECODE_OK;
  232. case 2:
  233. if (p + 2 > e) {
  234. return RECODE_EOINPUT;
  235. } else if (!IsUTF8ContinuationByte(p[1])) {
  236. rune = BROKEN_RUNE;
  237. return RECODE_BROKENSYMBOL;
  238. } else {
  239. PutUTF8LeadBits(rune, *p++, 2); // [00000000 000XXXXX]
  240. PutUTF8SixBits(rune, *p++); // [00000XXX XXYYYYYY]
  241. if (!IsValidUTF8Rune<2, strictMode>(rune)) {
  242. p -= 2;
  243. rune = BROKEN_RUNE;
  244. return RECODE_BROKENSYMBOL;
  245. }
  246. return RECODE_OK;
  247. }
  248. case 3:
  249. if (p + 3 > e) {
  250. return RECODE_EOINPUT;
  251. } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2])) {
  252. rune = BROKEN_RUNE;
  253. return RECODE_BROKENSYMBOL;
  254. } else {
  255. PutUTF8LeadBits(rune, *p++, 3); // [00000000 0000XXXX]
  256. PutUTF8SixBits(rune, *p++); // [000000XX XXYYYYYY]
  257. PutUTF8SixBits(rune, *p++); // [XXXXYYYY YYZZZZZZ]
  258. // check for overlong encoding and surrogates
  259. if (!IsValidUTF8Rune<3, strictMode>(rune)) {
  260. p -= 3;
  261. rune = BROKEN_RUNE;
  262. return RECODE_BROKENSYMBOL;
  263. }
  264. return RECODE_OK;
  265. }
  266. case 4:
  267. if (p + 4 > e) {
  268. return RECODE_EOINPUT;
  269. } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2]) || !IsUTF8ContinuationByte(p[3])) {
  270. rune = BROKEN_RUNE;
  271. return RECODE_BROKENSYMBOL;
  272. } else {
  273. PutUTF8LeadBits(rune, *p++, 4); // [00000000 00000000 00000XXX]
  274. PutUTF8SixBits(rune, *p++); // [00000000 0000000X XXYYYYYY]
  275. PutUTF8SixBits(rune, *p++); // [00000000 0XXXYYYY YYZZZZZZ]
  276. PutUTF8SixBits(rune, *p++); // [000XXXYY YYYYZZZZ ZZQQQQQQ]
  277. if (!IsValidUTF8Rune<4, strictMode>(rune)) {
  278. p -= 4;
  279. rune = BROKEN_RUNE;
  280. return RECODE_BROKENSYMBOL;
  281. }
  282. return RECODE_OK;
  283. }
  284. default: // >4
  285. rune = BROKEN_RUNE;
  286. return RECODE_BROKENSYMBOL;
  287. }
  288. }
  289. //! writes one unicode symbol into a character sequence encoded UTF8
  290. //! checks for end of the buffer and returns the result of encoding
  291. //! @param rune value of the current character
  292. //! @param rune_len length of the UTF8 byte sequence that has been written
  293. //! @param s pointer to the output buffer
  294. //! @param tail available size of the buffer
  295. inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, size_t tail) {
  296. rune_len = 0;
  297. if (rune < 0x80) {
  298. if (tail <= 0) {
  299. return RECODE_EOOUTPUT;
  300. }
  301. *s = static_cast<unsigned char>(rune);
  302. rune_len = 1;
  303. return RECODE_OK;
  304. }
  305. if (rune < 0x800) {
  306. if (tail <= 1) {
  307. return RECODE_EOOUTPUT;
  308. }
  309. *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
  310. *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
  311. rune_len = 2;
  312. return RECODE_OK;
  313. }
  314. if (rune < 0x10000) {
  315. if (tail <= 2) {
  316. return RECODE_EOOUTPUT;
  317. }
  318. *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
  319. *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
  320. *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
  321. rune_len = 3;
  322. return RECODE_OK;
  323. }
  324. /*if (rune < 0x200000)*/ {
  325. if (tail <= 3) {
  326. return RECODE_EOOUTPUT;
  327. }
  328. *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
  329. *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
  330. *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
  331. *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
  332. rune_len = 4;
  333. return RECODE_OK;
  334. }
  335. }
  336. inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, const unsigned char* end) {
  337. return SafeWriteUTF8Char(rune, rune_len, s, end - s);
  338. }
  339. //! writes one unicode symbol into a character sequence encoded UTF8
  340. //! @attention this function works as @c SafeWriteUTF8Char it does not check
  341. //! the size of the output buffer, it supposes that buffer is long enough
  342. //! @param rune value of the current character
  343. //! @param rune_len length of the UTF8 byte sequence that has been written
  344. //! @param s pointer to the output buffer
  345. inline void WriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s) {
  346. if (rune < 0x80) {
  347. *s = static_cast<unsigned char>(rune);
  348. rune_len = 1;
  349. return;
  350. }
  351. if (rune < 0x800) {
  352. *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
  353. *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
  354. rune_len = 2;
  355. return;
  356. }
  357. if (rune < 0x10000) {
  358. *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
  359. *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
  360. *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
  361. rune_len = 3;
  362. return;
  363. }
  364. /*if (rune < 0x200000)*/ {
  365. *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
  366. *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
  367. *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
  368. *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
  369. rune_len = 4;
  370. }
  371. }
  372. TStringBuf SubstrUTF8(const TStringBuf str Y_LIFETIME_BOUND, size_t pos, size_t len);
  373. enum EUTF8Detect {
  374. NotUTF8,
  375. UTF8,
  376. ASCII
  377. };
  378. EUTF8Detect UTF8Detect(const char* s, size_t len);
  379. inline EUTF8Detect UTF8Detect(const TStringBuf input) {
  380. return UTF8Detect(input.data(), input.size());
  381. }
  382. inline bool IsUtf(const char* input, size_t len) {
  383. return UTF8Detect(input, len) != NotUTF8;
  384. }
  385. inline bool IsUtf(const TStringBuf input) {
  386. return IsUtf(input.data(), input.size());
  387. }
  388. //! returns true, if result is not the same as input, and put it in newString
  389. //! returns false, if result is unmodified
  390. bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString);
  391. TString ToLowerUTF8(const TString& s);
  392. TString ToLowerUTF8(TStringBuf s);
  393. TString ToLowerUTF8(const char* s);
  394. inline TString ToLowerUTF8(const std::string& s) {
  395. return ToLowerUTF8(TStringBuf(s));
  396. }
  397. //! returns true, if result is not the same as input, and put it in newString
  398. //! returns false, if result is unmodified
  399. bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString);
  400. TString ToUpperUTF8(const TString& s);
  401. TString ToUpperUTF8(TStringBuf s);
  402. TString ToUpperUTF8(const char* s);