utf8.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431
  1. #pragma once
  2. #include "recode_result.h"
  3. #include <util/generic/strbuf.h>
  4. #include <util/generic/string.h>
  5. #include <util/generic/yexception.h>
  6. #include <util/system/defaults.h>
  7. #include <util/system/yassert.h>
  8. extern const wchar32 BROKEN_RUNE;
  //! Returns the mask that extracts the payload bits from the lead byte of a
  //! UTF-8 sequence.
  //! @param utf8_rune_len length of the whole sequence in bytes
  //! @return 0x1F, 0x0F and 0x07 for lengths 2, 3 and 4; 0 for every other length
  inline unsigned char UTF8LeadByteMask(size_t utf8_rune_len) {
      // Indexed by sequence length. Entries 0 and 1 are zero, matching the
      // string-literal table this replaces.
      static constexpr unsigned char MASKS[] = {0x00, 0x00, 0x1F, 0x0F, 0x07};
      constexpr size_t maskCount = sizeof(MASKS) / sizeof(MASKS[0]);
      // A length outside the table used to index past the end of the literal;
      // answer 0 instead of reading unrelated memory.
      return utf8_rune_len < maskCount ? MASKS[utf8_rune_len] : static_cast<unsigned char>(0);
  }
  //! Classifies a byte by its high bits and returns the length of the UTF-8
  //! sequence it starts.
  //! @param lead_byte first byte of the sequence
  //! @return 1..4 for a lead byte, 0 for b10XXXXXX and b11111XXX
  inline size_t UTF8RuneLen(const unsigned char lead_byte) {
      // The five highest bits are enough to tell every class apart.
      const unsigned top = static_cast<unsigned>(lead_byte) >> 3;
      if (top < 0x10) {
          return 1; // b0XXXX...
      }
      if (top < 0x18) {
          return 0; // b10XXX... is never a lead byte
      }
      if (top < 0x1C) {
          return 2; // b110XX...
      }
      if (top < 0x1E) {
          return 3; // b1110X...
      }
      if (top == 0x1E) {
          return 4; // b11110...
      }
      return 0; // b11111...
  }
  33. inline size_t UTF8RuneLenByUCS(wchar32 rune) {
  34. if (rune < 0x80)
  35. return 1U;
  36. else if (rune < 0x800)
  37. return 2U;
  38. else if (rune < 0x10000)
  39. return 3U;
  40. else if (rune < 0x200000)
  41. return 4U;
  42. else if (rune < 0x4000000)
  43. return 5U;
  44. else
  45. return 6U;
  46. }
  47. inline void PutUTF8LeadBits(wchar32& rune, unsigned char c, size_t len) {
  48. rune = c;
  49. rune &= UTF8LeadByteMask(len);
  50. }
  51. inline void PutUTF8SixBits(wchar32& rune, unsigned char c) {
  52. rune <<= 6;
  53. rune |= c & 0x3F;
  54. }
  //! Tells whether @c c has the form b10XXXXXX, i.e. is a continuation byte.
  inline bool IsUTF8ContinuationByte(unsigned char c) {
      // Only the two highest bits matter, and they must be exactly "10".
      return (c >> 6) == 0x2;
  }
  58. //! returns length of the current UTF8 character
  59. //! @param n length of the current character, it is assigned in case of valid UTF8 byte sequence
  60. //! @param p pointer to the current character
  61. //! @param e end of the character sequence
  62. inline RECODE_RESULT GetUTF8CharLen(size_t& n, const unsigned char* p, const unsigned char* e) {
  63. Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
  64. switch (UTF8RuneLen(*p)) {
  65. case 0:
  66. return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
  67. case 1:
  68. n = 1;
  69. return RECODE_OK;
  70. case 2:
  71. if (p + 2 > e) {
  72. return RECODE_EOINPUT;
  73. } else if (!IsUTF8ContinuationByte(p[1])) {
  74. return RECODE_BROKENSYMBOL;
  75. } else {
  76. n = 2;
  77. return RECODE_OK;
  78. }
  79. case 3:
  80. if (p + 3 > e) {
  81. return RECODE_EOINPUT;
  82. } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2])) {
  83. return RECODE_BROKENSYMBOL;
  84. } else {
  85. n = 3;
  86. return RECODE_OK;
  87. }
  88. default: // actually 4
  89. if (p + 4 > e) {
  90. return RECODE_EOINPUT;
  91. } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2]) || !IsUTF8ContinuationByte(p[3])) {
  92. return RECODE_BROKENSYMBOL;
  93. } else {
  94. n = 4;
  95. return RECODE_OK;
  96. }
  97. }
  98. }
  99. //! returns number of characters in UTF8 encoded text, stops immediately if UTF8 byte sequence is wrong
  100. //! @param text UTF8 encoded text
  101. //! @param len the length of the text in bytes
  102. //! @param number number of encoded symbols in the text
  103. inline bool GetNumberOfUTF8Chars(const char* text, size_t len, size_t& number) {
  104. const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
  105. const unsigned char* const last = cur + len;
  106. number = 0;
  107. size_t runeLen;
  108. bool res = true;
  109. while (cur != last) {
  110. if (GetUTF8CharLen(runeLen, cur, last) != RECODE_OK) { // actually it could be RECODE_BROKENSYMBOL only
  111. res = false;
  112. break;
  113. }
  114. cur += runeLen;
  115. Y_ASSERT(cur <= last);
  116. ++number;
  117. }
  118. return res;
  119. }
  120. inline size_t GetNumberOfUTF8Chars(TStringBuf text) {
  121. size_t number;
  122. if (!GetNumberOfUTF8Chars(text.data(), text.size(), number)) {
  123. ythrow yexception() << "GetNumberOfUTF8Chars failed on invalid utf-8 " << TString(text.substr(0, 50)).Quote();
  124. }
  125. return number;
  126. }
  //! Selects how decoded values are validated by IsValidUTF8Rune.
  //! In this header the two modes differ only for three-byte sequences.
  enum class StrictUTF8 {
      Yes, //!< three-byte sequences encoding 0xD800..0xDFFF (surrogates) are rejected
      No   //!< three-byte sequences are checked for overlong encoding only
  };
  131. template <size_t runeLen, StrictUTF8 strictMode>
  132. inline bool IsValidUTF8Rune(wchar32 rune);
  133. template <>
  134. inline bool IsValidUTF8Rune<2, StrictUTF8::Yes>(wchar32 rune) {
  135. // check for overlong encoding
  136. return rune >= 0x80;
  137. }
  138. template <>
  139. inline bool IsValidUTF8Rune<2, StrictUTF8::No>(wchar32 rune) {
  140. return IsValidUTF8Rune<2, StrictUTF8::Yes>(rune);
  141. }
  142. template <>
  143. inline bool IsValidUTF8Rune<3, StrictUTF8::Yes>(wchar32 rune) {
  144. // surrogates are forbidden by RFC3629 section 3
  145. return rune >= 0x800 && (rune < 0xD800 || rune > 0xDFFF);
  146. }
  147. template <>
  148. inline bool IsValidUTF8Rune<3, StrictUTF8::No>(wchar32 rune) {
  149. // check for overlong encoding
  150. return rune >= 0x800;
  151. }
  152. template <>
  153. inline bool IsValidUTF8Rune<4, StrictUTF8::Yes>(wchar32 rune) {
  154. // check if this is a valid sumbod without overlong encoding
  155. return rune <= 0x10FFFF && rune >= 0x10000;
  156. }
  157. template <>
  158. inline bool IsValidUTF8Rune<4, StrictUTF8::No>(wchar32 rune) {
  159. return IsValidUTF8Rune<4, StrictUTF8::Yes>(rune);
  160. }
  161. //! reads one unicode symbol from a character sequence encoded UTF8 and checks for overlong encoding
  162. //! @param rune value of the current character
  163. //! @param rune_len length of the UTF8 bytes sequence that has been read
  164. //! @param s pointer to the current character
  165. //! @param end the end of the character sequence
  166. template <StrictUTF8 strictMode = StrictUTF8::No>
  167. inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const unsigned char* s, const unsigned char* end) {
  168. rune = BROKEN_RUNE;
  169. rune_len = 0;
  170. wchar32 _rune;
  171. size_t _len = UTF8RuneLen(*s);
  172. if (s + _len > end)
  173. return RECODE_EOINPUT; //[EOINPUT]
  174. if (_len == 0)
  175. return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
  176. _rune = *s++; //[00000000 0XXXXXXX]
  177. if (_len > 1) {
  178. _rune &= UTF8LeadByteMask(_len);
  179. unsigned char ch = *s++;
  180. if (!IsUTF8ContinuationByte(ch))
  181. return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte
  182. PutUTF8SixBits(_rune, ch); //[00000XXX XXYYYYYY]
  183. if (_len > 2) {
  184. ch = *s++;
  185. if (!IsUTF8ContinuationByte(ch))
  186. return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte
  187. PutUTF8SixBits(_rune, ch); //[XXXXYYYY YYZZZZZZ]
  188. if (_len > 3) {
  189. ch = *s;
  190. if (!IsUTF8ContinuationByte(ch))
  191. return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte
  192. PutUTF8SixBits(_rune, ch); //[XXXYY YYYYZZZZ ZZQQQQQQ]
  193. if (!IsValidUTF8Rune<4, strictMode>(_rune))
  194. return RECODE_BROKENSYMBOL;
  195. } else {
  196. if (!IsValidUTF8Rune<3, strictMode>(_rune))
  197. return RECODE_BROKENSYMBOL;
  198. }
  199. } else {
  200. if (!IsValidUTF8Rune<2, strictMode>(_rune))
  201. return RECODE_BROKENSYMBOL;
  202. }
  203. }
  204. rune_len = _len;
  205. rune = _rune;
  206. return RECODE_OK;
  207. }
  208. //! reads one unicode symbol from a character sequence encoded UTF8 and moves pointer to the next character
  209. //! @param c value of the current character
  210. //! @param p pointer to the current character, it will be changed in case of valid UTF8 byte sequence
  211. //! @param e the end of the character sequence
  212. template <StrictUTF8 strictMode = StrictUTF8::No>
  213. Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigned char*& p, const unsigned char* e) noexcept {
  214. Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
  215. switch (UTF8RuneLen(*p)) {
  216. case 0:
  217. rune = BROKEN_RUNE;
  218. return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
  219. case 1:
  220. rune = *p; //[00000000 0XXXXXXX]
  221. ++p;
  222. return RECODE_OK;
  223. case 2:
  224. if (p + 2 > e) {
  225. return RECODE_EOINPUT;
  226. } else if (!IsUTF8ContinuationByte(p[1])) {
  227. rune = BROKEN_RUNE;
  228. return RECODE_BROKENSYMBOL;
  229. } else {
  230. PutUTF8LeadBits(rune, *p++, 2); //[00000000 000XXXXX]
  231. PutUTF8SixBits(rune, *p++); //[00000XXX XXYYYYYY]
  232. if (!IsValidUTF8Rune<2, strictMode>(rune)) {
  233. p -= 2;
  234. rune = BROKEN_RUNE;
  235. return RECODE_BROKENSYMBOL;
  236. }
  237. return RECODE_OK;
  238. }
  239. case 3:
  240. if (p + 3 > e) {
  241. return RECODE_EOINPUT;
  242. } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2])) {
  243. rune = BROKEN_RUNE;
  244. return RECODE_BROKENSYMBOL;
  245. } else {
  246. PutUTF8LeadBits(rune, *p++, 3); //[00000000 0000XXXX]
  247. PutUTF8SixBits(rune, *p++); //[000000XX XXYYYYYY]
  248. PutUTF8SixBits(rune, *p++); //[XXXXYYYY YYZZZZZZ]
  249. // check for overlong encoding and surrogates
  250. if (!IsValidUTF8Rune<3, strictMode>(rune)) {
  251. p -= 3;
  252. rune = BROKEN_RUNE;
  253. return RECODE_BROKENSYMBOL;
  254. }
  255. return RECODE_OK;
  256. }
  257. case 4:
  258. if (p + 4 > e) {
  259. return RECODE_EOINPUT;
  260. } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2]) || !IsUTF8ContinuationByte(p[3])) {
  261. rune = BROKEN_RUNE;
  262. return RECODE_BROKENSYMBOL;
  263. } else {
  264. PutUTF8LeadBits(rune, *p++, 4); //[00000000 00000000 00000XXX]
  265. PutUTF8SixBits(rune, *p++); //[00000000 0000000X XXYYYYYY]
  266. PutUTF8SixBits(rune, *p++); //[00000000 0XXXYYYY YYZZZZZZ]
  267. PutUTF8SixBits(rune, *p++); //[000XXXYY YYYYZZZZ ZZQQQQQQ]
  268. if (!IsValidUTF8Rune<4, strictMode>(rune)) {
  269. p -= 4;
  270. rune = BROKEN_RUNE;
  271. return RECODE_BROKENSYMBOL;
  272. }
  273. return RECODE_OK;
  274. }
  275. default: // >4
  276. rune = BROKEN_RUNE;
  277. return RECODE_BROKENSYMBOL;
  278. }
  279. }
  280. //! writes one unicode symbol into a character sequence encoded UTF8
  281. //! checks for end of the buffer and returns the result of encoding
  282. //! @param rune value of the current character
  283. //! @param rune_len length of the UTF8 byte sequence that has been written
  284. //! @param s pointer to the output buffer
  285. //! @param tail available size of the buffer
  286. inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, size_t tail) {
  287. rune_len = 0;
  288. if (rune < 0x80) {
  289. if (tail <= 0)
  290. return RECODE_EOOUTPUT;
  291. *s = static_cast<unsigned char>(rune);
  292. rune_len = 1;
  293. return RECODE_OK;
  294. }
  295. if (rune < 0x800) {
  296. if (tail <= 1)
  297. return RECODE_EOOUTPUT;
  298. *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
  299. *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
  300. rune_len = 2;
  301. return RECODE_OK;
  302. }
  303. if (rune < 0x10000) {
  304. if (tail <= 2)
  305. return RECODE_EOOUTPUT;
  306. *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
  307. *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
  308. *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
  309. rune_len = 3;
  310. return RECODE_OK;
  311. }
  312. /*if (rune < 0x200000)*/ {
  313. if (tail <= 3)
  314. return RECODE_EOOUTPUT;
  315. *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
  316. *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
  317. *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
  318. *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
  319. rune_len = 4;
  320. return RECODE_OK;
  321. }
  322. }
  323. inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, const unsigned char* end) {
  324. return SafeWriteUTF8Char(rune, rune_len, s, end - s);
  325. }
  326. //! writes one unicode symbol into a character sequence encoded UTF8
  327. //! @attention this function works as @c SafeWriteUTF8Char it does not check
  328. //! the size of the output buffer, it supposes that buffer is long enough
  329. //! @param rune value of the current character
  330. //! @param rune_len length of the UTF8 byte sequence that has been written
  331. //! @param s pointer to the output buffer
  332. inline void WriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s) {
  333. if (rune < 0x80) {
  334. *s = static_cast<unsigned char>(rune);
  335. rune_len = 1;
  336. return;
  337. }
  338. if (rune < 0x800) {
  339. *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
  340. *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
  341. rune_len = 2;
  342. return;
  343. }
  344. if (rune < 0x10000) {
  345. *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
  346. *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
  347. *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
  348. rune_len = 3;
  349. return;
  350. }
  351. /*if (rune < 0x200000)*/ {
  352. *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
  353. *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
  354. *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
  355. *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
  356. rune_len = 4;
  357. }
  358. }
  359. TStringBuf SubstrUTF8(const TStringBuf str, size_t pos, size_t len);
  //! Result of UTF8Detect.
  //! IsUtf accepts every value except NotUTF8.
  enum EUTF8Detect {
      NotUTF8, //!< the only value that IsUtf rejects
      UTF8,    //!< accepted by IsUtf
      ASCII    //!< accepted by IsUtf
  };
  365. EUTF8Detect UTF8Detect(const char* s, size_t len);
  366. inline EUTF8Detect UTF8Detect(const TStringBuf input) {
  367. return UTF8Detect(input.data(), input.size());
  368. }
  369. inline bool IsUtf(const char* input, size_t len) {
  370. return UTF8Detect(input, len) != NotUTF8;
  371. }
  372. inline bool IsUtf(const TStringBuf input) {
  373. return IsUtf(input.data(), input.size());
  374. }
  375. //! returns true, if result is not the same as input, and put it in newString
  376. //! returns false, if result is unmodified
  377. bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString);
  378. TString ToLowerUTF8(const TString& s);
  379. TString ToLowerUTF8(TStringBuf s);
  380. TString ToLowerUTF8(const char* s);
  381. inline TString ToLowerUTF8(const std::string& s) {
  382. return ToLowerUTF8(TStringBuf(s));
  383. }
  384. //! returns true, if result is not the same as input, and put it in newString
  385. //! returns false, if result is unmodified
  386. bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString);
  387. TString ToUpperUTF8(const TString& s);
  388. TString ToUpperUTF8(TStringBuf s);
  389. TString ToUpperUTF8(const char* s);