unicode.h 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489
  1. // -*- C++ -*-
  2. //===----------------------------------------------------------------------===//
  3. //
  4. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  5. // See https://llvm.org/LICENSE.txt for license information.
  6. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  7. //
  8. //===----------------------------------------------------------------------===//
  9. #ifndef _LIBCPP___FORMAT_UNICODE_H
  10. #define _LIBCPP___FORMAT_UNICODE_H
  11. #include <__assert>
  12. #include <__config>
  13. #include <__format/extended_grapheme_cluster_table.h>
  14. #include <__type_traits/make_unsigned.h>
  15. #include <__utility/unreachable.h>
  16. #include <bit>
  17. #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
  18. # pragma GCC system_header
  19. #endif
  20. _LIBCPP_BEGIN_NAMESPACE_STD
  21. #if _LIBCPP_STD_VER > 17
  22. namespace __unicode {
  23. # if _LIBCPP_STD_VER > 20
  24. /// The result of consuming a code point using P2286' semantics
  25. ///
  26. /// TODO FMT Combine __consume and __consume_p2286 in one function.
  27. struct __consume_p2286_result {
  28. // A size of 0 means well formed. This to differenciate between
  29. // a valid code point and a code unit that's invalid like 0b11111xxx.
  30. int __ill_formed_size;
  31. // If well formed the consumed code point.
  32. // Otherwise the ill-formed code units as unsigned 8-bit values. They are
  33. // stored in reverse order, to make it easier to extract the values.
  34. char32_t __value;
  35. };
  36. # endif // _LIBCPP_STD_VER > 20
  37. # ifndef _LIBCPP_HAS_NO_UNICODE
  38. /// Implements the grapheme cluster boundary rules
  39. ///
  40. /// These rules are used to implement format's width estimation as stated in
  41. /// [format.string.std]/11
  42. ///
  43. /// The Standard refers to UAX \#29 for Unicode 12.0.0
  44. /// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  45. ///
  46. /// The data tables used are
  47. /// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
  48. /// https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
  49. /// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt (for testing only)
  50. inline constexpr char32_t __replacement_character = U'\ufffd';
  51. _LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(const char* __char, int __count) {
  52. do {
  53. if ((*__char & 0b1000'0000) != 0b1000'0000)
  54. return false;
  55. --__count;
  56. ++__char;
  57. } while (__count);
  58. return true;
  59. }
  60. /// Helper class to extract a code unit from a Unicode character range.
  61. ///
  62. /// The stored range is a view. There are multiple specialization for different
  63. /// character types.
  64. template <class _CharT>
  65. class __code_point_view;
  66. /// UTF-8 specialization.
  67. template <>
  68. class __code_point_view<char> {
  69. public:
  70. _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(const char* __first, const char* __last)
  71. : __first_(__first), __last_(__last) {}
  72. _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
  73. _LIBCPP_HIDE_FROM_ABI constexpr const char* __position() const noexcept { return __first_; }
  74. _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
  75. _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
  76. // Based on the number of leading 1 bits the number of code units in the
  77. // code point can be determined. See
  78. // https://en.wikipedia.org/wiki/UTF-8#Encoding
  79. switch (_VSTD::countl_one(static_cast<unsigned char>(*__first_))) {
  80. case 0:
  81. return *__first_++;
  82. case 2:
  83. if (__last_ - __first_ < 2 || !__unicode::__is_continuation(__first_ + 1, 1)) [[unlikely]]
  84. break;
  85. else {
  86. char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
  87. __value <<= 6;
  88. __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
  89. return __value;
  90. }
  91. case 3:
  92. if (__last_ - __first_ < 3 || !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]]
  93. break;
  94. else {
  95. char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
  96. __value <<= 6;
  97. __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
  98. __value <<= 6;
  99. __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
  100. return __value;
  101. }
  102. case 4:
  103. if (__last_ - __first_ < 4 || !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]]
  104. break;
  105. else {
  106. char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
  107. __value <<= 6;
  108. __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
  109. __value <<= 6;
  110. __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
  111. __value <<= 6;
  112. __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
  113. return __value;
  114. }
  115. }
  116. // An invalid number of leading ones can be garbage or a code unit in the
  117. // middle of a code point. By consuming one code unit the parser may get
  118. // "in sync" after a few code units.
  119. ++__first_;
  120. return __replacement_character;
  121. }
  122. # if _LIBCPP_STD_VER > 20
  123. _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
  124. _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
  125. // Based on the number of leading 1 bits the number of code units in the
  126. // code point can be determined. See
  127. // https://en.wikipedia.org/wiki/UTF-8#Encoding
  128. switch (std::countl_one(static_cast<unsigned char>(*__first_))) {
  129. case 0:
  130. return {0, static_cast<unsigned char>(*__first_++)};
  131. case 2:
  132. if (__last_ - __first_ < 2) [[unlikely]]
  133. break;
  134. if (__unicode::__is_continuation(__first_ + 1, 1)) {
  135. char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
  136. __value <<= 6;
  137. __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
  138. return {0, __value};
  139. }
  140. break;
  141. case 3:
  142. if (__last_ - __first_ < 3) [[unlikely]]
  143. break;
  144. if (__unicode::__is_continuation(__first_ + 1, 2)) {
  145. char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
  146. __value <<= 6;
  147. __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
  148. __value <<= 6;
  149. __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
  150. return {0, __value};
  151. }
  152. break;
  153. case 4:
  154. if (__last_ - __first_ < 4) [[unlikely]]
  155. break;
  156. if (__unicode::__is_continuation(__first_ + 1, 3)) {
  157. char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
  158. __value <<= 6;
  159. __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
  160. __value <<= 6;
  161. __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
  162. __value <<= 6;
  163. __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
  164. if (__value > 0x10FFFF) // Outside the valid Unicode range?
  165. return {4, __value};
  166. return {0, __value};
  167. }
  168. break;
  169. }
  170. // An invalid number of leading ones can be garbage or a code unit in the
  171. // middle of a code point. By consuming one code unit the parser may get
  172. // "in sync" after a few code units.
  173. return {1, static_cast<unsigned char>(*__first_++)};
  174. }
  175. # endif // _LIBCPP_STD_VER > 20
  176. private:
  177. const char* __first_;
  178. const char* __last_;
  179. };
  180. # ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
  181. _LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_high(wchar_t __value) {
  182. return __value >= 0xd800 && __value <= 0xdbff;
  183. }
  184. _LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_low(wchar_t __value) {
  185. return __value >= 0xdc00 && __value <= 0xdfff;
  186. }
  187. /// This specialization depends on the size of wchar_t
  188. /// - 2 UTF-16 (for example Windows and AIX)
  189. /// - 4 UTF-32 (for example Linux)
  190. template <>
  191. class __code_point_view<wchar_t> {
  192. public:
  193. static_assert(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4, "sizeof(wchar_t) has a not implemented value");
  194. _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(const wchar_t* __first, const wchar_t* __last)
  195. : __first_(__first), __last_(__last) {}
  196. _LIBCPP_HIDE_FROM_ABI constexpr const wchar_t* __position() const noexcept { return __first_; }
  197. _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
  198. _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
  199. _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
  200. if constexpr (sizeof(wchar_t) == 2) {
  201. char32_t __result = *__first_++;
  202. // Is the code unit part of a surrogate pair? See
  203. // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
  204. if (__result >= 0xd800 && __result <= 0xDfff) {
  205. // Malformed Unicode.
  206. if (__first_ == __last_) [[unlikely]]
  207. return __replacement_character;
  208. __result -= 0xd800;
  209. __result <<= 10;
  210. __result += *__first_++ - 0xdc00;
  211. __result += 0x10000;
  212. }
  213. return __result;
  214. } else if constexpr (sizeof(wchar_t) == 4) {
  215. char32_t __result = *__first_++;
  216. if (__result > 0x10FFFF) [[unlikely]]
  217. return __replacement_character;
  218. return __result;
  219. } else {
  220. __libcpp_unreachable();
  221. }
  222. }
  223. # if _LIBCPP_STD_VER > 20
  224. _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
  225. _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
  226. char32_t __result = *__first_++;
  227. if constexpr (sizeof(wchar_t) == 2) {
  228. // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
  229. if (__is_surrogate_pair_high(__result)) {
  230. // Malformed Unicode.
  231. if (__first_ == __last_ || !__is_surrogate_pair_low(*(__first_ + 1))) [[unlikely]]
  232. return {1, __result};
  233. __result -= 0xd800;
  234. __result <<= 10;
  235. __result += *__first_++ - 0xdc00;
  236. __result += 0x10000;
  237. } else if (__is_surrogate_pair_low(__result))
  238. // A code point shouldn't start with the low surrogate pair
  239. return {1, __result};
  240. } else {
  241. if (__result > 0x10FFFF) [[unlikely]]
  242. return {1, __result};
  243. }
  244. return {0, __result};
  245. }
  246. # endif // _LIBCPP_STD_VER > 20
  247. private:
  248. const wchar_t* __first_;
  249. const wchar_t* __last_;
  250. };
  251. # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
  252. _LIBCPP_HIDE_FROM_ABI constexpr bool __at_extended_grapheme_cluster_break(
  253. bool& __ri_break_allowed,
  254. bool __has_extened_pictographic,
  255. __extended_grapheme_custer_property_boundary::__property __prev,
  256. __extended_grapheme_custer_property_boundary::__property __next) {
  257. using __extended_grapheme_custer_property_boundary::__property;
  258. __has_extened_pictographic |= __prev == __property::__Extended_Pictographic;
  259. // https://www.unicode.org/reports/tr29/tr29-39.html#Grapheme_Cluster_Boundary_Rules
  260. // *** Break at the start and end of text, unless the text is empty. ***
  261. _LIBCPP_ASSERT(__prev != __property::__sot, "should be handled in the constructor"); // GB1
  262. _LIBCPP_ASSERT(__prev != __property::__eot, "should be handled by our caller"); // GB2
  263. // *** Do not break between a CR and LF. Otherwise, break before and after controls. ***
  264. if (__prev == __property::__CR && __next == __property::__LF) // GB3
  265. return false;
  266. if (__prev == __property::__Control || __prev == __property::__CR || __prev == __property::__LF) // GB4
  267. return true;
  268. if (__next == __property::__Control || __next == __property::__CR || __next == __property::__LF) // GB5
  269. return true;
  270. // *** Do not break Hangul syllable sequences. ***
  271. if (__prev == __property::__L &&
  272. (__next == __property::__L || __next == __property::__V || __next == __property::__LV ||
  273. __next == __property::__LVT)) // GB6
  274. return false;
  275. if ((__prev == __property::__LV || __prev == __property::__V) &&
  276. (__next == __property::__V || __next == __property::__T)) // GB7
  277. return false;
  278. if ((__prev == __property::__LVT || __prev == __property::__T) && __next == __property::__T) // GB8
  279. return false;
  280. // *** Do not break before extending characters or ZWJ. ***
  281. if (__next == __property::__Extend || __next == __property::__ZWJ)
  282. return false; // GB9
  283. // *** Do not break before SpacingMarks, or after Prepend characters. ***
  284. if (__next == __property::__SpacingMark) // GB9a
  285. return false;
  286. if (__prev == __property::__Prepend) // GB9b
  287. return false;
  288. // *** Do not break within emoji modifier sequences or emoji zwj sequences. ***
  289. // GB11 \p{Extended_Pictographic} Extend* ZWJ x \p{Extended_Pictographic}
  290. //
  291. // Note that several parts of this rule are matched by GB9: Any x (Extend | ZWJ)
  292. // - \p{Extended_Pictographic} x Extend
  293. // - Extend x Extend
  294. // - \p{Extended_Pictographic} x ZWJ
  295. // - Extend x ZWJ
  296. //
  297. // So the only case left to test is
  298. // - \p{Extended_Pictographic}' x ZWJ x \p{Extended_Pictographic}
  299. // where \p{Extended_Pictographic}' is stored in __has_extened_pictographic
  300. if (__has_extened_pictographic && __prev == __property::__ZWJ && __next == __property::__Extended_Pictographic)
  301. return false;
  302. // *** Do not break within emoji flag sequences ***
  303. // That is, do not break between regional indicator (RI) symbols if there
  304. // is an odd number of RI characters before the break point.
  305. if (__prev == __property::__Regional_Indicator && __next == __property::__Regional_Indicator) { // GB12 + GB13
  306. __ri_break_allowed = !__ri_break_allowed;
  307. return __ri_break_allowed;
  308. }
  309. // *** Otherwise, break everywhere. ***
  310. return true; // GB999
  311. }
  312. /// Helper class to extract an extended grapheme cluster from a Unicode character range.
  313. ///
  314. /// This function is used to determine the column width of an extended grapheme
  315. /// cluster. In order to do that only the first code point is evaluated.
  316. /// Therefore only this code point is extracted.
  317. template <class _CharT>
  318. class __extended_grapheme_cluster_view {
  319. public:
  320. _LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(const _CharT* __first, const _CharT* __last)
  321. : __code_point_view_(__first, __last),
  322. __next_code_point_(__code_point_view_.__consume()),
  323. __next_prop_(__extended_grapheme_custer_property_boundary::__get_property(__next_code_point_)) {}
  324. struct __cluster {
  325. /// The first code point of the extended grapheme cluster.
  326. ///
  327. /// The first code point is used to estimate the width of the extended
  328. /// grapheme cluster.
  329. char32_t __code_point_;
  330. /// Points one beyond the last code unit in the extended grapheme cluster.
  331. ///
  332. /// It's expected the caller has the start position and thus can determine
  333. /// the code unit range of the extended grapheme cluster.
  334. const _CharT* __last_;
  335. };
  336. _LIBCPP_HIDE_FROM_ABI constexpr __cluster __consume() {
  337. _LIBCPP_ASSERT(
  338. __next_prop_ != __extended_grapheme_custer_property_boundary::__property::__eot,
  339. "can't move beyond the end of input");
  340. char32_t __code_point = __next_code_point_;
  341. if (!__code_point_view_.__at_end())
  342. return {__code_point, __get_break()};
  343. __next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
  344. return {__code_point, __code_point_view_.__position()};
  345. }
  346. private:
  347. __code_point_view<_CharT> __code_point_view_;
  348. char32_t __next_code_point_;
  349. __extended_grapheme_custer_property_boundary::__property __next_prop_;
  350. _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* __get_break() {
  351. bool __ri_break_allowed = true;
  352. bool __has_extened_pictographic = false;
  353. while (true) {
  354. const _CharT* __result = __code_point_view_.__position();
  355. __extended_grapheme_custer_property_boundary::__property __prev = __next_prop_;
  356. if (__code_point_view_.__at_end()) {
  357. __next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
  358. return __result;
  359. }
  360. __next_code_point_ = __code_point_view_.__consume();
  361. __next_prop_ = __extended_grapheme_custer_property_boundary::__get_property(__next_code_point_);
  362. __has_extened_pictographic |=
  363. __prev == __extended_grapheme_custer_property_boundary::__property::__Extended_Pictographic;
  364. if (__at_extended_grapheme_cluster_break(__ri_break_allowed, __has_extened_pictographic, __prev, __next_prop_))
  365. return __result;
  366. }
  367. }
  368. };
  369. template <class _CharT>
  370. __extended_grapheme_cluster_view(const _CharT*, const _CharT*) -> __extended_grapheme_cluster_view<_CharT>;
  371. # else // _LIBCPP_HAS_NO_UNICODE
  372. // For ASCII every character is a "code point".
  373. // This makes it easier to write code agnostic of the _LIBCPP_HAS_NO_UNICODE define.
  374. template <class _CharT>
  375. class __code_point_view {
  376. public:
  377. _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(const _CharT* __first, const _CharT* __last)
  378. : __first_(__first), __last_(__last) {}
  379. _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
  380. _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* __position() const noexcept { return __first_; }
  381. _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
  382. _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
  383. return *__first_++;
  384. }
  385. # if _LIBCPP_STD_VER > 20
  386. _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
  387. _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
  388. return {0, std::make_unsigned_t<_CharT>(*__first_++)};
  389. }
  390. # endif // _LIBCPP_STD_VER > 20
  391. private:
  392. const _CharT* __first_;
  393. const _CharT* __last_;
  394. };
  395. # endif // _LIBCPP_HAS_NO_UNICODE
  396. } // namespace __unicode
  397. #endif //_LIBCPP_STD_VER > 17
  398. _LIBCPP_END_NAMESPACE_STD
  399. #endif // _LIBCPP___FORMAT_UNICODE_H