unidata.h 15 KB


  1. #pragma once
  2. #include "unicode_table.h"
  3. #include <util/system/defaults.h> // wchar32, ui64, ULL()
  // Character class codes and property flags packed into the per-character
  // Info word (NUnicode::NPrivate::TProperty::Info).
  //
  // The class code lives in the low bits selected by CCL_MASK (see
  // NUnicode::CharType); the IS_* flags occupy single bits above it; the
  // *_OFFSET values are shift amounts for the fields read by
  // NUnicode::NPrivate::IsBidi and ToDigit.
  //
  // The trailing quoted characters are the example characters carried over
  // from the original listing.
  enum WC_TYPE { // TODO: move to NUnicode
      // ---- L*: letter classes ----
      Lu_UPPER = 1,     // 'Ъ'
      Ll_LOWER = 2,     // 'ъ'
      Lt_TITLE = 3,     // 'Ъ'
      Lm_EXTENDER = 4,  // '-'
      Lm_LETTER = 5,    // 'ъ'
      Lo_OTHER = 6,     // '?'
      Lo_IDEOGRAPH = 7, // '?'
      Lo_KATAKANA = 8,  // '?'
      Lo_HIRAGANA = 9,  // '?'
      Lo_LEADING = 10,  // '?'
      Lo_VOWEL = 11,    // '?'
      Lo_TRAILING = 12, // '?'

      // ---- M*: mark classes ----
      Mn_NONSPACING = 13, // '`'
      Me_ENCLOSING = 14,  // '`'
      Mc_SPACING = 15,    // '`'

      // ---- N*: number classes ----
      Nd_DIGIT = 16,     // '9' (convert to digit)
      Nl_LETTER = 17,    // 'X' (X, V, C, L, I ...)
      Nl_IDEOGRAPH = 18, // '?'
      No_OTHER = 19,     // '9'

      // ---- Z*: separator classes ----
      Zs_SPACE = 20,     // ' ' [\40\240] SPACE ... NO-BREAK SPACE (00A0)
      Zs_ZWSPACE = 21,   // ' ' (nothing?)
      Zl_LINE = 22,      // '\n'
      Zp_PARAGRAPH = 23, // '\n'

      // ---- C*: control, format, unassigned, private and surrogate classes ----
      Cc_ASCII = 24,     // '\x1A' (cannot happen)
      Cc_SPACE = 25,     // '\x1A' (cannot happen)
      Cc_SEPARATOR = 26, // '\x1A' (cannot happen)
      Cf_FORMAT = 27,    // '\x1A' (nothing?)
      Cf_JOIN = 28,      // '\x1A' (nothing?)
      Cf_BIDI = 29,      // '\x1A' (nothing?)
      Cf_ZWNBSP = 30,    // '\x1A' (nothing?)
      // Cn_UNASSIGNED and Co_PRIVATE share code 0; IsUnassigned() tells them
      // apart by checking the IS_PRIVATE flag.
      Cn_UNASSIGNED = 0, // '?'
      Co_PRIVATE = 0,    // '?'
      Cs_LOW = 31,       // '?'
      Cs_HIGH = 32,      // '?'

      // ---- P*: punctuation classes ----
      Pd_DASH = 33,      // '-'
      Pd_HYPHEN = 34,    // '-' [-] HYPHEN-MINUS
      Ps_START = 35,     // '(' [([{] LEFT PARENTHESIS ... LEFT CURLY BRACKET
      Ps_QUOTE = 36,     // '"'
      Pe_END = 37,       // ')' [)]}] RIGHT PARENTHESIS ... RIGHT CURLY BRACKET
      Pe_QUOTE = 38,     // '"'
      Pi_QUOTE = 39,     // '"'
      Pf_QUOTE = 40,     // '"'
      Pc_CONNECTOR = 41, // '_' [_] LOW LINE
      Po_OTHER = 42,     // '*' [#%&*/@\] NUMBER SIGN ... REVERSE SOLIDUS
      Po_QUOTE = 43,     // '"' ["] QUOTATION MARK
      Po_TERMINAL = 44,  // '.' [!,.:;?] EXCLAMATION MARK ... QUESTION MARK
      Po_EXTENDER = 45,  // '-' [·] MIDDLE DOT (00B7)
      Po_HYPHEN = 46,    // '-'

      // ---- S*: symbol classes ----
      Sm_MATH = 47,     // '=' [+<=>|~] PLUS SIGN ... TILDE
      Sm_MINUS = 48,    // '-'
      Sc_CURRENCY = 49, // '$' [$] DOLLAR SIGN
      Sk_MODIFIER = 50, // '`' [^`] CIRCUMFLEX ACCENT ... GRAVE ACCENT
      So_OTHER = 51,    // '°' [°] DEGREE SIGN (00B0)

      // ---- single-quote variants of the punctuation classes ----
      Ps_SINGLE_QUOTE = 52, // '\'' ['] OPENING SINGLE QUOTE
      Pe_SINGLE_QUOTE = 53, // '\'' ['] CLOSING SINGLE QUOTE
      Pi_SINGLE_QUOTE = 54, // '\'' ['] INITIAL SINGLE QUOTE
      Pf_SINGLE_QUOTE = 55, // '\'' ['] FINAL SINGLE QUOTE
      Po_SINGLE_QUOTE = 56, // '\'' ['] APOSTROPHE and PRIME

      CCL_NUM = 57,    // one past the highest class code above
      CCL_MASK = 0x3F, // mask applied to Info by NUnicode::CharType

      // ---- single-bit flags tested directly against Info ----
      IS_ASCII_XDIGIT = 1 << 6, // tested by IsHexdigit
      IS_DIGIT = 1 << 7,        // tested by ToDigit
      IS_NONBREAK = 1 << 8,     // tested by IsNonbreak
      IS_PRIVATE = 1 << 9,      // tested by IsPrivate / IsUnassigned
      IS_COMPAT = 1 << 10,      // tested by IsCompatComposed
      IS_CANON = 1 << 11,       // tested by IsCanonComposed

      // Presumably normalization quick-check flags (going by the names);
      // none of the helpers in this header read them -- TODO confirm.
      NFD_QC = 1 << 12,
      NFC_QC = 1 << 13,
      NFKD_QC = 1 << 14,
      NFKC_QC = 1 << 15,

      BIDI_OFFSET = 16, // right-shift used by NUnicode::NPrivate::IsBidi
      SVAL_OFFSET = 22, // right-shift used by ToDigit
  };
  78. const size_t DEFCHAR_BUF = 58; // CCL_NUM + 1
  79. #define SHIFT(i) (ULL(1) << (i))
  80. namespace NUnicode {
  81. using TCombining = ui8;
  82. namespace NPrivate {
  83. struct TProperty {
  84. ui32 Info;
  85. i32 Lower;
  86. i32 Upper;
  87. i32 Title;
  88. TCombining Combining;
  89. };
  90. extern const size_t DEFAULT_KEY;
  91. using TUnidataTable = NUnicodeTable::TTable<NUnicodeTable::TSubtable<NUnicodeTable::UNICODE_TABLE_SHIFT, NUnicodeTable::TValues<TProperty>>>;
  92. const TUnidataTable& UnidataTable();
  93. inline const TProperty& CharProperty(wchar32 ch) {
  94. return UnidataTable().Get(ch, DEFAULT_KEY);
  95. }
  96. inline ui32 CharInfo(wchar32 ch) {
  97. return CharProperty(ch).Info;
  98. }
  99. inline bool IsBidi(wchar32 ch, ui32 type) {
  100. return ((NUnicode::NPrivate::CharInfo(ch) >> BIDI_OFFSET) & 15) == type;
  101. }
  102. } // namespace NPrivate
  103. inline size_t UnicodeInstancesLimit() {
  104. return NPrivate::UnidataTable().Size();
  105. }
  106. inline TCombining DecompositionCombining(wchar32 ch) {
  107. return NPrivate::CharProperty(ch).Combining;
  108. }
  109. inline WC_TYPE CharType(wchar32 ch) {
  110. return (WC_TYPE)(NUnicode::NPrivate::CharInfo(ch) & CCL_MASK);
  111. }
  112. inline bool CharHasType(wchar32 ch, ui64 type_bits) {
  113. return (SHIFT(NUnicode::CharType(ch)) & type_bits) != 0;
  114. }
  115. } // namespace NUnicode
  // All useful properties
  117. inline bool IsComposed(wchar32 ch) {
  118. return NUnicode::NPrivate::CharInfo(ch) & (IS_COMPAT | IS_CANON);
  119. }
  120. inline bool IsCanonComposed(wchar32 ch) {
  121. return NUnicode::NPrivate::CharInfo(ch) & IS_CANON;
  122. }
  123. inline bool IsCompatComposed(wchar32 ch) {
  124. return NUnicode::NPrivate::CharInfo(ch) & IS_COMPAT;
  125. }
  126. inline bool IsWhitespace(wchar32 ch) {
  127. return NUnicode::CharHasType(ch, SHIFT(Cc_SPACE) | SHIFT(Zs_SPACE) | SHIFT(Zs_ZWSPACE) | SHIFT(Zl_LINE) | SHIFT(Zp_PARAGRAPH));
  128. }
  129. inline bool IsAsciiCntrl(wchar32 ch) {
  130. return NUnicode::CharHasType(ch, SHIFT(Cc_ASCII) | SHIFT(Cc_SPACE) | SHIFT(Cc_SEPARATOR));
  131. }
  132. inline bool IsBidiCntrl(wchar32 ch) {
  133. return NUnicode::CharHasType(ch, SHIFT(Cf_BIDI));
  134. }
  135. inline bool IsJoinCntrl(wchar32 ch) {
  136. return NUnicode::CharHasType(ch, SHIFT(Cf_JOIN));
  137. }
  138. inline bool IsFormatCntrl(wchar32 ch) {
  139. return NUnicode::CharHasType(ch, SHIFT(Cf_FORMAT));
  140. }
  141. inline bool IsIgnorableCntrl(wchar32 ch) {
  142. return NUnicode::CharHasType(ch, SHIFT(Cf_FORMAT) | SHIFT(Cf_JOIN) | SHIFT(Cf_BIDI) | SHIFT(Cf_ZWNBSP));
  143. }
  144. inline bool IsCntrl(wchar32 ch) {
  145. return NUnicode::CharHasType(ch,
  146. SHIFT(Cf_FORMAT) | SHIFT(Cf_JOIN) | SHIFT(Cf_BIDI) | SHIFT(Cf_ZWNBSP) |
  147. SHIFT(Cc_ASCII) | SHIFT(Cc_SPACE) | SHIFT(Cc_SEPARATOR));
  148. }
  149. inline bool IsZerowidth(wchar32 ch) {
  150. return NUnicode::CharHasType(ch, SHIFT(Cf_FORMAT) | SHIFT(Cf_JOIN) | SHIFT(Cf_BIDI) | SHIFT(Cf_ZWNBSP) | SHIFT(Zs_ZWSPACE));
  151. }
  152. inline bool IsLineSep(wchar32 ch) {
  153. return NUnicode::CharHasType(ch, SHIFT(Zl_LINE));
  154. }
  155. inline bool IsParaSep(wchar32 ch) {
  156. return NUnicode::CharHasType(ch, SHIFT(Zp_PARAGRAPH));
  157. }
  158. inline bool IsDash(wchar32 ch) {
  159. return NUnicode::CharHasType(ch, SHIFT(Pd_DASH) | SHIFT(Pd_HYPHEN) | SHIFT(Sm_MINUS));
  160. }
  161. inline bool IsHyphen(wchar32 ch) {
  162. return NUnicode::CharHasType(ch, SHIFT(Pd_HYPHEN) | SHIFT(Po_HYPHEN));
  163. }
  164. inline bool IsQuotation(wchar32 ch) {
  165. return NUnicode::CharHasType(ch,
  166. SHIFT(Po_QUOTE) | SHIFT(Ps_QUOTE) | SHIFT(Pe_QUOTE) | SHIFT(Pi_QUOTE) |
  167. SHIFT(Pf_QUOTE) | SHIFT(Po_SINGLE_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) |
  168. SHIFT(Pe_SINGLE_QUOTE) | SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE));
  169. }
  170. inline bool IsSingleQuotation(wchar32 ch) {
  171. return NUnicode::CharHasType(ch,
  172. SHIFT(Po_SINGLE_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) | SHIFT(Pe_SINGLE_QUOTE) |
  173. SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE));
  174. }
  175. inline bool IsTerminal(wchar32 ch) {
  176. return NUnicode::CharHasType(ch, SHIFT(Po_TERMINAL));
  177. }
  178. inline bool IsPairedPunct(wchar32 ch) {
  179. return NUnicode::CharHasType(ch,
  180. SHIFT(Ps_START) | SHIFT(Pe_END) | SHIFT(Ps_QUOTE) | SHIFT(Pe_QUOTE) |
  181. SHIFT(Pi_QUOTE) | SHIFT(Pf_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) |
  182. SHIFT(Pe_SINGLE_QUOTE) | SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE));
  183. }
  184. inline bool IsLeftPunct(wchar32 ch) {
  185. return NUnicode::CharHasType(ch, SHIFT(Ps_START) | SHIFT(Ps_QUOTE) | SHIFT(Ps_SINGLE_QUOTE));
  186. }
  187. inline bool IsRightPunct(wchar32 ch) {
  188. return NUnicode::CharHasType(ch, SHIFT(Pe_END) | SHIFT(Pe_QUOTE) | SHIFT(Pe_SINGLE_QUOTE));
  189. }
  190. inline bool IsCombining(wchar32 ch) {
  191. return NUnicode::CharHasType(ch, SHIFT(Mc_SPACING) | SHIFT(Mn_NONSPACING) | SHIFT(Me_ENCLOSING));
  192. }
  193. inline bool IsNonspacing(wchar32 ch) {
  194. return NUnicode::CharHasType(ch, SHIFT(Mn_NONSPACING) | SHIFT(Me_ENCLOSING));
  195. }
  196. inline bool IsAlphabetic(wchar32 ch) {
  197. return NUnicode::CharHasType(ch,
  198. SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE) | SHIFT(Lm_EXTENDER) | SHIFT(Lm_LETTER) | SHIFT(Lo_OTHER) | SHIFT(Nl_LETTER));
  199. }
  200. inline bool IsIdeographic(wchar32 ch) {
  201. return NUnicode::CharHasType(ch, SHIFT(Lo_IDEOGRAPH) | SHIFT(Nl_IDEOGRAPH));
  202. }
  203. inline bool IsKatakana(wchar32 ch) {
  204. return NUnicode::CharHasType(ch, SHIFT(Lo_KATAKANA));
  205. }
  206. inline bool IsHiragana(wchar32 ch) {
  207. return NUnicode::CharHasType(ch, SHIFT(Lo_HIRAGANA));
  208. }
  209. inline bool IsHangulLeading(wchar32 ch) {
  210. return NUnicode::CharHasType(ch, SHIFT(Lo_LEADING));
  211. }
  212. inline bool IsHangulVowel(wchar32 ch) {
  213. return NUnicode::CharHasType(ch, SHIFT(Lo_VOWEL));
  214. }
  215. inline bool IsHangulTrailing(wchar32 ch) {
  216. return NUnicode::CharHasType(ch, SHIFT(Lo_TRAILING));
  217. }
  218. inline bool IsHexdigit(wchar32 ch) {
  219. return NUnicode::NPrivate::CharInfo(ch) & IS_ASCII_XDIGIT;
  220. }
  221. inline bool IsDecdigit(wchar32 ch) {
  222. return NUnicode::CharHasType(ch, SHIFT(Nd_DIGIT));
  223. }
  224. inline bool IsNumeric(wchar32 ch) {
  225. return NUnicode::CharHasType(ch, SHIFT(Nd_DIGIT) | SHIFT(Nl_LETTER) | SHIFT(Nl_IDEOGRAPH) | SHIFT(No_OTHER));
  226. }
  227. inline bool IsCurrency(wchar32 ch) {
  228. return NUnicode::CharHasType(ch, SHIFT(Sc_CURRENCY));
  229. }
  230. inline bool IsMath(wchar32 ch) {
  231. return NUnicode::CharHasType(ch, SHIFT(Sm_MATH));
  232. }
  233. inline bool IsSymbol(wchar32 ch) {
  234. return NUnicode::CharHasType(ch, SHIFT(Sm_MATH) | SHIFT(Sm_MINUS) | SHIFT(Sc_CURRENCY) | SHIFT(Sk_MODIFIER) | SHIFT(So_OTHER));
  235. }
  236. inline bool IsLowSurrogate(wchar32 ch) {
  237. return NUnicode::CharHasType(ch, SHIFT(Cs_LOW));
  238. }
  239. inline bool IsHighSurrogate(wchar32 ch) {
  240. return NUnicode::CharHasType(ch, SHIFT(Cs_HIGH));
  241. }
  242. inline bool IsNonbreak(wchar32 ch) {
  243. return NUnicode::NPrivate::CharInfo(ch) & IS_NONBREAK;
  244. }
  245. inline bool IsPrivate(wchar32 ch) {
  246. return (NUnicode::NPrivate::CharInfo(ch) & IS_PRIVATE) && !NUnicode::CharHasType(ch, SHIFT(Cs_HIGH));
  247. }
  248. inline bool IsUnassigned(wchar32 ch) {
  249. return (NUnicode::CharType(ch) == 0) && !(NUnicode::NPrivate::CharInfo(ch) & IS_PRIVATE);
  250. }
  251. inline bool IsPrivateHighSurrogate(wchar32 ch) {
  252. return NUnicode::CharHasType(ch, SHIFT(Cs_HIGH)) && (NUnicode::NPrivate::CharInfo(ch) & IS_PRIVATE);
  253. }
  254. // transformations
  255. inline wchar32 ToLower(wchar32 ch) {
  256. return static_cast<wchar32>(ch + NUnicode::NPrivate::CharProperty(ch).Lower);
  257. }
  258. inline wchar32 ToUpper(wchar32 ch) {
  259. return static_cast<wchar32>(ch + NUnicode::NPrivate::CharProperty(ch).Upper);
  260. }
  261. inline wchar32 ToTitle(wchar32 ch) {
  262. return static_cast<wchar32>(ch + NUnicode::NPrivate::CharProperty(ch).Title);
  263. }
  264. inline int ToDigit(wchar32 ch) {
  265. ui32 i = NUnicode::NPrivate::CharInfo(ch);
  266. return (i & IS_DIGIT) ? static_cast<int>(i >> SVAL_OFFSET) : -1;
  267. }
  268. // BIDI properties
  269. inline bool IsBidiLeft(wchar32 ch) {
  270. return NUnicode::NPrivate::IsBidi(ch, 1);
  271. }
  272. inline bool IsBidiRight(wchar32 ch) {
  273. return NUnicode::NPrivate::IsBidi(ch, 2);
  274. }
  275. inline bool IsBidiEuronum(wchar32 ch) {
  276. return NUnicode::NPrivate::IsBidi(ch, 3);
  277. }
  278. inline bool IsBidiEurosep(wchar32 ch) {
  279. return NUnicode::NPrivate::IsBidi(ch, 4);
  280. }
  281. inline bool IsBidiEuroterm(wchar32 ch) {
  282. return NUnicode::NPrivate::IsBidi(ch, 5);
  283. }
  284. inline bool IsBidiArabnum(wchar32 ch) {
  285. return NUnicode::NPrivate::IsBidi(ch, 6);
  286. }
  287. inline bool IsBidiCommsep(wchar32 ch) {
  288. return NUnicode::NPrivate::IsBidi(ch, 7);
  289. }
  290. inline bool IsBidiBlocksep(wchar32 ch) {
  291. return NUnicode::NPrivate::IsBidi(ch, 8);
  292. }
  293. inline bool IsBidiSegmsep(wchar32 ch) {
  294. return NUnicode::NPrivate::IsBidi(ch, 9);
  295. }
  296. inline bool IsBidiSpace(wchar32 ch) {
  297. return NUnicode::NPrivate::IsBidi(ch, 10);
  298. }
  299. inline bool IsBidiNeutral(wchar32 ch) {
  300. return NUnicode::NPrivate::IsBidi(ch, 11);
  301. }
  302. inline bool IsBidiNotappl(wchar32 ch) {
  303. return NUnicode::NPrivate::IsBidi(ch, 0);
  304. }
  305. inline bool IsSpace(wchar32 ch) {
  306. return IsWhitespace(ch);
  307. }
  308. inline bool IsLower(wchar32 ch) {
  309. return NUnicode::CharHasType(ch, SHIFT(Ll_LOWER));
  310. }
  311. inline bool IsUpper(wchar32 ch) {
  312. return NUnicode::CharHasType(ch, SHIFT(Lu_UPPER));
  313. }
  314. inline bool IsTitle(wchar32 ch) {
  315. return NUnicode::CharHasType(ch, SHIFT(Lt_TITLE));
  316. }
  317. inline bool IsAlpha(wchar32 ch) {
  318. return NUnicode::CharHasType(ch,
  319. SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE) | SHIFT(Lm_LETTER) | SHIFT(Lm_EXTENDER) |
  320. SHIFT(Lo_OTHER) | SHIFT(Lo_IDEOGRAPH) | SHIFT(Lo_KATAKANA) | SHIFT(Lo_HIRAGANA) |
  321. SHIFT(Lo_LEADING) | SHIFT(Lo_VOWEL) | SHIFT(Lo_TRAILING));
  322. }
  323. inline bool IsAlnum(wchar32 ch) {
  324. return NUnicode::CharHasType(ch,
  325. SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE) | SHIFT(Lm_LETTER) | SHIFT(Lm_EXTENDER) |
  326. SHIFT(Lo_OTHER) | SHIFT(Lo_IDEOGRAPH) | SHIFT(Lo_KATAKANA) | SHIFT(Lo_HIRAGANA) |
  327. SHIFT(Lo_LEADING) | SHIFT(Lo_VOWEL) | SHIFT(Lo_TRAILING) |
  328. SHIFT(Nd_DIGIT) | SHIFT(Nl_LETTER) | SHIFT(Nl_IDEOGRAPH) | SHIFT(No_OTHER));
  329. }
  330. inline bool IsPunct(wchar32 ch) {
  331. return NUnicode::CharHasType(ch,
  332. SHIFT(Pd_DASH) |
  333. SHIFT(Pd_HYPHEN) | SHIFT(Ps_START) | SHIFT(Ps_QUOTE) | SHIFT(Pe_END) | SHIFT(Pe_QUOTE) | SHIFT(Pc_CONNECTOR) |
  334. SHIFT(Po_OTHER) | SHIFT(Po_QUOTE) | SHIFT(Po_TERMINAL) | SHIFT(Po_EXTENDER) | SHIFT(Po_HYPHEN) |
  335. SHIFT(Pi_QUOTE) | SHIFT(Pf_QUOTE));
  336. }
  337. inline bool IsXdigit(wchar32 ch) {
  338. return IsHexdigit(ch);
  339. }
  340. inline bool IsDigit(wchar32 ch) {
  341. return IsDecdigit(ch);
  342. }
  343. inline bool IsCommonDigit(wchar32 ch) {
  344. // IsDigit returns true for some exotic symbols like "VAI DIGIT TWO" (U+A622)
  345. // and cannot be used safely with FromString() convertors
  346. const wchar32 ZERO = '0';
  347. const wchar32 NINE = '9';
  348. return ch >= ZERO && ch <= NINE;
  349. }
  350. inline bool IsGraph(wchar32 ch) {
  351. return IsAlnum(ch) || IsPunct(ch) || IsSymbol(ch);
  352. }
  353. inline bool IsBlank(wchar32 ch) {
  354. return NUnicode::CharHasType(ch, SHIFT(Zs_SPACE) | SHIFT(Zs_ZWSPACE)) || ch == '\t';
  355. }
  356. inline bool IsPrint(wchar32 ch) {
  357. return IsAlnum(ch) || IsPunct(ch) || IsSymbol(ch) || IsBlank(ch);
  358. }
  359. inline bool IsRomanDigit(wchar32 ch) {
  360. if (NUnicode::CharHasType(ch, SHIFT(Nl_LETTER)) && 0x2160 <= ch && ch <= 0x2188) {
  361. return true;
  362. }
  363. if (ch < 127) {
  364. switch (static_cast<char>(::ToLower(ch))) {
  365. case 'i':
  366. case 'v':
  367. case 'x':
  368. case 'l':
  369. case 'c':
  370. case 'd':
  371. case 'm':
  372. return true;
  373. }
  374. }
  375. return false;
  376. }
  377. #undef SHIFT