123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421 |
- #pragma once
- #include "unicode_table.h"
- #include <util/system/defaults.h> // wchar32, ui64, ULL()
// Character class codes (values below CCL_NUM, extracted from the per-character
// info word with CCL_MASK) followed by the flag bits and field offsets of that
// same info word. The quoted character after each class is a sample member.
// TODO: move to NUnicode
enum WC_TYPE {
    // L*: letter classes
    Lu_UPPER = 1,     // 'Ъ'
    Ll_LOWER = 2,     // 'ъ'
    Lt_TITLE = 3,     // 'Ъ'
    Lm_EXTENDER = 4,  // '-'
    Lm_LETTER = 5,    // 'ъ'
    Lo_OTHER = 6,     // '?'
    Lo_IDEOGRAPH = 7, // '?'
    Lo_KATAKANA = 8,  // '?'
    Lo_HIRAGANA = 9,  // '?'
    Lo_LEADING = 10,  // '?'
    Lo_VOWEL = 11,    // '?'
    Lo_TRAILING = 12, // '?'

    // M*: mark classes
    Mn_NONSPACING = 13, // '`'
    Me_ENCLOSING = 14,  // '`'
    Mc_SPACING = 15,    // '`'

    // N*: number classes
    Nd_DIGIT = 16,     // '9' // convert to digit
    Nl_LETTER = 17,    // 'X' // X,V,C,L,I ...
    Nl_IDEOGRAPH = 18, // '?'
    No_OTHER = 19,     // '9'

    // Z*: separator classes
    Zs_SPACE = 20,     // ' ' [\40\240] SPACE ... NO-BREAK SPACE (00A0)
    Zs_ZWSPACE = 21,   // ' ' // nothing ?
    Zl_LINE = 22,      // '\n'
    Zp_PARAGRAPH = 23, // '\n'

    // C*: control, format, unassigned, private and surrogate classes
    Cc_ASCII = 24,     // '\x1A' // can not happen
    Cc_SPACE = 25,     // '\x1A' // can not happen
    Cc_SEPARATOR = 26, // '\x1A' // can not happen
    Cf_FORMAT = 27,    // '\x1A' // nothing ?
    Cf_JOIN = 28,      // '\x1A' // nothing ?
    Cf_BIDI = 29,      // '\x1A' // nothing ?
    Cf_ZWNBSP = 30,    // '\x1A' // nothing ?
    Cn_UNASSIGNED = 0, // '?'  (shares code 0 with Co_PRIVATE)
    Co_PRIVATE = 0,    // '?'  (told apart from unassigned by the IS_PRIVATE flag)
    Cs_LOW = 31,       // '?'
    Cs_HIGH = 32,      // '?'

    // P*: punctuation classes
    Pd_DASH = 33,      // '-'
    Pd_HYPHEN = 34,    // '-' [-] HYPHEN-MINUS
    Ps_START = 35,     // '(' [([{] LEFT PARENTHESIS ... LEFT CURLY BRACKET
    Ps_QUOTE = 36,     // '"'
    Pe_END = 37,       // ')' [)]}] RIGHT PARENTHESIS ... RIGHT CURLY BRACKET
    Pe_QUOTE = 38,     // '"'
    Pi_QUOTE = 39,     // '"'
    Pf_QUOTE = 40,     // '"'
    Pc_CONNECTOR = 41, // '_' [_] LOW LINE
    Po_OTHER = 42,     // '*' [#%&*/@\] NUMBER SIGN ... REVERSE SOLIDUS
    Po_QUOTE = 43,     // '"' ["] QUOTATION MARK
    Po_TERMINAL = 44,  // '.' [!,.:;?] EXCLAMATION MARK ... QUESTION MARK
    Po_EXTENDER = 45,  // '-' [·] MIDDLE DOT (00B7)
    Po_HYPHEN = 46,    // '-'

    // S*: symbol classes
    Sm_MATH = 47,     // '=' [+<=>|~] PLUS SIGN ... TILDE
    Sm_MINUS = 48,    // '-'
    Sc_CURRENCY = 49, // '$' [$] DOLLAR SIGN
    Sk_MODIFIER = 50, // '`' [^`] CIRCUMFLEX ACCENT ... GRAVE ACCENT
    So_OTHER = 51,    // '°' [°] DEGREE SIGN (00B0)

    // Single-quote variants of the punctuation classes
    Ps_SINGLE_QUOTE = 52, // '\'' ['] OPENING SINGLE QUOTE
    Pe_SINGLE_QUOTE = 53, // '\'' ['] CLOSING SINGLE QUOTE
    Pi_SINGLE_QUOTE = 54, // '\'' ['] INITIAL SINGLE QUOTE
    Pf_SINGLE_QUOTE = 55, // '\'' ['] FINAL SINGLE QUOTE
    Po_SINGLE_QUOTE = 56, // '\'' ['] APOSTROPHE and PRIME

    CCL_NUM = 57,    // one past the largest class code
    CCL_MASK = 0x3F, // selects the class code inside the info word

    // Flag bits of the info word
    IS_ASCII_XDIGIT = 1 << 6,
    IS_DIGIT = 1 << 7,
    IS_NONBREAK = 1 << 8,
    IS_PRIVATE = 1 << 9,
    IS_COMPAT = 1 << 10,
    IS_CANON = 1 << 11,
    NFD_QC = 1 << 12,
    NFC_QC = 1 << 13,
    NFKD_QC = 1 << 14,
    NFKC_QC = 1 << 15,

    // Bit offsets of the multi-bit fields of the info word
    BIDI_OFFSET = 16, // bidi class code, read by NUnicode::NPrivate::IsBidi
    SVAL_OFFSET = 22, // digit value, read by ToDigit
};
// Buffer size covering every character class code plus one extra slot;
// keep equal to CCL_NUM + 1.
constexpr size_t DEFCHAR_BUF = 58;
// Turns a character class code into its single-bit 64-bit mask.
// Header-local helper: it is #undef'ed at the end of this file.
#define SHIFT(classCode) (ULL(1) << (classCode))
- namespace NUnicode {
- using TCombining = ui8;
- namespace NPrivate {
- struct TProperty {
- ui32 Info;
- i32 Lower;
- i32 Upper;
- i32 Title;
- TCombining Combining;
- };
- extern const size_t DEFAULT_KEY;
- using TUnidataTable = NUnicodeTable::TTable<NUnicodeTable::TSubtable<NUnicodeTable::UNICODE_TABLE_SHIFT, NUnicodeTable::TValues<TProperty>>>;
- const TUnidataTable& UnidataTable();
- inline const TProperty& CharProperty(wchar32 ch) {
- return UnidataTable().Get(ch, DEFAULT_KEY);
- }
- inline ui32 CharInfo(wchar32 ch) {
- return CharProperty(ch).Info;
- }
- inline bool IsBidi(wchar32 ch, ui32 type) {
- return ((NUnicode::NPrivate::CharInfo(ch) >> BIDI_OFFSET) & 15) == type;
- }
- } // namespace NPrivate
- inline size_t UnicodeInstancesLimit() {
- return NPrivate::UnidataTable().Size();
- }
- inline TCombining DecompositionCombining(wchar32 ch) {
- return NPrivate::CharProperty(ch).Combining;
- }
- inline WC_TYPE CharType(wchar32 ch) {
- return (WC_TYPE)(NUnicode::NPrivate::CharInfo(ch) & CCL_MASK);
- }
- inline bool CharHasType(wchar32 ch, ui64 type_bits) {
- return (SHIFT(NUnicode::CharType(ch)) & type_bits) != 0;
- }
- } // namespace NUnicode
// all useful properties
- inline bool IsComposed(wchar32 ch) {
- return NUnicode::NPrivate::CharInfo(ch) & (IS_COMPAT | IS_CANON);
- }
- inline bool IsCanonComposed(wchar32 ch) {
- return NUnicode::NPrivate::CharInfo(ch) & IS_CANON;
- }
- inline bool IsCompatComposed(wchar32 ch) {
- return NUnicode::NPrivate::CharInfo(ch) & IS_COMPAT;
- }
- inline bool IsWhitespace(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Cc_SPACE) | SHIFT(Zs_SPACE) | SHIFT(Zs_ZWSPACE) | SHIFT(Zl_LINE) | SHIFT(Zp_PARAGRAPH));
- }
- inline bool IsAsciiCntrl(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Cc_ASCII) | SHIFT(Cc_SPACE) | SHIFT(Cc_SEPARATOR));
- }
- inline bool IsBidiCntrl(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Cf_BIDI));
- }
- inline bool IsJoinCntrl(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Cf_JOIN));
- }
- inline bool IsFormatCntrl(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Cf_FORMAT));
- }
- inline bool IsIgnorableCntrl(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Cf_FORMAT) | SHIFT(Cf_JOIN) | SHIFT(Cf_BIDI) | SHIFT(Cf_ZWNBSP));
- }
- inline bool IsCntrl(wchar32 ch) {
- return NUnicode::CharHasType(ch,
- SHIFT(Cf_FORMAT) | SHIFT(Cf_JOIN) | SHIFT(Cf_BIDI) | SHIFT(Cf_ZWNBSP) |
- SHIFT(Cc_ASCII) | SHIFT(Cc_SPACE) | SHIFT(Cc_SEPARATOR));
- }
- inline bool IsZerowidth(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Cf_FORMAT) | SHIFT(Cf_JOIN) | SHIFT(Cf_BIDI) | SHIFT(Cf_ZWNBSP) | SHIFT(Zs_ZWSPACE));
- }
- inline bool IsLineSep(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Zl_LINE));
- }
- inline bool IsParaSep(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Zp_PARAGRAPH));
- }
- inline bool IsDash(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Pd_DASH) | SHIFT(Pd_HYPHEN) | SHIFT(Sm_MINUS));
- }
- inline bool IsHyphen(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Pd_HYPHEN) | SHIFT(Po_HYPHEN));
- }
- inline bool IsQuotation(wchar32 ch) {
- return NUnicode::CharHasType(ch,
- SHIFT(Po_QUOTE) | SHIFT(Ps_QUOTE) | SHIFT(Pe_QUOTE) | SHIFT(Pi_QUOTE) |
- SHIFT(Pf_QUOTE) | SHIFT(Po_SINGLE_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) |
- SHIFT(Pe_SINGLE_QUOTE) | SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE));
- }
- inline bool IsSingleQuotation(wchar32 ch) {
- return NUnicode::CharHasType(ch,
- SHIFT(Po_SINGLE_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) | SHIFT(Pe_SINGLE_QUOTE) |
- SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE));
- }
- inline bool IsTerminal(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Po_TERMINAL));
- }
- inline bool IsPairedPunct(wchar32 ch) {
- return NUnicode::CharHasType(ch,
- SHIFT(Ps_START) | SHIFT(Pe_END) | SHIFT(Ps_QUOTE) | SHIFT(Pe_QUOTE) |
- SHIFT(Pi_QUOTE) | SHIFT(Pf_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) |
- SHIFT(Pe_SINGLE_QUOTE) | SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE));
- }
- inline bool IsLeftPunct(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Ps_START) | SHIFT(Ps_QUOTE) | SHIFT(Ps_SINGLE_QUOTE));
- }
- inline bool IsRightPunct(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Pe_END) | SHIFT(Pe_QUOTE) | SHIFT(Pe_SINGLE_QUOTE));
- }
- inline bool IsCombining(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Mc_SPACING) | SHIFT(Mn_NONSPACING) | SHIFT(Me_ENCLOSING));
- }
- inline bool IsNonspacing(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Mn_NONSPACING) | SHIFT(Me_ENCLOSING));
- }
- inline bool IsAlphabetic(wchar32 ch) {
- return NUnicode::CharHasType(ch,
- SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE) | SHIFT(Lm_EXTENDER) | SHIFT(Lm_LETTER) | SHIFT(Lo_OTHER) | SHIFT(Nl_LETTER));
- }
- inline bool IsIdeographic(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Lo_IDEOGRAPH) | SHIFT(Nl_IDEOGRAPH));
- }
- inline bool IsKatakana(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Lo_KATAKANA));
- }
- inline bool IsHiragana(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Lo_HIRAGANA));
- }
- inline bool IsHangulLeading(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Lo_LEADING));
- }
- inline bool IsHangulVowel(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Lo_VOWEL));
- }
- inline bool IsHangulTrailing(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Lo_TRAILING));
- }
- inline bool IsHexdigit(wchar32 ch) {
- return NUnicode::NPrivate::CharInfo(ch) & IS_ASCII_XDIGIT;
- }
- inline bool IsDecdigit(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Nd_DIGIT));
- }
- inline bool IsNumeric(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Nd_DIGIT) | SHIFT(Nl_LETTER) | SHIFT(Nl_IDEOGRAPH) | SHIFT(No_OTHER));
- }
- inline bool IsCurrency(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Sc_CURRENCY));
- }
- inline bool IsMath(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Sm_MATH));
- }
- inline bool IsSymbol(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Sm_MATH) | SHIFT(Sm_MINUS) | SHIFT(Sc_CURRENCY) | SHIFT(Sk_MODIFIER) | SHIFT(So_OTHER));
- }
- inline bool IsLowSurrogate(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Cs_LOW));
- }
- inline bool IsHighSurrogate(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Cs_HIGH));
- }
- inline bool IsNonbreak(wchar32 ch) {
- return NUnicode::NPrivate::CharInfo(ch) & IS_NONBREAK;
- }
- inline bool IsPrivate(wchar32 ch) {
- return (NUnicode::NPrivate::CharInfo(ch) & IS_PRIVATE) && !NUnicode::CharHasType(ch, SHIFT(Cs_HIGH));
- }
- inline bool IsUnassigned(wchar32 ch) {
- return (NUnicode::CharType(ch) == 0) && !(NUnicode::NPrivate::CharInfo(ch) & IS_PRIVATE);
- }
- inline bool IsPrivateHighSurrogate(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Cs_HIGH)) && (NUnicode::NPrivate::CharInfo(ch) & IS_PRIVATE);
- }
- // transformations
- inline wchar32 ToLower(wchar32 ch) {
- return static_cast<wchar32>(ch + NUnicode::NPrivate::CharProperty(ch).Lower);
- }
- inline wchar32 ToUpper(wchar32 ch) {
- return static_cast<wchar32>(ch + NUnicode::NPrivate::CharProperty(ch).Upper);
- }
- inline wchar32 ToTitle(wchar32 ch) {
- return static_cast<wchar32>(ch + NUnicode::NPrivate::CharProperty(ch).Title);
- }
- inline int ToDigit(wchar32 ch) {
- ui32 i = NUnicode::NPrivate::CharInfo(ch);
- return (i & IS_DIGIT) ? static_cast<int>(i >> SVAL_OFFSET) : -1;
- }
- // BIDI properties
- inline bool IsBidiLeft(wchar32 ch) {
- return NUnicode::NPrivate::IsBidi(ch, 1);
- }
- inline bool IsBidiRight(wchar32 ch) {
- return NUnicode::NPrivate::IsBidi(ch, 2);
- }
- inline bool IsBidiEuronum(wchar32 ch) {
- return NUnicode::NPrivate::IsBidi(ch, 3);
- }
- inline bool IsBidiEurosep(wchar32 ch) {
- return NUnicode::NPrivate::IsBidi(ch, 4);
- }
- inline bool IsBidiEuroterm(wchar32 ch) {
- return NUnicode::NPrivate::IsBidi(ch, 5);
- }
- inline bool IsBidiArabnum(wchar32 ch) {
- return NUnicode::NPrivate::IsBidi(ch, 6);
- }
- inline bool IsBidiCommsep(wchar32 ch) {
- return NUnicode::NPrivate::IsBidi(ch, 7);
- }
- inline bool IsBidiBlocksep(wchar32 ch) {
- return NUnicode::NPrivate::IsBidi(ch, 8);
- }
- inline bool IsBidiSegmsep(wchar32 ch) {
- return NUnicode::NPrivate::IsBidi(ch, 9);
- }
- inline bool IsBidiSpace(wchar32 ch) {
- return NUnicode::NPrivate::IsBidi(ch, 10);
- }
- inline bool IsBidiNeutral(wchar32 ch) {
- return NUnicode::NPrivate::IsBidi(ch, 11);
- }
- inline bool IsBidiNotappl(wchar32 ch) {
- return NUnicode::NPrivate::IsBidi(ch, 0);
- }
- inline bool IsSpace(wchar32 ch) {
- return IsWhitespace(ch);
- }
- inline bool IsLower(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Ll_LOWER));
- }
- inline bool IsUpper(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Lu_UPPER));
- }
- inline bool IsTitle(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Lt_TITLE));
- }
- inline bool IsAlpha(wchar32 ch) {
- return NUnicode::CharHasType(ch,
- SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE) | SHIFT(Lm_LETTER) | SHIFT(Lm_EXTENDER) |
- SHIFT(Lo_OTHER) | SHIFT(Lo_IDEOGRAPH) | SHIFT(Lo_KATAKANA) | SHIFT(Lo_HIRAGANA) |
- SHIFT(Lo_LEADING) | SHIFT(Lo_VOWEL) | SHIFT(Lo_TRAILING));
- }
- inline bool IsAlnum(wchar32 ch) {
- return NUnicode::CharHasType(ch,
- SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE) | SHIFT(Lm_LETTER) | SHIFT(Lm_EXTENDER) |
- SHIFT(Lo_OTHER) | SHIFT(Lo_IDEOGRAPH) | SHIFT(Lo_KATAKANA) | SHIFT(Lo_HIRAGANA) |
- SHIFT(Lo_LEADING) | SHIFT(Lo_VOWEL) | SHIFT(Lo_TRAILING) |
- SHIFT(Nd_DIGIT) | SHIFT(Nl_LETTER) | SHIFT(Nl_IDEOGRAPH) | SHIFT(No_OTHER));
- }
- inline bool IsPunct(wchar32 ch) {
- return NUnicode::CharHasType(ch,
- SHIFT(Pd_DASH) |
- SHIFT(Pd_HYPHEN) | SHIFT(Ps_START) | SHIFT(Ps_QUOTE) | SHIFT(Pe_END) | SHIFT(Pe_QUOTE) | SHIFT(Pc_CONNECTOR) |
- SHIFT(Po_OTHER) | SHIFT(Po_QUOTE) | SHIFT(Po_TERMINAL) | SHIFT(Po_EXTENDER) | SHIFT(Po_HYPHEN) |
- SHIFT(Pi_QUOTE) | SHIFT(Pf_QUOTE));
- }
- inline bool IsXdigit(wchar32 ch) {
- return IsHexdigit(ch);
- }
- inline bool IsDigit(wchar32 ch) {
- return IsDecdigit(ch);
- }
- inline bool IsCommonDigit(wchar32 ch) {
- // IsDigit returns true for some exotic symbols like "VAI DIGIT TWO" (U+A622)
- // and cannot be used safely with FromString() convertors
- const wchar32 ZERO = '0';
- const wchar32 NINE = '9';
- return ch >= ZERO && ch <= NINE;
- }
- inline bool IsGraph(wchar32 ch) {
- return IsAlnum(ch) || IsPunct(ch) || IsSymbol(ch);
- }
- inline bool IsBlank(wchar32 ch) {
- return NUnicode::CharHasType(ch, SHIFT(Zs_SPACE) | SHIFT(Zs_ZWSPACE)) || ch == '\t';
- }
- inline bool IsPrint(wchar32 ch) {
- return IsAlnum(ch) || IsPunct(ch) || IsSymbol(ch) || IsBlank(ch);
- }
- inline bool IsRomanDigit(wchar32 ch) {
- if (NUnicode::CharHasType(ch, SHIFT(Nl_LETTER)) && 0x2160 <= ch && ch <= 0x2188)
- return true;
- if (ch < 127) {
- switch (static_cast<char>(::ToLower(ch))) {
- case 'i':
- case 'v':
- case 'x':
- case 'l':
- case 'c':
- case 'd':
- case 'm':
- return true;
- }
- }
- return false;
- }
- #undef SHIFT
|