uprops.cpp 40 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2002-2016, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: uprops.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2002feb24
  16. * created by: Markus W. Scherer
  17. *
  18. * Implementations for mostly non-core Unicode character properties
  19. * stored in uprops.icu.
  20. *
  21. * With the APIs implemented here, almost all properties files and
  22. * their associated implementation files are used from this file,
  23. * including those for normalization and case mappings.
  24. */
  25. #include "unicode/utypes.h"
  26. #include "unicode/uchar.h"
  27. #include "unicode/ucptrie.h"
  28. #include "unicode/udata.h"
  29. #include "unicode/unorm2.h"
  30. #include "unicode/uscript.h"
  31. #include "unicode/ustring.h"
  32. #include "unicode/utf16.h"
  33. #include "cstring.h"
  34. #include "emojiprops.h"
  35. #include "mutex.h"
  36. #include "normalizer2impl.h"
  37. #include "umutex.h"
  38. #include "ubidi_props.h"
  39. #include "uprops.h"
  40. #include "ucase.h"
  41. #include "ucln_cmn.h"
  42. #include "ulayout_props.h"
  43. #include "ustr_imp.h"
  44. U_NAMESPACE_USE
  45. // Unicode text layout properties data -----------------------------------------
  46. namespace {
  47. icu::UInitOnce gLayoutInitOnce {};
  48. UDataMemory *gLayoutMemory = nullptr;
  49. UCPTrie *gInpcTrie = nullptr; // Indic_Positional_Category
  50. UCPTrie *gInscTrie = nullptr; // Indic_Syllabic_Category
  51. UCPTrie *gVoTrie = nullptr; // Vertical_Orientation
  52. int32_t gMaxInpcValue = 0;
  53. int32_t gMaxInscValue = 0;
  54. int32_t gMaxVoValue = 0;
  55. UBool U_CALLCONV uprops_cleanup() {
  56. udata_close(gLayoutMemory);
  57. gLayoutMemory = nullptr;
  58. ucptrie_close(gInpcTrie);
  59. gInpcTrie = nullptr;
  60. ucptrie_close(gInscTrie);
  61. gInscTrie = nullptr;
  62. ucptrie_close(gVoTrie);
  63. gVoTrie = nullptr;
  64. gMaxInpcValue = 0;
  65. gMaxInscValue = 0;
  66. gMaxVoValue = 0;
  67. gLayoutInitOnce.reset();
  68. return true;
  69. }
  70. UBool U_CALLCONV
  71. ulayout_isAcceptable(void * /*context*/,
  72. const char * /* type */, const char * /*name*/,
  73. const UDataInfo *pInfo) {
  74. return pInfo->size >= 20 &&
  75. pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
  76. pInfo->charsetFamily == U_CHARSET_FAMILY &&
  77. pInfo->dataFormat[0] == ULAYOUT_FMT_0 &&
  78. pInfo->dataFormat[1] == ULAYOUT_FMT_1 &&
  79. pInfo->dataFormat[2] == ULAYOUT_FMT_2 &&
  80. pInfo->dataFormat[3] == ULAYOUT_FMT_3 &&
  81. pInfo->formatVersion[0] == 1;
  82. }
  83. // UInitOnce singleton initialization function
  84. void U_CALLCONV ulayout_load(UErrorCode &errorCode) {
  85. gLayoutMemory = udata_openChoice(
  86. nullptr, ULAYOUT_DATA_TYPE, ULAYOUT_DATA_NAME,
  87. ulayout_isAcceptable, nullptr, &errorCode);
  88. if (U_FAILURE(errorCode)) { return; }
  89. const uint8_t* inBytes = static_cast<const uint8_t*>(udata_getMemory(gLayoutMemory));
  90. const int32_t* inIndexes = reinterpret_cast<const int32_t*>(inBytes);
  91. int32_t indexesLength = inIndexes[ULAYOUT_IX_INDEXES_LENGTH];
  92. if (indexesLength < 12) {
  93. errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes.
  94. return;
  95. }
  96. int32_t offset = indexesLength * 4;
  97. int32_t top = inIndexes[ULAYOUT_IX_INPC_TRIE_TOP];
  98. int32_t trieSize = top - offset;
  99. if (trieSize >= 16) {
  100. gInpcTrie = ucptrie_openFromBinary(
  101. UCPTRIE_TYPE_ANY, UCPTRIE_VALUE_BITS_ANY,
  102. inBytes + offset, trieSize, nullptr, &errorCode);
  103. }
  104. offset = top;
  105. top = inIndexes[ULAYOUT_IX_INSC_TRIE_TOP];
  106. trieSize = top - offset;
  107. if (trieSize >= 16) {
  108. gInscTrie = ucptrie_openFromBinary(
  109. UCPTRIE_TYPE_ANY, UCPTRIE_VALUE_BITS_ANY,
  110. inBytes + offset, trieSize, nullptr, &errorCode);
  111. }
  112. offset = top;
  113. top = inIndexes[ULAYOUT_IX_VO_TRIE_TOP];
  114. trieSize = top - offset;
  115. if (trieSize >= 16) {
  116. gVoTrie = ucptrie_openFromBinary(
  117. UCPTRIE_TYPE_ANY, UCPTRIE_VALUE_BITS_ANY,
  118. inBytes + offset, trieSize, nullptr, &errorCode);
  119. }
  120. uint32_t maxValues = inIndexes[ULAYOUT_IX_MAX_VALUES];
  121. gMaxInpcValue = maxValues >> ULAYOUT_MAX_INPC_SHIFT;
  122. gMaxInscValue = (maxValues >> ULAYOUT_MAX_INSC_SHIFT) & 0xff;
  123. gMaxVoValue = (maxValues >> ULAYOUT_MAX_VO_SHIFT) & 0xff;
  124. ucln_common_registerCleanup(UCLN_COMMON_UPROPS, uprops_cleanup);
  125. }
  126. UBool ulayout_ensureData(UErrorCode &errorCode) {
  127. if (U_FAILURE(errorCode)) { return false; }
  128. umtx_initOnce(gLayoutInitOnce, &ulayout_load, errorCode);
  129. return U_SUCCESS(errorCode);
  130. }
  131. UBool ulayout_ensureData() {
  132. UErrorCode errorCode = U_ZERO_ERROR;
  133. return ulayout_ensureData(errorCode);
  134. }
  135. } // namespace
  136. /* general properties API functions ----------------------------------------- */
  137. struct BinaryProperty;
  138. typedef UBool BinaryPropertyContains(const BinaryProperty &prop, UChar32 c, UProperty which);
  139. struct BinaryProperty {
  140. int32_t column; // SRC_PROPSVEC column, or "source" if mask==0
  141. uint32_t mask;
  142. BinaryPropertyContains *contains;
  143. };
  144. static UBool defaultContains(const BinaryProperty &prop, UChar32 c, UProperty /*which*/) {
  145. /* systematic, directly stored properties */
  146. return (u_getUnicodeProperties(c, prop.column)&prop.mask)!=0;
  147. }
  148. static UBool caseBinaryPropertyContains(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) {
  149. return ucase_hasBinaryProperty(c, which);
  150. }
  151. static UBool isBidiControl(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  152. return ubidi_isBidiControl(c);
  153. }
  154. static UBool isMirrored(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  155. return ubidi_isMirrored(c);
  156. }
  157. static UBool isJoinControl(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  158. return ubidi_isJoinControl(c);
  159. }
  160. #if UCONFIG_NO_NORMALIZATION
  161. static UBool hasFullCompositionExclusion(const BinaryProperty &, UChar32, UProperty) {
  162. return false;
  163. }
  164. #else
  165. static UBool hasFullCompositionExclusion(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  166. // By definition, Full_Composition_Exclusion is the same as NFC_QC=No.
  167. UErrorCode errorCode=U_ZERO_ERROR;
  168. const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
  169. return U_SUCCESS(errorCode) && impl->isCompNo(impl->getNorm16(c));
  170. }
  171. #endif
  172. // UCHAR_NF*_INERT properties
  173. #if UCONFIG_NO_NORMALIZATION
  174. static UBool isNormInert(const BinaryProperty &, UChar32, UProperty) {
  175. return false;
  176. }
  177. #else
  178. static UBool isNormInert(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) {
  179. UErrorCode errorCode=U_ZERO_ERROR;
  180. const Normalizer2 *norm2=Normalizer2Factory::getInstance(
  181. static_cast<UNormalizationMode>(which - UCHAR_NFD_INERT + UNORM_NFD), errorCode);
  182. return U_SUCCESS(errorCode) && norm2->isInert(c);
  183. }
  184. #endif
  185. #if UCONFIG_NO_NORMALIZATION
  186. static UBool changesWhenCasefolded(const BinaryProperty &, UChar32, UProperty) {
  187. return false;
  188. }
  189. #else
  190. static UBool changesWhenCasefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  191. UnicodeString nfd;
  192. UErrorCode errorCode=U_ZERO_ERROR;
  193. const Normalizer2 *nfcNorm2=Normalizer2::getNFCInstance(errorCode);
  194. if(U_FAILURE(errorCode)) {
  195. return false;
  196. }
  197. if(nfcNorm2->getDecomposition(c, nfd)) {
  198. /* c has a decomposition */
  199. if(nfd.length()==1) {
  200. c=nfd[0]; /* single BMP code point */
  201. } else if(nfd.length()<=U16_MAX_LENGTH &&
  202. nfd.length()==U16_LENGTH(c=nfd.char32At(0))
  203. ) {
  204. /* single supplementary code point */
  205. } else {
  206. c=U_SENTINEL;
  207. }
  208. } else if(c<0) {
  209. return false; /* protect against bad input */
  210. }
  211. if(c>=0) {
  212. /* single code point */
  213. const char16_t *resultString;
  214. return ucase_toFullFolding(c, &resultString, U_FOLD_CASE_DEFAULT) >= 0;
  215. } else {
  216. /* guess some large but stack-friendly capacity */
  217. char16_t dest[2*UCASE_MAX_STRING_LENGTH];
  218. int32_t destLength;
  219. destLength=u_strFoldCase(dest, UPRV_LENGTHOF(dest),
  220. nfd.getBuffer(), nfd.length(),
  221. U_FOLD_CASE_DEFAULT, &errorCode);
  222. return U_SUCCESS(errorCode) &&
  223. 0!=u_strCompare(nfd.getBuffer(), nfd.length(),
  224. dest, destLength, false);
  225. }
  226. }
  227. #endif
  228. #if UCONFIG_NO_NORMALIZATION
  229. static UBool changesWhenNFKC_Casefolded(const BinaryProperty &, UChar32, UProperty) {
  230. return false;
  231. }
  232. #else
  233. static UBool changesWhenNFKC_Casefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  234. UErrorCode errorCode=U_ZERO_ERROR;
  235. const Normalizer2Impl *kcf=Normalizer2Factory::getNFKC_CFImpl(errorCode);
  236. if(U_FAILURE(errorCode)) {
  237. return false;
  238. }
  239. UnicodeString src(c);
  240. UnicodeString dest;
  241. {
  242. // The ReorderingBuffer must be in a block because its destructor
  243. // needs to release dest's buffer before we look at its contents.
  244. ReorderingBuffer buffer(*kcf, dest);
  245. // Small destCapacity for NFKC_CF(c).
  246. if(buffer.init(5, errorCode)) {
  247. const char16_t *srcArray=src.getBuffer();
  248. kcf->compose(srcArray, srcArray+src.length(), false,
  249. true, buffer, errorCode);
  250. }
  251. }
  252. return U_SUCCESS(errorCode) && dest!=src;
  253. }
  254. #endif
  255. #if UCONFIG_NO_NORMALIZATION
  256. static UBool isCanonSegmentStarter(const BinaryProperty &, UChar32, UProperty) {
  257. return false;
  258. }
  259. #else
  260. static UBool isCanonSegmentStarter(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  261. UErrorCode errorCode=U_ZERO_ERROR;
  262. const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
  263. return
  264. U_SUCCESS(errorCode) && impl->ensureCanonIterData(errorCode) &&
  265. impl->isCanonSegmentStarter(c);
  266. }
  267. #endif
  268. static UBool isPOSIX_alnum(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  269. return u_isalnumPOSIX(c);
  270. }
  271. static UBool isPOSIX_blank(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  272. return u_isblank(c);
  273. }
  274. static UBool isPOSIX_graph(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  275. return u_isgraphPOSIX(c);
  276. }
  277. static UBool isPOSIX_print(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  278. return u_isprintPOSIX(c);
  279. }
  280. static UBool isPOSIX_xdigit(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  281. return u_isxdigit(c);
  282. }
  283. static UBool isRegionalIndicator(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  284. // Property starts are a subset of lb=RI etc.
  285. return 0x1F1E6<=c && c<=0x1F1FF;
  286. }
  287. static UBool hasEmojiProperty(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) {
  288. return EmojiProps::hasBinaryProperty(c, which);
  289. }
  290. static UBool isIDSUnaryOperator(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  291. // New in Unicode 15.1 for just two characters.
  292. return 0x2FFE<=c && c<=0x2FFF;
  293. }
  294. /** Ranges (start/limit pairs) of ID_Compat_Math_Continue (only), from UCD PropList.txt. */
  295. static constexpr UChar32 ID_COMPAT_MATH_CONTINUE[] = {
  296. 0x00B2, 0x00B3 + 1,
  297. 0x00B9, 0x00B9 + 1,
  298. 0x2070, 0x2070 + 1,
  299. 0x2074, 0x207E + 1,
  300. 0x2080, 0x208E + 1
  301. };
  302. /** ID_Compat_Math_Start characters, from UCD PropList.txt. */
  303. static constexpr UChar32 ID_COMPAT_MATH_START[] = {
  304. 0x2202,
  305. 0x2207,
  306. 0x221E,
  307. 0x1D6C1,
  308. 0x1D6DB,
  309. 0x1D6FB,
  310. 0x1D715,
  311. 0x1D735,
  312. 0x1D74F,
  313. 0x1D76F,
  314. 0x1D789,
  315. 0x1D7A9,
  316. 0x1D7C3
  317. };
  318. /** Ranges (start/limit pairs) of Modifier_Combining_mark (only), from UCD PropList.txt. */
  319. static constexpr UChar32 MODIFIER_COMBINING_MARK[] = {
  320. 0x0654, 0x0655 + 1,
  321. 0x0658, 0x0658 + 1, // U+0658
  322. 0x06DC, 0x06DC + 1, // U+06DC
  323. 0x06E3, 0x06E3 + 1, // U+06E3
  324. 0x06E7, 0x06E8 + 1,
  325. 0x08CA, 0x08CB + 1,
  326. 0x08CD, 0x08CF + 1,
  327. 0x08D3, 0x08D3 + 1, // U+08D3
  328. 0x08F3, 0x08F3 + 1 // U+08F3
  329. };
  330. static UBool isIDCompatMathStart(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  331. if (c < ID_COMPAT_MATH_START[0]) { return false; } // fastpath for common scripts
  332. for (UChar32 startChar : ID_COMPAT_MATH_START) {
  333. if (c == startChar) { return true; }
  334. }
  335. return false;
  336. }
  337. static UBool isIDCompatMathContinue(const BinaryProperty &prop, UChar32 c, UProperty /*which*/) {
  338. for (int32_t i = 0; i < UPRV_LENGTHOF(ID_COMPAT_MATH_CONTINUE); i += 2) {
  339. if (c < ID_COMPAT_MATH_CONTINUE[i]) { return false; } // below range start
  340. if (c < ID_COMPAT_MATH_CONTINUE[i + 1]) { return true; } // below range limit
  341. }
  342. return isIDCompatMathStart(prop, c, UCHAR_ID_COMPAT_MATH_START);
  343. }
  344. static UBool isModifierCombiningMark(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  345. for (int32_t i = 0; i < UPRV_LENGTHOF(MODIFIER_COMBINING_MARK); i += 2) {
  346. if (c < MODIFIER_COMBINING_MARK[i]) { return false; } // below range start
  347. if (c < MODIFIER_COMBINING_MARK[i + 1]) { return true; } // below range limit
  348. }
  349. return false;
  350. }
  351. static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={
  352. /*
  353. * column and mask values for binary properties from u_getUnicodeProperties().
  354. * Must be in order of corresponding UProperty,
  355. * and there must be exactly one entry per binary UProperty.
  356. *
  357. * Properties with mask==0 are handled in code.
  358. * For them, column is the UPropertySource value.
  359. */
  360. { 1, U_MASK(UPROPS_ALPHABETIC), defaultContains },
  361. { 1, U_MASK(UPROPS_ASCII_HEX_DIGIT), defaultContains },
  362. { UPROPS_SRC_BIDI, 0, isBidiControl },
  363. { UPROPS_SRC_BIDI, 0, isMirrored },
  364. { 1, U_MASK(UPROPS_DASH), defaultContains },
  365. { 1, U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT), defaultContains },
  366. { 1, U_MASK(UPROPS_DEPRECATED), defaultContains },
  367. { 1, U_MASK(UPROPS_DIACRITIC), defaultContains },
  368. { 1, U_MASK(UPROPS_EXTENDER), defaultContains },
  369. { UPROPS_SRC_NFC, 0, hasFullCompositionExclusion },
  370. { 1, U_MASK(UPROPS_GRAPHEME_BASE), defaultContains },
  371. { 1, U_MASK(UPROPS_GRAPHEME_EXTEND), defaultContains },
  372. { 1, U_MASK(UPROPS_GRAPHEME_LINK), defaultContains },
  373. { 1, U_MASK(UPROPS_HEX_DIGIT), defaultContains },
  374. { 1, U_MASK(UPROPS_HYPHEN), defaultContains },
  375. { 1, U_MASK(UPROPS_ID_CONTINUE), defaultContains },
  376. { 1, U_MASK(UPROPS_ID_START), defaultContains },
  377. { 1, U_MASK(UPROPS_IDEOGRAPHIC), defaultContains },
  378. { 1, U_MASK(UPROPS_IDS_BINARY_OPERATOR), defaultContains },
  379. { 1, U_MASK(UPROPS_IDS_TRINARY_OPERATOR), defaultContains },
  380. { UPROPS_SRC_BIDI, 0, isJoinControl },
  381. { 1, U_MASK(UPROPS_LOGICAL_ORDER_EXCEPTION), defaultContains },
  382. { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_LOWERCASE
  383. { 1, U_MASK(UPROPS_MATH), defaultContains },
  384. { 1, U_MASK(UPROPS_NONCHARACTER_CODE_POINT), defaultContains },
  385. { 1, U_MASK(UPROPS_QUOTATION_MARK), defaultContains },
  386. { 1, U_MASK(UPROPS_RADICAL), defaultContains },
  387. { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_SOFT_DOTTED
  388. { 1, U_MASK(UPROPS_TERMINAL_PUNCTUATION), defaultContains },
  389. { 1, U_MASK(UPROPS_UNIFIED_IDEOGRAPH), defaultContains },
  390. { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_UPPERCASE
  391. { 1, U_MASK(UPROPS_WHITE_SPACE), defaultContains },
  392. { 1, U_MASK(UPROPS_XID_CONTINUE), defaultContains },
  393. { 1, U_MASK(UPROPS_XID_START), defaultContains },
  394. { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASE_SENSITIVE
  395. { 1, U_MASK(UPROPS_S_TERM), defaultContains },
  396. { 1, U_MASK(UPROPS_VARIATION_SELECTOR), defaultContains },
  397. { UPROPS_SRC_NFC, 0, isNormInert }, // UCHAR_NFD_INERT
  398. { UPROPS_SRC_NFKC, 0, isNormInert }, // UCHAR_NFKD_INERT
  399. { UPROPS_SRC_NFC, 0, isNormInert }, // UCHAR_NFC_INERT
  400. { UPROPS_SRC_NFKC, 0, isNormInert }, // UCHAR_NFKC_INERT
  401. { UPROPS_SRC_NFC_CANON_ITER, 0, isCanonSegmentStarter },
  402. { 1, U_MASK(UPROPS_PATTERN_SYNTAX), defaultContains },
  403. { 1, U_MASK(UPROPS_PATTERN_WHITE_SPACE), defaultContains },
  404. { UPROPS_SRC_CHAR_AND_PROPSVEC, 0, isPOSIX_alnum },
  405. { UPROPS_SRC_CHAR, 0, isPOSIX_blank },
  406. { UPROPS_SRC_CHAR, 0, isPOSIX_graph },
  407. { UPROPS_SRC_CHAR, 0, isPOSIX_print },
  408. { UPROPS_SRC_CHAR, 0, isPOSIX_xdigit },
  409. { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASED
  410. { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASE_IGNORABLE
  411. { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_LOWERCASED
  412. { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_UPPERCASED
  413. { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_TITLECASED
  414. { UPROPS_SRC_CASE_AND_NORM, 0, changesWhenCasefolded },
  415. { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_CASEMAPPED
  416. { UPROPS_SRC_NFKC_CF, 0, changesWhenNFKC_Casefolded },
  417. { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI
  418. { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_PRESENTATION
  419. { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_MODIFIER
  420. { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_MODIFIER_BASE
  421. { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_COMPONENT
  422. { 2, 0, isRegionalIndicator },
  423. { 1, U_MASK(UPROPS_PREPENDED_CONCATENATION_MARK), defaultContains },
  424. { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EXTENDED_PICTOGRAPHIC
  425. { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_BASIC_EMOJI
  426. { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_KEYCAP_SEQUENCE
  427. { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE
  428. { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI_FLAG_SEQUENCE
  429. { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI_TAG_SEQUENCE
  430. { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI_ZWJ_SEQUENCE
  431. { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI
  432. { UPROPS_SRC_IDSU, 0, isIDSUnaryOperator }, // UCHAR_IDS_UNARY_OPERATOR
  433. { UPROPS_SRC_ID_COMPAT_MATH, 0, isIDCompatMathStart }, // UCHAR_ID_COMPAT_MATH_START
  434. { UPROPS_SRC_ID_COMPAT_MATH, 0, isIDCompatMathContinue }, // UCHAR_ID_COMPAT_MATH_CONTINUE
  435. { UPROPS_SRC_MCM, 0 , isModifierCombiningMark }, // UCHAR_MODIFIER_COMBINING_MARK
  436. };
  437. U_CAPI UBool U_EXPORT2
  438. u_hasBinaryProperty(UChar32 c, UProperty which) {
  439. /* c is range-checked in the functions that are called from here */
  440. if(which<UCHAR_BINARY_START || UCHAR_BINARY_LIMIT<=which) {
  441. /* not a known binary property */
  442. return false;
  443. } else {
  444. const BinaryProperty &prop=binProps[which];
  445. return prop.contains(prop, c, which);
  446. }
  447. }
  448. /* Checks if the Unicode character can start a Unicode identifier.*/
  449. U_CAPI UBool U_EXPORT2
  450. u_isIDStart(UChar32 c) {
  451. return u_hasBinaryProperty(c, UCHAR_ID_START);
  452. }
  453. /* Checks if the Unicode character can be a Unicode identifier part other than starting the
  454. identifier.*/
  455. U_CAPI UBool U_EXPORT2
  456. u_isIDPart(UChar32 c) {
  457. return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE);
  458. }
  459. U_CAPI UBool U_EXPORT2
  460. u_stringHasBinaryProperty(const char16_t *s, int32_t length, UProperty which) {
  461. if (s == nullptr && length != 0) { return false; }
  462. if (length == 1) {
  463. return u_hasBinaryProperty(s[0], which); // single code point
  464. } else if (length == 2 || (length < 0 && *s != 0)) { // not empty string
  465. // first code point
  466. int32_t i = 0;
  467. UChar32 c;
  468. U16_NEXT(s, i, length, c);
  469. if (length > 0 ? i == length : s[i] == 0) {
  470. return u_hasBinaryProperty(c, which); // single code point
  471. }
  472. }
  473. // Only call into EmojiProps for a relevant property,
  474. // so that we not unnecessarily try to load its data file.
  475. return UCHAR_BASIC_EMOJI <= which && which <= UCHAR_RGI_EMOJI &&
  476. EmojiProps::hasBinaryProperty(s, length, which);
  477. }
  478. struct IntProperty;
  479. typedef int32_t IntPropertyGetValue(const IntProperty &prop, UChar32 c, UProperty which);
  480. typedef int32_t IntPropertyGetMaxValue(const IntProperty &prop, UProperty which);
  481. struct IntProperty {
  482. int32_t column; // SRC_PROPSVEC column, or "source" if mask==0
  483. uint32_t mask;
  484. int32_t shift; // =maxValue if getMaxValueFromShift() is used
  485. IntPropertyGetValue *getValue;
  486. IntPropertyGetMaxValue *getMaxValue;
  487. };
  488. static int32_t defaultGetValue(const IntProperty &prop, UChar32 c, UProperty /*which*/) {
  489. /* systematic, directly stored properties */
  490. return static_cast<int32_t>(u_getUnicodeProperties(c, prop.column) & prop.mask) >> prop.shift;
  491. }
  492. static int32_t defaultGetMaxValue(const IntProperty &prop, UProperty /*which*/) {
  493. return (uprv_getMaxValues(prop.column)&prop.mask)>>prop.shift;
  494. }
  495. static int32_t getMaxValueFromShift(const IntProperty &prop, UProperty /*which*/) {
  496. return prop.shift;
  497. }
  498. static int32_t getBiDiClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  499. return static_cast<int32_t>(u_charDirection(c));
  500. }
  501. static int32_t getBiDiPairedBracketType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  502. return static_cast<int32_t>(ubidi_getPairedBracketType(c));
  503. }
  504. static int32_t biDiGetMaxValue(const IntProperty &/*prop*/, UProperty which) {
  505. return ubidi_getMaxValue(which);
  506. }
  507. static int32_t getBlock(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  508. return static_cast<int32_t>(ublock_getCode(c));
  509. }
  510. static int32_t blockGetMaxValue(const IntProperty &/*prop*/, UProperty /*which*/) {
  511. return uprv_getMaxValues(UPROPS_MAX_VALUES_OTHER_INDEX) & UPROPS_MAX_BLOCK;
  512. }
  513. #if UCONFIG_NO_NORMALIZATION
  514. static int32_t getCombiningClass(const IntProperty &, UChar32, UProperty) {
  515. return 0;
  516. }
  517. #else
  518. static int32_t getCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  519. return u_getCombiningClass(c);
  520. }
  521. #endif
  522. static int32_t getGeneralCategory(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  523. return static_cast<int32_t>(u_charType(c));
  524. }
  525. static int32_t getJoiningGroup(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  526. return ubidi_getJoiningGroup(c);
  527. }
  528. static int32_t getJoiningType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  529. return ubidi_getJoiningType(c);
  530. }
  531. static int32_t getNumericType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  532. int32_t ntv = static_cast<int32_t>(GET_NUMERIC_TYPE_VALUE(u_getMainProperties(c)));
  533. return UPROPS_NTV_GET_TYPE(ntv);
  534. }
  535. static int32_t getScript(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  536. UErrorCode errorCode=U_ZERO_ERROR;
  537. return static_cast<int32_t>(uscript_getScript(c, &errorCode));
  538. }
  539. static int32_t scriptGetMaxValue(const IntProperty &/*prop*/, UProperty /*which*/) {
  540. return uprv_getMaxValues(0)&UPROPS_MAX_SCRIPT;
  541. }
  542. /*
  543. * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
  544. * Hangul_Syllable_Type used to be fully redundant with a subset of Grapheme_Cluster_Break.
  545. *
  546. * Starting with Unicode 16, this is no longer true for HST=V vs. GCB=V in some cases:
  547. * Some Kirat Rai vowels are given GCB=V for proper grapheme clustering, but
  548. * they are of course not related to Hangul syllables.
  549. */
  550. static const UHangulSyllableType gcbToHst[]={
  551. U_HST_NOT_APPLICABLE, /* U_GCB_OTHER */
  552. U_HST_NOT_APPLICABLE, /* U_GCB_CONTROL */
  553. U_HST_NOT_APPLICABLE, /* U_GCB_CR */
  554. U_HST_NOT_APPLICABLE, /* U_GCB_EXTEND */
  555. U_HST_LEADING_JAMO, /* U_GCB_L */
  556. U_HST_NOT_APPLICABLE, /* U_GCB_LF */
  557. U_HST_LV_SYLLABLE, /* U_GCB_LV */
  558. U_HST_LVT_SYLLABLE, /* U_GCB_LVT */
  559. U_HST_TRAILING_JAMO, /* U_GCB_T */
  560. U_HST_VOWEL_JAMO /* U_GCB_V */
  561. /*
  562. * Omit GCB values beyond what we need for hst.
  563. * The code below checks for the array length.
  564. */
  565. };
  566. static int32_t getHangulSyllableType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  567. // Ignore supplementary code points: They all have HST=NA.
  568. // This is a simple way to handle the GCB!=hst cases since Unicode 16 (Kirat Rai vowels).
  569. if(c>0xffff) {
  570. return U_HST_NOT_APPLICABLE;
  571. }
  572. /* see comments on gcbToHst[] above */
  573. int32_t gcb = static_cast<int32_t>(u_getUnicodeProperties(c, 2) & UPROPS_GCB_MASK) >> UPROPS_GCB_SHIFT;
  574. if(gcb<UPRV_LENGTHOF(gcbToHst)) {
  575. return gcbToHst[gcb];
  576. } else {
  577. return U_HST_NOT_APPLICABLE;
  578. }
  579. }
  580. #if UCONFIG_NO_NORMALIZATION
  581. static int32_t getNormQuickCheck(const IntProperty &, UChar32, UProperty) {
  582. return 0;
  583. }
  584. #else
  585. static int32_t getNormQuickCheck(const IntProperty &/*prop*/, UChar32 c, UProperty which) {
  586. return static_cast<int32_t>(unorm_getQuickCheck(c, static_cast<UNormalizationMode>(which - UCHAR_NFD_QUICK_CHECK + UNORM_NFD)));
  587. }
  588. #endif
  589. #if UCONFIG_NO_NORMALIZATION
  590. static int32_t getLeadCombiningClass(const IntProperty &, UChar32, UProperty) {
  591. return 0;
  592. }
  593. #else
  594. static int32_t getLeadCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  595. return unorm_getFCD16(c)>>8;
  596. }
  597. #endif
  598. #if UCONFIG_NO_NORMALIZATION
  599. static int32_t getTrailCombiningClass(const IntProperty &, UChar32, UProperty) {
  600. return 0;
  601. }
  602. #else
  603. static int32_t getTrailCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
  604. return unorm_getFCD16(c)&0xff;
  605. }
  606. #endif
  607. static int32_t getInPC(const IntProperty &, UChar32 c, UProperty) {
  608. return ulayout_ensureData() && gInpcTrie != nullptr ? ucptrie_get(gInpcTrie, c) : 0;
  609. }
  610. static int32_t getInSC(const IntProperty &, UChar32 c, UProperty) {
  611. return ulayout_ensureData() && gInscTrie != nullptr ? ucptrie_get(gInscTrie, c) : 0;
  612. }
  613. static int32_t getVo(const IntProperty &, UChar32 c, UProperty) {
  614. return ulayout_ensureData() && gVoTrie != nullptr ? ucptrie_get(gVoTrie, c) : 0;
  615. }
  616. static int32_t layoutGetMaxValue(const IntProperty &/*prop*/, UProperty which) {
  617. if (!ulayout_ensureData()) { return 0; }
  618. switch (which) {
  619. case UCHAR_INDIC_POSITIONAL_CATEGORY:
  620. return gMaxInpcValue;
  621. case UCHAR_INDIC_SYLLABIC_CATEGORY:
  622. return gMaxInscValue;
  623. case UCHAR_VERTICAL_ORIENTATION:
  624. return gMaxVoValue;
  625. default:
  626. return 0;
  627. }
  628. }
  629. static int32_t getIDStatusValue(const IntProperty & /*prop*/, UChar32 c, UProperty /*which*/) {
  630. uint32_t value = u_getUnicodeProperties(c, 2) >> UPROPS_2_ID_TYPE_SHIFT;
  631. return value >= UPROPS_ID_TYPE_ALLOWED_MIN ? U_ID_STATUS_ALLOWED : U_ID_STATUS_RESTRICTED;
  632. }
  633. static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={
  634. /*
  635. * column, mask and shift values for int-value properties from u_getUnicodeProperties().
  636. * Must be in order of corresponding UProperty,
  637. * and there must be exactly one entry per int UProperty.
  638. *
  639. * Properties with mask==0 are handled in code.
  640. * For them, column is the UPropertySource value.
  641. */
  642. { UPROPS_SRC_BIDI, 0, 0, getBiDiClass, biDiGetMaxValue },
  643. { UPROPS_SRC_BLOCK, 0, 0, getBlock, blockGetMaxValue },
  644. { UPROPS_SRC_NFC, 0, 0xff, getCombiningClass, getMaxValueFromShift },
  645. { 2, UPROPS_DT_MASK, 0, defaultGetValue, defaultGetMaxValue },
  646. { 0, UPROPS_EA_MASK, UPROPS_EA_SHIFT, defaultGetValue, defaultGetMaxValue },
  647. { UPROPS_SRC_CHAR, 0, static_cast<int32_t>(U_CHAR_CATEGORY_COUNT) - 1, getGeneralCategory, getMaxValueFromShift },
  648. { UPROPS_SRC_BIDI, 0, 0, getJoiningGroup, biDiGetMaxValue },
  649. { UPROPS_SRC_BIDI, 0, 0, getJoiningType, biDiGetMaxValue },
  650. { 2, UPROPS_LB_MASK, UPROPS_LB_SHIFT, defaultGetValue, defaultGetMaxValue },
  651. { UPROPS_SRC_CHAR, 0, static_cast<int32_t>(U_NT_COUNT) - 1, getNumericType, getMaxValueFromShift },
  652. { UPROPS_SRC_PROPSVEC, 0, 0, getScript, scriptGetMaxValue },
  653. { UPROPS_SRC_PROPSVEC, 0, static_cast<int32_t>(U_HST_COUNT) - 1, getHangulSyllableType, getMaxValueFromShift },
  654. // UCHAR_NFD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes"
  655. { UPROPS_SRC_NFC, 0, static_cast<int32_t>(UNORM_YES), getNormQuickCheck, getMaxValueFromShift },
  656. // UCHAR_NFKD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes"
  657. { UPROPS_SRC_NFKC, 0, static_cast<int32_t>(UNORM_YES), getNormQuickCheck, getMaxValueFromShift },
  658. // UCHAR_NFC_QUICK_CHECK: max=2=MAYBE
  659. { UPROPS_SRC_NFC, 0, static_cast<int32_t>(UNORM_MAYBE), getNormQuickCheck, getMaxValueFromShift },
  660. // UCHAR_NFKC_QUICK_CHECK: max=2=MAYBE
  661. { UPROPS_SRC_NFKC, 0, static_cast<int32_t>(UNORM_MAYBE), getNormQuickCheck, getMaxValueFromShift },
  662. { UPROPS_SRC_NFC, 0, 0xff, getLeadCombiningClass, getMaxValueFromShift },
  663. { UPROPS_SRC_NFC, 0, 0xff, getTrailCombiningClass, getMaxValueFromShift },
  664. { 2, UPROPS_GCB_MASK, UPROPS_GCB_SHIFT, defaultGetValue, defaultGetMaxValue },
  665. { 2, UPROPS_SB_MASK, UPROPS_SB_SHIFT, defaultGetValue, defaultGetMaxValue },
  666. { 2, UPROPS_WB_MASK, UPROPS_WB_SHIFT, defaultGetValue, defaultGetMaxValue },
  667. { UPROPS_SRC_BIDI, 0, 0, getBiDiPairedBracketType, biDiGetMaxValue },
  668. { UPROPS_SRC_INPC, 0, 0, getInPC, layoutGetMaxValue },
  669. { UPROPS_SRC_INSC, 0, 0, getInSC, layoutGetMaxValue },
  670. { UPROPS_SRC_VO, 0, 0, getVo, layoutGetMaxValue },
  671. { UPROPS_SRC_PROPSVEC, 0, static_cast<int32_t>(U_ID_STATUS_ALLOWED), getIDStatusValue, getMaxValueFromShift },
  672. { 0, UPROPS_INCB_MASK, UPROPS_INCB_SHIFT,defaultGetValue, defaultGetMaxValue },
  673. };
  674. U_CAPI int32_t U_EXPORT2
  675. u_getIntPropertyValue(UChar32 c, UProperty which) {
  676. if(which<UCHAR_INT_START) {
  677. if(UCHAR_BINARY_START<=which && which<UCHAR_BINARY_LIMIT) {
  678. const BinaryProperty &prop=binProps[which];
  679. return prop.contains(prop, c, which);
  680. }
  681. } else if(which<UCHAR_INT_LIMIT) {
  682. const IntProperty &prop=intProps[which-UCHAR_INT_START];
  683. return prop.getValue(prop, c, which);
  684. } else if(which==UCHAR_GENERAL_CATEGORY_MASK) {
  685. return U_MASK(u_charType(c));
  686. }
  687. return 0; // undefined
  688. }
  689. U_CAPI int32_t U_EXPORT2
  690. u_getIntPropertyMinValue(UProperty /*which*/) {
  691. return 0; /* all binary/enum/int properties have a minimum value of 0 */
  692. }
  693. U_CAPI int32_t U_EXPORT2
  694. u_getIntPropertyMaxValue(UProperty which) {
  695. if(which<UCHAR_INT_START) {
  696. if(UCHAR_BINARY_START<=which && which<UCHAR_BINARY_LIMIT) {
  697. return 1; // maximum true for all binary properties
  698. }
  699. } else if(which<UCHAR_INT_LIMIT) {
  700. const IntProperty &prop=intProps[which-UCHAR_INT_START];
  701. return prop.getMaxValue(prop, which);
  702. }
  703. return -1; // undefined
  704. }
  705. U_CFUNC UPropertySource U_EXPORT2
  706. uprops_getSource(UProperty which) {
  707. if(which<UCHAR_BINARY_START) {
  708. return UPROPS_SRC_NONE; /* undefined */
  709. } else if(which<UCHAR_BINARY_LIMIT) {
  710. const BinaryProperty &prop=binProps[which];
  711. if(prop.mask!=0) {
  712. return UPROPS_SRC_PROPSVEC;
  713. } else {
  714. return (UPropertySource)prop.column;
  715. }
  716. } else if(which<UCHAR_INT_START) {
  717. return UPROPS_SRC_NONE; /* undefined */
  718. } else if(which<UCHAR_INT_LIMIT) {
  719. const IntProperty &prop=intProps[which-UCHAR_INT_START];
  720. if(prop.mask!=0) {
  721. return UPROPS_SRC_PROPSVEC;
  722. } else {
  723. return (UPropertySource)prop.column;
  724. }
  725. } else if(which<UCHAR_STRING_START) {
  726. switch(which) {
  727. case UCHAR_GENERAL_CATEGORY_MASK:
  728. case UCHAR_NUMERIC_VALUE:
  729. return UPROPS_SRC_CHAR;
  730. default:
  731. return UPROPS_SRC_NONE;
  732. }
  733. } else if(which<UCHAR_STRING_LIMIT) {
  734. switch(which) {
  735. case UCHAR_AGE:
  736. return UPROPS_SRC_PROPSVEC;
  737. case UCHAR_BIDI_MIRRORING_GLYPH:
  738. return UPROPS_SRC_BIDI;
  739. case UCHAR_CASE_FOLDING:
  740. case UCHAR_LOWERCASE_MAPPING:
  741. case UCHAR_SIMPLE_CASE_FOLDING:
  742. case UCHAR_SIMPLE_LOWERCASE_MAPPING:
  743. case UCHAR_SIMPLE_TITLECASE_MAPPING:
  744. case UCHAR_SIMPLE_UPPERCASE_MAPPING:
  745. case UCHAR_TITLECASE_MAPPING:
  746. case UCHAR_UPPERCASE_MAPPING:
  747. return UPROPS_SRC_CASE;
  748. case UCHAR_ISO_COMMENT:
  749. case UCHAR_NAME:
  750. case UCHAR_UNICODE_1_NAME:
  751. return UPROPS_SRC_NAMES;
  752. default:
  753. return UPROPS_SRC_NONE;
  754. }
  755. } else {
  756. switch(which) {
  757. case UCHAR_SCRIPT_EXTENSIONS:
  758. case UCHAR_IDENTIFIER_TYPE:
  759. return UPROPS_SRC_PROPSVEC;
  760. default:
  761. return UPROPS_SRC_NONE; /* undefined */
  762. }
  763. }
  764. }
  765. U_CFUNC void U_EXPORT2
  766. uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *pErrorCode) {
  767. if (U_FAILURE(*pErrorCode)) { return; }
  768. if (src == UPROPS_SRC_ID_COMPAT_MATH) {
  769. // range limits
  770. for (UChar32 c : ID_COMPAT_MATH_CONTINUE) {
  771. sa->add(sa->set, c);
  772. }
  773. // single characters
  774. for (UChar32 c : ID_COMPAT_MATH_START) {
  775. sa->add(sa->set, c);
  776. sa->add(sa->set, c + 1);
  777. }
  778. return;
  779. }
  780. if (src == UPROPS_SRC_MCM) {
  781. // range limits
  782. for (UChar32 c : MODIFIER_COMBINING_MARK) {
  783. sa->add(sa->set, c);
  784. }
  785. return;
  786. }
  787. if (!ulayout_ensureData(*pErrorCode)) { return; }
  788. const UCPTrie *trie;
  789. switch (src) {
  790. case UPROPS_SRC_INPC:
  791. trie = gInpcTrie;
  792. break;
  793. case UPROPS_SRC_INSC:
  794. trie = gInscTrie;
  795. break;
  796. case UPROPS_SRC_VO:
  797. trie = gVoTrie;
  798. break;
  799. default:
  800. *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  801. return;
  802. }
  803. if (trie == nullptr) {
  804. *pErrorCode = U_MISSING_RESOURCE_ERROR;
  805. return;
  806. }
  807. // Add the start code point of each same-value range of the trie.
  808. UChar32 start = 0, end;
  809. while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0,
  810. nullptr, nullptr, nullptr)) >= 0) {
  811. sa->add(sa->set, start);
  812. start = end + 1;
  813. }
  814. }
  815. U_CAPI bool U_EXPORT2
  816. u_hasIDType(UChar32 c, UIdentifierType type) {
  817. uint32_t typeIndex = type; // also guards against negative type integers
  818. if (typeIndex >= UPRV_LENGTHOF(uprops_idTypeToEncoded)) {
  819. return false;
  820. }
  821. uint32_t encodedType = uprops_idTypeToEncoded[typeIndex];
  822. uint32_t value = u_getUnicodeProperties(c, 2) >> UPROPS_2_ID_TYPE_SHIFT;
  823. if ((encodedType & UPROPS_ID_TYPE_BIT) != 0) {
  824. return value < UPROPS_ID_TYPE_FORBIDDEN && (value & encodedType) != 0;
  825. } else {
  826. return value == encodedType;
  827. }
  828. }
  829. namespace {
  830. void maybeAppendType(uint32_t value, uint32_t bit, UIdentifierType t,
  831. UIdentifierType *types, int32_t &length, int32_t capacity) {
  832. if ((value & bit) != 0) {
  833. if (length < capacity) {
  834. types[length] = t;
  835. }
  836. ++length;
  837. }
  838. }
  839. } // namespace
  840. U_CAPI int32_t U_EXPORT2
  841. u_getIDTypes(UChar32 c, UIdentifierType *types, int32_t capacity, UErrorCode *pErrorCode) {
  842. if (U_FAILURE(*pErrorCode)) { return 0; }
  843. if (capacity < 0 || (capacity > 0 && types == nullptr)) {
  844. *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  845. return 0;
  846. }
  847. uint32_t value = u_getUnicodeProperties(c, 2) >> UPROPS_2_ID_TYPE_SHIFT;
  848. if ((value & UPROPS_ID_TYPE_FORBIDDEN) == UPROPS_ID_TYPE_FORBIDDEN ||
  849. value == UPROPS_ID_TYPE_NOT_CHARACTER) {
  850. // single value
  851. if (capacity > 0) {
  852. UIdentifierType t;
  853. switch (value) {
  854. case UPROPS_ID_TYPE_NOT_CHARACTER: t = U_ID_TYPE_NOT_CHARACTER; break;
  855. case UPROPS_ID_TYPE_DEPRECATED: t = U_ID_TYPE_DEPRECATED; break;
  856. case UPROPS_ID_TYPE_DEFAULT_IGNORABLE: t = U_ID_TYPE_DEFAULT_IGNORABLE; break;
  857. case UPROPS_ID_TYPE_NOT_NFKC: t = U_ID_TYPE_NOT_NFKC; break;
  858. case UPROPS_ID_TYPE_INCLUSION: t = U_ID_TYPE_INCLUSION; break;
  859. case UPROPS_ID_TYPE_RECOMMENDED: t = U_ID_TYPE_RECOMMENDED; break;
  860. default:
  861. *pErrorCode = U_INVALID_FORMAT_ERROR;
  862. return 0;
  863. }
  864. types[0] = t;
  865. } else {
  866. *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
  867. }
  868. return 1;
  869. } else {
  870. // one or more combinable bits
  871. int32_t length = 0;
  872. maybeAppendType(value, UPROPS_ID_TYPE_NOT_XID, U_ID_TYPE_NOT_XID,
  873. types, length, capacity);
  874. maybeAppendType(value, UPROPS_ID_TYPE_EXCLUSION, U_ID_TYPE_EXCLUSION,
  875. types, length, capacity);
  876. maybeAppendType(value, UPROPS_ID_TYPE_OBSOLETE, U_ID_TYPE_OBSOLETE,
  877. types, length, capacity);
  878. maybeAppendType(value, UPROPS_ID_TYPE_TECHNICAL, U_ID_TYPE_TECHNICAL,
  879. types, length, capacity);
  880. maybeAppendType(value, UPROPS_ID_TYPE_UNCOMMON_USE, U_ID_TYPE_UNCOMMON_USE,
  881. types, length, capacity);
  882. maybeAppendType(value, UPROPS_ID_TYPE_LIMITED_USE, U_ID_TYPE_LIMITED_USE,
  883. types, length, capacity);
  884. if (length >= capacity) {
  885. *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
  886. }
  887. return length;
  888. }
  889. }
  890. #if !UCONFIG_NO_NORMALIZATION
  891. U_CAPI int32_t U_EXPORT2
  892. u_getFC_NFKC_Closure(UChar32 c, char16_t *dest, int32_t destCapacity, UErrorCode *pErrorCode) {
  893. if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
  894. return 0;
  895. }
  896. if(destCapacity<0 || (dest==nullptr && destCapacity>0)) {
  897. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  898. return 0;
  899. }
  900. // Compute the FC_NFKC_Closure on the fly:
  901. // We have the API for complete coverage of Unicode properties, although
  902. // this value by itself is not useful via API.
  903. // (What could be useful is a custom normalization table that combines
  904. // case folding and NFKC.)
  905. // For the derivation, see Unicode's DerivedNormalizationProps.txt.
  906. const Normalizer2 *nfkc=Normalizer2::getNFKCInstance(*pErrorCode);
  907. if(U_FAILURE(*pErrorCode)) {
  908. return 0;
  909. }
  910. // first: b = NFKC(Fold(a))
  911. UnicodeString folded1String;
  912. const char16_t *folded1;
  913. int32_t folded1Length=ucase_toFullFolding(c, &folded1, U_FOLD_CASE_DEFAULT);
  914. if(folded1Length<0) {
  915. const Normalizer2Impl *nfkcImpl=Normalizer2Factory::getImpl(nfkc);
  916. if(nfkcImpl->getCompQuickCheck(nfkcImpl->getNorm16(c))!=UNORM_NO) {
  917. return u_terminateUChars(dest, destCapacity, 0, pErrorCode); // c does not change at all under CaseFolding+NFKC
  918. }
  919. folded1String.setTo(c);
  920. } else {
  921. if(folded1Length>UCASE_MAX_STRING_LENGTH) {
  922. folded1String.setTo(folded1Length);
  923. } else {
  924. folded1String.setTo(false, folded1, folded1Length);
  925. }
  926. }
  927. UnicodeString kc1=nfkc->normalize(folded1String, *pErrorCode);
  928. // second: c = NFKC(Fold(b))
  929. UnicodeString folded2String(kc1);
  930. UnicodeString kc2=nfkc->normalize(folded2String.foldCase(), *pErrorCode);
  931. // if (c != b) add the mapping from a to c
  932. if(U_FAILURE(*pErrorCode) || kc1==kc2) {
  933. return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
  934. } else {
  935. return kc2.extract(dest, destCapacity, *pErrorCode);
  936. }
  937. }
  938. #endif